Пример #1
0
def main():
    """
    Translate OTU IDs into OTU names, or OTU names back into OTU IDs, using
    a taxonomy table.

    Forward mode (default): each input row is split on tabs. Rows whose first
    column starts with "#" are copied to the output unchanged; for every
    other row the first column (the OTU ID) is replaced by the name returned
    by otuc.otu_name() and the remaining columns are kept.

    Reverse mode (args.reverse_lookup): each input row is treated as an OTU
    name and the matching OTU ID is written, one per line.

    Blank input lines are ignored. If an ID or name cannot be resolved, an
    error is printed and processing stops; rows already written stay in the
    output file.
    """
    args = handle_program_options()

    try:
        with open(args.otu_id_fp, "rU") as otuF:
            rows = [line.strip() for line in otuF]
    except IOError as ioe:
        sys.exit("\nError with file containing OTUIDs/BIOM format:{}\n".format(ioe))
    # Blank rows carry no ID: they used to crash the comment check in forward
    # mode and were looked up as an empty name in reverse mode.
    rows = [row for row in rows if row]

    taxa = util.parse_taxonomy_table(args.taxonomy_fp)

    name_to_id = {}
    if args.reverse_lookup:
        # Index the taxonomy table by OTU name once instead of rescanning it
        # for every requested name. When several IDs share a name, the one
        # iterated last wins, exactly as the per-row scan behaved.
        for otu_id, fulltaxa in taxa.items():
            name_to_id[otuc.otu_name(fulltaxa.split("; "))] = otu_id

    with open(args.output_fp, "w") as outF:
        for row in rows:
            # for looking up OTUIDs
            if args.reverse_lookup:
                # An unknown name must not silently reuse the previous match.
                if row not in name_to_id:
                    print("Error: OTU name {} not found in supplied taxonomy file.".format(row))
                    return
                outF.write("{}\n".format(name_to_id[row]))
                continue

            # for looking up OTU name
            entry = row.split("\t")
            ID = entry[0]
            # check for comments in BIOM files
            if ID.startswith("#"):
                outF.write("{}\n".format("\t".join(entry)))
                continue
            if ID not in taxa:
                print("Error: OTU ID {} not found in supplied taxonomy file.".format(ID))
                return
            named_ID = otuc.otu_name(taxa[ID].split("; "))

            # write out to file
            outF.write("{}\t{}\n".format(named_ID, "\t".join(entry[1:])))
Пример #2
0
def main():
    """
    Look up OTU names for a list of OTU IDs (or, with args.reverse_lookup,
    OTU IDs for a list of OTU names) using the taxonomy metadata stored in a
    BIOM table.

    The first column of every non-empty row of args.otu_id_fp is used as the
    query. Matches are written to args.output_fp as a two-column,
    tab-separated file with an "Input\tOutput" header. Queries that match no
    observation in the table are omitted from the output.
    """
    args = handle_program_options()

    # Read biom format file
    try:
        biomf = biom.load_table(args.otu_table)
    except IOError as ioe:
        sys.exit("\nError with biom format file (-i): {}\n".format(ioe))

    # Read in otus file data
    try:
        with open(args.otu_id_fp, "rU") as inf:
            try:
                dialect = csv.Sniffer().sniff(inf.read(1024))
            except csv.Error:
                # The sniffer gives up when it cannot deduce a delimiter
                # (typical for a one-ID-per-line file); fall back to
                # tab-separated parsing instead of crashing.
                dialect = csv.excel_tab
            inf.seek(0)
            # csv yields [] for blank lines; skip them rather than index
            # into an empty row. A set keeps the membership tests below O(1).
            otu_ids = {row[0] for row in csv.reader(inf, dialect) if row}
    except IOError as ioe:
        sys.exit("\nError with file containing OTUs:{}\n".format(ioe))

    output = {}
    for val, idx, md in biomf.iter(axis="observation"):
        name = otuc.otu_name(md["taxonomy"])
        if args.reverse_lookup:
            if name in otu_ids:
                output[name] = idx   # Get otuids from otu names
        else:
            if idx in otu_ids:
                output[idx] = name  # Get otu name from otu IDs
    with open(args.output_fp, "w") as outf:
        outf.write("Input\tOutput\n")
        for k, v in output.items():
            outf.write("{0}\t{1}\n".format(k, v))
def write_relative_abundance(rel_abd, biomf, out_fn, sort_by=None):
    """
    Write a tab-separated abundance table with OTUs as rows and samples as
    columns.

    The header row is "#OTU ID" followed by the sample IDs of the table.
    Every following row starts with the OTU name derived from the
    observation's "taxonomy" metadata and then lists rel_abd[sample][otu_id]
    for each sample, or '0' when that pair is missing from rel_abd.

    :type rel_abd: dict-like
    :param rel_abd: Nested mapping of Sample ID -> OTU ID -> abundance value.
    :type biomf: biom object
    :param biomf: BIOM table supplying the sample IDs, observation IDs and
                  taxonomy metadata.
    :type out_fn: str
    :param out_fn: The full path to the desired output file.
    :type sort_by: function
    :param sort_by: Sorting key that determines the order in which the
                    Sample IDs appear as columns in the output file.
    """
    with open(out_fn, 'w') as handle:
        sample_ids = sorted(set(biomf.ids()), key=sort_by)
        header = '\t'.join(sample_ids)
        handle.write('#OTU ID\t{}\n'.format(header))

        for otu_id in biomf.ids(axis="observation"):
            taxonomy = biomf.metadata(otu_id, "observation")["taxonomy"]
            label = oc.otu_name(taxonomy)

            cells = []
            for sample_id in sample_ids:
                if sample_id in rel_abd and otu_id in rel_abd[sample_id]:
                    cells.append(str(rel_abd[sample_id][otu_id]))
                else:
                    # pair not present in rel_abd: report zero abundance
                    cells.append('0')
            handle.write('{}\t{}\n'.format(label, '\t'.join(cells)))
Пример #4
0
    def test_otu_name(self):
        """
        Check oc.otu_name() against a table of expected name -> lineage pairs.

        :return: None. Fails through assertEqual when a lineage produces a
                 name other than the expected one.
        """
        methanosarcinales = [
            "k__Archaea", "p__Euryarchaeota", "c__Methanomicrobia",
            "o__Methanosarcinales", "f__", "g__", "s__concilii",
        ]
        campylobacter = [
            "k__Bacteria", "p__Proteobacteria", "c__Epsilonproteobacteria",
            "o__Campylobacterales", "f__Campylobacteraceae",
            "g__Campylobacter", "s__gracilis",
        ]
        escherichia = [
            "k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria",
            "o__Enterobacteriales", "f__Enterobacteriaceae",
            "g__Escherichia", "s__",
        ]
        # Keys are the expected names, values the lineages fed to otu_name()
        self.tax = {
            "Unclassified_Methanosarcinales": methanosarcinales,
            "Campylobacter_gracilis": campylobacter,
            "Escherichia_spp.": escherichia,
        }

        for expected_name in self.tax:
            lineage = self.tax[expected_name]
            self.result = oc.otu_name(lineage)

            # Testing the validity of the otu_name() function
            failure_msg = 'Error! Expected result: {}. otu_name() result: {}'.format(
                expected_name, self.result)
            self.assertEqual(self.result, expected_name, msg=failure_msg)
Пример #5
0
def get_relative_abundance(biomfile):
    """
    Load a BIOM file, normalise it and return the per-sample abundance of
    every OTU after bc.arcsine_sqrt_transform() has been applied.

    OTU names come from oc.otu_name() on each observation's "taxonomy"
    metadata, with underscores replaced by spaces. When several OTU IDs
    resolve to the same name, the value kept for a sample is that of the
    OTU iterated last.

    :type biomfile: BIOM format file
    :param biomfile: Path of the BIOM format file to load.

    :type return: Dictionary of dictionaries.
    :return: Result of bc.arcsine_sqrt_transform() applied to a dictionary
             keyed on SampleID whose value is a dictionary keyed on OTU Name
             holding the normalised abundance of that SampleID-OTU Name pair.
    """
    table = biom.load_table(biomfile)
    normalized = table.norm(inplace=False)

    abundances = {}
    for sample_id in normalized.ids():
        per_sample = {}
        abundances[sample_id] = per_sample
        for otu_id in normalized.ids("observation"):
            taxonomy = normalized.metadata(otu_id, axis="observation")["taxonomy"]
            # display form of the name: spaces instead of underscores
            label = oc.otu_name(taxonomy).replace("_", " ")
            per_sample[label] = normalized.get_value_by_ids(otu_id, sample_id)

    return bc.arcsine_sqrt_transform(abundances)
def get_relative_abundance(biomfile):
    """
    Return the normalised abundances of a BIOM format file after passing
    them through bc.arcsine_sqrt_transform().

    Each OTU is labelled with the name oc.otu_name() derives from its
    "taxonomy" metadata, underscores being shown as spaces. OTU IDs that
    share a name overwrite one another within a sample; the last one
    iterated is kept.

    :type biomfile: BIOM format file
    :param biomfile: Path of the BIOM format file to load.

    :type return: Dictionary of dictionaries.
    :return: Whatever bc.arcsine_sqrt_transform() returns for a dictionary
             keyed on SampleID whose value is a dictionary keyed on OTU Name
             with the normalised abundance of that SampleID-OTU Name pair.
    """
    norm_table = biom.load_table(biomfile).norm(inplace=False)

    sample_to_otus = {}
    for sample in norm_table.ids():
        otu_values = sample_to_otus[sample] = {}
        for observation in norm_table.ids("observation"):
            metadata = norm_table.metadata(observation, axis="observation")
            raw_name = oc.otu_name(metadata["taxonomy"])
            spaced_name = " ".join(raw_name.split("_"))
            value = norm_table.get_value_by_ids(observation, sample)
            otu_values[spaced_name] = value

    transformed = bc.arcsine_sqrt_transform(sample_to_otus)
    return transformed
Пример #7
0
def write_relative_abundance(rel_abd, biomf, out_fn, sort_by=None):
    """
    Dump per-sample abundance values to a tab-separated file listing OTUs
    as rows and Samples as columns.

    The first line is "#OTU ID" followed by the table's sample IDs. Each
    OTU row is labelled with the name oc.otu_name() derives from the
    observation's "taxonomy" metadata; cells hold rel_abd[sample][otu_id],
    or '0' when rel_abd has no value for that pair.

    :type rel_abd: dict-like
    :param rel_abd: Nested mapping of Sample ID -> OTU ID -> abundance value.
    :type biomf: biom object
    :param biomf: BIOM table providing sample IDs, observation IDs and
                  taxonomy metadata.
    :type out_fn: str
    :param out_fn: The full path to the desired output file.
    :type sort_by: function
    :param sort_by: Sorting key that determines the column order of the
                    Sample IDs in the output file.
    """
    def _cell(sample_id, otu_id):
        # Missing sample or missing OTU within the sample both report '0'
        if sample_id in rel_abd and otu_id in rel_abd[sample_id]:
            return str(rel_abd[sample_id][otu_id])
        return '0'

    with open(out_fn, 'w') as table_out:
        columns = sorted(set(biomf.ids()), key=sort_by)
        table_out.write('#OTU ID\t{}\n'.format('\t'.join(columns)))

        for otu_id in biomf.ids(axis="observation"):
            row_label = oc.otu_name(
                biomf.metadata(otu_id, "observation")["taxonomy"])
            row_values = [_cell(sample_id, otu_id) for sample_id in columns]
            table_out.write('{}\t{}\n'.format(row_label, '\t'.join(row_values)))
Пример #8
0
def calc_rel_abd(biomf, sampleIDs=None):
    """
    Normalise a biom table and collect the abundance of every OTU for the
    requested samples, then pass the result through
    bc.arcsine_sqrt_transform().

    Values are stored under the OTU name (oc.otu_name() of the observation's
    "taxonomy" metadata), so OTU IDs that share a name overwrite each other
    within a sample.

    :type biomf: Biom file format
    :param biomf: Biom table loaded object

    :type sampleIDs: list
    :param sampleIDs: Only calculate relative abundances for these sampleIDs.
                      Default is None, meaning every sample in the table.

    :return: The value returned by bc.arcsine_sqrt_transform() for a
             defaultdict(dict) keyed on sampleID whose value is a dict keyed
             on OTU name holding the normalised abundance of that pair.
    """
    normalized = biomf.norm(inplace=False)
    samples = normalized.ids() if sampleIDs is None else sampleIDs
    observations = normalized.ids(axis="observation")

    abundances = defaultdict(dict)
    for sample_id in samples:
        for otu_id in observations:
            value = normalized.get_value_by_ids(otu_id, sample_id)
            taxonomy = normalized.metadata(otu_id, "observation")["taxonomy"]
            name = oc.otu_name(taxonomy)
            abundances[sample_id][name] = value

    return bc.arcsine_sqrt_transform(abundances)
Пример #9
0
def calc_rel_abd(biomf, sampleIDs=None):
    """
    Build a sample -> OTU name -> abundance mapping from the normalised biom
    table and return it after bc.arcsine_sqrt_transform() has been applied.

    The inner keys are OTU names produced by oc.otu_name() from each
    observation's "taxonomy" metadata; when two OTU IDs map to the same name
    the later one replaces the earlier one for that sample.

    :type biomf: Biom file format
    :param biomf: Biom table loaded object

    :type sampleIDs: list
    :param sampleIDs: Only calculate relative abundances for these sampleIDs.
                      Default is None, in which case all sample IDs of the
                      table are used.

    :return: Output of bc.arcsine_sqrt_transform() for a defaultdict(dict)
             keyed on sampleID, each value being a dict keyed on OTU name
             with the normalised abundance for that [sampleID, OTU] pair.
    """
    norm_table = biomf.norm(inplace=False)

    selected_samples = sampleIDs
    if selected_samples is None:
        selected_samples = norm_table.ids()
    all_otus = norm_table.ids(axis="observation")

    per_sample = defaultdict(dict)
    for sid in selected_samples:
        for oid in all_otus:
            amount = norm_table.get_value_by_ids(oid, sid)
            lineage = norm_table.metadata(oid, "observation")["taxonomy"]
            label = oc.otu_name(lineage)
            per_sample[sid][label] = amount

    transformed = bc.arcsine_sqrt_transform(per_sample)
    return transformed
Пример #10
0
def newick_replace_otuids(tree, biomf):
    """
    Return the Newick tree string with OTU ids swapped for OTU names.

    For every observation of the biom table, find_otu() is asked for the
    position of its id in the tree; when a position is returned, the id at
    that position is replaced with the name oc.otu_name() builds from the
    observation's "taxonomy" metadata. Observations that find_otu() does not
    locate leave the tree untouched.
    """
    for _, otu_id, metadata in biomf.iter(axis="observation"):
        position = find_otu(otu_id, tree)
        if otu_loc_missing(position):
            continue
        before = tree[:position]
        after = tree[position + len(otu_id):]
        tree = before + oc.otu_name(metadata["taxonomy"]) + after
    return tree


def otu_loc_missing(position):
    """Tell whether find_otu() reported that the id is not in the tree."""
    return position is None
Пример #11
0
def newick_replace_otuids(tree, biomf):
    """
    Substitute OTU names for the OTU ids found in a Newick tree string.

    Each observation id of the biom table is looked up with find_otu(); if
    a location is returned, the id occupying it is replaced by the name
    oc.otu_name() derives from the observation's "taxonomy" metadata. The
    rewritten tree string is returned.
    """
    renamed = tree
    for _value, observation_id, md in biomf.iter(axis="observation"):
        start = find_otu(observation_id, renamed)
        if start is None:
            # id does not occur in the tree; nothing to replace
            continue
        end = start + len(observation_id)
        replacement = oc.otu_name(md["taxonomy"])
        renamed = renamed[:start] + replacement + renamed[end:]
    return renamed
Пример #12
0
def get_relative_abundance(biomfile):
    """
    Load an OTU table, normalise it and return the per-sample abundances
    after bc.arcsine_sqrt_transform() has been applied.

    OTUIDs are converted to the name returned by oc.otu_name() for their
    "taxonomy" metadata. If several OTUIDs yield the same name, the value of
    the last one iterated is the one kept for a sample.

    :param biomfile: Path of the BIOM format file to load.
    :return: Result of bc.arcsine_sqrt_transform() for a dictionary keyed on
             SampleID whose values are dictionaries keyed on OTU name.
    """
    normalized = biom.load_table(biomfile).norm(inplace=False)

    by_sample = {}
    for sample_id in normalized.ids():
        names_to_abd = {}
        by_sample[sample_id] = names_to_abd
        for otu_id in normalized.ids("observation"):
            taxonomy = normalized.metadata(otu_id, axis="observation")["taxonomy"]
            name = oc.otu_name(taxonomy)
            names_to_abd[name] = normalized.get_value_by_ids(otu_id, sample_id)

    return bc.arcsine_sqrt_transform(by_sample)
def load_core_file(core_fp):
    """
    Read a core OTU data file and map each OTU ID to its OTU name.

    Every data line is expected to hold an OTU ID and a Python-literal
    taxonomy (parsed with ast.literal_eval) separated by a single tab.
    Lines starting with "#" and blank lines are skipped.

    :type core_fp: str
    :param core_fp: A file containing core OTU data.
    :rtype: dict
    :return: Dictionary keyed on OTU ID whose value is the name returned by
             oc.otu_name() for that entry's taxonomy.
    :raises ValueError: If a data line does not split into exactly two
                        tab-separated fields.
    """
    core = {}
    with open(core_fp) as in_f:
        for line in in_f.read().splitlines():
            # A blank line (e.g. at the end of the file) has no tab and
            # would break the two-field unpacking below.
            if not line.strip() or line.startswith("#"):
                continue
            otu_id, tax = line.split("\t")
            core[otu_id] = oc.otu_name(ast.literal_eval(tax))
    return core
def load_core_file(core_fp):
    """
    Parse a core OTU data file into an OTU ID -> OTU name dictionary.

    Data lines consist of an OTU ID and a taxonomy written as a Python
    literal (evaluated with ast.literal_eval), separated by one tab.
    Comment lines (starting with "#") and blank lines are ignored.

    :type core_fp: str
    :param core_fp: A file containing core OTU data.
    :rtype: dict
    :return: Dictionary keyed on OTU ID; each value is the name
             oc.otu_name() returns for the taxonomy on that line.
    :raises ValueError: If a data line does not split into exactly two
                        tab-separated fields.
    """
    core = {}
    with open(core_fp) as in_f:
        for line in in_f.read().splitlines():
            # Skip blank lines: they contain no tab, so unpacking the split
            # result into two names would fail.
            if not line.strip() or line.startswith("#"):
                continue
            otu_id, tax = line.split("\t")
            core[otu_id] = oc.otu_name(ast.literal_eval(tax))
    return core
Пример #15
0
    def test_otu_name(self):
        """
        Check that oc.otu_name() names a lineage with empty family, genus
        and species entries after its order.

        :return: None. Fails through assertEqual when the returned name
                 differs from the hand-calculated one.
        """
        hand_calc = 'Unclassified_Methanosarcinales'

        resolved_ranks = [
            "k__Archaea", "p__Euryarchaeota", "c__Methanomicrobia",
            "o__Methanosarcinales",
        ]
        unresolved_ranks = ["f__", "g__", "s__"]
        self.tax = resolved_ranks + unresolved_ranks

        self.result = oc.otu_name(self.tax)

        # Testing the validity of the otu_name() function
        failure_msg = 'Error! The output is not as expected.'
        self.assertEqual(self.result, hand_calc, msg=failure_msg)
Пример #16
0
    def test_otu_name(self):
        """
        Run oc.otu_name() over several lineages and compare each result with
        the name expected for it.

        :return: None. Fails through assertEqual on the first lineage whose
                 computed name differs from the expected one.
        """
        bacteria_proteo = ["k__Bacteria", "p__Proteobacteria"]
        fusobacteria_prefix = [
            "k__Bacteria", "p__Fusobacteria", "c__Fusobacteria", "o__", "f__",
            "g__Fusobacterium",
        ]

        # Expected name -> lineage handed to otu_name()
        self.taxa = {
            "Unclassified_Methanosarcinales": [
                "k__Archaea", "p__Euryarchaeota", "c__Methanomicrobia",
                "o__Methanosarcinales", "f__", "g__", "s__concilii",
            ],
            "Campylobacter_gracilis": bacteria_proteo + [
                "c__Epsilonproteobacteria", "o__Campylobacterales",
                "f__Campylobacteraceae", "g__Campylobacter", "s__gracilis",
            ],
            "Escherichia_spp.": bacteria_proteo + [
                "c__Gammaproteobacteria", "o__Enterobacteriales",
                "f__Enterobacteriaceae", "g__Escherichia", "s__",
            ],
            "Fusobacterium_nucleatum": fusobacteria_prefix + ["s__nucleatum"],
            "Fusobacterium_spp.": fusobacteria_prefix + ["s__"],
        }

        for expected_name in self.taxa:
            lineage = self.taxa[expected_name]
            self.result = oc.otu_name(lineage)

            # Testing the validity of the otu_name() function
            template = "Error!\nExpected result: {}.\notu_name() result: {}"
            self.assertEqual(
                self.result,
                expected_name,
                msg=template.format(expected_name, self.result))
Пример #17
0
def main():
    """
    Annotate the OTUs of a BIOM table with gram status / oxygen requirement
    data taken from a master gramox file.

    The master file is read as tab-separated text: column 2 is used as the
    OTU name key and columns 3-4 as its data. Every OTU name of the BIOM
    table is written to args.out_fnh, followed by its gramox data when the
    master file has an entry for it and on its own otherwise.
    """
    args = handle_program_options()

    # Read gramox data from master file
    bgodata = {}
    try:
        with open(args.master_fnh, "rU") as bgof:
            for line in bgof:
                fields = line.strip().split("\t")
                # key: second column; value: third and fourth columns
                bgodata[fields[1]] = "\t".join(fields[2:4])
    except Exception as err:
        sys.exit(
            "\nError parsing master gramox file: {}. Please check master gramox data"
            " file and re-run this script.".format(err))

    # Read relative abundance data
    try:
        biomf = biom.load_table(args.biom_file)
    except Exception as err:
        sys.exit("\nError opening BIOM file: {}\n".format(err))

    # Name every observation; errors here are deliberately not intercepted
    otus = []
    for otuid in biomf.ids("observation"):
        taxonomy = biomf.metadata(otuid, "observation")["taxonomy"]
        otus.append(otu_name(taxonomy))

    # Write classified gramox data to tsv file
    print(
        "\nWriting out results to file. For OTUs with missing gramox data, please "
        "manually input the relevant information or fill in NA for unverified "
        "information. This is required before running pct_abd_gramox.py script.\n"
    )
    with open(args.out_fnh, "w") as gramoxout:
        gramoxout.write("#OTU\tGram Status\tOxygen Requirement\n")
        for otu in otus:
            if otu in bgodata:
                record = "{}\t{}\n".format(otu, bgodata[otu])
            else:
                record = "{}\n".format(otu)
            gramoxout.write(record)
Пример #18
0
def main():
    """
    Replace the OTU IDs in a list of IDs (or a BIOM-style tab-separated
    table) with OTU names taken from a taxonomy table.

    Each input row is split on tabs. Rows whose first column starts with "#"
    are copied to the output unchanged; for every other row the first column
    (the OTU ID) is replaced by the name otuc.otu_name() returns and the
    remaining columns are kept. Blank lines are ignored.

    If an OTU ID is missing from the taxonomy table an error is printed and
    processing stops; rows already written stay in the output file.
    """
    args = handle_program_options()

    try:
        with open(args.otu_id_fp, 'rU') as otuF:
            rows = [line.strip() for line in otuF]
    except IOError as ioe:
        sys.exit('\nError with file containing OTUIDs/BIOM format:{}\n'.format(ioe))
    taxa = util.parse_taxonomy_table(args.taxonomy_fp)

    with open(args.output_fp, 'w') as outF:
        for row in rows:
            # A blank row has no first column to inspect; previously it
            # raised IndexError in the comment check.
            if not row:
                continue
            entry = row.split('\t')
            ID = entry[0]

            # check for comments in BIOM files
            if ID.startswith('#'):
                outF.write('{}\n'.format('\t'.join(entry)))
                continue

            if ID not in taxa:
                print('Error: OTU ID {} not found in supplied taxonomy file; stopping...'.format(ID))
                return
            named_ID = otuc.otu_name(taxa[ID].split('; '))

            # write out to file
            outF.write('{}\t{}\n'.format(named_ID, '\t'.join(entry[1:])))
Пример #19
0
def main():
    """
    Write an iTOL dataset file from a BIOM table and a mapping file, and
    optionally rewrite a Newick tree so its OTU IDs become OTU names.

    Samples are grouped with util.gather_categories(). For every group the
    selected metric is computed per OTU (bc.MRA for "MRA"/"NMRA",
    bc.transform_raw_abundance for "raw"). The output file holds a dataset
    header followed by one row per OTU: the OTU label and one value per
    group formatted to five decimals. With "NMRA" each row is divided by its
    own sum. With args.keep_otuids the rows are labelled by OTU ID and the
    tree is left untouched; otherwise OTU names are used.
    """
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit(
            "\nError with OTU_Sample abundance data file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # input data
    biomf = biom.load_table(args.otu_table)
    map_header, imap = util.parse_map_file(args.mapping)

    # rewrite tree file with otu names, skip if keep_otuids specified
    if args.input_tree and not args.keep_otuids:
        with open(args.input_tree) as treF, open(args.output_tre, "w") as outF:
            tree = treF.readline()
            if "'" in tree:
                tree = tree.replace("'", '')
            outF.write(newick_replace_otuids(tree, biomf))

    if not args.keep_otuids:
        oid_rows = {
            id_: md["taxonomy"]
            for val, id_, md in biomf.iter(axis="observation")
        }

    # calculate analysis results
    categories = None
    if args.map_categories is not None and args.analysis_metric != "raw":
        categories = args.map_categories.split(",")

    # set transform if --stabilize_variance is specfied
    tform = bc.arcsine_sqrt_transform if args.stabilize_variance else None

    groups = util.gather_categories(imap, map_header, categories)
    for group in groups.values():
        if args.analysis_metric in ["MRA", "NMRA"]:
            results = bc.MRA(biomf, group.sids, transform=tform)
        elif args.analysis_metric == "raw":
            results = bc.transform_raw_abundance(biomf,
                                                 sampleIDs=group.sids,
                                                 sample_abd=False)
        if args.keep_otuids:
            group.results.update({oid: results[oid] for oid in results})
        else:
            group.results.update(
                {oc.otu_name(oid_rows[oid]): results[oid]
                 for oid in results})

    # write iTol data set file
    with open(args.output_itol_table, "w") as itolF:
        if args.analysis_metric == "raw":
            itolF.write("DATASET_GRADIENT\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\tLog Total Abundance\n")
            itolF.write("COLOR\t#000000\n")
            itolF.write("LEGEND_TITLE\tLog Total Abundance\n")
            itolF.write("LEGEND_SHAPES\t1\n")
            itolF.write("LEGEND_COLORS\t#000000\n")
            itolF.write("LEGEND_LABELS\tLog Total Abundance\n")
            itolF.write("COLOR_MIN\t#FFFFFF\n")
            itolF.write("COLOR_MAX\t#000000\n")
        else:
            itolF.write("DATASET_MULTIBAR\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\t{}\n".format(args.analysis_metric))
            itolF.write("FIELD_COLORS\t{}\n".format("\t".join(
                ["#ff0000" for _ in range(len(groups))])))
            itolF.write("FIELD_LABELS\t" + "\t".join(groups.keys()) + "\n")
            itolF.write("LEGEND_TITLE\t{}\n".format(args.analysis_metric))
            itolF.write("LEGEND_SHAPES\t{}\n".format("\t".join(
                ["1" for _ in range(len(groups))])))
            itolF.write("LEGEND_COLORS\t{}\n".format("\t".join(
                ["#ff0000" for _ in range(len(groups))])))
            itolF.write("LEGEND_LABELS\t" + "\t".join(groups.keys()) + "\n")
            itolF.write("WIDTH\t300\n")
        itolF.write("DATA\n")

        if args.keep_otuids:
            all_otus = frozenset(biomf.ids(axis="observation"))
        else:
            all_otus = frozenset(
                oc.otu_name(md["taxonomy"])
                for val, id_, md in biomf.iter(axis="observation")
            )

        for oname in all_otus:
            # One value per group, in the same order as the FIELD_LABELS
            # header above; an OTU absent from a group counts as 0.0.
            values = [
                group.results[oname] if oname in group.results else 0.0
                for group in groups.values()
            ]
            msum = sum(values)
            # normalize avg relative abundance data
            if args.analysis_metric == "NMRA" and msum > 0:
                values = [value / msum for value in values]
            # Format each value directly instead of building a str.format()
            # template keyed on the group name: names that are not valid
            # format field names (all digits, or containing "." or "[")
            # made that template fail, and a group called "name" overwrote
            # the OTU label.
            row = ["{}".format(oname)]
            row.extend("{:.5f}".format(value) for value in values)
            itolF.write("\t".join(row) + "\n")
Пример #20
0
def main():
    """
    Draw one LDA bubble plot per requested OTU.

    Samples are positioned by the output of run_LDA(), computed either from
    a distance matrix file (args.dist_matrix_file) or from the transformed
    relative abundances of the BIOM table. In each plot a sample's bubble
    size is its transformed abundance of the OTU multiplied by
    args.scale_by, and its colour is that of its args.group_by category.
    Figures are saved to args.output_dir, named after the OTU.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))
    # An unknown category would otherwise surface as a bare ValueError
    # traceback from header.index().
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    # Obtain group colors
    class_colors = util.color_mapping(imap, header, args.group_by, args.color_by)

    # Get otus for LDA bubble plots
    # NOTE(review): read_csv may parse all-numeric IDs as integers, and
    # recent pandas releases appear to reject sep="\n" - confirm against the
    # pandas version in use.
    try:
        bubble_otus = set(pd.read_csv(args.otu_ids_fp, sep="\n", header=None)[0])
    except IOError as ioe:
        err_msg = "\nError in OTU IDs file (--bubble): {}\n"
        sys.exit(err_msg.format(ioe))

    # Load biom file and calculate relative abundance
    try:
        biomf = biom.load_table(args.otu_table)
    except IOError as ioe:
        err_msg = "\nError with biom format file (-d): {}\n"
        sys.exit(err_msg.format(ioe))

    # Get normalized relative abundances
    rel_abd = bc.relative_abundance(biomf)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)
    abd_val = {abd for sid, v1 in rel_abd.items() for otuid, abd in v1.items() if abd > 0}
    bubble_range = np.linspace(min(abd_val), max(abd_val), num=5) * args.scale_by
    # Get abundance to the nearest 50
    bubble_range = [int(50 * round(float(abd)/50)) for abd in bubble_range[1:]]

    # Set up input for LDA calc and get LDA transformed data
    if args.dist_matrix_file:
        try:
            uf_data = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        uf_data.insert(0, "Condition", [imap[sid][category_idx] for sid in uf_data.index])
        sampleids = uf_data.index
        if args.save_lda_input:
            uf_data.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(uf_data)
    else:
        df_rel_abd = pd.DataFrame(rel_abd).T
        df_rel_abd.insert(0, "Condition", [imap[sid][category_idx]
                                           for sid in df_rel_abd.index])
        sampleids = df_rel_abd.index
        if args.save_lda_input:
            df_rel_abd.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(df_rel_abd)

    # Calculate position and size of SampleIDs to plot for each OTU
    for otuid in bubble_otus:
        otuname = oc.otu_name(biomf.metadata(otuid, axis="observation")["taxonomy"])
        plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                     for cat in class_colors.keys()}
        for sid, data in zip(sampleids, X_lda):
            category = plot_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["x"].append(float(data[0]))
            category["y"].append(float(data[1]))
            category["size"].append(size)

        # Plot LDA bubble for each OTU
        fig = plt.figure(figsize=args.figsize)
        ax = fig.add_subplot(111)
        for cat in plot_data:
            plt.scatter(plot_data[cat]["x"], plot_data[cat]["y"],
                        s=plot_data[cat]["size"], label=cat, color=class_colors[cat],
                        alpha=0.85, edgecolors="k")
        if X_lda.shape[1] == 1:
            plt.ylim((0.5, 2.5))
        plt.title(" ".join(otuname.split("_")), style="italic", fontsize=13)
        # Fall back to a plain axis label only when the explained variance
        # for that component is unavailable; a bare except here also hid
        # KeyboardInterrupt and genuine programming errors.
        try:
            plt.xlabel("LD1 (Percent Explained Variance: {:.3f}%)".format(exp_var[0]*100),
                       fontsize=13, labelpad=15)
        except (IndexError, KeyError, TypeError):
            plt.xlabel("LD1", fontsize=13, labelpad=15)
        try:
            plt.ylabel("LD2 (Percent Explained Variance: {:.3f}%)".format(exp_var[1]*100),
                       fontsize=13, labelpad=15)
        except (IndexError, KeyError, TypeError):
            plt.ylabel("LD2", fontsize=13, labelpad=15)

        lgnd1 = plt.legend(loc="best", scatterpoints=3, fontsize=13)
        for i in range(len(class_colors)):
            lgnd1.legendHandles[i]._sizes = [80]  # Change the legend marker size manually
        # Add the legend manually to the current plot
        plt.gca().add_artist(lgnd1)

        c = [plt.scatter([], [], c="w", edgecolors="k", s=s1) for s1 in bubble_range]
        plt.legend(c, ["{}".format(s2) for s2 in bubble_range],
                   title="Scaled Bubble\n       Sizes", frameon=True, labelspacing=2,
                   fontsize=13, loc=4, scatterpoints=1, borderpad=1.1)

        # Set style for LDA bubble plots
        if args.ggplot2_style:
            gu.ggplot2_style(ax)
            fc = "0.8"
        else:
            fc = "none"

        # Save LDA bubble plots to output directory
        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        fig.savefig(pj(args.output_dir, "_".join(otuname.split())) + "." + args.save_as,
                    facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.2)
        plt.close(fig)
Пример #21
0
def main():
    """
    Draw one PCoA bubble plot per requested OTU.

    Samples are positioned by the first two principal coordinates read from
    args.pcoa_fp. In each plot a sample's bubble size is its transformed
    relative abundance of the OTU multiplied by args.scale_by, and samples
    are grouped by the args.group_by mapping category. Plotting itself is
    delegated to plot_PCoA().
    """
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with BIOM format file:{}\n".format(ioe))

    try:
        with open(args.pcoa_fp):
            pass
    except IOError as ioe:
        sys.exit("\nError with principal coordinates file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file
    otus = set()
    try:
        with open(args.otu_ids_fp, "rU") as nciF:
            for line in nciF:
                line = line.strip()
                # A blank line would otherwise be added as an empty OTU ID
                # and then looked up in the BIOM table below.
                if line:
                    otus.add(line)
    except IOError as ioe:
        # Same friendly failure as for the other input files
        sys.exit("\nError with OTU IDs file:{}\n".format(ioe))

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = bc.relative_abundance(biomtbl)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(
            biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {
            cat: {
                "pc1": [],
                "pc2": [],
                "size": []
            }
            for cat in category_ids
        }

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # sample or OTU missing from the abundance data: skip it
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map, xr,
                  yr, args.output_dir, args.save_as, args.ggplot2_style)
Пример #22
0
def main():
    """Write an iTOL dataset file of per-group OTU abundances.

    Loads the BIOM table and mapping file named on the command line,
    optionally rewrites the input tree through ``newick_replace_otuids``,
    computes the requested metric (``MRA``/``NMRA`` via ``bc.MRA`` or
    ``raw`` via ``bc.transform_raw_abundance``) for every group returned by
    ``util.gather_categories`` and writes one tab-separated row per OTU
    name to ``args.output_itol_table``.

    Exits through ``sys.exit`` when an input file cannot be opened or the
    analysis metric is not one of the supported values.
    """
    args = handle_program_options()

    # Probe the required inputs first so a bad path produces a short,
    # readable message instead of a traceback further down.
    required_inputs = (
        (args.otu_table, "\nError with OTU_Sample abundance data file:{}\n"),
        (args.mapping, "\nError with mapping file:{}\n"),
    )
    for path, message in required_inputs:
        try:
            with open(path):
                pass
        except IOError as ioe:
            sys.exit(message.format(ioe))

    # input data
    biomf = biom.load_table(args.otu_table)
    map_header, imap = util.parse_map_file(args.mapping)

    # rewrite tree file with otu names
    if args.input_tree:
        with open(args.input_tree) as treF, open(args.output_tre, "w") as outF:
            # Only the first line of the tree file is read; single quotes
            # are dropped before the OTU IDs are replaced.
            tree = treF.readline().replace("'", "")
            outF.write(newick_replace_otuids(tree, biomf))

    # OTU ID -> taxonomy metadata, used to turn result keys into OTU names
    oid_rows = {id_: md["taxonomy"]
                for _, id_, md in biomf.iter(axis="observation")}

    # calculate analysis results
    categories = None
    if args.map_categories is not None:
        categories = args.map_categories.split(",")

    # set transform if --stabilize_variance is specified
    tform = bc.arcsine_sqrt_transform if args.stabilize_variance else None

    groups = util.gather_categories(imap, map_header, categories)
    for group in groups.values():
        if args.analysis_metric in ("MRA", "NMRA"):
            results = bc.MRA(biomf, group.sids, transform=tform)
        elif args.analysis_metric == "raw":
            results = bc.transform_raw_abundance(biomf, sampleIDs=group.sids,
                                                 sample_abd=False)
        else:
            # Without this branch `results` would be unbound (NameError).
            sys.exit("Error: Unsupported analysis metric '{}'."
                     .format(args.analysis_metric))
        group.results.update({oc.otu_name(oid_rows[oid]): results[oid]
                              for oid in results})

    # Fix the group order once so header columns and data columns line up.
    group_names = list(groups.keys())
    group_results = [groups[name].results for name in group_names]
    n_groups = len(group_names)

    # write iTol data set file
    with open(args.output_itol_table, "w") as itolF:
        if args.analysis_metric == "raw":
            itolF.write("DATASET_GRADIENT\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\tLog Total Abundance\n")
            itolF.write("COLOR\t#000000\n")
            itolF.write("LEGEND_TITLE\tLog Total Abundance\n")
            itolF.write("LEGEND_SHAPES\t1\n")
            itolF.write("LEGEND_COLORS\t#000000\n")
            itolF.write("LEGEND_LABELS\tLog Total Abundance\n")
            itolF.write("COLOR_MIN\t#FFFFFF\n")
            itolF.write("COLOR_MAX\t#000000\n")
        else:
            colors = "\t".join(["#ff0000"] * n_groups)
            shapes = "\t".join(["1"] * n_groups)
            labels = "\t".join(group_names)
            itolF.write("DATASET_MULTIBAR\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\tNMRA\n")
            itolF.write("FIELD_COLORS\t{}\n".format(colors))
            itolF.write("FIELD_LABELS\t" + labels + "\n")
            itolF.write("LEGEND_TITLE\tNMRA\n")
            itolF.write("LEGEND_SHAPES\t{}\n".format(shapes))
            itolF.write("LEGEND_COLORS\t{}\n".format(colors))
            itolF.write("LEGEND_LABELS\t" + labels + "\n")
            itolF.write("WIDTH\t300\n")
        itolF.write("DATA\n")
        all_otus = frozenset({oc.otu_name(md["taxonomy"])
                              for val, id_, md in
                              biomf.iter(axis="observation")})

        normalize = args.analysis_metric == "NMRA"
        for oname in all_otus:
            # One value per group; OTUs missing from a group count as 0.0.
            values = [res[oname] if oname in res else 0.0
                      for res in group_results]
            # normalize avg relative abundance data
            if normalize:
                msum = sum(values)
                if msum > 0:
                    values = [value / msum for value in values]
            # Format the numbers directly: using group names as str.format
            # field names breaks for names such as "1" or "Site.A" and lets
            # a group called "name" overwrite the OTU label.
            fields = [oname] + ["{:.5f}".format(value) for value in values]
            itolF.write("\t".join(fields) + "\n")
Пример #23
0
def main():
    """Plot PCoA charts with points sized by an OTU's relative abundance.

    For every OTU ID listed in ``args.otu_ids_fp`` the samples found in the
    parsed unifrac file are grouped by the ``args.group_by`` mapping
    category; each sample contributes its first two principal coordinates
    and a size equal to its relative abundance for that OTU multiplied by
    ``args.scale_by``. The grouped data is handed to ``plot_PCoA``.

    Exits through ``sys.exit`` when an input file cannot be opened, the
    output directory cannot be created, or the mapping category is unknown.
    """
    # Function-scope import: needed for the ENOENT comparison below.
    import errno

    args = handle_program_options()

    # Probe the required inputs first so a bad path produces a short,
    # readable message instead of a traceback further down.
    required_inputs = (
        (args.otu_table, "\nError with BIOM format file:{}\n"),
        (args.pcoa_fp, "\nError with principal coordinates file:{}\n"),
        (args.mapping, "\nError with mapping file:{}\n"),
    )
    for path, message in required_inputs:
        try:
            with open(path):
                pass
        except IOError as ioe:
            sys.exit(message.format(ioe))

    if not os.path.exists(args.output_dir):
        try:
            os.mkdir(args.output_dir)
        except OSError as oe:
            # Inspect the raised error's own code. The previous test
            # (`os.errno == 2`) compared a module with an int and could
            # never be true, so this message was unreachable.
            if oe.errno == errno.ENOENT:
                msg = ("One or more directories in the path provided for "
                       "--output-dir ({}) do not exist. If you are specifying "
                       "a new directory for output, please ensure all other "
                       "directories in the path currently exist.")
                sys.exit(msg.format(args.output_dir))
            else:
                msg = ("An error occurred trying to create the output "
                       "directory ({}) with message: {}")
                sys.exit(msg.format(args.output_dir, oe.strerror))

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file. The "U" mode flag was removed from open() in
    # Python 3.11; splitlines() copes with \n, \r\n and \r on its own.
    with open(args.otu_ids_fp) as nciF:
        otus = {line.strip() for line in nciF.read().splitlines()}
    # Blank lines would otherwise be looked up as an empty OTU ID.
    otus.discard("")

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = get_relative_abundance(biomtbl)

    pcd = unifrac["pcd"]

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(
            biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {cat: {"pc1": [], "pc2": [], "size": []}
                    for cat in category_ids}

        for sid in pcd:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # Sample or OTU missing from the abundance data: skip it.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(pcd[sid][0]))
            category["pc2"].append(float(pcd[sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(),
                  color_map, xr, yr, args.output_dir,
                  args.save_as, args.ggplot2_style)
Пример #24
0
def main():
    """Plot PCoA charts with points sized by an OTU's relative abundance.

    For every OTU ID listed in ``args.otu_ids_fp`` the samples found in the
    parsed unifrac file are grouped by the ``args.group_by`` mapping
    category; each sample contributes its first two principal coordinates
    and a size equal to its arcsine-sqrt transformed relative abundance for
    that OTU multiplied by ``args.scale_by``. The grouped data is handed to
    ``plot_PCoA``.

    Exits through ``sys.exit`` when an input file cannot be opened or the
    mapping category is unknown.
    """
    args = handle_program_options()

    # Probe the required inputs first so a bad path produces a short,
    # readable message instead of a traceback further down.
    required_inputs = (
        (args.otu_table, "\nError with BIOM format file:{}\n"),
        (args.pcoa_fp, "\nError with principal coordinates file:{}\n"),
        (args.mapping, "\nError with mapping file:{}\n"),
    )
    for path, message in required_inputs:
        try:
            with open(path):
                pass
        except IOError as ioe:
            sys.exit(message.format(ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file. The "U" mode flag was removed from open() in
    # Python 3.11; splitlines() copes with \n, \r\n and \r on its own.
    with open(args.otu_ids_fp) as nciF:
        otus = {line.strip() for line in nciF.read().splitlines()}
    # Blank lines would otherwise be looked up as an empty OTU ID.
    otus.discard("")

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = bc.arcsine_sqrt_transform(bc.relative_abundance(biomtbl))

    pcd = unifrac["pcd"]

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(
            biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {cat: {"pc1": [], "pc2": [], "size": []}
                    for cat in category_ids}

        for sid in pcd:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # Sample or OTU missing from the abundance data: skip it.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(pcd[sid][0]))
            category["pc2"].append(float(pcd[sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(),
                  color_map, xr, yr, args.output_dir,
                  args.save_as, args.ggplot2_style)