def main():
    """
    Translate between OTU IDs and OTU names using a taxonomy table.

    Forward mode (default): the input is a tab-separated table whose first
    column is an OTU ID. Each row is written back with the ID replaced by its
    OTU name; rows starting with "#" are copied through unchanged.

    Reverse mode (``args.reverse_lookup``): the input lists one OTU name per
    line and the matching OTU ID is written for each.

    Blank input lines are ignored. Processing stops with a printed error as
    soon as an entry cannot be found in the taxonomy table.
    """
    args = handle_program_options()

    # Plain "r" is used because the "U" flag was removed in Python 3.11.
    try:
        with open(args.otu_id_fp, "r") as otuF:
            stripped = [line.strip() for line in otuF]
    except IOError as ioe:
        sys.exit("\nError with file containing OTUIDs/BIOM format:{}\n".format(ioe))
    # A blank line carries no identifier and used to crash the forward mode.
    lines = [line for line in stripped if line]

    taxa = util.parse_taxonomy_table(args.taxonomy_fp)

    with open(args.output_fp, "w") as outF:
        if args.reverse_lookup:
            # Build the name -> ID table once instead of rescanning the whole
            # taxonomy for every query. When several IDs share a name the
            # last one wins, as in the original scan.
            name_to_id = {}
            for otu_id, fulltaxa in taxa.items():
                name_to_id[otuc.otu_name(fulltaxa.split("; "))] = otu_id

            for name in lines:
                if name not in name_to_id:
                    # Previously this reused the ID of an earlier match (or
                    # raised NameError); fail explicitly instead.
                    print("Error: OTU name {} not found in supplied "
                          "taxonomy file.".format(name))
                    return
                outF.write("{}\n".format(name_to_id[name]))
            return

        for line in lines:
            entry = line.split("\t")
            # check for comments in BIOM files
            if entry[0].startswith("#"):
                outF.write("{}\n".format("\t".join(entry)))
                continue

            ID = entry[0]
            if ID not in taxa:
                print("Error: OTU ID {} not found in supplied taxonomy file."
                      .format(ID))
                return
            named_ID = otuc.otu_name(taxa[ID].split("; "))
            outF.write("{}\t{}\n".format(named_ID, "\t".join(entry[1:])))
def main():
    """
    Map OTU IDs to OTU names (or names to IDs) using a BIOM table.

    The first column of ``args.otu_id_fp`` is read as the list of queries.
    For every observation in the BIOM table the OTU name is derived from its
    "taxonomy" metadata; matching query/result pairs are written to
    ``args.output_fp`` as a two-column, tab-separated file with an
    "Input\\tOutput" header. With ``args.reverse_lookup`` the queries are OTU
    names and the results are OTU IDs.
    """
    args = handle_program_options()

    # Read biom format file
    try:
        biomf = biom.load_table(args.otu_table)
    except IOError as ioe:
        sys.exit("\nError with biom format file (-i): {}\n".format(ioe))

    # Read in otus file data
    try:
        # Plain "r": the "U" flag was removed in Python 3.11.
        with open(args.otu_id_fp, "r") as inf:
            try:
                dialect = csv.Sniffer().sniff(inf.read(1024))
            except csv.Error:
                # A single-column (or empty) file has no delimiter for the
                # sniffer to find; treat each line as one tab-separated row.
                dialect = csv.excel_tab
            inf.seek(0)
            # Blank lines yield empty rows, which are skipped. A set makes
            # the per-observation membership test below O(1).
            otu_ids = {row[0] for row in csv.reader(inf, dialect) if row}
    except IOError as ioe:
        sys.exit("\nError with file containing OTUs:{}\n".format(ioe))

    output = {}
    for val, idx, md in biomf.iter(axis="observation"):
        name = otuc.otu_name(md["taxonomy"])
        if args.reverse_lookup:
            # Get otuids from otu names
            if name in otu_ids:
                output[name] = idx
        elif idx in otu_ids:
            # Get otu name from otu IDs
            output[idx] = name

    with open(args.output_fp, "w") as outf:
        outf.write("Input\tOutput\n")
        for k, v in output.items():
            outf.write("{0}\t{1}\n".format(k, v))
def write_relative_abundance(rel_abd, biomf, out_fn, sort_by=None):
    """
    Write relative abundances to a tab-separated file with OTUs as rows and
    samples as columns.

    :type rel_abd: dict
    :param rel_abd: Nested mapping indexed as rel_abd[sample ID][OTU ID].
                    Missing sample/OTU pairs are written as '0'.
    :type biomf: biom object
    :param biomf: BIOM table supplying the sample IDs, the observation IDs
                  and the "taxonomy" metadata used to name each row.
    :type out_fn: str
    :param out_fn: The full path to the desired output file.
    :type sort_by: function
    :param sort_by: Sorting key that determines the order in which the
                    sample IDs appear as columns in the output file.
    """
    with open(out_fn, 'w') as out_f:
        sample_ids = sorted(set(biomf.ids()), key=sort_by)
        header = '\t'.join(sample_ids)
        out_f.write('#OTU ID\t{}\n'.format(header))

        for otuid in biomf.ids(axis="observation"):
            taxonomy = biomf.metadata(otuid, "observation")["taxonomy"]
            row_label = oc.otu_name(taxonomy)

            cells = []
            for sid in sample_ids:
                if sid in rel_abd and otuid in rel_abd[sid]:
                    cells.append(str(rel_abd[sid][otuid]))
                else:
                    cells.append('0')

            out_f.write('{}\t{}\n'.format(row_label, '\t'.join(cells)))
def test_otu_name(self):
    """
    Check otu_name() of otu_calc.py against hand-derived names.

    Each key of the fixture is the expected OTU name for the lineage stored
    as its value.

    :return: Returns OK if the test goals were achieved, otherwise raises
             error.
    """
    methanosarcinales = [
        "k__Archaea", "p__Euryarchaeota", "c__Methanomicrobia",
        "o__Methanosarcinales", "f__", "g__", "s__concilii",
    ]
    campylobacter = [
        "k__Bacteria", "p__Proteobacteria", "c__Epsilonproteobacteria",
        "o__Campylobacterales", "f__Campylobacteraceae", "g__Campylobacter",
        "s__gracilis",
    ]
    escherichia = [
        "k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria",
        "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia",
        "s__",
    ]
    self.tax = {
        "Unclassified_Methanosarcinales": methanosarcinales,
        "Campylobacter_gracilis": campylobacter,
        "Escherichia_spp.": escherichia,
    }

    failure_msg = 'Error! Expected result: {}. otu_name() result: {}'
    for wanted, lineage in self.tax.items():
        self.result = oc.otu_name(lineage)

        # The computed name must equal the dictionary key.
        self.assertEqual(
            self.result, wanted,
            msg=failure_msg.format(wanted, self.result)
        )
def get_relative_abundance(biomfile):
    """
    Return arcsine transformed relative abundance from a BIOM format file.

    :type biomfile: BIOM format file
    :param biomfile: BIOM format file used to obtain relative abundances for
                     each OTU in a SampleID, which are used as node sizes in
                     network plots.
    :rtype: Dictionary of dictionaries.
    :return: Dictionary keyed on SampleID whose value is a dictionary keyed
             on OTU Name (underscores replaced by spaces) whose value is the
             arcsine transformed relative abundance value for that
             SampleID-OTU Name pair.
    """
    biomf = biom.load_table(biomfile)
    norm_biomf = biomf.norm(inplace=False)

    # An OTU's name does not depend on the sample, so derive it once per OTU
    # rather than once per (sample, OTU) pair.
    named_otus = []
    for otuid in norm_biomf.ids("observation"):
        taxonomy = norm_biomf.metadata(otuid, axis="observation")["taxonomy"]
        otuname = " ".join(oc.otu_name(taxonomy).split("_"))
        named_otus.append((otuid, otuname))

    rel_abd = {}
    for sid in norm_biomf.ids():
        # NOTE(review): OTU IDs that share a name overwrite each other (the
        # last one wins), as before - confirm whether they should be summed.
        rel_abd[sid] = {
            otuname: norm_biomf.get_value_by_ids(otuid, sid)
            for otuid, otuname in named_otus
        }

    return bc.arcsine_sqrt_transform(rel_abd)
def get_relative_abundance(biomfile):
    """
    Return arcsine transformed relative abundance from a BIOM format file.

    :type biomfile: BIOM format file
    :param biomfile: BIOM format file used to obtain relative abundances for
                     each OTU in a SampleID, which are used as node sizes in
                     network plots.
    :rtype: Dictionary of dictionaries.
    :return: Dictionary keyed on SampleID whose value is a dictionary keyed
             on OTU Name (underscores replaced by spaces) whose value is the
             arcsine transformed relative abundance value for that
             SampleID-OTU Name pair.
    """
    biomf = biom.load_table(biomfile)
    norm_biomf = biomf.norm(inplace=False)

    # Resolve every OTU ID to its display name up front; the name is the same
    # for all samples, so there is no need to recompute it inside the sample
    # loop.
    otu_ids = list(norm_biomf.ids("observation"))
    otu_names = [
        oc.otu_name(
            norm_biomf.metadata(otuid, axis="observation")["taxonomy"]
        ).replace("_", " ")
        for otuid in otu_ids
    ]

    rel_abd = {}
    for sid in norm_biomf.ids():
        sample_abd = {}
        for otuid, otuname in zip(otu_ids, otu_names):
            # NOTE(review): a name shared by several OTU IDs keeps only the
            # last value, as before - confirm this is intended.
            sample_abd[otuname] = norm_biomf.get_value_by_ids(otuid, sid)
        rel_abd[sid] = sample_abd

    ast_rel_abd = bc.arcsine_sqrt_transform(rel_abd)
    return ast_rel_abd
def write_relative_abundance(rel_abd, biomf, out_fn, sort_by=None):
    """
    Dump a relative abundance table as tab-separated text: one row per OTU
    (labelled with its OTU name) and one column per sample.

    :type rel_abd: dict
    :param rel_abd: Nested mapping indexed as rel_abd[sample ID][OTU ID];
                    pairs that are absent are reported as '0'.
    :type biomf: biom object
    :param biomf: BIOM table providing sample IDs, observation IDs and the
                  "taxonomy" observation metadata.
    :type out_fn: str
    :param out_fn: The full path to the desired output file.
    :type sort_by: function
    :param sort_by: Key function controlling the column order of the sample
                    IDs in the output file.
    """
    def abundance_text(sid, otuid):
        # '0' stands in for any sample/OTU pair missing from rel_abd.
        if sid not in rel_abd:
            return '0'
        if otuid not in rel_abd[sid]:
            return '0'
        return str(rel_abd[sid][otuid])

    with open(out_fn, 'w') as out_f:
        sids = sorted(set(biomf.ids()), key=sort_by)
        out_f.write('#OTU ID\t{}\n'.format('\t'.join(sids)))

        for otuid in biomf.ids(axis="observation"):
            otu_md = biomf.metadata(otuid, "observation")
            name = oc.otu_name(otu_md["taxonomy"])
            columns = '\t'.join([abundance_text(sid, otuid) for sid in sids])
            out_f.write('{}\t{}\n'.format(name, columns))
def calc_rel_abd(biomf, sampleIDs=None):
    """
    Calculate arcsine square-root transformed relative abundance from a biom
    table, optionally restricted to a subset of samples.

    :type biomf: Biom file format
    :param biomf: Biom table loaded object
    :type sampleIDs: list
    :param sampleIDs: Only calculate relative abundances for these sampleIDs.
                      Default is None, meaning every sample in the table.
    :rtype: whatever bc.arcsine_sqrt_transform returns
    :return: The transform applied to a defaultdict(dict) keyed on sampleID
             whose values are dicts keyed on OTU name (not OTU ID) holding
             the relative abundance for that [sampleID, OTU name] pair.
    """
    norm_biomf = biomf.norm(inplace=False)
    if sampleIDs is None:
        sampleIDs = norm_biomf.ids()

    # The OTU name is the same for every sample, so look up the taxonomy and
    # derive the name once per OTU instead of once per (sample, OTU) pair.
    named_otus = []
    for otu in norm_biomf.ids(axis="observation"):
        otu_tax = norm_biomf.metadata(otu, "observation")["taxonomy"]
        named_otus.append((otu, oc.otu_name(otu_tax)))

    rel_abd = defaultdict(dict)
    for sample in sampleIDs:
        for otu, otu_name in named_otus:
            # NOTE(review): OTUs sharing a name overwrite each other (last
            # wins), as before - confirm whether they should be summed.
            rel_abd[sample][otu_name] = norm_biomf.get_value_by_ids(otu, sample)

    trans_rel_abd = bc.arcsine_sqrt_transform(rel_abd)
    return trans_rel_abd
def newick_replace_otuids(tree, biomf):
    """
    Return the Newick tree string with each OTU ID found in it replaced by
    the OTU name derived from that observation's "taxonomy" metadata.

    OTU IDs that find_otu() cannot locate in the tree are left untouched.
    """
    for _, otu_id, metadata in biomf.iter(axis="observation"):
        start = find_otu(otu_id, tree)
        if start is None:
            continue
        end = start + len(otu_id)
        # Splice the name in place of the ID, keeping the text on both sides.
        tree = "".join(
            (tree[:start], oc.otu_name(metadata["taxonomy"]), tree[end:])
        )
    return tree
def get_relative_abundance(biomfile):
    """
    Return arcsine square-root transformed relative abundance from an OTU
    table. OTU IDs are converted to their genus-species identifier.

    :type biomfile: BIOM format file
    :param biomfile: Path passed to biom.load_table().
    :return: Result of bc.arcsine_sqrt_transform() applied to a dict keyed on
             sample ID whose values are dicts keyed on OTU name holding the
             normalized abundance.
    """
    biomf = biom.load_table(biomfile)
    norm_biomf = biomf.norm(inplace=False)

    # Name each OTU once; the name is independent of the sample, so it does
    # not belong inside the per-sample loop.
    named_otus = [
        (otuid,
         oc.otu_name(norm_biomf.metadata(otuid, axis="observation")["taxonomy"]))
        for otuid in norm_biomf.ids("observation")
    ]

    rel_abd = {}
    for sid in norm_biomf.ids():
        rel_abd[sid] = {}
        for otuid, otuname in named_otus:
            # NOTE(review): OTU IDs with the same name overwrite each other
            # (last wins), as before - confirm this is intended.
            rel_abd[sid][otuname] = norm_biomf.get_value_by_ids(otuid, sid)

    ast_rel_abd = bc.arcsine_sqrt_transform(rel_abd)
    return ast_rel_abd
def load_core_file(core_fp):
    """
    For core OTU data file, returns Genus-species identifier for each data
    entry.

    Each data line must hold exactly two tab-separated fields: the OTU ID and
    a Python literal (parsed with ast.literal_eval) that is handed to
    oc.otu_name(). Lines starting with "#" and blank lines are skipped.

    :type core_fp: str
    :param core_fp: A file containing core OTU data.
    :rtype: dict
    :return: Mapping of OTU ID to the genus-species identifier based on the
             identified taxonomical level.
    :raises ValueError: If a data line does not have exactly two fields.
    """
    core = {}
    with open(core_fp) as in_f:
        for line in in_f.read().splitlines():
            # A blank (or whitespace-only) line, e.g. at the end of the file,
            # used to abort the whole load with an unpacking error.
            if not line.strip() or line.startswith("#"):
                continue
            otu_id, tax = line.split("\t")
            core[otu_id] = oc.otu_name(ast.literal_eval(tax))
    return core
def test_otu_name(self):
    """
    Check otu_name() of otu_calc.py on a lineage with no family, genus or
    species assigned.

    :return: Returns OK if the test goals were achieved, otherwise raises
             error.
    """
    classified_ranks = [
        "k__Archaea", "p__Euryarchaeota", "c__Methanomicrobia",
        "o__Methanosarcinales",
    ]
    empty_ranks = ["f__", "g__", "s__"]
    self.tax = classified_ranks + empty_ranks

    # Name worked out by hand for the lineage above.
    expected_name = 'Unclassified_Methanosarcinales'
    self.result = oc.otu_name(self.tax)

    self.assertEqual(
        self.result, expected_name,
        msg='Error! The output is not as expected.'
    )
def test_otu_name(self):
    """
    Check otu_name() of otu_calc.py against hand-derived names.

    The fixture maps each expected OTU name to the lineage that should
    produce it.

    :return: Returns OK if the test goals were achieved, otherwise raises
             error.
    """
    # Lineage prefixes shared by several cases.
    fusobacteria_ranks = [
        "k__Bacteria", "p__Fusobacteria", "c__Fusobacteria", "o__", "f__",
        "g__Fusobacterium",
    ]
    cases = [
        ("Unclassified_Methanosarcinales", [
            "k__Archaea", "p__Euryarchaeota", "c__Methanomicrobia",
            "o__Methanosarcinales", "f__", "g__", "s__concilii",
        ]),
        ("Campylobacter_gracilis", [
            "k__Bacteria", "p__Proteobacteria", "c__Epsilonproteobacteria",
            "o__Campylobacterales", "f__Campylobacteraceae",
            "g__Campylobacter", "s__gracilis",
        ]),
        ("Escherichia_spp.", [
            "k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria",
            "o__Enterobacteriales", "f__Enterobacteriaceae",
            "g__Escherichia", "s__",
        ]),
        ("Fusobacterium_nucleatum", fusobacteria_ranks + ["s__nucleatum"]),
        ("Fusobacterium_spp.", fusobacteria_ranks + ["s__"]),
    ]
    self.taxa = dict(cases)

    template = "Error!\nExpected result: {}.\notu_name() result: {}"
    for wanted, lineage in self.taxa.items():
        self.result = oc.otu_name(lineage)

        # The computed name must equal the dictionary key.
        self.assertEqual(
            self.result, wanted,
            msg=template.format(wanted, self.result))
def main():
    """
    Write a gram-status / oxygen-requirement table for the OTUs of a BIOM
    file.

    The master gramox file is read as tab-separated text: column 2 is used as
    the OTU name key and columns 3-4 as its data. Every OTU name derived from
    the BIOM table's "taxonomy" metadata is written to ``args.out_fnh``,
    followed by its gramox data when the master file has an entry for it;
    otherwise only the name is written so the user can fill it in manually.
    """
    args = handle_program_options()

    # Read gramox data from master file
    try:
        bgodata = {}
        # Plain "r": the "U" flag was removed in Python 3.11.
        with open(args.master_fnh, "r") as bgof:
            for line in bgof:
                fields = line.strip().split("\t")
                if fields == [""]:
                    # Blank lines carry no record; they used to abort the run
                    # with an index error.
                    continue
                bgodata[fields[1]] = "\t".join(fields[2:4])
    except Exception as err:
        # Top-level boundary: any parsing problem ends the script with a
        # readable message.
        sys.exit(
            "\nError parsing master gramox file: {}. Please check master gramox data"
            " file and re-run this script.".format(err))

    # Read relative abundance data
    try:
        biomf = biom.load_table(args.biom_file)
    except Exception as err:
        sys.exit("\nError opening BIOM file: {}\n".format(err))
    else:
        otus = [
            otu_name(biomf.metadata(otuid, "observation")["taxonomy"])
            for otuid in biomf.ids("observation")
        ]

    # Write classified gramox data to tsv file
    print(
        "\nWriting out results to file. For OTUs with missing gramox data, please "
        "manually input the relevant information or fill in NA for unverified "
        "information. This is required before running pct_abd_gramox.py script.\n"
    )
    with open(args.out_fnh, "w") as gramoxout:
        gramoxout.write("#OTU\tGram Status\tOxygen Requirement\n")
        for otu in otus:
            if otu in bgodata:
                gramoxout.write("{}\t{}\n".format(otu, bgodata[otu]))
            else:
                gramoxout.write("{}\n".format(otu))
def main():
    """
    Replace the OTU IDs of a tab-separated table with OTU names.

    Each row of ``args.otu_id_fp`` is split on tabs; the first field is looked
    up in the taxonomy table and the row is written to ``args.output_fp`` with
    the ID replaced by its OTU name. Rows starting with "#" are copied through
    unchanged and blank lines are ignored. Processing stops with a printed
    error at the first OTU ID missing from the taxonomy table.
    """
    args = handle_program_options()

    # Plain "r" is used because the "U" flag was removed in Python 3.11.
    try:
        with open(args.otu_id_fp, 'r') as otuF:
            stripped = [line.strip() for line in otuF]
    except IOError as ioe:
        sys.exit('\nError with file containing OTUIDs/BIOM format:{}\n'.format(ioe))
    # A blank line has no ID; it used to raise IndexError below.
    otu_ids = [line.split('\t') for line in stripped if line]

    taxa = util.parse_taxonomy_table(args.taxonomy_fp)

    with open(args.output_fp, 'w') as outF:
        for entry in otu_ids:
            # check for comments in BIOM files
            if entry[0].startswith('#'):
                outF.write('{}\n'.format('\t'.join(entry)))
                continue

            ID = entry[0]
            if ID not in taxa:
                print('Error: OTU ID {} not found in supplied taxonomy file; '
                      'stopping...'.format(ID))
                return
            named_ID = otuc.otu_name(taxa[ID].split('; '))

            # write out to file
            outF.write('{}\t{}\n'.format(named_ID, '\t'.join(entry[1:])))
def main():
    """
    Build an iTOL dataset file (and optionally a renamed tree) from a BIOM
    table and a mapping file.

    Steps:
      1. Check that the BIOM and mapping files can be opened.
      2. Unless ``--keep_otuids`` is given, rewrite the first line of the
         input tree with OTU names in place of OTU IDs.
      3. Compute MRA/NMRA or raw abundance results per sample group.
      4. Write the iTOL header (gradient for "raw", multibar otherwise)
         followed by one tab-separated data row per OTU.
    """
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit(
            "\nError with OTU_Sample abundance data file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # input data
    biomf = biom.load_table(args.otu_table)
    map_header, imap = util.parse_map_file(args.mapping)

    # rewrite tree file with otu names, skip if keep_otuids specified
    if args.input_tree and not args.keep_otuids:
        with open(args.input_tree) as treF, open(args.output_tre, "w") as outF:
            tree = treF.readline()
            if "'" in tree:
                tree = tree.replace("'", '')
            outF.write(newick_replace_otuids(tree, biomf))

    if not args.keep_otuids:
        oid_rows = {
            id_: md["taxonomy"]
            for val, id_, md in biomf.iter(axis="observation")
        }

    # calculate analysis results
    categories = None
    if args.map_categories is not None and args.analysis_metric != "raw":
        categories = args.map_categories.split(",")

    # set transform if --stabilize_variance is specified
    tform = bc.arcsine_sqrt_transform if args.stabilize_variance else None

    groups = util.gather_categories(imap, map_header, categories)
    for group in groups.values():
        if args.analysis_metric in ["MRA", "NMRA"]:
            results = bc.MRA(biomf, group.sids, transform=tform)
        elif args.analysis_metric == "raw":
            results = bc.transform_raw_abundance(biomf,
                                                 sampleIDs=group.sids,
                                                 sample_abd=False)
        if args.keep_otuids:
            group.results.update({oid: results[oid] for oid in results})
        else:
            group.results.update(
                {oc.otu_name(oid_rows[oid]): results[oid] for oid in results})

    # write iTol data set file
    with open(args.output_itol_table, "w") as itolF:
        if args.analysis_metric == "raw":
            itolF.write("DATASET_GRADIENT\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\tLog Total Abundance\n")
            itolF.write("COLOR\t#000000\n")
            itolF.write("LEGEND_TITLE\tLog Total Abundance\n")
            itolF.write("LEGEND_SHAPES\t1\n")
            itolF.write("LEGEND_COLORS\t#000000\n")
            itolF.write("LEGEND_LABELS\tLog Total Abundance\n")
            itolF.write("COLOR_MIN\t#FFFFFF\n")
            itolF.write("COLOR_MAX\t#000000\n")
        else:
            group_count = len(groups)
            group_labels = "\t".join(groups.keys())
            itolF.write("DATASET_MULTIBAR\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\t{}\n".format(args.analysis_metric))
            itolF.write("FIELD_COLORS\t{}\n".format(
                "\t".join(["#ff0000"] * group_count)))
            itolF.write("FIELD_LABELS\t" + group_labels + "\n")
            itolF.write("LEGEND_TITLE\t{}\n".format(args.analysis_metric))
            itolF.write("LEGEND_SHAPES\t{}\n".format(
                "\t".join(["1"] * group_count)))
            itolF.write("LEGEND_COLORS\t{}\n".format(
                "\t".join(["#ff0000"] * group_count)))
            itolF.write("LEGEND_LABELS\t" + group_labels + "\n")
            itolF.write("WIDTH\t300\n")
        itolF.write("DATA\n")

        if args.keep_otuids:
            all_otus = frozenset(biomf.ids(axis="observation"))
        else:
            all_otus = frozenset({
                oc.otu_name(md["taxonomy"])
                for val, id_, md in biomf.iter(axis="observation")
            })

        for oname in all_otus:
            # One value per group, in the same order as the FIELD_LABELS
            # header; OTUs a group has no result for count as 0.0.
            values = [
                group.results[oname] if oname in group.results else 0.0
                for group in groups.values()
            ]
            msum = sum(values)

            # normalize avg relative abundance data
            if args.analysis_metric == "NMRA" and msum > 0:
                values = [value / msum for value in values]

            # Format each value directly. The previous approach turned group
            # names into str.format field names, which failed for names
            # containing ".", "[", braces or digits and clashed with a group
            # literally called "name".
            cells = ["{}".format(oname)]
            cells.extend("{:.5f}".format(value) for value in values)
            itolF.write("\t".join(cells) + "\n")
def main():
    """
    Draw one LDA bubble plot per requested OTU.

    Samples are positioned by their LDA coordinates (computed either from a
    distance matrix file or from the arcsine square-root transformed relative
    abundances), colored by mapping category, and sized by the transformed
    relative abundance of the OTU multiplied by ``args.scale_by``. Each
    figure is saved into ``args.output_dir`` under the OTU name.
    """
    args = handle_program_options()

    # Parse and read mapping file
    try:
        header, imap = util.parse_map_file(args.map_fp)
        category_idx = header.index(args.group_by)
    except IOError as ioe:
        err_msg = "\nError in metadata mapping filepath (-m): {}\n"
        sys.exit(err_msg.format(ioe))
    except ValueError:
        # header.index() raises ValueError for an unknown category; report it
        # instead of showing a traceback.
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))

    # Obtain group colors
    class_colors = util.color_mapping(imap, header, args.group_by, args.color_by)

    # Get otus for LDA bubble plots
    try:
        # Read the IDs as plain text, one per line. pandas rejects "\n" as a
        # separator in current releases and would turn numeric IDs into ints.
        with open(args.otu_ids_fp) as otu_f:
            bubble_otus = {line.strip() for line in otu_f if line.strip()}
    except IOError as ioe:
        err_msg = "\nError in OTU IDs file (--bubble): {}\n"
        sys.exit(err_msg.format(ioe))

    # Load biom file and calculate relative abundance
    try:
        biomf = biom.load_table(args.otu_table)
    except IOError as ioe:
        err_msg = "\nError with biom format file (-d): {}\n"
        sys.exit(err_msg.format(ioe))

    # Get normalized relative abundances
    rel_abd = bc.relative_abundance(biomf)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)
    abd_val = {abd for sid, v1 in rel_abd.items()
               for otuid, abd in v1.items() if abd > 0}
    bubble_range = np.linspace(min(abd_val), max(abd_val), num=5) * args.scale_by
    # Get abundance to the nearest 50
    bubble_range = [int(50 * round(float(abd) / 50)) for abd in bubble_range[1:]]

    # Set up input for LDA calc and get LDA transformed data
    if args.dist_matrix_file:
        try:
            uf_data = pd.read_csv(args.dist_matrix_file, sep="\t", index_col=0)
        except IOError as ioe:
            err_msg = "\nError with unifrac distance matrix file (-d): {}\n"
            sys.exit(err_msg.format(ioe))
        uf_data.insert(0, "Condition",
                       [imap[sid][category_idx] for sid in uf_data.index])
        sampleids = uf_data.index
        if args.save_lda_input:
            uf_data.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(uf_data)
    else:
        df_rel_abd = pd.DataFrame(rel_abd).T
        df_rel_abd.insert(0, "Condition",
                          [imap[sid][category_idx] for sid in df_rel_abd.index])
        sampleids = df_rel_abd.index
        if args.save_lda_input:
            df_rel_abd.to_csv(args.save_lda_input, sep="\t")
        # Run LDA
        X_lda, y_lda, exp_var = run_LDA(df_rel_abd)

    # Calculate position and size of SampleIDs to plot for each OTU
    for otuid in bubble_otus:
        otuname = oc.otu_name(biomf.metadata(otuid, axis="observation")["taxonomy"])
        plot_data = {cat: {"x": [], "y": [], "size": [], "label": []}
                     for cat in class_colors.keys()}
        for sid, data in zip(sampleids, X_lda):
            category = plot_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["x"].append(float(data[0]))
            category["y"].append(float(data[1]))
            category["size"].append(size)

        # Plot LDA bubble for each OTU
        fig = plt.figure(figsize=args.figsize)
        ax = fig.add_subplot(111)
        for cat in plot_data:
            plt.scatter(plot_data[cat]["x"], plot_data[cat]["y"],
                        s=plot_data[cat]["size"], label=cat,
                        color=class_colors[cat], alpha=0.85, edgecolors="k")
        if X_lda.shape[1] == 1:
            plt.ylim((0.5, 2.5))
        plt.title(" ".join(otuname.split("_")), style="italic", fontsize=13)

        # The explained variance may be missing or shorter than two entries;
        # fall back to a plain axis label in that case only.
        try:
            plt.xlabel("LD1 (Percent Explained Variance: {:.3f}%)".format(exp_var[0]*100),
                       fontsize=13, labelpad=15)
        except (IndexError, TypeError):
            plt.xlabel("LD1", fontsize=13, labelpad=15)
        try:
            plt.ylabel("LD2 (Percent Explained Variance: {:.3f}%)".format(exp_var[1]*100),
                       fontsize=13, labelpad=15)
        except (IndexError, TypeError):
            plt.ylabel("LD2", fontsize=13, labelpad=15)

        lgnd1 = plt.legend(loc="best", scatterpoints=3, fontsize=13)
        # Newer matplotlib exposes the handles as `legend_handles`; older
        # releases only have `legendHandles`.
        handles = getattr(lgnd1, "legend_handles", None)
        if handles is None:
            handles = lgnd1.legendHandles
        for handle in handles[:len(class_colors)]:
            handle._sizes = [80]  # Change the legend marker size manually
        # Add the legend manually to the current plot
        plt.gca().add_artist(lgnd1)

        c = [plt.scatter([], [], c="w", edgecolors="k", s=s1)
             for s1 in bubble_range]
        plt.legend(c, ["{}".format(s2) for s2 in bubble_range],
                   title="Scaled Bubble\n       Sizes", frameon=True,
                   labelspacing=2, fontsize=13, loc=4, scatterpoints=1,
                   borderpad=1.1)

        # Set style for LDA bubble plots
        if args.ggplot2_style:
            gu.ggplot2_style(ax)
            fc = "0.8"
        else:
            fc = "none"

        # Save LDA bubble plots to output directory
        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        fig.savefig(pj(args.output_dir, "_".join(otuname.split())) + "." + args.save_as,
                    facecolor=fc, edgecolor="none", dpi=300,
                    bbox_inches="tight", pad_inches=0.2)
        plt.close(fig)
def main():
    """
    Draw one PCoA bubble plot per requested OTU.

    Samples are placed at their first two principal coordinates, grouped by
    the ``--group_by`` mapping category, and sized by the arcsine square-root
    transformed relative abundance of the OTU multiplied by
    ``args.scale_by``. Plotting is delegated to plot_PCoA().
    """
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with BIOM format file:{}\n".format(ioe))

    try:
        with open(args.pcoa_fp):
            pass
    except IOError as ioe:
        sys.exit("\nError with principal coordinates file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file. Blank lines are skipped so they do not become an
    # empty OTU ID; plain "r" because the "U" flag was removed in Python 3.11.
    with open(args.otu_ids_fp, "r") as nciF:
        otus = {line.strip() for line in nciF if line.strip()}

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = bc.relative_abundance(biomtbl)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(
            biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {
            cat: {"pc1": [], "pc2": [], "size": []}
            for cat in category_ids
        }

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # Samples without a value for this OTU are reported and left
                # out of the plot.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(), color_map,
                  xr, yr, args.output_dir, args.save_as, args.ggplot2_style)
def main():
    """
    Build an iTOL dataset file (and optionally a renamed tree) from a BIOM
    table and a mapping file.

    Steps:
      1. Check that the BIOM and mapping files can be opened.
      2. If an input tree is given, rewrite its first line with OTU names in
         place of OTU IDs.
      3. Compute MRA/NMRA or raw abundance results per sample group, keyed
         on OTU name.
      4. Write the iTOL header (gradient for "raw", multibar otherwise)
         followed by one tab-separated data row per OTU name.
    """
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit(
            "\nError with OTU_Sample abundance data file:{}\n"
            .format(ioe)
        )

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit(
            "\nError with mapping file:{}\n"
            .format(ioe)
        )

    # input data
    biomf = biom.load_table(args.otu_table)
    map_header, imap = util.parse_map_file(args.mapping)

    # rewrite tree file with otu names
    if args.input_tree:
        with open(args.input_tree) as treF, open(args.output_tre, "w") as outF:
            tree = treF.readline()
            if "'" in tree:
                tree = tree.replace("'", '')
            outF.write(newick_replace_otuids(tree, biomf))

    oid_rows = {id_: md["taxonomy"]
                for val, id_, md in biomf.iter(axis="observation")}

    # calculate analysis results
    categories = None
    if args.map_categories is not None:
        categories = args.map_categories.split(",")

    # set transform if --stabilize_variance is specified
    tform = bc.arcsine_sqrt_transform if args.stabilize_variance else None

    groups = util.gather_categories(imap, map_header, categories)
    for group in groups.values():
        if args.analysis_metric in ["MRA", "NMRA"]:
            results = bc.MRA(biomf, group.sids, transform=tform)
        elif args.analysis_metric == "raw":
            results = bc.transform_raw_abundance(biomf, sampleIDs=group.sids,
                                                 sample_abd=False)
        group.results.update({oc.otu_name(oid_rows[oid]): results[oid]
                              for oid in results})

    # write iTol data set file
    with open(args.output_itol_table, "w") as itolF:
        if args.analysis_metric == "raw":
            itolF.write("DATASET_GRADIENT\nSEPARATOR TAB\n")
            itolF.write("DATASET_LABEL\tLog Total Abundance\n")
            itolF.write("COLOR\t#000000\n")
            itolF.write("LEGEND_TITLE\tLog Total Abundance\n")
            itolF.write("LEGEND_SHAPES\t1\n")
            itolF.write("LEGEND_COLORS\t#000000\n")
            itolF.write("LEGEND_LABELS\tLog Total Abundance\n")
            itolF.write("COLOR_MIN\t#FFFFFF\n")
            itolF.write("COLOR_MAX\t#000000\n")
        else:
            group_count = len(groups)
            group_labels = "\t".join(groups.keys())
            itolF.write("DATASET_MULTIBAR\nSEPARATOR TAB\n")
            # Label the dataset with the metric actually computed; it was
            # hard-coded to "NMRA" even when MRA was requested.
            itolF.write("DATASET_LABEL\t{}\n".format(args.analysis_metric))
            itolF.write("FIELD_COLORS\t{}\n".format(
                "\t".join(["#ff0000"] * group_count)))
            itolF.write("FIELD_LABELS\t" + group_labels + "\n")
            itolF.write("LEGEND_TITLE\t{}\n".format(args.analysis_metric))
            itolF.write("LEGEND_SHAPES\t{}\n".format(
                "\t".join(["1"] * group_count)))
            itolF.write("LEGEND_COLORS\t{}\n".format(
                "\t".join(["#ff0000"] * group_count)))
            itolF.write("LEGEND_LABELS\t" + group_labels + "\n")
            itolF.write("WIDTH\t300\n")
        itolF.write("DATA\n")

        all_otus = frozenset({oc.otu_name(md["taxonomy"])
                              for val, id_, md in biomf.iter(axis="observation")})

        for oname in all_otus:
            # One value per group, in FIELD_LABELS order; a group without a
            # result for this OTU contributes 0.0.
            values = [
                group.results[oname] if oname in group.results else 0.0
                for group in groups.values()
            ]
            msum = sum(values)

            # normalize avg relative abundance data
            if args.analysis_metric == "NMRA" and msum > 0:
                values = [value / msum for value in values]

            # Format each value directly rather than using group names as
            # str.format field names, which broke for names containing ".",
            # "[", braces or digits and for a group called "name".
            cells = ["{}".format(oname)]
            cells.extend("{:.5f}".format(value) for value in values)
            itolF.write("\t".join(cells) + "\n")
def main():
    """
    Draw one PCoA bubble plot per requested OTU.

    Samples are placed at their first two principal coordinates, grouped by
    the ``--group_by`` mapping category, and sized by the relative abundance
    returned by get_relative_abundance() multiplied by ``args.scale_by``.
    The output directory is created when it does not exist yet.
    """
    # Local import: needed for the symbolic ENOENT code checked below.
    import errno

    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with BIOM format file:{}\n".format(ioe))

    try:
        with open(args.pcoa_fp):
            pass
    except IOError as ioe:
        sys.exit("\nError with principal coordinates file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    if not os.path.exists(args.output_dir):
        try:
            os.mkdir(args.output_dir)
        except OSError as oe:
            # Inspect the error code of the raised exception. The previous
            # test compared the `os.errno` module itself to 2, which was
            # never true (and `os.errno` no longer exists in Python 3.7+).
            if oe.errno == errno.ENOENT:
                msg = ("One or more directories in the path provided for " +
                       "--output-dir ({}) do not exist. If you are specifying " +
                       "a new directory for output, please ensure all other " +
                       "directories in the path currently exist.")
                sys.exit(msg.format(args.output_dir))
            else:
                msg = ("An error occurred trying to create the output " +
                       "directory ({}) with message: {}")
                sys.exit(msg.format(args.output_dir, oe.strerror))

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file. Blank lines are skipped so they do not become an
    # empty OTU ID; plain "r" because the "U" flag was removed in Python 3.11.
    with open(args.otu_ids_fp, "r") as nciF:
        otus = {line.strip() for line in nciF if line.strip()}

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = get_relative_abundance(biomtbl)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {cat: {"pc1": [], "pc2": [], "size": []}
                    for cat in category_ids}

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # Single-argument print() behaves the same on Python 2 and 3.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(),
                  color_map, xr, yr, args.output_dir,
                  args.save_as, args.ggplot2_style)
def main():
    """
    Draw one PCoA bubble plot per requested OTU.

    For every OTU ID listed in ``args.otu_ids_fp`` the samples are placed at
    their first two principal coordinates, grouped by the ``--group_by``
    mapping category, and sized by the arcsine square-root transformed
    relative abundance of that OTU multiplied by ``args.scale_by``. Plotting
    is delegated to plot_PCoA().
    """
    args = handle_program_options()

    try:
        with open(args.otu_table):
            pass
    except IOError as ioe:
        sys.exit("\nError with BIOM format file:{}\n".format(ioe))

    try:
        with open(args.pcoa_fp):
            pass
    except IOError as ioe:
        sys.exit("\nError with principal coordinates file:{}\n".format(ioe))

    try:
        with open(args.mapping):
            pass
    except IOError as ioe:
        sys.exit("\nError with mapping file:{}\n".format(ioe))

    # check that the output dir exists, create it if not
    util.ensure_dir(args.output_dir)

    # load the BIOM table
    biomtbl = biom.load_table(args.otu_table)

    # Read unifrac principal coordinates file
    unifrac = util.parse_unifrac(args.pcoa_fp)

    # Read otu data file
    otus = set()
    # Plain "r": the "U" flag was removed in Python 3.11.
    with open(args.otu_ids_fp, "r") as nciF:
        for line in nciF:
            otu_id = line.strip()
            # A blank line is not an OTU ID; adding "" used to send an empty
            # ID to the metadata lookup below.
            if otu_id:
                otus.add(otu_id)

    # Gather categories from mapping file
    header, imap = util.parse_map_file(args.mapping)
    try:
        category_idx = header.index(args.group_by)
    except ValueError:
        msg = "Error: Specified mapping category '{}' not found."
        sys.exit(msg.format(args.group_by))
    category_ids = util.gather_categories(imap, header, [args.group_by])
    color_map = util.color_mapping(imap, header, args.group_by, args.colors)
    rel_abd = bc.relative_abundance(biomtbl)
    rel_abd = bc.arcsine_sqrt_transform(rel_abd)

    # plot samples based on relative abundance of some OTU ID
    for otuid in otus:
        otuname = oc.otu_name(biomtbl.metadata(otuid, axis="observation")["taxonomy"])
        cat_data = {cat: {"pc1": [], "pc2": [], "size": []}
                    for cat in category_ids}

        for sid in unifrac["pcd"]:
            category = cat_data[imap[sid][category_idx]]
            try:
                size = rel_abd[sid][otuid] * args.scale_by
            except KeyError as ke:
                # Samples without a value for this OTU are reported and left
                # out of the plot.
                print("{} not found in {} sample.".format(ke, sid))
                continue
            category["pc1"].append(float(unifrac["pcd"][sid][0]))
            category["pc2"].append(float(unifrac["pcd"][sid][1]))
            category["size"].append(size)

        if args.verbose:
            print("Saving chart for {}".format(" ".join(otuname.split("_"))))
        xr, yr = calculate_xy_range(cat_data)
        plot_PCoA(cat_data, otuname, unifrac, color_map.keys(),
                  color_map, xr, yr, args.output_dir,
                  args.save_as, args.ggplot2_style)