def ExtractPathway_WTD(options):
    """Write the pathway table of a sample PGDB, with optional WTD columns.

    Steps, in order:
      1. Get MetaCyc's expected taxonomic range(s) per pathway, either by
         querying the 'meta' PGDB through Pathway Tools (the result is
         pickled to /tmp for later runs) or by reading that pickle.
      2. Read the optional NCBI -> MEGAN name map and the ORF -> taxonomy
         column (index 8) of the annotation table.
      3. Query the sample PGDB for each pathway's ORFs, reaction info and
         common name.
      4. Compute the LCA of each pathway's ORF taxa and the weighted
         taxonomic distance (WTD) between it and the expected range.
      5. Write the table to ``options.table_out`` through a ``.tmp`` file
         that is renamed into place once complete.

    Failures while talking to Pathway Tools are reported on stdout; the
    function then carries on with whatever was collected (possibly nothing),
    so the table may contain only the header.

    Args:
        options: object providing ``wtd``, ``ptoolsExec``, ``sample_name``,
            ``ncbi_megan_map``, ``annotation_table``, ``ncbi_tree`` and
            ``table_out``.

    Returns:
        None.
    """
    printf('INFO\tEntering the WTD calculations!\n')

    # place to store list of expected taxonomic range(s)
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"
    serialized_metacyc_taxa_ranges_tmp = "/tmp/metacyc_pwy_taxa_range.pk.tmp"

    # Everything the later stages read is defined up front, so a failure
    # reported by the handlers below leaves empty inputs, not unbound names.
    pwy_taxa_range = {}  # hash from pwy to expected taxonomic range(s)
    megan_map = {}
    orf_lca = {}
    pwy_to_orfs = {}
    pwy_to_long = {}
    pwy_to_rxns = {}

    try:
        if options.wtd and not path.isfile(serialized_metacyc_taxa_ranges):
            # get MetaCyc's expected taxonomic range(s) and serialize for
            # later use in /tmp
            printf('INFO\tGetting MetaCyc Expected Taxonomic Range(s)\n')
            pythonCyc = startPathwayTools('meta', options.ptoolsExec, True)
            for pwy in pythonCyc.getAllPathways():
                pwy_taxa_range[pwy] = pythonCyc.getExpectedTaxonomicRange(pwy)

            # pickles are byte streams: binary mode, and a context manager
            # so the handle is closed even if dump() raises
            with open(serialized_metacyc_taxa_ranges_tmp, "wb") as pwy_taxa_range_pk:
                pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            StopPathwayTools()
            # publish the cache only once it is complete
            rename(serialized_metacyc_taxa_ranges_tmp, serialized_metacyc_taxa_ranges)
        elif path.isfile(serialized_metacyc_taxa_ranges):
            # read expected taxonomic range from serialized file
            # NOTE(review): the cache sits in /tmp and is unpickled without
            # validation -- confirm that location is trusted.
            with open(serialized_metacyc_taxa_ranges, "rb") as expected_taxa_in:
                pwy_taxa_range = pickle.load(expected_taxa_in)
        # else: WTD not requested and nothing cached; continue without ranges

        # create mapping of preferred NCBI to MEGAN taxonomy
        if options.ncbi_megan_map:
            with open(options.ncbi_megan_map) as megan_map_file:
                for line in megan_map_file:
                    fields = [field.strip() for field in line.split("\t")]
                    megan_map[fields[0]] = fields[1]

        # get ORF to taxa map from annotation_table
        printf("INFO\tGetting ORF to Taxa Map from AnnotationTable\n")
        with open(options.annotation_table) as f:
            for line in f:
                fields = line.split("\t")
                orf_lca[fields[0].strip()] = fields[8].strip()

        # get pathway ORFs and Rxns
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(),
                                          options.ptoolsExec, True)
            for pwy in pythonCyc.getAllPathways():
                genes = pythonCyc.getPathwayORFs(pwy)
                rxns = pythonCyc.getPathwayReactionInfo(pwy)
                long_name = cleanup(pythonCyc.get_slot_value(pwy, "common-name"))
                # store all three together so a mid-pathway failure cannot
                # leave a pathway with ORFs but no name/reactions
                pwy_to_orfs[pwy] = genes
                pwy_to_long[pwy] = long_name
                pwy_to_rxns[pwy] = rxns
            StopPathwayTools()
        except Exception:
            print(" Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file. ")
    except Exception:
        print(" Problem calculating WTD via Pathway Tools. Check the /tmp/ptools-socket file. ")

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    printf("INFO\tLoading NCBI Taxonomy Map\n")
    lca = LCAComputation([options.ncbi_tree], )
    # annotation of the form "<name>(<numeric id>)": group 2 is the taxon id
    taxon_id_pattern = re.compile(r"(.+?)\(([0-9]+?)\)")
    for pwy in pwy_to_orfs:
        taxa_ids = []
        for orf in pwy_to_orfs[pwy]:
            if orf in orf_lca:
                res = taxon_id_pattern.search(orf_lca[orf])
                if res:
                    taxon_id = res.group(2)
                else:
                    taxon_id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(taxon_id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        lca.clear_cells(taxa_ids)
        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    for pwy in pwy_lca:
        observed_id = pwy_lca[pwy][0]
        distances = []   # list of distances
        taxa_pairs = []  # parallel list of [expected, observed] taxa ids

        # .get(): a sample pathway may be absent from the MetaCyc ranges
        expected_ranges = pwy_taxa_range.get(pwy)
        if expected_ranges:
            for expected in expected_ranges:
                dist = lca.wtd(expected[0], observed_id)
                if dist or dist == 0:
                    # valid distance
                    distances.append(dist)
                    taxa_pairs.append([expected[0], observed_id])
                else:
                    print("Not a valid distance")
        else:
            # no expected taxonomy, set to root
            min_taxa = "1"
            distances.append(lca.wtd(min_taxa, observed_id))
            taxa_pairs.append([min_taxa, observed_id])

        if not distances:
            # every expected taxon gave an invalid distance; max() would
            # raise on the empty list, so the row is reported as NA instead
            continue

        # find index with max distance (closest to expected taxonomy)
        max_index, max_dist = max(enumerate(distances), key=operator.itemgetter(1))
        max_taxa = taxa_pairs[max_index]

        # remap to preferred names
        observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
        expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)
        pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out pathway table
    table_out_tmp = options.table_out + ".tmp"
    try:
        out = open(table_out_tmp, "w")
    except (IOError, OSError):
        print("Had problems opening file: " + options.table_out)
        return  # nothing to write to

    # write appropriate header
    if options.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"

    sample = options.sample_name  # sample name
    try:
        out.write(header)
        for pwy in pwy_to_orfs:
            line = [
                sample,                   # sample name
                pwy,                      # pathway name
                pwy_to_long[pwy],         # pathway longname
                pwy_to_rxns[pwy][0],      # pathway num reactions
                pwy_to_rxns[pwy][1],      # pathway covered reactions
                len(pwy_to_orfs[pwy]),    # num orfs
            ]
            if options.wtd:
                # wtd, observed taxa, expected taxa (NA when not computable)
                line.extend(pwy_to_wtd.get(pwy, ["NA", "NA", "NA"]))
            line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs
            out.write("\t".join([str(field) for field in line]) + "\n")
    except Exception:
        # do not leak the handle, and do not publish a partial table
        out.close()
        raise

    try:
        out.close()
        rename(table_out_tmp, options.table_out)
    except (IOError, OSError):
        print("Had problems closing file: " + options.table_out)
def ExtractPathway_WTD(options):
    """Write the pathway table of a sample PGDB, with optional WTD columns.

    Steps, in order:
      1. Get MetaCyc's expected taxonomic range(s) per pathway: when WTD is
         requested and no cache exists, query the 'meta' PGDB through
         Pathway Tools and pickle the result to /tmp; then read the cache.
      2. Read the optional NCBI -> MEGAN name map and the ORF -> taxonomy
         column (index 8) of the annotation table.
      3. Query the sample PGDB for each pathway's ORFs, reaction info and
         common name.
      4. Compute the LCA of each pathway's ORF taxa and the weighted
         taxonomic distance (WTD) between it and the expected range.
      5. Write the table to ``options.table_out`` through a ``.tmp`` file
         that is renamed into place once complete.

    Failures are printed and recorded with ``insert_error(9)``; after a
    Pathway Tools failure the function carries on with whatever was
    collected, so the table may contain only the header.

    Args:
        options: object providing ``wtd``, ``ptoolsExec``, ``sample_name``,
            ``ncbi_megan_map``, ``annotation_table``, ``ncbi_tree`` and
            ``table_out``.

    Returns:
        None.
    """
    printf('\n')
    printf('INFO\tEntering the WTD calculations!\n')

    # place to store list of expected taxonomic range(s)
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"

    # Everything the later stages read is defined up front, so a failure
    # reported by the handlers below leaves empty inputs, not unbound names.
    pwy_taxa_range = {}  # hash from pwy to expected taxonomic range(s)
    megan_map = {}
    orf_lca = {}
    pwy_to_orfs = {}
    pwy_to_long = {}
    pwy_to_rxns = {}

    try:
        if options.wtd and not path.isfile(serialized_metacyc_taxa_ranges):
            # get MetaCyc's expected taxonomic range(s) and serialize for
            # later use in /tmp
            printf('INFO\tGetting MetaCyc Expected Taxonomic Range(s)\n')
            pythonCyc = startPathwayTools('meta', options.ptoolsExec, True)
            fetched_ranges = {}
            for pwy in pythonCyc.getAllPathways():
                printf(" " + pwy)
                fetched_ranges[pwy] = pythonCyc.getExpectedTaxonomicRange(pwy)

            # Write next to the final path and rename afterwards: an
            # interrupted run must not leave a truncated cache that later
            # runs would accept because the file exists.
            cache_tmp = serialized_metacyc_taxa_ranges + ".tmp"
            with open(cache_tmp, "wb") as pwy_taxa_range_pk:
                pickle.dump(fetched_ranges, pwy_taxa_range_pk)
            rename(cache_tmp, serialized_metacyc_taxa_ranges)
            StopPathwayTools()

        # read expected taxonomic range from serialized file; skipped when
        # WTD was not requested and no cache has been built yet
        # NOTE(review): the cache sits in /tmp and is unpickled without
        # validation -- confirm that location is trusted.
        if path.isfile(serialized_metacyc_taxa_ranges):
            with open(serialized_metacyc_taxa_ranges, "rb") as expected_taxa_in:
                pwy_taxa_range = pickle.load(expected_taxa_in)

        # create mapping of preferred NCBI to MEGAN taxonomy
        if options.ncbi_megan_map:
            with open(options.ncbi_megan_map) as megan_map_file:
                for line in megan_map_file:
                    fields = [field.strip() for field in line.split("\t")]
                    megan_map[fields[0]] = fields[1]

        # get ORF to taxa map from annotation_table
        printf("INFO\tGetting ORF to Taxa Map from AnnotationTable\n")
        with open(options.annotation_table) as f:
            for line in f:
                fields = line.split("\t")
                orf_lca[fields[0].strip()] = fields[8].strip()

        # get pathway ORFs and Rxns
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(),
                                          options.ptoolsExec, True)
            for pwy in pythonCyc.getAllPathways():
                printf(" " + pwy)
                genes = pythonCyc.getPathwayORFs(pwy)
                rxns = pythonCyc.getPathwayReactionInfo(pwy)
                long_name = cleanup(pythonCyc.get_slot_value(pwy, "common-name"))
                # store all three together so a mid-pathway failure cannot
                # leave a pathway with ORFs but no name/reactions
                pwy_to_orfs[pwy] = genes
                pwy_to_long[pwy] = long_name
                pwy_to_rxns[pwy] = rxns
            StopPathwayTools()
        except Exception:
            insert_error(9)
            print(" Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file. ")
    except Exception:
        print(" Problem calculating WTD via Pathway Tools. Check the /tmp/ptools-socket file. ")
        insert_error(9)

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    printf("\nINFO\tLoading NCBI Taxonomy Map\n")
    lca = LCAComputation([options.ncbi_tree], )
    # annotation of the form "<name>(<numeric id>)": group 2 is the taxon id
    taxon_id_pattern = re.compile(r"(.+?)\(([0-9]+?)\)")
    for pwy in pwy_to_orfs:
        taxa_ids = []
        for orf in pwy_to_orfs[pwy]:
            if orf in orf_lca:
                res = taxon_id_pattern.search(orf_lca[orf])
                if res:
                    taxon_id = res.group(2)
                else:
                    taxon_id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(taxon_id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        lca.clear_cells(taxa_ids)
        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    printf("INFO\tCalculating WTD\n")
    for pwy in pwy_lca:
        observed_id = pwy_lca[pwy][0]
        distances = []   # list of distances
        taxa_pairs = []  # parallel list of [expected, observed] taxa ids

        expected_ranges = pwy_taxa_range.get(pwy)
        if expected_ranges:
            for expected in expected_ranges:
                dist = lca.wtd(expected[0], observed_id)
                if dist or dist == 0:
                    # valid distance
                    distances.append(dist)
                    taxa_pairs.append([expected[0], observed_id])
                else:
                    print("Not a valid distance")
        else:
            # no expected taxonomy, set to root
            min_taxa = "1"
            distances.append(lca.wtd(min_taxa, observed_id))
            taxa_pairs.append([min_taxa, observed_id])

        if not distances:
            # every expected taxon gave an invalid distance; max() would
            # raise on the empty list, so the row is reported as NA instead
            continue

        # find index with max distance (closest to expected taxonomy)
        max_index, max_dist = max(enumerate(distances), key=operator.itemgetter(1))
        max_taxa = taxa_pairs[max_index]

        # remap to preferred names
        observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
        expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)
        pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out pathway table
    table_out_tmp = options.table_out + ".tmp"
    try:
        out = open(table_out_tmp, "w")
    except (IOError, OSError):
        print("Had problems opening file: " + options.table_out)
        insert_error(9)
        return  # nothing to write to

    # write appropriate header
    if options.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"

    sample = options.sample_name  # sample name
    try:
        out.write(header)
        for pwy in pwy_to_orfs:
            line = [
                sample,                   # sample name
                pwy,                      # pathway name
                pwy_to_long[pwy],         # pathway longname
                pwy_to_rxns[pwy][0],      # pathway num reactions
                pwy_to_rxns[pwy][1],      # pathway covered reactions
                len(pwy_to_orfs[pwy]),    # num orfs
            ]
            if options.wtd:
                # wtd, observed taxa, expected taxa (NA when not computable)
                line.extend(pwy_to_wtd.get(pwy, ["NA", "NA", "NA"]))
            line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs
            out.write("\t".join([str(field) for field in line]) + "\n")
    except Exception:
        # do not leak the handle, and do not publish a partial table
        out.close()
        raise

    try:
        out.close()
        rename(table_out_tmp, options.table_out)
    except (IOError, OSError):
        print("Had problems closing file: " + options.table_out)
        insert_error(9)
def main(argv):
    """Command-line entry point: write the pathway table of a PGDB.

    Parses the options from the module-level ``parser``, then:
      1. Gets MetaCyc's expected taxonomic range(s) per pathway, from the
         'meta' PGDB via Pathway Tools (cached as a pickle in /tmp) or from
         that cache.
      2. Reads the optional NCBI -> MEGAN name map and the ORF -> taxonomy
         column (index 8) of the annotation table.
      3. Queries the ``opts.pgdb_name`` PGDB for each pathway's ORFs,
         reaction info and common name.
      4. Computes each pathway's LCA and weighted taxonomic distance (WTD).
      5. Writes the tab-separated table to ``opts.table_out``; WTD columns
         are "NA" for pathways without a computable distance.

    Args:
        argv: not read by this function; the options come from
            ``parser.parse_args()``.

    Returns:
        None. Prints the usage text and exits with status 0 when
        ``check_arguments`` rejects the options.
    """
    global parser
    (opts, args) = parser.parse_args()
    if not check_arguments(opts, args):
        print(usage)
        sys.exit(0)

    # place to store list of expected taxonomic range(s)
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"

    # Defined up front: a failed cache build below is only reported, and the
    # WTD stage must then see "no ranges" rather than an unbound name.
    pwy_taxa_range = {}  # hash from pwy to expected taxonomic range(s)

    if opts.wtd and not os.path.isfile(serialized_metacyc_taxa_ranges):
        # get MetaCyc's expected taxonomic range(s) and serialize for later
        # use in /tmp
        try:
            print("Getting MetaCyc Expected Taxonomic Range(s)")

            # connect to Pathway Tools
            cyc = PythonCyc()
            cyc.setOrganism("meta")
            cyc.setPToolsExec(opts.pathway_tools)
            cyc.startPathwayTools()

            # get expected taxonomic ranges for each pathway
            fetched_ranges = {}
            for pwy in cyc.getAllPathways():
                fetched_ranges[pwy] = cyc.getExpectedTaxonomicRange(pwy)

            # Write next to the final path and rename afterwards: an
            # interrupted run must not leave a truncated cache that later
            # runs would accept because the file exists.
            cache_tmp = serialized_metacyc_taxa_ranges + ".tmp"
            with open(cache_tmp, "wb") as pwy_taxa_range_pk:
                pickle.dump(fetched_ranges, pwy_taxa_range_pk)
            os.rename(cache_tmp, serialized_metacyc_taxa_ranges)
            pwy_taxa_range = fetched_ranges

            # close Pathway Tools
            cyc.stopPathwayTools()
        except Exception:
            print(" Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file. \n")
    elif os.path.isfile(serialized_metacyc_taxa_ranges):
        # read expected taxonomic range from serialized file
        # NOTE(review): the cache sits in /tmp and is unpickled without
        # validation -- confirm that location is trusted.
        with open(serialized_metacyc_taxa_ranges, "rb") as expected_taxa_in:
            pwy_taxa_range = pickle.load(expected_taxa_in)
    # else: WTD not requested and nothing cached; continue without ranges

    # create mapping of preferred NCBI to MEGAN taxonomy
    megan_map = {}
    if opts.ncbi_megan_map:
        with open(opts.ncbi_megan_map) as megan_map_file:
            for line in megan_map_file:
                fields = [field.strip() for field in line.split("\t")]
                megan_map[fields[0]] = fields[1]

    # get ORF to taxa map from annotation_table
    print("Getting ORF to Taxa Map from AnnotationTable")
    orf_lca = {}
    with open(opts.annotation_table) as f:
        for line in f:
            fields = line.split("\t")
            orf_lca[fields[0].strip()] = fields[8].strip()

    # get pathway ORFs and Rxns
    pwy_to_orfs = {}
    pwy_to_long = {}
    pwy_to_rxns = {}
    try:
        cyc = PythonCyc()
        cyc.setOrganism(opts.pgdb_name)
        cyc.setPToolsExec(opts.pathway_tools)
        cyc.startPathwayTools()
        for pwy in cyc.getAllPathways():
            genes = cyc.getPathwayORFs(pwy)
            rxns = cyc.getPathwayReactionInfo(pwy)
            long_name = cleanup(cyc.get_slot_value(pwy, "common-name"))
            # store all three together so a mid-pathway failure cannot
            # leave a pathway with ORFs but no name/reactions
            pwy_to_orfs[pwy] = genes
            pwy_to_long[pwy] = long_name
            pwy_to_rxns[pwy] = rxns
        cyc.stopPathwayTools()
    except Exception:
        print(" Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file. \n")

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    print("Loading NCBI Taxonomy Map")
    lca = LCAComputation([opts.ncbi_tree])
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support)
    for pwy in pwy_to_orfs:
        taxa_ids = [lca.get_a_Valid_ID([orf_lca[orf]])
                    for orf in pwy_to_orfs[pwy] if orf in orf_lca]
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        lca.clear_cells(taxa_ids)
        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    for pwy in pwy_lca:
        if pwy not in pwy_taxa_range:
            continue  # unknown to MetaCyc ranges: written as NA

        observed_id = pwy_lca[pwy][0]
        distances = []   # list of distances
        taxa_pairs = []  # parallel list of [expected, observed] taxa ids

        if pwy_taxa_range[pwy]:
            for expected in pwy_taxa_range[pwy]:
                dist = lca.wtd(expected[0], observed_id)
                if dist or dist == 0:
                    # valid distance
                    distances.append(dist)
                    taxa_pairs.append([expected[0], observed_id])
                else:
                    print("Not a valid distance")
        else:
            # no expected taxonomy, set to root
            min_taxa = "1"
            distances.append(lca.wtd(min_taxa, observed_id))
            taxa_pairs.append([min_taxa, observed_id])

        if not distances:
            # every expected taxon gave an invalid distance; max() would
            # raise on the empty list, so the row is written as NA instead
            continue

        # find index with max distance (closest to expected taxonomy)
        max_index, max_dist = max(enumerate(distances), key=operator.itemgetter(1))
        max_taxa = taxa_pairs[max_index]

        # remap to preferred names
        observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
        expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)
        pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out pathway table
    try:
        out = open(opts.table_out, "w")
    except (IOError, OSError):
        print("Had problems opening file: " + opts.table_out)
        return  # nothing to write to

    # write appropriate header
    if opts.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"

    sample = opts.pgdb_name  # sample name
    try:
        out.write(header)
        for pwy in pwy_to_orfs:
            line = [
                sample,                   # sample name
                pwy,                      # pathway name
                pwy_to_long[pwy],         # pathway longname
                pwy_to_rxns[pwy][0],      # pathway num reactions
                pwy_to_rxns[pwy][1],      # pathway covered reactions
                len(pwy_to_orfs[pwy]),    # num orfs
            ]
            if opts.wtd:
                # wtd, observed taxa, expected taxa (NA when not computable)
                line.extend(pwy_to_wtd.get(pwy, ["NA", "NA", "NA"]))
            line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs
            out.write("\t".join([str(field) for field in line]) + "\n")
    except Exception:
        out.close()  # do not leak the handle when a row fails
        raise

    try:
        out.close()
    except (IOError, OSError):
        print("Had problems closing file: " + opts.table_out)
def main(argv):
    """Command-line entry point: write the pathway table of a PGDB.

    Parses the options from the module-level ``parser``, then:
      1. Gets MetaCyc's expected taxonomic range(s) per pathway, from the
         'meta' PGDB via Pathway Tools (cached as a pickle in /tmp) or from
         that cache.
      2. Reads the optional NCBI -> MEGAN name map and the ORF -> taxonomy
         column (index 8) of the annotation table.
      3. Queries the ``opts.pgdb_name`` PGDB for each pathway's ORFs,
         reaction info and common name.
      4. Computes each pathway's LCA and weighted taxonomic distance (WTD).
      5. Writes the tab-separated table to ``opts.table_out``; WTD columns
         are "NA" for pathways without a computable distance.

    Args:
        argv: not read by this function; the options come from
            ``parser.parse_args()``.

    Returns:
        None. Prints the usage text and exits with status 0 when
        ``check_arguments`` rejects the options.
    """
    global parser
    (opts, args) = parser.parse_args()
    if not check_arguments(opts, args):
        print(usage)
        sys.exit(0)

    # place to store list of expected taxonomic range(s)
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"

    # Defined up front: a failed cache build below is only reported, and the
    # WTD stage must then see "no ranges" rather than an unbound name.
    pwy_taxa_range = {}  # hash from pwy to expected taxonomic range(s)

    if opts.wtd and not os.path.isfile(serialized_metacyc_taxa_ranges):
        # get MetaCyc's expected taxonomic range(s) and serialize for later
        # use in /tmp
        try:
            print('Getting MetaCyc Expected Taxonomic Range(s)')

            # connect to Pathway Tools
            cyc = PythonCyc()
            cyc.setOrganism('meta')
            cyc.setPToolsExec(opts.pathway_tools)
            cyc.startPathwayTools()

            # get expected taxonomic ranges for each pathway
            fetched_ranges = {}
            for pwy in cyc.getAllPathways():
                fetched_ranges[pwy] = cyc.getExpectedTaxonomicRange(pwy)

            # pickle needs a binary handle under Python 3. Write next to the
            # final path and rename afterwards so an interrupted run cannot
            # leave a truncated cache that later runs would accept.
            cache_tmp = serialized_metacyc_taxa_ranges + ".tmp"
            with open(cache_tmp, "wb") as pwy_taxa_range_pk:
                pickle.dump(fetched_ranges, pwy_taxa_range_pk)
            os.rename(cache_tmp, serialized_metacyc_taxa_ranges)
            pwy_taxa_range = fetched_ranges

            # close Pathway Tools
            cyc.stopPathwayTools()
        except Exception:
            print(" Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file. \n")
    elif os.path.isfile(serialized_metacyc_taxa_ranges):
        # read expected taxonomic range from serialized file
        # NOTE(review): the cache sits in /tmp and is unpickled without
        # validation -- confirm that location is trusted.
        with open(serialized_metacyc_taxa_ranges, "rb") as expected_taxa_in:
            pwy_taxa_range = pickle.load(expected_taxa_in)
    # else: WTD not requested and nothing cached; continue without ranges

    # create mapping of preferred NCBI to MEGAN taxonomy
    megan_map = {}
    if opts.ncbi_megan_map:
        with open(opts.ncbi_megan_map) as megan_map_file:
            for line in megan_map_file:
                # a list, not map(): the fields are indexed below and a
                # Python 3 map object is not subscriptable
                fields = [field.strip() for field in line.split("\t")]
                megan_map[fields[0]] = fields[1]

    # get ORF to taxa map from annotation_table
    print("Getting ORF to Taxa Map from AnnotationTable")
    orf_lca = {}
    with open(opts.annotation_table) as f:
        for line in f:
            fields = line.split("\t")
            orf_lca[fields[0].strip()] = fields[8].strip()

    # get pathway ORFs and Rxns
    pwy_to_orfs = {}
    pwy_to_long = {}
    pwy_to_rxns = {}
    try:
        cyc = PythonCyc()
        cyc.setOrganism(opts.pgdb_name)
        cyc.setPToolsExec(opts.pathway_tools)
        cyc.startPathwayTools()
        for pwy in cyc.getAllPathways():
            genes = cyc.getPathwayORFs(pwy)
            rxns = cyc.getPathwayReactionInfo(pwy)
            long_name = cleanup(cyc.get_slot_value(pwy, "common-name"))
            # store all three together so a mid-pathway failure cannot
            # leave a pathway with ORFs but no name/reactions
            pwy_to_orfs[pwy] = genes
            pwy_to_long[pwy] = long_name
            pwy_to_rxns[pwy] = rxns
        cyc.stopPathwayTools()
    except Exception:
        print(" Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file. \n")

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    print("Loading NCBI Taxonomy Map")
    lca = LCAComputation([opts.ncbi_tree])
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support)
    for pwy in pwy_to_orfs:
        taxa_ids = [lca.get_a_Valid_ID([orf_lca[orf]])
                    for orf in pwy_to_orfs[pwy] if orf in orf_lca]
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        lca.clear_cells(taxa_ids)
        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    for pwy in pwy_lca:
        if pwy not in pwy_taxa_range:
            continue  # unknown to MetaCyc ranges: written as NA

        observed_id = pwy_lca[pwy][0]
        distances = []   # list of distances
        taxa_pairs = []  # parallel list of [expected, observed] taxa ids

        if pwy_taxa_range[pwy]:
            for expected in pwy_taxa_range[pwy]:
                dist = lca.wtd(expected[0], observed_id)
                if dist or dist == 0:
                    # valid distance
                    distances.append(dist)
                    taxa_pairs.append([expected[0], observed_id])
                else:
                    print("Not a valid distance")
        else:
            # no expected taxonomy, set to root
            min_taxa = "1"
            distances.append(lca.wtd(min_taxa, observed_id))
            taxa_pairs.append([min_taxa, observed_id])

        if not distances:
            # every expected taxon gave an invalid distance; max() would
            # raise on the empty list, so the row is written as NA instead
            continue

        # find index with max distance (closest to expected taxonomy)
        max_index, max_dist = max(enumerate(distances), key=operator.itemgetter(1))
        max_taxa = taxa_pairs[max_index]

        # remap to preferred names
        observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
        expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)
        pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out pathway table
    try:
        out = open(opts.table_out, "w")
    except OSError:
        print("Had problems opening file: " + opts.table_out)
        return  # nothing to write to

    # write appropriate header
    if opts.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"

    sample = opts.pgdb_name  # sample name
    try:
        out.write(header)
        for pwy in pwy_to_orfs:
            line = [
                sample,                   # sample name
                pwy,                      # pathway name
                pwy_to_long[pwy],         # pathway longname
                pwy_to_rxns[pwy][0],      # pathway num reactions
                pwy_to_rxns[pwy][1],      # pathway covered reactions
                len(pwy_to_orfs[pwy]),    # num orfs
            ]
            if opts.wtd:
                # wtd, observed taxa, expected taxa (NA when not computable)
                line.extend(pwy_to_wtd.get(pwy, ["NA", "NA", "NA"]))
            line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs
            out.write("\t".join(str(field) for field in line) + "\n")
    except Exception:
        out.close()  # do not leak the handle when a row fails
        raise

    try:
        out.close()
    except OSError:
        print("Had problems closing file: " + opts.table_out)