def ExtractPathway_WTD(options):
    # Extract pathways and WTD
    # NOTE: this excerpt assumes the surrounding module's imports (re, pickle,
    # operator, and os's path/rename) and the MetaPathways helpers printf, cleanup,
    # get_preferred_taxa_name, LCAComputation, startPathwayTools and StopPathwayTools.
    # place to store list of expected taxonomic range(s)
    printf('INFO\tEntering the WTD calculations!\n')
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"
    serialized_metacyc_taxa_ranges_tmp = "/tmp/metacyc_pwy_taxa_range.pk.tmp"
    try:
        if options.wtd and not path.isfile(serialized_metacyc_taxa_ranges):
            # get MetaCyc's expected taxonomic range(s) and serialize for later use in /tmp
            # try:
            printf('INFO\tGetting MetaCyc Expected Taxonomic Range(s)\n')
            pythonCyc = startPathwayTools('meta', options.ptoolsExec, True)
            pwys = pythonCyc.getAllPathways()

            pwy_taxa_range = {}  # hash from pwy to expected taxonomic range(s)
            pwy_taxa_range_pk = open(serialized_metacyc_taxa_ranges_tmp, "w")

            # get expected taxonomic ranges for each pathway
            for pwy in pwys:
                # printf(" " + pwy)
                my_expected_taxonomic_range = pythonCyc.getExpectedTaxonomicRange(pwy)
                pwy_taxa_range[pwy] = my_expected_taxonomic_range
                # printf(" " + pwy)

            # write the pathway ranges to the temporary pickle, then move it into place
            pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            pwy_taxa_range_pk.close()
            StopPathwayTools()
            rename(serialized_metacyc_taxa_ranges_tmp, serialized_metacyc_taxa_ranges)
        else:
            # read expected taxonomic range from serialized file
            expected_taxa_in = open(serialized_metacyc_taxa_ranges, "r")
            pwy_taxa_range = pickle.load(expected_taxa_in)

        # create mapping of preferred NCBI to MEGAN taxonomy
        megan_map = {}
        if options.ncbi_megan_map:
            with open(options.ncbi_megan_map) as megan_map_file:
                for line in megan_map_file:
                    fields = line.split("\t")
                    fields = map(str.strip, fields)
                    megan_map[fields[0]] = fields[1]

        # get ORF to taxa map from annotation_table
        printf("INFO\tGetting ORF to Taxa Map from AnnotationTable\n")
        orf_lca = {}
        with open(options.annotation_table) as f:
            for line in f:
                fields = line.split("\t")
                orf_lca[fields[0].strip()] = fields[8].strip()

        # get pathway ORFs and Rxns
        pwy_to_orfs = {}
        pwy_to_long = {}
        pwy_to_rxns = {}
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
            pwys = pythonCyc.getAllPathways()

            for pwy in pwys:
                # printf(" " + pwy)
                genes = pythonCyc.getPathwayORFs(pwy)
                rxns = pythonCyc.getPathwayReactionInfo(pwy)
                pwy_to_orfs[pwy] = genes
                pwy_to_long[pwy] = cleanup(pythonCyc.get_slot_value(pwy, "common-name"))
                pwy_to_rxns[pwy] = rxns
            # printf("\n")
            StopPathwayTools()
        except:
            print """
            Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
            """
    except:
        print """
        Problem calculating WTD via Pathway Tools. Check the /tmp/ptools-socket file.
        """

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    printf("INFO\tLoading NCBI Taxonomy Map\n")
    lca = LCAComputation([options.ncbi_tree])

    for pwy in pwy_to_orfs:
        orfs = pwy_to_orfs[pwy]
        taxa_ids = []
        for orf in orfs:
            if orf in orf_lca:
                # could strip out id here
                res = re.search("(.+?)\(([0-9]+?)\)", orf_lca[orf])
                if res:
                    taxa_annotation = res.group(1)
                    id = res.group(2)
                else:
                    id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        # print "In run_pathologic"
        # print pwy_lca_id
        lca.clear_cells(taxa_ids)

        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    for pwy in pwy_lca:

        C = []           # list of distances
        C_taxa = []      # list of parallel observed-expected taxa pairs
        C_pos = []       # list of non-negative distances
        C_pos_taxa = []  # list of parallel observed-expected taxa pairs
        C_neg = []       # list of negative distances
        C_neg_taxa = []  # list of parallel observed-expected taxa pairs

        if len(pwy_taxa_range[pwy]) > 0:
            for expected in pwy_taxa_range[pwy]:
                dist = lca.wtd(expected[0], pwy_lca[pwy][0])
                if dist or dist == 0:
                    # valid distance: add it to the respective lists
                    C.append(dist)
                    C_taxa.append([expected[0], pwy_lca[pwy][0]])
                    if dist >= 0:
                        C_pos.append(dist)  # add to non-negative list
                        C_pos_taxa.append([expected[0], pwy_lca[pwy][0]])
                    else:
                        C_neg.append(dist)  # add to negative list
                        C_neg_taxa.append([expected[0], pwy_lca[pwy][0]])
                else:
                    print "Not a valid distance"
                    continue
        else:
            # no expected taxonomy, set to root
            min_taxa = "1"
            dist = lca.wtd(min_taxa, pwy_lca[pwy][0])
            # add distance to the respective lists
            C.append(dist)
            C_taxa.append([min_taxa, pwy_lca[pwy][0]])
            if dist >= 0:
                C_pos.append(dist)  # add to non-negative list
                C_pos_taxa.append([min_taxa, pwy_lca[pwy][0]])
            else:
                C_neg.append(dist)  # add to negative list
                C_neg_taxa.append([min_taxa, pwy_lca[pwy][0]])

        # find index with max distance (closest to expected taxonomy)
        max_index, max_dist = max(enumerate(C), key=operator.itemgetter(1))
        max_taxa = C_taxa[max_index]

        # remap to preferred names
        observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
        expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)

        pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out pathway table
    table_out_tmp = options.table_out + ".tmp"
    try:
        out = open(table_out_tmp, "w")
    except:
        print "Had problems opening file: " + options.table_out

    # write appropriate header
    if options.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"
    out.write(header)

    sample = options.sample_name  # sample name
    for pwy in pwy_to_orfs:
        # generate output line
        line = []
        line.append(sample)                 # sample name
        line.append(pwy)                    # pathway name
        line.append(pwy_to_long[pwy])       # pathway long name
        line.append(pwy_to_rxns[pwy][0])    # pathway num reactions
        line.append(pwy_to_rxns[pwy][1])    # pathway covered reactions
        line.append(len(pwy_to_orfs[pwy]))  # num orfs
        if options.wtd:
            line.append(pwy_to_wtd[pwy][0])  # wtd
            line.append(pwy_to_wtd[pwy][1])  # wtd observed taxa
            line.append(pwy_to_wtd[pwy][2])  # wtd expected taxa
        line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs
        line = map(str, line)              # cast all to string
        out.write("\t".join(line) + "\n")  # write out line

    try:
        out.close()  # close file
        rename(table_out_tmp, options.table_out)
    except:
        print "Had problems closing file: " + options.table_out
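# Illustrative sketch (not part of the pipeline): the "closest to expected taxonomy"
# step above reduces to picking the index of the largest WTD value with enumerate()
# and operator.itemgetter. The helper name and example values below are made up.
import operator

def _pick_closest_expected(distances, taxa_pairs):
    """Return (best_distance, [expected_id, observed_id]) for the largest distance."""
    best_index, best_dist = max(enumerate(distances), key=operator.itemgetter(1))
    return best_dist, taxa_pairs[best_index]

# Example: three candidate expected taxa scored against one observed LCA.
# _pick_closest_expected([-0.5, 0.25, 0.0], [["2", "1224"], ["1224", "1224"], ["131567", "1224"]])
# -> (0.25, ['1224', '1224'])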
def main(argv, errorlogger=None, runstatslogger=None):
    global parser
    (opts, args) = parser.parse_args(argv)
    global opts_global
    opts_global = opts
    if not check_arguments(opts, args):
        print usage
        sys.exit(0)

    db_to_map_Maps = {
        'cog': opts.input_cog_maps,
        'seed': opts.input_seed_maps,
        'kegg': opts.input_kegg_maps,
        'cazy': opts.input_cazy_maps
    }

    results_dictionary = {}
    dbname_weight = {}

    checkOrCreateFolder(opts.output_dir)
    output_table_file = open(
        opts.output_dir + PATHDELIM + 'functional_and_taxonomic_table.txt', 'w')
    fprintf(output_table_file,
            "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n")
    output_table_file.close()

    # print "memory used = %s" %(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss /1000000)
    listOfOrfs = get_list_of_queries(opts.input_annotated_gff)
    listOfOrfs.sort(key=lambda tup: tup, reverse=False)

    if opts.blastdir != None and opts.sample_name != None:
        try:
            database_names, input_blastouts, weight_dbs = getBlastFileNames(opts)
        except:
            print traceback.print_exc(10)
            pass
    else:
        database_names = opts.database_name
        input_blastouts = opts.input_blastout
        weight_dbs = opts.weight_db

    ##### uncomment the following lines
    for dbname, blastoutput in zip(database_names, input_blastouts):
        create_sorted_parse_blast_files(dbname,
                                        blastoutput,
                                        listOfOrfs,
                                        verbose=opts.verbose,
                                        errorlogger=errorlogger)
    #####

    # process in blocks of size _stride
    lca = LCAComputation(opts.ncbi_taxonomy_map, opts.ncbi_megan_map)
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support)

    blastParsers = {}
    for dbname, blastoutput in zip(database_names, input_blastouts):
        blastParsers[dbname] = BlastOutputTsvParser(dbname, blastoutput + '.tmp')
        blastParsers[dbname].setMaxErrorsLimit(5)
        blastParsers[dbname].setErrorAndWarningLogger(errorlogger)

    # this part of the code computes the occurrence of each of the taxons,
    # which is used in a later stage to evaluate the min support,
    # as used in the MEGAN software
    start = 0
    Length = len(listOfOrfs)
    _stride = 100000
    Taxons = {}
    while start < Length:
        pickorfs = {}
        last = min(Length, start + _stride)
        for i in range(start, last):
            pickorfs[listOfOrfs[i]] = 'root'
        start = last
        # print 'Num of Min support orfs ' + str(start)

        results_dictionary = {}
        for dbname, blastoutput in zip(database_names, input_blastouts):
            results = re.search(r'refseq', dbname, re.I)
            if results:
                # if True:
                try:
                    results_dictionary[dbname] = {}
                    process_parsed_blastoutput(dbname, blastParsers[dbname], opts,
                                               results_dictionary[dbname], pickorfs)
                    # print results_dictionary[dbname].keys()[1:5]
                    lca.set_results_dictionary(results_dictionary)
                    lca.compute_min_support_tree(opts.input_annotated_gff,
                                                 pickorfs,
                                                 dbname=dbname)
                    for key, taxon in pickorfs.iteritems():
                        Taxons[key] = taxon
                except:
                    eprintf("ERROR: while training for min support tree %s\n", dbname)
                    traceback.print_exc()

    blastParsers = {}
    for dbname, blastoutput in zip(database_names, input_blastouts):
        blastParsers[dbname] = BlastOutputTsvParser(dbname, blastoutput + '.tmp')

    # this loop determines the actual/final taxonomy of each of the ORFs,
    # taking into consideration the min support
    filePermTypes = {}
    start = 0
    outputfile = open(opts.output_dir + '/ORF_annotation_table.txt', 'w')

    short_to_long_dbnames = {}
    for dbname in database_names:
        results = re.search(r'^seed', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['seed'] = dbname

        results = re.search(r'^cog', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['cog'] = dbname

        results = re.search(r'^kegg', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['kegg'] = dbname

        results = re.search(r'^cazy', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['cazy'] = dbname

    standard_dbs = ['cog', 'seed', 'kegg', 'cazy']
    standard_db_maps = [
        opts.input_cog_maps, opts.input_seed_maps, opts.input_kegg_maps,
        opts.input_cazy_maps
    ]
    field_to_description = {}
    hierarchical_map = {}

    for db in standard_dbs:
        if db in short_to_long_dbnames:
            field_to_description[db] = {}
            hierarchical_map[db] = {}

    for dbname in standard_dbs:
        if dbname in short_to_long_dbnames:
            try:
                read_map_file(db_to_map_Maps[dbname], field_to_description[dbname],
                              hierarchical_map[dbname])
            except:
                raise
                pass

    while start < Length:
        pickorfs = {}
        last = min(Length, start + _stride)
        for i in range(start, last):
            pickorfs[listOfOrfs[i]] = True
        start = last
        gc.collect()
        eprintf("\nMemory used = %s MB\n",
                str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000000))

        results_dictionary = {}
        for dbname, blastoutput in zip(database_names, input_blastouts):
            try:
                results_dictionary[dbname] = {}
                eprintf("Processing database : %s...", dbname)
                process_parsed_blastoutput(dbname, blastParsers[dbname], opts,
                                           results_dictionary[dbname], pickorfs)
                eprintf("done\n")
            except:
                traceback.print_exc()
                eprintf("ERROR: %s\n", dbname)
                pass
            # print dbname + ' ' + str(len(results_dictionary[dbname]))

        eprintf("Num orfs processed : %s\n", str(start))

        # create the annotations now
        orfToContig = {}
        create_annotation(results_dictionary, database_names,
                          opts.input_annotated_gff, opts.output_dir, Taxons,
                          pickorfs, orfToContig, lca)

        for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
            if std_dbname in short_to_long_dbnames:
                create_table(results_dictionary[short_to_long_dbnames[std_dbname]],
                             std_dbname, opts.output_dir, hierarchical_map,
                             field_to_description)
        # create_table(results_dictionary[dbname], opts.input_kegg_maps, 'kegg', opts.output_dir, filePermType)

        print_orf_table(results_dictionary, orfToContig, opts.output_dir, outputfile)

    for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
        if std_dbname in short_to_long_dbnames:
            print_kegg_cog_tables(std_dbname, opts.output_dir, hierarchical_map,
                                  field_to_description, filePermType='w')

    outputfile.close()

    # now remove the temporary files
    for dbname, blastoutput in zip(database_names, input_blastouts):
        try:
            remove(blastoutput + '.tmp')
        except:
            pass
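# Illustrative sketch (assumption, not project code): the two while-loops above walk
# listOfOrfs in fixed-size blocks (_stride = 100000) so that only one block's worth of
# per-ORF state is held in memory at a time. The helper below shows the same chunking
# pattern in isolation; its name and default label are hypothetical.
def _iter_orf_blocks(items, stride=100000, default='root'):
    """Yield dicts mapping each ORF in a block to a default taxon label."""
    start, length = 0, len(items)
    while start < length:
        last = min(length, start + stride)
        yield dict((items[i], default) for i in range(start, last))
        start = last

# Example usage: for pickorfs in _iter_orf_blocks(listOfOrfs): process_block(pickorfs)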
def main(argv, errorlogger=None, runstatslogger=None):
    global parser
    (opts, args) = parser.parse_args(argv)
    global opts_global
    opts_global = opts
    if not check_arguments(opts, args):
        print usage
        sys.exit(0)

    db_to_map_Maps = {
        'cog': opts.input_cog_maps,
        'seed': opts.input_seed_maps,
        'kegg': opts.input_kegg_maps,
        'cazy': opts.input_cazy_maps
    }

    results_dictionary = {}
    dbname_weight = {}

    checkOrCreateFolder(opts.output_dir)
    output_table_file = open(
        opts.output_dir + PATHDELIM + 'functional_and_taxonomic_table.txt', 'w')
    fprintf(output_table_file,
            "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n")
    output_table_file.close()

    # print "memory used = %s" %(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss /1000000)
    listOfOrfs = get_list_of_queries(opts.input_annotated_gff)
    listOfOrfs.sort(key=lambda tup: tup, reverse=False)
    # printlist(listOfOrfs,5)
    # sys.exit(0)

    ##### uncomment the following lines
    for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
        create_sorted_parse_blast_files(dbname,
                                        blastoutput,
                                        listOfOrfs,
                                        verbose=opts.verbose,
                                        errorlogger=errorlogger)
    #####

    # process in blocks of size _stride
    lca = LCAComputation(opts.ncbi_taxonomy_map, opts.ncbi_megan_map)
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support)

    blastParsers = {}
    for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
        blastParsers[dbname] = BlastOutputTsvParser(dbname, blastoutput + '.tmp')
        blastParsers[dbname].setMaxErrorsLimit(5)
        blastParsers[dbname].setErrorAndWarningLogger(errorlogger)

    # this part of the code computes the occurrence of each of the taxons,
    # which is used in a later stage to evaluate the min support,
    # as used in the MEGAN software
    start = 0
    Length = len(listOfOrfs)
    _stride = 100000
    Taxons = {}
    while start < Length:
        pickorfs = {}
        last = min(Length, start + _stride)
        for i in range(start, last):
            pickorfs[listOfOrfs[i]] = 'root'
        start = last
        # print 'Num of Min support orfs ' + str(start)

        results_dictionary = {}
        for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
            results = re.search(r'refseq', dbname, re.I)
            if results:
                # if True:
                try:
                    results_dictionary[dbname] = {}
                    process_parsed_blastoutput(dbname, blastParsers[dbname], opts,
                                               results_dictionary[dbname], pickorfs)
                    # print results_dictionary[dbname].keys()[1:5]
                    lca.set_results_dictionary(results_dictionary)
                    lca.compute_min_support_tree(opts.input_annotated_gff,
                                                 pickorfs,
                                                 dbname=dbname)
                    for key, taxon in pickorfs.iteritems():
                        Taxons[key] = taxon
                except:
                    eprintf("ERROR: while training for min support tree %s\n", dbname)
                    import traceback
                    traceback.print_exc()

    blastParsers = {}
    for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
        blastParsers[dbname] = BlastOutputTsvParser(dbname, blastoutput + '.tmp')

    # this loop determines the actual/final taxonomy of each of the ORFs,
    # taking into consideration the min support
    filePermTypes = {}
    start = 0
    outputfile = open(opts.output_dir + '/ORF_annotation_table.txt', 'w')

    short_to_long_dbnames = {}
    for dbname in opts.database_name:
        results = re.search(r'^seed', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['seed'] = dbname

        results = re.search(r'^cog', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['cog'] = dbname

        results = re.search(r'^kegg', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['kegg'] = dbname

        results = re.search(r'^cazy', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['cazy'] = dbname

    standard_dbs = ['cog', 'seed', 'kegg', 'cazy']
    standard_db_maps = [
        opts.input_cog_maps, opts.input_seed_maps, opts.input_kegg_maps,
        opts.input_cazy_maps
    ]
    field_to_description = {}
    hierarchical_map = {}

    for db in standard_dbs:
        if db in short_to_long_dbnames:
            field_to_description[db] = {}
            hierarchical_map[db] = {}

    for dbname in standard_dbs:
        if dbname in short_to_long_dbnames:
            try:
                read_map_file(db_to_map_Maps[dbname], field_to_description[dbname],
                              hierarchical_map[dbname])
            except:
                raise
                pass

    while start < Length:
        pickorfs = {}
        last = min(Length, start + _stride)
        for i in range(start, last):
            pickorfs[listOfOrfs[i]] = True
        start = last
        gc.collect()
        eprintf("\nMemory used = %s MB\n",
                str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000000))

        results_dictionary = {}
        for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
            try:
                results_dictionary[dbname] = {}
                eprintf("Processing database %s...", dbname)
                process_parsed_blastoutput(dbname, blastParsers[dbname], opts,
                                           results_dictionary[dbname], pickorfs)
                eprintf("done\n")
            except:
                import traceback
                traceback.print_exc()
                eprintf("ERROR: %s\n", dbname)
                pass
            # print dbname + ' ' + str(len(results_dictionary[dbname]))

        eprintf("Num orfs processed : %s\n", str(start))

        # create the annotations now
        orfToContig = {}
        create_annotation(results_dictionary, opts.database_name,
                          opts.input_annotated_gff, opts.output_dir, Taxons,
                          pickorfs, orfToContig, lca)

        for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
            if std_dbname in short_to_long_dbnames:
                create_table(results_dictionary[short_to_long_dbnames[std_dbname]],
                             std_dbname, opts.output_dir, hierarchical_map,
                             field_to_description)
        # create_table(results_dictionary[dbname], opts.input_kegg_maps, 'kegg', opts.output_dir, filePermType)

        print_orf_table(results_dictionary, orfToContig, opts.output_dir, outputfile)

    for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
        if std_dbname in short_to_long_dbnames:
            print_kegg_cog_tables(std_dbname, opts.output_dir, hierarchical_map,
                                  field_to_description, filePermType='w')

    outputfile.close()

    # now remove the temporary files
    for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
        try:
            remove(blastoutput + '.tmp')
        except:
            pass
def main(argv):
    global parser
    (opts, args) = parser.parse_args()
    if not check_arguments(opts, args):
        print usage
        sys.exit(0)

    # place to store list of expected taxonomic range(s)
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"
    if opts.wtd and not os.path.isfile(serialized_metacyc_taxa_ranges):
        # get MetaCyc's expected taxonomic range(s) and serialize for later use in /tmp
        try:
            print "Getting MetaCyc Expected Taxonomic Range(s)"
            # connect to Pathway Tools
            cyc = PythonCyc()
            cyc.setOrganism("meta")
            cyc.setPToolsExec(opts.pathway_tools)
            cyc.startPathwayTools()

            pwys = cyc.getAllPathways()
            pwy_taxa_range = {}  # hash from pwy to expected taxonomic range(s)
            pwy_taxa_range_pk = open(serialized_metacyc_taxa_ranges, "w")

            # get expected taxonomic ranges for each pathway
            for pwy in pwys:
                my_expected_taxonomic_range = cyc.getExpectedTaxonomicRange(pwy)
                pwy_taxa_range[pwy] = my_expected_taxonomic_range

            # write the pathway ranges to the pickle
            pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            pwy_taxa_range_pk.close()

            # close Pathway Tools
            cyc.stopPathwayTools()
        except:
            print """
            Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
            """
    else:
        # read expected taxonomic range from serialized file
        expected_taxa_in = open(serialized_metacyc_taxa_ranges, "r")
        pwy_taxa_range = pickle.load(expected_taxa_in)

    # create mapping of preferred NCBI to MEGAN taxonomy
    megan_map = {}
    if opts.ncbi_megan_map:
        with open(opts.ncbi_megan_map) as megan_map_file:
            for line in megan_map_file:
                fields = line.split("\t")
                fields = map(str.strip, fields)
                megan_map[fields[0]] = fields[1]

    # get ORF to taxa map from annotation_table
    print "Getting ORF to Taxa Map from AnnotationTable"
    orf_lca = {}
    with open(opts.annotation_table) as f:
        for line in f:
            fields = line.split("\t")
            orf_lca[fields[0].strip()] = fields[8].strip()

    # get pathway ORFs and Rxns
    pwy_to_orfs = {}
    pwy_to_long = {}
    pwy_to_rxns = {}
    try:
        cyc = PythonCyc()
        cyc.setOrganism(opts.pgdb_name)
        cyc.setPToolsExec(opts.pathway_tools)
        cyc.startPathwayTools()

        pwys = cyc.getAllPathways()
        for pwy in pwys:
            genes = cyc.getPathwayORFs(pwy)
            rxns = cyc.getPathwayReactionInfo(pwy)
            pwy_to_orfs[pwy] = genes
            pwy_to_long[pwy] = cleanup(cyc.get_slot_value(pwy, "common-name"))
            pwy_to_rxns[pwy] = rxns
        cyc.stopPathwayTools()
    except:
        print """
        Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
        """

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    print "Loading NCBI Taxonomy Map"
    lca = LCAComputation([opts.ncbi_tree])
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support)

    for pwy in pwy_to_orfs:
        orfs = pwy_to_orfs[pwy]
        taxa_ids = []
        for orf in orfs:
            if orf in orf_lca:
                id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        lca.clear_cells(taxa_ids)
        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    for pwy in pwy_lca:

        C = []           # list of distances
        C_taxa = []      # list of parallel observed-expected taxa pairs
        C_pos = []       # list of non-negative distances
        C_pos_taxa = []  # list of parallel observed-expected taxa pairs
        C_neg = []       # list of negative distances
        C_neg_taxa = []  # list of parallel observed-expected taxa pairs

        if pwy in pwy_taxa_range:
            if len(pwy_taxa_range[pwy]) > 0:
                for expected in pwy_taxa_range[pwy]:
                    dist = lca.wtd(expected[0], pwy_lca[pwy][0])
                    if dist or dist == 0:
                        # valid distance: add it to the respective lists
                        C.append(dist)
                        C_taxa.append([expected[0], pwy_lca[pwy][0]])
                        if dist >= 0:
                            C_pos.append(dist)  # add to non-negative list
                            C_pos_taxa.append([expected[0], pwy_lca[pwy][0]])
                        else:
                            C_neg.append(dist)  # add to negative list
                            C_neg_taxa.append([expected[0], pwy_lca[pwy][0]])
                    else:
                        print "Not a valid distance"
                        continue
            else:
                # no expected taxonomy, set to root
                min_taxa = "1"
                dist = lca.wtd(min_taxa, pwy_lca[pwy][0])
                # add distance to the respective lists
                C.append(dist)
                C_taxa.append([min_taxa, pwy_lca[pwy][0]])
                if dist >= 0:
                    C_pos.append(dist)  # add to non-negative list
                    C_pos_taxa.append([min_taxa, pwy_lca[pwy][0]])
                else:
                    C_neg.append(dist)  # add to negative list
                    C_neg_taxa.append([min_taxa, pwy_lca[pwy][0]])

            # find index with max distance (closest to expected taxonomy)
            max_index, max_dist = max(enumerate(C), key=operator.itemgetter(1))
            max_taxa = C_taxa[max_index]

            # remap to preferred names
            observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
            expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)

            pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out pathway table
    try:
        out = open(opts.table_out, "w")
    except:
        print "Had problems opening file: " + opts.table_out

    # write appropriate header
    if opts.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"
    out.write(header)

    sample = opts.pgdb_name  # sample name
    for pwy in pwy_to_orfs:
        # generate output line
        line = []
        line.append(sample)                 # sample name
        line.append(pwy)                    # pathway name
        line.append(pwy_to_long[pwy])       # pathway long name
        line.append(pwy_to_rxns[pwy][0])    # pathway num reactions
        line.append(pwy_to_rxns[pwy][1])    # pathway covered reactions
        line.append(len(pwy_to_orfs[pwy]))  # num orfs
        if opts.wtd:
            if pwy in pwy_to_wtd:
                line.append(pwy_to_wtd[pwy][0])  # wtd
                line.append(pwy_to_wtd[pwy][1])  # wtd observed taxa
                line.append(pwy_to_wtd[pwy][2])  # wtd expected taxa
            else:
                line.append("NA")
                line.append("NA")
                line.append("NA")
        line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs
        line = map(str, line)              # cast all to string
        out.write("\t".join(line) + "\n")  # write out line

    try:
        out.close()  # close file
    except:
        print "Had problems closing file: " + opts.table_out
def ExtractPathway_WTD(options):
    # Extract pathways and WTD
    # place to store list of expected taxonomic range(s)
    printf('\n')
    printf('INFO\tEntering the WTD calculations!\n')
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"
    try:
        # print options.wtd, not path.isfile(serialized_metacyc_taxa_ranges), serialized_metacyc_taxa_ranges
        if options.wtd and not path.isfile(serialized_metacyc_taxa_ranges):
            # get MetaCyc's expected taxonomic range(s) and serialize for later use in /tmp
            # try:
            printf('INFO\tGetting MetaCyc Expected Taxonomic Range(s)\n')
            pythonCyc = startPathwayTools('meta', options.ptoolsExec, True)
            pwys = pythonCyc.getAllPathways()

            pwy_taxa_range = {}  # hash from pwy to expected taxonomic range(s)
            pwy_taxa_range_pk = open(serialized_metacyc_taxa_ranges, "w")

            # get expected taxonomic ranges for each pathway
            for pwy in pwys:
                printf(" " + pwy)
                my_expected_taxonomic_range = pythonCyc.getExpectedTaxonomicRange(pwy)
                pwy_taxa_range[pwy] = my_expected_taxonomic_range
                # printf(" " + pwy)

            # write the pathway ranges to the pickle
            pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            pwy_taxa_range_pk.close()
            StopPathwayTools()

        # read expected taxonomic range from serialized file
        expected_taxa_in = open(serialized_metacyc_taxa_ranges, "r")
        pwy_taxa_range = pickle.load(expected_taxa_in)

        # create mapping of preferred NCBI to MEGAN taxonomy
        megan_map = {}
        if options.ncbi_megan_map:
            with open(options.ncbi_megan_map) as megan_map_file:
                for line in megan_map_file:
                    fields = line.split("\t")
                    fields = map(str.strip, fields)
                    megan_map[fields[0]] = fields[1]

        # get ORF to taxa map from annotation_table
        printf("INFO\tGetting ORF to Taxa Map from AnnotationTable\n")
        orf_lca = {}
        with open(options.annotation_table) as f:
            for line in f:
                fields = line.split("\t")
                orf_lca[fields[0].strip()] = fields[8].strip()

        # get pathway ORFs and Rxns
        pwy_to_orfs = {}
        pwy_to_long = {}
        pwy_to_rxns = {}
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
            pwys = pythonCyc.getAllPathways()

            for pwy in pwys:
                printf(" " + pwy)
                genes = pythonCyc.getPathwayORFs(pwy)
                rxns = pythonCyc.getPathwayReactionInfo(pwy)
                pwy_to_orfs[pwy] = genes
                pwy_to_long[pwy] = cleanup(pythonCyc.get_slot_value(pwy, "common-name"))
                pwy_to_rxns[pwy] = rxns
            # printf("\n")
            StopPathwayTools()
        except:
            insert_error(9)
            print """
            Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
            """
    except:
        print """
        Problem calculating WTD via Pathway Tools. Check the /tmp/ptools-socket file.
        """
        insert_error(9)

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    printf("\nINFO\tLoading NCBI Taxonomy Map\n")
    lca = LCAComputation([options.ncbi_tree])

    for pwy in pwy_to_orfs:
        orfs = pwy_to_orfs[pwy]
        taxa_ids = []
        for orf in orfs:
            if orf in orf_lca:
                # could strip out id here
                res = re.search("(.+?)\(([0-9]+?)\)", orf_lca[orf])
                if res:
                    taxa_annotation = res.group(1)
                    id = res.group(2)
                else:
                    id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        # print "In run_pathologic"
        # print pwy_lca_id
        lca.clear_cells(taxa_ids)

        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    printf("INFO\tCalculating WTD\n")

    for pwy in pwy_lca:

        C = []           # list of distances
        C_taxa = []      # list of parallel observed-expected taxa pairs
        C_pos = []       # list of non-negative distances
        C_pos_taxa = []  # list of parallel observed-expected taxa pairs
        C_neg = []       # list of negative distances
        C_neg_taxa = []  # list of parallel observed-expected taxa pairs

        if pwy in pwy_taxa_range and len(pwy_taxa_range[pwy]):
            for expected in pwy_taxa_range[pwy]:
                dist = lca.wtd(expected[0], pwy_lca[pwy][0])
                if dist or dist == 0:
                    # valid distance: add it to the respective lists
                    C.append(dist)
                    C_taxa.append([expected[0], pwy_lca[pwy][0]])
                    if dist >= 0:
                        C_pos.append(dist)  # add to non-negative list
                        C_pos_taxa.append([expected[0], pwy_lca[pwy][0]])
                    else:
                        C_neg.append(dist)  # add to negative list
                        C_neg_taxa.append([expected[0], pwy_lca[pwy][0]])
                else:
                    print "Not a valid distance"
                    continue
        else:
            # no expected taxonomy, set to root
            min_taxa = "1"
            dist = lca.wtd(min_taxa, pwy_lca[pwy][0])
            # add distance to the respective lists
            C.append(dist)
            C_taxa.append([min_taxa, pwy_lca[pwy][0]])
            if dist >= 0:
                C_pos.append(dist)  # add to non-negative list
                C_pos_taxa.append([min_taxa, pwy_lca[pwy][0]])
            else:
                C_neg.append(dist)  # add to negative list
                C_neg_taxa.append([min_taxa, pwy_lca[pwy][0]])

        # find index with max distance (closest to expected taxonomy)
        max_index, max_dist = max(enumerate(C), key=operator.itemgetter(1))
        max_taxa = C_taxa[max_index]

        # remap to preferred names
        observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
        expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)

        pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out pathway table
    table_out_tmp = options.table_out + ".tmp"
    try:
        out = open(table_out_tmp, "w")
    except:
        print "Had problems opening file: " + options.table_out
        insert_error(9)

    # write appropriate header
    if options.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"
    out.write(header)

    sample = options.sample_name  # sample name
    for pwy in pwy_to_orfs:
        # generate output line
        line = []
        line.append(sample)                 # sample name
        line.append(pwy)                    # pathway name
        line.append(pwy_to_long[pwy])       # pathway long name
        line.append(pwy_to_rxns[pwy][0])    # pathway num reactions
        line.append(pwy_to_rxns[pwy][1])    # pathway covered reactions
        line.append(len(pwy_to_orfs[pwy]))  # num orfs
        if options.wtd:
            line.append(pwy_to_wtd[pwy][0])  # wtd
            line.append(pwy_to_wtd[pwy][1])  # wtd observed taxa
            line.append(pwy_to_wtd[pwy][2])  # wtd expected taxa
        line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs
        line = map(str, line)              # cast all to string
        out.write("\t".join(line) + "\n")  # write out line

    try:
        out.close()  # close file
        rename(table_out_tmp, options.table_out)
    except:
        print "Had problems closing file: " + options.table_out
        insert_error(9)
def main(argv):
    global parser
    (opts, args) = parser.parse_args()
    if not check_arguments(opts, args):
        print(usage)
        sys.exit(0)

    # place to store list of expected taxonomic range(s)
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"
    if opts.wtd and not os.path.isfile(serialized_metacyc_taxa_ranges):
        # get MetaCyc's expected taxonomic range(s) and serialize for later use in /tmp
        try:
            print('Getting MetaCyc Expected Taxonomic Range(s)')
            # connect to Pathway Tools
            cyc = PythonCyc()
            cyc.setOrganism('meta')
            cyc.setPToolsExec(opts.pathway_tools)
            cyc.startPathwayTools()

            pwys = cyc.getAllPathways()
            pwy_taxa_range = {}  # hash from pwy to expected taxonomic range(s)
            pwy_taxa_range_pk = open(serialized_metacyc_taxa_ranges, "w")

            # get expected taxonomic ranges for each pathway
            for pwy in pwys:
                my_expected_taxonomic_range = cyc.getExpectedTaxonomicRange(pwy)
                pwy_taxa_range[pwy] = my_expected_taxonomic_range

            # write the pathway ranges to the pickle
            pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            pwy_taxa_range_pk.close()

            # close Pathway Tools
            cyc.stopPathwayTools()
        except:
            print("""
            Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
            """)
    else:
        # read expected taxonomic range from serialized file
        expected_taxa_in = open(serialized_metacyc_taxa_ranges, "r")
        pwy_taxa_range = pickle.load(expected_taxa_in)

    # create mapping of preferred NCBI to MEGAN taxonomy
    megan_map = {}
    if opts.ncbi_megan_map:
        with open(opts.ncbi_megan_map) as megan_map_file:
            for line in megan_map_file:
                # listify so indexing also works when run under Python 3
                fields = [x.strip() for x in line.split("\t")]
                megan_map[fields[0]] = fields[1]

    # get ORF to taxa map from annotation_table
    print("Getting ORF to Taxa Map from AnnotationTable")
    orf_lca = {}
    with open(opts.annotation_table) as f:
        for line in f:
            fields = line.split("\t")
            orf_lca[fields[0].strip()] = fields[8].strip()

    # get pathway ORFs and Rxns
    pwy_to_orfs = {}
    pwy_to_long = {}
    pwy_to_rxns = {}
    try:
        cyc = PythonCyc()
        cyc.setOrganism(opts.pgdb_name)
        cyc.setPToolsExec(opts.pathway_tools)
        cyc.startPathwayTools()

        pwys = cyc.getAllPathways()
        for pwy in pwys:
            genes = cyc.getPathwayORFs(pwy)
            rxns = cyc.getPathwayReactionInfo(pwy)
            pwy_to_orfs[pwy] = genes
            pwy_to_long[pwy] = cleanup(cyc.get_slot_value(pwy, "common-name"))
            pwy_to_rxns[pwy] = rxns
        cyc.stopPathwayTools()
    except:
        print("""
        Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
        """)

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    print("Loading NCBI Taxonomy Map")
    lca = LCAComputation([opts.ncbi_tree])
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support)

    for pwy in pwy_to_orfs:
        orfs = pwy_to_orfs[pwy]
        taxa_ids = []
        for orf in orfs:
            if orf in orf_lca:
                id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        lca.clear_cells(taxa_ids)
        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    for pwy in pwy_lca:

        C = []           # list of distances
        C_taxa = []      # list of parallel observed-expected taxa pairs
        C_pos = []       # list of non-negative distances
        C_pos_taxa = []  # list of parallel observed-expected taxa pairs
        C_neg = []       # list of negative distances
        C_neg_taxa = []  # list of parallel observed-expected taxa pairs

        if pwy in pwy_taxa_range:
            if len(pwy_taxa_range[pwy]) > 0:
                for expected in pwy_taxa_range[pwy]:
                    dist = lca.wtd(expected[0], pwy_lca[pwy][0])
                    if dist or dist == 0:
                        # valid distance: add it to the respective lists
                        C.append(dist)
                        C_taxa.append([expected[0], pwy_lca[pwy][0]])
                        if dist >= 0:
                            C_pos.append(dist)  # add to non-negative list
                            C_pos_taxa.append([expected[0], pwy_lca[pwy][0]])
                        else:
                            C_neg.append(dist)  # add to negative list
                            C_neg_taxa.append([expected[0], pwy_lca[pwy][0]])
                    else:
                        print("Not a valid distance")
                        continue
            else:
                # no expected taxonomy, set to root
                min_taxa = "1"
                dist = lca.wtd(min_taxa, pwy_lca[pwy][0])
                # add distance to the respective lists
                C.append(dist)
                C_taxa.append([min_taxa, pwy_lca[pwy][0]])
                if dist >= 0:
                    C_pos.append(dist)  # add to non-negative list
                    C_pos_taxa.append([min_taxa, pwy_lca[pwy][0]])
                else:
                    C_neg.append(dist)  # add to negative list
                    C_neg_taxa.append([min_taxa, pwy_lca[pwy][0]])

            # find index with max distance (closest to expected taxonomy)
            max_index, max_dist = max(enumerate(C), key=operator.itemgetter(1))
            max_taxa = C_taxa[max_index]

            # remap to preferred names
            observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
            expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)

            pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out pathway table
    try:
        out = open(opts.table_out, "w")
    except:
        print("Had problems opening file: " + opts.table_out)

    # write appropriate header
    if opts.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"
    out.write(header)

    sample = opts.pgdb_name  # sample name
    for pwy in pwy_to_orfs:
        # generate output line
        line = []
        line.append(sample)                 # sample name
        line.append(pwy)                    # pathway name
        line.append(pwy_to_long[pwy])       # pathway long name
        line.append(pwy_to_rxns[pwy][0])    # pathway num reactions
        line.append(pwy_to_rxns[pwy][1])    # pathway covered reactions
        line.append(len(pwy_to_orfs[pwy]))  # num orfs
        if opts.wtd:
            if pwy in pwy_to_wtd:
                line.append(pwy_to_wtd[pwy][0])  # wtd
                line.append(pwy_to_wtd[pwy][1])  # wtd observed taxa
                line.append(pwy_to_wtd[pwy][2])  # wtd expected taxa
            else:
                line.append("NA")
                line.append("NA")
                line.append("NA")
        line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs
        line = map(str, line)              # cast all to string
        out.write("\t".join(line) + "\n")  # write out line

    try:
        out.close()  # close file
    except:
        print("Had problems closing file: " + opts.table_out)