def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                             results_dictionary, orf_dictionary, contig,
                             candidate_orf_pos, orfid, compact_output):
    """Write one annotated GFF line for an ORF to ``outputgff_file``.

    The line is made of the ORF's ``seqname`` followed by its ``source``,
    ``feature``, ``start``, ``end``, ``score``, ``strand`` and ``frame``
    values (tab separated) and a final ``;``-separated attribute column.

    Parameters:
        outputgff_file: destination handed to ``fprintf``.
        candidatedbname: name of the database the annotation comes from;
            also written as the ``sourcedb`` attribute.
        dbname_weight: not used by this function.
        results_dictionary: mapping ``dbname -> orfid -> record``; the
            record's ``value``, ``ec`` and ``product`` entries are read.
        orf_dictionary: mapping ``contig -> sequence of ORF records``.
        contig: key into ``orf_dictionary``.
        candidate_orf_pos: index of the ORF within
            ``orf_dictionary[contig]``.
        orfid: key into ``results_dictionary[candidatedbname]``.
        compact_output: not used by this function.

    Any failure while building or writing the line is reported through
    ``eprintf``, the traceback is printed, and ``exit_process()`` is called.
    """
    try:
        orf = orf_dictionary[contig][candidate_orf_pos]

        columns = [orf['seqname']]
        for field in ('source', 'feature', 'start', 'end', 'score',
                      'strand', 'frame'):
            columns.append(str(orf[field]))

        # Prefer the shortened identifiers; fall back to the raw values if
        # the shortening helper fails for any reason.
        try:
            attributes = [
                "ID=" + ShortenORFId(orf['id']),
                "locus_tag=" + ShortenORFId(orf['locus_tag']),
            ]
        except Exception:
            attributes = [
                "ID=" + orf['id'],
                "locus_tag=" + orf['locus_tag'],
            ]

        attributes.append("contig_length=" + orf['contig_length'])
        attributes.append("orf_length=" + orf['orf_length'])
        attributes.append("partial=" + orf['partial'])
        attributes.append("sourcedb=" + candidatedbname)

        has_annotation = candidatedbname in results_dictionary
        if has_annotation:
            hit = results_dictionary[candidatedbname][orfid]
            attributes.append("annotvalue=" + str(hit['value']))
            attributes.append("ec=" + str(hit['ec']))
            attributes.append("product=" + hit['product'])
        else:
            # NOTE(review): these defaults are built but never written,
            # because the line is only emitted when the database is present
            # in results_dictionary (see below). Confirm whether the
            # un-annotated line was meant to be written as well.
            attributes.append("annotvalue=" + str('0'))
            attributes.append("ec=" + str(''))
            attributes.append("product=" + 'hypothetical protein')

        columns.append(";".join(attributes))

        if has_annotation:
            fprintf(outputgff_file, "%s\n", "\t".join(columns))
    except Exception:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in a print (that only adds a stray "None").
        traceback.print_exc(10)
        exit_process()
def insert_orf_into_dict(line, contig_dict, shortenorfid=False):
    """Parse one tab-separated GFF line and file it under its sequence name.

    Parameters:
        line: a text line; it is split on tabs and every field is stripped.
        contig_dict: mapping ``seqname -> list of attribute dicts``; it is
            updated in place.
        shortenorfid: when true, the sequence name is passed through
            ``ShortenContigId``; if that call fails the raw name is kept.

    Lines that do not have exactly nine fields are ignored (the function
    returns ``None`` without touching ``contig_dict``).

    Raises:
        ValueError: if the start or end column is not an integer.
    """
    fields = [field.strip() for field in line.split('\t')]
    if len(fields) != 9:
        return

    seqname = fields[0]
    if shortenorfid:
        try:
            seqname = ShortenContigId(fields[0])
        except Exception:
            seqname = fields[0]

    # The score column is kept as text when it is not numeric.
    try:
        score = float(fields[5])
    except ValueError:
        score = fields[5]

    attributes = {
        'seqname': seqname,  # duplicated here although it is also the key
        'source': fields[1],
        'feature': fields[2],
        'start': int(fields[3]),
        'end': int(fields[4]),
        'score': score,
        'strand': fields[6],
        'frame': fields[7],
    }
    # The ninth column is handed to split_attributes together with the
    # dict being built.
    split_attributes(fields[8], attributes)

    contig_dict.setdefault(seqname, []).append(attributes)
def create_annotation(results_dictionary, dbname, annotated_gff, output_dir,
                      Taxons, orfsPicked, orfToContig, lca,
                      compact_output=False, sample_name=""):
    """Append one row per picked ORF to the functional/taxonomic table.

    The table is ``<output_dir><PATHDELIM><sample_name>.functional_and_taxonomic_table.txt``
    and is opened in append mode. Each row holds, tab separated: ORF id,
    ORF length, start, end, contig name, contig length, strand, EC,
    taxonomy and product.

    Parameters:
        results_dictionary: not used by this function.
        dbname: not used by this function.
        annotated_gff: passed to ``GffFileParser``.
        output_dir: directory for the table; created if missing.
        Taxons: mapping ``short ORF id -> taxonomy``.
        orfsPicked: container of short ORF ids to report; other ORFs are
            skipped.
        orfToContig: mapping updated in place with
            ``short ORF id -> contig`` for every reported ORF.
        lca: object providing ``get_supported_taxon`` and
            ``get_preferred_taxonomy``.
        compact_output: when true, the ORF id and contig name are passed
            through ``ShortenORFId`` / ``ShortenContigId`` before writing.
        sample_name: prefix of the output file name.
    """
    if not path.exists(output_dir):
        makedirs(output_dir)

    gffreader = GffFileParser(annotated_gff)
    output_table_name = (output_dir + PATHDELIM + sample_name +
                         ".functional_and_taxonomic_table.txt")

    # 'with' guarantees the table is closed even if a row fails midway.
    with open(output_table_name, 'a') as output_table_file:
        for contig in gffreader:
            for orf in gffreader.orf_dictionary[contig]:
                shortORFId = getShortORFId(orf['id'])
                if shortORFId not in orfsPicked:
                    continue

                orfToContig[shortORFId] = contig

                if shortORFId in Taxons:
                    taxonomy_id = lca.get_supported_taxon(
                        Taxons[shortORFId], return_id=True)
                    # Fall back to the recorded taxon when no preferred
                    # taxonomy is returned.
                    taxonomy = (lca.get_preferred_taxonomy(taxonomy_id)
                                or Taxons[shortORFId])
                else:
                    taxonomy = 'root'

                orf_id = orf['id']
                seqname = orf['seqname']
                if compact_output:
                    orf_id = ShortenORFId(orf_id)
                    seqname = ShortenContigId(seqname)

                # Collect the whole row first so that a missing key cannot
                # leave a half-written line in the table.
                columns = (
                    orf_id,
                    orf['orf_length'],
                    orf['start'],
                    orf['end'],
                    seqname,
                    orf['contig_length'],
                    orf['strand'],
                    orf['ec'],
                    taxonomy,
                    orf['product'],  # product is written unmodified
                )

                fprintf(output_table_file, "%s", columns[0])
                for value in columns[1:-1]:
                    fprintf(output_table_file, "\t%s", value)
                fprintf(output_table_file, "\t%s\n", columns[-1])
def print_orf_table(results, orfToContig, output_dir, outputfile, compact_output=False):
    """Write a per-ORF table with one column per annotation database.

    NOTE(review): this file defines ``print_orf_table`` a second time
    further down; being the later definition, that one is what the name
    refers to once the module is loaded. Confirm which variant is intended.

    Parameters:
        results: mapping ``dbname -> orfname -> list of hit records``; each
            hit is read through its ``query`` and ``product`` entries.
        orfToContig: mapping ``orfname -> contig name``.
        output_dir: created if it does not exist.
        outputfile: destination handed to ``fprintf``.
        compact_output: when true, contig names are passed through
            ``ShortenContigId``.

    Columns are ORF id, contig id, the six standard databases (cog, kegg,
    seed, cazy, metacyc, refseq) and then every other database name in
    sorted order. A header line is written before the first row; nothing is
    written when there are no ORFs.
    """
    if not path.exists(output_dir):
        makedirs(output_dir)

    # Pass 1: keep, per query ORF and database, the first hit encountered.
    orf_dict = {}
    for dbname in results:
        print("%s %s" % (dbname, len(results[dbname])))
        for orfname in results[dbname]:
            for orf in results[dbname][orfname]:
                entry = orf_dict.setdefault(orf['query'], {})
                if dbname in entry:
                    # only the best hit prevails
                    continue

                entry['contig'] = orfToContig[orfname]
                product = orf['product'].strip()

                if re.search(r'cog', dbname, re.I):
                    entry[dbname] = cog_id(product)
                elif re.search(r'kegg', dbname, re.I):
                    entry[dbname] = kegg_id(product)
                elif re.search(r'cazy', dbname, re.I):
                    entry[dbname] = cazy_id(product)
                elif re.search(r'metacyc', dbname, re.I):
                    entry[dbname] = product
                elif re.search(r'seed', dbname, re.I):
                    entry[dbname] = seed_id(product)
                else:
                    entry[dbname] = product

    # Pass 2: map each column key to the concrete database name. A database
    # is assigned to the first standard tag it matches; otherwise it is its
    # own column key.
    database_maps = {}
    for dbname in results:
        for tag in ('cog', 'kegg', 'cazy', 'seed', 'metacyc', 'refseq'):
            if re.search(tag, dbname, re.I):
                database_maps[tag] = dbname
                break
        else:
            database_maps[dbname] = dbname

    std_dbnames = ['cog', 'kegg', 'seed', 'cazy', 'metacyc', 'refseq']
    # Copy the list: the column list grows with non-standard databases and
    # must not feed back into the "is this a standard database" check.
    dbnames = list(std_dbnames)
    headers = ["# ORF_ID", "CONTIG_ID"]
    headers.extend(name.upper() for name in std_dbnames)

    for dbname in sorted(results):
        if not any(re.search(tag, dbname, re.I) for tag in std_dbnames):
            dbnames.append(dbname)
            # Label the extra column with the database it actually holds.
            headers.append(dbname)

    addHeader = True
    for orfn in orf_dict:
        entry = orf_dict[orfn]

        contigName = entry['contig']
        if compact_output:
            contigName = ShortenContigId(contigName)

        row = [orfn, contigName]
        for dbname in dbnames:
            if dbname in database_maps and database_maps[dbname] in entry:
                row.append(entry[database_maps[dbname]])
            else:
                row.append("")

        if addHeader:
            # NOTE(review): headers[0] already starts with "# ", so the
            # emitted header begins with "# # ORF_ID". Left unchanged in
            # case consumers rely on it - confirm.
            fprintf(outputfile, "# %s\n", '\t'.join(headers))
            addHeader = False

        fprintf(outputfile, "%s\n", '\t'.join(row))
def print_orf_table(results, orfToContig, output_dir, outputfile, compact_output=False):
    """Write a fixed eight-column table of per-ORF annotations.

    Each row is: ORF id, contig, COG, KEGG, SEED, CAZY, MetaCyc and RefSeq
    values, tab separated, with an empty string where a database has no hit
    for the ORF. No header line is written.

    NOTE(review): an earlier definition of ``print_orf_table`` exists in
    this file; this later one replaces it when the module is loaded.

    Parameters:
        results: mapping ``dbname -> orfname -> list of hit records``; each
            hit is read through its ``query`` and ``product`` entries.
        orfToContig: mapping ``orfname -> contig name``.
        output_dir: created if it does not exist.
        outputfile: destination handed to ``fprintf``.
        compact_output: when true, contig names are passed through
            ``ShortenContigId``.
    """
    if not path.exists(output_dir):
        makedirs(output_dir)

    # Stage 1: for every query ORF remember the first hit seen per database.
    best_hits = {}
    for db in results.keys():
        print("%s %s" % (db, len(results[db].keys())))
        for orf_key in results[db]:
            for hit in results[db][orf_key]:
                query = hit['query']
                if query not in best_hits:
                    best_hits[query] = {}
                record = best_hits[query]
                if db in record:
                    continue

                record['contig'] = orfToContig[orf_key]
                product = hit['product'].strip()

                # The order of these checks decides which conversion wins
                # when a database name matches more than one tag.
                if re.search(r'cog', db, re.I):
                    record[db] = cog_id(product)
                elif re.search(r'kegg', db, re.I):
                    record[db] = kegg_id(product)
                elif re.search(r'cazy', db, re.I):
                    record[db] = cazy_id(product)
                elif re.search(r'metacyc', db, re.I):
                    record[db] = product
                elif re.search(r'seed', db, re.I):
                    record[db] = seed_id(product)
                else:
                    record[db] = product

    # Stage 2: resolve each standard tag to a concrete database name. A
    # database whose name matches several tags is recorded under each.
    db_for_tag = {}
    for db in results.keys():
        for tag in ('cog', 'kegg', 'cazy', 'seed', 'metacyc', 'refseq'):
            if re.search(tag, db, re.I):
                db_for_tag[tag] = db

    # Stage 3: one output line per ORF, columns in this fixed order.
    column_order = ('cog', 'kegg', 'seed', 'cazy', 'metacyc', 'refseq')
    sample_label = None
    for query in best_hits:
        record = best_hits[query]

        cells = []
        for tag in column_order:
            if tag in db_for_tag and db_for_tag[tag] in record:
                cells.append(record[db_for_tag[tag]])
            else:
                cells.append("")

        # The sample name is looked up once (until a truthy value comes
        # back); the result is not used further in this function.
        if not sample_label:
            sample_label = getSampleNameFromContig(record['contig'])

        contig_label = record['contig']
        if compact_output:
            contig_label = ShortenContigId(contig_label)

        fprintf(outputfile, "%s\n", "\t".join([query, contig_label] + cells))