def main(argv):
    (opts, args) = parser.parse_args()
    if check_arguments(opts, args):
        print usage
        sys.exit(0)

    input_folder = opts.input_folder
    output_file = opts.output_file

    filePATTERN = re.compile(r'.*COG[0-9]*.*\.fa')
    # note: the trailing dot before "fa" was unescaped in the original pattern
    cogSeqMatchesPATTERN = re.compile(r'[a-zA-Z]*_(.*)__[0-9]*__*(COG[0-9]*).*\.fa')

    cog_hits = []  # renamed from "list" to avoid shadowing the builtin
    for filename in listdir(input_folder):
        if filePATTERN.match(filename):
            hits = cogSeqMatchesPATTERN.search(filename)
            if hits:
                cog_hits.append((hits.group(1), hits.group(2)))

    try:
        outputfile = open(output_file, 'w')
    except IOError:
        print "Cannot open output file for MLTreeMap hits"
        sys.exit(0)

    fprintf(outputfile, "Sequences\tCOG\n")
    for seq, cog in cog_hits:
        fprintf(outputfile, "%s\t%s\n", seq, cog)
    outputfile.close()
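# The functions in this file lean on small printf-style helpers, `fprintf` and
# `printf`, that are defined elsewhere in the project. Minimal sketches,
# assuming they simply format their arguments with the % operator and write to
# an open file handle (or stdout); no newline is added implicitly:
def fprintf(fh, fmt, *args):
    # Format and write to an already-open file handle.
    fh.write(fmt % args)

def printf(fmt, *args):
    # Companion helper assumed by the commented-out debug lines below.
    sys.stdout.write(fmt % args)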
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                             results_dictionary, orf_dictionary, contig,
                             candidate_orf_pos, orfid):
    fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']

    output_line = orf_dictionary[contig][candidate_orf_pos]['seqname']
    for field in fields:
        # printf("\t%s", orf_dictionary[contig][candidate_orf_pos][field])
        output_line += "\t" + str(orf_dictionary[contig][candidate_orf_pos][field])

    attributes = "ID=" + orf_dictionary[contig][candidate_orf_pos]['id']
    attributes += ";" + "locus_tag=" + orf_dictionary[contig][candidate_orf_pos]['locus_tag']
    attributes += ";" + "contig_length=" + orf_dictionary[contig][candidate_orf_pos]['contig_length']
    attributes += ";" + "orf_length=" + orf_dictionary[contig][candidate_orf_pos]['orf_length']
    attributes += ";" + "partial=" + orf_dictionary[contig][candidate_orf_pos]['partial']
    attributes += ";" + "sourcedb=" + candidatedbname

    if candidatedbname in results_dictionary:
        attributes += ";" + "annotvalue=" + str(results_dictionary[candidatedbname][orfid]['value'])
        attributes += ";" + "ec=" + str(results_dictionary[candidatedbname][orfid]['ec'])
        attributes += ";" + "product=" + results_dictionary[candidatedbname][orfid]['product']
    else:
        attributes += ";" + "annotvalue=" + str('0')
        attributes += ";" + "ec=" + str('')
        attributes += ";" + "product=" + 'hypothetical protein'

    output_line += '\t' + attributes
    fprintf(outputgff_file, "%s\n", output_line)
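# For illustration, a hypothetical GFF line this function emits (source and
# attribute values are made up):
#
#   contig_1  FGS  CDS  100  400  0  +  0  ID=contig_1_1;locus_tag=contig_1_1;
#   contig_length=1200;orf_length=300;partial=00;sourcedb=refseq;
#   annotvalue=250.0;ec=1.1.1.1;product=alcohol dehydrogenase
#
# (shown wrapped here; the actual output is a single tab-separated line)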
def print_orf_table(results, refseq2peg, output_dir):
    if not path.exists(output_dir):
        makedirs(output_dir)

    outputfile = open(output_dir + '/ORF_annotation_table.txt', 'w')

    orf_dict = {}
    for dbname in ['refseq', 'cog', 'kegg']:
        for seqname in results[dbname]:
            for orf in results[dbname][seqname]:
                if not orf['query'] in orf_dict:
                    orf_dict[orf['query']] = {}

                if dbname == 'cog':
                    cog = cog_id(orf['product'])
                    orf_dict[orf['query']][dbname] = cog

                if dbname == 'kegg':
                    kegg = kegg_id(orf['product'])
                    orf_dict[orf['query']][dbname] = kegg

                if dbname == 'refseq':
                    refseq = refseq_id(orf['target'])
                    if refseq in refseq2peg:
                        refseq = refseq2peg[refseq]
                    else:
                        refseq = ""
                    orf_dict[orf['query']][dbname] = refseq

                orf_dict[orf['query']]['contig'] = seqname

    for orfn in orf_dict:
        if 'cog' in orf_dict[orfn]:
            cogFn = orf_dict[orfn]['cog']
        else:
            cogFn = ""

        if 'kegg' in orf_dict[orfn]:
            keggFn = orf_dict[orfn]['kegg']
        else:
            keggFn = ""

        if 'metacyc' in orf_dict[orfn]:
            metacycPwy = orf_dict[orfn]['metacyc']
        else:
            metacycPwy = ""

        if 'refseq' in orf_dict[orfn]:
            refseqFn = orf_dict[orfn]['refseq']
        else:
            refseqFn = ""

        fprintf(outputfile, "%s\n", orfn + "\t" + orf_dict[orfn]['contig'] + '\t' +
                cogFn + '\t' + keggFn + '\t' + refseqFn + '\t' + metacycPwy)

    outputfile.close()
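# A hypothetical row of ORF_annotation_table.txt, tab-separated in the column
# order written above (ORF id, contig, COG, KEGG, RefSeq/peg, MetaCyc; all
# values made up):
#
#   orf_00001   contig_12   COG0583   K03563   peg.123   PWY-101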
def copy_faa_gff_orf_prediction(source_files, target_files):
    for source, target in zip(source_files, target_files):
        #print source + ' ' + target
        sourcefile = open(source, 'r')
        targetfile = open(target, 'w')
        sourcelines = sourcefile.readlines()
        for line in sourcelines:
            fprintf(targetfile, "%s\n", line.strip())
        sourcefile.close()
        targetfile.close()
def print_orf_table(results, output_dir):
    if not path.exists(output_dir):
        makedirs(output_dir)

    outputfile = open(output_dir + '/ORF_annotation_table.txt', 'w')

    orf_dict = {}
    for dbname in results.iterkeys():
        for seqname in results[dbname]:
            for orf in results[dbname][seqname]:
                if not orf['query'] in orf_dict:
                    orf_dict[orf['query']] = {}

                if dbname == 'cog':
                    cog = cog_id(orf['product'])
                    orf_dict[orf['query']][dbname] = cog

                if dbname == 'kegg':
                    kegg = kegg_id(orf['product'])
                    orf_dict[orf['query']][dbname] = kegg

                if dbname == 'seed':
                    seed = orf['product']
                    orf_dict[orf['query']][dbname] = re.sub(r'\[.*\]', '', seed).strip()

                orf_dict[orf['query']]['contig'] = seqname

    for orfn in orf_dict:
        if 'cog' in orf_dict[orfn]:
            cogFn = orf_dict[orfn]['cog']
        else:
            cogFn = ""

        if 'kegg' in orf_dict[orfn]:
            keggFn = orf_dict[orfn]['kegg']
        else:
            keggFn = ""

        if 'metacyc' in orf_dict[orfn]:
            metacycPwy = orf_dict[orfn]['metacyc']
        else:
            metacycPwy = ""

        if 'seed' in orf_dict[orfn]:
            seedFn = orf_dict[orfn]['seed']
        else:
            seedFn = ""

        fprintf(outputfile, "%s\n", orfn + "\t" + orf_dict[orfn]['contig'] + '\t' +
                cogFn + '\t' + keggFn + '\t' + seedFn + '\t' + metacycPwy)

    outputfile.close()
def print_counts_at_level(hierarchical_map, field_to_description, depth, level, outputfile):
    # Leaves of the hierarchy are plain integer counts; inner nodes are dicts.
    if type(hierarchical_map) is type(0):
        return hierarchical_map

    count = 0
    for key in hierarchical_map:
        tempcount = print_counts_at_level(hierarchical_map[key], field_to_description,
                                          depth + 1, level, outputfile)
        if depth == level:
            if key in field_to_description:
                fprintf(outputfile, "%s\n", key + '\t' + field_to_description[key] + '\t' + str(tempcount))
            else:
                fprintf(outputfile, "%s\n", key + '\t' + ' ' + '\t' + str(tempcount))
        count += tempcount
    return count
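# A hypothetical call, to show the recursion: `level` selects the depth whose
# subtree totals get printed.
#
#   hierarchy = {'K01': {'K0101': 3, 'K0102': 2}, 'K02': {'K0201': 5}}
#   with open('counts.txt', 'w') as out:
#       total = print_counts_at_level(hierarchy, {'K01': 'Metabolism'}, 0, 0, out)
#
#   # counts.txt now lists K01 -> 5 (with its description) and K02 -> 5;
#   # total is 10.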
def write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, tag):
    fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
    for rRNA in rRNA_dictionary:
        output_line = rRNA_dictionary[rRNA]['seqname']
        for field in fields:
            output_line += "\t" + str(rRNA_dictionary[rRNA][field])

        attributes = "ID=" + rRNA_dictionary[rRNA]['seqname'] + tag
        attributes += ";" + "locus_tag=" + rRNA_dictionary[rRNA]['seqname'] + tag
        attributes += ";" + "ec="
        attributes += ";" + "product=" + rRNA_dictionary[rRNA]['product']

        output_line += '\t' + attributes
        fprintf(outputgff_file, "%s\n", output_line)
def write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, tag):
    fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
    for rRNA in rRNA_dictionary:
        output_line = rRNA_dictionary[rRNA]['seqname']
        for field in fields:
            output_line += "\t" + str(rRNA_dictionary[rRNA][field])

        attributes = "ID=" + rRNA_dictionary[rRNA]['seqname'] + tag
        attributes += ";" + "locus_tag=" + rRNA_dictionary[rRNA]['seqname'] + tag
        attributes += ";" + "orf_length=0"
        attributes += ";" + "contig_length=0"
        attributes += ";" + "ec="
        attributes += ";" + "product=" + rRNA_dictionary[rRNA]['product']

        output_line += '\t' + attributes
        fprintf(outputgff_file, "%s\n", output_line)
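# An example (hypothetical) entry of the dictionary these writers consume;
# every key below is required by the loops above:
#
#   rRNA_dictionary = {
#       'contig_5': {'seqname': 'contig_5', 'source': 'BLAST', 'feature': 'rRNA',
#                    'start': 120, 'end': 1650, 'score': '0', 'strand': '+',
#                    'frame': '0', 'product': '16S rRNA'},
#   }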
def add_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open(blast_table_out, 'r')

    refscores = {}
    lines = infile.readlines()
    for line in lines:
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) != 12:
            print 'Error in the blastout file'
            sys.exit(1)
        # The step that records the self-hit bitscore was missing in the
        # original (refscores was never populated); reconstructed here by
        # analogy with add_last_refscore_to_file below, assuming 12-column
        # BLAST tabular output with the bitscore in the last column.
        if fields[0].rstrip() == fields[1].rstrip():
            refscores[fields[0]] = fields[11]

    for key, value in refscores.iteritems():
        allNames[key] = True
        fprintf(refscore_file, "%s\t%s\n", key, value)

    infile.close()
def process_gbk_file(input_gbk, output_gbk, headers, gff_dictionary):
    tag = re.sub(r'[.]gbk', '', input_gbk)
    tag = re.sub(r'.*/', '', tag)

    output_gbk_file = open(output_gbk, 'w')
    serializer = genbank.GenBankRecordSerializer()
    with open(input_gbk, 'r') as genbank_file:
        out_list = []
        count = 0
        for record in genbank.GenBankRecordParser(genbank_file.read()):
            count += 1
            record.locus = tag + str(count)

            if count % 1000 == 0:
                print 'Count = ' + str(count)

            if headers and 'REFERENCES' in headers:
                record.references_ = headers['REFERENCES']

            i = 0
            for feature in record.features:
                if feature.type == "CDS":
                    if feature.locus_tag in gff_dictionary:
                        record.features[i].product = 'aaaaa ' + gff_dictionary[feature.locus_tag]['product']
                i += 1

            #record.locus = "hello"
            out_list.append(serializer.serialize(record))

            # Flush serialized records to disk in batches of 1000.
            if count % 1000 == 0:
                output_str = '\n'.join(out_list)
                out_list = []
                fprintf(output_gbk_file, '%s\n', output_str)

        # Flush whatever remains after the last full batch.
        output_str = '\n'.join(out_list)
        fprintf(output_gbk_file, '%s\n', output_str)

    output_gbk_file.close()
def process_file(genbank_filename, output_fna, output_faa, output_gff):
    with open(genbank_filename, 'r') as genbank_file:
        outputfnafile = open(output_fna, 'w')
        outputfaafile = open(output_faa, 'w')
        outputgfffile = open(output_gff, 'w')

        for record in genbank.GenBankRecordParser(genbank_file.read()):
            fprintf(outputfnafile, ">%s\n%s\n", record.locus, record.sequence)
            for feature in record.features:
                if feature.type == 'CDS':
                    fprintf(outputfaafile, ">%s\n%s\n", feature.locus_tag, feature.translation)

                    start, end, strand = feature.coordinates
                    gff_Str = record.locus
                    gff_Str += '\t' + 'Genbank file'
                    gff_Str += '\t' + 'CDS'
                    gff_Str += '\t' + start
                    gff_Str += '\t' + end
                    gff_Str += '\t' + '0'
                    gff_Str += '\t' + strand
                    gff_Str += '\t' + '0'
                    gff_Str += '\tID=' + feature.locus_tag
                    gff_Str += ';locus_tag=' + feature.locus_tag
                    if feature.product:
                        gff_Str += ';product=' + feature.product
                    fprintf(outputgfffile, "%s\n", gff_Str)

        outputfnafile.close()
        outputgfffile.close()
        outputfaafile.close()
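# `genbank` is a project-local module, not shown in this file. From its use in
# the two functions above, the assumed interface is roughly:
#
#   genbank.GenBankRecordParser(text)  -- iterates records carrying .locus,
#                                         .sequence, .references_, .features
#   feature (per record)               -- has .type, .locus_tag, .translation,
#                                         .product, and .coordinates, where
#                                         .coordinates unpacks to
#                                         (start, end, strand) as strings
#   genbank.GenBankRecordSerializer()  -- .serialize(record) returns the
#                                         record as GenBank-format text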
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                             results_dictionary, orf_dictionary, contig,
                             candidate_orf_pos, orfid):
    fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
    # print contig
    # print orf_dictionary[contig]
    # print results_dictionary
    output_line = orf_dictionary[contig][candidate_orf_pos]['seqname']
    for field in fields:
        # printf("\t%s", orf_dictionary[contig][candidate_orf_pos][field])
        output_line += "\t" + str(orf_dictionary[contig][candidate_orf_pos][field])

    attributes = "ID=" + orf_dictionary[contig][candidate_orf_pos]['id']
    attributes += ";" + "locus_tag=" + orf_dictionary[contig][candidate_orf_pos]['locus_tag']
    attributes += ";" + "partial=" + orf_dictionary[contig][candidate_orf_pos]['partial']
    attributes += ";" + "sourcedb=" + candidatedbname

    if candidatedbname in results_dictionary:
        attributes += ";" + "annotvalue=" + str(results_dictionary[candidatedbname][orfid]['value'])
        attributes += ";" + "ec=" + str(results_dictionary[candidatedbname][orfid]['ec'])
        attributes += ";" + "product=" + results_dictionary[candidatedbname][orfid]['product']
    else:
        attributes += ";" + "annotvalue=" + str('0')
        attributes += ";" + "ec=" + str('')
        attributes += ";" + "product=" + 'hypothetical protein'

    output_line += '\t' + attributes
    fprintf(outputgff_file, "%s\n", output_line)
def add_last_refscore_to_file(blast_table_out, refscore_file, allNames):
    commentPATTERN = re.compile(r'^#')

    infile = open(blast_table_out, 'r')

    refscores = {}
    lines = infile.readlines()
    for line in lines:
        if commentPATTERN.match(line):
            continue
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) != 12:
            print 'Error in the blastout file'
            sys.exit(1)
        if fields[6].rstrip() == fields[1].rstrip():
            # fprintf(refscore_file, "%s\t%s\n", fields[0], fields[11])
            refscores[fields[1]] = fields[0]

    for key, value in refscores.iteritems():
        allNames[key] = True
        fprintf(refscore_file, "%s\t%s\n", key, value)

    infile.close()
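# For reference, the column layout assumed above is LAST's 12-column tabular
# output:
#
#   score, name1, start1, alnSize1, strand1, seqSize1,
#   name2, start2, alnSize2, strand2, seqSize2, blocks
#
# so fields[1] and fields[6] are the two sequence names and fields[0] is the
# alignment score; the comparison keeps only self-hits.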
def print_counts_at_level(hierarchical_map, field_to_description, depth, level,
                          outputfile, printKey=True, header=None):
    if type(hierarchical_map) is type(0):
        return hierarchical_map

    if header:
        fprintf(outputfile, "%s\n", header)

    count = 0
    for key in hierarchical_map:
        tempcount = print_counts_at_level(hierarchical_map[key], field_to_description,
                                          depth + 1, level, outputfile, printKey=printKey)
        if depth == level:
            if key in field_to_description:
                if printKey:
                    fprintf(outputfile, "%s\n", key + '\t' + field_to_description[key] + '\t' + str(tempcount))
                else:
                    fprintf(outputfile, "%s\n", field_to_description[key] + '\t' + str(tempcount))
            else:
                if printKey:
                    fprintf(outputfile, "%s\n", key + '\t' + ' ' + '\t' + str(tempcount))
                else:
                    fprintf(outputfile, "%s\n", key + '\t' + str(tempcount))
        count += tempcount
    return count
def process_blastoutput(dbname, blastoutput, mapfile, refscore_file, opts):
    blastparser = BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts)

    fields = ['q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec']
    if opts.taxonomy:
        fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = blastoutput + '.parsed.txt'
    outputfile = open(output_blastoutput_parsed, 'w')

    fprintf(outputfile, "#%s", 'query')
    for field in fields:
        fprintf(outputfile, "\t%s", field)
    fprintf(outputfile, "\n")

    for data in blastparser:
        if not data:
            continue
        try:
            fprintf(outputfile, "%s", data['query'])
        except:
            print data
            sys.exit()
        for field in fields:
            fprintf(outputfile, "\t%s", data[field])
        fprintf(outputfile, "\n")

    outputfile.close()
    # add_refscore_to_file(blastoutput, refscore_file, allNames)
    return None
def create_annotation(results_dictionary, annotated_gff, output_dir,
                      ncbi_taxonomy_tree_file, min_score, top_percent, min_support):
    meganTree = None
    lca = None
    if 'refseq' in results_dictionary:
        lca = LCAComputation(ncbi_taxonomy_tree_file)
        lca.setParameters(min_score, top_percent, min_support)
        meganTree = MeganTree(lca)

    if not path.exists(output_dir):
        makedirs(output_dir)

    orf_dictionary = {}
    #process_gff_file(annotated_gff, orf_dictionary)
    gffreader = GffFileParser(annotated_gff)

    output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'w')
    fprintf(output_table_file,
            "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n")

    count = 0
    for contig in gffreader:
        for orf in gffreader.orf_dictionary[contig]:
            taxonomy = None
            if count % 10000 == 0:
                pass

            species = []
            if 'refseq' in results_dictionary:
                if orf['id'] in results_dictionary['refseq']:
                    for hit in results_dictionary['refseq'][orf['id']]:
                        if hit['bitscore'] >= min_score:
                            names = get_species(hit)
                            if names:
                                species.append(names)
                            #print '---------------------------'
                        # else:
                        #     print "hit " + hit['query'] + ' ' + hit['dbname'] + ' ' + str(hit['bitscore'])

            if lca:
                taxonomy = lca.getTaxonomy(species)

            fprintf(output_table_file, "%s", orf['id'])
            fprintf(output_table_file, "\t%s", orf['orf_length'])
            fprintf(output_table_file, "\t%s", orf['start'])
            fprintf(output_table_file, "\t%s", orf['end'])
            fprintf(output_table_file, "\t%s", orf['seqname'])
            fprintf(output_table_file, "\t%s", orf['contig_length'])
            fprintf(output_table_file, "\t%s", orf['strand'])
            fprintf(output_table_file, "\t%s", orf['ec'])
            # fprintf(output_table_file, "\t%s", str(species))
            fprintf(output_table_file, "\t%s", taxonomy)
            fprintf(output_table_file, "\t%s\n", orf['product'])

            if meganTree and taxonomy != '':
                meganTree.insertTaxon(taxonomy)
                # print 'inserted taxon of taxonomy : ', taxonomy
    #print meganTree.getChildToParentMap()

    output_table_file.close()

    # print meganTree.getParentToChildrenMap()
    if meganTree:
        print output_dir + '/megan_tree.tre'
        megan_tree_file = open(output_dir + '/megan_tree.tre', 'w')
        #print meganTree.printTree('1')
        # exit()
        fprintf(megan_tree_file, "%s;", meganTree.printTree('1'))
        # print 'wrote out megan_tree_file'
        megan_tree_file.close()
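# A hypothetical call, with made-up thresholds, to show how the LCA knobs map
# onto the signature:
#
#   create_annotation(results, 'sample.annotated.gff', 'output/',
#                     'ncbi_taxonomy_tree.txt',
#                     min_score=20, top_percent=10, min_support=5)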
def main(argv):
    (opts, args) = parser.parse_args()
    if not valid_arguments(opts, args):
        print usage
        sys.exit(0)

    min_length = opts.min_length
    inputfile = open(opts.input_fasta, 'r')
    outfile = open(opts.output_fasta, 'w')
    logfile = open(opts.log_file, 'w')
    if opts.map_file:
        mapfile = open(opts.map_file, 'w')
    else:
        mapfile = None

    # Derive the sample name from the input file name. Note that re.sub takes
    # flags by keyword; the original passed re.I positionally, where the
    # fourth argument is `count`.
    sample_name = opts.input_fasta
    sample_name = re.sub(r'^.*/', '', sample_name)
    sample_name = re.sub(r'^.*\\', '', sample_name)
    sample_name = re.sub(r'\.fasta$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fna$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.faa$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fas$', '', sample_name, flags=re.I)

    BEFORE = 'BEFORE'
    AFTER = 'AFTER'
    NUMSEQ = "Number of sequences :"
    NUMSEQ_SHORTER = "Number of sequences shorter than "
    AV_LENGTH = "Average length of sequences:"
    MIN_LENGTH = "Minimum length of sequences:"
    MAX_LENGTH = "Maximum length of sequences:"

    stats = {
        MIN_LENGTH: {'BEFORE': 10000000, 'AFTER': 1000000},
        MAX_LENGTH: {'BEFORE': 0, 'AFTER': 0},
        NUMSEQ: {'BEFORE': 0, 'AFTER': 0},
        NUMSEQ_SHORTER: {'BEFORE': 0, 'AFTER': 0},
        AV_LENGTH: {'BEFORE': 0, 'AFTER': 0},
    }

    length_distribution = {}
    length_cumulative_distribution = {}

    for i in range(0, 31):
        length_distribution[i] = 0
        length_cumulative_distribution[i] = 0

    seq_count = 0
    allNames = dict()
    outputStr = ""
    outputLines = []
    for record in read_fasta_records(inputfile):
        seqname = record.name
        seq = record.sequence
        length = len(seq)

        # Bucket lengths into 50-bp bins, capped at bin 30 (>= 1500 bp).
        index = int(len(seq) / 50)
        if index >= 30:
            index = 30
        length_distribution[index] += 1

        if length < stats[MIN_LENGTH][BEFORE]:
            stats[MIN_LENGTH][BEFORE] = length
        if length > stats[MAX_LENGTH][BEFORE]:
            stats[MAX_LENGTH][BEFORE] = length
        # The original compared `length` against the MIN_LENGTH label string;
        # the numeric threshold is clearly what was meant.
        if length < min_length:
            stats[NUMSEQ_SHORTER][BEFORE] += 1
        stats[AV_LENGTH][BEFORE] = stats[AV_LENGTH][BEFORE] + length

        seqvalue = filter_sequence(seq)
        stats[NUMSEQ][BEFORE] += 1

        seqlen = len(seqvalue)
        if seqlen >= min_length:
            stats[NUMSEQ][AFTER] += 1
            stats[AV_LENGTH][AFTER] = stats[AV_LENGTH][AFTER] + seqlen
            if mapfile == None:
                fprintf(outfile, "%s\n", seqname)
            else:
                fprintf(outfile, ">%s\n", sample_name + '_' + str(seq_count))
                key = re.sub(r'^>', '', seqname)
                fprintf(mapfile, "%s\n", sample_name + '_' + str(seq_count) + '\t' + key)
                seq_count += 1
            fprintf(outfile, "%s\n", seqvalue)

            if seqlen < stats[MIN_LENGTH][AFTER]:
                stats[MIN_LENGTH][AFTER] = seqlen
            if seqlen > stats[MAX_LENGTH][AFTER]:
                stats[MAX_LENGTH][AFTER] = seqlen

    if stats[NUMSEQ][BEFORE] > 0:
        stats[AV_LENGTH][BEFORE] = stats[AV_LENGTH][BEFORE] / stats[NUMSEQ][BEFORE]
    else:
        stats[AV_LENGTH][BEFORE] = 0
    if stats[NUMSEQ][AFTER] > 0:
        stats[AV_LENGTH][AFTER] = stats[AV_LENGTH][AFTER] / stats[NUMSEQ][AFTER]
    else:
        stats[AV_LENGTH][AFTER] = 0

    outfile.close()
    inputfile.close()
    if mapfile != None:
        mapfile.close()

    fprintf(logfile, " %s\n", " \tBEFORE\tAFTER")
    fprintf(logfile, " %s\n", NUMSEQ + '\t' + str(stats[NUMSEQ][BEFORE]) + '\t' + str(stats[NUMSEQ][AFTER]))
    fprintf(logfile, " %s\n", NUMSEQ_SHORTER + str(min_length) + ':\t' + str(stats[NUMSEQ_SHORTER][BEFORE]) + '\t' + str(stats[NUMSEQ_SHORTER][AFTER]))
    fprintf(logfile, " %s\n", AV_LENGTH + '\t' + str(stats[AV_LENGTH][BEFORE]) + '\t' + str(stats[AV_LENGTH][AFTER]))
    fprintf(logfile, " %s\n", MIN_LENGTH + '\t' + str(stats[MIN_LENGTH][BEFORE]) + '\t' + str(stats[MIN_LENGTH][AFTER]))
    fprintf(logfile, " %s\n", MAX_LENGTH + '\t' + str(stats[MAX_LENGTH][BEFORE]) + '\t' + str(stats[MAX_LENGTH][AFTER]))
    fprintf(logfile, "\n\n")
    fprintf(logfile, " READ_LENGTH_RANGE\tFREQUENCY\t\tMIN_LENGTH\tCUMULATIVE_FREQUENCY\n")
    fprintf(logfile, " -----------------\t---------\t\t----------\t--------------------\n")

    # Build the cumulative distribution from the top bin down. The original
    # seeded bin 30 with itself (always 0); the bin's own frequency is clearly
    # what was meant.
    i = 30
    length_cumulative_distribution[i] = length_distribution[i]
    i -= 1
    while i >= 0:
        length_cumulative_distribution[i] = length_cumulative_distribution[i + 1] + length_distribution[i]
        i -= 1

    for i in range(0, 31):
        fprintf(logfile, " %s\n", str(i * 50) + '-' + str((i + 1) * 50) + '\t' +
                str(length_distribution[i]) + '\t\t\t' + str((i + 1) * 50) + '\t' +
                str(length_cumulative_distribution[i]))

    logfile.close()
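# The filtering code above depends on two helpers defined elsewhere in the
# project: `read_fasta_records`, whose records keep the leading '>' in `name`
# (both callers strip it themselves), and `filter_sequence`, which sanitizes a
# sequence. Minimal sketches under those assumptions:
from collections import namedtuple

FastaRecord = namedtuple('FastaRecord', ['name', 'sequence'])

def read_fasta_records(fh):
    # Stream FASTA records from an open file handle.
    name, seq = None, []
    for line in fh:
        line = line.strip()
        if line.startswith('>'):
            if name is not None:
                yield FastaRecord(name, ''.join(seq))
            name, seq = line, []
        else:
            seq.append(line)
    if name is not None:
        yield FastaRecord(name, ''.join(seq))

def filter_sequence(seq):
    # Assumed behaviour: drop anything that is not an unambiguous nucleotide.
    return re.sub(r'[^ACGTacgt]', '', seq)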
def create_annotation(results_dictionary, annotated_gff, output_dir):
    tree_file = 'blastDB/ncbi_taxonomy_tree.txt'
    lca = LCAComputation(tree_file)

    if not path.exists(output_dir):
        makedirs(output_dir)

    orf_dictionary = {}
    #process_gff_file(annotated_gff, orf_dictionary)
    gffreader = GffFileParser(annotated_gff)

    output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'w')
    fprintf(output_table_file, "id\tseqname\tstart\tend\tstrand\tec\ttaxonomy\tproduct\n")

    count = 0
    for contig in gffreader:
        for orf in gffreader.orf_dictionary[contig]:
            #print orf
            if count % 10000 == 0:
                # print "fandt " + str(count)
                pass

            species = []
            if 'refseq' in results_dictionary:
                if orf['id'] in results_dictionary['refseq']:
                    for hit in results_dictionary['refseq'][orf['id']]:
                        names = get_species(hit)
                        if names:
                            species.append(names)
            #print species
            #print '---------------------------'
            taxonomy = lca.getTaxonomy(species)

            fprintf(output_table_file, "%s", orf['id'])
            fprintf(output_table_file, "\t%s", orf['seqname'])
            fprintf(output_table_file, "\t%s", orf['start'])
            fprintf(output_table_file, "\t%s", orf['end'])
            fprintf(output_table_file, "\t%s", orf['strand'])
            fprintf(output_table_file, "\t%s", orf['ec'])
            #fprintf(output_table_file, "\t%s", str(species))
            fprintf(output_table_file, "\t%s", taxonomy)
            fprintf(output_table_file, "\t%s\n", orf['product'])

    output_table_file.close()
def create_annotation(dbname_weight, results_dictionary, input_gff,
                      rRNA_16S_stats_files, tRNA_stats_files, output_gff,
                      output_comparative_annotation):
    orf_dictionary = {}
    # process_gff_file(input_gff, orf_dictionary)
    gffreader = GffFileParser(input_gff)

    output_gff_tmp = output_gff + ".tmp"
    outputgff_file = open(output_gff_tmp, 'w')
    output_comp_annot_file1 = open(output_comparative_annotation + '.1.txt', 'w')
    output_comp_annot_file2 = open(output_comparative_annotation + '.2.txt', 'w')

    output_comp_annot_file1_Str = 'orf_id\tref dbname\tEC\tproduct\tvalue'
    fprintf(output_comp_annot_file1, '%s\n', output_comp_annot_file1_Str)

    output_comp_annot_file2_Str = 'orf_id'
    dbnames = dbname_weight.keys()
    for dbname in dbnames:
        weight = dbname_weight[dbname]
        output_comp_annot_file2_Str += '\t{0}(EC) \t{0}(product)\t{0}(value)'.format(dbname)
    fprintf(output_comp_annot_file2, '%s\n', output_comp_annot_file2_Str)

    # gffreader = GffReader(input_gff)
    for contig in gffreader:
        count = 0
        for orf in gffreader.orf_dictionary[contig]:
            #print orf
            value = 0.0001
            success = False
            output_comp_annot_file1_Str = ''
            output_comp_annot_file2_Str = ''
            for dbname in dbnames:
                weight = dbname_weight[dbname]
                value = 0
                if orf['id'] in results_dictionary[dbname]:
                    if value < results_dictionary[dbname][orf['id']]['value']:
                        value = results_dictionary[dbname][orf['id']]['value']
                        candidatedbname = dbname
                        success = True
                        candidate_orf_pos = count

                    if output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format('', dbname,
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))
                    else:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf['id'], dbname,
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))

                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format(
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))
                    else:
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf['id'],
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))
                else:
                    if not output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf['id'], '', '', '', '')

                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format('', '', '')
                    else:
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf['id'], '', '', '')

            if success:  # there was a database hit
                fprintf(output_comp_annot_file1, '%s\n', output_comp_annot_file1_Str)
                fprintf(output_comp_annot_file2, '%s\n', output_comp_annot_file2_Str)
                write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                                         results_dictionary, gffreader.orf_dictionary,
                                         contig, candidate_orf_pos, orf['id'])
            else:  # if it was not a hit then it is a hypothetical protein
                #print gffreader.orf_dictionary
                write_annotation_for_orf(outputgff_file, 'None', '0',
                                         results_dictionary, gffreader.orf_dictionary,
                                         contig, count, orf['id'])

            count += 1  # move to the next orf
        #del orf_dictionary[contig]

    output_comp_annot_file1.close()
    output_comp_annot_file2.close()

    # now deal with the rRNA sequences if there is an rRNA stats file
    if len(rRNA_16S_stats_files) > 0:
        rRNA_16S_dictionary = {}
        for rRNA_16S_stats_file in rRNA_16S_stats_files:
            process_rRNA_16S_stats(rRNA_16S_stats_file, rRNA_16S_dictionary)

        rRNA_dictionary = {}
        add_16S_genes(rRNA_16S_dictionary, rRNA_dictionary)
        #print rRNA_dictionary
        write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, '_rRNA')

    # now deal with the tRNA sequences if there is a tRNA stats file
    if len(tRNA_stats_files) > 0:
        tRNA_dictionary = {}
        for tRNA_stats_file in tRNA_stats_files:
            process_tRNA_stats(tRNA_stats_file, tRNA_dictionary)

        tRNA_gff_dictionary = {}
        add_tRNA_genes(tRNA_dictionary, tRNA_gff_dictionary)
        write_16S_tRNA_gene_info(tRNA_gff_dictionary, outputgff_file, '_tRNA')
        #print tRNA_dictionary

    outputgff_file.close()
    rename(output_gff_tmp, output_gff)
def main(argv):
    (opts, args) = parser.parse_args()
    if check_arguments(opts, args):
        print usage
        sys.exit(0)

    input_fasta = opts.input_fasta
    output_file = opts.output_file
    blast_executable = opts.blast_executable
    formatdb_executable = opts.formatdb_executable
    algorithm = opts.algorithm

    # input file to BLAST against itself to compute refscores
    infile = open(input_fasta, 'r')
    # this file has the refscores of the entire file
    outfile = open(output_file, 'w')

    count = 0
    allNames = dict()
    for record in read_fasta_records(infile):
        if count % SIZE == 0:
            if count > 0:
                seq_subset_file.close()
                compute_refscores(formatdb_executable, blast_executable,
                                  seq_subset_file, outfile, allNames, algorithm)
                # now remove the old file
                if algorithm == 'BLAST':
                    remove_blast_index_files(seq_subset_file.name)
                if algorithm == 'LAST':
                    remove_last_index_files(seq_subset_file.name)
                remove(seq_subset_file.name)
            seq_subset_file = open(output_file + '.tmp.' + str(count) + '.fasta', 'w')

        allNames[record.name.replace(">", "")] = False
        fprintf(seq_subset_file, "%s\n", record.name)
        fprintf(seq_subset_file, "%s\n", record.sequence)
        count = count + 1

    #print str(count) + " " + "going to blast last sequence "
    if (count) % SIZE != 0:
        #print str(count) + " " + "last sequence "
        seq_subset_file.close()
        compute_refscores(formatdb_executable, blast_executable,
                          seq_subset_file, outfile, allNames, algorithm)
        remove(seq_subset_file.name)
        if algorithm == 'BLAST':
            remove_blast_index_files(seq_subset_file.name)
        if algorithm == 'LAST':
            remove_last_index_files(seq_subset_file.name)

    #print count
    # Sequences that never received a refscore get a large sentinel value.
    for key in allNames:
        if allNames[key] == False:
            fprintf(outfile, "%s\t%s\n", key, 1000000)
    outfile.close()
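# `SIZE` above is a module-level constant (not shown in this file) that sets
# the batch size for the self-alignment; e.g. a value of 1000 would submit
# sequences to BLAST/LAST in chunks of 1000, each written to a temporary
# '<output_file>.tmp.<count>.fasta' file that is deleted after scoring.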
def process_blastoutput(dbname, blastoutput, mapfile, refscore_file, opts):
    blastparser = BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts)

    fields = ['target', 'q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec']
    if opts.taxonomy:
        fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = blastoutput + '.parsed.txt'
    # a temporary file is used to deal with incomplete processing of the file
    output_blastoutput_parsed_tmp = output_blastoutput_parsed + ".tmp"
    outputfile = open(output_blastoutput_parsed_tmp, 'w')

    # write the headers out
    fprintf(outputfile, "#%s", 'query')
    for field in fields:
        fprintf(outputfile, "\t%s", field)
    fprintf(outputfile, "\n")

    for data in blastparser:
        if not data:
            continue
        try:
            fprintf(outputfile, "%s", data['query'])
        except:
            print 'data is : ', data, '\n'
            sys.exit()
        for field in fields:
            fprintf(outputfile, "\t%s", data[field])
        fprintf(outputfile, "\n")

    outputfile.close()
    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)
    return None
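# A hypothetical invocation, assuming `opts` carries a boolean `taxonomy`
# attribute and the inputs come from a BLAST/LAST run against RefSeq:
#
#   process_blastoutput('refseq', 'sample.refseq.blastout',
#                       'sample.mapping.txt', 'sample.refscores.txt', opts)
#
# The parsed table lands next to the raw output as
# 'sample.refseq.blastout.parsed.txt' once the temporary file is renamed.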
def create_annotation(dbname_weight, results_dictionary, input_gff,
                      rRNA_16S_stats_files, tRNA_stats_files, output_gff,
                      output_comparative_annotation):
    orf_dictionary = {}
    # process_gff_file(input_gff, orf_dictionary)
    gffreader = GffFileParser(input_gff)

    outputgff_file = open(output_gff, 'w')
    output_comp_annot_file1 = open(output_comparative_annotation + '.1.txt', 'w')
    output_comp_annot_file2 = open(output_comparative_annotation + '.2.txt', 'w')

    output_comp_annot_file1_Str = 'orf_id\tref dbname\tEC\tproduct\tvalue'
    fprintf(output_comp_annot_file1, '%s\n', output_comp_annot_file1_Str)

    output_comp_annot_file2_Str = 'orf_id'
    dbnames = dbname_weight.keys()
    for dbname in dbnames:
        weight = dbname_weight[dbname]
        output_comp_annot_file2_Str += '\t{0}(EC) \t{0}(product)\t{0}(value)'.format(dbname)
    fprintf(output_comp_annot_file2, '%s\n', output_comp_annot_file2_Str)

    # gffreader = GffReader(input_gff)
    for contig in gffreader:
        count = 0
        for orf in gffreader.orf_dictionary[contig]:
            #print orf
            value = 0.0001
            success = False
            output_comp_annot_file1_Str = ''
            output_comp_annot_file2_Str = ''
            for dbname in dbnames:
                weight = dbname_weight[dbname]
                value = 0
                if orf['id'] in results_dictionary[dbname]:
                    if value < results_dictionary[dbname][orf['id']]['value']:
                        value = results_dictionary[dbname][orf['id']]['value']
                        candidatedbname = dbname
                        success = True
                        candidate_orf_pos = count

                    if output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format('', dbname,
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))
                    else:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf['id'], dbname,
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))

                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format(
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))
                    else:
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf['id'],
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))
                else:
                    if not output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf['id'], '', '', '', '')

                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format('', '', '')
                    else:
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf['id'], '', '', '')

            if success:  # there was a database hit
                fprintf(output_comp_annot_file1, '%s\n', output_comp_annot_file1_Str)
                fprintf(output_comp_annot_file2, '%s\n', output_comp_annot_file2_Str)
                write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                                         results_dictionary, gffreader.orf_dictionary,
                                         contig, candidate_orf_pos, orf['id'])
            else:  # if it was not a hit then it is a hypothetical protein
                #print gffreader.orf_dictionary
                write_annotation_for_orf(outputgff_file, 'None', '0',
                                         results_dictionary, gffreader.orf_dictionary,
                                         contig, count, orf['id'])

            count += 1  # move to the next orf
        #del orf_dictionary[contig]

    output_comp_annot_file1.close()
    output_comp_annot_file2.close()

    # now deal with the rRNA sequences if there is an rRNA stats file
    if len(rRNA_16S_stats_files) > 0:
        rRNA_16S_dictionary = {}
        for rRNA_16S_stats_file in rRNA_16S_stats_files:
            process_rRNA_16S_stats(rRNA_16S_stats_file, rRNA_16S_dictionary)

        rRNA_dictionary = {}
        add_16S_genes(rRNA_16S_dictionary, rRNA_dictionary)
        #print rRNA_dictionary
        write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, '_rRNA')

    # now deal with the tRNA sequences if there is a tRNA stats file
    if len(tRNA_stats_files) > 0:
        tRNA_dictionary = {}
        for tRNA_stats_file in tRNA_stats_files:
            process_tRNA_stats(tRNA_stats_file, tRNA_dictionary)

        tRNA_gff_dictionary = {}
        add_tRNA_genes(tRNA_dictionary, tRNA_gff_dictionary)
        write_16S_tRNA_gene_info(tRNA_gff_dictionary, outputgff_file, '_tRNA')
        #print tRNA_dictionary

    outputgff_file.close()
def main(argv):
    global parser
    (opts, args) = parser.parse_args(argv)
    if check_arguments(opts, args):
        print usage
        sys.exit(0)

    input_fasta = opts.input_fasta
    output_file = opts.output_file
    blast_executable = opts.blast_executable
    formatdb_executable = opts.formatdb_executable
    algorithm = opts.algorithm

    # input file to BLAST against itself to compute refscores
    infile = open(input_fasta, 'r')
    # this file has the refscores of the entire file
    outfile = open(output_file, 'w')

    count = 0
    allNames = dict()
    for record in read_fasta_records(infile):
        if count % SIZE == 0:
            if count > 0:
                seq_subset_file.close()
                compute_refscores(formatdb_executable, blast_executable,
                                  seq_subset_file, outfile, allNames, algorithm)
                # now remove the old file
                if algorithm == 'BLAST':
                    remove_blast_index_files(seq_subset_file.name)
                if algorithm == 'LAST':
                    remove_last_index_files(seq_subset_file.name)
                remove(seq_subset_file.name)
            seq_subset_file = open(output_file + '.tmp.' + str(count) + '.fasta', 'w')

        allNames[record.name.replace(">", "")] = False
        fprintf(seq_subset_file, "%s\n", record.name)
        fprintf(seq_subset_file, "%s\n", record.sequence)
        count = count + 1

    #print str(count) + " " + "going to blast last sequence "
    if (count) % SIZE != 0:
        #print str(count) + " " + "last sequence "
        seq_subset_file.close()
        compute_refscores(formatdb_executable, blast_executable,
                          seq_subset_file, outfile, allNames, algorithm)
        remove(seq_subset_file.name)
        if algorithm == 'BLAST':
            remove_blast_index_files(seq_subset_file.name)
        if algorithm == 'LAST':
            remove_last_index_files(seq_subset_file.name)

    #print count
    # Sequences that never received a refscore get a large sentinel value.
    for key in allNames:
        if allNames[key] == False:
            fprintf(outfile, "%s\t%s\n", key, 1000000)
    outfile.close()
def create_annotation(results_dictionary, annotated_gff, output_dir, ncbi_taxonomy_tree_file):
    lca = LCAComputation(ncbi_taxonomy_tree_file)

    if not path.exists(output_dir):
        makedirs(output_dir)

    orf_dictionary = {}
    #process_gff_file(annotated_gff, orf_dictionary)
    gffreader = GffFileParser(annotated_gff)

    output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'w')
    fprintf(output_table_file,
            "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n")

    meganTree = MeganTree(lca)
    count = 0
    for contig in gffreader:
        for orf in gffreader.orf_dictionary[contig]:
            if count % 10000 == 0:
                # print "fandt " + str(count)
                pass

            species = []
            if 'refseq' in results_dictionary:
                if orf['id'] in results_dictionary['refseq']:
                    for hit in results_dictionary['refseq'][orf['id']]:
                        names = get_species(hit)
                        if names:
                            species.append(names)
            #print species
            #print '---------------------------'
            taxonomy = lca.getTaxonomy(species)

            fprintf(output_table_file, "%s", orf['id'])
            fprintf(output_table_file, "\t%s", orf['orf_length'])
            fprintf(output_table_file, "\t%s", orf['start'])
            fprintf(output_table_file, "\t%s", orf['end'])
            fprintf(output_table_file, "\t%s", orf['seqname'])
            fprintf(output_table_file, "\t%s", orf['contig_length'])
            fprintf(output_table_file, "\t%s", orf['strand'])
            fprintf(output_table_file, "\t%s", orf['ec'])
            #fprintf(output_table_file, "\t%s", str(species))
            fprintf(output_table_file, "\t%s", taxonomy)
            fprintf(output_table_file, "\t%s\n", orf['product'])

            meganTree.insertTaxon(taxonomy)
    #print meganTree.getChildToParentMap()

    output_table_file.close()

    # print meganTree.getParentToChildrenMap()
    megan_tree_file = open(output_dir + '/megan_tree.tre', 'w')
    fprintf(megan_tree_file, "%s;", meganTree.printTree('1'))
    megan_tree_file.close()