def __init__(self, dbname, blastoutput):
    self.dbname = dbname
    self.blastoutput = blastoutput
    self.i = 1
    self.data = {}
    self.fieldmap = {}
    self.seq_beg_pattern = re.compile("#")
    try:
        self.blastoutputfile = open(blastoutput, 'r')
        self.lines = self.blastoutputfile.readlines()
        self.blastoutputfile.close()
        self.size = len(self.lines)
        if not self.seq_beg_pattern.search(self.lines[0]):
            exit_process("First line must have field header names and begin with \"#\"")
        # map each header field name to its column index
        header = self.lines[0].replace('#', '', 1)
        fields = [x.strip() for x in header.rstrip().split('\t')]
        for k, x in enumerate(fields):
            self.fieldmap[x] = k
        eprintf("\nProcessing database : %s\n", dbname)
    except (IOError, IndexError):
        eprintf("Cannot read the B/LAST output file for database : %s\n", dbname)
        exit_process()
def process_blastout_file(blast_file, database, table, errorlogger=None):
    try:
        blastfile = open(blast_file, 'r')
    except IOError:
        eprintf("ERROR : Cannot read file " + blast_file + " !")
        if errorlogger != None:
            errorlogger.write("STATS_rRNA\tERROR\tCannot read blast output file " + blast_file + " for database " + database)
        exit_process()
    blastLines = blastfile.readlines()
    blastfile.close()
    for line in blastLines:
        line = line.strip()
        fields = re.split('\t', line)
        if len(fields) < 12:
            continue
        # coerce the standard 12-column tabular B/LAST fields to their types
        fields[0] = str(fields[0].strip())      # query
        fields[1] = str(fields[1].strip())      # target
        fields[2] = float(fields[2].strip())    # percent identity
        fields[6] = int(fields[6].strip())      # query start
        fields[7] = int(fields[7].strip())      # query end
        fields[10] = float(fields[10].strip())  # evalue
        fields[11] = float(fields[11].strip())  # bitscore
        table[fields[0]] = [fields[2], fields[10], fields[11], fields[1], fields[6], fields[7]]
def write_run_parameters_file(fileName, parameters):
    try:
        paramFile = open(fileName, 'w')
    except IOError:
        eprintf("Cannot write run parameters to file %s!\n", fileName)
        exit_process("Cannot write run parameters to file %s" % (fileName))
    # e.g. 16s_rRNA  {'min_identity': '40', 'max_evalue': '0.000001', 'min_bitscore': '06', 'refdbs': 'silva_104_rep_set,greengenes_db_DW'}
    paramFile.write("\nRun Date : " + str(date.today()) + " \n")
    paramFile.write("\n\nNucleotide Quality Control parameters\n")
    paramFile.write("  min length" + "\t" + str(parameters['quality_control']['min_length']) + "\n")
    paramFile.write("\n\nORF prediction parameters\n")
    paramFile.write("  min length" + "\t" + str(parameters['orf_prediction']['min_length']) + "\n")
    paramFile.write("  algorithm" + "\t" + str(parameters['orf_prediction']['algorithm']) + "\n")
    paramFile.write("\n\nAmino acid quality control and annotation parameters\n")
    paramFile.write("  min bit score" + "\t" + str(parameters['annotation']['min_score']) + "\n")
    paramFile.write("  min seq length" + "\t" + str(parameters['annotation']['min_length']) + "\n")
    paramFile.write("  annotation reference dbs" + "\t" + str(parameters['annotation']['dbs']) + "\n")
    paramFile.write("  min BSR" + "\t" + str(parameters['annotation']['min_bsr']) + "\n")
    paramFile.write("  max evalue" + "\t" + str(parameters['annotation']['max_evalue']) + "\n")
    paramFile.write("\n\nPathway Tools parameters\n")
    paramFile.write("  taxonomic pruning " + "\t" + str(parameters['ptools_settings']['taxonomic_pruning']) + "\n")
    paramFile.write("\n\nrRNA search/match parameters\n")
    paramFile.write("  min identity" + "\t" + str(parameters['rRNA']['min_identity']) + "\n")
    paramFile.write("  max evalue" + "\t" + str(parameters['rRNA']['max_evalue']) + "\n")
    paramFile.write("  rRNA reference dbs" + "\t" + str(parameters['rRNA']['refdbs']) + "\n")
    paramFile.close()
def process_rRNA_16S_stats(rRNA_16S_file, rRNA_16S_dictionary):
    try:
        taxonomy_file = open(rRNA_16S_file, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", rRNA_16S_file)
        exit_process()
    tax_lines = taxonomy_file.readlines()
    similarity_pattern = re.compile("similarity")
    evalue_pattern = re.compile("evalue")
    bitscore_pattern = re.compile("bitscore")
    taxonomy_pattern = re.compile("taxonomy")
    headerScanned = False
    for line in tax_lines:
        if headerScanned == False:
            # skip everything up to and including the header line
            if similarity_pattern.search(line) and evalue_pattern.search(line)\
               and bitscore_pattern.search(line) and taxonomy_pattern.search(line):
                headerScanned = True
            continue
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) >= 6:
            if fields[1] != '-':
                rRNA_16S_dictionary[fields[0]] = [fields[1], fields[2], fields[5]]
            elif len(fields) >= 12 and fields[7] != '-':
                rRNA_16S_dictionary[fields[0]] = [fields[7], fields[8], fields[11]]
    taxonomy_file.close()
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, orf_dictionary, contig, candidate_orf_pos, orfid):
    try:
        fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
        output_line = orf_dictionary[contig][candidate_orf_pos]['seqname']
        for field in fields:
            output_line += "\t" + str(orf_dictionary[contig][candidate_orf_pos][field])
        # build the GFF attributes column
        attributes = "ID=" + orf_dictionary[contig][candidate_orf_pos]['id']
        attributes += ";" + "locus_tag=" + orf_dictionary[contig][candidate_orf_pos]['locus_tag']
        attributes += ";" + "contig_length=" + orf_dictionary[contig][candidate_orf_pos]['contig_length']
        attributes += ";" + "orf_length=" + orf_dictionary[contig][candidate_orf_pos]['orf_length']
        attributes += ";" + "partial=" + orf_dictionary[contig][candidate_orf_pos]['partial']
        attributes += ";" + "sourcedb=" + candidatedbname
        if candidatedbname in results_dictionary:
            attributes += ";" + "annotvalue=" + str(results_dictionary[candidatedbname][orfid]['value'])
            attributes += ";" + "ec=" + str(results_dictionary[candidatedbname][orfid]['ec'])
            attributes += ";" + "product=" + results_dictionary[candidatedbname][orfid]['product']
        else:
            attributes += ";" + "annotvalue=" + str('0')
            attributes += ";" + "ec=" + str('')
            attributes += ";" + "product=" + 'hypothetical protein'
        output_line += '\t' + attributes
        fprintf(outputgff_file, "%s\n", output_line)
    except:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        traceback.print_exc(10)
        exit_process()
def process_tRNA_stats(tRNA_stats_file, tRNA_dictionary):
    try:
        tRNA_file = open(tRNA_stats_file, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", tRNA_stats_file)
        exit_process()
    tRNA_lines = tRNA_file.readlines()
    sequence_name_pattern = re.compile("sequence name", re.I)
    number_pattern = re.compile("number", re.I)
    headerScanned = False
    for line in tRNA_lines:
        if number_pattern.search(line):
            continue
        if headerScanned == False:
            if sequence_name_pattern.search(line):
                headerScanned = True
            continue
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) >= 6:
            name = get_sequence_number(fields[0])
            tRNA_dictionary[name] = [fields[3], fields[4], fields[5], fields[1]]
def checkMissingParam_values(params, choices, logger=None):
    reqdCategoryParams = {
        'annotation': {'dbs': False},
        'orf_prediction': {},
        'rRNA': {},
        'metapaths_steps': {}
    }
    success = True
    for category in choices:
        for parameter in choices[category]:
            if (not params[category][parameter]) and\
               ((category in reqdCategoryParams) and\
               (parameter in reqdCategoryParams[category]) and reqdCategoryParams[category][parameter]):
                print(category, parameter)
                print(reqdCategoryParams)
                print(reqdCategoryParams[category])
                eprintf('ERROR: Empty parameter %s of type %s\n' % (parameter, category))
                eprintf('Please select at least one database for %s\n' % (category))
                if logger != None:
                    logger.write('ERROR\tEmpty parameter %s of type %s\n' % (parameter, category))
                    logger.write('Please select at least one database for %s\n' % (category))
                success = False
    return success
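# A minimal usage sketch for checkMissingParam_values; the parameter values
# below are illustrative, not a definitive schema. Note that a parameter is
# only flagged when its entry in reqdCategoryParams is truthy: with 'dbs'
# mapped to False above, an empty 'annotation'/'dbs' value is tolerated and
# the check returns True; flipping that flag to True would make it fail.
#
# params  = {'annotation': {'dbs': ''}}       # an empty parameter value
# choices = {'annotation': {'dbs': ['db1']}}
# checkMissingParam_values(params, choices)   # -> True (check disabled for 'dbs')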
def check_arguments(opts, args):
    if len(opts.input_blastout) == 0:
        eprintf("There should be at least one blastoutput file\n")
        return False
    if len(opts.database_name) == 0:
        eprintf("There should be at least one database name\n")
        return False
    if len(opts.weight_db) == 0:
        eprintf("There should be at least one weight\n")
        return False
    if len(opts.input_blastout) != len(opts.database_name) or\
       len(opts.input_blastout) != len(opts.weight_db):
        eprintf("The number of database names, blastoutputs and weights should be equal\n")
        return False
    if opts.output_gff == None:
        eprintf("Must specify the output gff file\n")
        return False
    if opts.output_comparative_annotation == None:
        eprintf("Must specify the output tables for comparative annotation\n")
        return False
    if opts.input_gff == None:
        eprintf("Must specify the input gff file\n")
        return False
    return True
def process_tRNA_stats(tRNA_stats_file, tRNA_dictionary, shortenorfid=False):
    counter_tRNA = {}
    try:
        tRNA_file = open(tRNA_stats_file, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", tRNA_stats_file)
        exit_process()
    tRNA_lines = tRNA_file.readlines()
    sequence_name_pattern = re.compile("sequence name", re.I)
    number_pattern = re.compile("number", re.I)
    headerScanned = False
    for line in tRNA_lines:
        if number_pattern.search(line):
            continue
        if headerScanned == False:
            if sequence_name_pattern.search(line):
                headerScanned = True
            continue
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) >= 6:
            if shortenorfid:
                name = get_sequence_number(fields[0])
            else:
                name = fields[0]
            # disambiguate multiple tRNAs on the same sequence with a counter suffix
            if not name in counter_tRNA:
                counter_tRNA[name] = 0
            _name = name + "_" + str(counter_tRNA[name])
            counter_tRNA[name] = counter_tRNA[name] + 1
            tRNA_dictionary[_name] = [fields[3], fields[4], fields[5], fields[1]]
def create_query_dictionary(blastoutputfile, query_dictionary, algorithm, errorlogger=None):
    seq_beg_pattern = re.compile("^#")
    try:
        blastoutfh = open(blastoutputfile, 'r')
    except IOError:
        print("ERROR : cannot open B/LAST output file " + blastoutputfile + " to parse ")
        return
    try:
        for line in blastoutfh:
            if not seq_beg_pattern.search(line):
                words = line.rstrip().split('\t')
                if len(words) != 12:
                    continue
                # the target id sits in column 1 for both BLAST and LAST tabular output
                if algorithm == 'BLAST':
                    query_dictionary[words[1]] = 1
                if algorithm == 'LAST':
                    query_dictionary[words[1]] = 1
        blastoutfh.close()
    except:
        eprintf("\nERROR : while reading B/LAST output file " + blastoutputfile + " to parse " +
                " : make sure B/LASTing was done for the particular database")
        if errorlogger:
            errorlogger.write("\nERROR : while reading B/LAST output file %s to parse\n" % (blastoutputfile))
            errorlogger.write(" : make sure B/LASTing was done for the particular database\n")
def add_tRNA_genes(tRNA_dictionary, tRNA_gff_dictionary, contig_lengths):
    for tRNA in tRNA_dictionary:
        start = int(tRNA_dictionary[tRNA][0])
        end = int(tRNA_dictionary[tRNA][1])
        if start > end:
            start, end = end, start
        try:
            orf_length = end - start
        except:
            orf_length = 0
        contig_name = re.sub(r'_\d+$', '', tRNA)
        if contig_name in contig_lengths:
            contig_length = contig_lengths[contig_name]
        else:
            contig_length = 0
        if start > end or contig_length < end:
            # clip tRNAs that run past the end of the contig, tracing before and after
            eprintf("trna {} {} {} {} {}\n".format(tRNA, start, end, end - start, contig_length))
            end = contig_length
            eprintf("trna {} {} {} {} {}\n".format(tRNA, start, end, end - start, contig_length))
        gff_entry = {'id': ContigID(tRNA), 'seqname': tRNA, 'start': start, 'end': end,
                     'strand': tRNA_dictionary[tRNA][2], 'score': " ", 'orf_length': str(orf_length),
                     'contig_length': str(contig_length),
                     'feature': 'tRNA', 'source': 'trnaScan-1.4', 'frame': 0,
                     'product': 'tRNA-' + tRNA_dictionary[tRNA][3], 'ec': ''}
        tRNA_gff_dictionary[tRNA] = gff_entry.copy()
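# Usage sketch for add_tRNA_genes. The values are assumed to be
# [start, end, strand, amino_acid] strings keyed by '<contig>_<n>', matching
# what process_tRNA_stats above stores; ContigID is this module's id helper.
#
# trna_dict = {'contig1_0': ['10', '85', '+', 'Ala']}
# trna_gff = {}
# add_tRNA_genes(trna_dict, trna_gff, {'contig1': 1200})
# trna_gff['contig1_0']['product']   # -> 'tRNA-Ala'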
def runMicrobeCensus(microbeCensusExec, microbeCensusOutput, sample_name, readFiles, rpkmFolder):
    num_threads = int(multiprocessing.cpu_count() * 0.8)
    if num_threads < 1:
        num_threads = 1
    status = True
    readfiles = [','.join(read) for read in readFiles]
    if len(readFiles) == 2:
        command_frags = [microbeCensusExec, ','.join(readfiles), microbeCensusOutput + ".tmp"]
        result = getstatusoutput(' '.join(command_frags))
        print(' '.join(command_frags))
        if result[0] == 0:
            # write to a .tmp file first, then rename, so a failed run leaves no partial output
            rename(microbeCensusOutput + ".tmp", microbeCensusOutput)
        else:
            eprintf("ERROR:\tError while running MicrobeCensus on read files %s\n", readFiles)
            status = False
    else:
        eprintf("ERROR:\tMicrobeCensus expects exactly two read-file groups. Found %d:%s\n",
                len(readFiles), ','.join(readfiles))
        status = False
    return status
def read_map_file(dbname_map_filename, field_to_description, hierarchical_map):
    try:
        map_file = open(dbname_map_filename, 'r')
        map_filelines = map_file.readlines()
    except:
        eprintf("ERROR: Cannot open file %s\n", dbname_map_filename)
        exit_process()
    tempfields = ['', '', '', '', '', '', '']
    for line in map_filelines:
        pos = beginning_valid_field(line)
        if pos == -1:
            continue
        fields = [x.strip() for x in line.split('\t')]
        tempfields[pos] = fields[pos]
        if len(fields) > pos + 1:
            field_to_description[fields[pos]] = fields[pos + 1]
        else:
            field_to_description[fields[pos]] = fields[pos]
        # walk down the hierarchy along the most recent field seen at each level
        i = 0
        temp_hierarchical_map = hierarchical_map
        while i < pos:
            temp_hierarchical_map = temp_hierarchical_map[tempfields[i]]
            i += 1
        temp_hierarchical_map[tempfields[i]] = {}
    fill_hierarchy_with_zeroes(hierarchical_map)
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, orf_dictionary, contig, candidate_orf_pos, orfid, compact_output):
    global errorcode
    try:
        fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
        output_line = orf_dictionary[contig][candidate_orf_pos]['seqname']
        for field in fields:
            output_line += "\t" + str(orf_dictionary[contig][candidate_orf_pos][field])
        # prefer the shortened ORF ids; fall back to the full ids if shortening fails
        try:
            attributes = "ID=" + ShortenORFId(orf_dictionary[contig][candidate_orf_pos]['id'])
            attributes += ";" + "locus_tag=" + ShortenORFId(orf_dictionary[contig][candidate_orf_pos]['locus_tag'])
        except:
            attributes = "ID=" + orf_dictionary[contig][candidate_orf_pos]['id']
            attributes += ";" + "locus_tag=" + orf_dictionary[contig][candidate_orf_pos]['locus_tag']
        attributes += ";" + "contig_length=" + orf_dictionary[contig][candidate_orf_pos]['contig_length']
        attributes += ";" + "orf_length=" + orf_dictionary[contig][candidate_orf_pos]['orf_length']
        attributes += ";" + "partial=" + orf_dictionary[contig][candidate_orf_pos]['partial']
        attributes += ";" + "sourcedb=" + candidatedbname
        if candidatedbname in results_dictionary:
            attributes += ";" + "annotvalue=" + str(results_dictionary[candidatedbname][orfid]['value'])
            attributes += ";" + "ec=" + str(results_dictionary[candidatedbname][orfid]['ec'])
            attributes += ";" + "product=" + results_dictionary[candidatedbname][orfid]['product']
        else:
            attributes += ";" + "annotvalue=" + str('0')
            attributes += ";" + "ec=" + str('')
            attributes += ";" + "product=" + 'hypothetical protein'
        output_line += '\t' + attributes
        if candidatedbname in results_dictionary:
            fprintf(outputgff_file, "%s\n", output_line)
    except:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        traceback.print_exc(10)
        insert_error(errorcode)
        exit_process()
def next(self):
    if self.i % self.SIZE == 0:
        self.refillBuffer()
    if len(self.lines) == 0:
        raise StopIteration()
    if self.i % self.SIZE < self.size:
        fields = [x.strip() for x in self.lines[self.i % self.SIZE].split('\t')]
        try:
            self.data = {}
            self.data['query'] = fields[self.fieldmap['query']]
            self.data['q_length'] = int(fields[self.fieldmap['q_length']])
            self.data['bitscore'] = float(fields[self.fieldmap['bitscore']])
            self.data['bsr'] = float(fields[self.fieldmap['bsr']])
            self.data['target'] = fields[self.fieldmap['target']]
            self.data['aln_length'] = float(fields[self.fieldmap['aln_length']])
            self.data['expect'] = float(fields[self.fieldmap['expect']])
            self.data['identity'] = float(fields[self.fieldmap['identity']])
            self.data['ec'] = fields[self.fieldmap['ec']]
            self.data['product'] = re.sub(r'=', ' ', fields[self.fieldmap['product']])
            self.lineToProcess = self.lines[self.i % self.SIZE]
        except:
            # tolerate a bounded number of ill-formatted lines, then give up
            self.ERROR_COUNT += 1
            if self.MAX_READ_ERRORS_ALLOWED > self.ERROR_COUNT:
                eprintf("%s\tWARNING\till-formatted line \"%s\" \t %s\n",
                        self.STEP_NAME, self.lines[self.i % self.SIZE], self.blastoutput)
                if self.error_and_warning_logger != None:
                    self.error_and_warning_logger.write(
                        "%s\tWARNING\till-formatted line :\"%s\" \t source : %s\n"
                        % (self.STEP_NAME, re.sub(r'\t', '<tab>', self.lines[self.i % self.SIZE]), self.blastoutput))
                self.i = self.i + 1
                return self.next()
            else:
                if self.error_and_warning_logger != None:
                    self.error_and_warning_logger.write(
                        "%s\tERROR\tThe number of errors in file %s exceeded the max tolerance %d\n"
                        % (self.STEP_NAME, self.blastoutput, self.MAX_READ_ERRORS_ALLOWED))
                exit_process()
        self.i = self.i + 1
        return self.data
    else:
        self.lineToProcess = None
        self.blastoutputfile.close()
        raise StopIteration()
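# Usage sketch: this next() belongs to a parser class that buffers B/LAST
# output lines SIZE at a time and yields one parsed hit per call. The class
# name below is hypothetical; any iterator exposing this next() works the same.
#
# parser = BlastOutputTsvParser(dbname, blastoutput)   # hypothetical ctor
# for hit in parser:
#     print(hit['query'], hit['target'], hit['bsr'])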
def process_rRNA_16S_stats(dbname, rRNA_16S_file, orf_read_rpkgs, opts, shortenorfid=False):
    print("Processing rRNA database : " + dbname)
    counter_rRNA = {}
    if not doesFileExist(rRNA_16S_file):
        return
    try:
        taxonomy_file = open(rRNA_16S_file, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", rRNA_16S_file)
        exit_process()
    tax_lines = taxonomy_file.readlines()
    similarity_pattern = re.compile("similarity")
    evalue_pattern = re.compile("evalue")
    bitscore_pattern = re.compile("bitscore")
    taxonomy_pattern = re.compile("taxonomy")
    headerScanned = False
    seencounter = {}
    for line in tax_lines:
        if headerScanned == False:
            if similarity_pattern.search(line) and evalue_pattern.search(line)\
               and bitscore_pattern.search(line) and taxonomy_pattern.search(line):
                headerScanned = True
            continue
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) >= 7:  # fields[6] is read below
            # number repeated query names so each rRNA hit gets a unique id
            if not fields[0] in seencounter:
                seencounter[fields[0]] = 0
            else:
                seencounter[fields[0]] += 1
            _name = fields[0] + "_" + str(seencounter[fields[0]]) + "_rRNA"
            if not fields[6] in counter_rRNA:
                counter_rRNA[fields[6]] = 0.0
            name = ShortenrRNAId(_name)
            if name in orf_read_rpkgs:
                counter_rRNA[fields[6]] += orf_read_rpkgs[name]
            else:
                counter_rRNA[fields[6]] += 0
    taxonomy_file.close()
    with open(opts.outputdir + PATHDELIM + opts.sample_name + "." + dbname + ".read_rpkgs.txt", 'w') as fout:
        fprintf(fout, "# Gene\tCounts\n")
        for name in counter_rRNA:
            fprintf(fout, "%s\t%0.2f\n", name, counter_rRNA[name])
    return len(counter_rRNA)
def environment_variables_defined():
    variables = ['METAPATHWAYS_DB']
    status = True
    for variable in variables:
        if not variable in os.environ:
            eprintf("%-10s:Environment variable %s not defined! Please set %s as \'export %s=<value>\'\n"
                    % ('ERROR', variable, variable, variable))
            if variable in ['METAPATHWAYS_DB']:
                status = False
    return status
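# Usage sketch: guard pipeline startup on the required environment variables
# (exit_process is the error-exit helper used throughout this module).
#
# if not environment_variables_defined():
#     exit_process("Please set METAPATHWAYS_DB and try again")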
def get_pipeline_steps(steps_log_file):
    pipeline_steps = None
    try:
        logfile = open(steps_log_file, 'r')
    except IOError:
        eprintf("Did not find %s!\n", steps_log_file)
        eprintf("Try running in \'complete\' run-type\n")
    else:
        lines = logfile.readlines()
        pipeline_steps = None  # the step lines are read but not parsed here
    return pipeline_steps
def report_missing_filenames(input_output_list, sample_subset, logger=None):
    foundFiles = {}
    for samplePath in input_output_list.keys():
        sampleName = path.basename(input_output_list[samplePath])
        foundFiles[sampleName] = True
    for sample_in_subset in sample_subset:
        if not sample_in_subset in foundFiles:
            eprintf("ERROR\tCannot find input file for sample %s!\n", sample_in_subset)
            if logger:
                logger.printf("ERROR\tCannot find input file for sample %s!\n", sample_in_subset)
def check_arguments(opts, args):
    if opts.blastdir == None:
        eprintf("The blast_results folder must be specified\n")
        return False
    if opts.sample_name == None:
        eprintf("There should be at least one sample name\n")
        return False
    return True
def __init__(self, dbname, blastoutput, database_mapfile, refscore_file, opts, errorlogger=None):
    self.Size = 10000
    self.dbname = dbname
    self.ln2 = 0.69314718055994530941
    self.lnk = math.log(opts.k)
    self.Lambda = opts.Lambda
    self.blastoutput = blastoutput
    self.database_mapfile = database_mapfile
    self.refscore_file = refscore_file
    self.annot_map = {}
    self.i = 0
    self.opts = opts
    self.hits_counts = {}
    self.data = {}
    self.refscores = {}
    self.refBitScores = {}
    self.needToPermute = False
    self.MAX_READ_ERRORS_ALLOWED = 100
    self.ERROR_COUNT = 0
    self.STEP_NAME = 'PARSE_BLAST'
    self.error_and_warning_logger = errorlogger
    # collect the set of queries present in the B/LAST output so only the
    # needed entries of the database map file are loaded
    query_dictionary = {}
    create_query_dictionary(self.blastoutput, query_dictionary, self.opts.algorithm, errorlogger=errorlogger)
    try:
        self.blastoutputfile = open(self.blastoutput, 'r')
    except IOError:
        eprintf("\nERROR : cannot open B/LAST output file " + blastoutput + " to parse " +
                " : make sure \"B/LAST\"ing was done for the particular database")
        if self.error_and_warning_logger:
            self.error_and_warning_logger.write(
                "ERROR : cannot open B/LAST output file %s to parse : make sure \"B/LAST\"ing was done for the particular database\n" % (blastoutput))
        exit_process("Cannot open B/LAST output file " + blastoutput)
    try:
        self.create_refBitScores()
    except:
        traceback.print_exc(10)
        exit_process("Error while reading from B/LAST refscore file " + self.refscore_file)
    try:
        create_dictionary(database_mapfile, self.annot_map, query_dictionary)
        query_dictionary = {}
    except AttributeError:
        eprintf("Cannot read the map file for database : %s\n" % (dbname))
        if errorlogger != None:
            errorlogger.write("PARSE_BLAST\tERROR\tCannot read the map file %s for database : %s\tDelete the formatted files for the database in the \"formatted\" folder\n" % (database_mapfile, dbname))
        exit_process("Cannot read the map file for database " + dbname)
def checkParam_values(allcategorychoices, parameters, runlogger=None):
    for category in allcategorychoices:
        for choice in allcategorychoices[category]:
            if choice in parameters:
                if not parameters[choice] in allcategorychoices[category][choice]:
                    if runlogger != None:
                        runlogger.write('ERROR\tIncorrect setting in your parameter file')
                        runlogger.write('for step %s as %s' % (choice, parameters[choice]))
                    eprintf("ERROR: Incorrect setting in your parameter file" +
                            " for step %s as %s", choice, parameters[choice])
                    exit_process()
def __init__(self, gff_filename):
    self.Size = 10000
    self.i = 0
    self.orf_dictionary = {}
    self.gff_beg_pattern = re.compile("^#")
    self.lines = []
    self.size = 0
    try:
        self.gff_file = open(gff_filename, 'r')
    except IOError:
        eprintf("Cannot read the GFF file : %s\n", gff_filename)
        exit_process()
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser
    options, args = parser.parse_args(argv)
    if options.algorithm == 'BLAST':
        _execute_BLAST(options)
    elif options.algorithm == 'LAST':
        _execute_LAST(options)
    else:
        eprintf("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
        if errorlogger:
            errorlogger.printf("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
        exit_process("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
def halt_on_invalid_input(input_output_list, filetypes, sample_subset):
    for samplePath in input_output_list.keys():
        sampleName = path.basename(input_output_list[samplePath])
        # consider only the samples in the selected subset
        if not sampleName in sample_subset:
            continue
        if filetypes[samplePath][0] == 'UNKNOWN':
            eprintf("ERROR\tIncorrect input sample %s. Check for bad characters or format!\n", samplePath)
            return False
    return True
def get_ORF_annotations_hits(sample_name, folder_path):
    results = []
    input_dir = folder_path + PATHDELIM + 'results' + PATHDELIM + 'annotation_table'
    file_name = input_dir + PATHDELIM + 'ORF_annotation_table.txt'
    eprintf("\nCounting number of ORFs for mapping to functional classification ...")
    count = get_number_of_uncommented_lines(file_name)
    eprintf("done\n")
    results.append(('Total orfs count for functional classification', count))
    if not results:
        return None
    return results
def check_for_error_in_input_file_name(shortname, globalerrorlogger=None):
    """ checks the sample name for characters the pipeline cannot handle """
    clean = True
    if not re.search(r'^[a-zA-Z]', shortname):
        eprintf("ERROR\tSample name %s must begin with a letter!\n", shortname)
        if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s must begin with a letter!\n", shortname)
        clean = False
    if re.search(r'[.]', shortname):
        eprintf("ERROR\tSample name %s contains a '.' in its name!\n", shortname)
        if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s contains a '.' in its name!\n", shortname)
        clean = False
    if len(shortname) < 2:
        eprintf("ERROR\tSample name %s is too short!\n", shortname)
        if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s is too short!\n", shortname)
        clean = False
    if clean:
        return clean
    errmessage = """Sample names, before the suffixes .fasta, .fas, .fna, .faa or .gbk, must consist only of letters, digits and _, and must be at least two characters long"""
    eprintf("ERROR\t%s\n", errmessage)
    if globalerrorlogger:
        globalerrorlogger.printf("ERROR\t%s\n", errmessage)
    exit_process("ERROR\t" + errmessage + "Exiting!" + "\n")
    return False
def remove_unspecified_samples(input_output_list, sample_subset, globalerrorlogger=None):
    """ keep only the samples that are specified before processing """
    # take a snapshot of the keys so entries can be deleted while looping
    input_sample_list = list(input_output_list.keys())
    for sample_name in input_sample_list:
        short_sample_name = derive_sample_name(sample_name)
        if len(short_sample_name) > 35:
            eprintf("ERROR\tSample name %s must not be longer than 35 characters!\n", short_sample_name)
            if globalerrorlogger:
                globalerrorlogger.printf("ERROR\tSample name %s must not be longer than 35 characters!\n", short_sample_name)
        if not derive_sample_name(sample_name) in sample_subset and sample_subset:
            del input_output_list[sample_name]
def process_rRNA_16S_stats(rRNA_16S_file, rRNA_16S_dictionary, shortenorfid=False):
    counter_rRNA = {}
    if not doesFileExist(rRNA_16S_file):
        return
    try:
        taxonomy_file = open(rRNA_16S_file, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", rRNA_16S_file)
        exit_process()
    tax_lines = taxonomy_file.readlines()
    similarity_pattern = re.compile("similarity")
    evalue_pattern = re.compile("evalue")
    bitscore_pattern = re.compile("bitscore")
    taxonomy_pattern = re.compile("taxonomy")
    headerScanned = False
    for line in tax_lines:
        if headerScanned == False:
            if similarity_pattern.search(line) and evalue_pattern.search(line)\
               and bitscore_pattern.search(line) and taxonomy_pattern.search(line):
                headerScanned = True
            continue
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) >= 6:
            if shortenorfid:
                name = get_sequence_number(fields[0])
            else:
                name = fields[0]
            if not name in counter_rRNA:
                counter_rRNA[name] = 0
            _name = name + "_" + str(counter_rRNA[name])
            counter_rRNA[name] = counter_rRNA[name] + 1
            if fields[1] != '-':
                rRNA_16S_dictionary[_name] = [fields[1], fields[2], fields[5]]
            elif len(fields) >= 12 and fields[7] != '-':
                rRNA_16S_dictionary[_name] = [fields[7], fields[8], fields[11]]
    taxonomy_file.close()
def get_functional_taxonomic_hits(sample_name, folder_path):
    results = []
    input_dir = folder_path + PATHDELIM + 'results' + PATHDELIM + 'annotation_table'
    file_name = input_dir + PATHDELIM + 'functional_and_taxonomic_table.txt'
    eprintf("\nCounting number of functionally and taxonomically annotated ORFs ...")
    count = get_number_of_uncommented_lines(file_name)
    eprintf("done\n")
    results.append(('Total number of functionally and taxonomically annotated ORFs', count))
    if not results:
        return None
    return results
def get_BLAST_LAST_parsed_hits(sample_name, folder_path):
    results = []
    input_dir = folder_path + PATHDELIM + 'blast_results'
    # first the LAST algorithm
    regPattern = re.compile(r'.LASTout.parsed.txt$', re.IGNORECASE)
    files = [re.sub(r'.*\/', '', f) for f in glob(input_dir + PATHDELIM + sample_name + '*') if regPattern.search(f)]
    regPattern = re.compile(r'[.](.*)[.]LASTout.parsed.txt$', re.IGNORECASE)
    for file in files:
        result = regPattern.search(file)
        if result:
            database = result.group(1)
            file_name = input_dir + PATHDELIM + sample_name + '.' + result.group(1) + '.LASTout.parsed.txt'
            eprintf("\nParse LAST hits for : %s...", database)
            count = get_number_of_uncommented_lines(file_name)
            results.append(('Total number of selected hits in ' + database + ' with LAST ', count))
    # now for the BLAST algorithm
    regPattern = re.compile(r'.BLASTout.parsed.txt')
    files = [re.sub(r'.*\/', '', f) for f in glob(input_dir + PATHDELIM + sample_name + '*') if regPattern.search(f)]
    regPattern = re.compile(r'[.](.*)[.]BLASTout')
    for file in files:
        result = regPattern.search(file)
        if result:
            database = result.group(1)
            file_name = input_dir + PATHDELIM + sample_name + '.' + result.group(1) + '.BLASTout.parsed.txt'
            eprintf("\nParse BLAST hits for : %s...", database)
            count = get_number_of_uncommented_lines(file_name)
            results.append(('Total number of selected hits in ' + database + ' with BLAST ', count))
    if not results:
        return None
    return results
def formatted_db_exists(dbname, suffixes):
    for suffix in suffixes:
        allfileList = glob(dbname + '*.' + suffix)
        fileList = []
        # escape the db name so path characters are matched literally
        tempFilePattern = re.compile(re.escape(dbname) + r'\d*\.' + suffix + '$')
        for aFile in allfileList:
            searchResult = tempFilePattern.search(aFile)
            if searchResult:
                fileList.append(aFile)
        if len(fileList) == 0:
            eprintf("ERROR : if formatted correctly then expected files matching the pattern %s\n", dbname + '*.' + suffix)
            return False
    return True
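# Usage sketch: check for a formatted protein database before searching; the
# suffix list below is illustrative of formatdb/makeblastdb protein output.
#
# if not formatted_db_exists('refseq_protein', ['phr', 'pin', 'psq']):
#     eprintf("ERROR\tFormat the database refseq_protein before searching\n")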
def permuteForLAST(self, words):
    # rearrange LAST's tabular columns into the BLAST tabular layout
    try:
        temp = copy(words)
        words[0] = temp[6]                          # query
        words[1] = temp[1]                          # target
        words[2] = 100.0                            # percent id
        words[3] = temp[3]                          # aln length
        words[6] = temp[2]                          # start
        words[7] = int(temp[2]) + int(temp[3]) - 1  # end = start + length - 1
        words[10] = 0.0                             # evalue
        words[11] = temp[0]                         # score
    except:
        eprintf("ERROR : Invalid B/LAST output file %s \n" % (self.blastoutput))
        if self.error_and_warning_logger:
            self.error_and_warning_logger.write("ERROR : Invalid B/LAST output file %s" % (self.blastoutput))
        exit_process("ERROR : Invalid B/LAST output file %s " % (self.blastoutput))
def add_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open(blast_table_out, 'r')
    refscores = {}
    lines = infile.readlines()
    for line in lines:
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) != 12:
            eprintf("ERROR: Error in line \n%s\n of the blastout file %s" % (line, blast_table_out))
            exit_process("ERROR: Error in line \n%s\n of the blastout file %s" % (line, blast_table_out))
        # presumed intent: record each query's self-hit bitscore (column 11)
        # as its reference score
        refscores[fields[0]] = fields[11]
    for key, value in refscores.items():
        allNames[key] = True
        fprintf(refscore_file, "%s\t%s\n", key, value)
    infile.close()
def process_gff_file(gff_file_name, orf_dictionary):
    try:
        gfffile = open(gff_file_name, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", gff_file_name)
        return
    gff_lines = gfffile.readlines()
    gff_beg_pattern = re.compile("^#")
    gfffile.close()
    count = 0
    for line in gff_lines:
        line = line.strip()
        # skip comment/header lines
        if gff_beg_pattern.search(line):
            continue
        insert_orf_into_dict(line, orf_dictionary)
        count += 1
def get_annotation_hits(sample_name, folder_path):
    results = []
    input_dir = folder_path + PATHDELIM + 'genbank'
    regPattern = re.compile(r'.annot.gff$', re.IGNORECASE)
    files = [re.sub(r'.*\/', '', f) for f in glob(input_dir + PATHDELIM + sample_name + '*') if regPattern.search(f)]
    regPattern = re.compile(r'(.*)[.]annot.gff$', re.IGNORECASE)
    for file in files:
        result = regPattern.search(file)
        if result:
            file_name = input_dir + PATHDELIM + sample_name + '.annot.gff'
            eprintf("\nCounting number of annotations...")
            count = get_number_of_uncommented_lines(file_name)
            eprintf("done\n")
            results.append(('Total number of valid annotations', count))
    if not results:
        return None
    return results