def read_map_file(dbname_map_filename, field_to_description, hierarchical_map):
    try:
        map_file = open(dbname_map_filename, 'r')
        map_filelines = map_file.readlines()
        map_file.close()
    except IOError:
        eprintf("ERROR: Cannot open file %s\n", dbname_map_filename)
        exit_process()

    tempfields = ['', '', '', '', '', '', '']
    for line in map_filelines:
        pos = beginning_valid_field(line)
        if pos == -1:
            continue

        fields = [x.strip() for x in line.split('\t')]
        tempfields[pos] = fields[pos]
        if len(fields) > pos + 1:
            field_to_description[fields[pos]] = fields[pos + 1]
        else:
            field_to_description[fields[pos]] = fields[pos]

        # walk down the hierarchy to the parent of the new field
        i = 0
        temp_hierarchical_map = hierarchical_map
        while i < pos:
            temp_hierarchical_map = temp_hierarchical_map[tempfields[i]]
            i += 1
        temp_hierarchical_map[tempfields[i]] = {}

    fill_hierarchy_with_zeroes(hierarchical_map)
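# --- illustrative sketch (the file name and contents are hypothetical) ---
# read_map_file expects a tab-indented map file in which the indentation depth
# encodes the hierarchy level, e.g.:
#
#     Metabolism
#     <tab>Energy metabolism<tab>Pathways that generate energy
#     <tab><tab>Glycolysis
#
# field_to_description = {}
# hierarchical_map = {}
# read_map_file('COG-categories-names.txt', field_to_description, hierarchical_map)
#
# Afterwards hierarchical_map is a nested dict, roughly
#     {'Metabolism': {'Energy metabolism': {'Glycolysis': {}}}}
# with the empty leaves presumably turned into counters by
# fill_hierarchy_with_zeroes().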
def check_for_error_in_input_file_name(shortname, globalerrorlogger=None):
    """Validates a sample (short) name and aborts the run if it is malformed."""
    clean = True
    if not re.search(r'^[a-zA-Z]', shortname):
        eprintf("ERROR\tSample name %s must begin with a letter!\n", shortname)
        if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s must begin with a letter!\n", shortname)
        clean = False

    if re.search(r'[.]', shortname):
        eprintf("ERROR\tSample name %s contains a '.' in its name!\n", shortname)
        if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s contains a '.' in its name!\n", shortname)
        clean = False

    if len(shortname) < 2:
        eprintf("ERROR\tSample name %s is too short!\n", shortname)
        if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s is too short!\n", shortname)
        clean = False

    if clean:
        return clean

    errmessage = """Sample names, before the suffixes .fasta, .fas, .fna, .faa or .gbk, must consist only of letters, digits and _, and must be at least two characters long"""
    eprintf("ERROR\t%s\n", errmessage)
    if globalerrorlogger:
        globalerrorlogger.printf("ERROR\t%s\n", errmessage)
    exit_process("ERROR\t" + errmessage + " Exiting!" + "\n")
    return False
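# --- standalone sketch of the same validation rules; sample names are made up ---
import re
for shortname in ['mysample_01', '1sample', 'a.b', 'x']:
    ok = bool(re.search(r'^[a-zA-Z]', shortname)) and \
         not re.search(r'[.]', shortname) and len(shortname) >= 2
    print shortname, '->', 'valid' if ok else 'invalid'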
def process_blastout_file(blast_file, database, table, errorlogger=None):
    try:
        blastfile = open(blast_file, 'r')
    except IOError:
        eprintf("ERROR : Cannot read file " + blast_file + " !")
        if errorlogger != None:
            errorlogger.write("STATS_rRNA\tERROR\tCannot read blast output file " + blast_file + " for database " + database)
        exit_process()

    blastLines = blastfile.readlines()
    blastfile.close()

    for line in blastLines:
        line = line.strip()
        fields = re.split('\t', line)
        if len(fields) < 12:
            continue
        fields[0] = str(fields[0].strip())      # query id
        fields[1] = str(fields[1].strip())      # target id
        fields[2] = float(fields[2].strip())    # percent identity
        fields[6] = int(fields[6].strip())      # query start
        fields[7] = int(fields[7].strip())      # query end
        fields[10] = float(fields[10].strip())  # evalue
        fields[11] = float(fields[11].strip())  # bitscore

        table[str(fields[0].strip())] = [fields[2], fields[10], fields[11],
                                         fields[1], fields[6], fields[7]]
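# --- illustrative sketch: the input is presumably standard 12-column tabular
# (B)LAST output; the line below is made up ---
example_line = 'read_1\thit_42\t98.5\t250\t3\t0\t1\t250\t10\t260\t1e-50\t450.0'
fields = example_line.split('\t')
table = {}
# [percent identity, evalue, bitscore, target, query start, query end]
table[fields[0]] = [float(fields[2]), float(fields[10]), float(fields[11]),
                    fields[1], int(fields[6]), int(fields[7])]
print table['read_1']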
def make_sure_map_file_exists(config_settings, dbname, globallogger=None):
    dbmapFile = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + 'formatted' + PATHDELIM + dbname + "-names.txt"
    seqFilePath = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + dbname

    if not doFilesExist([dbmapFile]):
        eprintf("WARNING: Trying to create database map file for %s\n", dbname)
        if globallogger != None:
            globallogger.write("WARNING: Trying to create database map file for %s\n" % (dbname))

        if not doFilesExist([seqFilePath]):
            eprintf("ERROR : You do not even have the raw sequence for database %s to format!\n", dbname)
            eprintf("      : Make sure you have the file %s\n", seqFilePath)
            if globallogger != None:
                globallogger.write("ERROR\tYou do not even have the raw sequence for database %s to format!\n" % (dbname))
                globallogger.write("Make sure you have the file %s\n" % (seqFilePath))
            exit_process()

        # the map file holds just the FASTA header lines of the reference db
        mapfile = open(dbmapFile, 'w')
        seqFile = open(seqFilePath, 'r')
        for line in seqFile:
            if re.match(r'>', line):
                fprintf(mapfile, "%s\n", line.strip())
        seqFile.close()
        mapfile.close()

    return dbmapFile
def process_input(input, output, input_type, gene_list, append, errorlogger=None):
    commentPATT = re.compile(r'^#')
    count = 0
    mode = 'w'
    if append:
        mode = 'a'

    gene_list = read_gene_list(gene_list)
    gene_dict = {}
    for gene in gene_list:
        gene_dict[gene.lower()] = gene

    # column positions of the query and target ids for each input format;
    # default to the LAST1 layout so unrecognized types do not raise NameError
    q, t = 0, 1
    if input_type == 'LAST2':
        q = 0
        t = 9
    if input_type == 'LAST1':
        q = 0
        t = 1
    if input_type == 'HMM':
        q = 2
        t = 0

    try:
        inputfile = open(input, 'r')
        outputfile = open(output, mode)
    except IOError:
        if errorlogger:
            errorlogger.write("PARSE_BLAST\tERROR\tCannot open input file %s or output file %s\n" % (input, output))
        exit_process("PARSE_BLAST\tERROR\tCannot open input file %s or output file %s\n" % (input, output))

    for line in inputfile:
        result = commentPATT.search(line)
        if result:
            continue

        fields = [x.strip() for x in line.split('\t')]
        if len(fields) < 3:
            continue

        orfid = fields[q]
        target = find_gene_name(fields[t], gene_list, gene_dict)
        if target is None:
            continue

        fprintf(outputfile, "%s\t%s\n", orfid, gene_dict[target])
        count += 1  # count of ORF-to-gene assignments written

    outputfile.close()
    inputfile.close()
    return count
def write_run_parameters_file(fileName, parameters):
    try:
        paramFile = open(fileName, 'w')
    except IOError:
        eprintf("Cannot write run parameters to file %s!\n", fileName)
        exit_process("Cannot write run parameters to file %s" % (fileName))

    # 16s_rRNA {'min_identity': '40', 'max_evalue': '0.000001', 'min_bitscore': '06', 'refdbs': 'silva_104_rep_set,greengenes_db_DW'}
    paramFile.write("\nRun Date : " + str(date.today()) + " \n")

    paramFile.write("\n\nNucleotide Quality Control parameters\n")
    paramFile.write("  min length" + "\t" + str(parameters['quality_control']['min_length']) + "\n")

    paramFile.write("\n\nORF prediction parameters\n")
    paramFile.write("  min length" + "\t" + str(parameters['orf_prediction']['min_length']) + "\n")
    paramFile.write("  algorithm" + "\t" + str(parameters['orf_prediction']['algorithm']) + "\n")

    paramFile.write("\n\nAmino acid quality control and annotation parameters\n")
    paramFile.write("  min bit score" + "\t" + str(parameters['annotation']['min_score']) + "\n")
    paramFile.write("  min seq length" + "\t" + str(parameters['annotation']['min_length']) + "\n")
    paramFile.write("  annotation reference dbs" + "\t" + str(parameters['annotation']['dbs']) + "\n")
    paramFile.write("  min BSR" + "\t" + str(parameters['annotation']['min_bsr']) + "\n")
    paramFile.write("  max evalue" + "\t" + str(parameters['annotation']['max_evalue']) + "\n")

    paramFile.write("\n\nPathway Tools parameters\n")
    paramFile.write("  taxonomic pruning " + "\t" + str(parameters['ptools_settings']['taxonomic_pruning']) + "\n")

    paramFile.write("\n\nrRNA search/match parameters\n")
    paramFile.write("  min identity" + "\t" + str(parameters['rRNA']['min_identity']) + "\n")
    paramFile.write("  max evalue" + "\t" + str(parameters['rRNA']['max_evalue']) + "\n")
    paramFile.write("  rRNA reference dbs" + "\t" + str(parameters['rRNA']['refdbs']) + "\n")

    paramFile.close()
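# --- minimal sketch of the nested `parameters` dict the writer above expects;
# all values and db names are illustrative, not defaults ---
parameters = {
    'quality_control': {'min_length': '180'},
    'orf_prediction': {'min_length': '60', 'algorithm': 'prodigal'},
    'annotation': {'min_score': '20', 'min_length': '60',
                   'dbs': 'COG-2013,refseq', 'min_bsr': '0.4',
                   'max_evalue': '0.000001'},
    'ptools_settings': {'taxonomic_pruning': 'no'},
    'rRNA': {'min_identity': '40', 'max_evalue': '0.000001',
             'refdbs': 'silva_104_rep_set,greengenes_db_DW'},
}
# write_run_parameters_file('run_parameters.txt', parameters)  # hypothetical path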
def process_blastoutput(dbname, blastoutput, mapfile, refscore_file, opts, errorlogger=None):
    blastparser = BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts, errorlogger=errorlogger)
    blastparser.setMaxErrorsLimit(100)
    blastparser.setErrorAndWarningLogger(errorlogger)
    blastparser.setSTEP_NAME('PARSE BLAST')

    fields = ['target', 'q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec']
    if opts.taxonomy:
        fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = opts.parsed_output
    # a temporary file is used to deal with incomplete processing of the file
    output_blastoutput_parsed_tmp = output_blastoutput_parsed + ".tmp"
    try:
        outputfile = open(output_blastoutput_parsed_tmp, 'w')
    except IOError:
        if errorlogger:
            errorlogger.write("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n" % (output_blastoutput_parsed_tmp, dbname))
        exit_process("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n" % (output_blastoutput_parsed_tmp, dbname))

    # write the headers out
    fprintf(outputfile, "#%s", 'query')
    for field in fields:
        fprintf(outputfile, "\t%s", field)
    fprintf(outputfile, "\n")

    pattern = re.compile(r'(\d+_\d+)$')

    count = 0
    uniques = {}
    for data in blastparser:
        if not data:
            continue
        try:
            fprintf(outputfile, "%s", data['query'])
            result = pattern.search(data['query'])
            if result:
                name = result.group(1)
                uniques[name] = True
        except:
            print 'data is : ', data, '\n'
            return count, len(uniques)

        for field in fields:
            fprintf(outputfile, "\t%s", data[field])
        fprintf(outputfile, "\n")
        count += 1

    outputfile.close()
    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)
    return count, len(uniques)
def next(self):
    if self.i % self.SIZE == 0:
        self.refillBuffer()

    if len(self.lines) == 0:
        raise StopIteration()

    if self.i % self.SIZE < self.size:
        fields = [x.strip() for x in self.lines[self.i % self.SIZE].split('\t')]
        try:
            self.data = {}
            self.data['query'] = fields[self.fieldmap['query']]
            self.data['q_length'] = int(fields[self.fieldmap['q_length']])
            self.data['bitscore'] = float(fields[self.fieldmap['bitscore']])
            self.data['bsr'] = float(fields[self.fieldmap['bsr']])
            self.data['target'] = fields[self.fieldmap['target']]
            self.data['aln_length'] = float(fields[self.fieldmap['aln_length']])
            self.data['expect'] = float(fields[self.fieldmap['expect']])
            self.data['identity'] = float(fields[self.fieldmap['identity']])
            self.data['ec'] = fields[self.fieldmap['ec']]
            self.data['product'] = re.sub(r'=', ' ', fields[self.fieldmap['product']])
            self.lineToProcess = self.lines[self.i % self.SIZE]
        except:
            self.ERROR_COUNT += 1
            if self.MAX_READ_ERRORS_ALLOWED > self.ERROR_COUNT:
                eprintf("%s\tWARNING\till-formatted line \"%s\" \t %s\n",
                        self.STEP_NAME, self.lines[self.i % self.SIZE], self.blastoutput)
                if self.error_and_warning_logger != None:
                    self.error_and_warning_logger.write(
                        "%s\tWARNING\till-formatted line :\"%s\" \t source : %s\n"
                        % (self.STEP_NAME,
                           re.sub(r'\t', '<tab>', self.lines[self.i % self.SIZE]),
                           self.blastoutput))
                # skip the bad line and return the next parsable record
                self.i = self.i + 1
                return self.next()
            else:
                if self.error_and_warning_logger != None:
                    self.error_and_warning_logger.write(
                        "%s\tERROR\tThe number of read errors in file %s exceeded the max tolerance %d\n"
                        % (self.STEP_NAME, self.blastoutput, self.MAX_READ_ERRORS_ALLOWED))
                exit_process()

        self.i = self.i + 1
        return self.data
    else:
        self.lineToProcess = None
        self.blastoutputfile.close()
        raise StopIteration()
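# --- usage sketch: the parser is consumed as a plain iterator (it is iterated
# elsewhere in this module, so the class presumably also defines __iter__) ---
# parser = BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts)
# for data in parser:
#     if data:
#         print data['query'], data['bitscore'], data['product']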
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser
    options, args = parser.parse_args(argv)

    # is there a PathwayTools executable installed? (check currently disabled)
    if False and not path.exists(options.ptoolsExec):
        eprintf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
        if errorlogger:
            errorlogger.printf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
        exit_process("ERROR\tPathwayTools executable %s not found!\n" % (options.ptoolsExec))

    # command to build the ePGDB
    command = "%s " % (options.ptoolsExec)
    command += " -api"

    pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
    #resultLines = pythonCyc.getReactionListLines()
    resultLines = pythonCyc.getFlatFiles()
    StopPathwayTools()

    try:
        if False:  # reaction-list extraction currently disabled
            pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
            pythonCyc.setDebug()  # disable pathway debug statements
            printf("INFO\tExtracting the reaction list from ePGDB " + options.sample_name + "\n")
            resultLines = pythonCyc.getReactionListLines()
            reaction_list_file = open(options.reactions_list + ".tmp", 'w')
            for line in resultLines:
                fprintf(reaction_list_file, "%s\n", line.strip())
            reaction_list_file.close()
            StopPathwayTools()
    except:
        traceback.print_exc(10)
        eprintf("ERROR\tFailed to run extract pathways for %s : \n" % (options.sample_name))
        eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
        if errorlogger:
            errorlogger.write("ERROR\tFailed to run extract pathways for %s : " % (options.sample_name))
            errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
        StopPathwayTools()
def read_pipeline_configuration(file, globallogger):
    patternKEYVALUE = re.compile(r'^([^\t\s]+)[\t\s]+\'(.*)\'')
    try:
        configfile = open(file, 'r')
    except IOError:
        eprintf("ERROR : Did not find pipeline config %s!\n", file)
        if globallogger != None:
            globallogger.write("ERROR\tDid not find pipeline config %s!\n" % (file))
        exit_process()

    lines = configfile.readlines()
    configfile.close()

    config_settings = {}
    for line in lines:
        if not re.match("#", line) and len(line.strip()) > 0:
            line = line.strip()
            result = patternKEYVALUE.search(line)
            try:
                if len(result.groups()) == 2:
                    fields = result.groups()
                else:
                    eprintf("    The following line in your config settings file is not set up yet\n")
                    eprintf("    Please rerun the pipeline after setting up this line\n")
                    eprintf("    Error in line : %s\n", line)
                    globallogger.write(
                        "WARNING\t\n"
                        + "    The following line in your config settings file is not set up yet\n"
                        + "    Please rerun the pipeline after setting up this line\n"
                        + "    Error in line : %s\n" % (line))
                    exit_process()
            except:
                eprintf("    The following line in your config settings file is not set up yet\n")
                eprintf("    Please rerun the pipeline after setting up this line\n")
                eprintf("    Error in line : %s\n", line)
                globallogger.write(
                    "WARNING\t\n"
                    + "    The following line in your config settings file is not set up yet\n"
                    + "    Please rerun the pipeline after setting up this line\n"
                    + "    Error in line : %s\n" % (line))
                exit_process()

            # normalize path separators for the current platform
            if PATHDELIM == '\\':
                config_settings[fields[0]] = re.sub(r'/', r'\\', fields[1])
            else:
                config_settings[fields[0]] = re.sub(r'\\', '/', fields[1])

    config_settings['METAPATHWAYS_PATH'] = config_settings['METAPATHWAYS_PATH'] + PATHDELIM
    config_settings['REFDBS'] = config_settings['REFDBS'] + PATHDELIM

    check_config_settings(config_settings, file, globallogger)
    config_settings['configuration_file'] = file
    return config_settings
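# --- illustrative sketch of the key/value format parsed above; the example
# paths are hypothetical ---
# Each non-comment config line is "<KEY><whitespace>'<value>'":
import re
patternKEYVALUE = re.compile(r'^([^\t\s]+)[\t\s]+\'(.*)\'')
for example in ["METAPATHWAYS_PATH\t'/opt/MetaPathways'", "REFDBS\t'/data/refdbs'"]:
    print patternKEYVALUE.search(example).groups()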
def checkParam_values(allcategorychoices, parameters, runlogger=None):
    for category in allcategorychoices:
        for choice in allcategorychoices[category]:
            if choice in parameters:
                if not parameters[choice] in allcategorychoices[category][choice]:
                    if runlogger:
                        runlogger.write('ERROR\tIncorrect setting in your parameter file')
                        runlogger.write(' for step %s as %s' % (choice, parameters[choice]))
                    eprintf("ERROR: Incorrect setting in your parameter file"
                            " for step %s as %s", choice, parameters[choice])
                    exit_process()
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser
    options, args = parser.parse_args(argv)

    if options.algorithm == 'BLAST':
        _execute_BLAST(options)
    elif options.algorithm == 'LAST':
        _execute_LAST(options)
    else:
        eprintf("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
        if errorlogger:
            errorlogger.printf("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
        exit_process("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
def process_blastoutput(dbname, blastoutput, mapfile, refscore_file, opts, errorlogger=None):
    blastparser = BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts, errorlogger=errorlogger)
    blastparser.setMaxErrorsLimit(100)
    blastparser.setErrorAndWarningLogger(errorlogger)
    blastparser.setSTEP_NAME('PARSE BLAST')

    fields = ['target', 'q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec']
    if opts.taxonomy:
        fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = blastoutput + '.parsed.txt'
    # a temporary file is used to deal with incomplete processing of the file
    output_blastoutput_parsed_tmp = output_blastoutput_parsed + ".tmp"
    try:
        outputfile = open(output_blastoutput_parsed_tmp, 'w')
    except IOError:
        if errorlogger:
            errorlogger.write("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n" % (output_blastoutput_parsed_tmp, dbname))
        exit_process("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n" % (output_blastoutput_parsed_tmp, dbname))

    # write the headers out
    fprintf(outputfile, "#%s", 'query')
    for field in fields:
        fprintf(outputfile, "\t%s", field)
    fprintf(outputfile, "\n")

    count = 0
    for data in blastparser:
        if not data:
            continue
        try:
            fprintf(outputfile, "%s", data['query'])
        except:
            print 'data is : ', data, '\n'
            sys.exit()

        for field in fields:
            fprintf(outputfile, "\t%s", data[field])
        fprintf(outputfile, "\n")
        count += 1

    outputfile.close()
    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)
    return count
def permuteForLAST(self, words):
    try:
        temp = copy(words)
        words[0] = temp[6]   # query
        words[1] = temp[1]   # target
        words[2] = 100.0     # percent id
        words[3] = temp[3]   # aln length
        words[6] = temp[2]
        words[7] = int(temp[2]) + int(temp[3]) - 1
        words[10] = 0.0      # evalue
        words[11] = temp[0]
    except:
        eprintf("ERROR : Invalid B/LAST output file %s \n" % (self.blastoutput))
        if self.error_and_warning_logger:
            self.error_and_warning_logger.write("ERROR : Invalid B/LAST output file %s\n" % (self.blastoutput))
        exit_process("ERROR : Invalid B/LAST output file %s " % (self.blastoutput))
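# --- sketch of the in-place column permutation above: LAST tabular columns are
# rearranged into the BLAST-style layout the rest of the parser expects (the
# column labels below follow the inline comments and the arithmetic) ---
#   words[0]  <- temp[6]                 query id
#   words[1]  <- temp[1]                 target id
#   words[2]  <- 100.0                   percent identity (not reported by LAST)
#   words[3]  <- temp[3]                 alignment length
#   words[6]  <- temp[2]                 query start
#   words[7]  <- temp[2] + temp[3] - 1   query end
#   words[10] <- 0.0                     evalue placeholder
#   words[11] <- temp[0]                 LAST score, used in place of the bitscore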
def add_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open(blast_table_out, 'r')

    refscores = {}
    lines = infile.readlines()
    for line in lines:
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) != 12:
            eprintf("ERROR: Error in line \n%s\n of the blastout file %s" % (line, blast_table_out))
            exit_process("ERROR: Error in line \n%s\n of the blastout file %s" % (line, blast_table_out))

    for key, value in refscores.iteritems():
        allNames[key] = True
        fprintf(refscore_file, "%s\t%s\n", key, value)

    infile.close()
def create_dictionary(databasemapfile, annot_map, query_dictionary, errorlogger=None):
    if not query_dictionary:
        print "WARNING : empty query dictionary in parse B/LAST"
        if errorlogger:
            errorlogger.write("WARNING : empty query dictionary in parse B/LAST\n")
        return

    seq_beg_pattern = re.compile(">")
    try:
        dbmapfile = open(databasemapfile, 'r')
    except IOError:
        if errorlogger:
            errorlogger.write("PARSE_BLAST\tERROR\tCannot open database map file %s\tPlease check the file manually\n" % (databasemapfile))
        exit_process("ERROR: Cannot open database map file %s\n" % (databasemapfile))

    for line in dbmapfile:
        if seq_beg_pattern.search(line):
            words = line.rstrip().split()
            name = words[0].replace('>', '', 1)
            # only keep annotations for sequences that actually appear as hits
            if not name in query_dictionary:
                continue
            words.pop(0)
            if len(words) == 0:
                annotation = 'hypothetical protein'
            else:
                annotation = ' '.join(words)
            annot_map[name] = annotation
    dbmapfile.close()

    if len(annot_map) == 0:
        if errorlogger:
            errorlogger.write("PARSE_BLAST\tERROR\tFile " + databasemapfile + " seems to be empty!\tCreate database map file\n")
            errorlogger.write("Try re-running after deleting file : %s\n" % (databasemapfile))
        exit_process("no annotations in file :" + databasemapfile)
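# --- standalone sketch of the header parsing done above; the header line is
# made up ---
line = '>gi|12345|ref|NP_000001.1| chaperonin GroEL [Escherichia coli]'
words = line.rstrip().split()
name = words[0].replace('>', '', 1)
annotation = ' '.join(words[1:]) or 'hypothetical protein'
print name, '->', annotation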
def __init__(self, dbname, blastoutput): self.lineToProcess = "" self.dbname = dbname self.blastoutput = blastoutput self.i = 0 self.SIZE = 10000 self.data = {} self.fieldmap = {} self.seq_beg_pattern = re.compile("^#") self.lines = [] self.headerline = None self.MAX_READ_ERRORS_ALLOWED = 0 self.ERROR_COUNT = 0 self.STEP_NAME = 'CREATE_REPORT_FILES' #PARSE_BLAST' self.error_and_warning_logger = None try: self.blastoutputfile = open(blastoutput, 'r') line = self.blastoutputfile.readline() if not self.seq_beg_pattern.search(line): eprintf( "First line must have field header names and begin with \"#\"\n" ) exit_process() self.headerline = line.strip() self.lineToProcess = self.headerline header = re.sub('^#', '', line) fields = [x.strip() for x in header.rstrip().split('\t')] k = 0 for x in fields: self.fieldmap[x] = k k += 1 except AttributeError: print "Cannot read the map file for database :" + dbname sys.exit(0)
def checkMetapathsteps(params, runlogger=None):
    choices = {'metapaths_steps': {}, 'annotation': {}, 'INPUT': {}}

    choices['INPUT']['format'] = ['fasta', 'gbk_unannotated', 'gbk_annotated',
                                  'gff_unannotated', 'gff_annotated']
    choices['annotation']['algorithm'] = ['last', 'blast']

    # every pipeline step accepts the same four settings; BLAST_REFDB
    # additionally accepts 'grid'
    steps = ['PREPROCESS_FASTA', 'ORF_PREDICTION', 'GFF_TO_AMINO',
             'FILTERED_FASTA', 'COMPUTE_REFSCORE', 'BLAST_REFDB',
             'PARSE_BLAST', 'SCAN_rRNA', 'STATS_rRNA', 'ANNOTATE',
             'PATHOLOGIC_INPUT', 'GENBANK_FILE', 'CREATE_SEQUIN_FILE',
             'CREATE_REPORT_FILES', 'SCAN_tRNA', 'MLTREEMAP_CALCULATION',
             'MLTREEMAP_IMAGEMAKER', 'PATHOLOGIC']
    for step in steps:
        choices['metapaths_steps'][step] = ['yes', 'skip', 'stop', 'redo']
    choices['metapaths_steps']['BLAST_REFDB'].append('grid')

    if params['metapaths_steps']:
        checkParam_values(choices, params['metapaths_steps'], runlogger)

    checkparams = {}
    checkparams['annotation'] = []
    checkparams['annotation'].append('dbs')

    if not checkMissingParam_values(params, checkparams, runlogger):
        exit_process("Missing parameters")
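# --- minimal sketch of a `params` dict accepted above; values are illustrative ---
params = {
    'metapaths_steps': {'PREPROCESS_FASTA': 'yes',
                        'ORF_PREDICTION': 'redo',
                        'BLAST_REFDB': 'grid'},
    'annotation': {'dbs': 'COG-2013'},  # 'dbs' is the one parameter checked as required
}
# checkMetapathsteps(params)  # exits the process on an invalid step value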
def __init__(self, dbname, blastoutput, database_mapfile, refscore_file, opts, errorlogger=None):
    self.Size = 10000
    self.dbname = dbname
    self.ln2 = 0.69314718055994530941
    self.lnk = math.log(opts.k)
    self.Lambda = opts.Lambda
    self.blastoutput = blastoutput
    self.database_mapfile = database_mapfile
    self.refscore_file = refscore_file
    self.annot_map = {}
    self.i = 0
    self.opts = opts
    self.hits_counts = {}
    self.data = {}
    self.refscores = {}
    self.refBitScores = {}
    self.needToPermute = False
    self.MAX_READ_ERRORS_ALLOWED = 100
    self.ERROR_COUNT = 0
    self.STEP_NAME = 'PARSE_BLAST'
    self.error_and_warning_logger = errorlogger

    query_dictionary = {}
    create_query_dictionary(self.blastoutput, query_dictionary,
                            self.opts.algorithm, errorlogger=errorlogger)
    try:
        self.blastoutputfile = open(self.blastoutput, 'r')
    except IOError:
        eprintf("\nERROR : cannot open B/LAST output file " + blastoutput + " to parse "
                + ": make sure \"B/LAST\"ing was done for the particular database")
        if self.error_and_warning_logger:
            self.error_and_warning_logger.write(
                "ERROR : cannot open B/LAST output file %s to parse : "
                "make sure \"B/LAST\"ing was done for the particular database\n" % (blastoutput))
        exit_process("Cannot open B/LAST output file " + blastoutput)

    try:
        self.create_refBitScores()
    except:
        traceback.print_exc(10)
        exit_process("Error while reading from B/LAST refscore file " + self.refscore_file)

    try:
        create_dictionary(database_mapfile, self.annot_map, query_dictionary)
        query_dictionary = {}
    except AttributeError:
        eprintf("Cannot read the map file for database : %s\n" % (dbname))
        if errorlogger != None:
            errorlogger.write("PARSE_BLAST\tERROR\tCannot read the map file %s for database : %s\tDelete the formatted files for the database in the \"formatted\" folder\n" % (database_mapfile, dbname))
        exit_process("Cannot read the map file for database " + dbname)
def isWithinCutoffs(self, words, data, cutoffs, annot_map, refbitscores):
    data['query'] = words[0]

    try:
        data['target'] = words[1]
    except:
        data['target'] = 0

    try:
        data['q_length'] = int(words[7]) - int(words[6]) + 1
    except:
        data['q_length'] = 0

    try:
        data['bitscore'] = float(words[11])
    except:
        data['bitscore'] = 0

    try:
        data['bsr'] = float(words[11]) / refbitscores[words[0]]
    except:
        data['bsr'] = 0

    try:
        data['expect'] = float(words[10])
    except:
        data['expect'] = 0

    try:
        data['aln_length'] = float(words[3])
    except:
        data['aln_length'] = 0

    try:
        data['identity'] = float(words[2])
    except:
        data['identity'] = 0

    try:
        data['product'] = annot_map[words[1]]
    except:
        eprintf("Sequence with name \"" + words[1] + "\" is not present in map file\n")
        if self.error_and_warning_logger:
            self.error_and_warning_logger.write("Sequence with name %s is not present in map file\n" % (words[1]))
        self.incErrorCount()
        if self.maxErrorsReached():
            if self.error_and_warning_logger:
                self.error_and_warning_logger.write("Number of sequences absent in map file %s exceeds %d\n" % (self.blastoutput, self.ERROR_COUNT))
            exit_process("Number of sequences absent in map file %s exceeds %d" % (self.blastoutput, self.ERROR_COUNT))
        data['product'] = 'hypothetical protein'

    # pull an EC number out of the product, if present
    try:
        m = re.search(r'(\d+[.]\d+[.]\d+[.]\d+)', data['product'])
        if m != None:
            data['ec'] = m.group(0)
        else:
            data['ec'] = ''
    except:
        data['ec'] = ''

    if cutoffs.taxonomy:
        # the bracketed suffix of the product line carries the taxonomy
        try:
            m = re.search(r'\[([^\[]+)\]', data['product'])
            if m != None:
                data['taxonomy'] = m.group(1)
            else:
                data['taxonomy'] = ''
        except:
            data['taxonomy'] = ''

    if cutoffs.remove_taxonomy:
        try:
            data['product'] = re.sub(r'\[([^\[]+)\]', '', data['product'])
        except:
            data['product'] = ''

    if cutoffs.remove_ec:
        try:
            data['product'] = re.sub(r'\([Ee][Ce][:]\d+[.]\d+[.]\d+[.]\d+\)', '', data['product'])
            data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]\d+[.]\d+[.]\d+\]', '', data['product'])
            data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]\d+[.]\d+[.-]\]', '', data['product'])
            data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]\d+[.-.-]\]', '', data['product'])
            data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.-.-.-]\]', '', data['product'])
        except:
            data['product'] = ''

    if float(data['q_length']) < cutoffs.min_length:
        return False
    if float(data['bitscore']) < cutoffs.min_score:
        return False
    if float(data['expect']) > cutoffs.max_evalue:
        return False
    if float(data['identity']) < cutoffs.min_identity:
        return False
    if float(data['bsr']) < cutoffs.min_bsr:
        return False

    return True
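# --- sketch of the `cutoffs` object consulted above (an optparse-style options
# object; the attribute values are illustrative, not defaults) ---
class Cutoffs:
    min_length = 60
    min_score = 20
    max_evalue = 1e-6
    min_identity = 40.0
    min_bsr = 0.4
    taxonomy = True
    remove_taxonomy = False
    remove_ec = True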
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser
    options, args = parser.parse_args(argv)

    if not len(options.blast_files):
        parser.error('At least one taxonomic BLAST output is required')

    if runBlastCommandrRNA(runcommand=runcommand) != 0:
        if errorlogger:
            errorlogger.write("ERROR: Failed to BLAST the sequences against database %s : " % (options.tax_databases[0]))
            errorlogger.write(" : " + runcommand)
        exit_process("ERROR: Failed to BLAST the sequences against database %s : " % (options.tax_databases[0])
                     + " : " + runcommand)

    if not (len(options.tax_databases) == len(options.blast_files)):
        parser.error('Number of taxonomic databases and BLAST outputs should be the same')

    if not options.output:
        parser.error('Output file must be specified')

    # Incredible sanity check
    if not files_exist(options.blast_files):
        sys.exit(0)
    if not files_exist(options.tax_databases):
        sys.exit(0)

    params = {'length': int(options.length),
              'similarity': float(options.similarity),
              'evalue': float(options.evalue),
              'bitscore': float(options.bitscore)}

    table = {}
    for x in range(0, len(options.blast_files)):
        table[options.tax_databases[x]] = {}
        process_blastout_file(options.blast_files[x], options.tax_databases[x],
                              table[options.tax_databases[x]], errorlogger=errorlogger)

    priority = 7000
    reads = {}
    for x in range(0, len(options.blast_files)):
        append_taxonomic_information(options.tax_databases[x], table[options.tax_databases[x]], params)
        for key in table[options.tax_databases[x]]:
            if len(table[options.tax_databases[x]][key][6]) > 1:
                reads[key] = True
        dbname = re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x])
        runstatslogger.write("%s\tTaxonomic hits in %s\t%s\n" % (str(priority), dbname, str(len(reads))))
        priority += 1

    outputfile = open(options.output, 'w')
    fprintf(outputfile, "#Similarity cutoff :\t" + str(params['similarity']) + '\n')
    fprintf(outputfile, "#Length cutoff :\t" + str(params['length']) + '\n')
    fprintf(outputfile, "#Evalue cutoff :\t" + str(params['evalue']) + '\n')
    fprintf(outputfile, "#Bit score cutoff :\t" + str(params['bitscore']) + '\n')
    fprintf(outputfile, "#Number of rRNA sequences detected:\t" + str(len(reads)) + '\n\n')

    # one column group per taxonomic database
    for x in range(0, len(options.tax_databases)):
        fprintf(outputfile, '\t%s\t\t\t', re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x]))
    fprintf(outputfile, '\n')

    for x in range(0, len(options.blast_files)):
        fprintf(outputfile, '%s\t%s\t%s\t%s\t%s\t%s\t%s',
                'sequence', 'start', 'end', 'similarity', 'evalue', 'bitscore', 'taxonomy')
    fprintf(outputfile, '\n')

    for read in reads:
        fprintf(outputfile, '%s', read)
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
                fprintf(outputfile, '\t%s\t%s\t%s\t%s\t%s\t%s',
                        str(table[options.tax_databases[x]][read][4]),
                        str(table[options.tax_databases[x]][read][5]),
                        str(table[options.tax_databases[x]][read][0]),
                        str(table[options.tax_databases[x]][read][1]),
                        str(table[options.tax_databases[x]][read][2]),
                        str(table[options.tax_databases[x]][read][6]))
            else:
                fprintf(outputfile, '\t-\t-\t-\t-\t-\t-')
        fprintf(outputfile, '\n')
    outputfile.close()

    # collect the exact reads
    database_hits = {}
    for read in reads:
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
                database_hits[read] = [table[options.tax_databases[x]][read][4],
                                       table[options.tax_databases[x]][read][5]]

    # pick the hits, trim them according to the match and write them
    if options.fasta:
        selected_sequences = {}
        read_select_fasta_sequences(database_hits, selected_sequences, options.fasta)
        for read in database_hits:
            selected_sequences[read] = selected_sequences[read][database_hits[read][0]:database_hits[read][1]]
        write_selected_sequences(selected_sequences, options.output + '.fasta')
def check_an_format_refdb(dbname, seqType, config_settings, params, globallogger=None):
    algorithm = get_parameter(params, 'annotation', 'algorithm').upper()

    suffixes = []
    # we do not use LAST for searching in the taxonomic databases, e.g., greengenes, silva, etc.;
    # if the db formatting request is done with nucl and LAST, we switch to BLAST-based formatting
    if algorithm == 'LAST' and seqType == 'nucl':
        algorithm = 'BLAST'

    if algorithm == 'LAST' and seqType == 'prot':
        suffixes = ['des', 'sds', 'suf', 'bck', 'prj', 'ssp', 'tis']

    if algorithm == 'BLAST':
        if seqType == 'prot':
            suffixes = ['phr', 'psq', 'pin']
        if seqType == 'nucl':
            suffixes = ['nhr', 'nsq', 'nin']

    # formatted DB directories
    taxonomic_formatted = config_settings['REFDBS'] + PATHDELIM + 'taxonomic' + PATHDELIM + 'formatted'
    functional_formatted = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + 'formatted'

    # check if the formatted folders exist; if not, create them
    for d in [taxonomic_formatted, functional_formatted]:
        if not createFolderIfNotFound(d):
            eprintf("WARNING : Creating formatted subdirectory in blastDB folder.\n")

    # formatted database output paths
    if seqType == 'nucl':
        seqPath = config_settings['REFDBS'] + PATHDELIM + 'taxonomic' + PATHDELIM + dbname
        formattedDBPath = taxonomic_formatted + PATHDELIM + dbname
    elif seqType == 'prot':
        seqPath = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + dbname
        formattedDBPath = functional_formatted + PATHDELIM + dbname
    else:
        eprintf("ERROR : Undefined sequence type for %s!\n", dbname)
        if globallogger != None:
            globallogger.write("ERROR\tUndefined sequence type for %s!\n" % (dbname))
        exit_process()

    # database formatting executables paths
    if algorithm == 'LAST' and seqType == 'prot':
        executable = config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['LASTDB_EXECUTABLE']
    else:  # algorithm == 'BLAST'
        executable = config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['FORMATDB_EXECUTABLE']

    if not (formatted_db_exists(formattedDBPath, suffixes)):
        eprintf("WARNING : You do not seem to have database %s formatted!\n", dbname)
        if globallogger != None:
            globallogger.write("WARNING\tYou do not seem to have database %s formatted!\n" % (dbname))

        if check_if_raw_sequences_exist(seqPath):
            eprintf("          Found raw sequences for database %s in folder %s!\n", dbname, seqPath)
            eprintf("          Trying to format on the fly .... for %s!\n", algorithm)
            if globallogger != None:
                globallogger.write("WARNING\tFound raw sequences for database %s in folder %s!\n" % (dbname, seqPath))
                globallogger.write("Trying to format on the fly .... for %s!\n" % (algorithm))

            result = format_db(executable, seqType, seqPath, formattedDBPath, algorithm)
            if result == True:
                eprintf("          Formatting successful!\n")
                return
            else:
                eprintf("          Formatting failed! Please consider formatting manually or do not try to annotate with this database!\n")
                if globallogger != None:
                    globallogger.write("ERROR\tFormatting failed! Please consider formatting manually or do not try to annotate with this database!\n")
                exit_process()

        eprintf("ERROR : You do not even have the raw sequence for database %s to format!\n", dbname)
        eprintf("        in the folder %s\n", seqPath)
        eprintf("        Please put the appropriate files in folder \"blastDB\"\n")
        if globallogger != None:
            globallogger.write("ERROR\tYou do not even have the raw sequence for database %s to format!\n" % (dbname))
            globallogger.write("in the folder %s\n" % (seqPath))
            globallogger.write("Please put the appropriate files in folder \"blastDB\"\n")
        exit_process()
def check_config_settings(config_settings, file, globalerrorlogger=None):
    essentialItems = ['METAPATHWAYS_PATH', 'EXECUTABLES_DIR', 'RESOURCES_DIR']
    missingItems = []

    for key, value in config_settings.items():
        # these are not files or executables
        if key in ['NUM_CPUS', 'FORMATTED_DB_SIZE']:
            continue
        if key in ['FORMATDB_EXECUTABLE', 'BLASTP_EXECUTABLE', 'BLASTN_EXECUTABLE'] and value == '':
            continue

        # make sure the MetaPathways directory is present
        if key in ['METAPATHWAYS_PATH']:
            if not path.isdir(config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 1. Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("   Currently it is set to \"%s\". Please correct it and try again.\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the REFDBS directory is present
        if key in ['REFDBS']:
            if not path.isdir(config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 2. Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\". Please correct it and try again.\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the EXECUTABLES_DIR directory is present
        if key in ['EXECUTABLES_DIR']:
            if not path.isdir(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 3. Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\". Please correct the path.\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        if key in ['ACCESSION_TO_TAXONID']:
            if not path.isfile(config_settings['REFDBS'] + PATHDELIM + 'ncbi_tree' + PATHDELIM + config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 7. Currently it is set to \"%s\"\n", config_settings['REFDBS'] + PATHDELIM + 'ncbi_tree' + PATHDELIM + config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\". Please correct the path to compute LCA with accession id translation.\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the RESOURCES_DIR directory is present
        if key in ['RESOURCES_DIR']:
            if not path.isdir(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 4. Currently it is set to \"%s\"\n", config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure these executables are present
        if key in ['PYTHON_EXECUTABLE', 'PATHOLOGIC_EXECUTABLE']:
            if not path.isfile(config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 5. Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # ignore the pgdb folder for now
        if key in ['PGDB_FOLDER']:
            continue

        # check if the desired file exists; if not, print a message
        if not path.isfile(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + value) \
                and not path.isfile(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value):
            eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
            eprintf("6. Currently it is set to \"%s\"\n", config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value)
            if globalerrorlogger != None:
                globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                globalerrorlogger.write("Currently it is set to \"%s\"\n" % (config_settings['METAPATHWAYS_PATH'] + value))
            missingItems.append(key)
            continue

    stop_execution = False
    for item in missingItems:
        if item in essentialItems:
            eprintf("ERROR\tEssential setting %s is missing in the configuration file!\n", item)
            if globalerrorlogger != None:
                globalerrorlogger.write("ERROR\tEssential setting %s is missing in the configuration file!\n" % (item))
            stop_execution = True

    if stop_execution:
        eprintf("ERROR: Terminating execution due to missing essential fields in the configuration file!\n")
        if globalerrorlogger != None:
            globalerrorlogger.write("ERROR\tTerminating execution due to missing essential fields in the configuration file!\n")
        exit_process()
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser
    options, args = parser.parse_args(argv)

    if options.inputfolder == None:
        parser.error('ERROR\tInput folder for Pathologic not found')
    else:
        # required files to be able to build the ePGDB
        files = [
            # options.inputfolder + PATHDELIM + '0.pf',
            # options.inputfolder + PATHDELIM + '0.fasta',
            options.inputfolder + PATHDELIM + 'genetic-elements.dat',
            options.inputfolder + PATHDELIM + 'organism-params.dat'
        ]
        if files_exist(files, errorlogger=errorlogger):
            exit_process("ERROR\tCannot find all inputs for Pathologic in folder %s : " % (options.inputfolder))

    # is there a PathwayTools executable installed?
    if not path.exists(options.ptoolsExec):
        eprintf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
        if errorlogger:
            errorlogger.printf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
        exit_process("ERROR\tPathwayTools executable %s not found!\n" % (options.ptoolsExec))

    # command to build the ePGDB
    command = "%s -patho %s" % (options.ptoolsExec, options.inputfolder)
    if options.no_taxonomic_pruning:
        command += " -no-taxonomic-pruning "
    if options.no_web_cel_overview:
        command += " -no-web-cel-overview"
    command += " -tip"
    command += " -api"

    status = 0
    fix_pgdb_input_files(options.pgdbdir, pgdbs=[])
    if not path.exists(options.pgdbdir):
        status = runPathologicCommand(runcommand=command)
        fix_pgdb_input_files(options.pgdbdir, pgdbs=[])

    if status != 0:
        eprintf("ERROR\tFailed to run Pathologic on input %s : \n" % (options.inputfolder))
        eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
        if errorlogger:
            errorlogger.write("ERROR\tFailed to run Pathologic on input %s : " % (options.inputfolder))
            errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again")
            errorlogger.write(" : " + command)
        insert_error(9)
        sys.exit(0)
        #exit_process("ERROR\tFailed to run Pathologic on input %s : " % (options.inputfolder))

    if not path.exists(options.reactions_list):
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
            pythonCyc.setDebug()  # disable pathway debug statements
            printf("INFO\tExtracting the reaction list from ePGDB " + options.sample_name + "\n")
            resultLines = pythonCyc.getReactionListLines()
            #pythonCyc.stopPathwayTools()
            reaction_list_file = open(options.reactions_list + ".tmp", 'w')
            for line in resultLines:
                fprintf(reaction_list_file, "%s\n", line.strip())
            reaction_list_file.close()
            rename(options.reactions_list + ".tmp", options.reactions_list)
            StopPathwayTools()
        except:
            traceback.print_exc(10)
            eprintf("ERROR\tFailed to run extract pathways for %s : \n" % (options.sample_name))
            eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
            if errorlogger:
                errorlogger.write("ERROR\tFailed to run extract pathways for %s : " % (options.sample_name))
                errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
            insert_error(9)
            StopPathwayTools()

    if not path.exists(options.table_out):
        ExtractPathway_WTD(options)
def main(argv, errorlogger=None):
    global parser
    (opts, args) = parser.parse_args(argv)
    if not valid_arguments(opts, args):
        print usage
        sys.exit(0)

    sample_name = opts.sample_name
    folder_path = opts.folder_path
    results = []

    try:
        STEP_NAME = "GATHER_STATS"
        # read the nucleotide sequences
        status = get_stats_from_stats_file(sample_name, folder_path, 'nuc')
        if status != None:
            results += status
        else:
            errorlogger.write("%s\tERROR\tCannot read nuc stats file\t%s"
                              % (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the amino acid sequences
        status = get_stats_from_stats_file(sample_name, folder_path, 'amino')
        if status != None:
            results += status
        else:
            errorlogger.write("%s\tERROR\tCannot read amino stats file\t%s"
                              % (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the BLAST/LAST hits
        status = get_BLAST_LAST_hits(sample_name, folder_path)
        if status != None:
            results += status
        else:
            errorlogger.write("%s\tERROR\tReading BLAST HITS\t%s"
                              % (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the selected parsed BLAST/LAST hits
        status = get_BLAST_LAST_parsed_hits(sample_name, folder_path)
        if status != None:
            results += status
        else:
            errorlogger.write("%s\tERROR\tReading parsed BLAST HITS\t%s"
                              % (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the annotated gff hits
        status = get_annotation_hits(sample_name, folder_path)
        if status != None:
            results += status

        # read the functional and taxonomic hits
        status = get_functional_taxonomic_hits(sample_name, folder_path)
        if status != None:
            results += status

        # read the number of ORFs that map to functional categories
        status = get_ORF_annotations_hits(sample_name, folder_path)
        if status != None:
            results += status

        # get the rRNA hits
        status = get_rRNA_hits(sample_name, folder_path)
        if status != None:
            results += status

        # get the tRNA hits
        status = get_tRNA_hits(sample_name, folder_path)
        if status != None:
            results += status

        stats_file_name = folder_path + PATHDELIM + 'run_statistics' + PATHDELIM \
                          + sample_name + '.run.stats.txt'
        try:
            statsfile = open(stats_file_name, 'w')
        except IOError:
            print "ERROR : Cannot open stats file " + stats_file_name
            sys.exit(0)

        for pair in results:
            fprintf(statsfile, '%s\t%s\n', pair[0], pair[1])
        statsfile.close()
    except:
        exit_process()
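# For reference, a hedged sketch of what the gather step above writes: each
# entry in results is a (metric, value) pair, emitted as plain two-column TSV.
# The metric names and numbers here are fabricated.
_example_results = [('Number of sequences BEFORE QC', '120000'),
                    ('Number of ORFs predicted', '95000')]
_stats_lines = ['%s\t%s' % (name, value) for name, value in _example_results]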
def sigint_handler(signum, frame):
    eprintf("Received TERMINATION signal\n")
    exit_process()
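# A minimal, self-contained sketch (separate from the pipeline) of how a
# handler like sigint_handler above is wired up; signal.signal and the
# SIGINT/SIGTERM constants come from the standard library.
import signal
import sys

def _demo_termination_handler(signum, frame):
    sys.stderr.write("Received TERMINATION signal\n")
    sys.exit(1)

signal.signal(signal.SIGINT, _demo_termination_handler)
signal.signal(signal.SIGTERM, _demo_termination_handler)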
def check_config_settings(config_settings, file, globalerrorlogger=None):
    essentialItems = ['METAPATHWAYS_PATH', 'EXECUTABLES_DIR', 'RESOURCES_DIR']
    missingItems = []

    for key, value in config_settings.items():
        # make sure the MetaPathways directory is present
        if key in ['METAPATHWAYS_PATH']:
            if not path.isdir(config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 1.Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("  Currently it is set to \"%s\"\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the REFDBS directory is present
        if key in ['REFDBS']:
            if not path.isdir(config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 2.Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the EXECUTABLES_DIR directory is present
        if key in ['EXECUTABLES_DIR']:
            if not path.isdir(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 3.Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the RESOURCES_DIR directory is present
        if key in ['RESOURCES_DIR']:
            if not path.isdir(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 4.Currently it is set to \"%s\"\n",
                        config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key])
                print config_settings['METAPATHWAYS_PATH'], config_settings[key]
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the required executables are present
        if key in ['PYTHON_EXECUTABLE', 'PATHOLOGIC_EXECUTABLE']:
            if not path.isfile(config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 5.Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # ignore the pgdb folder for now
        if key in ['PGDB_FOLDER']:
            continue

        # check if the desired file exists; if not, print a message
        if not path.isfile(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + value)\
           and not path.isfile(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value):
            eprintf("ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"\n", key, file)
            eprintf("ERROR: 6.Currently it is set to \"%s\"\n",
                    config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value)
            if globalerrorlogger != None:
                globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly in configuration file \"%s\"\n" % (key, file))
                globalerrorlogger.write("Currently it is set to \"%s\"\n"
                                        % (config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value))
            missingItems.append(key)
            continue

    stop_execution = False
    for item in missingItems:
        if item in essentialItems:
            eprintf("ERROR\tEssential setting %s is missing in configuration file!\n", item)
            if globalerrorlogger != None:
                globalerrorlogger.write("ERROR\tEssential setting %s is missing in configuration file!\n" % (item))
            stop_execution = True

    if stop_execution:
        eprintf("ERROR: Terminating execution due to missing essential fields in configuration file!\n")
        if globalerrorlogger != None:
            globalerrorlogger.write("ERROR\tTerminating execution due to missing essential fields in configuration file!\n")
        exit_process()
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser
    options, args = parser.parse_args(argv)

    if options.inputfolder == None:
        parser.error('ERROR\tInput folder for Pathologic not found')
    else:
        # required files to be able to build ePGDB
        files = [
            options.inputfolder + PATHDELIM + '0.pf',
            # options.inputfolder + PATHDELIM + '0.fasta',
            options.inputfolder + PATHDELIM + 'genetic-elements.dat',
            options.inputfolder + PATHDELIM + 'organism-params.dat'
        ]
        if files_exist(files, errorlogger=errorlogger):
            exit_process("ERROR\tCannot find all inputs for Pathologic in folder %s : " % (options.inputfolder))

    # is there a PathwayTools executable installed
    if not path.exists(options.ptoolsExec):
        eprintf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
        if errorlogger:
            errorlogger.printf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
        exit_process("ERROR\tPathwayTools executable %s not found!\n" % (options.ptoolsExec))

    # command to build the ePGDB
    command = "%s -patho %s" % (options.ptoolsExec, options.inputfolder)
    if options.no_taxonomic_pruning:
        command += " -no-taxonomic-pruning "
    if options.no_web_cel_overview:
        command += " -no-web-cel-overview"
    command += " -api"

    status = 0
    fix_pgdb_input_files(options.pgdbdir, pgdbs=[])
    if not path.exists(options.pgdbdir):
        status = runPathologicCommand(runcommand=command)
        fix_pgdb_input_files(options.pgdbdir, pgdbs=[])

    if status != 0:
        eprintf("ERROR\tFailed to run Pathologic on input %s :\n" % (options.inputfolder))
        eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
        if errorlogger:
            errorlogger.write("ERROR\tFailed to run Pathologic on input %s : " % (options.inputfolder))
            errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again")
            errorlogger.write(" : " + command)
        exit_process("ERROR\tFailed to run Pathologic on input %s : " % (options.inputfolder))

    if not path.exists(options.reactions_list):
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
            pythonCyc.setDebug()  # disable pathway debug statements
            printf("INFO\tExtracting the reaction list from ePGDB " + options.sample_name + "\n")
            resultLines = pythonCyc.getReactionListLines()
            # pythonCyc.stopPathwayTools()
            reaction_list_file = open(options.reactions_list + ".tmp", 'w')
            for line in resultLines:
                fprintf(reaction_list_file, "%s\n", line.strip())
            reaction_list_file.close()
            rename(options.reactions_list + ".tmp", options.reactions_list)
            StopPathwayTools()
        except:
            traceback.print_exc(10)
            eprintf("ERROR\tFailed to extract pathways for %s :\n" % (options.sample_name))
            eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
            if errorlogger:
                errorlogger.write("ERROR\tFailed to extract pathways for %s : " % (options.sample_name))
                errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
            StopPathwayTools()

    if not path.exists(options.table_out):
        ExtractPathway_WTD(options)
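# A hedged sketch of the PathoLogic command line assembled above; the
# executable path and input folder are hypothetical stand-ins for
# options.ptoolsExec and options.inputfolder.
_ptools_exec = '/opt/pathway-tools/pathway-tools'   # assumption
_ptools_input = '/output/sample1/ptools'            # assumption
_command = "%s -patho %s" % (_ptools_exec, _ptools_input)
_command += " -no-taxonomic-pruning -no-web-cel-overview -api"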
def main(argv):
    global parser
    (opts, args) = parser.parse_args()
    if valid_arguments(opts, args):
        print usage
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    eprintf("COMMAND : %s\n", sys.argv[0] + ' ' + ' '.join(argv))

    # initialize the input directory or file
    input_fp = opts.input_fp
    output_dir = path.abspath(opts.output_dir)
    verbose = opts.verbose
    print_only = opts.print_only
    sample_subset = opts.sample_subset
    run_type = opts.run_type.strip()

    '''no need to remove the whole directory'''
    # if run_type == 'overwrite':
    #    force_remove_dir = True
    # else:
    #    force_remove_dir = False

    if opts.config_file:
        config_file = opts.config_file
    else:
        config_file = cmd_folder + PATHDELIM + metapaths_config

    if opts.ncbi_header and opts.ncbi_sbt:
        if not path.exists(opts.ncbi_header):
            print "Could not open or missing NCBI header file " + opts.ncbi_header
            print "Either disable option to CREATE_SEQUIN_FILE or provide a valid header file"
            sys.exit(0)
        if not path.exists(opts.ncbi_sbt):
            print """You must have an sbt file obtained from the NCBI \"Create Submission Template\" form
                     http://www.ncbi.nlm.nih.gov/WebSub/template.cgi """ + opts.ncbi_sbt
            sys.exit(0)
        ncbi_sequin_params = path.abspath(opts.ncbi_header)
        ncbi_sequin_sbt = path.abspath(opts.ncbi_sbt)
    else:
        ncbi_sequin_params = None
        ncbi_sequin_sbt = None

    # try to load the parameter file
    try:
        parameter_f = opts.parameter_fp
    except IOError:
        raise IOError,\
            "Can't open parameters file (%s). Does it exist? Do you have read access?"\
            % opts.parameter_fp

    try:
        if run_type in ['overlay', 'safe'] and not path.exists(output_dir):
            makedirs(output_dir)
    except OSError:
        print ""
        print "ERROR: Cannot create output directory \"" + output_dir + "\"\n"+\
              "       Perhaps directory \"" + output_dir + "\" already exists.\n" +\
              "       Please choose a different directory, or \n" +\
              "       run with the option \"-r overwrite\" to force overwrite it."
        sys.exit(1)

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    command_line_params = {}
    command_line_params['verbose'] = opts.verbose

    params = parse_metapaths_parameters(parameter_f)
    format = params['INPUT']['format']

    """ load the sample inputs: it expects either a fasta file or a directory
        containing fasta and yaml file pairs """
    globalerrorlogger = WorkflowLogger(generate_log_fp(output_dir, basefile_name='global_errors_warnings'),
                                       open_mode='w')

    input_output_list = {}
    # TODO: Check for illumina paired data; this complicates things a little.
    if path.isfile(input_fp):
        """ check if it is a file """
        # TODO: Check for illumina pattern, if so check for pairs
        input_output_list = create_an_input_output_pair(input_fp, output_dir, format,
                                                        globalerrorlogger=globalerrorlogger)
    else:
        if path.exists(input_fp):
            """ check if dir exists """
            input_output_list = create_input_output_pairs(input_fp, output_dir, format,
                                                          globalerrorlogger=globalerrorlogger)
        else:
            """ must be an error """
            eprintf("ERROR\tNo valid input sample file or directory containing samples exists!\n")
            eprintf("ERROR\tCheck the arguments provided with the -in option!\n")
            exit_process("ERROR\tCheck the arguments provided with the -in option!\n")

    """ these are the subset of samples to process, if specified;
        in case of an empty subset, process all the samples """
    if sample_subset:
        remove_unspecified_samples(input_output_list, sample_subset, format,
                                   globalerrorlogger=globalerrorlogger)

    # check the config parameters
    sorted_input_output_list = sorted(input_output_list.keys())
    config_settings = read_pipeline_configuration(config_file, globalerrorlogger)

    parameter = Parameters()
    if not staticDiagnose(config_settings, params, logger=globalerrorlogger):
        eprintf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        globalerrorlogger.printf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        exit_process("ERROR\tFailed to pass the test for required scripts and inputs before run\n")

    samplesData = {}
    # PART 1: before the blast
    block_mode = opts.block_mode
    runid = opts.runid

    try:
        # load the sample information
        if len(input_output_list):
            for input_file in sorted_input_output_list:
                sample_output_dir = input_output_list[input_file]
                algorithm = get_parameter(params, 'annotation', 'algorithm', default='LAST').upper()

                s = SampleData()
                s.setInputOutput(inputFile=input_file, sample_output_dir=sample_output_dir)
                s.setParameter('algorithm', algorithm)
                s.setParameter('ncbi_params_file', ncbi_sequin_params)
                s.setParameter('ncbi_sequin_sbt', ncbi_sequin_sbt)
                s.clearJobs()

                if run_type == 'overwrite' and path.exists(sample_output_dir):
                    shutil.rmtree(sample_output_dir)
                    makedirs(sample_output_dir)
                if not path.exists(sample_output_dir):
                    makedirs(sample_output_dir)

                s.prepareToRun()
                samplesData[input_file] = s

            # run the pipeline on the loaded samples
            run_metapathways(
                samplesData,
                sample_output_dir,
                output_dir,
                globallogger=globalerrorlogger,
                command_line_params=command_line_params,
                params=params,
                metapaths_config=metapaths_config,
                status_update_callback=status_update_callback,
                config_file=config_file,
                run_type=run_type,
                config_settings=config_settings,
                block_mode=block_mode,
                runid=runid
            )
        else:
            eprintf("ERROR\tNo input files in the specified folder %s to process!\n", sQuote(input_fp))
            globalerrorlogger.printf("ERROR\tNo input files in the specified folder %s to process!\n", sQuote(input_fp))

        # blast the files
        blasting_system = get_parameter(params, 'metapaths_steps', 'BLAST_REFDB', default='yes')
        if blasting_system == 'grid':
            # blast the files on the grid
            input_files = sorted_input_output_list
            blast_in_grid(
                samplesData[input_file],
                input_files,
                path.abspath(opts.output_dir),  # important to use opts.output_dir
                params=params,
                metapaths_config=metapaths_config,
                config_file=config_file,
                run_type=run_type,
                runid=runid
            )
    except:
        globalerrorlogger.write("ERROR\t" + str(traceback.format_exc(10)))
        exit_process("ERROR:" + str(traceback.format_exc(10)))

    eprintf("            ***********                \n")
    eprintf("INFO : FINISHED PROCESSING THE SAMPLES \n")
    eprintf("             THE END                   \n")
    eprintf("            ***********                \n")
    halt_process(4)
def isWithinCutoffs(self, words, data, cutoffs, annot_map, refbitscores):
    try:
        orfid = ShortORFId(words[0])
    except:
        orfid = words[0]
    data['query'] = orfid

    try:
        data['target'] = words[1]
    except:
        data['target'] = 0

    try:
        data['q_length'] = int(words[7]) - int(words[6]) + 1
    except:
        data['q_length'] = 0

    try:
        data['bitscore'] = float(words[11])
    except:
        data['bitscore'] = 0

    try:
        data['bsr'] = float(words[11]) / refbitscores[orfid]
    except:
        data['bsr'] = 0

    try:
        data['expect'] = float(words[10])
    except:
        data['expect'] = 0

    try:
        data['aln_length'] = float(words[3])
    except:
        data['aln_length'] = 0

    try:
        data['identity'] = float(words[2])
    except:
        data['identity'] = 0

    try:
        data['product'] = annot_map[words[1]]
    except:
        eprintf("Sequence with name \"" + words[1] + "\" is not present in map file\n")
        if self.error_and_warning_logger:
            self.error_and_warning_logger.write("Sequence with name %s is not present in map file " % (words[1]))
        self.incErrorCount()
        if self.maxErrorsReached():
            if self.error_and_warning_logger:
                self.error_and_warning_logger.write("Number of sequences absent from map file %s exceeds %d" % (self.blastoutput, self.ERROR_COUNT))
            exit_process("Number of sequences absent from map file %s exceeds %d" % (self.blastoutput, self.ERROR_COUNT))
        data['product'] = 'hypothetical protein'

    try:
        m = re.search(r'(\d+[.]\d+[.]\d+[.]\d+)', data['product'])
        if m != None:
            data['ec'] = m.group(0)
        else:
            data['ec'] = ''
    except:
        data['ec'] = ''

    if cutoffs.taxonomy:
        try:
            m = re.search(r'\[([^\[]+)\]', data['product'])
            if m != None:
                data['taxonomy'] = m.group(1)
            else:
                data['taxonomy'] = ''
        except:
            data['taxonomy'] = ''

    if cutoffs.remove_taxonomy:
        try:
            data['product'] = re.sub(r'\[([^\[]+)\]', '', data['product'])
        except:
            data['product'] = ''

    if cutoffs.remove_ec:
        # strip EC annotations such as (EC:1.2.3.4), [EC:1.2.3.4],
        # [EC:1.2.3.-], [EC:1.2.-.-] and [EC:1.-.-.-]
        try:
            data['product'] = re.sub(r'\([Ee][Ce][:]\d+[.]\d+[.]\d+[.]\d+\)', '', data['product'])
            data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]\d+[.]\d+[.]\d+\]', '', data['product'])
            data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]\d+[.]\d+[.]-\]', '', data['product'])
            data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]\d+[.]-[.]-\]', '', data['product'])
            data['product'] = re.sub(r'\[[Ee][Ce][:]\d+[.]-[.]-[.]-\]', '', data['product'])
        except:
            data['product'] = ''

    if data['q_length'] < cutoffs.min_length:
        return False
    if data['bitscore'] < cutoffs.min_score:
        return False
    if data['expect'] > cutoffs.max_evalue:
        return False
    if data['identity'] < cutoffs.min_identity:
        return False
    if data['bsr'] < cutoffs.min_bsr:
        return False
    # other available cutoffs: min_length, min_score, max_evalue, min_identity,
    # limit, max_length, min_query_coverage, max_gaps, min_bsr
    return True
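# Self-contained sketch of the cutoff test applied above, using a hypothetical
# cutoffs holder and one fabricated 12-column B/LAST row; real rows come from
# the parsed BLAST/LAST output and real cutoffs from the pipeline parameters.
class _DemoCutoffs(object):
    min_length = 30
    min_score = 20.0
    max_evalue = 1e-6
    min_identity = 30.0

def _passes_cutoffs(words, cutoffs):
    q_length = int(words[7]) - int(words[6]) + 1
    return (q_length >= cutoffs.min_length and
            float(words[11]) >= cutoffs.min_score and
            float(words[10]) <= cutoffs.max_evalue and
            float(words[2]) >= cutoffs.min_identity)

_row = ['orf_1', 'tgt_1', '75.0', '100', '0', '0',
        '1', '100', '0', '0', '1e-20', '200.0']
assert _passes_cutoffs(_row, _DemoCutoffs())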
def main(argv):
    global parser
    (opts, args) = parser.parse_args()
    if valid_arguments(opts, args):
        print usage
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    eprintf("COMMAND : %s\n", sys.argv[0] + ' ' + ' '.join(argv))

    # initialize the input directory or file
    input_fp = opts.input_fp
    output_dir = path.abspath(opts.output_dir)
    verbose = opts.verbose
    print_only = opts.print_only
    sample_subset = removeSuffix(opts.sample_subset)
    run_type = opts.run_type.strip()

    '''no need to remove the whole directory'''
    # if run_type == 'overwrite':
    #    force_remove_dir = True
    # else:
    #    force_remove_dir = False

    if opts.config_file:
        config_file = opts.config_file
    else:
        config_file = cmd_folder + PATHDELIM + metapaths_config

    if opts.ncbi_header and opts.ncbi_sbt:
        if not path.exists(opts.ncbi_header):
            print "Could not open or missing NCBI header file " + opts.ncbi_header
            print "Either disable option to CREATE_SEQUIN_FILE or provide a valid header file"
            sys.exit(0)
        if not path.exists(opts.ncbi_sbt):
            print """You must have an sbt file obtained from the NCBI \"Create Submission Template\" form
                     http://www.ncbi.nlm.nih.gov/WebSub/template.cgi """ + opts.ncbi_sbt
            sys.exit(0)
        ncbi_sequin_params = path.abspath(opts.ncbi_header)
        ncbi_sequin_sbt = path.abspath(opts.ncbi_sbt)
    else:
        ncbi_sequin_params = None
        ncbi_sequin_sbt = None

    # try to load the parameter file
    try:
        if opts.parameter_fp:
            parameter_fp = opts.parameter_fp
        else:
            parameter_fp = cmd_folder + PATHDELIM + metapaths_param
    except IOError:
        raise IOError, ("Can't open parameters file (%s). Does it exist? Do you have read access?" % opts.parameter_fp)

    try:
        if run_type in ['overlay', 'safe'] and not path.exists(output_dir):
            makedirs(output_dir)
    except OSError:
        print ""
        print "ERROR: Cannot create output directory \"" + output_dir + "\"\n"+\
              "       Perhaps directory \"" + output_dir + "\" already exists.\n" +\
              "       Please choose a different directory, or \n" +\
              "       run with the option \"-r overwrite\" to force overwrite it."
        sys.exit(1)

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    command_line_params = {}
    command_line_params['verbose'] = opts.verbose

    params = parse_metapaths_parameters(parameter_fp)

    """ load the sample inputs: it expects either a fasta file or a directory
        containing fasta and yaml file pairs """
    globalerrorlogger = WorkflowLogger(generate_log_fp(output_dir, basefile_name='global_errors_warnings'),
                                       open_mode='w')

    input_output_list = {}
    if path.isfile(input_fp):
        """ check if it is a file """
        input_output_list = create_an_input_output_pair(input_fp, output_dir,
                                                        globalerrorlogger=globalerrorlogger)
    else:
        if path.exists(input_fp):
            """ check if dir exists """
            input_output_list = create_input_output_pairs(input_fp, output_dir,
                                                          globalerrorlogger=globalerrorlogger)
        else:
            """ must be an error """
            eprintf("ERROR\tNo valid input sample file or directory containing samples exists!\n")
            eprintf("ERROR\tCheck the arguments provided with the -in option!\n")
            exit_process("ERROR\tCheck the arguments provided with the -in option!\n")

    """ these are the subset of samples to process, if specified;
        in case of an empty subset, process all the samples """
    # remove all samples that are not specified, unless sample_subset is empty
    remove_unspecified_samples(input_output_list, sample_subset,
                               globalerrorlogger=globalerrorlogger)

    # check the config parameters
    sorted_input_output_list = sorted(input_output_list.keys())
    filetypes = check_file_types(sorted_input_output_list)

    # stop on invalid samples
    if not halt_on_invalid_input(input_output_list, filetypes, sample_subset):
        globalerrorlogger.printf("ERROR\tInvalid inputs found. Check for files with bad format or characters!\n")
        halt_process(opts.delay)

    # make sure the sample files are found
    report_missing_filenames(input_output_list, sample_subset, logger=globalerrorlogger)

    # check the pipeline configuration
    config_settings = read_pipeline_configuration(config_file, globalerrorlogger)

    parameter = Parameters()
    if not staticDiagnose(config_settings, params, logger=globalerrorlogger):
        eprintf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        globalerrorlogger.printf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        halt_process(opts.delay)

    samplesData = {}
    # PART 1: before the blast
    block_mode = opts.block_mode
    runid = opts.runid

    try:
        # load the sample information
        print "RUNNING MetaPathways version 2.5.2"
        if len(input_output_list):
            for input_file in sorted_input_output_list:
                sample_output_dir = input_output_list[input_file]
                algorithm = get_parameter(params, 'annotation', 'algorithm', default='LAST').upper()

                s = SampleData()
                s.setInputOutput(inputFile=input_file, sample_output_dir=sample_output_dir)
                s.setParameter('algorithm', algorithm)
                s.setParameter('ncbi_params_file', ncbi_sequin_params)
                s.setParameter('ncbi_sequin_sbt', ncbi_sequin_sbt)
                s.setParameter('FILE_TYPE', filetypes[input_file][0])
                if params["INPUT"]['format'] in ["gbk-annotated", "gff-annotated"]:
                    s.setParameter('ANNOTATED', True)
                else:
                    s.setParameter('ANNOTATED', False)
                s.setParameter('SEQ_TYPE', filetypes[input_file][1])
                s.clearJobs()

                if run_type == 'overwrite' and path.exists(sample_output_dir):
                    shutil.rmtree(sample_output_dir)
                    makedirs(sample_output_dir)
                if not path.exists(sample_output_dir):
                    makedirs(sample_output_dir)

                s.prepareToRun()
                samplesData[input_file] = s

            # run the pipeline on the loaded samples
            run_metapathways(samplesData,
                             sample_output_dir,
                             output_dir,
                             globallogger=globalerrorlogger,
                             command_line_params=command_line_params,
                             params=params,
                             metapaths_config=metapaths_config,
                             status_update_callback=status_update_callback,
                             config_file=config_file,
                             run_type=run_type,
                             config_settings=config_settings,
                             block_mode=block_mode,
                             runid=runid)
        else:
            eprintf("ERROR\tNo valid input files, or no files specified to process, in folder %s!\n", sQuote(input_fp))
            globalerrorlogger.printf("ERROR\tNo valid input files to process in folder %s!\n", sQuote(input_fp))

        # blast the files
        blasting_system = get_parameter(params, 'metapaths_steps', 'BLAST_REFDB', default='yes')
        if blasting_system == 'grid':
            # blast the files on the grid
            input_files = sorted_input_output_list
            blast_in_grid(
                samplesData[input_file],
                input_files,
                path.abspath(opts.output_dir),  # important to use opts.output_dir
                params=params,
                metapaths_config=metapaths_config,
                config_file=config_file,
                run_type=run_type,
                runid=runid)
    except:
        exit_process(str(traceback.format_exc(10)), logger=globalerrorlogger)

    eprintf("            ***********                \n")
    eprintf("INFO : FINISHED PROCESSING THE SAMPLES \n")
    eprintf("             THE END                   \n")
    eprintf("            ***********                \n")
    halt_process(opts.delay)
def main(argv):
    global parser
    (opts, args) = parser.parse_args()
    if valid_arguments(opts, args):
        print usage
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    eprintf("%-10s:%s\n" % ('COMMAND', sys.argv[0] + ' ' + ' '.join(argv)))

    # initialize the input directory or file
    input_fp = opts.input_fp
    output_dir = path.abspath(opts.output_dir)
    verbose = opts.verbose
    print_only = opts.print_only
    sample_subset = removeSuffix(opts.sample_subset)
    run_type = opts.run_type.strip()

    '''no need to remove the whole directory'''
    # if run_type == 'overwrite':
    #    force_remove_dir = True
    # else:
    #    force_remove_dir = False

    if opts.config_file:
        config_file = opts.config_file
    else:
        config_file = cmd_folder + PATHDELIM + metapaths_config

    # try to load the parameter file
    try:
        if opts.parameter_fp:
            parameter_fp = opts.parameter_fp
        else:
            parameter_fp = cmd_folder + PATHDELIM + metapaths_param
    except IOError:
        raise IOError, ("Can't open parameters file (%s). Does it exist? Do you have read access?" % opts.parameter_fp)

    try:
        if run_type in ['overlay', 'safe'] and not path.exists(output_dir):
            makedirs(output_dir)
    except OSError:
        print ""
        print "ERROR: Cannot create output directory \"" + output_dir + "\"\n"+\
              "       Perhaps directory \"" + output_dir + "\" already exists.\n" +\
              "       Please choose a different directory, or \n" +\
              "       run with the option \"-r overwrite\" to force overwrite it."
        sys.exit(2)

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    command_line_params = {}
    command_line_params['verbose'] = opts.verbose

    if not path.exists(parameter_fp):
        eprintf("%-10s: No parameters file %s found!\n" % ('WARNING', parameter_fp))
        eprintf("%-10s: Creating a parameters file %s!\n" % ('INFO', parameter_fp))
        create_metapaths_parameters(parameter_fp, cmd_folder)

    params = parse_metapaths_parameters(parameter_fp)

    """ load the sample inputs: it expects either a fasta file or a directory
        containing fasta and yaml file pairs """
    globalerrorlogger = WorkflowLogger(generate_log_fp(output_dir, basefile_name='global_errors_warnings'),
                                       open_mode='w')

    input_output_list = {}
    if path.isfile(input_fp):
        """ check if it is a file """
        input_output_list = create_an_input_output_pair(input_fp, output_dir,
                                                        globalerrorlogger=globalerrorlogger)
    else:
        if path.exists(input_fp):
            """ check if dir exists """
            input_output_list = create_input_output_pairs(input_fp, output_dir,
                                                          globalerrorlogger=globalerrorlogger)
        else:
            """ must be an error """
            eprintf("ERROR\tNo valid input sample file or directory containing samples exists!\n")
            eprintf("ERROR\tCheck the arguments provided with the -in option!\n")
            exit_process("ERROR\tCheck the arguments provided with the -in option!\n")

    """ these are the subset of samples to process, if specified;
        in case of an empty subset, process all the samples """
    # remove all samples that are not specified, unless sample_subset is empty
    remove_unspecified_samples(input_output_list, sample_subset,
                               globalerrorlogger=globalerrorlogger)

    # check the config parameters
    sorted_input_output_list = sorted(input_output_list.keys())
    filetypes = check_file_types(sorted_input_output_list)

    # stop on invalid samples
    if not halt_on_invalid_input(input_output_list, filetypes, sample_subset):
        globalerrorlogger.printf("ERROR\tInvalid inputs found. Check for files with bad format or characters!\n")
        halt_process(opts.delay)

    # make sure the sample files are found
    report_missing_filenames(input_output_list, sample_subset, logger=globalerrorlogger)

    # check the pipeline configuration
    if not path.exists(config_file):
        eprintf("%-10s: No config file %s found!\n" % ('WARNING', config_file))
        eprintf("%-10s: Creating a config file %s!\n" % ('INFO', config_file))
        if not environment_variables_defined():
            sys.exit(0)
        create_metapaths_configuration(config_file, cmd_folder)

    config_settings = read_pipeline_configuration(config_file, globalerrorlogger)

    parameter = Parameters()
    if not staticDiagnose(config_settings, params, logger=globalerrorlogger):
        eprintf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        globalerrorlogger.printf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        return

    samplesData = {}
    # PART 1: before the blast
    block_mode = opts.block_mode
    runid = opts.runid

    try:
        # load the sample information
        print "RUNNING MetaPathways version FogDog 3.0"
        if len(input_output_list):
            for input_file in sorted_input_output_list:
                sample_output_dir = input_output_list[input_file]
                algorithm = get_parameter(params, 'annotation', 'algorithm', default='LAST').upper()

                s = SampleData()
                s.setInputOutput(inputFile=input_file, sample_output_dir=sample_output_dir)
                s.setParameter('algorithm', algorithm)
                s.setParameter('FILE_TYPE', filetypes[input_file][0])
                s.setParameter('SEQ_TYPE', filetypes[input_file][1])
                s.clearJobs()

                if run_type == 'overwrite' and path.exists(sample_output_dir):
                    shutil.rmtree(sample_output_dir)
                    makedirs(sample_output_dir)
                if not path.exists(sample_output_dir):
                    makedirs(sample_output_dir)

                s.prepareToRun()
                samplesData[input_file] = s

            # run the pipeline on the loaded samples
            run_metapathways(
                samplesData,
                sample_output_dir,
                output_dir,
                globallogger=globalerrorlogger,
                command_line_params=command_line_params,
                params=params,
                metapaths_config=metapaths_config,
                status_update_callback=status_update_callback,
                config_file=config_file,
                run_type=run_type,
                config_settings=config_settings,
                block_mode=block_mode,
                runid=runid
            )
        else:
            eprintf("ERROR\tNo valid input files, or no files specified to process, in folder %s!\n", sQuote(input_fp))
            globalerrorlogger.printf("ERROR\tNo valid input files to process in folder %s!\n", sQuote(input_fp))
    except:
        exit_process(str(traceback.format_exc(10)), logger=globalerrorlogger)

    eprintf("            ***********                \n")
    eprintf("INFO : FINISHED PROCESSING THE SAMPLES \n")
    eprintf("             THE END                   \n")
    eprintf("            ***********                \n")
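# A small, self-contained sketch (with hypothetical paths) of the run_type
# semantics used in the per-sample loop above: 'overwrite' wipes and recreates
# the sample output directory, while other modes reuse it if present.
import os
import shutil

def _prepare_output_dir(sample_output_dir, run_type):
    if run_type == 'overwrite' and os.path.exists(sample_output_dir):
        shutil.rmtree(sample_output_dir)
        os.makedirs(sample_output_dir)
    if not os.path.exists(sample_output_dir):
        os.makedirs(sample_output_dir)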
def check_config_settings(config_settings, file, globalerrorlogger=None):
    essentialItems = ['METAPATHWAYS_PATH', 'EXECUTABLES_DIR', 'RESOURCES_DIR']
    missingItems = []

    for key, value in config_settings.items():
        # these are not files or executables
        if key in ['NUM_CPUS', 'FORMATTED_DB_SIZE']:
            continue
        if key in ['FORMATDB_EXECUTABLE', 'BLASTP_EXECUTABLE', 'BLASTN_EXECUTABLE'] and value == '':
            continue

        # make sure the MetaPathways directory is present
        if key in ['METAPATHWAYS_PATH']:
            if not path.isdir(config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 1.Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("  Currently it is set to \"%s\". Please correct it and try again.\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the REFDBS directory is present
        if key in ['REFDBS']:
            if not path.isdir(config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 2.Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\". Please correct it and try again.\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the EXECUTABLES_DIR directory is present
        if key in ['EXECUTABLES_DIR']:
            if not path.isdir(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 3.Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\". Please correct the path.\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the accession-to-taxon map is present
        if key in ['ACCESSION_TO_TAXONID']:
            if not path.isfile(config_settings['REFDBS'] + PATHDELIM + 'ncbi_tree' + PATHDELIM + config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 7.Currently it is set to \"%s\"\n",
                        config_settings['REFDBS'] + PATHDELIM + 'ncbi_tree' + PATHDELIM + config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\". Please correct the path to compute LCA with accession id translation.\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the RESOURCES_DIR directory is present
        if key in ['RESOURCES_DIR']:
            if not path.isdir(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 4.Currently it is set to \"%s\"\n",
                        config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key])
                print config_settings['METAPATHWAYS_PATH'], config_settings[key]
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # make sure the required executables are present
        if key in ['PYTHON_EXECUTABLE', 'PATHOLOGIC_EXECUTABLE']:
            if not path.isfile(config_settings[key]):
                eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
                eprintf("ERROR: 5.Currently it is set to \"%s\"\n", config_settings[key])
                if globalerrorlogger != None:
                    globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                    globalerrorlogger.write("Currently it is set to \"%s\"\n" % (config_settings[key]))
                missingItems.append(key)
            continue

        # ignore the pgdb folder for now
        if key in ['PGDB_FOLDER']:
            continue

        # check if the desired file exists; if not, print a message
        if not path.isfile(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + value)\
           and not path.isfile(config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value):
            eprintf("ERROR: Path for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n", key, file)
            eprintf("ERROR: 6.Currently it is set to \"%s\"\n",
                    config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value)
            if globalerrorlogger != None:
                globalerrorlogger.write("ERROR\tPath for \"%s\" is NOT set properly (or missing) in configuration file \"%s\"\n" % (key, file))
                globalerrorlogger.write("Currently it is set to \"%s\"\n"
                                        % (config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings['EXECUTABLES_DIR'] + PATHDELIM + value))
            missingItems.append(key)
            continue

    stop_execution = False
    for item in missingItems:
        if item in essentialItems:
            eprintf("ERROR\tEssential setting %s is missing in configuration file!\n", item)
            if globalerrorlogger != None:
                globalerrorlogger.write("ERROR\tEssential setting %s is missing in configuration file!\n" % (item))
            stop_execution = True

    if stop_execution:
        eprintf("ERROR: Terminating execution due to missing essential fields in configuration file!\n")
        if globalerrorlogger != None:
            globalerrorlogger.write("ERROR\tTerminating execution due to missing essential fields in configuration file!\n")
        exit_process()
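# Illustrative only: a hypothetical config_settings dictionary of the shape
# check_config_settings() expects.  The paths are fabricated; on a real
# install they come from the pipeline configuration file.
_example_config_settings = {
    'METAPATHWAYS_PATH': '/opt/MetaPathways',   # must be a directory
    'EXECUTABLES_DIR':   'executables',         # resolved against METAPATHWAYS_PATH
    'RESOURCES_DIR':     'resources',           # resolved against METAPATHWAYS_PATH
    'REFDBS':            '/data/refdbs',        # must be a directory
    'PYTHON_EXECUTABLE': '/usr/bin/python',     # must be a file
    'PGDB_FOLDER':       'ptools-local/pgdbs',  # currently skipped by the check
}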
def read_pipeline_configuration(file, globallogger):
    patternKEYVALUE = re.compile(r'^([^\t\s]+)[\t\s]+\'(.*)\'')
    try:
        configfile = open(file, 'r')
    except IOError:
        eprintf("ERROR: Did not find pipeline config %s!\n", file)
        if globallogger != None:
            globallogger.write("ERROR\tDid not find pipeline config %s!\n" % (file))
        exit_process()
    else:
        lines = configfile.readlines()
        configfile.close()

    config_settings = {}
    for line in lines:
        if not re.match("#", line) and len(line.strip()) > 0:
            line = line.strip()
            result = patternKEYVALUE.search(line)
            try:
                if len(result.groups()) == 2:
                    fields = result.groups()
                else:
                    eprintf("    The following line in your config settings file is not set up yet\n")
                    eprintf("    Please rerun the pipeline after setting up this line\n")
                    eprintf("    Error in line : %s\n", line)
                    if globallogger != None:
                        globallogger.write(
                            "WARNING\t\n"+\
                            "     The following line in your config settings file is not set up yet\n"+\
                            "     Please rerun the pipeline after setting up this line\n"+\
                            "     Error in line : %s\n" % (line))
                    exit_process()
            except:
                eprintf("    The following line in your config settings file is not set up yet\n")
                eprintf("    Please rerun the pipeline after setting up this line\n")
                eprintf("    Error in line : %s\n", line)
                if globallogger != None:
                    globallogger.write(
                        "WARNING\t\n"+\
                        "     The following line in your config settings file is not set up yet\n"+\
                        "     Please rerun the pipeline after setting up this line\n"+\
                        "     Error in line : %s\n" % (line))
                exit_process()

            # normalize path separators for the current platform
            if PATHDELIM == '\\':
                config_settings[fields[0]] = re.sub(r'/', r'\\', fields[1])
            else:
                config_settings[fields[0]] = re.sub(r'\\', '/', fields[1])

    config_settings['METAPATHWAYS_PATH'] = config_settings['METAPATHWAYS_PATH'] + PATHDELIM
    config_settings['REFDBS'] = config_settings['REFDBS'] + PATHDELIM

    check_config_settings(config_settings, file, globallogger)
    config_settings['configuration_file'] = file

    return config_settings
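# The parser above expects lines of the form KEY <whitespace> 'value', with
# comment lines starting with '#'.  A self-contained sketch of that format,
# parsed with the same regular expression; the keys and paths are fabricated.
import re

_sample_config = ("METAPATHWAYS_PATH   '/opt/MetaPathways'\n"
                  "# comment lines are skipped\n"
                  "REFDBS              '/data/refdbs'\n")
_patternKEYVALUE = re.compile(r"^([^\t\s]+)[\t\s]+\'(.*)\'")
_settings = {}
for _line in _sample_config.splitlines():
    _m = _patternKEYVALUE.search(_line.strip())
    if _m:
        _key, _value = _m.groups()
        _settings[_key] = _value
assert _settings['REFDBS'] == '/data/refdbs'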
def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks, verbose=False, errorlogger=None):
    linecount = 0
    readerhandles = []

    if verbose:
        eprintf("Processing database : %s\n", dbname)

    if len(filenames) == 0:
        eprintf("WARNING : Cannot find any B/LAST output file for database : %s\n", dbname)
        exit_process()

    try:
        for i in range(len(filenames)):
            readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]))
    except OSError:
        eprintf("ERROR: Cannot read sequence file : %s\n", filenames[i])
        exit_process()

    # set error and warning parameters
    for readerhandle in readerhandles:
        readerhandle.setMaxErrorsLimit(5)
        readerhandle.setErrorAndWarningLogger(errorlogger)
        readerhandle.setSTEP_NAME('PARSE BLAST')

    try:
        outputfile = open(outputfilename, 'w')
        fieldmapHeaderLine = readerhandles[0].getHeaderLine()
        fprintf(outputfile, "%s\n", fieldmapHeaderLine)
    except IOError:
        eprintf("ERROR: Cannot create output file : %s\n", outputfilename)
        exit_process()

    # prime the heap with the first line from every input
    values = []
    for i in range(len(filenames)):
        iterate = iter(readerhandles[i])
        try:
            next(iterate)
            line = readerhandles[i].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values.append((i, orfRanks[shortORFId], line))
        except:
            outputfile.close()
            return

    # k-way merge: repeatedly write the smallest-ranked line, then refill
    # from the same reader until every input is exhausted
    S = len(filenames)
    BuildHeap(S, values)
    while S > 0:
        try:
            iterate = iter(readerhandles[values[0][0]])
            line = readerhandles[values[0][0]].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            fprintf(outputfile, "%s\n", line)

            next(iterate)
            line = readerhandles[values[0][0]].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values[0] = (values[0][0], orfRanks[shortORFId], line)
        except:
            # this reader is exhausted; shrink the heap
            values[0] = values[S - 1]
            S = S - 1
        if S > 0:
            Heapify(values, 0, S)

    outputfile.close()
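# merge_sorted_parsed_files() above performs a k-way merge with a hand-rolled
# heap (BuildHeap/Heapify).  A minimal sketch of the same idea using the
# standard library's heapq.merge, merging already-sorted (rank, line) streams;
# the input streams are fabricated.
import heapq

def _kway_merge(sorted_streams):
    for rank, line in heapq.merge(*sorted_streams):
        yield line

_merged = list(_kway_merge([iter([(1, 'a'), (3, 'c')]), iter([(2, 'b')])]))
assert _merged == ['a', 'b', 'c']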
def checkMetapathsteps(params, runlogger=None):
    choices = {'metapaths_steps': {}, 'annotation': {}, 'INPUT': {}}

    choices['INPUT']['format'] = ['fasta', 'gbk_unannotated', 'gbk_annotated',
                                  'gff_unannotated', 'gff_annotated']
    choices['annotation']['algorithm'] = ['last', 'blast']

    # every pipeline step accepts yes/skip/stop/redo; BLAST_REFDB also accepts 'grid'
    step_choices = ['yes', 'skip', 'stop', 'redo']
    for step in ['PREPROCESS_FASTA', 'ORF_PREDICTION', 'GFF_TO_AMINO',
                 'FILTERED_FASTA', 'COMPUTE_REFSCORE', 'PARSE_BLAST',
                 'SCAN_rRNA', 'STATS_rRNA', 'ANNOTATE', 'PATHOLOGIC_INPUT',
                 'GENBANK_FILE', 'CREATE_SEQUIN_FILE', 'CREATE_REPORT_FILES',
                 'SCAN_tRNA', 'MLTREEMAP_CALCULATION', 'MLTREEMAP_IMAGEMAKER',
                 'PATHOLOGIC']:
        choices['metapaths_steps'][step] = list(step_choices)
    choices['metapaths_steps']['BLAST_REFDB'] = ['yes', 'skip', 'stop', 'redo', 'grid']

    if params['metapaths_steps']:
        checkParam_values(choices, params['metapaths_steps'], runlogger)

    checkparams = {}
    checkparams['annotation'] = []
    checkparams['annotation'].append('dbs')

    if not checkMissingParam_values(params, checkparams, runlogger):
        exit_process("Missing parameters")
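# Hypothetical params block that would pass checkMetapathsteps(): every step
# takes one of 'yes'/'skip'/'stop'/'redo' (BLAST_REFDB also allows 'grid'),
# and annotation must name at least one reference database ('dbs').
_example_params = {
    'INPUT': {'format': 'fasta'},
    'annotation': {'algorithm': 'last', 'dbs': 'COG,refseq'},
    'metapaths_steps': {'PREPROCESS_FASTA': 'yes', 'ORF_PREDICTION': 'yes',
                        'BLAST_REFDB': 'grid', 'ANNOTATE': 'redo'},
}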
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser
    options, args = parser.parse_args(argv)

    if not len(options.blast_files):
        parser.error('At least one taxonomic BLAST output is required')

    if runBlastCommandrRNA(runcommand=runcommand) != 0:
        if errorlogger:
            errorlogger.write("ERROR: Failed to BLAST the sequences against database %s : " % (options.tax_databases[0]))
            errorlogger.write(" : " + runcommand)
        exit_process("ERROR: Failed to BLAST the sequences against database %s : " % (options.tax_databases[0]) +
                     " : " + runcommand)

    if not (len(options.tax_databases) == len(options.blast_files)):
        parser.error('Number of taxonomic databases and BLAST outputs should be the same')

    if not options.output:
        parser.error('Output file must be specified')

    # sanity check: all inputs must exist
    if not files_exist(options.blast_files):
        sys.exit(0)
    if not files_exist(options.tax_databases):
        sys.exit(0)

    params = {
        'length': int(options.length),
        'similarity': float(options.similarity),
        'evalue': float(options.evalue),
        'bitscore': float(options.bitscore)
    }

    # parse one BLAST output table per taxonomic database
    table = {}
    for x in range(0, len(options.blast_files)):
        table[options.tax_databases[x]] = {}
        process_blastout_file(options.blast_files[x], options.tax_databases[x],
                              table[options.tax_databases[x]], errorlogger=errorlogger)

    priority = 7000
    reads = {}
    for x in range(0, len(options.blast_files)):
        append_taxonomic_information(options.tax_databases[x],
                                     table[options.tax_databases[x]], params)
        for key in table[options.tax_databases[x]]:
            if len(table[options.tax_databases[x]][key][6]) > 1:
                reads[key] = True
        dbname = re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x])
        runstatslogger.write("%s\tTaxonomic hits in %s\t%s\n" % (str(priority), dbname, str(len(reads))))
        priority += 1

    outputfile = open(options.output, 'w')
    fprintf(outputfile, "#Similarity cutoff :\t" + str(params['similarity']) + '\n')
    fprintf(outputfile, "#Length cutoff :\t" + str(params['length']) + '\n')
    fprintf(outputfile, "#Evalue cutoff :\t" + str(params['evalue']) + '\n')
    fprintf(outputfile, "#Bit score cutoff :\t" + str(params['bitscore']) + '\n')
    fprintf(outputfile, "#Number of rRNA sequences detected:\t" + str(len(reads)) + '\n\n')

    # one column group per taxonomic database
    for x in range(0, len(options.tax_databases)):
        fprintf(outputfile, '\t%s\t\t\t', re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x]))
    fprintf(outputfile, '\n')

    for x in range(0, len(options.blast_files)):
        fprintf(outputfile, '%s\t%s\t%s\t%s\t%s\t%s\t%s',
                'sequence', 'start', 'end', 'similarity', 'evalue', 'bitscore', 'taxonomy')
    fprintf(outputfile, '\n')

    for read in reads:
        fprintf(outputfile, '%s', read)
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
                fprintf(outputfile, '\t%s\t%s\t%s\t%s\t%s\t%s',
                        str(table[options.tax_databases[x]][read][4]),
                        str(table[options.tax_databases[x]][read][5]),
                        str(table[options.tax_databases[x]][read][0]),
                        str(table[options.tax_databases[x]][read][1]),
                        str(table[options.tax_databases[x]][read][2]),
                        str(table[options.tax_databases[x]][read][6]))
            else:
                fprintf(outputfile, '\t-\t-\t-\t-\t-\t-')
        fprintf(outputfile, '\n')
    outputfile.close()

    # collect the exact reads
    database_hits = {}
    for read in reads:
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
                database_hits[read] = [table[options.tax_databases[x]][read][4],
                                       table[options.tax_databases[x]][read][5]]

    # pick the hits, trim them according to the match, and write them out
    if options.fasta:
        selected_sequences = {}
        read_select_fasta_sequences(database_hits, selected_sequences, options.fasta)
        for read in database_hits:
            selected_sequences[read] = selected_sequences[read][database_hits[read][0]:database_hits[read][1]]
        write_selected_sequences(selected_sequences, options.output + '.fasta')
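# For reference, a hedged sketch of the per-read rows kept in table[database]
# above (layout as filled by process_blastout_file): identity, e-value,
# bitscore, target id, start, end, with the taxonomy string appended later at
# index 6 by append_taxonomic_information().  The values are fabricated.
_example_table_row = {'read_1': [98.5, 1e-30, 250.0, 'AB123456.1', 1, 1450]}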