def format_db(formatdb_executable, seqType, raw_sequence_file, formatted_db,
              algorithm):
    """Format a raw sequence file into a BLAST or LAST database.

    The formatter writes its files under ``formatted_db + "__temp__"``;
    afterwards every file matching that prefix is renamed with the
    ``__temp__`` marker removed.

    Args:
        formatdb_executable: Path of the formatter program to run.
        seqType: Value passed to ``-dbtype`` (BLAST only).
        raw_sequence_file: Input sequence file.
        formatted_db: Final database name/prefix.
        algorithm: ``'BLAST'`` or ``'LAST'``.

    Returns:
        True when the formatter exits with status 0 and all renames
        succeed; False otherwise, including for an unsupported algorithm.
    """
    _temp_formatted_db = formatted_db + "__temp__"

    # Both commands request a 4GB volume size.
    if algorithm == 'BLAST':
        cmd = '%s -dbtype %s --max_file_sz 4294967296 -in %s -out %s' % (
            formatdb_executable, seqType, raw_sequence_file,
            _temp_formatted_db)
    elif algorithm == 'LAST':
        cmd = '%s -s 4G -p -c %s %s' % (
            formatdb_executable, _temp_formatted_db, raw_sequence_file)
    else:
        # Without this branch `cmd` stayed unbound and the call below
        # crashed with a NameError.
        return False

    result = getstatusoutput(cmd)

    try:
        for temp_file in glob(_temp_formatted_db + '*'):
            rename(temp_file, re.sub('__temp__', '', temp_file))
    except OSError:
        return False

    return result[0] == 0
def runMicrobeCensus(microbeCensusExec, microbeCensusOutput, sample_name,
                     readFiles, rpkmFolder):
    """Run MicrobeCensus on exactly two groups of read files.

    Output is written to ``microbeCensusOutput + ".tmp"`` and renamed to
    ``microbeCensusOutput`` only when the command exits with status 0.

    Args:
        microbeCensusExec: Program to run.
        microbeCensusOutput: Final output file name.
        sample_name: Unused; kept for interface compatibility.
        readFiles: Sequence of two groups, each a sequence of file names.
        rpkmFolder: Unused; kept for interface compatibility.

    Returns:
        True on success, False when the number of groups is not 2 or the
        command fails.
    """
    # Each group is flattened to one comma-separated string.
    readfiles = [','.join(read) for read in readFiles]

    if len(readFiles) != 2:
        # Report the flattened groups: joining the nested `readFiles`
        # directly raises a TypeError.
        eprintf(
            "ERROR:\tThe number of read files for MicrobeCensus must be exactly 2. Found %d:%s\n",
            len(readFiles), ','.join(readfiles))
        return False

    tmp_output = microbeCensusOutput + ".tmp"
    command = ' '.join([microbeCensusExec, ','.join(readfiles), tmp_output])
    result = getstatusoutput(command)
    print(command)

    if result[0] != 0:
        eprintf(
            "ERROR:\tError while running MicrobeCensus on read files %s\n",
            readFiles)
        return False

    rename(tmp_output, microbeCensusOutput)
    return True
def _execute_prodigal(options):
    """Build the prodigal command line from ``options`` and run it.

    Only options with a truthy value are added to the command.

    Returns:
        The exit status reported by ``getstatusoutput``.
    """
    command = []
    if options.prod_exec:
        command.append(options.prod_exec)
    if options.prod_m:
        command.append("-m")

    valued_flags = (
        ("-p", options.prod_p),
        ("-f", options.prod_f),
        ("-g", options.prod_g),
        ("-i", options.prod_input),
        ("-o", options.prod_output),
    )
    for flag, value in valued_flags:
        if value:
            command.extend([flag, value])

    return getstatusoutput(' '.join(command))[0]
def _execute_tRNA_Scan(options):
    """Run the tRNA scan program described by ``options``.

    On a non-zero exit status the module-level ``errorcode`` is passed to
    ``insert_error``.

    Returns:
        The ``(status, output)`` tuple from ``getstatusoutput``.
    """
    global errorcode

    command = []
    if options.trna_executable:
        command.append(options.trna_executable)

    for flag, value in (("-i", options.trna_i), ("-o", options.trna_o),
                        ("-D", options.trna_D), ("-T", options.trna_T),
                        ("-F", options.trna_F)):
        if value:
            command.extend([flag, value])

    outcome = getstatusoutput(' '.join(command))
    if outcome[0] != 0:
        insert_error(errorcode)
    return outcome
def main(argv, errorlogger=None, runstatslogger=None):
    """Transform the gene annotations and produce a sorted barcode table.

    Parses ``argv`` with the module-level ``parser``, calls
    ``transform_refgene`` and ``reform_barcode_fastq``, then sorts the
    barcode file on its first column with the external ``sort`` program.

    Args:
        argv: Command-line arguments to parse.
        errorlogger: Unused here.
        runstatslogger: Unused here.

    Returns:
        ``(1, message)`` when the barcodes cannot be sorted; otherwise
        None.
    """
    global parser
    (opts, args) = parser.parse_args(argv)
    if not valid_arguments(opts, args):
        print(usage)
        sys.exit(0)

    gene_annotations = opts.gene_annotations
    barcode_reads = opts.barcode_reads
    output_prefix = opts.output_folder + PATHDELIM + opts.sample_name

    transform_refgene(gene_annotations, 400, output_prefix)

    barcodes_txt = output_prefix + ".barcodes.txt"
    reform_barcode_fastq(barcode_reads, barcodes_txt, 12, 8)

    barcodes_sorted_txt = output_prefix + ".barcodes.sorted.txt"
    cmd_barcodes_sort = 'sort -k 1,1 %s > %s' % (barcodes_txt,
                                                 barcodes_sorted_txt)
    try:
        result = getstatusoutput(cmd_barcodes_sort)
    except Exception:
        return (1, "Cannot sort barcodes successfully")

    # getstatusoutput reports failure through the status, not by raising,
    # so a failed sort must be detected here.
    if result[0] != 0:
        return (1, "Cannot sort barcodes successfully")
def runUsingBWA(bwaExec, sample_name, indexFile, readFiles, bwaFolder):
    """Align one or two read files with ``bwa mem``.

    The SAM output goes to ``bwaFolder + PATHDELIM + sample_name + '.sam'``.
    Roughly 80% of the available CPUs (at least one) are used as threads.

    Args:
        bwaExec: bwa program to run.
        sample_name: Sample name used for the output file.
        indexFile: Index passed to ``bwa mem``.
        readFiles: One read file (run with ``-p``) or two read files.
        bwaFolder: Output folder.

    Returns:
        True when bwa exits with status 0; False otherwise, including when
        ``readFiles`` does not hold one or two entries.
    """
    # An empty list previously fell through both branches and crashed on
    # an unbound `cmd`.
    if len(readFiles) not in (1, 2):
        return False

    num_threads = max(1, int(multiprocessing.cpu_count() * 0.8))
    bwaOutput = bwaFolder + PATHDELIM + sample_name + '.sam'

    if len(readFiles) == 2:
        cmd = "%s mem -t %d -o %s %s %s %s" % (
            bwaExec, num_threads, bwaOutput, indexFile, readFiles[0],
            readFiles[1])
    else:
        cmd = "%s mem -t %d -p -o %s %s %s " % (
            bwaExec, num_threads, bwaOutput, indexFile, readFiles[0])

    return getstatusoutput(cmd)[0] == 0
def _execute_BLAST(options, logger=None):
    """Run BLAST, publishing the hits file only when the run succeeds.

    BLAST writes to ``options.blast_out + ".tmp"``; the file is renamed to
    ``options.blast_out`` only after a zero exit status, so a failed run
    no longer leaves partial output under the final name.

    Args:
        options: Object carrying the ``blast_*`` settings.
        logger: Unused here.

    Returns:
        ``(status, output)`` of the command, or
        ``(1, "Cannot execute BLAST successfully")`` when the command
        cannot be launched or the output cannot be moved into place.
    """
    args = []
    if options.blast_executable:
        args.append(options.blast_executable)

    valued_flags = (
        ("-max_target_seqs", options.blast_max_target_seqs),
        ("-num_threads", options.blast_num_threads),
        ("-outfmt", options.blast_outfmt),
        ("-db", options.blast_db),
        ("-query", options.blast_query),
        ("-evalue", options.blast_evalue),
    )
    for flag, value in valued_flags:
        if value:
            # str() lets numeric option values through the join below.
            args += [flag, str(value)]

    if options.blast_out:
        args += ["-out", options.blast_out + ".tmp"]

    try:
        status, output = getstatusoutput(' '.join(args))
    except Exception:
        return (1, "Cannot execute BLAST successfully")

    if status != 0:
        # Keep the real status and output instead of masking them.
        return (status, output)

    if options.blast_out:
        try:
            rename(options.blast_out + ".tmp", options.blast_out)
        except OSError:
            return (1, "Cannot execute BLAST successfully")

    return (status, output)
def _execute_LAST(options, logger=None):
    """Run LAST, publishing the output file only when the run succeeds.

    LAST writes to ``options.last_o + ".tmp"``; the file is renamed to
    ``options.last_o`` only after a zero exit status.

    Args:
        options: Object carrying the ``last_*`` settings.
        logger: Optional object with a ``printf(fmt, *args)`` method used
            to record failures.

    Returns:
        ``(status, output)`` on success; ``(non-zero status, message)`` on
        failure.
    """
    default_message = "Could not run LASTAL correctly"

    def _fail(status, message):
        if logger:
            logger.printf("ERROR\t%s\n", message)
        return (status, message)

    args = []
    if options.last_executable:
        args.append(options.last_executable)
    if options.last_f:
        args += ["-f", str(options.last_f)]
    if options.last_o:
        args += ["-o", options.last_o + ".tmp"]
    if options.last_db:
        args += [options.last_db]
    if options.last_query:
        args += [options.last_query]

    try:
        status, output = getstatusoutput(' '.join(args))
    except Exception:
        # The old handler read `result` here even though it was unbound.
        return _fail(1, default_message)

    if status != 0:
        return _fail(status, output or default_message)

    if options.last_o:
        try:
            rename(options.last_o + ".tmp", options.last_o)
        except OSError:
            return _fail(1, output or default_message)

    return (status, output)
def _execute_BLAST(options, logger=None):
    """Run BLAST and move its ``.tmp`` output into place on success.

    The thread count comes from ``options.num_threads``. BLAST writes to
    ``options.blast_out + ".tmp"``, which is renamed to
    ``options.blast_out`` only after a zero exit status.

    Args:
        options: Object carrying the BLAST settings.
        logger: Unused here.

    Returns:
        ``(status, output)`` of the command, or
        ``(1, "Cannot execute BLAST successfully")`` when the command
        cannot be launched or the rename fails.
    """
    failure = (1, "Cannot execute BLAST successfully")

    args = []
    if options.blast_executable:
        args.append(options.blast_executable)
    if options.blast_max_target_seqs:
        args += ["-max_target_seqs", str(options.blast_max_target_seqs)]
    if options.num_threads:
        args += ["-num_threads", str(options.num_threads)]
    if options.blast_outfmt:
        args += ["-outfmt", str(options.blast_outfmt)]
    if options.blast_db:
        args += ["-db", options.blast_db]
    if options.blast_query:
        args += ["-query", options.blast_query]
    if options.blast_evalue:
        args += ["-evalue", str(options.blast_evalue)]
    if options.blast_out:
        args += ["-out", options.blast_out + ".tmp"]

    try:
        status, output = getstatusoutput(' '.join(args))
    except Exception:
        return failure

    # Only a successful run may replace the final output file.
    if status == 0 and options.blast_out:
        try:
            rename(options.blast_out + ".tmp", options.blast_out)
        except OSError:
            return failure

    return (status, output)
def _execute_prodigal(options):
    """Run prodigal, publishing its output file only on success.

    Prodigal writes to ``options.prod_output + ".tmp"``; the file is
    renamed to ``options.prod_output`` only after a zero exit status.
    Previously the rename was unconditional and raised ``OSError`` when a
    failed run left no temporary file behind.

    Returns:
        The exit status reported by ``getstatusoutput``.
    """
    args = []
    if options.prod_exec:
        args.append(options.prod_exec)
    if options.prod_m:
        args.append("-m")
    if options.prod_p:
        args += ["-p", options.prod_p]
    if options.prod_f:
        args += ["-f", options.prod_f]
    if options.prod_g:
        args += ["-g", options.prod_g]
    if options.prod_input:
        args += ["-i", options.prod_input]
    if options.prod_output:
        args += ["-o", options.prod_output + ".tmp"]

    status = getstatusoutput(' '.join(args))[0]

    if status == 0 and options.prod_output:
        rename(options.prod_output + ".tmp", options.prod_output)
    return status
def runMicrobeCensus(microbeCensusExec, microbeCensusOutput, sample_name,
                     readFiles, rpkmFolder):
    """Run MicrobeCensus on exactly two groups of read files.

    The command writes to ``microbeCensusOutput + ".tmp"``, which is
    renamed to ``microbeCensusOutput`` after a zero exit status.

    Args:
        microbeCensusExec: Program to run.
        microbeCensusOutput: Final output file name.
        sample_name: Unused; kept for interface compatibility.
        readFiles: Sequence of two groups, each a sequence of file names.
        rpkmFolder: Unused; kept for interface compatibility.

    Returns:
        True on success; False when ``readFiles`` does not hold exactly
        two groups or the command fails.
    """
    grouped = [','.join(read) for read in readFiles]

    if len(readFiles) == 2:
        tmp_output = microbeCensusOutput + ".tmp"
        command = ' '.join([microbeCensusExec, ','.join(grouped), tmp_output])
        result = getstatusoutput(command)
        print(command)
        if result[0] == 0:
            rename(tmp_output, microbeCensusOutput)
            return True
        eprintf("ERROR:\tError while running MicrobeCensus on read files %s\n",
                readFiles)
        return False

    # `grouped` holds plain strings; joining the nested `readFiles`
    # raises a TypeError.
    eprintf("ERROR:\tThe number of read files for MicrobeCensus must be exactly 2. Found %d:%s\n",
            len(readFiles), ','.join(grouped))
    return False
def format_db(formatdb_executable, seqType, raw_sequence_file, formatted_db,
              algorithm):
    """Format a raw sequence file into a BLAST or LAST database.

    The command is printed before it runs. For LAST the files are written
    under ``formatted_db + "__temp__"``; for BLAST they are written to
    ``formatted_db`` directly. Afterwards any file matching the temporary
    prefix is renamed with the ``__temp__`` marker removed.

    Args:
        formatdb_executable: Path of the formatter program to run.
        seqType: Value passed to ``-dbtype`` (BLAST only).
        raw_sequence_file: Input sequence file.
        formatted_db: Final database name/prefix.
        algorithm: ``'BLAST'`` or ``'LAST'``.

    Returns:
        True when the formatter exits with status 0 and all renames
        succeed; False otherwise, including for an unsupported algorithm.
    """
    _temp_formatted_db = formatted_db + "__temp__"

    # Both commands request a 4GB volume size.
    if algorithm == 'BLAST':
        cmd = '%s -dbtype %s --max_file_sz 4294967296 -in %s -out %s' % (
            formatdb_executable, seqType, raw_sequence_file, formatted_db)
    elif algorithm == 'LAST':
        cmd = '%s -s 4G -p -c %s %s' % (
            formatdb_executable, _temp_formatted_db, raw_sequence_file)
    else:
        # `cmd` used to be unbound here, crashing with a NameError.
        return False

    print(cmd)
    result = getstatusoutput(cmd)

    try:
        for temp_file in glob(_temp_formatted_db + '*'):
            rename(temp_file, re.sub('__temp__', '', temp_file))
    except OSError:
        return False

    return result[0] == 0
def runBIOMCommand(infile, outfile, biomExec="biom"):
    """Run ``biom convert`` on ``infile``, writing HDF5 to ``outfile``.

    Returns:
        The exit status reported by ``getstatusoutput``.
    """
    command_line = ' '.join((
        biomExec,
        " convert",
        "-i", infile,
        "-o", outfile,
        "--table-type=\"Table\"",
        "--to-hdf5",
    ))
    status, _output = getstatusoutput(command_line)
    return status
def _execute_LAST(options, logger=None):
    """Run LAST with thread and hit limits, publishing output on success.

    LAST writes to ``options.last_o + ".tmp"``; the file is renamed to
    ``options.last_o`` only after a zero exit status.

    Args:
        options: Object carrying ``last_*``, ``num_threads`` and
            ``num_hits`` settings.
        logger: Optional object with a ``printf(fmt, *args)`` method used
            to record failures.

    Returns:
        ``(status, output)`` on success; ``(non-zero status, message)`` on
        failure.
    """
    default_message = "Could not run LASTAL correctly"

    def _fail(status, message):
        if logger:
            logger.printf("ERROR\t%s\n", message)
        return (status, message)

    args = []
    if options.last_executable:
        args.append(options.last_executable)
    if options.last_f:
        args += ["-f", str(options.last_f)]
    if options.last_o:
        args += ["-o", options.last_o + ".tmp"]
    if options.num_threads:
        args += ["-P", str(options.num_threads)]
    # NOTE(review): -K is emitted whenever num_hits is set; an unset value
    # previously broke the join below. Confirm this matches the callers.
    if options.num_hits:
        args += ["-K", str(options.num_hits)]
    if options.last_db:
        args += [options.last_db]
    if options.last_query:
        args += [options.last_query]

    try:
        status, output = getstatusoutput(' '.join(args))
    except Exception:
        # The old handler read `result` here even though it was unbound.
        return _fail(1, default_message)

    if status != 0:
        return _fail(status, output or default_message)

    if options.last_o:
        try:
            rename(options.last_o + ".tmp", options.last_o)
        except OSError:
            return _fail(1, output or default_message)

    return (status, output)
def execute_pipeline_stage(pipeline_command, extra_command=None,
                           errorlogger=None, runstatslogger=None):
    """Run one pipeline stage, in-process when possible.

    The first word of ``pipeline_command`` is reduced to a bare name
    (directory and ``.py`` suffix removed). If ``python_scripts`` has an
    attribute of that name, the same-named callable inside it is invoked
    with the remaining words; otherwise the whole command is run through
    ``getstatusoutput``.

    Args:
        pipeline_command: Full command line of the stage.
        extra_command: Forwarded as a keyword argument only when not None.
        errorlogger: Forwarded to the in-process callable.
        runstatslogger: Forwarded to the in-process callable.

    Returns:
        Whatever the in-process callable returns, or the
        ``(status, output)`` tuple from ``getstatusoutput``.
    """
    argv = [x.strip() for x in pipeline_command.split()]

    # The dot is escaped so only a real ".py" suffix is removed; the old
    # pattern also ate one extra character from names ending in "py".
    funcname = re.sub(r'\.py$', '', argv[0])
    funcname = re.sub(r'^.*/', '', funcname)
    args = argv[1:]

    if not hasattr(python_scripts, funcname):
        return getstatusoutput(pipeline_command)

    methodtocall = getattr(getattr(python_scripts, funcname), funcname)
    if extra_command is None:
        return methodtocall(args, errorlogger=errorlogger,
                            runstatslogger=runstatslogger)
    return methodtocall(args, errorlogger=errorlogger,
                        extra_command=extra_command,
                        runstatslogger=runstatslogger)
def _execute_tRNA_Scan(options):
    """Assemble and run the tRNA scan command from ``options``.

    A non-zero exit status is reported by passing the module-level
    ``errorcode`` to ``insert_error``.

    Returns:
        The ``(status, output)`` tuple from ``getstatusoutput``.
    """
    global errorcode

    pieces = [options.trna_executable] if options.trna_executable else []
    flag_values = [
        ("-i", options.trna_i),
        ("-o", options.trna_o),
        ("-D", options.trna_D),
        ("-T", options.trna_T),
        ("-F", options.trna_F),
    ]
    for switch, setting in flag_values:
        if setting:
            pieces += [switch, setting]

    result = getstatusoutput(' '.join(pieces))
    status = result[0]
    if status != 0:
        insert_error(errorcode)
    return result
def runBlastCommandrRNA(runcommand=None):
    """Print and run a prepared BLAST command line.

    Args:
        runcommand: Shell command to run, or None.

    Returns:
        False when no command is given; otherwise the exit status.
        Note that ``False == 0``, so callers must use ``is False`` to
        tell "no command" apart from success.
    """
    if runcommand is None:
        return False
    # Call form of print behaves the same on Python 2 and also parses on
    # Python 3.
    print(runcommand)
    result = getstatusoutput(runcommand)
    return result[0]
def _execute_BLAST(options):
    """Run BLAST and move its ``.tmp`` output into place on success.

    BLAST writes to ``options.blast_out + ".tmp"``; the file is renamed to
    ``options.blast_out`` only after a zero exit status.

    Returns:
        The integer exit status of the command, or 1 when the command
        cannot be launched or the output cannot be renamed. The error
        path previously returned the string ``'1'``; it is now an int
        like every other return value.
    """
    args = []
    if options.blast_executable:
        args.append(options.blast_executable)

    valued_flags = (
        ("-max_target_seqs", options.blast_max_target_seqs),
        ("-num_threads", options.blast_num_threads),
        ("-outfmt", options.blast_outfmt),
        ("-db", options.blast_db),
        ("-query", options.blast_query),
        ("-evalue", options.blast_evalue),
    )
    for flag, value in valued_flags:
        if value:
            args += [flag, str(value)]

    if options.blast_out:
        args += ["-out", options.blast_out + ".tmp"]

    try:
        status = getstatusoutput(' '.join(args))[0]
    except Exception:
        return 1

    # A failed run must not replace the final output file.
    if status == 0 and options.blast_out:
        try:
            rename(options.blast_out + ".tmp", options.blast_out)
        except OSError:
            return 1

    return status
def _execute_LAST(options):
    """Run LAST and move its ``.tmp`` output into place on success.

    LAST writes to ``options.last_o + ".tmp"``; the file is renamed to
    ``options.last_o`` only after a zero exit status.

    Returns:
        The integer exit status of the command, or 1 when the command
        cannot be launched or the output cannot be renamed. The error
        path previously returned the string ``'1'``; it is now an int
        like every other return value.
    """
    args = []
    if options.last_executable:
        args.append(options.last_executable)
    if options.last_f:
        args += ["-f", str(options.last_f)]
    if options.last_o:
        args += ["-o", options.last_o + ".tmp"]
    if options.last_db:
        args += [options.last_db]
    if options.last_query:
        args += [options.last_query]

    try:
        status = getstatusoutput(' '.join(args))[0]
    except Exception:
        return 1

    # A failed run must not replace the final output file.
    if status == 0 and options.last_o:
        try:
            rename(options.last_o + ".tmp", options.last_o)
        except OSError:
            return 1

    return status
def runRPKMCommand(runcommand=None):
    """Run a prepared RPKM command line and echo whatever it prints.

    Args:
        runcommand: Shell command to run, or None.

    Returns:
        False when no command is given; otherwise the exit status.
        Note that ``False == 0``, so callers must use ``is False`` to
        tell "no command" apart from success.
    """
    if runcommand is None:
        return False
    result = getstatusoutput(runcommand)
    if result[1]:
        # Call form of print behaves the same on Python 2 and also parses
        # on Python 3.
        print(result[1])
    return result[0]
def runRPKMCommand(runcommand=None):
    """Run the RPKM command line, printing any output it produced.

    Args:
        runcommand: Shell command to run, or None.

    Returns:
        False when ``runcommand`` is None; otherwise the exit status.
        ``False`` compares equal to 0, so use ``is False`` to detect the
        "no command" case.
    """
    if runcommand is None:
        return False

    status, output = getstatusoutput(runcommand)
    if output:
        # print(...) with one argument is valid on both Python 2 and 3.
        print(output)
    return status
def indexForBWA(bwaExec, contigs, indexfile):
    """Build a BWA index named ``indexfile`` from ``contigs``.

    Returns:
        True when ``bwa index`` exits with status 0, False otherwise.
    """
    index_command = "%s index -p %s %s" % (bwaExec, indexfile, contigs)
    status, _output = getstatusoutput(index_command)
    return status == 0
def _convert_to_bam(options, bamout, logger=None):
    """Run ``samtools view -S -b`` with stdout redirected to ``bamout``.

    NOTE(review): no input file is placed on the command line and
    ``options`` is never read -- confirm where the SAM input comes from.

    Returns:
        ``(status, output)`` of the command, or
        ``(1, "Cannot execute samtools successfully")`` if building or
        running the command raises.
    """
    pieces = ['samtools', "view -S -b", ">", bamout]
    try:
        result = getstatusoutput(' '.join(pieces))
    except:
        return (1, "Cannot execute samtools successfully")
    else:
        return (result[0], result[1])
def indexForBWA(bwaExec, contigs, indexfile):
    """Index ``contigs`` with ``bwa index -p indexfile``.

    Returns:
        True if the command exits with status 0, otherwise False.
    """
    outcome = getstatusoutput(
        "%s index -p %s %s" % (bwaExec, indexfile, contigs))
    succeeded = outcome[0] == 0
    return True if succeeded else False
def format_db(formatdb_executable, seqType, refdb_sequence_file, algorithm):
    """Format ``refdb_sequence_file`` in place for BLAST or LAST.

    Args:
        formatdb_executable: Path of the formatter program to run.
        seqType: Value passed to ``-dbtype`` (BLAST only).
        refdb_sequence_file: Sequence file; for LAST it is also used as
            the database name.
        algorithm: ``'BLAST'`` or ``'LAST'``.

    Returns:
        True when the formatter exits with status 0; False otherwise,
        including for an unsupported algorithm.
    """
    if algorithm == 'BLAST':
        cmd = '%s -dbtype %s -in %s' % (
            formatdb_executable, seqType, refdb_sequence_file)
    elif algorithm == 'LAST':
        cmd = '%s -p -c %s %s' % (
            formatdb_executable, refdb_sequence_file, refdb_sequence_file)
    else:
        # `cmd` used to be unbound here, crashing with a NameError.
        return False

    return getstatusoutput(cmd)[0] == 0
def _execute_bowtie2(options, logger=None):
    """Run bowtie2 on unpaired reads, writing SAM to ``options.samout``.

    Returns:
        ``(status, output)`` of the command, or
        ``(1, "Cannot execute BOWTIE2 successfully")`` if building or
        running the command raises.
    """
    pieces = [
        'bowtie2',
        "-p", options.num_threads,
        "-x", options.refindex,
        "-U", options.reads,
        "-S", options.samout,
    ]
    try:
        result = getstatusoutput(' '.join(pieces))
    except:
        return (1, "Cannot execute BOWTIE2 successfully")
    else:
        return (result[0], result[1])
def _execute_bedtools_intersect(a_arg, b_arg, outfile, additional_params=""):
    """Run ``bedtools intersect -a a_arg -b b_arg`` into ``outfile``.

    ``additional_params`` is inserted verbatim before the redirection.

    Returns:
        ``(status, output)`` of the command, or
        ``(1, "Cannot execute BEDTOOLS successfully")`` if building or
        running the command raises.
    """
    pieces = ['bedtools intersect', "-a", a_arg, "-b", b_arg,
              additional_params, ">", outfile]
    try:
        result = getstatusoutput(' '.join(pieces))
    except:
        return (1, "Cannot execute BEDTOOLS successfully")
    else:
        return (result[0], result[1])
def formatDB(tools, db, refdbspath, seqType, dbType, algorithm, configs,
             logger=None):
    """Format the reference database ``db`` for BLAST or LAST.

    The raw file ``refdbspath/dbType/db`` is formatted into
    ``refdbspath/dbType/formatted/db``. Files are produced under a
    ``__temp__``-suffixed name and renamed afterwards.

    Args:
        tools: Nested mapping; ``tools['FUNC_SEARCH']['exec']`` holds the
            BLAST and LAST executable names.
        db: Database file name.
        refdbspath: Root folder of the reference databases.
        seqType: ``'nucl'`` or ``'prot'``.
        dbType: Sub-folder of ``refdbspath``.
        algorithm: ``'BLAST'`` or ``'LAST'``.
        configs: Mapping with ``METAPATHWAYS_PATH`` and ``EXECUTABLES_DIR``.
        logger: Optional object with ``printf(fmt, *args)``. It defaults
            to None and was previously dereferenced unconditionally.

    Returns:
        True when formatting succeeds, False otherwise.
    """
    def _report(fmt, *values):
        eprintf(fmt, *values)
        if logger:
            logger.printf(fmt, *values)

    EXECUTABLES_DIR = (configs['METAPATHWAYS_PATH'] + PATHDELIM +
                       configs['EXECUTABLES_DIR'])
    executables = tools['FUNC_SEARCH']['exec']

    # The BLAST formatter is used in every case except protein + LAST.
    formatdb_executable = (EXECUTABLES_DIR + PATHDELIM +
                           executables['BLAST']['FORMATDB_EXECUTABLE'])
    if seqType == 'prot' and algorithm == 'LAST':
        formatdb_executable = (EXECUTABLES_DIR + PATHDELIM +
                               executables['LAST']['LASTDB_EXECUTABLE'])

    formatted_db = (refdbspath + PATHDELIM + dbType + PATHDELIM +
                    'formatted' + PATHDELIM + db)
    raw_sequence_file = refdbspath + PATHDELIM + dbType + PATHDELIM + db
    _temp_formatted_db = formatted_db + "__temp__"

    # Both commands request a 4GB volume size.
    if algorithm == 'BLAST':
        cmd = '%s -dbtype %s -max_file_sz 4294967296 -in %s -out %s' % (
            formatdb_executable, seqType, raw_sequence_file,
            _temp_formatted_db)
    elif algorithm == 'LAST':
        cmd = '%s -s 4G -p -c %s %s' % (
            formatdb_executable, _temp_formatted_db, raw_sequence_file)
    else:
        # `cmd` used to be unbound here, crashing with a NameError.
        _report("INFO\tFailed to Format database %s for %s\n",
                sQuote(db), sQuote(algorithm))
        return False

    # Pass the command as an argument so '%' in a path is not treated as
    # a format directive.
    eprintf("INFO\t%s", cmd)
    result = getstatusoutput(cmd)

    try:
        for temp_file in glob(_temp_formatted_db + '*'):
            rename(temp_file, re.sub('__temp__', '', temp_file))
    except OSError:
        return False

    if result[0] == 0:
        _report("INFO\tFormatted database %s successfully for %s\n",
                sQuote(db), sQuote(algorithm))
        return True

    _report("INFO\tFailed to Format database %s for %s\n",
            sQuote(db), sQuote(algorithm))
    return False
def execute_pipeline_stage(pipeline_command, extra_command=None,
                           errorlogger=None, runstatslogger=None):
    """Dispatch a pipeline stage to a Python entry point or to the shell.

    The script name is taken from the first word of ``pipeline_command``
    with its directory and ``.py`` suffix removed. When ``python_scripts``
    exposes that name, the same-named callable inside it receives the
    remaining words; otherwise the command is run via ``getstatusoutput``.

    Args:
        pipeline_command: Full command line of the stage.
        extra_command: Passed on as a keyword argument unless it is None.
        errorlogger: Passed on to the in-process callable.
        runstatslogger: Passed on to the in-process callable.

    Returns:
        The callable's return value, or the ``(status, output)`` tuple
        from ``getstatusoutput``.
    """
    argv = [x.strip() for x in pipeline_command.split()]

    # Escaped dot: the previous pattern '.py$' matched any character
    # followed by "py", mangling names that merely end in "py".
    funcname = re.sub(r'\.py$', '', argv[0])
    funcname = re.sub(r'^.*/', '', funcname)
    args = argv[1:]

    if hasattr(python_scripts, funcname):
        methodtocall = getattr(getattr(python_scripts, funcname), funcname)
        keyword_args = {'errorlogger': errorlogger,
                        'runstatslogger': runstatslogger}
        if extra_command is not None:
            keyword_args['extra_command'] = extra_command
        result = methodtocall(args, **keyword_args)
    else:
        result = getstatusoutput(pipeline_command)
    return result
def runUsingBWA(bwaExec, sample_name, indexFile, _readFiles, bwaFolder):
    """Align every group of read files with ``bwa mem``.

    Group number ``i`` is written to ``<sample_name>_<i>.sam`` inside
    ``bwaFolder``, through a ``.tmp`` file that is renamed on success.
    A single file whose name matches ``_1``/``_2`` before ``.fastq`` is
    run without ``-p``; any other single file is run with ``-p``.

    Returns:
        True if every group was aligned successfully, otherwise False.
    """
    num_threads = max(1, int(multiprocessing.cpu_count() * 0.8))
    all_ok = True

    for index, readFiles in enumerate(_readFiles):
        bwaOutput = (bwaFolder + PATHDELIM + sample_name + "_" + str(index) +
                     '.sam')
        bwaOutputTmp = bwaOutput + ".tmp"

        # Placeholder that is run as-is when the group has neither one
        # nor two files.
        cmd = "command not prepared"
        file_count = len(readFiles)
        if file_count == 2:
            cmd = "%s mem -t %d -o %s %s %s %s" % (
                bwaExec, num_threads, bwaOutputTmp, indexFile, readFiles[0],
                readFiles[1])
        if file_count == 1:
            looks_single_mate = (
                re.search(r'_[1-2].fastq', readFiles[0]) or
                re.search(r'_[1-2].b\d+.fastq', readFiles[0]))
            template = ("%s mem -t %d -o %s %s %s " if looks_single_mate
                        else "%s mem -t %d -p -o %s %s %s ")
            cmd = template % (bwaExec, num_threads, bwaOutputTmp, indexFile,
                              readFiles[0])

        if getstatusoutput(cmd)[0] == 0:
            rename(bwaOutputTmp, bwaOutput)
        else:
            eprintf("ERROR:\t Error file processing read files %s\n",
                    readFiles)
            all_ok = False

    return all_ok
def _execute_LAST(options):
    """Build the LAST command line from ``options`` and run it.

    Output goes straight to ``options.last_o``; no temporary file is used.

    Returns:
        The exit status reported by ``getstatusoutput``.
    """
    command = []
    if options.last_executable:
        command.append(options.last_executable)

    for flag, value in (("-f", options.last_f), ("-o", options.last_o)):
        if value:
            command.extend([flag, value])

    # Database and query are positional arguments.
    for positional in (options.last_db, options.last_query):
        if positional:
            command.append(positional)

    return getstatusoutput(' '.join(command))[0]
def _execute_tRNA_Scan(options):
    """Run the tRNA scan program described by ``options``.

    Returns:
        The ``(status, output)`` tuple from ``getstatusoutput``.
    """
    command = []
    if options.trna_executable:
        command.append(options.trna_executable)

    for flag, value in (("-i", options.trna_i), ("-o", options.trna_o),
                        ("-D", options.trna_D), ("-T", options.trna_T),
                        ("-F", options.trna_F)):
        if value:
            command.extend([flag, value])

    return getstatusoutput(' '.join(command))
def runUsingBWA(bwaExec, sample_name, indexFile, readFiles, bwaFolder):
    """Run ``bwa mem`` on one or two read files.

    The SAM output goes to ``bwaFolder + PATHDELIM + sample_name + '.sam'``
    and about 80% of the CPUs (never fewer than one) are used as threads.

    Args:
        bwaExec: bwa program to run.
        sample_name: Sample name used for the output file.
        indexFile: Index passed to ``bwa mem``.
        readFiles: One read file (run with ``-p``) or two read files.
        bwaFolder: Output folder.

    Returns:
        True when bwa exits with status 0. False when it fails or when
        ``readFiles`` is empty or holds more than two entries.
    """
    file_count = len(readFiles)
    # Zero files used to leave `cmd` unbound and crash with a NameError.
    if file_count < 1 or file_count > 2:
        return False

    num_threads = int(multiprocessing.cpu_count() * 0.8)
    if num_threads < 1:
        num_threads = 1

    bwaOutput = bwaFolder + PATHDELIM + sample_name + '.sam'

    if file_count == 2:
        cmd = "%s mem -t %d -o %s %s %s %s" % (
            bwaExec, num_threads, bwaOutput, indexFile, readFiles[0],
            readFiles[1])
    else:
        cmd = "%s mem -t %d -p -o %s %s %s " % (
            bwaExec, num_threads, bwaOutput, indexFile, readFiles[0])

    result = getstatusoutput(cmd)
    return result[0] == 0
def _execute_tRNA_Scan(options):
    """Assemble the tRNA scan command from ``options`` and run it.

    Returns:
        The ``(status, output)`` tuple from ``getstatusoutput``.
    """
    pieces = [options.trna_executable] if options.trna_executable else []
    flag_values = [
        ("-i", options.trna_i),
        ("-o", options.trna_o),
        ("-D", options.trna_D),
        ("-T", options.trna_T),
        ("-F", options.trna_F),
    ]
    for switch, setting in flag_values:
        if setting:
            pieces += [switch, setting]

    outcome = getstatusoutput(' '.join(pieces))
    return outcome
def _execute_fgs(options):
    """Run the gene finder with the ``illumina_10`` model and post-process.

    The program is asked to write to ``<sample>.tmp``, where ``<sample>``
    is ``options.prod_output`` without its ``.gff`` suffix. Afterwards
    ``create_gff_faa`` turns ``<sample>.tmp.faa`` into ``<sample>.gff``
    and ``<sample>.faa``, and the temporary ``.faa`` file is removed.

    Returns:
        ``(0, '')`` after post-processing. When the command fails and
        leaves no ``.tmp.faa`` file, ``(status, output)`` of the command
        is returned instead of failing inside the post-processing step.
    """
    import os.path

    modelFile = "illumina_10"
    # Strip only a trailing ".gff"; the old unanchored, unescaped pattern
    # could also cut a match out of the middle of the path.
    sample_name = re.sub(r'\.gff$', '', options.prod_output)
    tmp_faa = sample_name + ".tmp" + ".faa"

    args = []
    if options.prod_exec:
        args.append(options.prod_exec)
    if options.prod_input:
        args += ["-s", options.prod_input]
    if options.prod_output:
        args += ["-o", sample_name + ".tmp"]
    args += ["-w", "0"]
    args += ["-t", modelFile]
    args += ["-p", str(options.nthreads)]

    status, output = getstatusoutput(' '.join(args))

    # The exit status alone was never trusted here, so only report failure
    # when the run also produced no protein file.
    if status != 0 and not os.path.exists(tmp_faa):
        return (status, output)

    create_gff_faa(tmp_faa, sample_name + ".gff", sample_name + ".faa")
    remove(tmp_faa)
    return (0, '')
def format_db_blast(formatdb_executable, seq_subset_file):
    """Format ``seq_subset_file`` as a protein BLAST database.

    ``seq_subset_file`` is an object with a ``name`` attribute holding the
    file path. The command's status and output are discarded.
    """
    command = '%s -dbtype prot -in %s' % (formatdb_executable,
                                          seq_subset_file.name)
    getstatusoutput(command)
def formatDB(tools, db, refdbspath, seqType, dbType, algorithm, configs,
             logger=None):
    """Format the reference database ``db`` for BLAST or LAST.

    The raw file ``refdbspath/dbType/db`` is formatted into
    ``refdbspath/dbType/formatted/db``. Any ``__temp__``-suffixed files
    are renamed afterwards, and a ``__temp__`` ``.pal`` alias file is
    rewritten so it no longer mentions the temporary name.

    Args:
        tools: Nested mapping; ``tools['FUNC_SEARCH']['exec']`` holds the
            BLAST and LAST executable names.
        db: Database file name.
        refdbspath: Root folder of the reference databases.
        seqType: ``'nucl'`` or ``'prot'``.
        dbType: Sub-folder of ``refdbspath``.
        algorithm: ``'BLAST'`` or ``'LAST'``.
        configs: Mapping with ``METAPATHWAYS_PATH`` and ``EXECUTABLES_DIR``.
        logger: Optional object with ``printf(fmt, *args)``. It defaults
            to None and was previously dereferenced unconditionally.

    Returns:
        True when formatting succeeds, False otherwise.
    """
    def _report(fmt, *values):
        eprintf(fmt, *values)
        if logger:
            logger.printf(fmt, *values)

    EXECUTABLES_DIR = (configs['METAPATHWAYS_PATH'] + PATHDELIM +
                       configs['EXECUTABLES_DIR'])
    executables = tools['FUNC_SEARCH']['exec']

    # The BLAST formatter is used in every case except protein + LAST.
    formatdb_executable = (EXECUTABLES_DIR + PATHDELIM +
                           executables['BLAST']['FORMATDB_EXECUTABLE'])
    if seqType == 'prot' and algorithm == 'LAST':
        formatdb_executable = (EXECUTABLES_DIR + PATHDELIM +
                               executables['LAST']['LASTDB_EXECUTABLE'])

    formatted_db = (refdbspath + PATHDELIM + dbType + PATHDELIM +
                    'formatted' + PATHDELIM + db)
    raw_sequence_file = refdbspath + PATHDELIM + dbType + PATHDELIM + db
    _temp_formatted_db = formatted_db + "__temp__"

    if algorithm == 'BLAST':
        # NOTE(review): the 4294967296-byte command writing to the
        # __temp__ prefix was commented out in favour of this smaller
        # volume size written straight to the final name -- confirm this
        # is still intended.
        cmd = '%s -dbtype %s -max_file_sz 20267296 -in %s -out %s' % (
            formatdb_executable, seqType, raw_sequence_file, formatted_db)
    elif algorithm == 'LAST':
        cmd = '%s -s 4000M -p -c %s %s' % (
            formatdb_executable, _temp_formatted_db, raw_sequence_file)
    else:
        # `cmd` used to be unbound here, crashing with a NameError.
        _report("INFO\tFailed to Format database %s for %s\n",
                sQuote(db), sQuote(algorithm))
        return False

    result = getstatusoutput(cmd)

    _formatted_db_pal = _temp_formatted_db + ".pal"
    if algorithm == 'BLAST' and path.exists(_formatted_db_pal):
        try:
            # Context managers close both files even if a write fails.
            with open(_formatted_db_pal, 'r') as temp_alias:
                with open(formatted_db + ".pal", 'w') as final_alias:
                    for line in temp_alias:
                        modline = line.strip().replace('__temp__', '')
                        fprintf(final_alias, "%s\n", modline)
            remove(_formatted_db_pal)
        except (IOError, OSError):
            return False

    try:
        for temp_file in glob(_temp_formatted_db + '*'):
            rename(temp_file, re.sub('__temp__', '', temp_file))
    except OSError:
        return False

    if result[0] == 0:
        _report("INFO\tFormatted database %s successfully for %s\n",
                sQuote(db), sQuote(algorithm))
        return True

    _report("INFO\tFailed to Format database %s for %s\n",
            sQuote(db), sQuote(algorithm))
    return False
def blast_against_itself(blast_executable, seq_subset_file, blast_table_out):
    """BLAST ``seq_subset_file`` against itself in tabular format.

    The file named by ``seq_subset_file.name`` is used as both database
    and query; hits go to ``blast_table_out``. The command's status and
    output are discarded.
    """
    subset_path = seq_subset_file.name
    command = '%s -outfmt 6 -db %s -query %s -out %s' % (
        blast_executable, subset_path, subset_path, blast_table_out)
    getstatusoutput(command)
def format_db_last(formatdb_executable, seq_subset_file):
    """Build a LAST database named ``subset_db`` next to the subset file.

    ``seq_subset_file`` is an object with a ``name`` attribute holding the
    file path. The command's status and output are discarded.
    """
    subset_path = seq_subset_file.name
    database = os.path.dirname(subset_path) + PATHDELIM + 'subset_db'
    getstatusoutput(
        '%s -p -c %s %s' % (formatdb_executable, database, subset_path))
def startUpPathwayTools(ptoolsExec):
    """Run ``ptoolsExec`` with the ``-api`` switch.

    The call blocks until the command finishes; its status and output are
    discarded.
    """
    getstatusoutput("%s -api" % ptoolsExec)
def last_against_itself(last_executable, seq_subset_file, last_table_out):
    """Run LAST with the subset file as query against ``subset_db``.

    ``subset_db`` is looked up in the folder of ``seq_subset_file.name``;
    results go to ``last_table_out``. The command's status and output are
    discarded.
    """
    subset_path = seq_subset_file.name
    database = os.path.dirname(subset_path) + PATHDELIM + 'subset_db'
    getstatusoutput('%s -o %s -f 0 %s %s' % (
        last_executable, last_table_out, database, subset_path))
if len(readFiles) == 1: res0 = re.search(r'_[1-2].fastq',readFiles[0]) res1 = re.search(r'_[1-2].b\d+.fastq',readFiles[0]) if res0 or res1: <<<<<<< HEAD cmd = "%s mem -t %d -p -o %s %s %s "%(bwaExec, num_threads, bwaOutputTmp, indexFile, readFiles[0]) else: cmd = "%s mem -t %d -o %s %s %s "%(bwaExec, num_threads, bwaOutputTmp, indexFile, readFiles[0]) print cmd ======= cmd = "%s mem -t %d %s %s > %s "%(bwaExec, num_threads, indexFile, readFiles[0], bwaOutputTmp) else: cmd = "%s mem -t %d -p %s %s > %s "%(bwaExec, num_threads, indexFile, readFiles[0], bwaOutputTmp) >>>>>>> 9d3adb2ed47aa16f9fe02e1a5da5260224a6e659 result = getstatusoutput(cmd) if result[0]==0: pass rename(bwaOutputTmp, bwaOutput) else: eprintf("ERROR:\t Error file processing read files %s\n", readFiles) status = False count += 1 return status def runMicrobeCensus(microbeCensusExec, microbeCensusOutput, sample_name, readFiles, rpkmFolder) : num_threads = int(multiprocessing.cpu_count()*0.8)
def startPathwayTools():
    """Run ``~/pathway-tools/pathway-tools -api``.

    The path is hard-coded. The call blocks until the command finishes;
    its status and output are discarded.
    """
    getstatusoutput("~/pathway-tools/pathway-tools -api")
def runBIOMCommand(infile, outfile, biomExec="biom"):
    """Convert ``infile`` to an HDF5 BIOM table at ``outfile``.

    Returns:
        The exit status of the ``biom convert`` command.
    """
    pieces = [biomExec, " convert"]
    pieces += ["-i", infile]
    pieces += ["-o", outfile]
    pieces += ["--table-type=\"Table\"", "--to-hdf5"]
    outcome = getstatusoutput(' '.join(pieces))
    return outcome[0]
def runPathologicCommand(runcommand=None):
    """Run a prepared PathoLogic command line.

    Args:
        runcommand: Shell command to run, or None.

    Returns:
        False when no command is given; otherwise the exit status.
        Note that ``False == 0``, so callers must use ``is False`` to
        tell "no command" apart from success.
    """
    # Identity test: `== None` would consult a custom __eq__.
    if runcommand is None:
        return False
    return getstatusoutput(runcommand)[0]
def format_db_blast(formatdb_executable, seq_subset_file):
    """Build a protein BLAST database from the subset file.

    Only ``seq_subset_file.name`` is read. Nothing is returned; the
    command's status and output are dropped.
    """
    getstatusoutput('%s -dbtype prot -in %s' %
                    (formatdb_executable, seq_subset_file.name))
def runPathologicCommand(runcommand=None):
    """Run the PathoLogic command line and report its exit status.

    Args:
        runcommand: Shell command to run, or None.

    Returns:
        False when ``runcommand`` is None; otherwise the exit status.
        ``False`` compares equal to 0, so use ``is False`` to detect the
        "no command" case.
    """
    # Identity test: `== None` would consult a custom __eq__.
    if runcommand is None:
        return False
    status, _output = getstatusoutput(runcommand)
    return status
def formatDB(tools, db, refdbspath, seqType, dbType, algorithm, configs,
             logger=None):
    """Format the reference database ``db`` for BLAST or LAST.

    The raw file ``refdbspath/dbType/db`` is formatted into
    ``refdbspath/dbType/formatted/db`` under a ``__temp__``-suffixed name.
    Afterwards the BLAST alias file is rewritten without the temporary
    name and the remaining temporary files are renamed.

    Args:
        tools: Nested mapping; ``tools['FUNC_SEARCH']['exec']`` holds the
            BLAST and LAST executable names.
        db: Database file name.
        refdbspath: Root folder of the reference databases.
        seqType: ``'nucl'`` or ``'prot'``.
        dbType: Sub-folder of ``refdbspath``.
        algorithm: ``'BLAST'`` or ``'LAST'``.
        configs: Mapping with ``METAPATHWAYS_PATH`` and ``EXECUTABLES_DIR``.
        logger: Optional object with ``printf(fmt, *args)``. It defaults
            to None and was previously dereferenced unconditionally.

    Returns:
        True when formatting succeeds, False otherwise (including an
        unsupported ``algorithm``/``seqType`` combination).
    """
    def _report(fmt, *values):
        eprintf(fmt, *values)
        if logger:
            logger.printf(fmt, *values)

    EXECUTABLES_DIR = (configs['METAPATHWAYS_PATH'] + PATHDELIM +
                       configs['EXECUTABLES_DIR'])
    executables = tools['FUNC_SEARCH']['exec']

    # The BLAST formatter is the default; lastdb is chosen for LAST with
    # either recognised sequence type.
    formatdb_executable = (EXECUTABLES_DIR + PATHDELIM +
                           executables['BLAST']['FORMATDB_EXECUTABLE'])
    if seqType in ('nucl', 'prot') and algorithm == 'LAST':
        formatdb_executable = (EXECUTABLES_DIR + PATHDELIM +
                               executables['LAST']['LASTDB_EXECUTABLE'])

    formatted_db = (refdbspath + PATHDELIM + dbType + PATHDELIM +
                    'formatted' + PATHDELIM + db)
    raw_sequence_file = refdbspath + PATHDELIM + dbType + PATHDELIM + db
    _temp_formatted_db = formatted_db + "__temp__"

    # Volume size: 4GB for BLAST, 4000M for LAST.
    cmd = ""
    if algorithm == 'BLAST':
        cmd = '%s -dbtype %s -max_file_sz 4294967296 -in %s -out %s' % (
            formatdb_executable, seqType, raw_sequence_file,
            _temp_formatted_db)
    elif algorithm == 'LAST':
        if seqType == "prot":
            cmd = '%s -s 4000M -p -c %s %s' % (
                formatdb_executable, _temp_formatted_db, raw_sequence_file)
        elif seqType == "nucl":
            cmd = '%s -s 4000M -c %s %s' % (
                formatdb_executable, _temp_formatted_db, raw_sequence_file)

    if not cmd:
        # An empty command used to be handed to the shell, which exits 0,
        # so an unsupported combination was reported as a success.
        _report("INFO\tFailed to Format database %s for %s\n",
                sQuote(db), sQuote(algorithm))
        return False

    _report("INFO\tCommand to format \"%s\"\n", cmd)
    result = getstatusoutput(cmd)

    # Use the same alias extension for the temporary and final file. The
    # old code always looked for ".pal" but wrote ".nal" for nucleotide
    # databases, so a nucleotide alias was never rewritten.
    alias_ext = ".nal" if seqType == "nucl" else ".pal"
    temp_alias_name = _temp_formatted_db + alias_ext
    if algorithm == 'BLAST' and path.exists(temp_alias_name):
        try:
            # Context managers close both files even if a write fails.
            with open(temp_alias_name, 'r') as temp_alias:
                with open(formatted_db + alias_ext, 'w') as final_alias:
                    for line in temp_alias:
                        modline = line.strip().replace('__temp__', '')
                        fprintf(final_alias, "%s\n", modline)
            remove(temp_alias_name)
        except (IOError, OSError):
            return False

    try:
        for temp_file in glob(_temp_formatted_db + '*'):
            rename(temp_file, re.sub('__temp__', '', temp_file))
    except OSError:
        return False

    if result[0] == 0:
        _report("INFO\tFormatted database %s successfully for %s\n",
                sQuote(db), sQuote(algorithm))
        return True

    _report("INFO\tFailed to Format database %s for %s\n",
            sQuote(db), sQuote(algorithm))
    _report("INFO\tReason for failure %s\n", result[1])
    return False
def write_sequin_file(tbl_file_name, contig_dict, sample_name,
                      nucleotide_seq_dict, protein_seq_dict,
                      sequin_input_files):
    """Write the Sequin feature and source tables, then run tbl2asn.

    Steps:
      1. Copy the template FASTA and ``.sbt`` files next to the table.
      2. Write one feature block per contig to ``tbl_file_name``.
      3. Write the ``.src`` table: a header row, then one row per contig
         with values from the ``SequinHeader`` section of the parameter
         file.
      4. Check that all required files exist, then run tbl2asn.

    Args:
        tbl_file_name: Output table path; sibling files are derived by
            replacing a trailing ``tbl`` with ``src``, ``fasta``, ``sbt``.
        contig_dict: Maps a contig name to a sequence of attribute
            mappings (``start``, ``end``, ``strand``, ``partial``,
            ``locus_tag``, ``product``).
        sample_name: Unused; kept for interface compatibility.
        nucleotide_seq_dict: Maps a contig name to its DNA sequence.
        protein_seq_dict: Unused; kept for interface compatibility.
        sequin_input_files: Mapping with keys ``sequin_fasta``,
            ``sequin_sbt_file``, ``sequin_tbl2asn`` and ``sequin_params``.

    Exits the process with status 0 (via ``sys.exit``) if a required file
    is missing.
    """
    sequin_src_filename = re.sub(r'tbl$', 'src', tbl_file_name)
    sequin_output_fasta = re.sub(r'tbl$', 'fasta', tbl_file_name)
    sequin_output_sbt = re.sub(r'tbl$', 'sbt', tbl_file_name)

    shutil.copy(sequin_input_files['sequin_fasta'], sequin_output_fasta)
    shutil.copy(sequin_input_files['sequin_sbt_file'], sequin_output_sbt)

    sequin_required_files = {
        'fasta': sequin_output_fasta,
        'tbl': tbl_file_name,
        'src': sequin_src_filename,
        'tbl2asn': sequin_input_files['sequin_tbl2asn'],
        'sbt': sequin_output_sbt
    }

    with open(tbl_file_name, 'w') as outputfile:
        for key in contig_dict:
            outputfile.write(_sequin_feature_text(key, contig_dict[key],
                                                  nucleotide_seq_dict))

    ncbi_sequin_params = parse_parameter_file(
        sequin_input_files['sequin_params'])
    headers = [
        'Collection_date', 'Country', 'isolation_source', 'Lat_Lon',
        'Organism', 'environmental_sample'
    ]
    valueStr = ""
    for header_name in headers:
        valueStr += "\t" + get_parameter(ncbi_sequin_params, 'SequinHeader',
                                         header_name,
                                         default='__' + header_name + '__')
    headerStr = '\t'.join(['Sequence_ID'] + headers)

    with open(sequin_src_filename, 'w') as outputsrcfile:
        # The header row is the column names only. It used to be prefixed
        # with the last contig name left over from the loop above, and
        # raised NameError when contig_dict was empty.
        fprintf(outputsrcfile, "%s\n", headerStr)
        for key in contig_dict:
            fprintf(outputsrcfile, "%s\n", key + valueStr)

    for name in sequin_required_files:
        if not path.exists(sequin_required_files[name]):
            print("Could not find file : " + sequin_required_files[name])
            print("Make sure all of the following files are present :")
            for required in sequin_required_files:
                print(required)
            sys.exit(0)

    args = [
        sequin_required_files['tbl2asn'], '-t', sequin_required_files['sbt'],
        '-i', sequin_required_files['fasta'], '-a', 's', '-V', 'v'
    ]
    result = getstatusoutput(' '.join(args))
    if result[0] == 0:
        print("Successfully created the SEQUIN file")


def _sequin_feature_text(key, attribs, nucleotide_seq_dict):
    """Return the feature-table text for one contig.

    A contig without attributes yields an empty string.
    """
    pieces = []
    for index, attrib in enumerate(attribs):
        if index == 0:
            try:
                dna_seq = nucleotide_seq_dict[key]
                # Result unused; the call is kept because a failure here
                # has always selected the "0..0" fallback below.
                format_sequence_origin(dna_seq)
                sourceStr = "1.." + str(len(dna_seq))
            except Exception:
                sourceStr = "0..0"
            pieces.append(">Feature %s\n" % (key))
            pieces.append(re.sub(r'\.\.', '\t', sourceStr) + '\t' +
                          "REFERENCE" + '\n')

        # '<' / '>' mark an open start / end; first digit of `partial` is
        # the start, second the end.
        startPrefix = ''
        endPrefix = ''
        if 'partial' in attrib:
            if attrib['partial'] in ('10', '11'):
                startPrefix = '<'
            if attrib['partial'] in ('01', '11'):
                endPrefix = '>'

        if 'start' in attrib and 'end' in attrib:
            # A missing strand is written in forward orientation, so
            # geneLoc is always defined.
            if 'strand' in attrib and attrib['strand'] == '-':
                geneLoc = (str(attrib['end']) + endPrefix + '\t' +
                           startPrefix + str(attrib['start']))
            else:
                geneLoc = (startPrefix + str(attrib['start']) + '\t' +
                           str(attrib['end']) + endPrefix)

            pieces.append(geneLoc + '\t' + "gene" + '\n')
            if 'locus_tag' in attrib:
                pieces.append('\t\t\t' + "gene" + '\t' +
                              attrib['locus_tag'] + '\n')
            pieces.append(geneLoc + '\t' + "CDS" + '\n')
            if 'product' in attrib:
                pieces.append('\t\t\t' + "product" + '\t' +
                              attrib['product'] + '\n')

    return ''.join(pieces)
def checkbinaries(configs):
    """Check that the executables named in *configs* are present on disk.

    NOTE: a second ``checkbinaries`` is defined further down in this
    module; being the later definition, that one is bound to the name once
    the module has been imported.

    The executables folder is ``configs["METAPATHWAYS_PATH"]`` followed by
    ``configs["EXECUTABLES_DIR"]``; the placeholder ``"---"`` stands in for
    whichever of the two keys is absent, so the folder check below fails.

    Args:
        configs: mapping of configuration keys to string values.

    Returns:
        A string describing the problem when the executables folder does
        not exist or when at least one binary is unspecified or missing;
        ``False`` when every binary was found.
    """
    executables_dir = "---"
    if "METAPATHWAYS_PATH" in configs:
        executables_dir = configs["METAPATHWAYS_PATH"]
        if "EXECUTABLES_DIR" in configs:
            executables_dir += PATHDELIM + configs["EXECUTABLES_DIR"]
        else:
            executables_dir += PATHDELIM + "---"

    if not path.exists(executables_dir):
        return "ERROR\tMissing executables folder under \'MetaPathways_Python/executables\' it set to " + executables_dir

    # Config key of each binary -> arguments used for a trial invocation.
    binaries = {
        "LASTDB_EXECUTABLE": ["-h"],
        "LAST_EXECUTABLE": ["-h"],
        "FORMATDB_EXECUTABLE": ["-help"],
        "BLASTP_EXECUTABLE": ['-h'],
        "BLASTN_EXECUTABLE": ['-h'],
        "PRODIGAL_EXECUTABLE": ["-h"],
        "SCAN_tRNA_EXECUTABLE": ['-h'],
        "RPKM_EXECUTABLE": ['-h'],
    }

    status = {}
    for name in binaries:
        # Test the configured value itself.  Testing the joined path (as the
        # code used to) can never detect an empty entry, because the folder
        # prefix makes the path non-blank and the folder itself exists.
        if name not in configs or not configs[name].strip():
            status[name] = "BINARY UNSPECIFIED"
            continue

        executable = executables_dir + PATHDELIM + configs[name]
        if not path.exists(executable):
            status[name] = "BINARY MISSING"
            continue

        # Trial run of the binary; its exit status and output are not inspected.
        getstatusoutput(' '.join([executable] + binaries[name]))

    if not status:
        return False

    message = "ERROR\tOS Specific executables check failed\n\n"
    message += "\tFOLDER :" + executables_dir + "\n\n"
    message += "\tFIX : Please correct the location for \"OS Specific Executables\" in the Setup tab\n"
    message += "\t : Alternatively, you can update the EXECUTABLES_DIR key in the config file \"config/template_config.txt\"\n\n"
    for name in status:
        message += "\t" + name + " : " + status[name] + "\n"
    return message
def start_pathway_tools_api_mode(pathway_tools_exe):
    """Invoke the Pathway Tools executable with the ``-api`` flag.

    The command line is handed to ``getstatusoutput``; the status and
    output it reports are discarded and nothing is returned.
    """
    api_command = ' '.join([pathway_tools_exe, "-api"])
    getstatusoutput(api_command)
def blast_against_itself(blast_executable, seq_subset_file, blast_table_out):
    """Run BLAST with the subset file serving as both database and query.

    Args:
        blast_executable: command used to start BLAST.
        seq_subset_file: object whose ``name`` attribute is the path of the
            sequence subset.
        blast_table_out: path given to ``-out`` for the tabular
            (``-outfmt 6``) result.

    The status and output of the command are discarded.
    """
    subset_path = seq_subset_file.name
    template = ' '.join(['%s', '-outfmt 6', '-db %s', '-query %s', '-out %s'])
    blast_command = template % (blast_executable, subset_path, subset_path,
                                blast_table_out)
    getstatusoutput(blast_command)
def format_db_last(formatdb_executable, seq_subset_file):
    """Build a LAST database named ``subset_db`` beside the subset file.

    Args:
        formatdb_executable: command used to start the LAST database
            formatter.
        seq_subset_file: object whose ``name`` attribute is the path of the
            sequence subset to index.

    The formatter is run with ``-p -c``; its status and output are
    discarded.
    """
    subset_path = seq_subset_file.name
    # The database prefix lives in the same folder as the subset file.
    db_prefix = os.path.dirname(subset_path) + PATHDELIM + 'subset_db'
    lastdb_command = ' '.join(['%s', '-p', '-c', '%s', '%s']) % (
        formatdb_executable, db_prefix, subset_path)
    getstatusoutput(lastdb_command)
def formatDB(tools, db, refdbspath, seqType, dbType, algorithm, configs, logger = None): """ Formats the sequences for the specified algorithm """ EXECUTABLES_DIR = configs['METAPATHWAYS_PATH'] + PATHDELIM + configs['EXECUTABLES_DIR'] formatdb_executable = '' #EXECUTABLES_DIR + PATHDELIM + tools['FUNC_SEARCH']['exec']['BLAST']['FORMATDB_EXECUTABLE'] if seqType == 'prot' and algorithm=='LAST': formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools['FUNC_SEARCH']['exec']['LAST']['LASTDB_EXECUTABLE'] if seqType =='nucl' or algorithm=='BLAST': if configs['FORMATDB_EXECUTABLE']: formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools['FUNC_SEARCH']['exec']['BLAST']['FORMATDB_EXECUTABLE'] else: formatdb_executable = which('makeblastdb') if formatdb_executable==None: eprintf("ERROR\tCannot find makeblastdb to format \"%s\"\n", db) logger.printf("ERROR\tCannot find makeblastdb to format \"%s\"\n",db ) return False formatted_db = refdbspath + PATHDELIM + dbType + PATHDELIM + 'formatted' + PATHDELIM + db raw_sequence_file = refdbspath + PATHDELIM + dbType + PATHDELIM + db _temp_formatted_db = formatted_db + "__temp__" """ format with 4GB file size """ cmd = "" if seqType =='nucl' or algorithm=='BLAST': cmd='%s -dbtype %s -max_file_sz 2000000000 -in %s -out %s' %(formatdb_executable, seqType, raw_sequence_file, _temp_formatted_db) #cmd='%s -dbtype %s -max_file_sz 20267296 -in %s -out %s' %(formatdb_executable, seqType, raw_sequence_file, _temp_formatted_db) formatted_db_size = 4000000000 if 'FORMATTED_DB_SIZE' in configs and configs['FORMATTED_DB_SIZE'].isdigit(): formatted_db_size = int(configs['FORMATTED_DB_SIZE']) if seqType=='prot' and algorithm=='LAST': # dirname = os.path.dirname(raw_sequence_file) cmd="" if seqType=="prot": cmd='%s -s %s -p -c %s %s' %(formatdb_executable, formatted_db_size, _temp_formatted_db, raw_sequence_file) # if seqType=="nucl": # cmd='%s -s %s -c %s %s' %(formatdb_executable, formatted_db_size, _temp_formatted_db, raw_sequence_file) 
eprintf("INFO\tCommand to format \"%s\"\n", cmd) logger.printf("INFO\tCommand to format \"%s\"\n", cmd) print 'COMMAND: ', cmd result= getstatusoutput(cmd) temp_fileList = glob(_temp_formatted_db + '*') _formatted_db_pal = _temp_formatted_db + ".pal" if algorithm=='BLAST' and path.exists(_formatted_db_pal): try: formatted_db_pal = formatted_db + ".pal" if seqType=="nucl": formatted_db_pal = formatted_db + ".nal" _openpal = open(_formatted_db_pal, 'r') openpal = open(formatted_db_pal, 'w') lines = _openpal.readlines() tempPATT =re.compile(r'__temp__') for line in lines: _result = tempPATT.search(line) modline = line.strip() if _result: modline = re.sub('__temp__','', modline) fprintf(openpal, "%s\n", modline) openpal.close() _openpal.close() remove(_formatted_db_pal) except: return False try: temp_fileList = glob(_temp_formatted_db + '*') for tempFile in temp_fileList: file = re.sub('__temp__','', tempFile) rename(tempFile, file); except: return False if result[0]==0: eprintf("INFO\tFormatted database %s successfully for %s\n", sQuote(db), sQuote(algorithm) ) logger.printf("INFO\tFormatted database %s successfully for %s\n", sQuote(db), sQuote(algorithm) ) return True else: eprintf("INFO\tFailed to Format database %s for %s\n", sQuote(db), sQuote(algorithm) ) eprintf("INFO\tReason for failure %s\n", result[1]) logger.printf("INFO\tReason for failure %s\n", result[1]) logger.printf("INFO\tFailed to Format database %s for %s\n", sQuote(db), sQuote(algorithm) ) return False
def last_against_itself(last_executable, seq_subset_file, last_table_out):
    """Run LAST with the subset file as query against ``subset_db``.

    Args:
        last_executable: command used to start LAST.
        seq_subset_file: object whose ``name`` attribute is the path of the
            sequence subset; the ``subset_db`` database is expected in the
            same folder.
        last_table_out: path given to ``-o`` for the result (``-f 0``).

    The status and output of the command are discarded.
    """
    subset_path = seq_subset_file.name
    db_prefix = os.path.dirname(subset_path) + PATHDELIM + 'subset_db'
    last_command = ' '.join(['%s', '-o', '%s', '-f', '0', '%s', '%s']) % (
        last_executable, last_table_out, db_prefix, subset_path)
    getstatusoutput(last_command)
def checkbinaries(configs):
    """Check that the executables named in *configs* are present on disk.

    This definition comes after another ``checkbinaries`` in the same
    module and therefore replaces it.  Unlike the earlier one, it accepts
    an empty value for the three BLAST-related keys and skips them.

    The executables folder is ``configs["METAPATHWAYS_PATH"]`` followed by
    ``configs["EXECUTABLES_DIR"]``; the placeholder ``"---"`` stands in for
    whichever of the two keys is absent, so the folder check below fails.

    Args:
        configs: mapping of configuration keys to string values.

    Returns:
        A string describing the problem when the executables folder does
        not exist or when at least one binary is unspecified or missing;
        ``False`` when no problem was found.
    """
    executables_dir = "---"
    if "METAPATHWAYS_PATH" in configs:
        executables_dir = configs["METAPATHWAYS_PATH"]
        if "EXECUTABLES_DIR" in configs:
            executables_dir += PATHDELIM + configs["EXECUTABLES_DIR"]
        else:
            executables_dir += PATHDELIM + "---"

    if not path.exists(executables_dir):
        return "ERROR\tMissing executables folder under \'MetaPathways_Python/executables\' it set to " + executables_dir

    # Config key of each binary -> arguments used for a trial invocation.
    binaries = {
        "LASTDB_EXECUTABLE": ["-h"],
        "LAST_EXECUTABLE": ["-h"],
        "FORMATDB_EXECUTABLE": ["-help"],
        "BLASTP_EXECUTABLE": ['-h'],
        "BLASTN_EXECUTABLE": ['-h'],
        "PRODIGAL_EXECUTABLE": ["-h"],
        "SCAN_tRNA_EXECUTABLE": ['-h'],
        "RPKM_EXECUTABLE": ['-h'],
    }
    # Keys whose value may be left empty without being reported.
    may_be_empty = ("FORMATDB_EXECUTABLE", "BLASTP_EXECUTABLE", "BLASTN_EXECUTABLE")

    status = {}
    for name in binaries:
        # Membership is tested before the value is read: reading
        # configs[name] first raised KeyError for an absent optional key
        # instead of reporting it.
        if name not in configs:
            status[name] = "BINARY UNSPECIFIED"
            continue

        configured = configs[name]
        if not configured.strip():
            # Test the configured value, not the joined path: the folder
            # prefix makes the joined path non-blank, so it can never
            # reveal an empty entry.
            if name not in may_be_empty:
                status[name] = "BINARY UNSPECIFIED"
            continue

        executable = executables_dir + PATHDELIM + configured
        if not path.exists(executable):
            status[name] = "BINARY MISSING"
            continue

        # Trial run of the binary; its exit status and output are not inspected.
        getstatusoutput(' '.join([executable] + binaries[name]))

    if not status:
        return False

    message = "ERROR\tOS Specific executables check failed\n\n"
    message += "\tFOLDER :" + executables_dir + "\n\n"
    message += "\tFIX : Please correct the location for \"OS Specific Executables\" in the Setup tab\n"
    message += "\t : Alternatively, you can update the EXECUTABLES_DIR key in the config file \"config/template_config.txt\"\n\n"
    for name in status:
        message += "\t" + name + " : " + status[name] + "\n"
    return message