def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Build the reference index if it is missing, align the reads, return the SAM path.

    Args:
        reads_file: Path to the reads, passed to the mapper with ``-d``.
        reference_file: Path to the reference, passed with ``-r``. The index
            is considered present when both ``<reference>.gmidx`` and
            ``<reference>.gmidxsec`` exist.
        machine_name: Technology name, compared case-insensitively
            ('illumina', 'roche', 'pacbio', 'nanopore' or 'debug'). Any other
            value selects the default parameters.
        output_path: Directory receiving the ``.sam`` and ``.memtime`` files.
        output_suffix: Optional suffix appended to the output file stem.

    Returns:
        Path of the SAM file the mapper was asked to write. Exit codes of the
        external commands are not checked.
    """
    # Half of the cores, but at least one. Floor division keeps the value an
    # integer on Python 3 as well as Python 2.
    num_threads = max(1, multiprocessing.cpu_count() // 2)

    machine = machine_name.lower()
    if machine in ('illumina', 'roche', 'pacbio', 'nanopore'):
        parameters = '-v 5 -t %d -B 0 -b 3' % num_threads
    elif machine == 'debug':
        # Single-threaded, very verbose run restricted by the -y/-n options.
        parameters = '-B 0 -b 3 -F 0.05 -l 9 -A 12 -v 7 -y 31676 -n 1 -t 1'
    else:
        parameters = '-v 5 -t %d' % num_threads

    if output_suffix != '':
        output_filename = '%s-%s' % (MAPPER_NAME, output_suffix)
    else:
        output_filename = MAPPER_NAME

    sam_file = '%s/%s.sam' % (output_path, output_filename)
    memtime_file = '%s/%s.memtime' % (output_path, output_filename)
    memtime_file_index = '%s/%s-index.memtime' % (output_path, output_filename)

    # Run the indexing process, and measure execution time and memory.
    index_exists = (os.path.exists(reference_file + '.gmidx') and
                    os.path.exists(reference_file + '.gmidxsec'))
    if not index_exists:
        sys.stderr.write('[%s wrapper] Generating index...\n' % (MAPPER_NAME))
        command = '%s %s/%s -I -r %s' % (
            basicdefines.measure_command(memtime_file_index), ALIGNER_PATH, BIN, reference_file)
        sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
        subprocess.call(command, shell=True)
        sys.stderr.write('\n\n')
    else:
        sys.stderr.write('[%s wrapper] Reference index already exists. Continuing.\n' % (MAPPER_NAME))
        sys.stderr.flush()

    # Run the alignment process, and measure execution time and memory.
    sys.stderr.write('[%s wrapper] Running %s...\n' % (MAPPER_NAME, MAPPER_NAME))
    command = '%s %s/%s %s -r %s -d %s -o %s' % (
        basicdefines.measure_command(memtime_file), ALIGNER_PATH, BIN,
        parameters, reference_file, reads_file, sam_file)
    sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
    subprocess.call(command, shell=True)
    sys.stderr.write('\n\n')

    sys.stderr.write('[%s wrapper] %s wrapper script finished processing.\n' % (MAPPER_NAME, MAPPER_NAME))

    return sam_file
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Index the reference if needed, run the ``mem`` subcommand, return the SAM path.

    Args:
        reads_file: Path to the reads to align.
        reference_file: Path to the reference. The index is considered
            present when ``<reference>.bwt`` exists.
        machine_name: Technology name, compared case-insensitively. 'pacbio'
            adds ``-x pacbio`` and 'nanopore' adds ``-x ont2d``; every other
            value only sets the thread count.
        output_path: Directory receiving the ``.sam`` and ``.memtime`` files.
        output_suffix: Optional suffix appended to the output file stem.

    Returns:
        Path of the SAM file that captures the mapper's standard output.
        Exit codes of the external commands are not checked.
    """
    # Half of the cores, but at least one. Floor division matters here: with
    # true division Python 3 would format the option as e.g. "-t 4.0".
    num_threads = max(1, multiprocessing.cpu_count() // 2)

    presets = {
        'pacbio': ' -x pacbio',
        'nanopore': ' -x ont2d',
    }
    parameters = ('-t %d' % num_threads) + presets.get(machine_name.lower(), '')

    if output_suffix != '':
        output_filename = '%s-%s' % (MAPPER_NAME, output_suffix)
    else:
        output_filename = MAPPER_NAME

    sam_file = '%s/%s.sam' % (output_path, output_filename)
    memtime_file = '%s/%s.memtime' % (output_path, output_filename)
    memtime_file_index = '%s/%s-index.memtime' % (output_path, output_filename)

    # Run the indexing process, and measure execution time and memory.
    if not os.path.exists(reference_file + '.bwt'):
        sys.stderr.write('[%s wrapper] Generating index...\n' % (MAPPER_NAME))
        command = '%s %s/%s index %s' % (
            basicdefines.measure_command(memtime_file_index), ALIGNER_PATH, BIN, reference_file)
        sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
        subprocess.call(command, shell=True)
        sys.stderr.write('\n\n')
    else:
        sys.stderr.write('[%s wrapper] Reference index already exists. Continuing.\n' % (MAPPER_NAME))
        sys.stderr.flush()

    # Run the alignment process, and measure execution time and memory.
    sys.stderr.write('[%s wrapper] Running %s...\n' % (MAPPER_NAME, MAPPER_NAME))
    command = '%s %s/%s mem %s %s %s > %s' % (
        basicdefines.measure_command(memtime_file), ALIGNER_PATH, BIN,
        parameters, reference_file, reads_file, sam_file)
    sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
    subprocess.call(command, shell=True)
    sys.stderr.write('\n\n')

    sys.stderr.write('[%s wrapper] %s wrapper script finished processing.\n' % (MAPPER_NAME, MAPPER_NAME))

    return sam_file
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Create a config file in ``output_path`` and run the assembler with it.

    The config template (``CONFIG_TEMPLATE_PATH``) is copied to
    ``<output_path>/sd2.config`` and one line naming the reads is appended:
    ``q=<reads>`` for ``.fq``/``.fastq`` files, ``p=<reads>`` for
    ``.fa``/``.fasta`` files.

    Args:
        reads_file: Path to the reads; the extension selects the config key.
        reference_file: Unused; kept for a uniform wrapper signature.
        machine_name: Unused; kept for a uniform wrapper signature.
        output_path: Working directory for the assembler. Receives the
            config, ``ass.log``, ``ass.err`` and the ``.memtime`` file.
        output_suffix: Unused; kept for a uniform wrapper signature.

    Returns:
        None. The exit code of the assembler is not checked.
    """
    # 1. Copy the template and append the reads entry.
    config_path = os.path.join(output_path, 'sd2.config')
    shutil.copy(CONFIG_TEMPLATE_PATH, config_path)
    with open(config_path, 'a') as configfile:
        if reads_file.endswith(('.fq', '.fastq')):
            # Reads file is FASTQ.
            configfile.write('q=%s\n' % reads_file)
        elif reads_file.endswith(('.fa', '.fasta')):
            # Reads file is FASTA.
            configfile.write('p=%s\n' % reads_file)
        else:
            # NOTE(review): the assembler is still started below even though
            # no reads entry was written; confirm whether skipping the run is
            # the intended behaviour for unsupported formats.
            sys.stderr.write('\n[%s wrapper] Unsupported file format (%s)!\n' % (ASSEMBLER_NAME, reads_file))

    # Half of the cores, but at least one (integer on Python 2 and 3).
    num_threads = max(1, multiprocessing.cpu_count() // 2)
    memtime_path = os.path.join(output_path, ASSEMBLER_NAME + '.memtime')
    command = 'cd %s; %s %s all -s %s -p %d -K 127 -R -o graph_prefix 1>ass.log 2>ass.err' % (
        output_path, basicdefines.measure_command(memtime_path), ASSEMBLER_BIN, config_path, num_threads)
    # shell must be the boolean True; the former string 'True' only worked
    # because any non-empty string is truthy.
    subprocess.call(command, shell=True)
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Align the reads against the reference and return the SAM path.

    The thread count is read from the ``NUM_THREADS`` environment variable.
    No separate indexing step is run. The mapper is always invoked with
    ``-ax map-ont`` in addition to the technology-specific parameters.

    Args:
        reads_file: Path to the reads to align.
        reference_file: Path to the reference.
        machine_name: Technology name, compared case-insensitively. 'pacbio',
            'longindel' and 'longindel2' add extra options; every other value
            (including 'illumina', 'roche', 'nanopore' and 'debug') only sets
            the thread count.
        output_path: Directory receiving the ``.sam`` and ``.memtime`` files.
        output_suffix: Optional suffix appended to the output file stem.

    Returns:
        Path of the SAM file that captures the mapper's standard output.
        The exit code of the mapper is not checked.

    Raises:
        KeyError: If ``NUM_THREADS`` is not set in the environment.
    """
    num_threads = os.environ["NUM_THREADS"]

    # Extra options per technology. The 'nanopore' branch previously applied
    # '%' to a format string without a placeholder, which raised TypeError.
    # It now falls through to the plain thread option, since '-ax map-ont'
    # is already part of the command line below.
    extra_options = {
        'pacbio': ' -x pacbio',
        'longindel': ' -x ont2d -w 1200 -d 1200',
        'longindel2': ' -x ont2d -w 5000 -d 5000',
    }
    parameters = ('-t %s' % str(num_threads)) + extra_options.get(machine_name.lower(), '')

    if output_suffix != '':
        output_filename = '%s-%s' % (MAPPER_NAME, output_suffix)
    else:
        output_filename = MAPPER_NAME

    sam_file = '%s/%s.sam' % (output_path, output_filename)
    memtime_file = '%s/%s.memtime' % (output_path, output_filename)

    # Run the alignment process, and measure execution time and memory.
    sys.stderr.write('[%s wrapper] Running %s...\n' % (MAPPER_NAME, MAPPER_NAME))
    command = '%s %s/%s %s -ax map-ont %s %s > %s' % (
        basicdefines.measure_command(memtime_file), ALIGNER_PATH, BIN,
        parameters, reference_file, reads_file, sam_file)
    sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
    subprocess.call(command, shell=True)
    sys.stderr.write('\n\n')

    sys.stderr.write('[%s wrapper] %s wrapper script finished processing.\n' % (MAPPER_NAME, MAPPER_NAME))

    return sam_file
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Run the assembler on the reads, converting FASTQ input to FASTA first.

    IDBA works only on FASTA files. A ``.fq``/``.fastq`` reads file is
    converted with ``FQ2FA_BIN`` into a ``.fa`` file next to the original,
    and that file is assembled instead.

    Args:
        reads_file: Path to the reads (FASTA or FASTQ).
        reference_file: Unused; kept for a uniform wrapper signature.
        machine_name: Unused; kept for a uniform wrapper signature.
        output_path: Output directory passed to the assembler with ``-o``;
            also receives the ``.memtime`` file.
        output_suffix: Unused; kept for a uniform wrapper signature.

    Returns:
        None. Exit codes of the external commands are not checked.
    """
    basename, ext = os.path.splitext(reads_file)
    if ext in ('.fq', '.fastq'):
        fasta_filename = basename + '.fa'
        command = '%s %s %s' % (FQ2FA_BIN, reads_file, fasta_filename)
        # shell must be the boolean True, not the (merely truthy) string 'True'.
        subprocess.call(command, shell=True)
        # Use the created FASTA file as the reads file from now on.
        reads_file = fasta_filename

    # Half of the cores, but at least one (integer on Python 2 and 3).
    num_threads = max(1, multiprocessing.cpu_count() // 2)
    memtime_path = os.path.join(output_path, ASSEMBLER_NAME + '.memtime')
    command = '%s %s --num_threads %d -r %s -o %s' % (
        basicdefines.measure_command(memtime_path), ASSEMBLER_BIN, num_threads, reads_file, output_path)
    subprocess.call(command, shell=True)
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Run the assembler with a genome-size estimate taken from the reference.

    Sparse also runs only on FASTA input and the parameters are hard-coded.
    The same parameter set is used for every technology listed in
    ``basicdefines.TECH``; any other ``machine_name`` is skipped with a
    message on stderr.

    TODO: if a FASTQ file is given, convert it to FASTA.

    Args:
        reads_file: Path to the reads, passed after the ``f`` keyword.
        reference_file: Path to the reference. The length of its first
            sequence (``read_fastq(...)[1][0]``) is the genome size.
        machine_name: Must be a member of ``basicdefines.TECH``.
        output_path: Working directory for the assembler; also receives the
            ``.memtime`` file.
        output_suffix: Unused; kept for a uniform wrapper signature.

    Returns:
        None. The exit code of the assembler is not checked.
    """
    if machine_name not in basicdefines.TECH:
        # '\\}' emits the same two characters the original '\}' produced,
        # without relying on an invalid escape sequence.
        sys.stderr.write('\\}\nInvalid machine_name parameter for assembler %s' % ASSEMBLER_NAME)
        sys.stderr.write('\nSkipping ....')
        return

    # Half of the cores, but at least one (integer on Python 2 and 3).
    num_threads = max(1, multiprocessing.cpu_count() // 2)

    # Genome size is the length of the first reference sequence. The old
    # 60000000 starting value was always overwritten and has been dropped.
    reference_fastq = fastqparser.read_fastq(reference_file)
    genomesize = len(reference_fastq[1][0])

    memtime_path = os.path.join(output_path, ASSEMBLER_NAME + '.memtime')
    # GS is given as ten times the measured reference length.
    command = 'cd %s; %s %s -t %d k 21 GS %d f %s' % (
        output_path, basicdefines.measure_command(memtime_path), ASSEMBLER_BIN,
        num_threads, 10 * genomesize, reads_file)
    # shell must be the boolean True, not the (merely truthy) string 'True'.
    subprocess.call(command, shell=True)
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Run the assembler through ``mpiexec`` and move its memtime file into place.

    Ray creates its own output folder, which does not exist yet when the
    measuring tool starts, so the ``.memtime`` file is first written to the
    parent of ``output_path`` and moved into ``output_path`` afterwards.

    Args:
        reads_file: Path to the reads, passed with ``-s``.
        reference_file: Unused; kept for a uniform wrapper signature.
        machine_name: Unused; kept for a uniform wrapper signature.
        output_path: Output directory passed to the assembler with ``-o``.
        output_suffix: Unused; kept for a uniform wrapper signature.

    Returns:
        None. The exit code of the assembler is not checked.
    """
    # Half of the cores, but at least one: the old "cpu_count() / 2" asked
    # mpiexec for zero processes on a single-core host.
    num_threads = max(1, multiprocessing.cpu_count() // 2)

    memtime_path = os.path.join(output_path, ASSEMBLER_NAME + '.memtime')
    parent_dir = os.path.dirname(output_path.rstrip('/'))
    tmp_memtime_path = os.path.join(parent_dir, ASSEMBLER_NAME + '.memtime')

    command = '%s mpiexec -n %d %s -s %s -o %s' % (
        basicdefines.measure_command(tmp_memtime_path), num_threads, ASSEMBLER_BIN, reads_file, output_path)
    # shell must be the boolean True, not the (merely truthy) string 'True'.
    subprocess.call(command, shell=True)

    # After Ray has finished, move the memtime file to the Ray output folder.
    shutil.move(tmp_memtime_path, memtime_path)
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Build the reference DB if needed, align to MAF, convert to SAM, return the SAM path.

    Args:
        reads_file: Path to the reads. A name ending in 'q' is treated as
            FASTQ and converted to a FASTA file whose name ends in 'a'.
        reference_file: Path to the reference. The database is created as
            ``<reference>.db`` and considered present when
            ``<reference>.db.suf`` exists.
        machine_name: Technology name, compared case-insensitively. 'pacbio'
            and 'nanopore' use ``-v -q 1 -r 1 -a 1 -b 1``; every other value
            uses ``-v``.
        output_path: Directory receiving the ``.maf``, ``.sam`` and
            ``.memtime`` files.
        output_suffix: Optional suffix appended to the output file stem.

    Returns:
        Path of the SAM file. Exit codes of the external commands are not
        checked.
    """
    if machine_name.lower() in ('pacbio', 'nanopore'):
        parameters = '-v -q 1 -r 1 -a 1 -b 1'
    else:
        parameters = '-v '

    if output_suffix != '':
        output_filename = '%s-%s' % (MAPPER_NAME, output_suffix)
    else:
        output_filename = MAPPER_NAME

    maf_file = '%s/%s.maf' % (output_path, output_filename)
    sam_file = '%s/%s.sam' % (output_path, output_filename)
    memtime_file = '%s/%s.memtime' % (output_path, output_filename)
    memtime_file_index = '%s/%s-index.memtime' % (output_path, output_filename)
    memtime_file_maftosam = '%s/%s-maftosam.memtime' % (output_path, output_filename)
    reference_db_file = reference_file + '.db'

    # Check if the given input file is a FASTA or FASTQ, and convert to FASTA
    # if necessary. endswith() avoids an IndexError on an empty path.
    reads_fasta = reads_file
    if reads_file.endswith('q'):
        sys.stderr.write('[%s wrapper] Converting FASTQ to FASTA...\n' % (MAPPER_NAME))
        reads_fasta = reads_file[0:-1] + 'a'
        fastqparser.convert_to_fasta(reads_file, reads_fasta)
        sys.stderr.write('\n')

    # Run the indexing process, and measure execution time and memory.
    if not os.path.exists(reference_db_file + '.suf'):
        sys.stderr.write('[%s wrapper] Generating index...\n' % (MAPPER_NAME))
        command = '%s %s/lastdb %s %s' % (
            basicdefines.measure_command(memtime_file_index), ALIGNER_PATH, reference_db_file, reference_file)
        sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
        subprocess.call(command, shell=True)
        sys.stderr.write('\n\n')
    else:
        sys.stderr.write('[%s wrapper] Reference DB already exists. Continuing.\n' % (MAPPER_NAME))
        sys.stderr.flush()

    # Run the alignment process, and measure execution time and memory.
    sys.stderr.write('[%s wrapper] Running %s...\n' % (MAPPER_NAME, MAPPER_NAME))
    command = '%s %s/%s %s %s %s > %s' % (
        basicdefines.measure_command(memtime_file), ALIGNER_PATH, BIN,
        parameters, reference_db_file, reads_fasta, maf_file)
    sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
    subprocess.call(command, shell=True)
    sys.stderr.write('\n\n')

    # Convert the MAF output to SAM: write the header first, then let
    # maf-convert append the alignments to the same file.
    sys.stderr.write('[%s wrapper] Converting the output MAF to SAM file...\n' % (MAPPER_NAME))
    with open(sam_file, 'w') as fp:
        # The context manager closes the file even if header creation fails.
        fp.write(get_sam_header(reference_file))
    command = '%s %s/../scripts/maf-convert sam %s >> %s' % (
        basicdefines.measure_command(memtime_file_maftosam), ALIGNER_PATH, maf_file, sam_file)
    sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
    subprocess.call(command, shell=True)
    sys.stderr.write('\n\n')

    sys.stderr.write('[%s wrapper] %s wrapper script finished processing.\n' % (MAPPER_NAME, MAPPER_NAME))

    return sam_file
def measure_command(measure_file):
    """Return the command prefix that records time and memory into ``measure_file``.

    Delegates to ``basicdefines.measure_command`` when ``MODULE_BASICDEFINES``
    equals True and ``CGMEMTIME_FILE`` exists on disk. Otherwise returns a
    ``/usr/bin/time`` invocation that writes its report to ``measure_file``.
    """
    cgmemtime_available = (MODULE_BASICDEFINES == True and os.path.exists(CGMEMTIME_FILE))
    if not cgmemtime_available:
        # Fallback: plain /usr/bin/time; CPU time is reported as a fixed -1.0.
        time_template = '/usr/bin/time --format "Command line: %%C\\nReal time: %%e s\\nCPU time: -1.0 s\\nUser time: %%U s\\nSystem time: %%S s\\nMaximum RSS: %%M kB\\nExit status: %%x" --quiet -o %s '
        return time_template % measure_file
    return basicdefines.measure_command(measure_file)
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Build the reference index if it is missing, align the reads, return the SAM path.

    Args:
        reads_file: Path to the reads, passed to the mapper with ``-d``.
        reference_file: Path to the reference, passed with ``-r``. The index
            is considered present when both ``<reference>.gmidx`` and
            ``<reference>.gmidxsec`` exist.
        machine_name: Parameter preset name, compared case-insensitively
            (see ``parameter_templates`` below). Unknown names select the
            default ``-v 5 -t <threads>``.
        output_path: Directory receiving the ``.sam`` and ``.memtime`` files.
        output_suffix: Optional suffix appended to the output file stem.

    Returns:
        Path of the SAM file the mapper was asked to write. Exit codes of the
        external commands are not checked.
    """
    # Half of the cores, but at least one (integer on Python 2 and 3).
    num_threads = max(1, multiprocessing.cpu_count() // 2)

    # Preset name -> parameter template; '%d' receives the thread count.
    parameter_templates = {
        'illumina': '-x illumina -v 5 -t %d -B 0',
        'roche': '-x illumina -v 5 -t %d -B 0',
        'pacbio': '-v 5 -t %d -B 0',
        'nanopore': '-v 5 -t %d -B 0',
        'nanoporecirc': '-v 5 -t %d -C -B 0',
        'myers': '-a myers -v 5 -t %d -B 0',
        'gotoh': '-a gotoh -v 5 -t %d -B 0',
        'anchor': '-a anchor -v 5 -t %d -B 0',
        'anchorcirc': '-a anchor -C -v 5 -t %d -B 0',
        'anchorgotoh': '-a anchorgotoh -v 5 -t %d -B 0',
        'metagen': '-v 5 -t %d -C -B 0 -Z',
        'metagenanchor': '-a anchor -v 5 -t %d -C -B 0 -Z',
    }
    parameters = parameter_templates.get(machine_name.lower(), '-v 5 -t %d') % num_threads

    if output_suffix != '':
        output_filename = '%s-%s' % (MAPPER_NAME, output_suffix)
    else:
        output_filename = MAPPER_NAME

    sam_file = '%s/%s.sam' % (output_path, output_filename)
    memtime_file = '%s/%s.memtime' % (output_path, output_filename)
    memtime_file_index = '%s/%s-index.memtime' % (output_path, output_filename)

    # Run the indexing process, and measure execution time and memory.
    index_exists = (os.path.exists(reference_file + '.gmidx') and
                    os.path.exists(reference_file + '.gmidxsec'))
    if not index_exists:
        sys.stderr.write('[%s wrapper] Generating index...\n' % (MAPPER_NAME))
        command = '%s %s/%s -I -r %s' % (
            basicdefines.measure_command(memtime_file_index), ALIGNER_PATH, BIN, reference_file)
        sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
        subprocess.call(command, shell=True)
        sys.stderr.write('\n\n')
    else:
        sys.stderr.write('[%s wrapper] Reference index already exists. Continuing.\n' % (MAPPER_NAME))
        sys.stderr.flush()

    # Run the alignment process, and measure execution time and memory.
    sys.stderr.write('[%s wrapper] Running %s...\n' % (MAPPER_NAME, MAPPER_NAME))
    command = '%s %s/%s %s -r %s -d %s -o %s' % (
        basicdefines.measure_command(memtime_file), ALIGNER_PATH, BIN,
        parameters, reference_file, reads_file, sam_file)
    sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
    subprocess.call(command, shell=True)
    sys.stderr.write('\n\n')

    sys.stderr.write('[%s wrapper] %s wrapper script finished processing.\n' % (MAPPER_NAME, MAPPER_NAME))

    return sam_file
def measure_command_wrapper(out_filename):
    """Return the command prefix that records time and memory into ``out_filename``.

    Uses ``basicdefines.measure_command`` when ``USE_BASICDEFINES_`` equals
    True; otherwise builds a ``/usr/bin/time`` invocation that writes its
    report to ``out_filename``.
    """
    if USE_BASICDEFINES_ == True:
        return basicdefines.measure_command(out_filename)

    # Fallback: plain /usr/bin/time; CPU time is reported as a fixed -1.0.
    fallback_template = '/usr/bin/time --format "Command line: %%C\\nReal time: %%e s\\nCPU time: -1.0 s\\nUser time: %%U s\\nSystem time: %%S s\\nMaximum RSS: %%M kB\\nExit status: %%x" --quiet -o %s '
    return fallback_template % out_filename
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Run the seven-step SGA assembly pipeline and write a summary ``.memtime`` file.

    The pipeline is based on the sga-celegans example, because that example
    used a PacBio dataset. Error correction and read filtering from that
    example are intentionally not run. Many parameters are hard-coded.
    TODO: infer some of them from the reads file or let the user set them.

    Every step runs as ``cd <output_path>; <measure> <ASSEMBLER_BIN> ...``
    because the working directory is not preserved across separate shell
    commands. Step ``i`` is measured into
    ``<output_path>/<ASSEMBLER_NAME>_<i>.memtime``. Afterwards the per-step
    files are combined into ``<output_path>/<ASSEMBLER_NAME>.memtime``: the
    times are summed and the largest maximum-RSS value is kept.

    Args:
        reads_file: Path to the reads.
        reference_file: Unused; kept for a uniform wrapper signature.
        machine_name: Unused; kept for a uniform wrapper signature.
        output_path: Working directory for all steps and for the memtime files.
        output_suffix: Unused; kept for a uniform wrapper signature.

    Returns:
        None. Exit codes of the external commands are not checked.
    """
    # Half of the cores, but at least one (integer on Python 2 and 3).
    num_threads = max(1, multiprocessing.cpu_count() // 2)

    # Sub-command lines in execution order; position + 1 is the step number
    # used in the per-step memtime file name.
    steps = [
        # 1. Preprocess
        'preprocess -o sgaccs.fasta %s' % reads_file,
        # 3a. Index data
        'index -a ropebwt -t %d --no-reverse %s' % (num_threads, reads_file),
        # 3c. Merge simple, unbranched chains of vertices
        'fm-merge -m 30 -t %d -o merged.k21.fa %s' % (num_threads, reads_file),
        # 3d. Build an index of the merged sequences
        'index -d 1000000 -t %d merged.k21.fa' % num_threads,
        # 3e. Remove any substrings that were generated from the merge process
        'rmdup -t %d merged.k21.fa' % num_threads,
        # 3f. Compute the structure of the string graph
        'overlap -m 30 -t %d merged.k21.rmdup.fa' % num_threads,
        # 3g. Perform the contig assembly without bubble popping
        'assemble -m 30 -o assemble merged.k21.rmdup.asqg.gz',
    ]

    step_memtime_paths = []
    for step_number, arguments in enumerate(steps, 1):
        step_memtime_path = os.path.join(
            output_path, '%s_%d.memtime' % (ASSEMBLER_NAME, step_number))
        step_memtime_paths.append(step_memtime_path)
        command = 'cd %s; %s %s %s' % (
            output_path, basicdefines.measure_command(step_memtime_path), ASSEMBLER_BIN, arguments)
        # shell must be the boolean True, not the (merely truthy) string 'True'.
        subprocess.call(command, shell=True)

    # Calculate the memtime summary and write it to a file.
    memtime_path = os.path.join(output_path, ASSEMBLER_NAME + '.memtime')
    real_time = 0.0
    cpu_time = 0.0
    user_time = 0.0
    system_time = 0.0
    max_rss = 0
    with open(memtime_path, 'w') as fmemtime:
        fmemtime.write('SGA summary .memtime file (%s)' % output_path)
        for step_memtime_path in step_memtime_paths:
            with open(step_memtime_path, 'r') as tfmemtime:
                # The first line is skipped. On each following line the
                # value is the third whitespace-separated token.
                tfmemtime.readline()
                real_time += float(tfmemtime.readline().split()[2])
                cpu_time += float(tfmemtime.readline().split()[2])
                user_time += float(tfmemtime.readline().split()[2])
                system_time += float(tfmemtime.readline().split()[2])
                max_rss = max(max_rss, int(tfmemtime.readline().split()[2]))

        # Writing to the summary .memtime file.
        # NOTE(review): the value is labelled MB here although it is copied
        # unchanged from the per-step files; confirm the unit.
        fmemtime.write('\nReal time: %.3f s' % real_time)
        fmemtime.write('\nCPU time: %.3f s' % cpu_time)
        fmemtime.write('\nUser time: %.3f s' % user_time)
        fmemtime.write('\nSystem time: %.3f s' % system_time)
        fmemtime.write('\nMaximum RSS: %d MB' % max_rss)
def measure_command_wrapper(out_filename):
    """Build the measuring prefix that is put in front of an external command.

    When ``USE_BASICDEFINES_`` equals True the prefix comes from
    ``basicdefines.measure_command``. Otherwise it is a ``/usr/bin/time``
    call that writes its report to ``out_filename``.
    """
    delegate = (USE_BASICDEFINES_ == True)
    if not delegate:
        # Fallback: plain /usr/bin/time; CPU time is reported as a fixed -1.0.
        return '/usr/bin/time --format "Command line: %%C\\nReal time: %%e s\\nCPU time: -1.0 s\\nUser time: %%U s\\nSystem time: %%S s\\nMaximum RSS: %%M kB\\nExit status: %%x" --quiet -o %s ' % out_filename
    return basicdefines.measure_command(out_filename)
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Build the BLAST database if needed, align, filter, convert to SAM, return the SAM path.

    Steps: ``makeblastdb`` (only when ``<reference>-blastdb.nsq`` is
    missing), the aligner with ``-task blastn`` and tabular output
    (``-outfmt "6 <outfmt>"``, where ``outfmt`` is a module-level name),
    ``filterblastout`` to keep one alignment per read, and finally
    ``convert_blast_to_sam``.

    Args:
        reads_file: Path to the query reads.
        reference_file: Path to the reference; the database prefix is
            ``<reference>-blastdb``.
        machine_name: Technology name, compared case-insensitively. 'pacbio'
            and 'nanopore' add scoring options; every other value only sets
            the thread count.
        output_path: Directory receiving the ``.out``, ``-filtered.out``,
            ``.sam`` and ``.memtime`` files.
        output_suffix: Optional suffix appended to the output file stem.

    Returns:
        Path of the SAM file. Exit codes of the external commands are not
        checked.
    """
    # Half of the cores, but at least one. Floor division matters here: with
    # true division Python 3 would format the option as "-num_threads 4.0".
    num_threads = max(1, multiprocessing.cpu_count() // 2)

    threads_option = '-num_threads %d' % num_threads
    if machine_name.lower() in ('pacbio', 'nanopore'):
        # Scoring parameters used in the paper "Oxford Nanopore Sequencing
        # and de novo Assembly of a Eukaryotic Genome", Supplemental Notes
        # and Figures:
        # http://biorxiv.org/content/biorxiv/suppl/2015/01/06/013490.DC1/013490-1.pdf
        # Quote: "Overall accuracy was calculated by aligning the raw Oxford
        # Nanopore reads to the W303 pacbio assembly using Blast version
        # 2.2.27+ with the following parameters:"
        parameters = '-reward 5 -penalty -4 -gapopen 8 -gapextend 6 -dust no ' + threads_option
    else:
        parameters = threads_option

    # Background (http://www.kenkraaijeveld.nl/genomics/bioinformatics/):
    #   $ makeblastdb -in [path to contigs.fa] -dbtype nucl -out [path to output directory]
    #   $ blastn -query [path to file with sequence of interest] -task blastn -db [path to your database] -out [path to output directory] -num_threads 8

    if output_suffix != '':
        output_filename = '%s-%s' % (MAPPER_NAME, output_suffix)
    else:
        output_filename = MAPPER_NAME

    sam_file = '%s/%s.sam' % (output_path, output_filename)
    out_file = '%s/%s.out' % (output_path, output_filename)
    filtered_out_file = '%s/%s-filtered.out' % (output_path, output_filename)
    out_db_path = '%s-blastdb' % (reference_file)
    memtime_file = '%s/%s.memtime' % (output_path, output_filename)
    memtime_file_index = '%s/%s-index.memtime' % (output_path, output_filename)

    # Run the indexing process, and measure execution time and memory.
    if not os.path.exists(out_db_path + '.nsq'):
        sys.stderr.write('[%s wrapper] Generating index...\n' % (MAPPER_NAME))
        command = '%s %s/makeblastdb -in %s -dbtype nucl -out %s' % (
            basicdefines.measure_command(memtime_file_index), ALIGNER_PATH, reference_file, out_db_path)
        sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
        # This call had been commented out, so the database announced above
        # was never built and the alignment below had nothing to search.
        subprocess.call(command, shell=True)
        sys.stderr.write('\n\n')
    else:
        sys.stderr.write('[%s wrapper] Reference index already exists. Continuing.\n' % (MAPPER_NAME))
        sys.stderr.flush()

    # Run the alignment process, and measure execution time and memory.
    sys.stderr.write('[%s wrapper] Running %s...\n' % (MAPPER_NAME, MAPPER_NAME))
    command = '%s %s/%s -task blastn -db %s -query %s -out %s %s -outfmt "6 %s"' % (
        basicdefines.measure_command(memtime_file), ALIGNER_PATH, BIN,
        out_db_path, reads_file, out_file, parameters, outfmt)
    sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
    subprocess.call(command, shell=True)
    sys.stderr.write('\n\n')

    # Filter the BLAST out file and extract only one alignment per read (the
    # one with highest alignment score).
    sys.stderr.write('[%s wrapper] Filtering BLAST output...\n' % (MAPPER_NAME))
    command = '%s/filterblastout/bin/filterblastout %s > %s' % (SCRIPT_PATH, out_file, filtered_out_file)
    sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
    subprocess.call(command, shell=True)
    sys.stderr.write('\n\n')

    convert_blast_to_sam(reference_file, reads_file, filtered_out_file, sam_file)

    sys.stderr.write('[%s wrapper] %s wrapper script finished processing.\n' % (MAPPER_NAME, MAPPER_NAME))

    return sam_file
def run(reads_file, reference_file, machine_name, output_path, output_suffix=''):
    """Build the suffix array if needed, align the reads, return the output path.

    Args:
        reads_file: Path to the reads to align.
        reference_file: Path to the reference. The suffix array is written
            to, and looked up at, ``<reference>.blasrsa``.
        machine_name: Parameter preset name, compared case-insensitively:
            'illumina'/'roche' add ``-minMatch 7``, 'longindel' adds
            ``-clipping none``, 'pacbiom4' uses ``-m 4`` instead of ``-sam``.
            Every other value uses ``-sam -bestn 1``.
        output_path: Directory receiving the output and ``.memtime`` files.
        output_suffix: Optional suffix appended to the output file stem.

    Returns:
        Path passed to the aligner with ``-out``. It always carries a
        ``.sam`` extension, including for the 'pacbiom4' preset, which does
        not pass ``-sam``. Exit codes of the external commands are not
        checked.
    """
    # Half of the cores, but at least one. Floor division matters here: with
    # true division Python 3 would format the option as "-nproc 4.0".
    num_threads = max(1, multiprocessing.cpu_count() // 2)

    # Preset name -> parameter template; '%d' receives the thread count.
    # -clipping [none|hard|subread|soft] (none): use no/hard/subread/soft
    # clipping for SAM output.
    parameter_templates = {
        'illumina': '-nproc %d -sam -bestn 1 -minMatch 7',
        'roche': '-nproc %d -sam -bestn 1 -minMatch 7',
        'longindel': '-nproc %d -sam -bestn 1 -clipping none',
        'pacbiom4': '-nproc %d -bestn 1 -m 4',
    }
    default_template = '-nproc %d -sam -bestn 1'
    parameters = parameter_templates.get(machine_name.lower(), default_template) % num_threads

    if output_suffix != '':
        output_filename = '%s-%s' % (MAPPER_NAME, output_suffix)
    else:
        output_filename = MAPPER_NAME

    sam_file = '%s/%s.sam' % (output_path, output_filename)
    memtime_file = '%s/%s.memtime' % (output_path, output_filename)
    memtime_file_index = '%s/%s-index.memtime' % (output_path, output_filename)

    # Run the indexing process, and measure execution time and memory.
    if not os.path.exists(reference_file + '.blasrsa'):
        sys.stderr.write('[%s wrapper] Generating index...\n' % (MAPPER_NAME))
        command = '%s %s/alignment/bin/sawriter %s.blasrsa %s' % (
            basicdefines.measure_command(memtime_file_index), ALIGNER_PATH, reference_file, reference_file)
        sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
        subprocess.call(command, shell=True)
        sys.stderr.write('\n\n')
    else:
        sys.stderr.write('[%s wrapper] Reference index already exists. Continuing.\n' % (MAPPER_NAME))
        sys.stderr.flush()

    # Run the alignment process, and measure execution time and memory.
    sys.stderr.write('[%s wrapper] Running %s...\n' % (MAPPER_NAME, MAPPER_NAME))
    command = '%s %s/%s %s %s %s -sa %s.blasrsa -out %s' % (
        basicdefines.measure_command(memtime_file), ALIGNER_PATH, BIN,
        reads_file, reference_file, parameters, reference_file, sam_file)
    sys.stderr.write('[%s wrapper] %s\n' % (MAPPER_NAME, command))
    subprocess.call(command, shell=True)
    sys.stderr.write('\n\n')

    sys.stderr.write('[%s wrapper] %s wrapper script finished processing.\n' % (MAPPER_NAME, MAPPER_NAME))

    return sam_file