def grep_count(file_path, to_match, additional_flags=None, fixed_mode=True, starts_with=False):
    '''
        Count the lines of a file that match a string, using grep for speed.

        file_path: path of the file to search. If it is not a regular file,
            or the file is zero bytes long, 0 is returned without running grep.
        to_match: the string to search for.
        additional_flags: optional list of extra grep flags, inserted right
            after "-c".
        fixed_mode: when True (and starts_with is False), "-F" is passed so
            that to_match is treated as a fixed string rather than a regex.
        starts_with: when True, the pattern becomes "^" + to_match so only
            lines beginning with to_match are counted. "-F" is never used in
            this mode, and to_match is handed to grep unescaped as a regex.

        Returns the number reported by "grep -c" as an int (0 when nothing
        matched).
    '''
    if not os.path.isfile(file_path) or os.path.getsize(file_path) == 0:
        return 0

    env = os.environ.copy()
    env['LC_ALL'] = 'C'  # use C locale rather than UTF8 for faster grep

    # '-c' returns the match count
    cmd = ["grep", "-c"]
    if additional_flags:
        cmd.extend(additional_flags)

    # fixed mode cannot be used with starts_with, since it does not match regular expressions
    # only add the fixed_mode flag if we're not using starts_with
    if starts_with:
        cmd.append("^" + to_match)
    else:
        if fixed_mode:
            cmd.append("-F")
        cmd.append(to_match)

    cmd.append(file_path)

    try:
        result = util.misc.run_and_print(cmd, silent=False, check=True, env=env)
    except subprocess.CalledProcessError as e:
        # grep exits with status 1 when no line was selected; that is a valid
        # count of zero, not a failure. Status 2 and above are real errors.
        # NOTE(review): assumes run_and_print raises CalledProcessError when
        # check=True and the command exits non-zero -- confirm against util.misc.
        if e.returncode == 1:
            return 0
        raise

    return int(result.stdout.decode("utf-8").strip())
def lastal_chunked_fastq(
    inFastq,
    db,
    outFastq,
    max_gapless_alignments_per_position=1,
    min_length_for_initial_matches=5,
    max_length_for_initial_matches=50,
    max_initial_matches_per_position=100,
    chunk_size=100000
):
    '''
        Run lastal against db on inFastq in chunks and write the filtered
        reads to outFastq.

        The input is read as FASTQ and split into batches of up to chunk_size
        records. Each batch is written to a temp file and passed through
        lastal -> maf-sort -> maf-convert (tab) -> noBlastLikeHits.py
        ("-m hit"); the per-chunk outputs are then concatenated into outFastq.

        inFastq: path to the input FASTQ file.
        db: lastal database argument, passed to lastal unchanged.
        outFastq: path of the concatenated, filtered FASTQ output.
        max_gapless_alignments_per_position: value for lastal "-n".
        min_length_for_initial_matches: value for lastal "-l".
        max_length_for_initial_matches: value for lastal "-L".
        max_initial_matches_per_position: value for lastal "-m".
        chunk_size: maximum number of reads per chunk.

        Raises subprocess.CalledProcessError if maf-sort, maf-convert or
        noBlastLikeHits.py exits non-zero (via subprocess.check_call).
    '''
    # NOTE(review): this file defines lastal_chunked_fastq more than once; the
    # definition that appears last is the one bound at import time.
    lastal_path = tools.last.Lastal().install_and_get_path()
    mafsort_path = tools.last.MafSort().install_and_get_path()
    mafconvert_path = tools.last.MafConvert().install_and_get_path()
    no_blast_like_hits_path = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py')

    # the lastal options are the same for every chunk, so build them once
    lastal_base_cmd = [
        str(x) for x in [
            lastal_path, '-Q1', '-P0',
            '-n', max_gapless_alignments_per_position,
            '-l', min_length_for_initial_matches,
            '-L', max_length_for_initial_matches,
            '-m', max_initial_matches_per_position
        ]
    ]

    filtered_fastq_files = []
    with open(inFastq, "rt") as fastqFile:
        record_iter = SeqIO.parse(fastqFile, "fastq")
        for batch in util.misc.batch_iterator(record_iter, chunk_size):

            chunk_fastq = mkstempfname('.fastq')
            with open(chunk_fastq, "wt") as handle:
                SeqIO.write(batch, handle, "fastq")
            batch = None  # drop the reference so the records can be freed

            lastal_out = mkstempfname('.lastal')
            with open(lastal_out, 'wt') as outf:
                cmd = lastal_base_cmd + [db, chunk_fastq]
                log.debug(' '.join(cmd) + ' > ' + lastal_out)
                util.misc.run_and_save(cmd, outf=outf)
            # everything below this point in this method should be replaced with
            # our own code that just reads lastal output and makes a list of read names

            mafsort_out = mkstempfname('.mafsort')
            with open(mafsort_out, 'wt') as outf:
                with open(lastal_out, 'rt') as inf:
                    cmd = [mafsort_path, '-n2']
                    log.debug('cat ' + lastal_out + ' | ' + ' '.join(cmd) + ' > ' + mafsort_out)
                    subprocess.check_call(cmd, stdin=inf, stdout=outf)
            os.unlink(lastal_out)

            mafconvert_out = mkstempfname('.mafconvert')
            with open(mafconvert_out, 'wt') as outf:
                cmd = ["python", mafconvert_path, 'tab', mafsort_out]
                log.debug(' '.join(cmd) + ' > ' + mafconvert_out)
                subprocess.check_call(cmd, stdout=outf)
            os.unlink(mafsort_out)

            filtered_fastq_chunk = mkstempfname('.filtered.fastq')
            with open(filtered_fastq_chunk, 'wt') as outf:
                cmd = [no_blast_like_hits_path, '-b', mafconvert_out, '-r', chunk_fastq, '-m', 'hit']
                log.debug(' '.join(cmd) + ' > ' + filtered_fastq_chunk)
                subprocess.check_call(cmd, stdout=outf)
                filtered_fastq_files.append(filtered_fastq_chunk)
            os.unlink(mafconvert_out)
            # the unfiltered chunk is not read again after this point; remove
            # it so one temp FASTQ per chunk does not accumulate on disk
            os.unlink(chunk_fastq)

    # concatenate filtered fastq files to outFastq
    util.file.concat(filtered_fastq_files, outFastq)

    # remove temp fastq files
    for tempfastq in filtered_fastq_files:
        os.unlink(tempfastq)
def lastal_chunked_fastq(inFastq,
                         db,
                         outFastq,
                         max_gapless_alignments_per_position=1,
                         min_length_for_initial_matches=5,
                         max_length_for_initial_matches=50,
                         max_initial_matches_per_position=100,
                         chunk_size=100000):
    '''
        Filter a FASTQ file with lastal, processing it in fixed-size chunks.

        Reads are parsed from inFastq and grouped into batches of at most
        chunk_size records. For each batch the pipeline is:
          1. write the batch to a temp FASTQ file
          2. run lastal against db, saving its output
          3. pipe that output through maf-sort ("-n2")
          4. convert the sorted output with maf-convert ("tab")
          5. run noBlastLikeHits.py ("-m hit") on the converted hits and the
             chunk's reads to produce the filtered chunk
        The filtered chunks are concatenated into outFastq and then deleted.

        inFastq: path to the input FASTQ file.
        db: lastal database argument, passed to lastal unchanged.
        outFastq: path of the concatenated, filtered FASTQ output.
        max_gapless_alignments_per_position: value for lastal "-n".
        min_length_for_initial_matches: value for lastal "-l".
        max_length_for_initial_matches: value for lastal "-L".
        max_initial_matches_per_position: value for lastal "-m".
        chunk_size: maximum number of reads per chunk.

        Raises subprocess.CalledProcessError if maf-sort, maf-convert or
        noBlastLikeHits.py exits non-zero (via subprocess.check_call).
    '''
    # NOTE(review): an earlier definition of lastal_chunked_fastq with the
    # same signature exists in this file; this later one overrides it.
    lastal_path = tools.last.Lastal().install_and_get_path()
    mafsort_path = tools.last.MafSort().install_and_get_path()
    mafconvert_path = tools.last.MafConvert().install_and_get_path()
    no_blast_like_hits_path = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py')

    # lastal options do not change between chunks; stringify them once
    lastal_options = [
        lastal_path, '-Q1', '-P0',
        '-n', max_gapless_alignments_per_position,
        '-l', min_length_for_initial_matches,
        '-L', max_length_for_initial_matches,
        '-m', max_initial_matches_per_position
    ]
    lastal_options = [str(x) for x in lastal_options]

    filtered_fastq_files = []
    with open(inFastq, "rt") as fastqFile:
        record_iter = SeqIO.parse(fastqFile, "fastq")
        for batch in util.misc.batch_iterator(record_iter, chunk_size):

            chunk_fastq = mkstempfname('.fastq')
            with open(chunk_fastq, "wt") as handle:
                SeqIO.write(batch, handle, "fastq")
            batch = None  # release the parsed records before running the tools

            lastal_out = mkstempfname('.lastal')
            with open(lastal_out, 'wt') as outf:
                cmd = lastal_options + [db, chunk_fastq]
                log.debug(' '.join(cmd) + ' > ' + lastal_out)
                util.misc.run_and_save(cmd, outf=outf)
            # everything below this point in this method should be replaced with
            # our own code that just reads lastal output and makes a list of read names

            mafsort_out = mkstempfname('.mafsort')
            with open(mafsort_out, 'wt') as outf:
                with open(lastal_out, 'rt') as inf:
                    cmd = [mafsort_path, '-n2']
                    log.debug('cat ' + lastal_out + ' | ' + ' '.join(cmd) + ' > ' + mafsort_out)
                    subprocess.check_call(cmd, stdin=inf, stdout=outf)
            os.unlink(lastal_out)

            mafconvert_out = mkstempfname('.mafconvert')
            with open(mafconvert_out, 'wt') as outf:
                cmd = ["python", mafconvert_path, 'tab', mafsort_out]
                log.debug(' '.join(cmd) + ' > ' + mafconvert_out)
                subprocess.check_call(cmd, stdout=outf)
            os.unlink(mafsort_out)

            filtered_fastq_chunk = mkstempfname('.filtered.fastq')
            with open(filtered_fastq_chunk, 'wt') as outf:
                cmd = [
                    no_blast_like_hits_path, '-b', mafconvert_out, '-r', chunk_fastq, '-m', 'hit'
                ]
                log.debug(' '.join(cmd) + ' > ' + filtered_fastq_chunk)
                subprocess.check_call(cmd, stdout=outf)
                filtered_fastq_files.append(filtered_fastq_chunk)
            os.unlink(mafconvert_out)
            # nothing reads the unfiltered chunk after the filter step, so
            # delete it here instead of leaking one temp FASTQ per chunk
            os.unlink(chunk_fastq)

    # concatenate filtered fastq files to outFastq
    util.file.concat(filtered_fastq_files, outFastq)

    # remove temp fastq files
    for tempfastq in filtered_fastq_files:
        os.unlink(tempfastq)