def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs):
    '''Use blastn to remove paired reads that match any of the databases.

    infq1, infq2: input paired-end fastq files
    outfq1, outfq2: output fastq files with matching read pairs removed
    refDbs: blastn databases to screen against (passed through to deplete_blastn)

    NOTE(review): this definition is shadowed by a later, threads-aware
    definition of the same name in this file — confirm this copy is still needed.
    NOTE(review): temp fastq files created here are not deleted; presumably
    mkstempfname-managed cleanup happens elsewhere — verify.
    '''
    tmpfq1_a = mkstempfname('.fastq')
    tmpfq1_b = mkstempfname('.fastq')
    tmpfq2_b = mkstempfname('.fastq')
    tmpfq2_c = mkstempfname('.fastq')
    # deplete fq1
    deplete_blastn(infq1, tmpfq1_a, refDbs)
    # purge fq2 of read pairs lost in fq1
    # (this should significantly speed up the second run of deplete_blastn)
    read_utils.purge_unmated(tmpfq1_a, infq2, tmpfq1_b, tmpfq2_b)
    # deplete fq2
    deplete_blastn(tmpfq2_b, tmpfq2_c, refDbs)
    # purge fq1 of read pairs lost in fq2
    read_utils.purge_unmated(tmpfq1_b, tmpfq2_c, outfq1, outfq2)
def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs, threads):
    '''Use blastn to remove reads that match at least one of the databases.

    infq1, infq2: input paired-end fastq files
    outfq1, outfq2: output fastq files with matching read pairs removed
    refDbs: blastn databases to screen against (passed through to deplete_blastn)
    threads: worker-thread count forwarded to deplete_blastn
    '''
    tmpfq1_a = mkstempfname('.fastq')
    tmpfq1_b = mkstempfname('.fastq')
    tmpfq2_b = mkstempfname('.fastq')
    tmpfq2_c = mkstempfname('.fastq')
    # deplete fq1
    # BUG FIX: threads was previously forwarded only to the second
    # deplete_blastn call; forward it here too so both mates are depleted
    # with the same parallelism.
    deplete_blastn(infq1, tmpfq1_a, refDbs, threads)
    # purge fq2 of read pairs lost in fq1
    # (this should significantly speed up the second run of deplete_blastn)
    read_utils.purge_unmated(tmpfq1_a, infq2, tmpfq1_b, tmpfq2_b)
    # deplete fq2
    deplete_blastn(tmpfq2_b, tmpfq2_c, refDbs, threads)
    # purge fq1 of read pairs lost in fq2
    read_utils.purge_unmated(tmpfq1_b, tmpfq2_c, outfq1, outfq2)
def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
    ''' Take reads through Trimmomatic, Prinseq, and subsampling.
        This should probably move over to read_utils or taxon_filter.

        inBam: input BAM of paired reads
        clipDb: adapter/clip database passed to Trimmomatic
        outBam: output BAM of trimmed, deduplicated, subsampled reads
        n_reads: target number of read pairs to subsample down to
    '''
    # BAM -> fastq
    infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq']))
    tools.picard.SamToFastqTool().execute(inBam, infq[0], infq[1])

    # Trimmomatic
    trimfq = list(map(util.file.mkstempfname, ['.trim.1.fastq', '.trim.2.fastq']))
    taxon_filter.trimmomatic(infq[0], infq[1], trimfq[0], trimfq[1], clipDb)
    os.unlink(infq[0])
    os.unlink(infq[1])

    # Prinseq (duplicate removal, one mate file at a time)
    rmdupfq = list(map(util.file.mkstempfname, ['.rmdup.1.fastq', '.rmdup.2.fastq']))
    read_utils.rmdup_prinseq_fastq(trimfq[0], rmdupfq[0])
    read_utils.rmdup_prinseq_fastq(trimfq[1], rmdupfq[1])
    os.unlink(trimfq[0])
    os.unlink(trimfq[1])

    # Purge unmated (drop reads whose mate was removed by dedup)
    purgefq = list(map(util.file.mkstempfname, ['.fix.1.fastq', '.fix.2.fastq']))
    read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0], purgefq[1])
    os.unlink(rmdupfq[0])
    os.unlink(rmdupfq[1])

    # Log count (4 fastq lines per read; exact integer floor division
    # instead of int(.../4), which went through float)
    with open(purgefq[0], 'rt') as inf:
        n = sum(1 for line in inf) // 4
    log.info("PRE-SUBSAMPLE COUNT: %s read pairs", n)

    # Subsample
    subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq']))
    cmd = [os.path.join(util.file.get_scripts_path(), 'subsampler.py'),
           '-n', str(n_reads),
           '-mode', 'p',
           '-in', purgefq[0], purgefq[1],
           '-out', subsampfq[0], subsampfq[1],
          ]
    subprocess.check_call(cmd)
    os.unlink(purgefq[0])
    os.unlink(purgefq[1])

    # Fastq -> BAM
    # Note: this destroys RG IDs! We should instead run the BAM->fastq step in
    # a way that breaks out the read groups and perform the above steps in a
    # way that preserves the RG IDs.
    tmp_bam = util.file.mkstempfname('.subsamp.bam')
    tmp_header = util.file.mkstempfname('.header.sam')
    tools.samtools.SamtoolsTool().dumpHeader(inBam, tmp_header)
    if n == 0:
        # FastqToSam cannot deal with empty input
        # but Picard SamFormatConverter can deal with empty files
        opts = ['INPUT=' + tmp_header, 'OUTPUT=' + outBam, 'VERBOSITY=ERROR']
        tools.picard.PicardTools().execute('SamFormatConverter', opts, JVMmemory='50m')
    else:
        tools.picard.FastqToSamTool().execute(subsampfq[0], subsampfq[1], 'Dummy', tmp_bam)
        tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam)
    os.unlink(tmp_bam)
    os.unlink(tmp_header)
    os.unlink(subsampfq[0])
    os.unlink(subsampfq[1])
def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
    '''Run reads through Trimmomatic, Prinseq dedup, mate purging, and
    subsampling, then convert the survivors back to BAM.
    This should probably move over to read_utils or taxon_filter.
    '''

    def _tmp_pair(suffix1, suffix2):
        # Two fresh temp-file paths for a mate pair of fastqs.
        return [util.file.mkstempfname(suffix1), util.file.mkstempfname(suffix2)]

    # BAM -> fastq
    raw = _tmp_pair('.in.1.fastq', '.in.2.fastq')
    tools.picard.SamToFastqTool().execute(inBam, raw[0], raw[1])

    # Trimmomatic adapter/quality trimming
    trimmed = _tmp_pair('.trim.1.fastq', '.trim.2.fastq')
    taxon_filter.trimmomatic(raw[0], raw[1], trimmed[0], trimmed[1], clipDb)
    for path in raw:
        os.unlink(path)

    # Prinseq duplicate removal, one mate file at a time
    deduped = _tmp_pair('.rmdup.1.fastq', '.rmdup.2.fastq')
    read_utils.rmdup_prinseq_fastq(trimmed[0], deduped[0])
    read_utils.rmdup_prinseq_fastq(trimmed[1], deduped[1])
    for path in trimmed:
        os.unlink(path)

    # Drop any read whose mate was removed by dedup
    mated = _tmp_pair('.fix.1.fastq', '.fix.2.fastq')
    read_utils.purge_unmated(deduped[0], deduped[1], mated[0], mated[1])
    for path in deduped:
        os.unlink(path)

    # Count surviving pairs (4 fastq lines per read) and log it
    with open(mated[0], 'rt') as inf:
        fastq_lines = sum(1 for _ in inf)
    n = int(fastq_lines / 4)
    log.info("PRE-SUBSAMPLE COUNT: %s read pairs", n)

    # Subsample down to at most n_reads pairs
    subsampled = _tmp_pair('.subsamp.1.fastq', '.subsamp.2.fastq')
    subsampler = os.path.join(util.file.get_scripts_path(), 'subsampler.py')
    subprocess.check_call([
        subsampler,
        '-n', str(n_reads),
        '-mode', 'p',
        '-in', mated[0], mated[1],
        '-out', subsampled[0], subsampled[1],
    ])
    for path in mated:
        os.unlink(path)

    # Fastq -> BAM
    # Note: this destroys RG IDs! We should instead run the BAM->fastq step
    # in a way that breaks out the read groups and perform the above steps
    # in a way that preserves the RG IDs.
    bam_tmp = util.file.mkstempfname('.subsamp.bam')
    header_tmp = util.file.mkstempfname('.header.sam')
    tools.samtools.SamtoolsTool().dumpHeader(inBam, header_tmp)
    if n == 0:
        # FastqToSam cannot deal with empty input,
        # but Picard SamFormatConverter can deal with empty files
        tools.picard.PicardTools().execute(
            'SamFormatConverter',
            ['INPUT=' + header_tmp, 'OUTPUT=' + outBam, 'VERBOSITY=ERROR'],
            JVMmemory='50m')
    else:
        tools.picard.FastqToSamTool().execute(subsampled[0], subsampled[1], 'Dummy', bam_tmp)
        tools.samtools.SamtoolsTool().reheader(bam_tmp, header_tmp, outBam)
    os.unlink(bam_tmp)
    os.unlink(header_tmp)
    for path in subsampled:
        os.unlink(path)