示例#1
0
def deplete_blastn(inFastq, outFastq, refDbs) :
    '''Use blastn to remove reads that match at least one of the databases.

    inFastq: input reads in FASTQ format.
    outFastq: output file receiving the reads that had no blast hit
        (opened with util.file.open_or_gzopen).
    refDbs: iterable of blastn databases; each one is searched in turn.

    All intermediate files are removed before returning, whether or not
    one of the external commands fails.
    '''

    ## Get tools
    noBlastHits_v3Path = os.path.join(util.file.get_scripts_path(),
                                      'noBlastHits_v3.py')

    inFasta = mkstempfname('.fasta')
    blastOutCombined = mkstempfname('.txt')
    blastOutFiles = []
    try:
        ## Convert to fasta
        read_utils.fastq_to_fasta(inFastq, inFasta)

        ## Run blastn using each of the databases in turn
        for db in refDbs :
            log.info("running blastn on %s against %s", inFastq, db)
            blastOutFiles += blastn_chunked_fasta(inFasta, db)

        ## Combine results from different databases
        # Concatenated in-process instead of shelling out to `cat`: with no
        # hit files at all, `cat` would be started without arguments and
        # would block reading from stdin.
        log.debug("concatenating %d blast hit files into %s",
                  len(blastOutFiles), blastOutCombined)
        with open(blastOutCombined, 'wb') as outf:
            for hitsFile in blastOutFiles:
                with open(hitsFile, 'rb') as inf:
                    for line in inf:
                        outf.write(line)

        ## run noBlastHits_v3.py to extract reads with no blast hits
        # TODO: slurp the small amount of code in this script into here
        noBlastHitsCmd = ['python', noBlastHits_v3Path, '-b', blastOutCombined,
                         '-r', inFastq, '-m', 'nohit']
        log.debug(' '.join(noBlastHitsCmd) + '> ' + outFastq)
        with util.file.open_or_gzopen(outFastq, 'wt') as outf :
            subprocess.check_call(noBlastHitsCmd, stdout = outf)
    finally:
        # remove every intermediate file that was actually created
        for fn in [inFasta, blastOutCombined] + blastOutFiles:
            if os.path.isfile(fn):
                os.unlink(fn)
示例#2
0
def _merge_fastqs_and_mvicuna(lb, files):
    '''Merge the per-read-group FASTQs of one library and run M-Vicuna DupRm.

    lb: library name; only used in the log message.
    files: iterable of path prefixes. For every prefix, '<prefix>_1.fastq'
        and '<prefix>_2.fastq' are appended to the merged mate-1 / mate-2
        FASTQ and then deleted; a missing file only produces a warning.

    Returns the name of the temp file handed to mvicuna_fastqs_to_readlist
    as its read-list argument. M-Vicuna is skipped when both merged FASTQs
    are empty, in which case the file is returned as mkstempfname made it.
    '''
    readList = mkstempfname('.keep_reads.txt')
    log.info("executing M-Vicuna DupRm on library %s", lb)

    # create merged FASTQs per library
    infastqs = (mkstempfname('.1.fastq'), mkstempfname('.2.fastq'))
    try:
        for d, merged in enumerate(infastqs):
            with open(merged, 'wt') as outf:
                for fprefix in files:
                    fn = '%s_%d.fastq' % (fprefix, d + 1)

                    if os.path.isfile(fn):
                        with open(fn, 'rt') as inf:
                            for line in inf:
                                outf.write(line)
                        os.unlink(fn)
                    else:
                        # Logger.warn is a deprecated alias (gone in
                        # Python 3.13); the message is kept on one line so
                        # the log record carries no embedded newline.
                        log.warning(
                            "no reads found in %s, assuming that's because "
                            "there's no reads in that read group", fn
                        )

        # M-Vicuna DupRm to see what we should keep (append IDs to running file)
        if os.path.getsize(infastqs[0]) > 0 or os.path.getsize(infastqs[1]) > 0:
            mvicuna_fastqs_to_readlist(infastqs[0], infastqs[1], readList)
    finally:
        # the merged FASTQs are scratch data; drop them even on failure
        for fn in infastqs:
            if os.path.isfile(fn):
                os.unlink(fn)

    return readList
示例#3
0
def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2,
                clipFasta):
    '''Trim read sequences with Trimmomatic.

    inFastq1, inFastq2: paired-end input FASTQ files.
    pairedOutFastq1, pairedOutFastq2: outputs given to Trimmomatic for the
        paired reads.
    clipFasta: FASTA file used in the ILLUMINACLIP step.

    The two unpaired outputs go to temporary files that are always deleted
    before returning.
    '''
    trimmomaticPath = tools.trimmomatic.TrimmomaticTool().install_and_get_path(
    )
    tmpUnpaired1 = mkstempfname()
    tmpUnpaired2 = mkstempfname()

    try:
        javaCmd = []

        # the conda version wraps the jar file with a shell script
        if trimmomaticPath.endswith(".jar"):
            #  This java program has a lot of arguments...
            # gettempdir() is used instead of the tempfile.tempdir global,
            # which is None until something has initialised it.
            javaCmd.extend([
                'java', '-Xmx2g', '-Djava.io.tmpdir=' + tempfile.gettempdir(),
                '-classpath', trimmomaticPath,
                'org.usadellab.trimmomatic.TrimmomaticPE'
            ])
        else:
            javaCmd.extend([trimmomaticPath, "PE"])

        javaCmd.extend([
            inFastq1, inFastq2, pairedOutFastq1, tmpUnpaired1, pairedOutFastq2,
            tmpUnpaired2, 'LEADING:20', 'TRAILING:20', 'SLIDINGWINDOW:4:25',
            'MINLEN:30', 'ILLUMINACLIP:{}:2:30:12'.format(clipFasta)
        ])

        log.debug(' '.join(javaCmd))
        util.misc.run_and_print(javaCmd, check=True)
    finally:
        # unpaired reads are discarded; clean up even if Trimmomatic failed
        for fn in (tmpUnpaired1, tmpUnpaired2):
            if os.path.exists(fn):
                os.unlink(fn)
示例#4
0
def filter_lastal_bam(inBam, db, outBam, JVMmemory=None):
    ''' Keep only the reads of inBam that LASTAL aligns to the reference
        database db, writing the result to outBam.

        JVMmemory is forwarded to the Picard FilterSamReads step.
    '''
    # dump the BAM into one FASTQ per mate
    mate_fastqs = (mkstempfname('.1.fastq'), mkstempfname('.2.fastq'))
    tools.picard.SamToFastqTool().execute(inBam, mate_fastqs[0], mate_fastqs[1])

    # collect LASTAL hits mate by mate, discarding each FASTQ once searched
    mate_hits = (mkstempfname('.1.hits'), mkstempfname('.2.hits'))
    for fastq_path, hits_path in zip(mate_fastqs, mate_hits):
        lastal_get_hits(fastq_path, db, hits_path)
        os.unlink(fastq_path)

    # union of both hit lists, de-duplicated by `sort -u`
    merged_hits = mkstempfname('.hits')
    sort_cmd = ['sort', '-u'] + list(mate_hits)
    with open(merged_hits, 'wt') as merged_out:
        subprocess.check_call(sort_cmd, stdout=merged_out)
    for hits_path in mate_hits:
        os.unlink(hits_path)

    # the merged list serves as the keep list for the original BAM
    picard_filter = tools.picard.FilterSamReadsTool()
    picard_filter.execute(inBam, False, merged_hits, outBam, JVMmemory=JVMmemory)
    os.unlink(merged_hits)
示例#5
0
def blastn_chunked_fasta(fasta, db, chunkSize=1000000):
    """
    Run blastn on a FASTA file in pieces: the records are grouped with
    batch_iterator(..., chunkSize) and every group is searched by its own
    blastn process (the original note cites apparent memory leaks on inputs
    with many query sequences). Returns the list of hit files, one per chunk;
    the caller owns those files.
    """
    blastn_exe = tools.blast.BlastnTool().install_and_get_path()
    # search settings shared by every chunk; query/output are added per chunk
    fixed_args = [blastn_exe, '-db', db, '-word_size', '16', '-evalue', '1e-6',
                  '-outfmt', '6', '-max_target_seqs', '2']

    result_paths = []
    with open(fasta, "rt") as fasta_in:
        for records in batch_iterator(SeqIO.parse(fasta_in, "fasta"), chunkSize):
            query_path = mkstempfname('.fasta')
            with open(query_path, "wt") as query_out:
                SeqIO.write(records, query_out, "fasta")
            # drop the reference to the batch before blastn runs
            # (presumably to release its memory early)
            records = None

            hits_path = mkstempfname('.hits.txt')
            cmd = fixed_args + ['-query', query_path, '-out', hits_path]
            log.debug(' '.join(cmd))
            subprocess.check_call(cmd)

            # the chunk query is scratch data; the hits file is returned
            os.unlink(query_path)
            result_paths.append(hits_path)

    return result_paths
示例#6
0
def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, **kwargs):
    '''Deplete inBam against several databases, one after the other.

    inBam: input BAM file.
    refDbs: list of databases. When there is more than one and every entry
        is an existing plain file (not a directory, not a .tar/.tgz/.zip
        package), the entries are concatenated into one temporary FASTA and
        deplete_method is run just once.
    deplete_method: callable invoked as
        deplete_method(inputBam, db, outputBam, **kwargs).
    outBam: receives a copy of the final depleted BAM (or of inBam itself
        if no depletion step ran).

    Databases are skipped once the intermediate BAM is empty. All temporary
    files created here are removed before returning.
    '''

    tmpDb = None
    if len(refDbs)>1 and not any(
            not os.path.exists(db)  # indexed db prefix
            or os.path.isdir(db)       # indexed db in directory
            or (os.path.isfile(db) and ('.tar' in db or '.tgz' in db or '.zip' in db)) # packaged indexed db
            for db in refDbs):
        # this is a scenario where all refDbs are unbuilt fasta
        # files. we can simplify and speed up execution by
        # concatenating them all and running deplete_method
        # just once
        tmpDb = mkstempfname('.fasta')
        merge_compressed_files(refDbs, tmpDb, sep='\n')
        refDbs = [tmpDb]

    samtools = tools.samtools.SamtoolsTool()
    tmpBamIn = inBam
    try:
        for db in refDbs:
            if not samtools.isEmpty(tmpBamIn):
                tmpBamOut = mkstempfname('.bam')
                deplete_method(tmpBamIn, db, tmpBamOut, **kwargs)
                if tmpBamIn != inBam:
                    os.unlink(tmpBamIn)
                tmpBamIn = tmpBamOut
        shutil.copyfile(tmpBamIn, outBam)
    finally:
        # the last intermediate BAM has been copied to outBam (or the run
        # failed); either way it is no longer needed. Never delete inBam.
        if tmpBamIn != inBam and os.path.isfile(tmpBamIn):
            os.unlink(tmpBamIn)
        if tmpDb and os.path.isfile(tmpDb):
            os.unlink(tmpDb)
示例#7
0
def blastn_chunked_fasta(fasta, db, chunkSize=1000000):
    """
    Run blastn on a FASTA file in pieces: the records are grouped with
    batch_iterator(..., chunkSize) and every group is searched by its own
    blastn process (the original note cites apparent memory leaks on inputs
    with many query sequences). Returns the list of hit files, one per chunk;
    the caller owns those files.
    """
    blastn_exe = tools.blast.BlastnTool().install_and_get_path()
    # search settings shared by every chunk; query/output are added per chunk
    fixed_args = [blastn_exe, '-db', db, '-word_size', '16', '-evalue', '1e-6',
                  '-outfmt', '6', '-max_target_seqs', '2']

    result_paths = []
    with open(fasta, "rt") as fasta_in:
        for records in batch_iterator(SeqIO.parse(fasta_in, "fasta"), chunkSize):
            query_path = mkstempfname('.fasta')
            with open(query_path, "wt") as query_out:
                SeqIO.write(records, query_out, "fasta")
            # drop the reference to the batch before blastn runs
            # (presumably to release its memory early)
            records = None

            hits_path = mkstempfname('.hits.txt')
            cmd = fixed_args + ['-query', query_path, '-out', hits_path]
            log.debug(' '.join(cmd))
            subprocess.check_call(cmd)

            # the chunk query is scratch data; the hits file is returned
            os.unlink(query_path)
            result_paths.append(hits_path)

    return result_paths
示例#8
0
def filter_lastal_bam(inBam, db, outBam, JVMmemory=None):
    ''' Keep only the reads of inBam that LASTAL aligns to the reference
        database db, writing the result to outBam.

        JVMmemory is forwarded to the Picard FilterSamReads step.
    '''
    # dump the BAM into one FASTQ per mate
    mate_fastqs = (mkstempfname('.1.fastq'), mkstempfname('.2.fastq'))
    tools.picard.SamToFastqTool().execute(inBam, mate_fastqs[0], mate_fastqs[1])

    # collect LASTAL hits mate by mate, discarding each FASTQ once searched
    mate_hits = (mkstempfname('.1.hits'), mkstempfname('.2.hits'))
    for fastq_path, hits_path in zip(mate_fastqs, mate_hits):
        lastal_get_hits(fastq_path, db, hits_path)
        os.unlink(fastq_path)

    # union of both hit lists, de-duplicated by `sort -u`
    merged_hits = mkstempfname('.hits')
    sort_cmd = ['sort', '-u'] + list(mate_hits)
    with open(merged_hits, 'wt') as merged_out:
        subprocess.check_call(sort_cmd, stdout=merged_out)
    for hits_path in mate_hits:
        os.unlink(hits_path)

    # the merged list serves as the keep list for the original BAM
    picard_filter = tools.picard.FilterSamReadsTool()
    picard_filter.execute(inBam, False, merged_hits, outBam, JVMmemory=JVMmemory)
    os.unlink(merged_hits)
示例#9
0
def align_and_fix(inBam, refFasta, outBamAll=None, outBamFiltered=None,
    novoalign_options='', JVMmemory=None):
    ''' Take reads, align to reference with Novoalign, mark duplicates
        with Picard, realign indels with GATK, and optionally filter
        final file to mapped/non-dupe reads.

        inBam: input reads.
        refFasta: reference passed to Novoalign and GATK.
        outBamAll: if given, receives a copy of the realigned BAM (indexed).
        outBamFiltered: if given, receives the realigned BAM filtered with
            samtools view -q 1 -F 1028 (indexed).
        novoalign_options: whitespace-separated option string for Novoalign.
        JVMmemory: forwarded to the Novoalign, Picard and GATK steps.

        Returns None. When neither output is requested only a warning is
        logged and nothing is run.
    '''
    if not (outBamAll or outBamFiltered):
        # Logger.warn is a deprecated alias that no longer exists in
        # Python 3.13; warning() is the supported spelling.
        log.warning("are you sure you meant to do nothing?")
        return

    bam_aligned = mkstempfname('.aligned.bam')
    bam_mkdup = mkstempfname('.mkdup.bam')
    bam_realigned = mkstempfname('.realigned.bam')
    try:
        tools.novoalign.NovoalignTool().execute(
            inBam, refFasta, bam_aligned,
            options=novoalign_options.split(), JVMmemory=JVMmemory)

        tools.picard.MarkDuplicatesTool().execute(
            [bam_aligned], bam_mkdup,
            picardOptions=['CREATE_INDEX=true'], JVMmemory=JVMmemory)
        os.unlink(bam_aligned)

        tools.gatk.GATKTool().local_realign(
            bam_mkdup, refFasta, bam_realigned, JVMmemory=JVMmemory)
        os.unlink(bam_mkdup)

        if outBamAll:
            shutil.copyfile(bam_realigned, outBamAll)
            tools.picard.BuildBamIndexTool().execute(outBamAll)
        if outBamFiltered:
            tools.samtools.SamtoolsTool().view(
                ['-b', '-q', '1', '-F', '1028'],
                bam_realigned, outBamFiltered)
            tools.picard.BuildBamIndexTool().execute(outBamFiltered)
    finally:
        # remove whichever intermediates are still around, including
        # after a failed step
        for fn in (bam_aligned, bam_mkdup, bam_realigned):
            if os.path.isfile(fn):
                os.unlink(fn)
示例#10
0
def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2,
        clipFasta):
    '''Trim read sequences with Trimmomatic (paired-end mode).

    inFastq1, inFastq2: paired-end input FASTQ files.
    pairedOutFastq1, pairedOutFastq2: outputs given to Trimmomatic for the
        paired reads.
    clipFasta: FASTA file used in the ILLUMINACLIP step.

    The two unpaired outputs go to temporary files that are always deleted
    before returning. Raises subprocess.CalledProcessError if java exits
    with a non-zero status.
    '''
    trimmomaticPath = tools.trimmomatic.TrimmomaticTool().install_and_get_path()
    tmpUnpaired1 = mkstempfname()
    tmpUnpaired2 = mkstempfname()

    try:
        #  This java program has a lot of arguments...
        # gettempdir() is used instead of the tempfile.tempdir global, which
        # is None until something has initialised it.
        javaCmd = ['java', '-Xmx2g',
            '-Djava.io.tmpdir='+tempfile.gettempdir(),
            '-classpath',
            trimmomaticPath,
            'org.usadellab.trimmomatic.TrimmomaticPE',
            inFastq1,
            inFastq2,
            pairedOutFastq1,
            tmpUnpaired1,
            pairedOutFastq2,
            tmpUnpaired2,
            'LEADING:20', 'TRAILING:20', 'SLIDINGWINDOW:4:25', 'MINLEN:30',
            'ILLUMINACLIP:{}:2:30:12'.format(clipFasta)
            ]

        log.debug(' '.join(javaCmd))
        subprocess.check_call(javaCmd)
    finally:
        # unpaired reads are discarded; clean up even if Trimmomatic failed
        for fn in (tmpUnpaired1, tmpUnpaired2):
            if os.path.exists(fn):
                os.unlink(fn)
示例#11
0
def rmdup_cdhit_bam(inBam, outBam, max_mismatches=None, jvm_memory=None):
    ''' Remove duplicate reads from BAM file using cd-hit-dup.

        inBam: input BAM; it is split per library and each library is
            de-duplicated on its own.
        outBam: output written by the final ReplaceSamHeader step, which is
            given the merged per-library results and inBam.
        max_mismatches: passed to cd-hit-dup as -e; 4 is used when None.
            An explicit 0 is honoured.
        jvm_memory: forwarded as JVMmemory to the Picard tools.
    '''
    # `max_mismatches or 4` would silently turn an explicit 0 into 4
    if max_mismatches is None:
        max_mismatches = 4
    tmp_dir = tempfile.mkdtemp()

    tools.picard.SplitSamByLibraryTool().execute(inBam, tmp_dir)

    s2fq_tool = tools.picard.SamToFastqTool()
    cdhit = tools.cdhit.CdHit()
    out_bams = []
    for f in os.listdir(tmp_dir):
        library_sam = os.path.join(tmp_dir, f)

        in_fastqs = mkstempfname('.1.fastq'), mkstempfname('.2.fastq')

        s2fq_tool.execute(library_sam, in_fastqs[0], in_fastqs[1])
        if not os.path.getsize(in_fastqs[0]) > 0 and not os.path.getsize(
                in_fastqs[1]) > 0:
            # Library without reads: nothing to de-duplicate. No output BAM
            # is registered for it, so the merge step below is not handed a
            # never-written placeholder file.
            for fn in in_fastqs:
                os.unlink(fn)
            continue

        out_bam = mkstempfname('.bam')
        out_fastqs = mkstempfname('.1.fastq'), mkstempfname('.2.fastq')
        options = {
            '-e': max_mismatches,
        }
        # only run in paired mode when the mate-2 FASTQ has real content
        # (the 10-byte threshold is inherited; presumably it ignores a
        # trivially small file -- TODO confirm)
        if os.path.getsize(in_fastqs[1]) > 10:
            options['-i2'] = in_fastqs[1]
            options['-o2'] = out_fastqs[1]

        log.info("executing cd-hit-dup on library " + library_sam)
        # cd-hit-dup cannot operate on piped fastq input because it reads twice
        # NOTE(review): background=True suggests this call may return before
        # cd-hit-dup has finished writing out_fastqs, yet FastqToSam reads
        # them right away -- confirm against CdHit.execute.
        cdhit.execute('cd-hit-dup',
                      in_fastqs[0],
                      out_fastqs[0],
                      options=options,
                      background=True)

        tools.picard.FastqToSamTool().execute(out_fastqs[0],
                                              out_fastqs[1],
                                              f,
                                              out_bam,
                                              JVMmemory=jvm_memory)
        out_bams.append(out_bam)
        for fn in in_fastqs:
            os.unlink(fn)

    with util.file.fifo(name='merged.sam') as merged_bam:
        merge_opts = ['SORT_ORDER=queryname']
        tools.picard.MergeSamFilesTool().execute(out_bams,
                                                 merged_bam,
                                                 picardOptions=merge_opts,
                                                 JVMmemory=jvm_memory,
                                                 background=True)
        tools.picard.ReplaceSamHeaderTool().execute(merged_bam,
                                                    inBam,
                                                    outBam,
                                                    JVMmemory=jvm_memory)
示例#12
0
def trimmomatic(
    inFastq1,
    inFastq2,
    pairedOutFastq1,
    pairedOutFastq2,
    clipFasta,
    unpairedOutFastq1=None,
    unpairedOutFastq2=None,
    leading_q_cutoff=15,
    trailing_q_cutoff=15,
    minlength_to_keep=30,
    sliding_window_size=4,
    sliding_window_q_cutoff=25
):
    '''Trim read sequences with Trimmomatic.

    inFastq1, inFastq2: paired-end input FASTQ files.
    pairedOutFastq1, pairedOutFastq2: outputs for the paired reads.
    clipFasta: FASTA file used in the ILLUMINACLIP step.
    unpairedOutFastq1, unpairedOutFastq2: optional outputs for the unpaired
        reads; when omitted a temporary file is used and deleted afterwards.
    leading_q_cutoff, trailing_q_cutoff: values for LEADING / TRAILING.
    minlength_to_keep: value for MINLEN.
    sliding_window_size, sliding_window_q_cutoff: values for SLIDINGWINDOW.
    '''
    trimmomaticPath = tools.trimmomatic.TrimmomaticTool().install_and_get_path()
    unpairedFastq1 = unpairedOutFastq1 or mkstempfname()
    unpairedFastq2 = unpairedOutFastq2 or mkstempfname()

    try:
        javaCmd = []

        # the conda version wraps the jar file with a shell script
        if trimmomaticPath.endswith(".jar"):
            #  This java program has a lot of arguments...
            # gettempdir() is used instead of the tempfile.tempdir global,
            # which is None until something has initialised it.
            javaCmd.extend(
                [
                    'java', '-Xmx2g', '-Djava.io.tmpdir=' + tempfile.gettempdir(), '-classpath', trimmomaticPath,
                    'org.usadellab.trimmomatic.TrimmomaticPE'
                ]
            )
        else:
            javaCmd.extend([trimmomaticPath, "PE"])

        # Explicitly use Phred-33 quality scores
        javaCmd.extend(['-phred33'])

        javaCmd.extend(
            [
                inFastq1, inFastq2, pairedOutFastq1, unpairedFastq1, pairedOutFastq2, unpairedFastq2,
                'LEADING:{leading_q_cutoff}'.format(leading_q_cutoff=leading_q_cutoff),
                'TRAILING:{trailing_q_cutoff}'.format(trailing_q_cutoff=trailing_q_cutoff),
                'SLIDINGWINDOW:{sliding_window_size}:{sliding_window_q_cutoff}'.format(
                    sliding_window_size=sliding_window_size,
                    sliding_window_q_cutoff=sliding_window_q_cutoff,
                ),
                'MINLEN:{minlength_to_keep}'.format(minlength_to_keep=minlength_to_keep),
                'ILLUMINACLIP:{clipFasta}:2:30:12'.format(clipFasta=clipFasta)
            ]
        )

        log.debug(' '.join(javaCmd))
        util.misc.run_and_print(javaCmd, check=True)
    finally:
        # only remove the unpaired files this function created itself;
        # caller-supplied outputs are left alone
        if not unpairedOutFastq1 and os.path.exists(unpairedFastq1):
            os.unlink(unpairedFastq1)
        if not unpairedOutFastq2 and os.path.exists(unpairedFastq2):
            os.unlink(unpairedFastq2)
示例#13
0
def deplete_bmtagger_bam(inBam,
                         db,
                         outBam,
                         threads=None,
                         srprism_memory=7168,
                         JVMmemory=None):
    """
    Use bmtagger to partition the input reads into ones that match at least one
        of the databases and ones that don't match any of the databases.
    inBam: paired-end input reads in BAM format.
    db: bmtagger expects files
        db.bitmask created by bmtool, and
        db.srprism.idx, db.srprism.map, etc. created by srprism mkindex
    outBam: the output BAM files to hold the unmatched reads.
    threads: accepted for interface compatibility; not used by this function.
    srprism_memory: srprism memory in megabytes.
    JVMmemory: forwarded to the Picard FilterSamReads step.

    Side effect: the directories of bmtagger and blastn are prepended to
    os.environ['PATH'] when they are not already on it. All temporary files
    and the temporary directory are removed before returning.
    """
    import shutil  # local import: only needed to remove the scratch directory

    bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path()

    # bmtagger calls several executables in the same directory, and blastn;
    # make sure they are accessible through $PATH
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    path = os.environ['PATH'].split(os.pathsep)
    for t in (bmtaggerPath, blastnPath):
        d = os.path.dirname(t)
        if d not in path:
            path = [d] + path
    path = os.pathsep.join(path)
    os.environ['PATH'] = path

    inReads1 = mkstempfname('.1.fastq')
    bmtaggerConf = mkstempfname('.bmtagger.conf')
    matchesFile = mkstempfname('.txt')
    tempDir = tempfile.mkdtemp()
    try:
        tools.samtools.SamtoolsTool().bam2fq(inBam, inReads1)

        with open(bmtaggerConf, 'w') as f:
            # Default srprismopts: "-b 100000000 -n 5 -R 0 -r 1 -M 7168"
            print(
                'srprismopts="-b 100000000 -n 5 -R 0 -r 1 -M {srprism_memory} --paired false"'
                .format(srprism_memory=srprism_memory),
                file=f)
        cmdline = [
            bmtaggerPath, '-b', db + '.bitmask', '-C', bmtaggerConf, '-x',
            db + '.srprism', '-T', tempDir, '-q1', '-1', inReads1, '-o',
            matchesFile
        ]
        log.debug(' '.join(cmdline))
        util.misc.run_and_print(cmdline, check=True)

        # reads listed in matchesFile are the ones to drop from inBam
        tools.picard.FilterSamReadsTool().execute(inBam,
                                                  True,
                                                  matchesFile,
                                                  outBam,
                                                  JVMmemory=JVMmemory)
    finally:
        # scratch files are no longer needed, whether or not a step failed
        for fn in (inReads1, bmtaggerConf, matchesFile):
            if os.path.isfile(fn):
                os.unlink(fn)
        shutil.rmtree(tempDir, ignore_errors=True)
示例#14
0
def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
    ''' Remove duplicate reads from BAM file using M-Vicuna. The
        primary advantage to this approach over Picard's MarkDuplicates tool
        is that Picard requires that input reads are aligned to a reference,
        and M-Vicuna can operate on unaligned reads.

        inBam: input BAM; its @RG header lines define read groups/libraries.
        outBam: output BAM holding the reads on the M-Vicuna keep list.
        JVMmemory: forwarded to the Picard FilterSamReads step.

        Returns 0. Temporary files and the temporary directory are removed
        before returning.
    '''
    import shutil  # local import: only needed to remove the scratch directory

    # Convert BAM -> FASTQ pairs per read group and load all read groups
    tempDir = tempfile.mkdtemp()
    readList = mkstempfname('.keep_reads.txt')
    try:
        tools.picard.SamToFastqTool().per_read_group(inBam, tempDir, picardOptions=['VALIDATION_STRINGENCY=LENIENT'])
        read_groups = [x[1:] for x in tools.samtools.SamtoolsTool().getHeader(inBam) if x[0] == '@RG']
        read_groups = [dict(pair.split(':', 1) for pair in rg) for rg in read_groups]

        # Collect FASTQ pairs for each library
        lb_to_files = {}
        for rg in read_groups:
            # the per-read-group FASTQs are looked up by PU when present,
            # otherwise by ID
            fname = rg['PU'] if 'PU' in rg else rg['ID']
            lb_to_files.setdefault(rg.get('LB', 'none'), set()).add(os.path.join(tempDir, fname))
        log.info("found %d distinct libraries and %d read groups", len(lb_to_files), len(read_groups))

        # For each library, merge FASTQs and run rmdup for entire library
        for lb, files in lb_to_files.items():
            log.info("executing M-Vicuna DupRm on library %s", lb)

            # create merged FASTQs per library
            infastqs = (mkstempfname('.1.fastq'), mkstempfname('.2.fastq'))
            for d in range(2):
                with open(infastqs[d], 'wt') as outf:
                    for fprefix in files:
                        fn = '%s_%d.fastq' % (fprefix, d + 1)
                        if os.path.isfile(fn):
                            with open(fn, 'rt') as inf:
                                for line in inf:
                                    outf.write(line)
                            os.unlink(fn)
                        else:
                            # Logger.warn is a deprecated alias (gone in
                            # Python 3.13); message kept on a single line.
                            log.warning(
                                "no reads found in %s, assuming that's because "
                                "there's no reads in that read group", fn
                            )

            # M-Vicuna DupRm to see what we should keep (append IDs to running file)
            if os.path.getsize(infastqs[0]) > 0 or os.path.getsize(infastqs[1]) > 0:
                mvicuna_fastqs_to_readlist(infastqs[0], infastqs[1], readList)
            for fn in infastqs:
                os.unlink(fn)

        # Filter original input BAM against keep-list
        tools.picard.FilterSamReadsTool().execute(inBam, False, readList, outBam, JVMmemory=JVMmemory)
    finally:
        # the keep list and the per-read-group scratch directory are only
        # needed while this function runs
        if os.path.isfile(readList):
            os.unlink(readList)
        shutil.rmtree(tempDir, ignore_errors=True)
    return 0
示例#15
0
def filter_lastal(inFastq, refDb, outFastq):
    ''' Restrict input reads to those that align to the given
        reference database using LASTAL.  Also, remove duplicates with prinseq.

        inFastq: input reads in FASTQ format.
        refDb: LASTAL database given to lastal.
        outFastq: output file; must end in '.fastq' because prinseq is handed
            the name without that extension as its -out_good prefix.

        Raises subprocess.CalledProcessError when the lastal pipeline, the
        hit-filter script or prinseq exits with a non-zero status.
    '''
    assert outFastq.endswith('.fastq')
    tempFilePath = mkstempfname('.hits')
    lastalPath = tools.last.Lastal().install_and_get_path()
    mafSortPath = tools.last.MafSort().install_and_get_path()
    mafConvertPath = tools.last.MafConvert().install_and_get_path()
    prinseqPath = tools.prinseq.PrinseqTool().install_and_get_path()
    noBlastLikeHitsPath = os.path.join( util.file.get_scripts_path(),
                                        'noBlastLikeHits.py')
    filteredFastq = mkstempfname('.filtered.fastq')

    try:
        # each pipe separated cmd gets own line
        lastalCmd = ' '.join([
            '{} -Q1 {} {}'.format(lastalPath, refDb, inFastq),
            '| {} -n2'.format(mafSortPath),
            '| {} tab /dev/stdin > {}'.format(mafConvertPath, tempFilePath),
            ])
        log.debug(lastalCmd)
        # Run through check_call rather than `assert not os.system(...)`:
        # an assert statement is removed entirely under `python -O`, which
        # would skip the pipeline itself, not just the status check.
        # NOTE: paths are interpolated unquoted into a shell command, so
        # they must not contain whitespace or shell metacharacters.
        subprocess.check_call(lastalCmd, shell=True)

        # filter inFastq against lastal hits
        with open(filteredFastq, 'wt') as outf:
            noBlastLikeHitsCmd = [
                noBlastLikeHitsPath, '-b', tempFilePath, '-r', inFastq, '-m', 'hit']
            log.debug(' '.join(noBlastLikeHitsCmd) + ' > ' + filteredFastq)
            subprocess.check_call(noBlastLikeHitsCmd, stdout=outf)

        # remove duplicate reads and reads with multiple Ns
        if os.path.getsize(filteredFastq) == 0:
            # prinseq-lite fails on empty file input (which can happen in real life
            # if no reads match the refDb) so handle this scenario specially
            log.info("output is empty: no reads in input match refDb")
            shutil.copyfile(filteredFastq, outFastq)
        else:
            prinseqCmd = [
                'perl', prinseqPath,
                    '-ns_max_n', '1',
                    '-derep', '1',
                    '-fastq', filteredFastq,
                    '-out_bad', 'null',
                    '-line_width', '0',
                    '-out_good', outFastq[:-6]
                ]
            log.debug(' '.join(prinseqCmd))
            subprocess.check_call(prinseqCmd)
    finally:
        # both intermediates are scratch data
        for fn in (tempFilePath, filteredFastq):
            if os.path.isfile(fn):
                os.unlink(fn)
示例#16
0
def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
    ''' Remove duplicate reads from BAM file using M-Vicuna. The
        primary advantage to this approach over Picard's MarkDuplicates tool
        is that Picard requires that input reads are aligned to a reference,
        and M-Vicuna can operate on unaligned reads.

        inBam: input BAM; its @RG header lines define read groups/libraries.
        outBam: output BAM holding the reads on the M-Vicuna keep list.
        JVMmemory: forwarded to the Picard FilterSamReads step.

        Returns 0. Temporary files and the temporary directory are removed
        before returning.
    '''
    import shutil  # local import: only needed to remove the scratch directory

    # Convert BAM -> FASTQ pairs per read group and load all read groups
    tempDir = tempfile.mkdtemp()
    readList = mkstempfname('.keep_reads.txt')
    try:
        tools.picard.SamToFastqTool().per_read_group(inBam, tempDir, picardOptions=['VALIDATION_STRINGENCY=LENIENT'])
        read_groups = [x[1:] for x in tools.samtools.SamtoolsTool().getHeader(inBam) if x[0] == '@RG']
        read_groups = [dict(pair.split(':', 1) for pair in rg) for rg in read_groups]

        # Collect FASTQ pairs for each library
        lb_to_files = {}
        for rg in read_groups:
            # the per-read-group FASTQs are looked up by PU when present,
            # otherwise by ID
            fname = rg['PU'] if 'PU' in rg else rg['ID']
            lb_to_files.setdefault(rg.get('LB', 'none'), set()).add(os.path.join(tempDir, fname))
        log.info("found %d distinct libraries and %d read groups", len(lb_to_files), len(read_groups))

        # For each library, merge FASTQs and run rmdup for entire library
        for lb, files in lb_to_files.items():
            log.info("executing M-Vicuna DupRm on library %s", lb)

            # create merged FASTQs per library
            infastqs = (mkstempfname('.1.fastq'), mkstempfname('.2.fastq'))
            for d in range(2):
                with open(infastqs[d], 'wt') as outf:
                    for fprefix in files:
                        fn = '%s_%d.fastq' % (fprefix, d + 1)
                        if os.path.isfile(fn):
                            with open(fn, 'rt') as inf:
                                for line in inf:
                                    outf.write(line)
                            os.unlink(fn)
                        else:
                            # Logger.warn is a deprecated alias (gone in
                            # Python 3.13); message kept on a single line.
                            log.warning(
                                "no reads found in %s, assuming that's because "
                                "there's no reads in that read group", fn
                            )

            # M-Vicuna DupRm to see what we should keep (append IDs to running file).
            # Skipped when the library has no reads at all, so the tool is
            # never run on two empty inputs.
            if os.path.getsize(infastqs[0]) > 0 or os.path.getsize(infastqs[1]) > 0:
                mvicuna_fastqs_to_readlist(infastqs[0], infastqs[1], readList)
            for fn in infastqs:
                os.unlink(fn)

        # Filter original input BAM against keep-list
        tools.picard.FilterSamReadsTool().execute(inBam, False, readList, outBam, JVMmemory=JVMmemory)
    finally:
        # the keep list and the per-read-group scratch directory are only
        # needed while this function runs
        if os.path.isfile(readList):
            os.unlink(readList)
        shutil.rmtree(tempDir, ignore_errors=True)
    return 0
示例#17
0
def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs):
    '''Use blastn to deplete a pair of FASTQ files against refDbs.

    infq1, infq2: input mate-1 / mate-2 FASTQ files.
    outfq1, outfq2: outputs written by the final purge_unmated step.
    refDbs: databases handed to deplete_blastn.

    Each mate file is depleted on its own and read_utils.purge_unmated is
    applied after each depletion. Intermediate files are removed before
    returning.
    '''
    tmpfq1_a = mkstempfname('.fastq')
    tmpfq1_b = mkstempfname('.fastq')
    tmpfq2_b = mkstempfname('.fastq')
    tmpfq2_c = mkstempfname('.fastq')
    try:
        # deplete fq1
        deplete_blastn(infq1, tmpfq1_a, refDbs)
        # purge fq2 of read pairs lost in fq1
        # (this should significantly speed up the second run of deplete_blastn)
        read_utils.purge_unmated(tmpfq1_a, infq2, tmpfq1_b, tmpfq2_b)
        # deplete fq2
        deplete_blastn(tmpfq2_b, tmpfq2_c, refDbs)
        # purge fq1 of read pairs lost in fq2
        read_utils.purge_unmated(tmpfq1_b, tmpfq2_c, outfq1, outfq2)
    finally:
        # intermediates are scratch data; drop them even after a failure
        for fn in (tmpfq1_a, tmpfq1_b, tmpfq2_b, tmpfq2_c):
            if os.path.isfile(fn):
                os.unlink(fn)
示例#18
0
def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs, threads):
    '''Use blastn to remove reads that match at least one of the databases.

    infq1, infq2: input mate-1 / mate-2 FASTQ files.
    outfq1, outfq2: outputs written by the final purge_unmated step.
    refDbs: databases handed to deplete_blastn.
    threads: forwarded to both deplete_blastn runs.

    Each mate file is depleted on its own and read_utils.purge_unmated is
    applied after each depletion. Intermediate files are removed before
    returning.
    '''
    tmpfq1_a = mkstempfname('.fastq')
    tmpfq1_b = mkstempfname('.fastq')
    tmpfq2_b = mkstempfname('.fastq')
    tmpfq2_c = mkstempfname('.fastq')
    try:
        # deplete fq1 (threads was previously only passed to the second run)
        deplete_blastn(infq1, tmpfq1_a, refDbs, threads)
        # purge fq2 of read pairs lost in fq1
        # (this should significantly speed up the second run of deplete_blastn)
        read_utils.purge_unmated(tmpfq1_a, infq2, tmpfq1_b, tmpfq2_b)
        # deplete fq2
        deplete_blastn(tmpfq2_b, tmpfq2_c, refDbs, threads)
        # purge fq1 of read pairs lost in fq2
        read_utils.purge_unmated(tmpfq1_b, tmpfq2_c, outfq1, outfq2)
    finally:
        # intermediates are scratch data; drop them even after a failure
        for fn in (tmpfq1_a, tmpfq1_b, tmpfq2_b, tmpfq2_c):
            if os.path.isfile(fn):
                os.unlink(fn)
示例#19
0
def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs, threads):
    '''Use blastn to remove reads that match at least one of the databases.

    infq1, infq2: input mate-1 / mate-2 FASTQ files.
    outfq1, outfq2: outputs written by the final purge_unmated step.
    refDbs: databases handed to deplete_blastn.
    threads: forwarded to both deplete_blastn runs.

    Each mate file is depleted on its own and read_utils.purge_unmated is
    applied after each depletion. Intermediate files are removed before
    returning.
    '''
    tmpfq1_a = mkstempfname('.fastq')
    tmpfq1_b = mkstempfname('.fastq')
    tmpfq2_b = mkstempfname('.fastq')
    tmpfq2_c = mkstempfname('.fastq')
    try:
        # deplete fq1 (threads was previously only passed to the second run)
        deplete_blastn(infq1, tmpfq1_a, refDbs, threads)
        # purge fq2 of read pairs lost in fq1
        # (this should significantly speed up the second run of deplete_blastn)
        read_utils.purge_unmated(tmpfq1_a, infq2, tmpfq1_b, tmpfq2_b)
        # deplete fq2
        deplete_blastn(tmpfq2_b, tmpfq2_c, refDbs, threads)
        # purge fq1 of read pairs lost in fq2
        read_utils.purge_unmated(tmpfq1_b, tmpfq2_c, outfq1, outfq2)
    finally:
        # intermediates are scratch data; drop them even after a failure
        for fn in (tmpfq1_a, tmpfq1_b, tmpfq2_b, tmpfq2_c):
            if os.path.isfile(fn):
                os.unlink(fn)
示例#20
0
def fastq_to_bam(inFastq1, inFastq2, outBam, sampleName=None, header=None,
                 JVMmemory=tools.picard.FastqToSamTool.jvmMemDefault, picardOptions=None):
    ''' Convert a pair of fastq paired-end read files and optional text header
        to a single bam file.

        inFastq1, inFastq2: paired-end input FASTQ files.
        outBam: output BAM file.
        sampleName: sample name given to FastqToSam; 'Dummy' when None.
        header: optional header file; when given, FastqToSam writes to a
            temporary BAM that is then reheadered into outBam.
        JVMmemory, picardOptions: forwarded to FastqToSam.

        Returns 0. Raises Exception when CREATE_MD5_FILE=true is combined
        with a header.
    '''
    picardOptions = picardOptions or []

    if header:
        # With the header option, rehead will be called after FastqToSam.
        # This will invalidate any md5 file, which would be a slow to construct
        # on our own, so just disallow and let the caller run md5sum if desired.
        # Checked before any temp file is created so a rejected call leaves
        # nothing behind.
        if any(opt.lower() == 'CREATE_MD5_FILE=True'.lower() for opt in picardOptions):
            raise Exception("""CREATE_MD5_FILE is not allowed with '--header.'""")

    if sampleName is None:
        sampleName = 'Dummy'  # Will get overwritten by rehead command

    fastqToSamOut = mkstempfname('.bam') if header else outBam
    try:
        tools.picard.FastqToSamTool().execute(
            inFastq1,
            inFastq2,
            sampleName,
            fastqToSamOut,
            picardOptions=picardOptions,
            JVMmemory=JVMmemory)

        if header:
            tools.samtools.SamtoolsTool().reheader(fastqToSamOut, header, outBam)
    finally:
        # the intermediate BAM only exists in header mode; never delete outBam
        if header and os.path.isfile(fastqToSamOut):
            os.unlink(fastqToSamOut)

    return 0
示例#21
0
def main_reheader_bams(args):
    ''' Copy BAM files while renaming elements of the BAM header.
        The mapping file specifies which (key, old value, new value) mappings. For
        example:
            LB  lib1  lib_one
            SM  sample1 Sample_1
            SM  sample2 Sample_2
            SM  sample3 Sample_3
            CN  broad   BI
            FN  in1.bam out1.bam
            FN  in2.bam out2.bam

        args.rgMap is the mapping file. FN rows name (input BAM, output BAM)
        pairs; every other row renames 'key:old' to 'key:new' in @RG header
        lines. Input BAMs that do not exist are skipped. Returns 0.
    '''
    # read mapping file (a single pass separates file pairs from renames)
    mapper = {}
    files = []
    for key, old, new in util.file.read_tabfile(args.rgMap):
        if key == 'FN':
            files.append((old, new))
        else:
            mapper[key + ':' + old] = key + ':' + new

    samtools = tools.samtools.SamtoolsTool()
    header_file = mkstempfname('.sam')
    try:
        # read and convert bam headers
        for inBam, outBam in files:
            if os.path.isfile(inBam):
                with open(header_file, 'wt') as outf:
                    for row in samtools.getHeader(inBam):
                        if row[0] == '@RG':
                            row = [mapper.get(x, x) for x in row]
                        outf.write('\t'.join(row)+'\n')
                # write new bam with new header
                samtools.reheader(inBam, header_file, outBam)
    finally:
        # the scratch header is removed even if samtools fails part-way
        if os.path.isfile(header_file):
            os.unlink(header_file)
    return 0
示例#22
0
def bwamem_idxstats(inBam,
                    refFasta,
                    outBam=None,
                    outStats=None,
                    min_score_to_filter=None,
                    aligner_options=None):
    ''' Take reads, align to reference with BWA-MEM and perform samtools idxstats.

        inBam: input reads.
        refFasta: reference FASTA; a temporary copy is indexed and aligned
            against, so the original location is not written to.
        outBam: optional output for the aligned BAM; a temp file is used
            (and deleted) when omitted.
        outStats: optional output for samtools idxstats.
        min_score_to_filter: forwarded to bwa.mem.
        aligner_options: whitespace-separated option string for bwa mem.

        At least one of outBam / outStats must be given.
    '''

    assert outBam or outStats, "Either outBam or outStats must be specified"

    if outBam is None:
        bam_aligned = mkstempfname('.aligned.bam')
    else:
        bam_aligned = outBam

    samtools = tools.samtools.SamtoolsTool()
    bwa = tools.bwa.Bwa()

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    try:
        shutil.copyfile(refFasta, ref_indexed)
        bwa.index(ref_indexed)

        bwa_opts = [] if aligner_options is None else aligner_options.split()
        # align against the indexed copy -- the index was built for
        # ref_indexed, not for the caller's refFasta
        bwa.mem(inBam,
                ref_indexed,
                bam_aligned,
                options=bwa_opts,
                min_score_to_filter=min_score_to_filter)

        if outStats is not None:
            samtools.idxstats(bam_aligned, outStats)
    finally:
        # remove the reference copy plus the sidecar files `bwa index`
        # writes next to it (presumably what bwa.index runs; the isfile
        # guard makes this harmless if a file is absent)
        for ext in ('', '.amb', '.ann', '.bwt', '.pac', '.sa'):
            if os.path.isfile(ref_indexed + ext):
                os.unlink(ref_indexed + ext)
        if outBam is None and os.path.isfile(bam_aligned):
            os.unlink(bam_aligned)
示例#23
0
def lastal_get_hits(inFastq,
                    db,
                    outList,
                    max_gapless_alignments_per_position=1,
                    min_length_for_initial_matches=5,
                    max_length_for_initial_matches=50,
                    max_initial_matches_per_position=100):
    '''Write to outList the IDs of the reads in inFastq that
    lastal_chunked_fastq reports for db, one ID per line.

    The four tuning parameters are passed straight through to
    lastal_chunked_fastq.
    '''
    tuning = dict(
        max_gapless_alignments_per_position=max_gapless_alignments_per_position,
        min_length_for_initial_matches=min_length_for_initial_matches,
        max_length_for_initial_matches=max_length_for_initial_matches,
        max_initial_matches_per_position=max_initial_matches_per_position)

    hits_fastq = mkstempfname('.filtered.fastq')
    lastal_chunked_fastq(inFastq, db, hits_fastq, **tuning)

    with open(outList, 'wt') as ids_out, open(hits_fastq, 'rt') as fastq_in:
        for idx, text in enumerate(fastq_in):
            # only every fourth line (the FASTQ header line) carries an ID
            if idx % 4:
                continue
            # strip the line ending and the leading '@'
            read_id = text.rstrip('\n\r')[1:]
            # drop a trailing mate suffix so both mates share one ID
            if read_id.endswith(('/1', '/2')):
                read_id = read_id[:-2]
            ids_out.write(read_id + '\n')

    os.unlink(hits_fastq)
示例#24
0
def main_reheader_bams(args):
    ''' Copy BAM files, rewriting fields of their @RG header lines on the way.

        args.rgMap names a tab-delimited file of three-column rows
        (key, old value, new value), for example:
            LB  lib1  lib_one
            SM  sample1 Sample_1
            SM  sample2 Sample_2
            SM  sample3 Sample_3
            CN  broad   BI
            FN  in1.bam out1.bam
            FN  in2.bam out2.bam
        'FN' rows list (input BAM, output BAM) pairs; every other row
        renames the header field 'key:old' to 'key:new'. Input BAMs that do
        not exist on disk are skipped. Always returns 0.
    '''
    # header fields are compared as whole 'KEY:value' strings
    rg_renames = {
        key + ':' + old_val: key + ':' + new_val
        for key, old_val, new_val in util.file.read_tabfile(args.rgMap)
        if key != 'FN'
    }
    bam_pairs = [(src, dst)
                 for key, src, dst in util.file.read_tabfile(args.rgMap)
                 if key == 'FN']

    # one scratch file is reused for every rewritten header
    new_header = mkstempfname('.sam')
    for inBam, outBam in bam_pairs:
        if not os.path.isfile(inBam):
            continue
        with open(new_header, 'wt') as outf:
            for fields in tools.samtools.SamtoolsTool().getHeader(inBam):
                if fields[0] == '@RG':
                    fields = [rg_renames.get(field, field) for field in fields]
                outf.write('\t'.join(fields) + '\n')
        # emit the copy of inBam carrying the rewritten header
        tools.samtools.SamtoolsTool().reheader(inBam, new_header, outBam)
    os.unlink(new_header)
    return 0
示例#25
0
def align_and_count_hits(inBam,
                         refFasta,
                         outCounts,
                         includeZeros=False,
                         JVMmemory=None):
    ''' Align the reads in inBam to refFasta with Novoalign and write a
        two-column, tab-separated table (sequence name, read count) to
        outCounts, one row per @SQ entry of the aligned BAM header.
        Sequences with a count of zero are only listed when includeZeros
        is true.
    '''
    aligned_bam = mkstempfname('.aligned.bam')
    novoalign = tools.novoalign.NovoalignTool()
    novoalign.execute(inBam,
                      refFasta,
                      aligned_bam,
                      options=['-r', 'Random'],
                      JVMmemory=JVMmemory)

    samtools = tools.samtools.SamtoolsTool()
    ref_names = []
    for header_row in samtools.getHeader(aligned_bam):
        if header_row[0] != '@SQ':
            continue
        # fields after the record type are 'TAG:value' pairs
        tags = dict(field.split(':', 1) for field in header_row[1:])
        ref_names.append(tags['SN'])

    with util.file.open_or_gzopen(outCounts, 'w') as outf:
        for ref_name in ref_names:
            hits = samtools.count(aligned_bam, regions=[ref_name])
            if includeZeros or hits > 0:
                outf.write("{}\t{}\n".format(ref_name, hits))

    os.unlink(aligned_bam)
示例#26
0
def fastq_to_bam(inFastq1, inFastq2, outBam, sampleName=None, header=None,
                 JVMmemory=tools.picard.FastqToSamTool.jvmMemDefault, picardOptions=None):
    ''' Convert a pair of fastq paired-end read files and optional text header
        to a single bam file.

        inFastq1, inFastq2: the two FASTQ inputs handed to Picard FastqToSam.
        outBam: path of the BAM to produce.
        sampleName: sample name passed to FastqToSam ('Dummy' when omitted).
        header: optional header file; when given, FastqToSam writes to a
            temporary BAM which is then reheadered into outBam.
        picardOptions: optional list of extra Picard options.

        Raises Exception if CREATE_MD5_FILE=True is combined with header.
        Always returns 0.
    '''
    picardOptions = picardOptions or []

    if header:
        # With the header option, rehead will be called after FastqToSam.
        # This will invalidate any md5 file, which would be slow to construct
        # on our own, so just disallow and let the caller run md5sum if desired.
        # Checked before the temp file is created so a rejected call leaves
        # nothing behind.
        if any(opt.lower() == 'CREATE_MD5_FILE=True'.lower() for opt in picardOptions):
            raise Exception("""CREATE_MD5_FILE is not allowed with '--header.'""")
        fastqToSamOut = mkstempfname('.bam')
    else:
        fastqToSamOut = outBam

    if sampleName is None:
        sampleName = 'Dummy'  # Will get overwritten by rehead command

    try:
        tools.picard.FastqToSamTool().execute(
            inFastq1,
            inFastq2,
            sampleName,
            fastqToSamOut,
            picardOptions=picardOptions,
            JVMmemory=JVMmemory)

        if header:
            tools.samtools.SamtoolsTool().reheader(fastqToSamOut, header, outBam)
    finally:
        # the intermediate BAM only exists in the header case; never remove
        # the caller's outBam
        if header and os.path.exists(fastqToSamOut):
            os.unlink(fastqToSamOut)

    return 0
示例#27
0
def lastal_get_hits(
    inFastq,
    db,
    outList,
    max_gapless_alignments_per_position=1,
    min_length_for_initial_matches=5,
    max_length_for_initial_matches=50,
    max_initial_matches_per_position=100
):
    ''' Run lastal_chunked_fastq on inFastq against db and list the names of
        the reads it outputs in outList, one name per line.

        The tuning parameters are passed straight through to
        lastal_chunked_fastq. A trailing '/1' or '/2' is removed from each
        name before it is written.
    '''
    matched_reads = mkstempfname('.filtered.fastq')
    lastal_chunked_fastq(
        inFastq, db, matched_reads,
        max_gapless_alignments_per_position=max_gapless_alignments_per_position,
        min_length_for_initial_matches=min_length_for_initial_matches,
        max_length_for_initial_matches=max_length_for_initial_matches,
        max_initial_matches_per_position=max_initial_matches_per_position
    )

    mate_suffixes = ('/1', '/2')
    with open(outList, 'wt') as outf, open(matched_reads, 'rt') as inf:
        for position, text in enumerate(inf):
            # header lines sit at offsets 0, 4, 8, ... of the FASTQ file
            if position % 4 == 0:
                # drop the line ending and the first character of the header
                name = text.rstrip('\n\r')[1:]
                if name.endswith(mate_suffixes):
                    name = name[:-2]
                outf.write(name + '\n')

    os.unlink(matched_reads)
示例#28
0
def align_and_count_hits(inBam, refFasta, outCounts, includeZeros=False,
                  JVMmemory=None, threads=1):
    ''' Align reads to refFasta with Novoalign and write per-reference
        aligned read counts to outCounts as "name<TAB>count" lines.

        References with no aligned reads are omitted unless includeZeros is
        true. The threads argument is accepted but not used in this body.
    '''
    tmp_alignment = mkstempfname('.aligned.bam')
    tools.novoalign.NovoalignTool().execute(
        inBam, refFasta, tmp_alignment,
        options=['-r', 'Random'],
        JVMmemory=JVMmemory)

    samtools = tools.samtools.SamtoolsTool()
    # pull the SN tag out of every @SQ line of the aligned BAM's header
    sq_rows = [row for row in samtools.getHeader(tmp_alignment) if row[0] == '@SQ']
    seq_names = [dict(tag.split(':', 1) for tag in row[1:])['SN'] for row in sq_rows]

    with util.file.open_or_gzopen(outCounts, 'w') as outf:
        for name in seq_names:
            count = samtools.count(tmp_alignment, regions=[name])
            if not includeZeros and not count > 0:
                continue
            outf.write("{}\t{}\n".format(name, count))

    os.unlink(tmp_alignment)
示例#29
0
def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files.

    inBam: input BAM; its first header line must contain 'SO:queryname'.
    outBams: list of output BAM paths, filled in order.

    Raises Exception when the header is empty or does not declare
    queryname sort order.
    '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    # maxReads = totalReadCount / num files, but round up to the nearest
    # even number in order to keep read pairs together (assuming the input
    # is sorted in query order and has no unmated reads, which can be
    # accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each",
             totalReadCount, len(outBams), maxReads)

    # load BAM header into memory; an empty header cannot declare a sort
    # order, so treat it the same as an unsorted file instead of failing
    # with an IndexError
    header = samtools.getHeader(inBam)
    if not header or 'SO:queryname' not in header[0]:
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    try:
        samtools.view([], inBam, bigsam)

        # split bigsam into little ones
        last_index = len(outBams) - 1
        with util.file.open_or_gzopen(bigsam, 'rt') as inf:
            for index, outBam in enumerate(outBams):
                log.info("preparing file " + outBam)
                tmp_sam_reads = mkstempfname('.sam')
                try:
                    with open(tmp_sam_reads, 'wt') as outf:
                        for row in header:
                            outf.write('\t'.join(row) + '\n')
                        for _ in range(maxReads):
                            line = inf.readline()
                            if not line:
                                break
                            outf.write(line)
                        # the final output absorbs whatever is left over;
                        # decided by position so a repeated path in outBams
                        # cannot trigger it early
                        if index == last_index:
                            for line in inf:
                                outf.write(line)
                    picard.execute("SamFormatConverter", [
                        'INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam,
                        'VERBOSITY=WARNING'
                    ],
                                   JVMmemory='512m')
                finally:
                    # remove the per-output SAM even if conversion failed
                    os.unlink(tmp_sam_reads)
    finally:
        os.unlink(bigsam)
示例#30
0
def lastal_get_hits(inFastq, db, outList):
    ''' Write to outList the names of the reads in inFastq selected by a
        lastal -> maf-sort -> maf-convert -> noBlastLikeHits.py ('hit' mode)
        pipeline run against db.

        Names are written one per line, without the first character of the
        FASTQ header line and without a trailing '/1' or '/2'. Every
        intermediate file is a temp file that is removed once consumed.
    '''
    lastalPath = tools.last.Lastal().install_and_get_path()
    mafSortPath = tools.last.MafSort().install_and_get_path()
    mafConvertPath = tools.last.MafConvert().install_and_get_path()
    noBlastLikeHitsPath = os.path.join(util.file.get_scripts_path(),
                                       'noBlastLikeHits.py')

    lastalOut = mkstempfname('.lastal')
    with open(lastalOut, 'wt') as outf:
        cmd = [lastalPath, '-Q1', db, inFastq]
        log.debug(' '.join(cmd) + ' > ' + lastalOut)
        subprocess.check_call(cmd, stdout=outf)
    # everything below this point in this method should be replaced with
    # our own code that just reads lastal output and makes a list of read names

    mafSortOut = mkstempfname('.mafsort')
    with open(mafSortOut, 'wt') as outf:
        with open(lastalOut, 'rt') as inf:
            cmd = [mafSortPath, '-n2']
            log.debug('cat ' + lastalOut + ' | ' + ' '.join(cmd) + ' > ' + mafSortOut)
            subprocess.check_call(cmd, stdin=inf, stdout=outf)
    os.unlink(lastalOut)

    mafConvertOut = mkstempfname('.mafconvert')
    with open(mafConvertOut, 'wt') as outf:
        cmd = [mafConvertPath, 'tab', mafSortOut]
        log.debug(' '.join(cmd) + ' > ' + mafConvertOut)
        subprocess.check_call(cmd, stdout=outf)
    os.unlink(mafSortOut)

    filteredFastq = mkstempfname('.filtered.fastq')
    with open(filteredFastq, 'wt') as outf:
        cmd = [noBlastLikeHitsPath, '-b', mafConvertOut, '-r', inFastq, '-m', 'hit']
        log.debug(' '.join(cmd) + ' > ' + filteredFastq)
        subprocess.check_call(cmd, stdout=outf)
    os.unlink(mafConvertOut)

    with open(outList, 'wt') as outf:
        with open(filteredFastq, 'rt') as inf:
            for line_num, line in enumerate(inf):
                # the name is on the first line of each four-line record
                if line_num % 4 == 0:
                    seq_id = line.rstrip('\n\r')[1:]
                    if seq_id.endswith('/1') or seq_id.endswith('/2'):
                        seq_id = seq_id[:-2]
                    outf.write(seq_id + '\n')
    # the filtered FASTQ was the only temp file this function left behind
    os.unlink(filteredFastq)
示例#31
0
def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files.

    Reads are dealt out to the paths in outBams in order, maxReads at a
    time; the final path also receives any remaining reads.

    Raises Exception if the first header line of inBam is missing or does
    not contain 'SO:queryname'.
    '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    # maxReads = totalReadCount / num files, but round up to the nearest
    # even number in order to keep read pairs together (assuming the input
    # is sorted in query order and has no unmated reads, which can be
    # accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each", totalReadCount, len(outBams), maxReads)

    # load BAM header into memory; guard against a header with no lines so
    # the sort-order check reports the intended error, not an IndexError
    header = samtools.getHeader(inBam)
    if not header or 'SO:queryname' not in header[0]:
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    try:
        samtools.view([], inBam, bigsam)

        # split bigsam into little ones
        remaining_outputs = len(outBams)
        with util.file.open_or_gzopen(bigsam, 'rt') as inf:
            for outBam in outBams:
                remaining_outputs -= 1
                log.info("preparing file " + outBam)
                tmp_sam_reads = mkstempfname('.sam')
                try:
                    with open(tmp_sam_reads, 'wt') as outf:
                        for row in header:
                            outf.write('\t'.join(row) + '\n')
                        for _ in range(maxReads):
                            line = inf.readline()
                            if not line:
                                break
                            outf.write(line)
                        # positional test: only the true last output takes
                        # the leftovers, even if a path occurs twice
                        if remaining_outputs == 0:
                            for line in inf:
                                outf.write(line)
                    picard.execute(
                        "SamFormatConverter", [
                            'INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING'
                        ],
                        JVMmemory='512m'
                    )
                finally:
                    # do not leave the chunk SAM behind when Picard fails
                    os.unlink(tmp_sam_reads)
    finally:
        os.unlink(bigsam)
示例#32
0
def align_and_fix(inBam,
                  refFasta,
                  outBamAll=None,
                  outBamFiltered=None,
                  novoalign_options='',
                  JVMmemory=None,
                  threads=1):
    ''' Take reads, align to reference with Novoalign, mark duplicates
        with Picard, realign indels with GATK, and optionally filter
        final file to mapped/non-dupe reads.

        outBamAll: if given, receives a copy of the realigned BAM, which is
            then indexed.
        outBamFiltered: if given, receives the realigned BAM filtered with
            samtools view '-q 1 -F 1028', which is then indexed.
        novoalign_options: whitespace-separated option string for Novoalign.
        threads: passed to the GATK local realignment step only.

        Logs a warning and returns None without doing any work when neither
        output is requested.
    '''
    if not (outBamAll or outBamFiltered):
        # Logger.warn is a deprecated alias; use warning like the rest of the file
        log.warning("are you sure you meant to do nothing?")
        return

    bam_aligned = mkstempfname('.aligned.bam')
    tools.novoalign.NovoalignTool().execute(inBam,
                                            refFasta,
                                            bam_aligned,
                                            options=novoalign_options.split(),
                                            JVMmemory=JVMmemory)

    bam_mkdup = mkstempfname('.mkdup.bam')
    tools.picard.MarkDuplicatesTool().execute(
        [bam_aligned],
        bam_mkdup,
        picardOptions=['CREATE_INDEX=true'],
        JVMmemory=JVMmemory)
    os.unlink(bam_aligned)

    bam_realigned = mkstempfname('.realigned.bam')
    tools.gatk.GATKTool().local_realign(bam_mkdup,
                                        refFasta,
                                        bam_realigned,
                                        JVMmemory=JVMmemory,
                                        threads=threads)
    os.unlink(bam_mkdup)

    if outBamAll:
        shutil.copyfile(bam_realigned, outBamAll)
        tools.picard.BuildBamIndexTool().execute(outBamAll)
    if outBamFiltered:
        tools.samtools.SamtoolsTool().view(['-b', '-q', '1', '-F', '1028'],
                                           bam_realigned, outBamFiltered)
        tools.picard.BuildBamIndexTool().execute(outBamFiltered)
    os.unlink(bam_realigned)
示例#33
0
def lastal_get_hits(inFastq, db, outList):
    ''' List in outList the names of the reads in inFastq that come out of
        the lastal / maf-sort / maf-convert / noBlastLikeHits.py ('hit' mode)
        chain run against db.

        One name is written per line, taken from the FASTQ header line with
        its first character and any trailing '/1' or '/2' removed. All
        intermediate temp files are deleted before returning.
    '''
    lastalPath = tools.last.Lastal().install_and_get_path()
    mafSortPath = tools.last.MafSort().install_and_get_path()
    mafConvertPath = tools.last.MafConvert().install_and_get_path()
    noBlastLikeHitsPath = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py')

    lastalOut = mkstempfname('.lastal')
    with open(lastalOut, 'wt') as outf:
        cmd = [lastalPath, '-Q1', db, inFastq]
        log.debug(' '.join(cmd) + ' > ' + lastalOut)
        subprocess.check_call(cmd, stdout=outf)
    # everything below this point in this method should be replaced with
    # our own code that just reads lastal output and makes a list of read names

    mafSortOut = mkstempfname('.mafsort')
    with open(mafSortOut, 'wt') as outf:
        with open(lastalOut, 'rt') as inf:
            cmd = [mafSortPath, '-n2']
            log.debug('cat ' + lastalOut + ' | ' + ' '.join(cmd) + ' > ' + mafSortOut)
            subprocess.check_call(cmd, stdin=inf, stdout=outf)
    os.unlink(lastalOut)

    mafConvertOut = mkstempfname('.mafconvert')
    with open(mafConvertOut, 'wt') as outf:
        cmd = [mafConvertPath, 'tab', mafSortOut]
        log.debug(' '.join(cmd) + ' > ' + mafConvertOut)
        subprocess.check_call(cmd, stdout=outf)
    os.unlink(mafSortOut)

    filteredFastq = mkstempfname('.filtered.fastq')
    with open(filteredFastq, 'wt') as outf:
        cmd = [noBlastLikeHitsPath, '-b', mafConvertOut, '-r', inFastq, '-m', 'hit']
        log.debug(' '.join(cmd) + ' > ' + filteredFastq)
        subprocess.check_call(cmd, stdout=outf)
    os.unlink(mafConvertOut)

    with open(outList, 'wt') as outf:
        with open(filteredFastq, 'rt') as inf:
            line_num = 0
            for line in inf:
                # read names live on every fourth line, starting at the first
                if (line_num % 4) == 0:
                    seq_id = line.rstrip('\n\r')[1:]
                    if seq_id.endswith('/1') or seq_id.endswith('/2'):
                        seq_id = seq_id[:-2]
                    outf.write(seq_id + '\n')
                line_num += 1
    # previously leaked: remove the last intermediate like the earlier ones
    os.unlink(filteredFastq)
示例#34
0
def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam):
    ''' Apply deplete_method once per database in refDbs, chaining the
        output of each pass into the next, and copy the final result to
        outBam.

        deplete_method is called as deplete_method(in_bam, db, out_bam).
        With an empty refDbs, inBam is copied to outBam unchanged. inBam
        itself is never deleted.
    '''
    tmpBamIn = inBam
    for db in refDbs:
        tmpBamOut = mkstempfname('.bam')
        deplete_method(tmpBamIn, db, tmpBamOut)
        # drop the previous pass's temp output, but never the caller's input
        if tmpBamIn != inBam:
            os.unlink(tmpBamIn)
        tmpBamIn = tmpBamOut
    shutil.copyfile(tmpBamIn, outBam)
    # the last pass's temp output has been copied; remove it too
    if tmpBamIn != inBam:
        os.unlink(tmpBamIn)
示例#35
0
def main_deplete_human(args):
    ''' Run the full depletion pipeline (bmtagger, then mvicuna, then blastn),
        optionally followed by a lastal filter when both args.taxfiltBam and
        args.lastDb are set. Always returns 0.
    '''
    # RevertSam is only worth running on an aligned input; most inputs are
    # unaligned, so skipping it saves time. Per the SAM/BAM spec an aligned
    # file carries SQ header lines, which is what is tested below.
    #   https://samtools.github.io/hts-specs/SAMv1.pdf

    # use the caller's revertBam path when one was given, else a temp file
    keep_revert_bam = bool(args.revertBam)
    revertBamOut = args.revertBam if keep_revert_bam else mkstempfname('.bam')

    bamToDeplete = args.inBam
    with pysam.AlignmentFile(args.inBam, 'rb', check_sq=False) as bam:
        has_sq_lines = 'SQ' in bam.header and len(bam.header['SQ']) > 0
        if has_sq_lines:
            # looks aligned: revert it and deplete the reverted copy
            reverter = tools.picard.RevertSamTool()
            reverter.execute(
                args.inBam, revertBamOut, picardOptions=['SORT_ORDER=queryname', 'SANITIZE=true']
            )
            bamToDeplete = revertBamOut
        elif keep_revert_bam:
            # nothing to revert, but an output was requested: just touch it
            log.warning("An output was specified for 'revertBam', but the input is unaligned, so RevertSam was not needed. Touching the output.")
            util.file.touch(revertBamOut)
            # TODO: error out? run RevertSam anyway?

    resources = dict(threads=args.threads, JVMmemory=args.JVMmemory)

    multi_db_deplete_bam(bamToDeplete, args.bmtaggerDbs, deplete_bmtagger_bam, args.bmtaggerBam, **resources)

    # a revertBam the user did not ask for was only a temp file
    if not keep_revert_bam:
        os.unlink(revertBamOut)

    read_utils.rmdup_mvicuna_bam(args.bmtaggerBam, args.rmdupBam, JVMmemory=args.JVMmemory)
    multi_db_deplete_bam(args.rmdupBam, args.blastDbs, deplete_blastn_bam, args.blastnBam, **resources)

    wants_taxfilt = args.taxfiltBam and args.lastDb
    if wants_taxfilt:
        filter_lastal_bam(args.blastnBam, args.lastDb, args.taxfiltBam, JVMmemory=args.JVMmemory)
    return 0
示例#36
0
def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, JVMmemory=None):
    ''' Run deplete_method against each database in refDbs in turn, feeding
        each pass the previous pass's output, then copy the last result to
        outBam.

        deplete_method is called as
        deplete_method(in_bam, db, out_bam, JVMmemory=JVMmemory).
        If refDbs is empty, outBam is a plain copy of inBam. inBam is never
        deleted.
    '''
    tmpBamIn = inBam
    for db in refDbs:
        tmpBamOut = mkstempfname('.bam')
        deplete_method(tmpBamIn, db, tmpBamOut, JVMmemory=JVMmemory)
        # intermediate results are temp files; the original input is not
        if tmpBamIn != inBam:
            os.unlink(tmpBamIn)
        tmpBamIn = tmpBamOut
    shutil.copyfile(tmpBamIn, outBam)
    # clean up the final intermediate now that outBam holds its contents
    if tmpBamIn != inBam:
        os.unlink(tmpBamIn)
示例#37
0
def mvicuna_fastqs_to_readlist(inFastq1, inFastq2, readList):
    ''' Run M-Vicuna rmdup on a FASTQ pair and append the names of the reads
        it outputs to readList.

        Only header lines whose name ends in '/1' contribute; the name is
        written without that suffix, one per line. readList is opened in
        append mode, so existing content is kept.
    '''
    dedup_fastqs = (mkstempfname('.1.fastq'), mkstempfname('.2.fastq'))
    tools.mvicuna.MvicunaTool().rmdup((inFastq1, inFastq2), dedup_fastqs, None)

    # collect the names of the reads to keep
    with open(readList, 'at') as outf:
        for fq in dedup_fastqs:
            with util.file.open_or_gzopen(fq, 'rt') as inf:
                for idx, line in enumerate(inf):
                    # only the first line of each four-line record is a header
                    if idx % 4 != 0:
                        continue
                    read_name = line.rstrip('\n')[1:]
                    if read_name.endswith('/1'):
                        outf.write(read_name[:-2] + '\n')

    for fq in dedup_fastqs:
        os.unlink(fq)
示例#38
0
def mvicuna_fastqs_to_readlist(inFastq1, inFastq2, readList):
    ''' De-duplicate a FASTQ pair with M-Vicuna and append to readList the
        name (minus its '/1' suffix) of every output read whose header ends
        in '/1'. Headers without that suffix are ignored.
    '''
    kept1 = mkstempfname('.1.fastq')
    kept2 = mkstempfname('.2.fastq')
    mvicuna = tools.mvicuna.MvicunaTool()
    mvicuna.rmdup((inFastq1, inFastq2), (kept1, kept2), None)

    # append mode: several calls can accumulate into the same list file
    with open(readList, 'at') as outf:
        for kept_fastq in (kept1, kept2):
            with util.file.open_or_gzopen(kept_fastq, 'rt') as inf:
                for line_num, text in enumerate(inf):
                    # headers sit on lines 0, 4, 8, ... of the file
                    is_header = (line_num % 4) == 0
                    if is_header:
                        header_name = text.rstrip('\n')[1:]
                        if header_name.endswith('/1'):
                            outf.write(header_name[:-2] + '\n')
    os.unlink(kept1)
    os.unlink(kept2)
示例#39
0
def deplete_blastn_bam(inBam,
                       db,
                       outBam,
                       threads=None,
                       chunkSize=1000000,
                       JVMmemory=None):
    ''' Use blastn against db to find hits among the reads of inBam and
        write a copy of inBam filtered by that hit list to outBam.

        threads: None means every available CPU; otherwise the value is
            clamped to the range [1, available CPUs].
        chunkSize: when truthy, the reads are converted to FASTA and searched
            through blastn_chunked_fasta; when falsy, hits are read directly
            from BlastnTool.get_hits.
    '''
    hit_list = mkstempfname('.blast_hits.txt')
    if threads is not None:
        threads = max(min(util.misc.available_cpu_count(), threads), 1)
    else:
        threads = util.misc.available_cpu_count()

    if not chunkSize:
        # single blastn invocation; one read id per output line
        with open(hit_list, 'wt') as outf:
            blastn = tools.blast.BlastnTool()
            for read_id in blastn.get_hits(inBam, db, threads=threads):
                outf.write(read_id + '\n')
    else:
        # chunked search: BAM -> FASTA, blastn per chunk, concatenate results
        reads_fasta = mkstempfname('.fasta')
        tools.samtools.SamtoolsTool().bam2fa(inBam, reads_fasta)

        log.info("running blastn on %s against %s", inBam, db)
        chunk_results = blastn_chunked_fasta(reads_fasta, db, chunkSize, threads)
        util.file.cat(hit_list, chunk_results)
        os.unlink(reads_fasta)

    # filter the BAM with the collected hit list
    filter_tool = tools.picard.FilterSamReadsTool()
    filter_tool.execute(inBam, True, hit_list, outBam, JVMmemory=JVMmemory)
    os.unlink(hit_list)
示例#40
0
def deplete_bmtagger_bam(inBam, db, outBam, threads=None, JVMmemory=None):
    """
    Use bmtagger to partition the input reads into ones that match at least one
        of the databases and ones that don't match any of the databases.
    inBam: paired-end input reads in BAM format.
    db: bmtagger expects files
        db.bitmask created by bmtool, and
        db.srprism.idx, db.srprism.map, etc. created by srprism mkindex
    outBam: the output BAM files to hold the unmatched reads.
    threads: accepted for signature compatibility; not used in this body.
    JVMmemory: passed to the Picard FilterSamReads step.

    Side effect: the directories holding bmtagger and blastn are prepended
    to os.environ['PATH'] when not already present.
    """
    bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path()

    # bmtagger calls several executables in the same directory, and blastn;
    # make sure they are accessible through $PATH
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    path = os.environ['PATH'].split(os.pathsep)
    for t in (bmtaggerPath, blastnPath):
        d = os.path.dirname(t)
        if d not in path:
            path = [d] + path
    os.environ['PATH'] = os.pathsep.join(path)

    inReads1 = mkstempfname('.1.fastq')
    inReads2 = mkstempfname('.2.fastq')
    bmtaggerConf = mkstempfname('.bmtagger.conf')
    matchesFile = mkstempfname('.txt')
    tempDir = tempfile.mkdtemp()
    try:
        tools.samtools.SamtoolsTool().bam2fq(inBam, inReads1, inReads2)

        with open(bmtaggerConf, 'w') as f:
            # Default srprismopts: "-b 100000000 -n 5 -R 0 -r 1 -M 7168"
            f.write('srprismopts="-b 100000000 -n 5 -R 0 -r 1 -M 7168 --paired false"\n')

        cmdline = [
            bmtaggerPath, '-b', db + '.bitmask', '-C', bmtaggerConf, '-x', db + '.srprism', '-T', tempDir, '-q1', '-1',
            inReads1, '-2', inReads2, '-o', matchesFile
        ]
        log.debug(' '.join(cmdline))
        util.misc.run_and_print(cmdline, check=True)

        tools.picard.FilterSamReadsTool().execute(inBam, True, matchesFile, outBam, JVMmemory=JVMmemory)
    finally:
        # previously only matchesFile was removed; the FASTQ pair, the conf
        # file and bmtagger's scratch directory were left behind
        shutil.rmtree(tempDir, ignore_errors=True)
        for tmp_file in (inReads1, inReads2, bmtaggerConf, matchesFile):
            if os.path.exists(tmp_file):
                os.unlink(tmp_file)
示例#41
0
def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, threads=1, JVMmemory=None):
    ''' Deplete inBam against each database in refDbs in sequence and copy
        the final result to outBam.

        deplete_method is called as
        deplete_method(in_bam, db, out_bam, threads=threads, JVMmemory=JVMmemory).
        A pass is skipped when samtools reports the current input as empty.
        If no pass runs, outBam is a copy of inBam. inBam is never deleted.
    '''
    samtools = tools.samtools.SamtoolsTool()
    tmpBamIn = inBam
    for db in refDbs:
        if not samtools.isEmpty(tmpBamIn):
            tmpBamOut = mkstempfname('.bam')
            deplete_method(tmpBamIn, db, tmpBamOut, threads=threads, JVMmemory=JVMmemory)
            # discard the previous intermediate, never the caller's input
            if tmpBamIn != inBam:
                os.unlink(tmpBamIn)
            tmpBamIn = tmpBamOut
    shutil.copyfile(tmpBamIn, outBam)
    # the final intermediate is no longer needed once copied
    if tmpBamIn != inBam:
        os.unlink(tmpBamIn)
示例#42
0
def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, **kwargs):
    ''' Chain deplete_method over every database in refDbs and copy the
        final BAM to outBam.

        deplete_method is called as deplete_method(in_bam, db, out_bam, **kwargs).
        Passes are skipped once samtools reports the current input as empty.
        If no pass runs, outBam is a copy of inBam. inBam is never deleted.
    '''
    samtools = tools.samtools.SamtoolsTool()
    tmpBamIn = inBam
    for db in refDbs:
        if not samtools.isEmpty(tmpBamIn):
            tmpBamOut = mkstempfname('.bam')
            deplete_method(tmpBamIn, db, tmpBamOut, **kwargs)
            # only temp files created here are ever removed
            if tmpBamIn != inBam:
                os.unlink(tmpBamIn)
            tmpBamIn = tmpBamOut
    shutil.copyfile(tmpBamIn, outBam)
    # remove the last temp output, which the loop above never reaches
    if tmpBamIn != inBam:
        os.unlink(tmpBamIn)
示例#43
0
def filter_lastal(inFastq, refDb, outFastq):
    ''' Restrict input reads to those that align to the given
        reference database using LASTAL.  Also, remove duplicates with prinseq.

        inFastq: input reads.
        refDb: LASTAL database to align against.
        outFastq: output path; must end in '.fastq' because prinseq is given
            the path with its last six characters removed as '-out_good'.

        Raises AssertionError if the lastal shell pipeline exits non-zero,
        and subprocess.CalledProcessError if a later step fails.
    '''
    assert outFastq.endswith('.fastq')
    tempFilePath = mkstempfname('.hits')
    lastalPath = tools.last.Lastal().install_and_get_path()
    mafSortPath = tools.last.MafSort().install_and_get_path()
    mafConvertPath = tools.last.MafConvert().install_and_get_path()
    prinseqPath = tools.prinseq.PrinseqTool().install_and_get_path()
    noBlastLikeHitsPath = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py')

    # NOTE(review): paths are interpolated into a shell string unquoted, so
    # paths containing spaces or shell metacharacters will break this step.
    lastalCmd = ' '.join([
        '{lastalPath} -Q1 {refDb} {inFastq}'.format(lastalPath=lastalPath, refDb=refDb, inFastq=inFastq),
        '| {mafSortPath} -n2'.format(mafSortPath=mafSortPath),
        '| {mafConvertPath} tab /dev/stdin > {tempFilePath}'.format(mafConvertPath=mafConvertPath, tempFilePath=tempFilePath),
    ])
    log.debug(lastalCmd)
    # Run the pipeline outside of an assert statement: under `python -O`
    # asserts are stripped, which used to skip the command entirely.
    # AssertionError is kept as the failure type for existing callers.
    exit_status = os.system(lastalCmd)
    if exit_status:
        raise AssertionError('lastal pipeline failed with status %s: %s' % (exit_status, lastalCmd))

    # filter inFastq against lastal hits
    filteredFastq = mkstempfname('.filtered.fastq')
    with open(filteredFastq, 'wt') as outf:
        noBlastLikeHitsCmd = [noBlastLikeHitsPath, '-b', tempFilePath, '-r', inFastq, '-m', 'hit']
        log.debug(' '.join(noBlastLikeHitsCmd) + ' > ' + filteredFastq)
        subprocess.check_call(noBlastLikeHitsCmd, stdout=outf)
    # the hits table has been consumed; it used to be left behind
    os.unlink(tempFilePath)

    # remove duplicate reads and reads with multiple Ns
    if os.path.getsize(filteredFastq) == 0:
        # prinseq-lite fails on empty file input (which can happen in real life
        # if no reads match the refDb) so handle this scenario specially
        log.info("output is empty: no reads in input match refDb")
        shutil.copyfile(filteredFastq, outFastq)
    else:
        prinseqCmd = [
            'perl', prinseqPath, '-ns_max_n', '1', '-derep', '1', '-fastq', filteredFastq, '-out_bad', 'null',
            '-line_width', '0', '-out_good', outFastq[:-6]
        ]
        log.debug(' '.join(prinseqCmd))
        subprocess.check_call(prinseqCmd)
    os.unlink(filteredFastq)
示例#44
0
def run_blastn(blastn_path, db, input_fasta, blast_threads=1):
    """ Run blastn on input_fasta against db and return the path of a new
        temp file holding the hits (-outfmt 6). Meant to be run in parallel.

        input_fasta is deleted once blastn has finished.
    """
    hits_path = mkstempfname('.hits.txt')

    search_opts = ['-db', db, '-word_size', '16', '-num_threads', str(blast_threads), '-evalue', '1e-6']
    report_opts = ['-outfmt', '6', '-max_target_seqs', '2']
    io_opts = ['-query', input_fasta, '-out', hits_path]
    command = [blastn_path] + search_opts + report_opts + io_opts

    log.debug(' '.join(command))
    util.misc.run_and_print(command, check=True)

    # the query file is consumed by this call
    os.unlink(input_fasta)
    return hits_path
示例#45
0
def deplete_blastn(inFastq, outFastq, refDbs, threads=1, chunkSize=1000000):
    ''' Use blastn to remove reads that match at least one of the databases.

        inFastq: input reads.
        outFastq: receives the reads selected by no_blast_hits.
        refDbs: blastn databases, searched one after another.
        threads, chunkSize: forwarded to blastn_chunked_fasta.
    '''
    # Convert to fasta
    inFasta = mkstempfname('.fasta')
    read_utils.fastq_to_fasta(inFastq, inFasta)

    # Run blastn using each of the databases in turn
    blastOutFiles = []
    for db in refDbs:
        log.info("running blastn on %s against %s", inFastq, db)
        blastOutFiles += blastn_chunked_fasta(inFasta, db, chunkSize, threads)

    # Combine results from different databases. Done in-process rather than
    # with `cat`: with no databases `cat` would get no file arguments and
    # block reading the inherited stdin.
    blastOutCombined = mkstempfname('.txt')
    log.debug('concatenating %d blastn result files into %s', len(blastOutFiles), blastOutCombined)
    with open(blastOutCombined, 'wb') as outf:
        for hits_file in blastOutFiles:
            with open(hits_file, 'rb') as inf:
                for line in inf:
                    outf.write(line)

    # extract reads with no blast hits
    no_blast_hits(blastOutCombined, inFastq, outFastq)

    # remove the intermediates, which were previously left on disk
    for tmp_file in [inFasta, blastOutCombined] + blastOutFiles:
        if os.path.exists(tmp_file):
            os.unlink(tmp_file)
示例#46
0
def deplete_blastn_bam(inBam,
                       db,
                       outBam,
                       threads,
                       chunkSize=1000000,
                       JVMmemory=None):
    ''' Use blastn to remove reads that match at least one of the databases.

        inBam: input reads.
        db: blastn database to search.
        outBam: receives inBam filtered by the list of blastn hit names.
        threads, chunkSize: forwarded to blastn_chunked_fasta.
        JVMmemory: passed to the Picard FilterSamReads step.
    '''
    fastq1 = mkstempfname('.1.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    # An extra '.hits.txt' temp file used to be created here; it was never
    # used or deleted because the loop variable below shadowed it.

    # Initial BAM -> FASTQ
    tools.samtools.SamtoolsTool().bam2fq(inBam, fastq1)

    # Find BLAST hits
    read_utils.fastq_to_fasta(fastq1, fasta)
    os.unlink(fastq1)
    log.info("running blastn on %s against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize, threads)
    with open(blast_hits, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    # first tab-separated column is the query (read) name
                    idVal = line.split('\t')[0].strip()
                    if idVal.endswith('/1') or idVal.endswith('/2'):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)
    os.unlink(fasta)

    # Deplete BAM of hits
    tools.picard.FilterSamReadsTool().execute(inBam,
                                              True,
                                              blast_hits,
                                              outBam,
                                              JVMmemory=JVMmemory)
    os.unlink(blast_hits)
示例#47
0
def run_blastn(blastn_path, db, input_fasta, blast_threads=1):
    """ Search input_fasta against db with blastn, writing the hits
        (-outfmt 6) to a fresh temp file whose path is returned. Intended
        to be run in parallel.

        The input FASTA is removed after the search.
    """
    result_file = mkstempfname('.hits.txt')

    # flag/value pairs in the order they appear on the command line
    option_pairs = (
        ('-db', db),
        ('-word_size', '16'),
        ('-num_threads', str(blast_threads)),
        ('-evalue', '1e-6'),
        ('-outfmt', '6'),
        ('-max_target_seqs', '2'),
        ('-query', input_fasta),
        ('-out', result_file),
    )
    argv = [blastn_path]
    for flag, value in option_pairs:
        argv.extend((flag, value))

    log.debug(' '.join(argv))
    util.misc.run_and_print(argv, check=True)

    os.unlink(input_fasta)
    return result_file
示例#48
0
def deplete_blastn(inFastq, outFastq, refDbs, threads=1, chunkSize=1000000):
    ''' Use blastn to remove reads that match at least one of the databases.

        The reads in inFastq are converted to FASTA and searched against
        every database in refDbs through blastn_chunked_fasta (which
        receives chunkSize and threads). The combined hits are passed to
        no_blast_hits, which writes outFastq.
    '''
    # Convert to fasta
    inFasta = mkstempfname('.fasta')
    read_utils.fastq_to_fasta(inFastq, inFasta)

    # Run blastn using each of the databases in turn
    blastOutFiles = []
    for db in refDbs:
        log.info("running blastn on %s against %s", inFastq, db)
        blastOutFiles += blastn_chunked_fasta(inFasta, db, chunkSize, threads)

    # Combine results from different databases without shelling out to
    # `cat`, which blocks on stdin when given no files (empty refDbs)
    blastOutCombined = mkstempfname('.txt')
    log.debug('merging %d blastn result files into %s', len(blastOutFiles), blastOutCombined)
    with open(blastOutCombined, 'wb') as merged:
        for part_path in blastOutFiles:
            with open(part_path, 'rb') as part:
                for line in part:
                    merged.write(line)

    # extract reads with no blast hits
    no_blast_hits(blastOutCombined, inFastq, outFastq)

    # none of the intermediates were being removed before
    leftovers = [inFasta, blastOutCombined]
    leftovers.extend(blastOutFiles)
    for leftover in leftovers:
        if os.path.exists(leftover):
            os.unlink(leftover)
示例#49
0
def deplete_blastn_bam(inBam, db, outBam, chunkSize=1000000, JVMmemory=None):
    """Use blastn to remove reads that match the database from a BAM file.

    The depletion runs in two passes so that the second blastn search works
    on an already reduced read set:

    1. blastn the first-of-pair reads of ``inBam`` and filter the hit read
       IDs out of ``inBam`` into an intermediate BAM;
    2. blastn the second-of-pair reads of the intermediate BAM and filter
       those hit read IDs out into ``outBam``.

    Args:
        inBam: input reads in BAM format.
        db: BLAST database to search.
        outBam: output BAM file.
        chunkSize: forwarded to ``blastn_chunked_fasta``.
        JVMmemory: forwarded to the Picard FilterSamReads wrapper.
    """
    fastq1 = mkstempfname('.1.fastq')
    fastq2 = mkstempfname('.2.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    halfBam = mkstempfname('.half.bam')

    # Initial BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(inBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ1
    read_utils.fastq_to_fasta(fastq1, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 1 against %s", inBam, db)
    _collect_blast_hit_ids(blastn_chunked_fasta(fasta, db, chunkSize), blast_hits)

    # Deplete BAM of hits in FASTQ1
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam, JVMmemory=JVMmemory)

    # Depleted BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(halfBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ2 (which is already smaller than before)
    read_utils.fastq_to_fasta(fastq2, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 2 against %s", inBam, db)
    _collect_blast_hit_ids(blastn_chunked_fasta(fasta, db, chunkSize), blast_hits)

    # Deplete BAM of hits against FASTQ2
    tools.picard.FilterSamReadsTool().execute(halfBam, True, blast_hits, outBam, JVMmemory=JVMmemory)

    # Clean up
    for fn in (fasta, blast_hits, halfBam):
        os.unlink(fn)


def _collect_blast_hit_ids(blastOutFiles, hitsFile):
    """Write the query ID of every blastn result line to hitsFile, one per line.

    The ID is the first tab-separated column of each line; a trailing ``/1``
    or ``/2`` mate suffix is removed.  ``hitsFile`` is overwritten, and each
    file in ``blastOutFiles`` is deleted once it has been read.
    """
    with open(hitsFile, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    idVal = line.split('\t')[0].strip()
                    if idVal.endswith(('/1', '/2')):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)
示例#50
0
def deplete_blastn_bam(inBam, db, outBam, threads, chunkSize=1000000, JVMmemory=None):
    """Use blastn to remove reads that match the database from a BAM file.

    The depletion runs in two passes so that the second blastn search works
    on an already reduced read set:

    1. blastn the first-of-pair reads of ``inBam`` and filter the hit read
       IDs out of ``inBam`` into an intermediate BAM;
    2. blastn the second-of-pair reads of the intermediate BAM and filter
       those hit read IDs out into ``outBam``.

    Args:
        inBam: input reads in BAM format.
        db: BLAST database to search.
        outBam: output BAM file.
        threads: forwarded to ``blastn_chunked_fasta``.
        chunkSize: forwarded to ``blastn_chunked_fasta``.
        JVMmemory: forwarded to the Picard FilterSamReads wrapper.
    """
    fastq1 = mkstempfname('.1.fastq')
    fastq2 = mkstempfname('.2.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    halfBam = mkstempfname('.half.bam')

    # Initial BAM -> FASTQ pair
    tools.samtools.SamtoolsTool().bam2fq(inBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ1
    read_utils.fastq_to_fasta(fastq1, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 1 against %s", inBam, db)
    _collect_blast_hit_ids(blastn_chunked_fasta(fasta, db, chunkSize, threads), blast_hits)

    # Deplete BAM of hits in FASTQ1
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam, JVMmemory=JVMmemory)

    # Depleted BAM -> FASTQ pair
    tools.samtools.SamtoolsTool().bam2fq(halfBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ2 (which is already smaller than before)
    read_utils.fastq_to_fasta(fastq2, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 2 against %s", inBam, db)
    _collect_blast_hit_ids(blastn_chunked_fasta(fasta, db, chunkSize, threads), blast_hits)

    # Deplete BAM of hits against FASTQ2
    tools.picard.FilterSamReadsTool().execute(halfBam, True, blast_hits, outBam, JVMmemory=JVMmemory)

    # Clean up
    for fn in (fasta, blast_hits, halfBam):
        os.unlink(fn)


def _collect_blast_hit_ids(blastOutFiles, hitsFile):
    """Write the query ID of every blastn result line to hitsFile, one per line.

    The ID is the first tab-separated column of each line; a trailing ``/1``
    or ``/2`` mate suffix is removed.  ``hitsFile`` is overwritten, and each
    file in ``blastOutFiles`` is deleted once it has been read.
    """
    with open(hitsFile, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    idVal = line.split('\t')[0].strip()
                    if idVal.endswith(('/1', '/2')):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)
示例#51
0
def deplete_bmtagger_bam(inBam, db, outBam):
    """
    Use bmtagger to find the reads of inBam that match the database and
    write the reads that do not match to outBam.

    inBam: paired-end input reads in BAM format.
    db: bmtagger expects files
        db.bitmask created by bmtool, and
        db.srprism.idx, db.srprism.map, etc. created by srprism mkindex
    outBam: the output BAM file to hold the unmatched reads.

    Side effect: the directories containing bmtagger and blastn are prepended
    to os.environ['PATH'] for the rest of the process.
    """
    # Local import: only needed to remove bmtagger's scratch directory.
    import shutil

    bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path()

    # bmtagger calls several executables in the same directory, and blastn;
    # make sure they are accessible through $PATH
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    path = os.environ['PATH'].split(os.pathsep)
    for t in (bmtaggerPath, blastnPath):
        d = os.path.dirname(t)
        if d not in path:
            path = [d] + path
    os.environ['PATH'] = os.pathsep.join(path)

    inReads1 = mkstempfname('.1.fastq')
    inReads2 = mkstempfname('.2.fastq')
    tools.picard.SamToFastqTool().execute(inBam, inReads1, inReads2)

    tempDir = tempfile.mkdtemp()
    matchesFile = mkstempfname('.txt')
    cmdline = [bmtaggerPath,
               '-b', db + '.bitmask', '-x', db + '.srprism', '-T', tempDir,
               '-q1', '-1', inReads1, '-2', inReads2,
               '-o', matchesFile]
    log.debug(' '.join(cmdline))
    try:
        subprocess.check_call(cmdline)
    finally:
        # The FASTQ copies of the reads and bmtagger's scratch directory are
        # only inputs/workspace for the call above; remove them whether or
        # not it succeeded so they do not accumulate in the temp area.
        os.unlink(inReads1)
        os.unlink(inReads2)
        shutil.rmtree(tempDir, ignore_errors=True)

    # Drop the reads listed by bmtagger from the original BAM.
    tools.picard.FilterSamReadsTool().execute(inBam, True, matchesFile, outBam)
    os.unlink(matchesFile)
示例#52
0
def purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2, regex=r'^@(\S+)/[1|2]$'):
    """Purge unmated reads with the mergeShuffledFastqSeqs.pl script.

    Reads that have a mate end up in the same order in both output files.
    Corresponding sequences must have sequence identifiers of the form
    SEQID/1 and SEQID/2.

    Args:
        inFastq1, inFastq2: input FASTQ files (passed as -f1 / -f2).
        outFastq1, outFastq2: destinations for the script's two outputs.
        regex: pattern handed to the script through -r.

    Returns:
        Always 0.
    """
    out_prefix = mkstempfname()
    script_path = os.path.join(util.file.get_scripts_path(), 'mergeShuffledFastqSeqs.pl')

    command = [script_path, '-t', '-r', regex]
    command += ['-f1', inFastq1, '-f2', inFastq2]
    command += ['-o', out_prefix]
    log.debug(' '.join(command))
    subprocess.check_call(command)

    # The script writes <prefix>.1.fastq and <prefix>.2.fastq; move each one
    # to the location the caller asked for.
    for suffix, destination in (('.1.fastq', outFastq1), ('.2.fastq', outFastq2)):
        shutil.move(out_prefix + suffix, destination)
    return 0
示例#53
0
def purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2, regex=r'^@(\S+)/[1|2]$'):
    """Drop reads without a mate and put the remaining pairs in matching order.

    The work is delegated to the mergeShuffledFastqSeqs.pl script.  Mates are
    expected to carry sequence identifiers of the form SEQID/1 and SEQID/2.

    Args:
        inFastq1, inFastq2: input FASTQ files (passed as -f1 / -f2).
        outFastq1, outFastq2: where the script's two output files are moved.
        regex: pattern handed to the script through -r.

    Returns:
        Always 0.
    """
    prefix = mkstempfname()
    scripts_dir = util.file.get_scripts_path()
    merge_script = os.path.join(scripts_dir, 'mergeShuffledFastqSeqs.pl')

    args = [
        merge_script,
        '-t',
        '-r', regex,
        '-f1', inFastq1,
        '-f2', inFastq2,
        '-o', prefix,
    ]
    log.debug(' '.join(args))
    subprocess.check_call(args)

    # Relocate the script's outputs, mate 1 first and then mate 2.
    produced = (prefix + '.1.fastq', prefix + '.2.fastq')
    wanted = (outFastq1, outFastq2)
    for src, dst in zip(produced, wanted):
        shutil.move(src, dst)
    return 0
示例#54
0
def deplete_blastn(inFastq, outFastq, refDbs):
    """Use blastn to remove reads that match at least one of the databases.

    Args:
        inFastq: input reads in FASTQ format.
        outFastq: output file; receives the stdout of noBlastHits_v3.py run
            in 'nohit' mode on ``inFastq`` and the combined blastn results.
        refDbs: sequence of BLAST databases to search, one after another.
            May be empty, in which case the combined hits file is empty.
    """
    # Local import: only needed for the in-process concatenation below.
    import shutil

    ## Get tools
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    noBlastHits_v3Path = os.path.join(util.file.get_scripts_path(),
                                      'noBlastHits_v3.py')

    ## Convert to fasta
    inFasta = mkstempfname('.fasta')
    read_utils.fastq_to_fasta(inFastq, inFasta)

    ## Run blastn using each of the databases in turn
    blastOutFiles = [mkstempfname() for _ in refDbs]
    for db, blastOutFile in zip(refDbs, blastOutFiles):
        log.info("running blastn on %s against %s", inFastq, db)
        blastnCmd = [blastnPath, '-db', db,
                     '-word_size', '16', '-evalue', '1e-6', '-outfmt', '6',
                     '-num_descriptions', '2', '-num_alignments', '2',
                     '-query', inFasta, '-out', blastOutFile]
        log.debug(' '.join(blastnCmd))
        subprocess.check_call(blastnCmd)
    os.unlink(inFasta)

    ## Combine results from different databases.  This is done in-process
    ## rather than by spawning `cat`: with no databases `cat` would be started
    ## without file arguments and block forever reading stdin.
    blastOutCombined = mkstempfname('.txt')
    log.debug("concatenating %d blastn output file(s) into %s", len(blastOutFiles), blastOutCombined)
    with open(blastOutCombined, 'wb') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rb') as inf:
                shutil.copyfileobj(inf, outf)
            # Each per-database result is only needed until it is appended.
            os.unlink(blastOutFile)

    ## run noBlastHits_v3.py to extract reads with no blast hits
    # TODO: slurp the small amount of code in this script into here
    noBlastHitsCmd = ['python', noBlastHits_v3Path, '-b', blastOutCombined,
                      '-r', inFastq, '-m', 'nohit']
    log.debug(' '.join(noBlastHitsCmd) + '> ' + outFastq)
    with util.file.open_or_gzopen(outFastq, 'wt') as outf:
        subprocess.check_call(noBlastHitsCmd, stdout=outf)
    os.unlink(blastOutCombined)
示例#55
0
def deplete_bmtagger_bam(inBam, db, outBam, JVMmemory=None):
    """
    Use bmtagger to find the reads of inBam that match the database and
    write the reads that do not match to outBam.

    inBam: paired-end input reads in BAM format.
    db: bmtagger expects files
        db.bitmask created by bmtool, and
        db.srprism.idx, db.srprism.map, etc. created by srprism mkindex
    outBam: the output BAM file to hold the unmatched reads.
    JVMmemory: forwarded to the Picard FilterSamReads wrapper.

    Side effect: the directories containing bmtagger and blastn are prepended
    to os.environ['PATH'] for the rest of the process.
    """
    # Local import: only needed to remove bmtagger's scratch directory.
    import shutil

    bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path()

    # bmtagger calls several executables in the same directory, and blastn;
    # make sure they are accessible through $PATH
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    path = os.environ['PATH'].split(os.pathsep)
    for t in (bmtaggerPath, blastnPath):
        d = os.path.dirname(t)
        if d not in path:
            path = [d] + path
    os.environ['PATH'] = os.pathsep.join(path)

    inReads1 = mkstempfname('.1.fastq')
    inReads2 = mkstempfname('.2.fastq')
    tools.picard.SamToFastqTool().execute(inBam, inReads1, inReads2)

    tempDir = tempfile.mkdtemp()
    matchesFile = mkstempfname('.txt')
    cmdline = [bmtaggerPath, '-b', db + '.bitmask', '-x', db + '.srprism', '-T', tempDir, '-q1', '-1', inReads1, '-2',
               inReads2, '-o', matchesFile]
    log.debug(' '.join(cmdline))
    try:
        subprocess.check_call(cmdline)
    finally:
        # The FASTQ copies of the reads and bmtagger's scratch directory are
        # only inputs/workspace for the call above; remove them whether or
        # not it succeeded so they do not accumulate in the temp area.
        os.unlink(inReads1)
        os.unlink(inReads2)
        shutil.rmtree(tempDir, ignore_errors=True)

    # Drop the reads listed by bmtagger from the original BAM.
    tools.picard.FilterSamReadsTool().execute(inBam, True, matchesFile, outBam, JVMmemory=JVMmemory)
    os.unlink(matchesFile)
示例#56
0
def filter_lastal_bam(inBam,
                      db,
                      outBam,
                      max_gapless_alignments_per_position=1,
                      min_length_for_initial_matches=5,
                      max_length_for_initial_matches=50,
                      max_initial_matches_per_position=100,
                      JVMmemory=None):
    """Restrict the reads of a BAM file to those that LASTAL aligns to db.

    Args:
        inBam: input reads in BAM format.
        db: reference database handed to ``lastal_get_hits``.
        outBam: output BAM file.
        max_gapless_alignments_per_position, min_length_for_initial_matches,
        max_length_for_initial_matches, max_initial_matches_per_position:
            tuning values forwarded unchanged to ``lastal_get_hits``.
        JVMmemory: forwarded to the Picard FilterSamReads wrapper.
    """
    # Dump the reads of the BAM into a single FASTQ file.
    all_reads_fastq = mkstempfname('.all.fastq')
    tools.samtools.SamtoolsTool().bam2fq(inBam, all_reads_fastq)

    # Collect the hits for those reads; the FASTQ is not needed afterwards.
    raw_hits = mkstempfname('.hits')
    lastal_get_hits(all_reads_fastq,
                    db,
                    raw_hits,
                    max_gapless_alignments_per_position,
                    min_length_for_initial_matches,
                    max_length_for_initial_matches,
                    max_initial_matches_per_position)
    os.unlink(all_reads_fastq)

    # Sort the hit list and drop duplicate entries.
    unique_hits = mkstempfname('.hits')
    dedup_cmd = ['sort', '-u', raw_hits]
    with open(unique_hits, 'wt') as sorted_out:
        subprocess.check_call(dedup_cmd, stdout=sorted_out)
    os.unlink(raw_hits)

    # Filter the original BAM against the list of reads to keep.
    read_filter = tools.picard.FilterSamReadsTool()
    read_filter.execute(inBam, False, unique_hits, outBam, JVMmemory=JVMmemory)
    os.unlink(unique_hits)
示例#57
0
def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, clipFasta):
    """Trim paired-end read sequences with Trimmomatic.

    Args:
        inFastq1, inFastq2: paired input FASTQ files.
        pairedOutFastq1, pairedOutFastq2: outputs for reads whose mate also
            survived trimming.
        clipFasta: FASTA file used in the ILLUMINACLIP step.

    The unpaired outputs that Trimmomatic also produces are written to
    temporary files and deleted once the run has succeeded.
    """
    trimmomaticPath = tools.trimmomatic.TrimmomaticTool().install_and_get_path()
    tmpUnpaired1 = mkstempfname()
    tmpUnpaired2 = mkstempfname()

    # the conda version wraps the jar file with a shell script
    if trimmomaticPath.endswith(".jar"):
        # tempfile.gettempdir() is used rather than the module attribute
        # tempfile.tempdir: the attribute stays None until gettempdir() has
        # been called, and concatenating None would raise TypeError.
        javaCmd = ['java', '-Xmx2g', '-Djava.io.tmpdir=' + tempfile.gettempdir(), '-classpath', trimmomaticPath,
                   'org.usadellab.trimmomatic.TrimmomaticPE']
    else:
        javaCmd = [trimmomaticPath, "PE"]

    # Positional arguments are inputs, then paired/unpaired output for each
    # mate, followed by the trimming steps.
    javaCmd.extend([inFastq1, inFastq2, pairedOutFastq1, tmpUnpaired1,
                    pairedOutFastq2, tmpUnpaired2, 'LEADING:20', 'TRAILING:20', 'SLIDINGWINDOW:4:25', 'MINLEN:30',
                    'ILLUMINACLIP:{}:2:30:12'.format(clipFasta)])

    log.debug(' '.join(javaCmd))
    subprocess.check_call(javaCmd)
    os.unlink(tmpUnpaired1)
    os.unlink(tmpUnpaired2)
示例#58
0
def deplete_bmtagger(inFastq1, inFastq2, databases, outFastq1, outFastq2):
    """
    Use bmtagger to partition the input reads into ones that match at least one
        of the databases and ones that don't match any of the databases.
    inFastq1, inFastq2: paired-end input reads in fastq format
        The names of the reads must be in one-to-one correspondence.
    databases: for each db in databases bmtagger expects files
        db.bitmask created by bmtool, and
        db.srprism.idx, db.srprism.map, etc. created by srprism mkindex
    outFastq1, outFastq2: pair of output fastq files depleted of reads present
        in the databases
    This version is optimized for the case of only requiring depletion, which
    allows us to avoid time-intensive lookups.

    The databases are applied one after another: the output pair of one
    bmtagger run is the input pair of the next.  With no databases the inputs
    are copied to the outputs unchanged.

    Side effect: the directories containing bmtagger and blastn are prepended
    to os.environ['PATH'] for the rest of the process.
    """
    bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path()
    blastnPath = tools.blast.BlastnTool().install_and_get_path()

    # bmtagger calls several executables in the same directory, and blastn;
    # make sure they are accessible through $PATH
    path = os.environ['PATH'].split(os.pathsep)
    for t in (bmtaggerPath, blastnPath):
        d = os.path.dirname(t)
        if d not in path:
            path = [d] + path
    os.environ['PATH'] = os.pathsep.join(path)

    tempDir = tempfile.mkdtemp()
    try:
        curReads1, curReads2 = inFastq1, inFastq2
        tempfiles = []
        for db in databases:
            outprefix = mkstempfname()
            cmdline = [
                bmtaggerPath, '-X', '-b', db + '.bitmask', '-x', db + '.srprism',
                '-T', tempDir, '-q1', '-1', curReads1, '-2', curReads2, '-o',
                outprefix
            ]
            log.debug(' '.join(cmdline))
            util.misc.run_and_print(cmdline, check=True)
            # This run's outputs become the next run's inputs.
            curReads1, curReads2 = outprefix + '_1.fastq', outprefix + '_2.fastq'
            tempfiles += [curReads1, curReads2]
        shutil.copyfile(curReads1, outFastq1)
        shutil.copyfile(curReads2, outFastq2)
        for fn in tempfiles:
            os.unlink(fn)
    finally:
        # bmtagger's scratch directory was previously left behind; remove it
        # whether or not the depletion succeeded.
        shutil.rmtree(tempDir, ignore_errors=True)
    log.debug("deplete_bmtagger complete")
示例#59
0
def filter_lastal(inFastq, refDbs, outFastq):
    """
    Run lastal on inFastq against refDbs, then pipe the output of
    noBlastLikeHits.py (run in 'hit' mode on those results) through prinseq.

    inFastq: input reads in fastq format.
    refDbs: inserted verbatim into the lastal command line.
    outFastq: output file name; must end with '.fastq'.  The suffix is
        stripped before the name is given to prinseq's -out_good
        (presumably prinseq appends the extension itself -- confirm).

    Raises AssertionError if outFastq has the wrong suffix or if either shell
    pipeline exits with a non-zero status.

    TODO: make this operate on BAM files
    """
    # The checks in this function are explicit `if ...: raise` statements
    # instead of `assert` statements: under `python -O` asserts are removed,
    # which would silently skip the suffix check and -- because the
    # os.system() calls lived inside the asserts -- never run the pipelines.
    # AssertionError is kept so the failure type seen by callers is unchanged.
    if not outFastq.endswith('.fastq'):
        raise AssertionError("outFastq must end with '.fastq': %r" % (outFastq,))
    outFastq = outFastq[:-6]
    tempFilePath = mkstempfname()
    lastalPath = tools.last.Lastal().install_and_get_path()
    mafSortPath = tools.last.MafSort().install_and_get_path()
    mafConvertPath = tools.last.MafConvert().install_and_get_path()
    prinseqPath = tools.prinseq.PrinseqTool().install_and_get_path()
    noBlastLikeHitsPath = os.path.join(util.file.get_scripts_path(),
                                       'noBlastLikeHits.py')

    # each pipe separated cmd gets own line
    lastalCmd = ' '.join([
        '{lastalPath} -Q1 {refDbs} {inFastq}'.format(
            lastalPath=lastalPath, refDbs=refDbs, inFastq=inFastq),
        '| {mafSortPath} -n2'.format(mafSortPath=mafSortPath),
        '| {mafConvertPath} tab /dev/stdin > {tempFilePath}'.format(
            mafConvertPath=mafConvertPath, tempFilePath=tempFilePath),
        ])
    log.debug(lastalCmd)
    lastalStatus = os.system(lastalCmd)
    if lastalStatus:
        raise AssertionError(
            'command exited with status %s: %s' % (lastalStatus, lastalCmd))

    # each option/flag on own line
    noBlastLikeHitsCmd = ' '.join([
        'python', noBlastLikeHitsPath,
            '-b', tempFilePath,
            '-r', inFastq,
            '-m hit' ])

    prinseqCmd = ' '.join([
        'perl', prinseqPath,
            '-ns_max_n 1',
            '-derep 1',
            '-fastq stdin',
            '-out_bad null',
            '-line_width 0',
            '-out_good', outFastq
        ])

    fullCmd = "{noBlastLikeHitsCmd} | {prinseqCmd}".format(
        noBlastLikeHitsCmd=noBlastLikeHitsCmd, prinseqCmd=prinseqCmd)
    log.debug(fullCmd)
    fullStatus = os.system(fullCmd)
    if fullStatus:
        raise AssertionError(
            'command exited with status %s: %s' % (fullStatus, fullCmd))

    log.debug("done")