Пример #1
0
def discover(flavor, out, records, sampleLabel, pipeline, numThreads, gtf):
    """Write the transcript-discovery step for one sample to a job script.

    The output directory ``<pipeline>/06-discover/<sampleLabel>`` is created
    immediately; the discovery command itself is only appended to ``out``.

    Args:
        flavor: discovery tool to use; only ``'cufflinks'`` is supported.
        out: writable file-like object receiving the shell commands.
        records: unused; kept so existing callers keep working.
        sampleLabel: label of the form ``'<sample>#<replicate>'``; it names
            both the input BAM (``04-aligned/<sampleLabel>.bam``) and the
            output directory.
        pipeline: root directory of the pipeline outputs.
        numThreads: thread count as a string (it is concatenated as-is).
        gtf: unused; kept so existing callers keep working.

    Raises:
        ValueError: if ``sampleLabel`` does not split into exactly two parts
            on ``'#'``, or if ``flavor`` is not recognized.
    """
    mu.logTime(out, 'START DISCOVER')

    # The two parts are not used below; the unpacking is kept because it
    # rejects labels that are not of the form '<sample>#<replicate>'.
    sample, replicate = sampleLabel.split('#')

    outDir = pipeline + '/06-discover/' + sampleLabel
    subprocess.check_call('mkdir -p ' + pipeline + '/06-discover', shell=True)
    subprocess.check_call('mkdir -p ' + outDir, shell=True)
    out.write('echo discover \n')

    if flavor == 'cufflinks':
        bam = pipeline + '/04-aligned/' + sampleLabel + '.bam'
        out.write(
            'cufflinks -p ' + numThreads +
            ' --max-bundle-frags 10000000 --library-type fr-unstranded -o ' +
            outDir + ' ' + bam + '\n')
        out.write('\n')
    else:
        # Raising a plain string is itself a TypeError on Python >= 2.6, so
        # raise a real exception that also names the offending flavor.
        raise ValueError('DONT RECOGNIZE DISCOVER FLAVOR: %r' % (flavor,))

    mu.logTime(out, 'FINISH DISCOVER')
Пример #2
0
def clean(pars):
    """Queue the ``rm -f`` commands that remove one sample's intermediates.

    Nothing is deleted here: every command is handed to ``mu.writeCmd``
    together with ``pars['out']``.  The targets depend on ``pars['flavor']``:

    * ``'02-reads'`` -- ``02-reads/fastq/*-<sampleReplicate>*.fastq``.
    * ``'gdc'`` -- the downloaded file under ``00-downloads`` plus
      ``pars['bamFile']`` and its ``.bai`` companion.
    * ``'standard'``, ``'sequencingCore'``, ``'custom1'`` --
      ``02-reads/fastq/*<sampleReplicate>*.fastq`` and the combined
      ``03-combined/fastq/<sampleReplicate>*.fastq.gz``.
    * ``'rsem'`` -- the two patterns above plus the sample's
      ``.transcript.bam`` under ``04-aligned``.

    Any other flavor only produces the START/FINISH ``mu.logTime`` calls.
    """
    out = pars['out']
    mu.logTime(out, 'START CLEAN')

    flavor = pars['flavor']

    if flavor == '02-reads':
        # Only this flavor puts a '-' directly before the sample name.
        mu.writeCmd(out, 'rm -f ' + pars['pipeline'] + '/02-reads/fastq/*-' +
                    pars['sampleReplicate'] + '*.fastq')

    elif flavor == 'gdc':
        mu.writeCmd(out, 'rm -f ' + pars['pipeline'] + '/00-downloads/' +
                    pars['sampleReplicate'] + '/' + pars['fileName'])
        for suffix in ('', '.bai'):
            mu.writeCmd(out, 'rm -f ' + pars['bamFile'] + suffix)

    elif flavor in ('standard', 'sequencingCore', 'custom1', 'rsem'):
        root = pars['pipeline']
        name = pars['sampleReplicate']
        targets = [
            root + '/02-reads/fastq/*' + name + '*.fastq',
            root + '/03-combined/fastq/' + name + '*.fastq.gz',
        ]
        if flavor == 'rsem':
            targets.append(root + '/04-aligned/' + name + '/' + name +
                           '.transcript.bam')
        # The sorted BAM and its index are not removed for these flavors.
        for target in targets:
            mu.writeCmd(out, 'rm -f ' + target)

    mu.logTime(out, 'FINISH CLEAN')
Пример #3
0
def quantify(pars):
    """Append the quantification step for one sample to the job script.

    ``<pipeline>/05-quantified/<sampleReplicate>`` is created right away.
    The commands written to ``pars['out']`` depend on ``pars['flavor']``:

    * ``'salmon'``, ``'salmon-bias'``, ``'salmon-bias-stranded'`` -- copy
      ``quant.sf`` out of the sample's ``04-aligned`` directory.
    * ``'stringtie'`` -- run stringtie on ``pars['bamFile']`` with
      ``pars['gtf']``.
    * ``'cufflinks'`` -- run cufflinks (library type chosen from
      ``pars['stranded']``) and copy its two ``fpkm_tracking`` tables.
    * ``'rsem'`` / ``'kallisto'`` -- copy the result tables out of the
      sample's ``04-aligned`` directory.

    Any other flavor writes only the ``echo quantify`` line.
    """
    out = pars['out']
    mu.logTime(out, 'START QUANTIFY')

    quantDir = pars['pipeline'] + '/05-quantified'
    sampleBase = quantDir + '/' + pars['sampleReplicate']
    for directory in (quantDir, sampleBase):
        subprocess.check_call('mkdir -p ' + directory, shell=True)
    out.write('echo quantify \n')

    flavor = pars['flavor']
    alignedDir = pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate']

    if flavor in ('salmon', 'salmon-bias', 'salmon-bias-stranded'):
        mu.writeCmd(out, 'cp ' + alignedDir + '/quant.sf ' + sampleBase +
                    '-abundance.txt')
        mu.writeCmd(out, ' ')

    elif flavor == 'stringtie':
        stem = sampleBase + '-stringtie-'
        # '-e' appears twice, exactly as in the command emitted so far.
        tokens = [
            'stringtie', pars['bamFile'],
            '-e', '-p', pars['threads'],
            '-G', pars['gtf'],
            '-e', '-o', stem + 'transcript-exon.gtf',
            '-A', stem + 'gene.txt',
            '-C', stem + 'cov.gtf',
        ]
        out.write(' '.join(tokens) + '\n')
        out.write('\n')

    elif flavor == 'cufflinks':
        if pars['stranded']:
            libraryType = 'fr-firststrand'
        else:
            libraryType = 'fr-unstranded'
        tokens = [
            'cufflinks', '--quiet', '--no-update-check',
            '-p', pars['threads'],
            '-G', pars['gtf'],
            '--max-bundle-frags', '10000000',
            '--library-type', libraryType,
            '-o', sampleBase,
            pars['bamFile'],
        ]
        out.write(' '.join(tokens) + '\n')

        # Give the tracking tables per-sample names next to the output dir.
        for table, suffix in (('genes.fpkm_tracking', '-genes.fpkm'),
                              ('isoforms.fpkm_tracking', '-isoforms.fpkm')):
            out.write('cp ' + sampleBase + '/' + table + ' ' + sampleBase +
                      suffix + '\n')
        out.write('\n')

    elif flavor == 'rsem':
        resultStem = alignedDir + '/' + pars['sampleReplicate']
        for source, target in (
                (resultStem + '.genes.results', sampleBase + '-rsem-gene.txt'),
                (resultStem + '.isoforms.results',
                 sampleBase + '-rsem-transcript.txt')):
            out.write('cp ' + source + ' ' + target + '\n')
        out.write('\n')

    elif flavor == 'kallisto':
        out.write('cp ' + alignedDir + '/abundance.tsv' + ' ' + sampleBase +
                  '-kallisto-transcript.txt' + '\n')
        out.write('\n')

    mu.logTime(out, 'FINISH QUANTIFY')
Пример #4
0
def qc(pars):
    """Append the quality-control step for one sample to the job script.

    The QC output directory is created immediately; the QC commands are only
    written to ``pars['out']``.  Supported values of ``pars['flavor']``:

    * ``'qorts'`` -- name-sort ``pars['bamFile']`` into
      ``<pipeline>/10-qc/<sampleReplicate>``, run QoRTs on it with a 12 GB
      Java heap, then remove the name-sorted copy.
    * ``'picard'`` -- run Picard ``CollectRnaSeqMetrics`` with a 28 GB Java
      heap, writing to ``<pipeline>/06-qc/<sampleReplicate>/picard.txt``.

    Any other flavor produces only the START/FINISH ``mu.logTime`` calls.

    Keys read from ``pars``: ``out``, ``flavor``, ``pipeline``,
    ``sampleReplicate``, ``gtf``; for qorts also ``threads``, ``bamFile``,
    ``end`` and ``stranded``; for picard ``bam`` (falling back to
    ``bamFile``).
    """
    mu.logTime(pars['out'], 'START QC')

    if pars['flavor'] == 'qorts':

        qorts = '/nfs/turbo/bankheadTurbo/software/QoRTs/v1.3.6/QoRTs.jar'

        # assemble parameters
        outDir = pars['pipeline'] + '/10-qc/' + pars['sampleReplicate']
        subprocess.check_call('mkdir -p ' + outDir, shell=True)

        # QoRTs is given a name-sorted copy of the BAM (samtools sort -n)
        sortedBam = outDir + '/' + pars['sampleReplicate'] + '.bam'
        pars['out'].write('samtools sort -n -@ ' + pars['threads'] + ' ' +
                          pars['bamFile'] + ' -o ' + sortedBam + ' \n')

        # specify endedness and strandedness
        params = '--keepMultiMapped '  # for hisat compatibility
        if pars['end'] == 'single':
            params += '--singleEnded '
        if pars['stranded']:
            params += '--stranded '

        # run qorts here
        pars['out'].write(
            'java -Xmx12G -jar ' + qorts + ' QC ' + params +
            ' --skipFunctions overlapMatch,NVC,GCDistribution,readLengthDistro,QualityScoreDistribution,writeClippedNVC,CigarOpDistribution,cigarLocusCounts,chromCounts --generatePlots '
            + sortedBam + ' ' + pars['gtf'] + ' ' + outDir + '\n')

        # clean up the name-sorted bam (the command is echoed, then run)
        pars['out'].write('echo rm -f ' + sortedBam + '*' + '\n')
        pars['out'].write('rm -f ' + sortedBam + '*' + '\n')

    elif pars['flavor'] == 'picard':

        # The other steps in this file publish the alignment as
        # pars['bamFile']; honor an explicit pars['bam'] when present and
        # fall back to 'bamFile' instead of failing with a KeyError.
        bam = pars['bam'] if 'bam' in pars else pars['bamFile']

        outDir = pars['pipeline'] + '/06-qc/' + pars['sampleReplicate'] + '/'
        outFile = outDir + pars['flavor'] + '.txt'
        # NOTE(review): Picard documents REF_FLAT as a refFlat-format gene
        # annotation; pars['gtf'] is passed through unchanged -- confirm the
        # caller supplies a compatible file.
        call = ('time java -Xmx28G -jar /sw/med/centos7/picard/2.4.1/picard.jar CollectRnaSeqMetrics I=' +
                bam + ' O=' + outFile + ' REF_FLAT=' + pars['gtf'] +
                ' STRAND_SPECIFICITY=NONE')

        subprocess.check_call('mkdir -p ' + outDir, shell=True)
        pars['out'].write(call + '\n')

    mu.logTime(pars['out'], 'FINISH QC')
Пример #5
0
def isCompleted(pars):
    """Append a guard that stops the job script if the sample is already done.

    A marker file is chosen from ``pars['flavor']``; the shell snippet
    written to ``pars['out']`` calls ``exit`` when that file exists and
    otherwise lets the script continue.

    Marker files:

    * ``'cufflinks'`` -- ``05-quantified/<sampleReplicate>-genes.fpkm``
    * ``'qorts'`` -- ``10-qc/<sampleReplicate>/QC.summary.txt``
    * ``'salmon'``, ``'salmon-bias'``, ``'salmon-bias-stranded'`` --
      ``05-quantified/<sampleReplicate>-abundance.txt``

    Raises:
        ValueError: if ``pars['flavor']`` has no marker file defined.
            Previously this surfaced as an UnboundLocalError on the first
            write.
    """
    out = pars['out']
    mu.logTime(out, 'START COMPLETION CHECK')

    flavor = pars['flavor']
    if flavor == 'cufflinks':
        marker = pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'] + '-genes.fpkm'
    elif flavor == 'qorts':
        marker = pars['pipeline'] + '/10-qc/' + pars['sampleReplicate'] + '/QC.summary.txt'
    elif flavor in ('salmon', 'salmon-bias', 'salmon-bias-stranded'):
        marker = pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'] + '-abundance.txt'
    else:
        raise ValueError('DONT RECOGNIZE COMPLETION CHECK FLAVOR: %r' % (flavor,))

    out.write('echo "checking if ' + marker + ' exists..."' + '\n')
    out.write('if [ -e ' + marker + ' ]; then ' + '\n')
    out.write('  echo "already processed ' + pars['sampleReplicate'] + '...exiting..." ' + '\n')
    out.write('  exit ' + '\n')
    out.write('else ' + '\n')
    out.write('  echo "not found...processing..."' + '\n')
    out.write('fi ' + '\n')

    mu.logTime(out, 'FINISH COMPLETION CHECK')
Пример #6
0
def download(pars):
    """Append the download step for one sample and record the BAM location.

    ``<pipeline>/00-downloads/<sampleReplicate>`` is created right away.
    Behavior by ``pars['flavor']``:

    * ``'gdc'`` -- queue (via ``mu.writeCmd``) a ``curl`` download of
      ``pars['fileID']`` from the GDC data endpoint into that directory,
      then commands that store the file's size and md5sum beside it.  The
      value of ``pars['token']`` is embedded verbatim in the queued command.
    * ``'skip'`` / ``'passthru'`` -- queue nothing.

    In all three cases ``pars['bamFile']`` is set to
    ``<download dir>/<pars['fileName']>``; any other flavor leaves ``pars``
    untouched.
    """
    out = pars['out']
    mu.logTime(out, 'START DOWNLOAD')

    downloadsRoot = pars['pipeline'] + '/00-downloads'
    outDir = downloadsRoot + '/' + pars['sampleReplicate']
    for directory in (downloadsRoot, outDir):
        subprocess.check_call('mkdir -p ' + directory, shell=True)

    flavor = pars['flavor']

    if flavor in ('skip', 'passthru'):
        pars['bamFile'] = outDir + '/' + pars['fileName']

    elif flavor == 'gdc':
        # download bam file from inside the sample's download directory
        mu.writeCmd(out, 'echo ' + pars['sampleReplicate'])
        mu.writeCmd(out, 'cd ' + outDir)
        curlCmd = ('time curl --remote-name --remote-header-name  --header "X-Auth-Token: ' +
                   pars['token'] +
                   '" https://api.gdc.cancer.gov/data/' +
                   pars['fileID'])
        mu.writeCmd(out, curlCmd)
        pars['bamFile'] = outDir + '/' + pars['fileName']
        # step back up three directory levels after the download
        mu.writeCmd(out, 'cd ../../.. \n')

        # get size and md5sum
        keepFirstField = " | grep -v total | awk '{ print $1 }' > "
        for tool, report in (("du -chs ", "/bam-size.txt"),
                             ("md5sum ", "/bam-md5sum.txt")):
            mu.writeCmd(out, tool + pars['bamFile'] + keepFirstField +
                        outDir + report)

    mu.logTime(out, 'FINISH DOWNLOAD')
        else:
            out1.write(line)
    out1.write(execute + ' $SLURM_ARRAY_TASK_ID' + '\n')

# Write the per-task launcher script to the path held in `execute`.
# The launcher takes one positional argument ($1, echoed under the label
# SLURM_ARRAY_TASK_ID), picks the matching per-sample script and runs it.
with open(execute, 'w') as out1:
    out1.write('#!/bin/bash' + '\n')
    # '\t' is expanded by Python here, so a literal tab lands in the script.
    out1.write('echo -e "SLURM_ARRAY_TASK_ID\t$1"' + '\n')

    # Select the $1-th line (1-based) of the `ls -1` listing of
    # <pipeline>/scripts/*.sh as the script for this task.
    out1.write('sampleScript=$(ls -1 ' + pipeline +
               '/scripts/*.sh | head -n$(($1)) | tail -n1) ' + '\n')

    # Echo the selected script path into the job log.
    out1.write('echo -e "sampleScript\t$sampleScript"' + '\n')
    out1.write('echo' + '\n')

    mu.logTime(out1, 'ALL START')

    # Run the selected per-sample script, echoing the command first.
    out1.write('cmd=$sampleScript ' + '\n')
    out1.write('echo $cmd; eval $cmd' + '\n')

    mu.logTime(out1, 'ALL FINISHED!')
    out1.write('echo' + '\n')

# Set the launcher's mode to 755 so it can be executed directly.
cmd = 'chmod 755 ' + execute
subprocess.check_call(cmd, shell=True)
Пример #8
0
def align(pars):
    """Append the alignment step for one sample to the job script.

    Creates ``<pipeline>/04-aligned``, the sample's sub-directory and
    ``04-aligned/totalFragments`` right away, then stores
    ``pars['outDir']`` (sample directory, trailing slash) and
    ``pars['bamFile']`` (``04-aligned/<sampleReplicate>.bam``).

    Commands written for ``pars['flavor']``:

    * ``'salmon'`` / ``'salmon-bias'`` -- ``salmon quant`` (the bias flavor
      adds ``--posBias --seqBias --gcBias``).
    * ``'star'`` -- STAR, then samtools sort/index into ``pars['bamFile']``.
    * any flavor containing ``'hisat2'`` -- hisat2 to a temporary SAM, then
      samtools sort/index; ``--dta-cufflinks`` is used when the flavor also
      contains ``'cufflinks'``.
    * ``'rsem'`` -- ``rsem-calculate-expression`` with bowtie2.
    * ``'kallisto'`` -- ``kallisto quant`` (no BAM is produced).
    * ``'skip'`` -- nothing; returns right after the FINISH log call.

    Every flavor except ``'star'`` and ``'skip'`` reads exactly the first
    two entries of ``pars['fagzFiles']`` and stores them as
    ``pars['fqgz1']`` / ``pars['fqgz2']``.  For ``'star'``, ``'tophat'`` and
    hisat2 flavors a count of distinct mapped read names is written to
    ``04-aligned/totalFragments/<sampleReplicate>.txt``.
    """
    out = pars['out']
    mu.logTime(out, 'START ALIGN')

    alignedRoot = pars['pipeline'] + '/04-aligned'
    sample = pars['sampleReplicate']
    for directory in (alignedRoot,
                      alignedRoot + '/' + sample,
                      alignedRoot + '/totalFragments'):
        subprocess.check_call('mkdir -p ' + directory, shell=True)
    out.write('echo align \n')

    pars['outDir'] = alignedRoot + '/' + sample + '/'
    pars['bamFile'] = alignedRoot + '/' + sample + '.bam'
    outDir = pars['outDir']
    bamFile = pars['bamFile']

    flavor = pars['flavor']
    usesHisat2 = 'hisat2' in flavor

    # These aligners are handed the two mate files individually.
    if flavor in ('salmon-bias', 'salmon', 'rsem', 'kallisto') or usesHisat2:
        pars['fqgz1'] = pars['fagzFiles'][0]
        pars['fqgz2'] = pars['fagzFiles'][1]

    # *** salmon / salmon-bias ***
    if flavor in ('salmon-bias', 'salmon'):
        biasFlags = ''
        if flavor == 'salmon-bias':
            biasFlags = ' --posBias --seqBias --gcBias'
        mu.writeCmd(out, 'salmon quant --libType A -p ' + pars['threads'] +
                    ' -i ' + pars['alignerIndexDir'] + ' -o ' + outDir +
                    biasFlags + ' -1 ' + pars['fqgz1'] + ' -2 ' +
                    pars['fqgz2'])
        mu.writeCmd(out, ' ')

    # *** star ***
    elif flavor == 'star':
        # Spacing inside the option string is reproduced exactly.
        starOptions = (' --outSAMtype BAM Unsorted'
                       '  --outFilterType BySJout'
                       '  --outFilterMultimapNmax 20'
                       '  --outFilterMismatchNmax 999'
                       '  --outFilterMismatchNoverLmax 0.04'
                       '  --alignIntronMin 20'
                       '  --alignIntronMax 1000000'
                       ' --alignMatesGapMax 1000000'
                       '  --alignSJoverhangMin 8'
                       '  --alignSJDBoverhangMin 1'
                       ' --outSAMstrandField intronMotif'
                       ' --readFilesCommand zcat'
                       ' --outSAMunmapped Within ')
        out.write('STAR  --runThreadN ' + pars['threads'] + ' --genomeDir ' +
                  pars['alignerIndexDir'] + ' --readFilesIn ' +
                  ' '.join(pars['fagzFiles']) + ' --outFileNamePrefix ' +
                  outDir + starOptions + '\n')

        unsortedBam = outDir + 'Aligned.out.bam'
        out.write('samtools sort -@ ' + pars['threads'] + ' ' + unsortedBam +
                  ' -o ' + bamFile + ' \n')
        out.write('samtools index ' + bamFile + ' \n')
        out.write('rm -f ' + unsortedBam + ' \n')
        out.write('\n')

    # *** hisat2 ***
    elif usesHisat2:
        tmpSam = outDir + 'tmp.sam'
        summaryLog = outDir + 'summary.log'
        if 'cufflinks' in flavor:
            dtaFlag = '--dta-cufflinks'
        else:
            dtaFlag = '--dta'
        # Only a value equal to False turns the strandness option off.
        if pars['stranded'] == False:
            strandFlag = ''
        else:
            strandFlag = '--rna-strandness RF'

        # run hisat2
        out.write('hisat2 -p ' + pars['threads'] + ' ' + dtaFlag + ' ' +
                  strandFlag + ' --summary-file ' + summaryLog + ' -x ' +
                  pars['alignerIndexDir'] + ' -1 ' + pars['fqgz1'] + ' -2 ' +
                  pars['fqgz2'] + ' -S ' + tmpSam + '\n')

        # sort and index the alignments, then drop the temporary SAM
        out.write('samtools sort -O BAM -@ ' + pars['threads'] + ' ' +
                  tmpSam + ' -o ' + bamFile + ' \n')
        out.write('samtools index ' + bamFile + ' \n')
        out.write('rm -f ' + tmpSam + '  \n')
        out.write('\n')

    # *** rsem ***
    elif flavor == 'rsem':
        out.write('rsem-calculate-expression -p ' + pars['threads'] +
                  ' --paired-end --bowtie2 --estimate-rspd --append-names ' +
                  pars['fqgz1'] + ' ' + pars['fqgz2'] + ' ' +
                  pars['alignerIndexDir'] + ' ' + outDir + sample + ' \n')
        out.write('\n')

    # *** kallisto ***
    elif flavor == 'kallisto':
        # to reduce processing time and save space bams are not generated
        out.write('kallisto quant -t ' + pars['threads'] + ' -i ' +
                  pars['alignerIndexDir'] + ' -o ' + outDir + ' ' +
                  pars['fqgz1'] + ' ' + pars['fqgz2'] + '\n')
        out.write('\n')

    elif flavor == 'skip':
        mu.logTime(out, 'FINISH ALIGN')
        return

    # NOTE: 'tophat' and 'kallisto-pseudo' branches were drafted but never
    # tested, so neither flavor writes an alignment command here; 'tophat'
    # is still accepted by the fragment count below.

    # count total aligned fragments
    if flavor == 'star' or flavor == 'tophat' or usesHisat2:
        countStem = alignedRoot + '/totalFragments/' + sample
        countFile = countStem + '.txt'
        readNames = countStem + '.tmp1.txt'
        uniqueNames = countStem + '.tmp2.txt'
        out.write('samtools view -F 4 ' + bamFile + ' | cut -f1 > ' +
                  readNames + '\n')
        out.write('sort ' + readNames + ' | uniq > ' + uniqueNames + '\n')
        out.write('wc -l ' + uniqueNames + ' | sed "s/ .*//" > ' + countFile +
                  '\n')
        out.write('rm -f ' + readNames + ' ' + uniqueNames + '\n')

    mu.logTime(out, 'FINISH ALIGN')
Пример #9
0
def preprocess(pars):
    """Append the read-preprocessing step for one sample to the job script.

    Creates ``<pipeline>/02-reads`` with its ``fastq`` and ``totalFragments``
    sub-directories (plus ``fastqc`` when ``pars['fastqc']`` is truthy) right
    away, then writes shell commands to ``pars['out']`` for every
    ``uniqueName`` whose record belongs to ``pars['sampleReplicate']``.

    Behavior by ``pars['flavor']``:

    * ``'passthru'`` -- symlink the input files as ``-R1/-R2.fastq.gz``, set
      ``pars['fagzFiles']`` to those links and count the reads.
    * ``'none'`` -- decompress the inputs to ``-R1/-R2.fastq``, optionally
      run FastQC, and count raw and final reads.
    * ``'rrna-removal'`` -- decompress to ``-raw-R1/-raw-R2.fastq``, filter
      against ``pars['rrnaReference']`` with ``bbduk.sh``, optionally run
      FastQC, and count raw and filtered reads.
    * ``'skip'`` -- write no per-file commands.

    A second file per ``uniqueName`` is only used when ``pars['end']`` is
    ``'paired'``.  In every case ``pars['fastqFiles']`` is reset and filled
    with ``02-reads/fastq/<uniqueName>-R<read>.fastq`` for each record of
    the sample.

    Keys read from ``pars``: ``out``, ``records``, ``sampleReplicate``,
    ``pipeline``, ``fastqc``, ``flavor``; unless skipping also ``end``; for
    rrna-removal also ``rrnaReference``.

    Raises:
        ValueError: if ``pars['flavor']`` is not one of the flavors above.
        AssertionError: if more than two files share a ``uniqueName``.
    """
    out = pars['out']
    mu.logTime(out, 'START PREPROCESS')

    records = pars['records']
    sampleReplicate = pars['sampleReplicate']
    pipeline = pars['pipeline']
    fastqDir = pipeline + '/02-reads/fastq/'
    fastqcDir = pipeline + '/02-reads/fastqc/'
    countDir = pipeline + '/02-reads/totalFragments/'
    pars['fastqFiles'] = []

    subprocess.check_call('mkdir -p ' + pipeline + '/02-reads', shell=True)
    subprocess.check_call('mkdir -p ' + pipeline + '/02-reads/fastq',
                          shell=True)
    runFastqc = pars['fastqc']
    if runFastqc:
        subprocess.check_call('mkdir -p ' + pipeline + '/02-reads/fastqc',
                              shell=True)
    subprocess.check_call('mkdir -p ' + pipeline + '/02-reads/totalFragments',
                          shell=True)

    out.write('echo preprocess \n')

    flavor = pars['flavor']
    if flavor not in ('passthru', 'none', 'rrna-removal', 'skip'):
        # Raising a plain string is itself a TypeError on Python >= 2.6.
        raise ValueError('DONT UNDERSTAND PREPROCESS FLAVOR!! %r' % (flavor,))

    # NOTE: a 'trimmomatic' flavor was drafted for the older positional
    # API (paired-end only) but was never ported to `pars`; it is not
    # supported here.

    def fastqc(fastqFile):
        # Queue a FastQC run for one file when FastQC is enabled.
        if runFastqc:
            out.write('fastqc ' + fastqFile + ' --outdir=' + fastqcDir + '\n')

    if flavor != 'skip':
        paired = pars['end'] == 'paired'
        uniqueNames = sorted(set(
            records[key]['uniqueName'] for key in records
            if records[key]['sampleReplicate'] == sampleReplicate))

        # each uniqueName owns one input file (single end) or two (paired)
        for uniqueName in uniqueNames:
            myFiles = sorted([
                records[key]['file'] for key in records
                if records[key]['uniqueName'] == uniqueName
            ])
            assert len(
                myFiles
            ) <= 2, 'SHOULD ONLY BE NO MORE THAN 2 MYFILES ASSOCIATED...'

            in1 = myFiles[0]
            in2 = myFiles[1] if paired else None
            stem = fastqDir + uniqueName
            rawReadCountFile = countDir + 'raw-' + uniqueName + '.txt'
            readCountFile = countDir + uniqueName + '.txt'

            if flavor == 'passthru':
                # no trimming, no fastqc: just link the compressed inputs
                fastqFile1 = stem + '-R1.fastq.gz'
                out.write('ln -s ' + in1 + ' ' + fastqFile1 + '\n')
                pars['fagzFiles'] = [fastqFile1]
                if paired:
                    fastqFile2 = stem + '-R2.fastq.gz'
                    out.write('ln -s ' + in2 + ' ' + fastqFile2 + '\n')
                    pars['fagzFiles'].append(fastqFile2)
                out.write('\n')

                # four lines per read in the first file
                out.write("zcat " + in1 + " | wc -l | awk '{print $1/4}' > " +
                          readCountFile + '\n')

            elif flavor == 'none':
                # no trimming: just decompress to plain fastq
                fastqFile1 = stem + '-R1.fastq'
                out.write('zcat ' + in1 + ' > ' + fastqFile1 + '\n')
                fastqc(fastqFile1)
                if paired:
                    fastqFile2 = stem + '-R2.fastq'
                    out.write('zcat ' + in2 + ' > ' + fastqFile2 + '\n')
                    fastqc(fastqFile2)
                out.write('\n')

                out.write("zcat " + in1 + " | wc -l | awk '{print $1/4}' > " +
                          rawReadCountFile + '\n')
                out.write("wc -l " + fastqFile1 + "| awk '{print $1/4}' > " +
                          readCountFile + '\n')

            else:  # 'rrna-removal'
                fastqFile1_raw = stem + '-raw-R1.fastq'
                out.write('zcat ' + in1 + ' > ' + fastqFile1_raw + '\n')
                if paired:
                    fastqFile2_raw = stem + '-raw-R2.fastq'
                    out.write('zcat ' + in2 + ' > ' + fastqFile2_raw + '\n')
                out.write('\n')

                # bbduk filters the raw reads against the rRNA reference;
                # the second mate is only passed for paired-end data
                # (single-end used to fail on an undefined variable here)
                fastqFile1 = stem + '-R1.fastq'
                bbduk = 'bbduk.sh in1=' + fastqFile1_raw
                if paired:
                    fastqFile2 = stem + '-R2.fastq'
                    bbduk += ' in2=' + fastqFile2_raw
                bbduk += ' out1=' + fastqFile1
                if paired:
                    bbduk += ' out2=' + fastqFile2
                out.write(bbduk + ' ref=' + pars['rrnaReference'] + '\n')

                fastqc(fastqFile1)
                if paired:
                    fastqc(fastqFile2)

                out.write("wc -l " + fastqFile1_raw +
                          "| awk '{print $1/4}' > " + rawReadCountFile + '\n')
                out.write("wc -l " + fastqFile1 + "| awk '{print $1/4}' > " +
                          readCountFile + '\n')

    # Record the expected uncompressed fastq path of every read file of this
    # sample.  NOTE(review): 'passthru' creates '.fastq.gz' links while these
    # entries end in '.fastq' -- confirm downstream steps expect that.
    myKeys = sorted([
        key for key in records
        if records[key]['sampleReplicate'] == sampleReplicate
    ])
    for myKey in myKeys:
        record = records[myKey]
        label = record['uniqueName'] + '-R' + record['read']
        pars['fastqFiles'].append(fastqDir + label + '.fastq')

    mu.logTime(out, 'FINISH PREPROCESS')
Пример #10
0
def combine(pars):
    """Queue the shell commands that merge a sample's FASTQ files per read end.

    Keys read from ``pars``: 'out' (an object with a ``write`` method),
    'pipeline', 'sampleReplicate', 'end', 'flavor' and, unless the flavor
    is 'skip', 'fastqFiles'.

    ``pars['fagzFiles']`` is always set to the expected gzipped outputs
    under ``<pipeline>/03-combined/fastq/``: the R1 file only, or R1 and R2
    when ``pars['end'] == 'paired'``.

    When ``pars['flavor'] == 'skip'`` the function returns right after that
    bookkeeping (the START marker has been logged, the FINISH marker is
    not).  Otherwise it creates the 03-combined directories immediately and
    writes, to ``pars['out']``, commands that concatenate the matching
    files, count fragments for R1 (line count / 4) and gzip the result.

    Returns None.
    """
    mu.logTime(pars['out'], 'START COMBINE')
    script = pars['out']

    # Register the compressed outputs first so that callers can rely on
    # pars['fagzFiles'] even when the step itself is skipped.
    mergedR1 = (pars['pipeline'] + '/03-combined/fastq/' +
                pars['sampleReplicate'] + '-R1.fastq')
    gzTargets = [mergedR1 + '.gz']
    pars['fagzFiles'] = gzTargets
    pairedEnd = pars['end'] == 'paired'
    if pairedEnd:
        mergedR2 = (pars['pipeline'] + '/03-combined/fastq/' +
                    pars['sampleReplicate'] + '-R2.fastq')
        gzTargets.append(mergedR2 + '.gz')

    # 'skip' means the combine step was already run earlier.
    if pars['flavor'] == 'skip':
        return

    # Directories are created now, while the commands below are only written
    # to the script for later execution.
    for subDir in ('/03-combined', '/03-combined/fastq',
                   '/03-combined/totalFragments'):
        subprocess.check_call('mkdir -p ' + pars['pipeline'] + subDir,
                              shell=True)
    script.write('echo combine \n')

    def _readsFor(endTag):
        # Sorted fastq paths containing both the sample/replicate label and
        # the requested read-end marker (plain substring tests).
        return sorted(name for name in pars['fastqFiles']
                      if pars['sampleReplicate'] in name and endTag in name)

    # merge every R1 file into a single fastq
    script.write('cat %s > %s\n' % (' '.join(_readsFor('R1.fastq')),
                                    mergedR1))

    # total fragments = number of lines in the merged R1 file / 4
    fragmentCountFile = (pars['pipeline'] + '/03-combined/totalFragments/' +
                         pars['sampleReplicate'] + '.txt')
    script.write("wc -l %s| awk '{print $1/4}' > %s\n" %
                 (mergedR1, fragmentCountFile))

    # compress the merged R1 file
    script.write('gzip ' + mergedR1 + '\n')
    script.write('\n')

    if pairedEnd:
        # same merge + compress for the R2 files (no separate count)
        script.write('cat %s > %s\n' % (' '.join(_readsFor('R2.fastq')),
                                        mergedR2))
        script.write('gzip ' + mergedR2 + '\n')
        script.write('\n')

    mu.logTime(pars['out'], 'FINISH COMBINE')
    return
Пример #11
0
def bam2fastq(pars):
    """Prepare paired gzipped FASTQ files for a sample from a BAM file.

    Keys read from ``pars``: 'out', 'pipeline', 'sampleReplicate' and
    'flavor'; additionally 'threads' and 'bamFile' for the 'bam2fastq'
    flavor, and 'pwd' and 'fastqPipeline' for the 'link' flavor.

    Regardless of flavor, the directory
    ``<pipeline>/01-fastqs/<sampleReplicate>`` is created immediately and
    these keys are set on ``pars``: 'sortedBam', 'fastq1', 'fastq2',
    'fqgz1', 'fqgz2' and 'fagzFiles' (the list ``[fqgz1, fqgz2]``).

    Flavors:
      * 'bam2fastq' -- emit (via ``mu.writeCmd``) commands that name-sort
        the BAM, split it into R1/R2 FASTQ with ``samtools fastq``, gzip
        both, store read counts (line count / 4) in readCount1.txt and
        readCount2.txt, then delete the input BAM, the sorted BAM and the
        uncompressed FASTQs.
      * 'link' -- emit ``ln -s`` commands pointing the gz paths at the same
        files under ``pars['fastqPipeline']``.  Read counts are not
        produced for this flavor.
      * 'skip' / 'passthru' -- emit nothing; only ``pars`` is updated.

    Any other flavor value also emits nothing (unchanged from the previous
    behaviour).

    Returns None.
    Raises AssertionError when flavor is 'link' and 'fastqPipeline' is
    missing from ``pars``.
    """
    mu.logTime(pars['out'], 'START BAM2FASTQ')

    # The output directory is created right away; everything passed to
    # mu.writeCmd below is only emitted as a command for later execution.
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/01-fastqs/',
                          shell=True)
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/01-fastqs/' +
                          pars['sampleReplicate'],
                          shell=True)

    outDir = pars['pipeline'] + '/01-fastqs/' + pars['sampleReplicate']
    pars['sortedBam'] = outDir + '/sorted.bam'
    pars['fastq1'], pars['fastq2'] = outDir + '/R1.fastq', outDir + '/R2.fastq'
    pars['fqgz1'], pars['fqgz2'] = (outDir + '/R1.fastq.gz',
                                    outDir + '/R2.fastq.gz')
    pars['fagzFiles'] = [pars['fqgz1'], pars['fqgz2']]

    flavor = pars['flavor']
    if flavor == 'bam2fastq':
        # sort bam by read name (-n)
        # NOTE(review): pars['threads'] must be a string here since it is
        # concatenated into the command line -- confirm against callers.
        mu.writeCmd(
            pars['out'], 'samtools sort -@ ' + pars['threads'] + ' -n ' +
            pars['bamFile'] + ' -o ' + pars['sortedBam'])

        # bam2fastq: split the sorted bam into R1 / R2 fastq files
        mu.writeCmd(
            pars['out'],
            'samtools fastq -@ ' + pars['threads'] + ' ' + pars['sortedBam'] +
            ' -1 ' + pars['fastq1'] + ' -2 ' + pars['fastq2'])

        # compress (gzip -c leaves the plain fastq in place for counting)
        mu.writeCmd(pars['out'],
                    'gzip -c ' + pars['fastq1'] + ' > ' + pars['fqgz1'])
        mu.writeCmd(pars['out'],
                    'gzip -c ' + pars['fastq2'] + ' > ' + pars['fqgz2'])

        # count reads: a fastq record spans 4 lines
        mu.writeCmd(
            pars['out'], "wc -l " + pars['fastq1'] +
            " | awk '{print $1/4}' > " + outDir + "/readCount1.txt")
        mu.writeCmd(
            pars['out'], "wc -l " + pars['fastq2'] +
            " | awk '{print $1/4}' > " + outDir + "/readCount2.txt")

        # clean up -- note that this removes the input bam as well
        for stale in (pars['bamFile'], pars['sortedBam'], pars['fastq1'],
                      pars['fastq2']):
            mu.writeCmd(pars['out'], 'rm -f ' + stale)

    elif flavor == 'link':
        assert 'fastqPipeline' in pars, \
            "flavor 'link' requires pars['fastqPipeline']"

        # Same relative location, but inside the other pipeline directory.
        # NOTE(review): pars['pwd'] is prepended without a separator, so it
        # presumably ends with '/' -- confirm against callers.
        fqgz1_source = pars['pwd'] + pars['fqgz1'].replace(
            pars['pipeline'], pars['fastqPipeline'])
        fqgz2_source = pars['pwd'] + pars['fqgz2'].replace(
            pars['pipeline'], pars['fastqPipeline'])

        # link with old files
        mu.writeCmd(pars['out'], 'ln -s ' + fqgz1_source + ' ' + pars['fqgz1'])
        mu.writeCmd(pars['out'], 'ln -s ' + fqgz2_source + ' ' + pars['fqgz2'])

        # read counts not attained here - get this from fastqPipeline...

    elif flavor in ('skip', 'passthru'):
        # nothing to do here: pars has already been updated above
        pass

    mu.logTime(pars['out'], 'FINISH BAM2FASTQ')