def discover(flavor, out, records, sampleLabel, pipeline, numThreads, gtf):
    """Write transcript-discovery commands for one sample to the script file `out`.

    flavor      -- discovery tool; only 'cufflinks' is supported
    out         -- open file handle the shell commands are written to
    records     -- accepted for interface compatibility; unused here
    sampleLabel -- 'sample#replicate' label; also names the output directory
    pipeline    -- root directory of the pipeline output tree
    numThreads  -- thread count as a string (interpolated into the command)
    gtf         -- accepted for interface compatibility; unused here

    Raises ValueError for an unrecognized flavor.  (The original raised a
    bare string, which is a TypeError at runtime and never showed the message.)
    """
    mu.logTime(out, 'START DISCOVER')
    # validates the 'sample#replicate' label format; the pieces are unused here
    sample, replicate = sampleLabel.split('#')
    subprocess.check_call('mkdir -p ' + pipeline + '/06-discover', shell=True)
    subprocess.check_call('mkdir -p ' + pipeline + '/06-discover/' + sampleLabel, shell=True)
    out.write('echo discover \n')
    if flavor == 'cufflinks':
        bam = pipeline + '/04-aligned/' + sampleLabel + '.bam'
        out.write(
            'cufflinks -p ' + numThreads +
            ' --max-bundle-frags 10000000 --library-type fr-unstranded -o ' +
            pipeline + '/06-discover/' + sampleLabel + ' ' + bam + '\n')
        out.write('\n')
    else:
        # surface the offending value in the exception instead of printing it
        raise ValueError('DONT RECOGNIZE DISCOVER FLAVOR: ' + str(flavor))
    mu.logTime(out, 'FINISH DISCOVER')
def clean(pars):
    """Emit shell commands that delete this replicate's intermediate files.

    The files removed depend on pars['flavor']; several flavors share the
    same fastq cleanup, and 'rsem' additionally drops its transcript bam.
    """
    out = pars['out']
    mu.logTime(out, 'START CLEAN')
    flavor = pars['flavor']
    pipeline = pars['pipeline']
    rep = pars['sampleReplicate']
    if flavor == '02-reads':
        # note the extra '-' before the replicate in this glob
        mu.writeCmd(out, 'rm -f ' + pipeline + '/02-reads/fastq/*-' + rep + '*.fastq')
    if flavor == 'gdc':
        # downloaded bam plus its index
        mu.writeCmd(out, 'rm -f ' + pipeline + '/00-downloads/' + rep + '/' + pars['fileName'])
        mu.writeCmd(out, 'rm -f ' + pars['bamFile'])
        mu.writeCmd(out, 'rm -f ' + pars['bamFile'] + '.bai')
    if flavor in ('standard', 'sequencingCore', 'custom1', 'rsem'):
        # these flavors all drop the uncompressed and combined fastq files
        mu.writeCmd(out, 'rm -f ' + pipeline + '/02-reads/fastq/*' + rep + '*.fastq')
        mu.writeCmd(out, 'rm -f ' + pipeline + '/03-combined/fastq/' + rep + '*.fastq.gz')
    if flavor == 'rsem':
        # rsem also leaves a large transcript-space bam behind
        mu.writeCmd(out, 'rm -f ' + pipeline + '/04-aligned/' + rep + '/' + rep + '.transcript.bam')
    mu.logTime(out, 'FINISH CLEAN')
def quantify(pars):
    """Write expression-quantification commands for one sample replicate.

    Flavors:
      salmon / salmon-bias / salmon-bias-stranded -- quantified during align;
          copy quant.sf into 05-quantified
      stringtie -- quantify against the reference annotation (-e -G)
      cufflinks -- run cufflinks, then copy/rename the fpkm tracking files
      rsem      -- quantified during align; copy gene/isoform tables
      kallisto  -- quantified during align; copy abundance.tsv
    An unrecognized flavor writes only the setup and log lines (as before).
    """
    mu.logTime(pars['out'], 'START QUANTIFY')
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/05-quantified', shell=True)
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'], shell=True)
    pars['out'].write('echo quantify \n')
    # *** salmon ***
    if pars['flavor'] in ('salmon', 'salmon-bias', 'salmon-bias-stranded'):
        mu.writeCmd(pars['out'], 'cp ' + pars['pipeline'] + '/04-aligned/' +
                    pars['sampleReplicate'] + '/quant.sf ' + pars['pipeline'] +
                    '/05-quantified/' + pars['sampleReplicate'] + '-abundance.txt')
        mu.writeCmd(pars['out'], ' ')
    # *** stringtie ***
    elif pars['flavor'] == 'stringtie':
        prefix = pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'] + '-stringtie-'
        outGTF = prefix + 'transcript-exon.gtf'
        outGene = prefix + 'gene.txt'
        outCov = prefix + 'cov.gtf'
        # -e restricts estimation to the -G reference transcripts.
        # (The original command passed -e twice; once is sufficient.)
        pars['out'].write('stringtie ' + pars['bamFile'] + ' -e -p ' + pars['threads'] +
                          ' -G ' + pars['gtf'] + ' -o ' + outGTF + ' -A ' + outGene +
                          ' -C ' + outCov + '\n')
        pars['out'].write('\n')
    # *** cufflinks ***
    elif pars['flavor'] == 'cufflinks':
        # the two original branches differed only in --library-type
        libraryType = 'fr-firststrand' if pars['stranded'] else 'fr-unstranded'
        pars['out'].write('cufflinks --quiet --no-update-check -p ' + pars['threads'] +
                          ' -G ' + pars['gtf'] +
                          ' --max-bundle-frags 10000000 --library-type ' + libraryType +
                          ' -o ' + pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'] +
                          ' ' + pars['bamFile'] + '\n')
        # copy and rename quantification files
        pars['out'].write('cp ' + pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'] +
                          '/genes.fpkm_tracking ' + pars['pipeline'] + '/05-quantified/' +
                          pars['sampleReplicate'] + '-genes.fpkm' + '\n')
        pars['out'].write('cp ' + pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'] +
                          '/isoforms.fpkm_tracking ' + pars['pipeline'] + '/05-quantified/' +
                          pars['sampleReplicate'] + '-isoforms.fpkm' + '\n')
        pars['out'].write('\n')
    # *** rsem ***
    elif pars['flavor'] == 'rsem':
        inFile1 = pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate'] + '/' + pars['sampleReplicate'] + '.genes.results'
        outFile1 = pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'] + '-rsem-gene.txt'
        inFile2 = pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate'] + '/' + pars['sampleReplicate'] + '.isoforms.results'
        outFile2 = pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'] + '-rsem-transcript.txt'
        pars['out'].write('cp ' + inFile1 + ' ' + outFile1 + '\n')
        pars['out'].write('cp ' + inFile2 + ' ' + outFile2 + '\n')
        pars['out'].write('\n')
    # *** kallisto ***
    elif pars['flavor'] == 'kallisto':
        inFile = pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate'] + '/abundance.tsv'
        outFile = pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'] + '-kallisto-transcript.txt'
        pars['out'].write('cp ' + inFile + ' ' + outFile + '\n')
        pars['out'].write('\n')
    mu.logTime(pars['out'], 'FINISH QUANTIFY')
def qc(pars): mu.logTime(pars['out'], 'START QC') # use when this function has already been called # assumes we have 44GB of memory # assumes we have 64GB of memory # this can be lowered but by default we need to be robust to TCGA if pars['flavor'] == 'qorts': qorts = '/nfs/turbo/bankheadTurbo/software/QoRTs/v1.3.6/QoRTs.jar' # assemble parameters outDir = pars['pipeline'] + '/10-qc/' + pars['sampleReplicate'] subprocess.check_call('mkdir -p ' + outDir, shell=True) sortedBam = outDir + '/' + pars['sampleReplicate'] + '.bam' pars['out'].write('samtools sort -n -@ ' + pars['threads'] + ' ' + pars['bamFile'] + ' -o ' + sortedBam + ' \n') # specifty endedness and strandedness params = '--keepMultiMapped ' # for hisat compatibility if pars['end'] == 'single': params += '--singleEnded ' if pars['stranded']: params += '--stranded ' # run qorts here pars['out'].write( 'java -Xmx12G -jar ' + qorts + ' QC ' + params + ' --skipFunctions overlapMatch,NVC,GCDistribution,readLengthDistro,QualityScoreDistribution,writeClippedNVC,CigarOpDistribution,cigarLocusCounts,chromCounts --generatePlots ' + sortedBam + ' ' + pars['gtf'] + ' ' + outDir + '\n') # clean up name sorted bam pars['out'].write('echo rm -f ' + sortedBam + '*' + '\n') pars['out'].write('rm -f ' + sortedBam + '*' + '\n') # use when this function has already been called elif pars['flavor'] == 'picard': outDir = pars['pipeline'] + '/06-qc/' + pars['sampleReplicate'] + '/' outFile = outDir + pars['flavor'] + '.txt' call = 'time java -Xmx28G -jar /sw/med/centos7/picard/2.4.1/picard.jar CollectRnaSeqMetrics I=' + pars[ 'bam'] + ' O=' + outFile + ' REF_FLAT=' + pars[ 'gtf'] + ' STRAND_SPECIFICITY=NONE' # call yo program subprocess.check_call('mkdir -p ' + outDir, shell=True) pars['out'].write(call + '\n') mu.logTime(pars['out'], 'FINISH QC')
def isCompleted(pars):
    """Write a shell guard that exits the generated script early when the
    flavor's terminal output file already exists.

    Raises ValueError for an unrecognized flavor.  (The original fell
    through with an unassigned variable and died with a NameError; it also
    shadowed the `file` builtin.)
    """
    mu.logTime(pars['out'], 'START COMPLETION CHECK')
    # per-flavor sentinel file whose presence marks the sample as done
    if pars['flavor'] == 'cufflinks':
        doneFile = pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'] + '-genes.fpkm'
    elif pars['flavor'] == 'qorts':
        doneFile = pars['pipeline'] + '/10-qc/' + pars['sampleReplicate'] + '/QC.summary.txt'
    elif pars['flavor'] in ('salmon', 'salmon-bias', 'salmon-bias-stranded'):
        doneFile = pars['pipeline'] + '/05-quantified/' + pars['sampleReplicate'] + '-abundance.txt'
    else:
        raise ValueError('DONT RECOGNIZE COMPLETION CHECK FLAVOR: ' + str(pars['flavor']))
    pars['out'].write('echo "checking if ' + doneFile + ' exists..."' + '\n')
    pars['out'].write('if [ -e ' + doneFile + ' ]; then ' + '\n')
    pars['out'].write('  echo "already processed ' + pars['sampleReplicate'] + '...exiting..." ' + '\n')
    pars['out'].write('  exit ' + '\n')
    pars['out'].write('else ' + '\n')
    pars['out'].write('  echo "not found...processing..."' + '\n')
    pars['out'].write('fi ' + '\n')
    mu.logTime(pars['out'], 'FINISH COMPLETION CHECK')
def download(pars):
    """Write commands that fetch this replicate's input bam.

    For the 'gdc' flavor the script downloads the file from the GDC API with
    a token, then records its size and md5sum.  For 'skip'/'passthru' only
    the expected bam path is recorded in pars['bamFile'].
    """
    out = pars['out']
    mu.logTime(out, 'START DOWNLOAD')
    destRoot = pars['pipeline'] + '/00-downloads'
    subprocess.check_call('mkdir -p ' + destRoot, shell=True)
    subprocess.check_call('mkdir -p ' + destRoot + '/' + pars['sampleReplicate'], shell=True)
    outDir = destRoot + '/' + pars['sampleReplicate']
    if pars['flavor'] == 'gdc':
        # download the bam from the GDC API into the per-sample directory
        mu.writeCmd(out, 'echo ' + pars['sampleReplicate'])
        mu.writeCmd(out, 'cd ' + outDir)
        curlCmd = ('time curl --remote-name --remote-header-name --header "X-Auth-Token: '
                   + pars['token'] + '" https://api.gdc.cancer.gov/data/' + pars['fileID'])
        mu.writeCmd(out, curlCmd)
        # record where the bam will land for the downstream steps
        pars['bamFile'] = outDir + '/' + pars['fileName']
        mu.writeCmd(out, 'cd ../../.. \n')
        # capture size and checksum next to the download
        sizeCmd = ("du -chs " + pars['bamFile'] +
                   " | grep -v total | awk '{ print $1 }' > " + outDir + "/bam-size.txt")
        md5Cmd = ("md5sum " + pars['bamFile'] +
                  " | grep -v total | awk '{ print $1 }' > " + outDir + "/bam-md5sum.txt")
        mu.writeCmd(out, sizeCmd)
        mu.writeCmd(out, md5Cmd)
    if pars['flavor'] in ('skip', 'passthru'):
        # nothing to fetch; just publish the expected location
        pars['bamFile'] = outDir + '/' + pars['fileName']
    mu.logTime(out, 'FINISH DOWNLOAD')
else: out1.write(line) out1.write(execute + ' $SLURM_ARRAY_TASK_ID' + '\n') # write execute script with open(execute, 'w') as out1: out1.write('#!/bin/bash' + '\n') out1.write('echo -e "SLURM_ARRAY_TASK_ID\t$1"' + '\n') # assign variables based on array id # out1.write('sampleReplicate=$(head -n$((1+$1)) ' + inFile1 + '| tail -n1 | cut -f4)' + '\n') out1.write('sampleScript=$(ls -1 ' + pipeline + '/scripts/*.sh | head -n$(($1)) | tail -n1) ' + '\n') # print sampleReplicate out1.write('echo -e "sampleScript\t$sampleScript"' + '\n') out1.write('echo' + '\n') mu.logTime(out1, 'ALL START') # set up custom script out1.write('cmd=$sampleScript ' + '\n') out1.write('echo $cmd; eval $cmd' + '\n') mu.logTime(out1, 'ALL FINISHED!') out1.write('echo' + '\n') # update script permissions cmd = 'chmod 755 ' + execute subprocess.check_call(cmd, shell=True)
def align(pars):
    """Write alignment (or pseudo-alignment) commands for one sample replicate.

    Flavors:
      salmon-bias / salmon -- selective alignment + quantification in one step
      star                 -- STAR, then sort/index to pars['bamFile']
      *hisat2*             -- hisat2 to a temp sam, then sort/index/clean up
      rsem                 -- rsem-calculate-expression via bowtie2
      kallisto             -- pseudo-alignment only (no bam produced)
      skip                 -- nothing to do
    Side effects: sets pars['outDir'], pars['bamFile'] and (for the fastq
    flavors) pars['fqgz1']/pars['fqgz2'].  For star/tophat/hisat2 it also
    appends commands counting uniquely-identified aligned fragments.
    NOTE(review): every fastq-consuming branch indexes pars['fagzFiles'][1],
    so this function appears to assume paired-end input -- confirm.
    """
    mu.logTime(pars['out'],'START ALIGN')
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/04-aligned',shell=True)
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate'],shell=True)
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/04-aligned/totalFragments',shell=True)
    pars['out'].write('echo align \n')
    # publish the per-sample output locations for downstream steps
    pars['outDir'] = pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate'] + '/'
    pars['bamFile'] = pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate'] + '.bam'
    # *** salmon-bias ***
    if pars['flavor'] == 'salmon-bias':
        pars['fqgz1'] = pars['fagzFiles'][0]
        pars['fqgz2'] = pars['fagzFiles'][1]
        # salmon with positional/sequence/GC bias correction enabled
        mu.writeCmd(pars['out'], 'salmon quant --libType A -p ' + pars['threads'] + ' -i ' + pars['alignerIndexDir'] + ' -o ' + pars['outDir'] + ' --posBias --seqBias --gcBias -1 ' + pars['fqgz1'] + ' -2 ' + pars['fqgz2'])
        mu.writeCmd(pars['out'], ' ')
    # *** salmon ***
    elif pars['flavor'] == 'salmon':
        pars['fqgz1'] = pars['fagzFiles'][0]
        pars['fqgz2'] = pars['fagzFiles'][1]
        mu.writeCmd(pars['out'], 'salmon quant --libType A -p ' + pars['threads'] + ' -i ' + pars['alignerIndexDir'] + ' -o ' + pars['outDir'] + ' -1 ' + pars['fqgz1'] + ' -2 ' + pars['fqgz2'])
        mu.writeCmd(pars['out'], ' ')
    # *** star ***
    elif pars['flavor'] == 'star':
        # unsorted bam from STAR, then samtools sort/index into bamFile
        pars['out'].write('STAR --runThreadN ' + pars['threads'] + ' --genomeDir ' + pars['alignerIndexDir'] + ' --readFilesIn ' + ' '.join(pars['fagzFiles']) + ' --outFileNamePrefix ' + pars['outDir'] + ' --outSAMtype BAM Unsorted --outFilterType BySJout --outFilterMultimapNmax 20 --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.04 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outSAMstrandField intronMotif --readFilesCommand zcat --outSAMunmapped Within ' + '\n')
        pars['out'].write('samtools sort -@ ' + pars['threads'] + ' ' + pars['outDir'] + 'Aligned.out.bam -o ' + pars['bamFile'] + ' \n')
        pars['out'].write('samtools index ' + pars['bamFile'] + ' \n')
        pars['out'].write('rm -f ' + pars['outDir'] + 'Aligned.out.bam' + ' \n')
        pars['out'].write('\n')
    # *** hisat2 *** (substring match so e.g. 'hisat2-cufflinks' also lands here)
    elif 'hisat2' in pars['flavor']:
        pars['fqgz1'] = pars['fagzFiles'][0]
        pars['fqgz2'] = pars['fagzFiles'][1]
        sam = pars['outDir'] + 'tmp.sam'
        bam1 = pars['outDir'] + 'tmp.bam'  # NOTE(review): assigned but never used
        summary = pars['outDir'] + 'summary.log'
        # --dta-cufflinks only when the flavor asks for cufflinks compatibility
        dta = '--dta-cufflinks' if 'cufflinks' in pars['flavor'] else '--dta'
        strandCommand = '' if pars['stranded'] == False else '--rna-strandness RF'
        # run hisat2
        pars['out'].write('hisat2 -p ' + pars['threads'] + ' ' + dta + ' ' + strandCommand + ' --summary-file ' + summary + ' -x ' + pars['alignerIndexDir'] + ' -1 ' + pars['fqgz1'] + ' -2 ' + pars['fqgz2'] + ' -S ' + sam + '\n')
        # sort/index into the published bamFile
        pars['out'].write('samtools sort -O BAM -@ ' + pars['threads'] + ' ' + sam + ' -o ' + pars['bamFile'] + ' \n')
        pars['out'].write('samtools index ' + pars['bamFile'] + ' \n')
        # delete the intermediate sam
        pars['out'].write('rm -f ' + sam + ' ' + ' \n')
        pars['out'].write('\n')
    # *** rsem ***
    elif pars['flavor'] == 'rsem':
        # working with fastq files directly; earlier drafts used --star and
        # gzipped-read variants before settling on bowtie2
        pars['fqgz1'] = pars['fagzFiles'][0]
        pars['fqgz2'] = pars['fagzFiles'][1]
        pars['out'].write('rsem-calculate-expression -p ' + pars['threads'] + ' --paired-end --bowtie2 --estimate-rspd --append-names ' + pars['fqgz1'] + ' ' + pars['fqgz2'] + ' ' + pars['alignerIndexDir'] + ' ' + pars['outDir'] + pars['sampleReplicate'] + ' \n')
        pars['out'].write('\n')
    # *** kallisto ***
    elif pars['flavor'] == 'kallisto':
        pars['fqgz1'] = pars['fagzFiles'][0]
        pars['fqgz2'] = pars['fagzFiles'][1]
        # to reduce processing time and save space bams will not be generated
        pars['out'].write('kallisto quant -t ' + pars['threads'] + ' -i ' + pars['alignerIndexDir'] + ' -o ' + pars['outDir'] + ' ' + pars['fqgz1'] + ' ' + pars['fqgz2'] + '\n')
        pars['out'].write('\n')
    elif pars['flavor'] == 'skip':
        mu.logTime(pars['out'],'FINISH ALIGN')
        return
    # dead code retained by the original author as a bare string expression
    """
    # need to be tested - refactoring
    elif pars['flavor'] == 'tophat':
        # coverage search is only recommended for <45bp reads or <10m reads per sample
        pars['out'].write('# tophat -p ' + pars['threads'] + ' --output-dir ' + pars['outDir'] + ' --no-coverage-search ' + pars['alignerIndexDir'] + ' ' + ' '.join(pars['fagzFiles']) + '\n')
        pars['out'].write('tophat -p ' + pars['threads'] + ' --output-dir ' + pars['outDir'] + ' --no-coverage-search ' + pars['alignerIndexDir'] + ' ' + ' '.join(pars['fagzFiles']) + '\n')
        pars['out'].write('cp ' + pars['outDir'] + 'accepted_hits.bam ' + pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate'] + '.bam' + '\n')
        pars['out'].write('samtools index ' + pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate'] + '.bam' + '\n')
        pars['out'].write('\n')
    elif pars['flavor'] == 'kallisto-pseudo':
        # run kallisto specifically to generate pseudo bams
        pars['out'].write('kallisto quant --pseudobam -i ' + pars['alignerIndexDir'] + ' -o ' + pars['outDir'] + ' ' + ' '.join(pars['fagzFiles']) + ' > ' + pars['outDir'] + 'pseudo.sam' + '\n')
        # use sam tools to compress, sort, and index
        pars['out'].write('samtools view -@ ' + pars['threads'] + ' -bS ' + pars['outDir'] + 'pseudo.sam > ' + pars['outDir'] + 'pseudo.bam \n')
        pars['out'].write('samtools sort -@ ' + pars['threads'] + ' ' + pars['outDir'] + 'pseudo.bam ' + pars['outDir'] + 'pseudo-sorted \n')
        pars['out'].write('samtools index ' + pars['outDir'] + 'pseudo-sorted.bam \n')
        # clean up un necessary artifacts
        pars['out'].write('rm ' + pars['outDir'] + 'pseudo.sam \n')
        pars['out'].write('rm ' + pars['outDir'] + 'pseudo.bam \n')
        pars['out'].write('mv ' + pars['outDir'] + 'pseudo-sorted.bam ' + pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate'] + '.bam \n')
        pars['out'].write('mv ' + pars['outDir'] + 'pseudo-sorted.bam.bai ' + pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate'] + '.bam.bai \n')
        pars['out'].write('\n')
    """
    # count total aligned fragments (unique read names among mapped records)
    if pars['flavor'] == 'star' or pars['flavor'] == 'tophat' or 'hisat2' in pars['flavor']:
        countFile = pars['pipeline'] + '/04-aligned/totalFragments/' + pars['sampleReplicate'] + '.txt'
        bam = pars['pipeline'] + '/04-aligned/' + pars['sampleReplicate'] + '.bam'
        tmp1 = pars['pipeline'] + '/04-aligned/totalFragments/' + pars['sampleReplicate'] + '.tmp1.txt'
        tmp2 = pars['pipeline'] + '/04-aligned/totalFragments/' + pars['sampleReplicate'] + '.tmp2.txt'
        # -F 4 drops unmapped reads; cut -f1 keeps the read name column
        pars['out'].write('samtools view -F 4 ' + bam + ' | cut -f1 > ' + tmp1 + '\n')
        pars['out'].write('sort ' + tmp1 + ' | uniq > ' + tmp2 + '\n')
        pars['out'].write('wc -l ' + tmp2 + ' | sed "s/ .*//" > ' + countFile + '\n')
        pars['out'].write('rm -f ' + tmp1 + ' ' + tmp2 + '\n')
    mu.logTime(pars['out'],'FINISH ALIGN')
def _uniqueNames(records, sampleReplicate):
    # distinct uniqueName values belonging to one sample replicate
    return sorted({records[key]['uniqueName'] for key in records
                   if records[key]['sampleReplicate'] == sampleReplicate})


def _filesFor(records, uniqueName):
    # input files tied to one uniqueName (R1 and, for paired data, R2)
    myFiles = sorted([records[key]['file'] for key in records
                      if records[key]['uniqueName'] == uniqueName])
    assert len(myFiles) <= 2, 'SHOULD ONLY BE NO MORE THAN 2 MYFILES ASSOCIATED...'
    return myFiles


def preprocess(pars):
    """Write read-preprocessing commands for one sample replicate.

    Flavors:
      passthru     -- symlink the gzipped inputs into 02-reads (no fastqc)
      none         -- decompress inputs to plain fastq (optional fastqc)
      rrna-removal -- decompress, then strip rRNA reads with bbduk.sh
                      (this flavor assumes paired-end input)
      skip         -- do nothing
    Side effects: creates the 02-reads directory tree, fills
    pars['fastqFiles'] with the expected per-read fastq paths, and (for
    passthru) sets pars['fagzFiles'].

    Raises ValueError for an unrecognized flavor.  (Originally raised a bare
    string, which is a TypeError at runtime; it also referenced an undefined
    lowercase `none` in dead ternary branches -- a latent NameError.)
    """
    mu.logTime(pars['out'], 'START PREPROCESS')
    records = pars['records']
    pars['fastqFiles'] = []
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/02-reads', shell=True)
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/02-reads/fastq', shell=True)
    if pars['fastqc']:
        subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/02-reads/fastqc', shell=True)
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/02-reads/totalFragments', shell=True)
    pars['out'].write('echo preprocess \n')
    # no trimming no fastqc just set up symbolic link for next step
    if pars['flavor'] == 'passthru':
        for uniqueName in _uniqueNames(records, pars['sampleReplicate']):
            myFiles = _filesFor(records, uniqueName)
            # first (R1) fastq file
            in1 = myFiles[0]
            fastqFile1 = pars['pipeline'] + '/02-reads/fastq/' + uniqueName + '-R1.fastq.gz'
            pars['out'].write('ln -s ' + in1 + ' ' + fastqFile1 + '\n')
            # for paired end data we have a second set of reads per uniqueName
            if pars['end'] == 'paired':
                in2 = myFiles[1]
                fastqFile2 = pars['pipeline'] + '/02-reads/fastq/' + uniqueName + '-R2.fastq.gz'
                pars['out'].write('ln -s ' + in2 + ' ' + fastqFile2 + '\n')
            pars['out'].write('\n')
            # publish the linked files (last uniqueName wins, as before)
            pars['fagzFiles'] = [fastqFile1]
            if pars['end'] == 'paired':
                pars['fagzFiles'].append(fastqFile2)
            # count reads (4 fastq lines per read)
            readCountFile = pars['pipeline'] + '/02-reads/totalFragments/' + uniqueName + '.txt'
            pars['out'].write("zcat " + myFiles[0] + " | wc -l | awk '{print $1/4}' > " + readCountFile + '\n')
    # no preprocessing or trimming - just convert to fastq files
    elif pars['flavor'] == 'none':
        for uniqueName in _uniqueNames(records, pars['sampleReplicate']):
            myFiles = _filesFor(records, uniqueName)
            in1 = myFiles[0]
            fastqFile1 = pars['pipeline'] + '/02-reads/fastq/' + uniqueName + '-R1.fastq'
            pars['out'].write('zcat ' + in1 + ' > ' + fastqFile1 + '\n')
            if pars['fastqc']:
                pars['out'].write('fastqc ' + fastqFile1 + ' --outdir=' + pars['pipeline'] + '/02-reads/fastqc/' + '\n')
            if pars['end'] == 'paired':
                in2 = myFiles[1]
                fastqFile2 = pars['pipeline'] + '/02-reads/fastq/' + uniqueName + '-R2.fastq'
                pars['out'].write('zcat ' + in2 + ' > ' + fastqFile2 + '\n')
                if pars['fastqc']:
                    pars['out'].write('fastqc ' + fastqFile2 + ' --outdir=' + pars['pipeline'] + '/02-reads/fastqc/' + '\n')
            pars['out'].write('\n')
            # raw count from the compressed input, kept count from the fastq
            rawReadCountFile = pars['pipeline'] + '/02-reads/totalFragments/raw-' + uniqueName + '.txt'
            readCountFile = pars['pipeline'] + '/02-reads/totalFragments/' + uniqueName + '.txt'
            pars['out'].write("zcat " + myFiles[0] + " | wc -l | awk '{print $1/4}' > " + rawReadCountFile + '\n')
            pars['out'].write("wc -l " + fastqFile1 + "| awk '{print $1/4}' > " + readCountFile + '\n')
    elif pars['flavor'] == 'rrna-removal':
        for uniqueName in _uniqueNames(records, pars['sampleReplicate']):
            myFiles = _filesFor(records, uniqueName)
            in1 = myFiles[0]
            fastqFile1_raw = pars['pipeline'] + '/02-reads/fastq/' + uniqueName + '-raw-R1.fastq'
            pars['out'].write('zcat ' + in1 + ' > ' + fastqFile1_raw + '\n')
            if pars['end'] == 'paired':
                in2 = myFiles[1]
                fastqFile2_raw = pars['pipeline'] + '/02-reads/fastq/' + uniqueName + '-raw-R2.fastq'
                pars['out'].write('zcat ' + in2 + ' > ' + fastqFile2_raw + '\n')
            pars['out'].write('\n')
            # remove reads mapping to ribosomal rna: whatever bbduk does not
            # match against the rRNA reference goes to the out files.
            # NOTE: uses fastqFile2_raw unconditionally, so paired-end only.
            fastqFile1 = pars['pipeline'] + '/02-reads/fastq/' + uniqueName + '-R1.fastq'
            fastqFile2 = pars['pipeline'] + '/02-reads/fastq/' + uniqueName + '-R2.fastq'
            pars['out'].write('bbduk.sh in1=' + fastqFile1_raw + ' in2=' + fastqFile2_raw + ' out1=' + fastqFile1 + ' out2=' + fastqFile2 + ' ref=' + pars['rrnaReference'] + '\n')
            # fastqc
            if pars['fastqc']:
                pars['out'].write('fastqc ' + fastqFile1 + ' --outdir=' + pars['pipeline'] + '/02-reads/fastqc/' + '\n')
                if pars['end'] == 'paired':
                    pars['out'].write('fastqc ' + fastqFile2 + ' --outdir=' + pars['pipeline'] + '/02-reads/fastqc/' + '\n')
            # count reads before and after rRNA removal
            rawReadCountFile = pars['pipeline'] + '/02-reads/totalFragments/raw-' + uniqueName + '.txt'
            readCountFile = pars['pipeline'] + '/02-reads/totalFragments/' + uniqueName + '.txt'
            pars['out'].write("wc -l " + fastqFile1_raw + "| awk '{print $1/4}' > " + rawReadCountFile + '\n')
            pars['out'].write("wc -l " + fastqFile1 + "| awk '{print $1/4}' > " + readCountFile + '\n')
    elif pars['flavor'] == 'skip':
        pass
    else:
        raise ValueError('DONT UNDERSTAND PREPROCESS FLAVOR!!: ' + str(pars['flavor']))
    # record the fastq path expected for every read of this replicate
    myKeys = sorted([key for key in records.keys()
                     if records[key]['sampleReplicate'] == pars['sampleReplicate']])
    for myKey in myKeys:
        record = records[myKey]
        label = record['uniqueName'] + '-R' + record['read']
        fastqFile = pars['pipeline'] + '/02-reads/fastq/' + label + '.fastq'
        pars['fastqFiles'].append(fastqFile)
    # (an unfinished trimmomatic flavor draft was removed here pending refactor)
    mu.logTime(pars['out'], 'FINISH PREPROCESS')
def combine(pars): mu.logTime(pars['out'], 'START COMBINE') # add location of fagz files fastqFile1 = pars['pipeline'] + '/03-combined/fastq/' + pars[ 'sampleReplicate'] + '-R1.fastq' pars['fagzFiles'] = [fastqFile1 + '.gz'] if pars['end'] == 'paired': fastqFile2 = pars['pipeline'] + '/03-combined/fastq/' + pars[ 'sampleReplicate'] + '-R2.fastq' pars['fagzFiles'].append(fastqFile2 + '.gz') # use when this function has already been called if pars['flavor'] == 'skip': return path = pars['pipeline'] + '/02-reads/fastq/' subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/03-combined', shell=True) subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/03-combined/fastq', shell=True) subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/03-combined/totalFragments', shell=True) pars['out'].write('echo combine \n') # combine together all R1 reads into a single compressed file relevantFiles = sorted([ file for file in pars['fastqFiles'] if pars['sampleReplicate'] in file and 'R1.fastq' in file ]) # print relevantFiles pars['out'].write('cat ' + ' '.join(relevantFiles) + ' > ' + fastqFile1 + '\n') # count total fragments countFile = pars['pipeline'] + '/03-combined/totalFragments/' + pars[ 'sampleReplicate'] + '.txt' pars['out'].write("wc -l " + fastqFile1 + "| awk '{print $1/4}' > " + countFile + '\n') # zip it good pars['out'].write('gzip ' + fastqFile1 + '\n') pars['out'].write('\n') if pars['end'] == 'paired': # combine together all R2 reads int a single compressed file relevantFiles = sorted([ file for file in pars['fastqFiles'] if pars['sampleReplicate'] in file and 'R2.fastq' in file ]) # print relevantFiles pars['out'].write('cat ' + ' '.join(relevantFiles) + ' > ' + fastqFile2 + '\n') pars['out'].write('gzip ' + fastqFile2 + '\n') pars['out'].write('\n') mu.logTime(pars['out'], 'FINISH COMBINE') return
def bam2fastq(pars):
    """Write commands converting pars['bamFile'] back into gzipped fastq files.

    Flavors:
      bam2fastq     -- name-sort the bam, split to R1/R2 fastq, gzip, count
                       reads, then delete the bam and intermediates
      link          -- symlink gzipped fastqs from an earlier pipeline run
                       (requires pars['fastqPipeline'] and pars['pwd'])
      skip/passthru -- nothing to write; the path bookkeeping below suffices
    Side effects: sets sortedBam, fastq1/2, fqgz1/2 and fagzFiles in pars
    regardless of flavor.
    """
    mu.logTime(pars['out'], 'START BAM2FASTQ')
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/01-fastqs/', shell=True)
    subprocess.check_call('mkdir -p ' + pars['pipeline'] + '/01-fastqs/' + pars['sampleReplicate'], shell=True)
    outDir = pars['pipeline'] + '/01-fastqs/' + pars['sampleReplicate']
    # publish the artifact paths for downstream steps
    pars['sortedBam'] = outDir + '/sorted.bam'
    pars['fastq1'], pars['fastq2'] = outDir + '/R1.fastq', outDir + '/R2.fastq'
    pars['fqgz1'], pars['fqgz2'] = outDir + '/R1.fastq.gz', outDir + '/R2.fastq.gz'
    pars['fagzFiles'] = [pars['fqgz1'], pars['fqgz2']]
    if pars['flavor'] == 'bam2fastq':
        # name-sort so samtools fastq emits read pairs together
        mu.writeCmd(pars['out'], 'samtools sort -@ ' + pars['threads'] + ' -n ' + pars['bamFile'] + ' -o ' + pars['sortedBam'])
        # bam2fastq
        mu.writeCmd(pars['out'], 'samtools fastq -@ ' + pars['threads'] + ' ' + pars['sortedBam'] + ' -1 ' + pars['fastq1'] + ' -2 ' + pars['fastq2'])
        # compress
        mu.writeCmd(pars['out'], 'gzip -c ' + pars['fastq1'] + ' > ' + pars['fqgz1'])
        mu.writeCmd(pars['out'], 'gzip -c ' + pars['fastq2'] + ' > ' + pars['fqgz2'])
        # count reads (4 fastq lines per read)
        mu.writeCmd(pars['out'], "wc -l " + pars['fastq1'] + " | awk '{print $1/4}' > " + outDir + "/readCount1.txt")
        mu.writeCmd(pars['out'], "wc -l " + pars['fastq2'] + " | awk '{print $1/4}' > " + outDir + "/readCount2.txt")
        # clean up the bam and uncompressed intermediates
        mu.writeCmd(pars['out'], 'rm -f ' + pars['bamFile'])
        mu.writeCmd(pars['out'], 'rm -f ' + pars['sortedBam'])
        mu.writeCmd(pars['out'], 'rm -f ' + pars['fastq1'])
        mu.writeCmd(pars['out'], 'rm -f ' + pars['fastq2'])
    if pars['flavor'] == 'link':
        # explicit check instead of the original assert (asserts vanish under -O)
        if 'fastqPipeline' not in pars:
            raise KeyError('pars["fastqPipeline"] is required for the link flavor')
        fqgz1_source = pars['pwd'] + pars['fqgz1'].replace(pars['pipeline'], pars['fastqPipeline'])
        fqgz2_source = pars['pwd'] + pars['fqgz2'].replace(pars['pipeline'], pars['fastqPipeline'])
        # link with old files
        mu.writeCmd(pars['out'], 'ln -s ' + fqgz1_source + ' ' + pars['fqgz1'])
        mu.writeCmd(pars['out'], 'ln -s ' + fqgz2_source + ' ' + pars['fqgz2'])
        # read counts not attained here - get this from fastqPipeline...
    if pars['flavor'] == 'skip' or pars['flavor'] == 'passthru':
        # nothing to do; pars was already updated above.  (The original body
        # was the bare name `next` -- a no-op expression, not a statement.)
        pass
    mu.logTime(pars['out'], 'FINISH BAM2FASTQ')