def processSample(sampleName, sampleNameDict):
    """Map all libraries of one sample in parallel, then merge, filter and de-duplicate.

    One process per key of ``sampleNameDict`` runs ``processSampleFromLibarary``;
    the workers report the BAM files they produce through a ``Manager`` list.
    After every worker has been joined, command scripts are written next to
    the first reported BAM via ``NGSTools.writeCommands`` (the module-level
    ``_run`` flag is forwarded as ``run=``) to:

    1. merge the per-library BAMs into ``<sampleName>_merged.bam``;
    2. filter with ``samtools view -q 10`` into ``<sampleName>_properly.bam``;
    3. remove duplicates with picard when the module-level ``RMDUP`` flag is set.

    Args:
        sampleName: label used in the output BAM and script file names.
        sampleNameDict: mapping whose keys are library IDs; each value is
            handed unchanged to ``processSampleFromLibarary``.

    Raises:
        IndexError: if no worker reported a BAM file (nothing to merge).
    """
    # Shared, manager-backed list so that child processes can report results.
    mng = Manager()
    libraryBamFileList = mng.list()

    # Start one worker per library, then wait for all of them.
    workers = []
    for sampleID in sampleNameDict:
        worker = Process(
            name=sampleID,
            target=processSampleFromLibarary,
            args=(sampleID, sampleNameDict[sampleID], libraryBamFileList),
        )
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    # Fail with an explicit message instead of a bare "list index out of range"
    # when every worker failed or mapping was disabled.
    if len(libraryBamFileList) == 0:
        raise IndexError("no BAM file was produced for sample %s" % sampleName)

    ###################### 2.1 post mapping ####################
    bamFileDir = os.path.dirname(libraryBamFileList[0])
    mergedBamFilePath = os.path.join(bamFileDir, "%s_merged.bam" % sampleName)
    finalBamFilePath = os.path.join(bamFileDir, "%s_properly.bam" % sampleName)

    # Merge the BAMs of all libraries into a single file.
    command = NGSTools.picard_merge(libraryBamFileList, mergedBamFilePath, cfg)
    NGSTools.writeCommands(
        command,
        os.path.join(bamFileDir, 'picard_mergebam_' + sampleName + '.sh'),
        run=_run)

    ###################### 3. filter bam ######################
    command = 'samtools view -Sb -h -q 10 %s > %s ' % (mergedBamFilePath, finalBamFilePath)
    NGSTools.writeCommands(
        command,
        os.path.join(bamFileDir, 'filterBam_' + sampleName + '.sh'),
        run=_run)

    ###################### 4. remove duplicates ##################
    if RMDUP:
        command = NGSTools.picard_rmdup(finalBamFilePath, True, cfg)
        NGSTools.writeCommands(
            command,
            os.path.join(bamFileDir, 'picard_rmdup_' + sampleName + '.sh'),
            run=_run)
        # Escape the dot so that only a literal ".bam" suffix is rewritten.
        # The updated path is not used further inside this function.
        finalBamFilePath = re.sub(r'\.bam$', '.rmdup.bam', finalBamFilePath)
def processSampleFromLibarary(sampleID, sampleIDList, libraryBamFileList):
    """Process one library and report its sorted BAM to the shared list.

    Builds an ``NGSTools.NGSTools`` object for the library, runs adapter
    cutting and FastQC when the module-level ``QC`` flag is set, runs BWA when
    ``BWA`` is set, sorts with samtools and appends the value returned by
    ``samtools_sort`` to ``libraryBamFileList``.

    Args:
        sampleID: library identifier, passed as the NGSTools sample name.
        sampleIDList: sequence whose items 0 and 1 are passed as fq1 and fq2.
        libraryBamFileList: shared list that collects one entry per library.
    """
    ########################## 0. init #########################
    # Constructor signature, for reference:
    # __init__(self, sampleName, outdir, fq1, fq2='', qualityBase='33', cfgfile='~/.NGSTools.cfg')
    outputDir = args.outDir
    fastq1 = sampleIDList[0]
    fastq2 = sampleIDList[1]
    library = NGSTools.NGSTools(
        sampleID,
        outputDir,
        fq1=fastq1,
        fq2=fastq2,
        qualityBase=args.qbase,
        cfgfile=os.path.abspath(args.config),
    )

    ########################## 1. QC #########################
    if QC:
        library.cutadapter(run=_run)
        library.QC_fastqc(run=_run)

    ########################## 2. Mapping #######################
    if BWA:
        # Alternative aligner: library.bowtie2(mode='--end-to-end', run=_run)
        sortedBam = library.bwa(run=_run)

    # NOTE(review): sorting is kept unconditional here, so the value reported
    # below always comes from samtools_sort -- confirm that this step is not
    # meant to be skipped together with the BWA step.
    sortedBam = library.samtools_sort(run=_run)

    libraryBamFileList.append(sortedBam)
def processSampleFromLibarary(sampleID, sampleIDList, libraryBamFileList):
    """Process one bisulfite library and report its BAM to the shared list.

    Builds an ``NGSTools.NGSTools`` object for the library, runs Trim Galore
    and FastQC when the module-level ``QC`` flag is set, then maps with
    Bismark or BS-Seeker2 depending on the module-level ``Bismark`` /
    ``Bs_seeker2`` flags (Bismark wins when both are set).

    Args:
        sampleID: library identifier, passed as the NGSTools sample name.
        sampleIDList: sequence whose items 0 and 1 are passed as fq1 and fq2.
        libraryBamFileList: shared list; the value returned by the mapper is
            appended to it. Nothing is appended when no mapper is enabled.
    """
    ########################## 0. init #########################
    # Constructor signature, for reference:
    # __init__(self, sampleName, outdir, fq1, fq2='', quanlityBase='33', cfgfile='~/.NGSTools.cfg')
    mySample = NGSTools.NGSTools(
        sampleID,
        args.outDir,
        fq1=sampleIDList[0],
        fq2=sampleIDList[1],
        cfgfile=os.path.abspath(args.config),
    )

    ########################## 1. QC #########################
    if QC:
        mySample.trim_Galore(args.rrbs, run=_run)
        mySample.QC_fastqc(run=_run)

    ########################## 2. Mapping #######################
    if Bismark:
        finalBam = mySample.Bismark(run=_run)
    elif Bs_seeker2:
        finalBam = mySample.Bs_seeker2(run=_run)
    else:
        # No mapper selected: there is no BAM to report. Returning here avoids
        # the NameError the append below would raise on the unbound name.
        # print(...) with one argument behaves the same on Python 2 and 3.
        print('WARNING: not choose a mapping tools')
        return

    libraryBamFileList.append(finalBam)
def processSample(sampleName, sampleNameDict):
    """Map all libraries of one sample in parallel, then merge, de-duplicate and extract methylation.

    One process per key of ``sampleNameDict`` runs ``processSampleFromLibarary``;
    the workers report the BAM files they produce through a ``Manager`` list.
    After every worker has been joined:

    1. the per-library BAMs are merged into ``<sampleName>_final.bam`` in the
       directory of the first reported BAM;
    2. duplicates are removed with picard when the module-level ``RMDUP``
       flag is set;
    3. ``NGSTools.methylation_extractor`` is called on the resulting BAM when
       the module-level ``Methylation_extractor`` flag is set.

    The merge and rmdup command scripts are written under
    ``<args.outDir>/mapping`` via ``NGSTools.writeCommands`` (the module-level
    ``_run`` flag is forwarded as ``run=``).

    Args:
        sampleName: label used in the output BAM and script file names.
        sampleNameDict: mapping whose keys are library IDs; each value is
            handed unchanged to ``processSampleFromLibarary``.

    Raises:
        IndexError: if no worker reported a BAM file (nothing to merge).
    """
    # Shared, manager-backed list so that child processes can report results.
    mng = Manager()
    libraryBamFileList = mng.list()

    # Start one worker per library, then wait for all of them.
    workers = []
    for sampleID in sampleNameDict:
        worker = Process(
            name=sampleID,
            target=processSampleFromLibarary,
            args=(sampleID, sampleNameDict[sampleID], libraryBamFileList),
        )
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    # Fail with an explicit message instead of a bare "list index out of range"
    # when no mapper was enabled or every worker failed.
    if len(libraryBamFileList) == 0:
        raise IndexError("no BAM file was produced for sample %s" % sampleName)

    ###################### 2.1 post mapping ####################
    bamFileDir = os.path.dirname(libraryBamFileList[0])
    finalBamFilePath = os.path.join(bamFileDir, "%s_final.bam" % sampleName)
    scriptDir = os.path.join(args.outDir, 'mapping')

    # Merge the BAMs of all libraries into a single file.
    command = NGSTools.picard_merge(libraryBamFileList, finalBamFilePath, cfg)
    NGSTools.writeCommands(
        command,
        os.path.join(scriptDir, 'picard_mergebam_' + sampleName + '.sh'),
        run=_run)

    ###################### 3. remove duplicates ##################
    if RMDUP:
        command = NGSTools.picard_rmdup(finalBamFilePath, True, cfg)
        NGSTools.writeCommands(
            command,
            os.path.join(scriptDir, 'picard_rmdup_' + sampleName + '.sh'),
            run=_run)
        # Escape the dot so that only a literal ".bam" suffix is rewritten.
        finalBamFilePath = re.sub(r'\.bam$', '.rmdup.bam', finalBamFilePath)

    ##################### 4. methylation extraction ##################
    if Methylation_extractor:
        NGSTools.methylation_extractor(
            finalBamFilePath, os.path.join(bamFileDir, sampleName), cfg)
def processSample(sampleName, sampleNameDict):
    """Run the per-sample DNA pipeline: parallel mapping, merge, MAPQ filter, rmdup.

    For every key of ``sampleNameDict`` a child process executes
    ``processSampleFromLibarary`` and appends its BAM to a ``Manager`` list.
    When all children have finished, three command scripts are written into
    the directory of the first reported BAM through ``NGSTools.writeCommands``
    (the module-level ``_run`` flag is forwarded as ``run=``):

    * ``picard_mergebam_<sampleName>.sh`` -- merge into ``<sampleName>_merged.bam``;
    * ``filterBam_<sampleName>.sh`` -- ``samtools view -q 10`` into
      ``<sampleName>_properly.bam``;
    * ``picard_rmdup_<sampleName>.sh`` -- only when the module-level ``RMDUP``
      flag is set.

    Args:
        sampleName: label used in the output BAM and script file names.
        sampleNameDict: mapping of library ID to the value that is passed
            through to ``processSampleFromLibarary``.

    Raises:
        IndexError: if the workers reported no BAM file at all.
    """
    # Manager-backed list: visible to, and writable from, the child processes.
    mng = Manager()
    libraryBamFileList = mng.list()

    # Fan out: one process per library.
    children = []
    for sampleID in sampleNameDict:
        child = Process(
            name=sampleID,
            target=processSampleFromLibarary,
            args=(sampleID, sampleNameDict[sampleID], libraryBamFileList),
        )
        child.start()
        children.append(child)

    # Fan in: block until every library is done.
    for child in children:
        child.join()

    # An empty result list would otherwise surface as an uninformative
    # "list index out of range" on the next line.
    if len(libraryBamFileList) == 0:
        raise IndexError("no BAM file was produced for sample %s" % sampleName)

    ###################### 2.1 post mapping ####################
    bamFileDir = os.path.dirname(libraryBamFileList[0])
    mergedBamFilePath = os.path.join(bamFileDir, "%s_merged.bam" % sampleName)
    finalBamFilePath = os.path.join(bamFileDir, "%s_properly.bam" % sampleName)

    # Merge the BAMs of all libraries into a single file.
    mergeScript = os.path.join(bamFileDir, 'picard_mergebam_' + sampleName + '.sh')
    command = NGSTools.picard_merge(libraryBamFileList, mergedBamFilePath, cfg)
    NGSTools.writeCommands(command, mergeScript, run=_run)

    ###################### 3. filter bam ######################
    filterScript = os.path.join(bamFileDir, 'filterBam_' + sampleName + '.sh')
    command = 'samtools view -Sb -h -q 10 %s > %s ' % (mergedBamFilePath, finalBamFilePath)
    NGSTools.writeCommands(command, filterScript, run=_run)

    ###################### 4. remove duplicates ##################
    if RMDUP:
        rmdupScript = os.path.join(bamFileDir, 'picard_rmdup_' + sampleName + '.sh')
        command = NGSTools.picard_rmdup(finalBamFilePath, True, cfg)
        NGSTools.writeCommands(command, rmdupScript, run=_run)
        # The dot is escaped so that only a real ".bam" extension matches.
        # The updated path is not used further inside this function.
        finalBamFilePath = re.sub(r'\.bam$', '.rmdup.bam', finalBamFilePath)
def processSample(sampleName, sampleNameDict):
    """Run the per-sample methylation pipeline: parallel mapping, merge, rmdup, extraction.

    For every key of ``sampleNameDict`` a child process executes
    ``processSampleFromLibarary`` and appends its BAM to a ``Manager`` list.
    When all children have finished, the BAMs are merged into
    ``<sampleName>_final.bam`` (in the directory of the first reported BAM),
    duplicates are removed when the module-level ``RMDUP`` flag is set, and
    ``NGSTools.methylation_extractor`` is invoked when the module-level
    ``Methylation_extractor`` flag is set.

    Command scripts go to ``<args.outDir>/mapping`` through
    ``NGSTools.writeCommands`` (the module-level ``_run`` flag is forwarded
    as ``run=``).

    Args:
        sampleName: label used in the output BAM and script file names.
        sampleNameDict: mapping of library ID to the value that is passed
            through to ``processSampleFromLibarary``.

    Raises:
        IndexError: if the workers reported no BAM file at all.
    """
    # Manager-backed list: visible to, and writable from, the child processes.
    mng = Manager()
    libraryBamFileList = mng.list()

    # Fan out: one process per library.
    children = []
    for sampleID in sampleNameDict:
        child = Process(
            name=sampleID,
            target=processSampleFromLibarary,
            args=(sampleID, sampleNameDict[sampleID], libraryBamFileList),
        )
        child.start()
        children.append(child)

    # Fan in: block until every library is done.
    for child in children:
        child.join()

    # An empty result list would otherwise surface as an uninformative
    # "list index out of range" on the next line.
    if len(libraryBamFileList) == 0:
        raise IndexError("no BAM file was produced for sample %s" % sampleName)

    ###################### 2.1 post mapping ####################
    bamFileDir = os.path.dirname(libraryBamFileList[0])
    finalBamFilePath = os.path.join(bamFileDir, "%s_final.bam" % sampleName)

    # Merge the BAMs of all libraries into a single file.
    mergeScript = os.path.join(
        args.outDir, 'mapping', 'picard_mergebam_' + sampleName + '.sh')
    command = NGSTools.picard_merge(libraryBamFileList, finalBamFilePath, cfg)
    NGSTools.writeCommands(command, mergeScript, run=_run)

    ###################### 3. remove duplicates ##################
    if RMDUP:
        rmdupScript = os.path.join(
            args.outDir, 'mapping', 'picard_rmdup_' + sampleName + '.sh')
        command = NGSTools.picard_rmdup(finalBamFilePath, True, cfg)
        NGSTools.writeCommands(command, rmdupScript, run=_run)
        # The dot is escaped so that only a real ".bam" extension matches.
        finalBamFilePath = re.sub(r'\.bam$', '.rmdup.bam', finalBamFilePath)

    ##################### 4. methylation extraction ##################
    if Methylation_extractor:
        outputPrefix = os.path.join(bamFileDir, sampleName)
        NGSTools.methylation_extractor(finalBamFilePath, outputPrefix, cfg)
Methylation_extractor = True if 6 in analy: swDMR = True if 7 in analy: bismark_methylation_extractor = True if args.debug: _run = False else: _run = True #read config file cfg = NGSTools.getConfig(os.path.abspath(args.config)) # sample list parse def sampleListParser(): ''' parse the sample list file. return a dictionary: { sampleName1: { sample ID: [ fastq1 file path, fastq2 file path ] }, sampleName2:
def processSample(line, condition, transcripts, countsFiles, finalBam, expressCXB):
    """Run the RNA-seq steps enabled by the module-level flags for one sample-list line.

    Args:
        line: tab-separated record ``name<TAB>condition<TAB>fq1[<TAB>fq2]``.
            A three-column line is treated as single-end and ``'-'`` is
            passed as fq2.
        condition: dict-like, updated in place when ``Mapping`` is set:
            condition label -> comma-joined BAM paths.
        transcripts: list-like; the sample's cufflinks ``transcripts.gtf``
            path is appended when ``Cufflinks`` is set.
        countsFiles: dict-like; when ``DESeq`` is set, the value returned by
            ``HTSeq_count`` is mapped to ``"<condition>|<name>"``.
        finalBam: dict-like; when ``GATK`` is set, the value returned by
            ``realn`` is mapped to the condition label.
        expressCXB: accepted for interface compatibility; not used here.
    """
    cols = line.strip().split('\t')
    if len(cols) == 3:
        # single end library
        fq2 = '-'
    else:
        # paired end
        fq2 = cols[3]
    name, conditionName, fq1 = cols[0], cols[1], cols[2]
    bam = ''

    ########################## 0. init #########################
    # Constructor signature, for reference:
    # __init__(self, sampleName, outdir, fq1, fq2='', quanlityBase='32', cfgfile='~/.NGSTools.cfg')
    mySample = NGSTools.NGSTools(
        name, args.outDir, fq1, fq2,
        libType=args.libraryType,
        cfgfile=os.path.abspath(args.config))

    #################### 1. Quality Control ####################
    if QC:
        ###### 1.1 cut adapter ######
        # Only raw data is trimmed. Adapters can be given explicitly, e.g. for
        # the Nextera kit:
        #   cutadapter(adapter5='CTGTCTCTTATACAC', adapter3='CTGTCTCTTATACAC', run=_run)
        if args.dataType == 'raw':
            mySample.cutadapter(run=_run)
        ##### 1.2 fastqc #####
        mySample.QC_fastqc(run=_run)

    ######################## 2. Mapping ########################
    if Mapping:
        bam = mySample.tophat2(run=_run)
        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if conditionName in condition:
            condition[conditionName] += "," + bam
        else:
            condition[conditionName] = bam

    if GFold:
        # GFold count
        mySample.gfoldCount(run=_run)

    if DESeq:
        # DESeq2
        count = mySample.HTSeq_count(run=_run)
        countsFiles[count] = conditionName + '|' + name

    if GATK:
        # remove duplicates, reorder, split N-cigar reads, realign
        mySample.rmdup(run=_run)
        mySample.picard_reorder(run=_run)
        mySample.splitN(run=_run)
        realnBam = mySample.realn(run=_run)
        # Recalibration (mySample.recal) is skipped: it needs known SNP sites.
        finalBam[realnBam] = conditionName
        # samtools call SNP/InDel
        mySample.samtools_call(run=_run)
        mySample.samtools_filter(run=_run)

    ######################## DEGs calling preparation ########################
    if Cufflinks:
        ##### 3. cufflinks #####
        cuffdir = os.path.join(args.outDir, 'cufflinks')
        # Create the directory without a check-then-create race: another
        # sample being processed at the same time may create it first.
        try:
            os.mkdir(cuffdir)
        except OSError:
            if not os.path.isdir(cuffdir):
                raise

        sampleCuffDir = os.path.join(cuffdir, conditionName + '_' + name)
        command = 'cufflinks --library-type %s -p 4 -g %s -o %s %s' % (
            args.libraryType, cfg.gtf, sampleCuffDir, bam)
        NGSTools.writeCommands(command, cuffdir + '/cufflinks_%s.sh' % name, _run)
        transcripts.append(os.path.join(sampleCuffDir, 'transcripts.gtf'))
if 5 in analy: DEXSeq = True if 6 in analy: GATK = True if 7 in analy: GFold = True global _run _run = '' if args.debug == 'False': _run = True else: _run = False ## cfg = NGSTools.getConfig(os.path.abspath(args.config)) def processSample(line, condition, transcripts, countsFiles, finalBam, expressCXB): cols = line.strip().split('\t') if len(cols) == 3: # single end library fq2 = '-' else: # paired end fq2 = cols[3] sample = {
def processSample(line, condition, transcripts, countsFiles, finalBam):
    """Run the RNA-seq steps enabled by the module-level flags for one sample-list line.

    Args:
        line: tab-separated record ``name<TAB>condition<TAB>fq1[<TAB>fq2]``.
            A three-column line is treated as single-end and ``'-'`` is
            passed as fq2.
        condition: dict-like, updated in place when ``Mapping`` is set:
            condition label -> comma-joined BAM paths.
        transcripts: list-like; the sample's cufflinks ``transcripts.gtf``
            path is appended when ``Cufflinks`` is set.
        countsFiles: dict-like; when ``DESeq2`` is set, the value returned by
            ``HTSeq_count`` is mapped to the condition label.
        finalBam: dict-like; when ``GATK`` is set, the value returned by
            ``realn`` is mapped to the condition label.
    """
    cols = line.strip().split('\t')
    if len(cols) == 3:
        # single end library
        fq2 = '-'
    else:
        # paired end
        fq2 = cols[3]
    name, conditionName, fq1 = cols[0], cols[1], cols[2]
    bam = ''

    ########################## 0. init #########################
    # Constructor signature, for reference:
    # __init__(self, sampleName, outdir, fq1, fq2='', quanlityBase='32', cfgfile='~/.NGSTools.cfg')
    mySample = NGSTools.NGSTools(
        name, args.outDir, fq1, fq2,
        cfgfile=os.path.abspath(args.config))

    #################### 1. Quality Control ####################
    if QC:
        ###### 1.1 cut adapter ######
        # Only raw data is trimmed; cutadapter() also accepts explicit
        # adapter5= / adapter3= sequences.
        if args.dataType == 'raw':
            mySample.cutadapter(run=_run)
        ##### 1.2 fastqc #####
        mySample.QC_fastqc(run=_run)

    ######################## 2. Mapping ########################
    if Mapping:
        bam = mySample.tophat2(run=_run)
        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if conditionName in condition:
            condition[conditionName] += "," + bam
        else:
            condition[conditionName] = bam

    if GFold:
        # GFold count
        mySample.gfoldCount(run=_run)

    if DESeq2:
        # DESeq2
        count = mySample.HTSeq_count(run=_run)
        countsFiles[count] = conditionName

    if GATK:
        # remove duplicates, reorder, split N-cigar reads, realign
        mySample.rmdup(run=_run)
        mySample.picard_reorder(run=_run)
        mySample.splitN(run=_run)
        realnBam = mySample.realn(run=_run)
        # Recalibration (mySample.recal) is skipped: it needs known SNP sites.
        finalBam[realnBam] = conditionName
        # samtools call SNP/InDel
        mySample.samtools_call(run=_run)
        mySample.samtools_filter(run=_run)

    ######################## DEGs calling preparation ########################
    if Cufflinks:
        ##### 3. cufflinks #####
        cuffdir = os.path.join(args.outDir, 'cufflinks')
        # Create the directory without a check-then-create race: another
        # sample being processed at the same time may create it first.
        try:
            os.mkdir(cuffdir)
        except OSError:
            if not os.path.isdir(cuffdir):
                raise

        sampleCuffDir = os.path.join(cuffdir, conditionName + '_' + name)
        command = 'cufflinks -p 4 -g %s -o %s %s' % (cfg.gtf, sampleCuffDir, bam)
        NGSTools.writeCommands(command, cuffdir + '/cufflinks_%s.sh' % name, _run)
        transcripts.append(os.path.join(sampleCuffDir, 'transcripts.gtf'))
if 5 in analy: DEXSeq = True if 6 in analy: GATK = True if 7 in analy: GFold = True global _run _run = '' if args.debug == False: _run = True else: _run = False ## cfg = NGSTools.getConfig(os.path.abspath(args.config)) def processSample(line, condition, transcripts, countsFiles, finalBam): cols = line.strip().split('\t') if len(cols) == 3: # single end library fq2 = '-' else: # paired end fq2 = cols[3] sample = {
def processSample(sampleName, sampleNameDict):
    """Map all libraries of one sample in parallel, then merge, de-duplicate and call SNPs.

    One process per key of ``sampleNameDict`` runs ``processSampleFromLibarary``;
    the workers report the BAM files they produce through a ``Manager`` list.
    After every worker has been joined:

    1. the per-library BAMs are merged into ``<sampleName>_merged.bam`` in the
       directory of the first reported BAM;
    2. duplicates are removed with picard when the module-level ``RMDUP``
       flag is set (the working BAM then becomes ``<sampleName>_merged.rmdup.bam``);
    3. when the module-level ``MPILEUP`` flag is set, variants are called and
       filtered with bcftools under ``<args.outDir>/SNP/<sampleName>``.

    Command scripts are written via ``NGSTools.writeCommands`` (the
    module-level ``_run`` flag is forwarded as ``run=``).

    Args:
        sampleName: label used in the output file and script names.
        sampleNameDict: mapping whose keys are library IDs; each value is
            handed unchanged to ``processSampleFromLibarary``.

    Raises:
        IndexError: if no worker reported a BAM file (nothing to merge).
    """
    # Shared, manager-backed list so that child processes can report results.
    mng = Manager()
    libraryBamFileList = mng.list()

    # Start one worker per library, then wait for all of them.
    workers = []
    for sampleID in sampleNameDict:
        worker = Process(
            name=sampleID,
            target=processSampleFromLibarary,
            args=(sampleID, sampleNameDict[sampleID], libraryBamFileList),
        )
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    # Fail with an explicit message instead of a bare "list index out of range"
    # when every worker failed or mapping was disabled.
    if len(libraryBamFileList) == 0:
        raise IndexError("no BAM file was produced for sample %s" % sampleName)

    ###################### 2.1 post mapping ####################
    bamFileDir = os.path.dirname(libraryBamFileList[0])
    mergedBamFilePath = os.path.join(bamFileDir, "%s_merged.bam" % sampleName)
    finalBamFilePath = mergedBamFilePath

    # Merge the BAM from each lane into one BAM.
    command = NGSTools.picard_merge(libraryBamFileList, mergedBamFilePath, cfg)
    NGSTools.writeCommands(
        command,
        os.path.join(bamFileDir, 'picard_mergebam_' + sampleName + '.sh'),
        run=_run)

    ###################### 2.2 filter bam ######################
    # The samtools MAPQ filter ("samtools view -q 10") is disabled in this
    # pipeline; the merged BAM is used directly.

    ###################### 3. remove duplicates ##################
    if RMDUP:
        command = NGSTools.picard_rmdup(mergedBamFilePath, True, cfg)
        NGSTools.writeCommands(
            command,
            os.path.join(bamFileDir, 'picard_rmdup_' + sampleName + '.sh'),
            run=_run)
        # Escape the dot so that only a literal ".bam" suffix is rewritten.
        finalBamFilePath = re.sub(r'\.bam$', '.rmdup.bam', finalBamFilePath)

    ####################### 4. call SNP ######################
    if MPILEUP:
        SNP_out = os.path.join(args.outDir, "SNP", sampleName)
        NGSTools._mkdir(SNP_out)

        command, rawVcf = NGSTools.bcftools_call(
            finalBamFilePath, cfg, outdir=SNP_out, sampleName=sampleName)
        NGSTools.writeCommands(
            command,
            os.path.join(SNP_out, 'bcftools_call_' + sampleName + '.sh'),
            run=_run)

        # str.replace() is literal, so replace("vcf$", ...) never matched and
        # the filter output path was identical to its input. Use a real
        # end-anchored regex instead.
        outVcf = re.sub(r'vcf$', 'flt.vcf', rawVcf)
        if outVcf == rawVcf:
            # The raw file does not end in "vcf" (e.g. a compressed file):
            # still make sure the filter never writes onto its own input.
            outVcf = rawVcf + '.flt.vcf'
        command = NGSTools.bcftools_filter(rawVcf, outVcf, cfg)
        NGSTools.writeCommands(
            command,
            os.path.join(SNP_out, 'bcftools_filter_' + sampleName + '.sh'),
            run=_run)
if 4 in analy: RMDUP = True if 5 in analy: Methylation_extractor = True if 6 in analy: swDMR = True if args.debug: _run = False else: _run = True #read config file cfg = NGSTools.getConfig(os.path.abspath(args.config)) # sample list parse def sampleListParser(): ''' parse the sample list file. return a dictionary: { sampleName1: { sample ID: [ fastq1 file path, fastq2 file path ] }, sampleName2: