def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/phylotree']) import merge_count, make_phylotree return [ { 'name': 'merge counts', 'desc': 'merge mutation loci and allele counts', 'fun': merge_count.merge_count, 'paramL': (baseDir, baseDir, 5, 0.05, 5), # 'paramL': (baseDir, baseDir, 20, 0.2, 5), 'paramH': {}, 'logPostFix': '.merge_count.log', 'logExistsFn': lambda x: 'done' in x[-1], 'outFilePostFix': ['.mutations','.filtered'], 'clean': False, 'rerun': False }, { 'name': 'build tree', 'desc': 'make phylogenetic tree', 'fun': make_phylotree.main, 'paramL': (baseDir, baseDir), 'paramH': {}, 'logPostFix': '.make_phylotree.log', 'logExistsFn': lambda x: 'done' in x[-1], 'outFilePostFix': ['.infile', '.outfile','.tree','.pars_tree.pdf','.outfile_report.txt'], 'clean': False, 'rerun': False }, ]
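## Each step dict returned by these genSpec() functions follows one schema:
## 'fun' is the batch entry point, 'paramL'/'paramH' its positional/keyword
## arguments, and 'logExistsFn' inspects the log lines to decide whether the
## step already finished. The actual mypipe driver that consumes a spec list
## is not shown here; the loop below is only a hedged sketch of the idea
## (run_specL, logDir and sampN are hypothetical names, not the pipeline API).
import os

def run_specL(specL, logDir, sampN):
    for spec in specL:
        logFileN = os.path.join(logDir, sampN + spec['logPostFix'])
        done = False
        if os.path.isfile(logFileN):
            try:
                done = spec['logExistsFn'](open(logFileN).readlines())
            except IndexError:  # log shorter than the lambda expects
                done = False
        if done and not spec['rerun']:
            print('skip: %s' % spec['name'])
            continue
        print('run : %s (%s)' % (spec['name'], spec['desc']))
        spec['fun'](*spec['paramL'], **spec['paramH'])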
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/mutation']) import mutect_batch, somaticindeldetector_batch return [ ## PARAMETERS { 'name': 'Run MuTect', 'desc': '.recal.bam -> .mutect, .mutect_pair.vcf', 'fun': mutect_batch.mutect_pair, 'paramL': (baseDir, baseDir, genome, server, False), 'paramH': {}, 'logPostFix': '.mutect_pair.log', 'logExistsFn': lambda x: 'done' in x[-9], 'outFilePostFix': ['.mutect', '.mutect_pair.vcf'], 'clean': False, 'rerun': False }, { 'name': 'somaticindeldetector', 'desc': '.recal.bam -> indels_pair_filter.vcf', 'fun': somaticindeldetector_batch.paired_mode, 'paramL': (baseDir, baseDir, 'SS', genome, server, False), 'paramH': {}, 'logPostFix': '.somaticindeldetector_pair.log', 'logExistsFn': lambda x: ('chrX' in x[-1] or 'chrX' in x[-2]), 'outFilePostFix': ['indels_pair_filter.vcf', 'indels_pair_filter.out'], 'clean': False, 'rerun': False }, ]
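## mutect_pair/paired_mode run on matched tumor-normal inputs under baseDir.
## Elsewhere in these scripts sample directories look like S012_T_SS (tumor)
## and S012_B_SS (blood/normal); assuming that naming convention, pairs can
## be recovered by prefix as sketched below. pair_by_prefix is illustrative
## only, not mutect_batch's actual sample discovery, and the paths are made up.
import os, re

def pair_by_prefix(dirNL):
    pairH = {}
    for dirN in dirNL:
        rm = re.match('(S[0-9]+)_([TB])_([A-Z]{2})$', os.path.basename(dirN))
        if not rm:
            continue
        sid, tissue = rm.group(1), rm.group(2)
        pairH.setdefault(sid, {})[tissue] = dirN
    return [(h['T'], h['B']) for h in pairH.values() if 'T' in h and 'B' in h]

# pair_by_prefix(['/data/S012_T_SS', '/data/S012_B_SS'])
# -> [('/data/S012_T_SS', '/data/S012_B_SS')]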
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/copynumber']) import cn_corr_batch, corrcgh2seg_batch, drawCNATraj_batch, corrseg2gene_batch return [ ## PARAMETERS { 'name': 'copy number correction', 'desc': 'ngCGH -> corr.ngCGH', 'fun': cn_corr_batch.main, 'paramL': (baseDir, baseDir, False, server), 'paramH': {}, 'logPostFix': '.cn_corr.qlog', 'logExistsFn': lambda x: len(x)==0, 'outFilePostFix': ['corr.ngCGH'], 'clean': False, 'rerun': False }, { 'name': 'Segmentation', 'desc': 'corr.ngCGH -> corr.ngCGH.seg', 'fun': corrcgh2seg_batch.cgh2seg, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.corr.seg.qlog', 'logExistsFn': lambda x: len(x)>0 and 'Centrality parameter' in x[-1], 'outFilePostFix': ['corr.ngCGH.seg'], 'clean': False, 'rerun': False }, { 'name': 'Calculate gene copy number from segments', 'desc': 'corr.seg -> corr.cn_gene.dat', 'fun': corrseg2gene_batch.main, 'paramL': (baseDir, baseDir, mysetting.refFlatH[server][genome],[],False), 'paramH': {}, 'logPostFix': '.corr.cn_gene.log', 'logExistsFn': lambda x: len(x)>0 and 'ZZZ3' in x[-1], 'outFilePostFix': ['corr.cn_gene.dat'], 'clean': False, 'rerun': False }, { 'name': 'Plot corrected segmentation', 'desc': 'Plot segmentations for corrected copy number profile', # 'fun': drawCNATraj.main, # 'paramL': (baseDir, baseDir), 'fun' : drawCNATraj_batch.draw_single, 'paramL': (baseDir, baseDir, genome), 'paramH': {}, 'logPostFix': '', 'logExistsFn': lambda x: True, 'outFilePostFix': [], 'clean': False, 'rerun': False }, ]
def genSpec_single(baseDir, server="smc1", genome="hg19"): mybasic.add_module_path(["NGS/mutation"]) import mutect_batch, somaticindeldetector_batch return [ ## PARAMETERS { "name": "Run MuTect (single)", "desc": ".recal.bam -> .mutect, mutect_single_filter.vcf", "fun": mutect_batch.mutect_PON, "paramL": (baseDir, genome, server, False), "paramH": {}, "logPostFix": ".mutect_single.log", "logExistsFn": lambda x: "done" in x[-9], "outFilePostFix": ["mutect_single_filter.vcf"], "clean": False, "rerun": False, }, { "name": "somaticindeldetector", "desc": ".recal.bam -> indels_single_filter.vcf", "fun": somaticindeldetector_batch.single_mode, "paramL": (baseDir, baseDir, "SS", genome, server, False), "paramH": {}, "logPostFix": ".somaticindeldetector_single.log", "logExistsFn": lambda x: ("chrX" in x[-1] or "chrX" in x[-2]), "outFilePostFix": ["indels_single_filter.vcf", "indels_single_filter.out"], "clean": False, "rerun": False, }, ]
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/copynumber']) import ngCGH_batch, cgh2seg_batch, seg2gene_batch, drawCNATraj_batch return [ ## PARAMETERS { 'name': 'run ngCGH for pairs of bam', 'desc': 'bam -> .ngCGH', 'fun': ngCGH_batch.main, 'paramL': (baseDir, baseDir, 1000, False), 'paramH': {}, 'logPostFix': '.cn_ngCGH.log', 'logExistsFn': lambda x: len(x)>0 and 'finalizers' in x[-1], 'outFilePostFix': ['ngCGH'], 'clean': False, 'rerun': False }, { 'name': 'Segmentation', 'desc': 'ngCGH -> seg', 'fun': cgh2seg_batch.cgh2seg, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.seg.qlog', 'logExistsFn': lambda x: len(x)>0 and 'Centrality parameter' in x[-1], 'outFilePostFix': ['ngCGH.seg'], 'clean': False, 'rerun': False }, { 'name': 'Calculate gene copy number from segments', 'desc': 'seg -> cn_gene.dat', 'fun': seg2gene_batch.main, 'paramL': (baseDir, baseDir, mysetting.refFlatH[server][genome],[],False), 'paramH': {}, 'logPostFix': '.cn_gene.log', 'logExistsFn': lambda x: len(x)>0 and 'ZZZ3' in x[-1], 'outFilePostFix': ['cn_gene.dat'], 'clean': False, 'rerun': False }, { 'name': 'Draw Plot', 'desc': 'seg->plot', # 'fun' : drawCNATraj_batch.batch, 'fun' : drawCNATraj_batch.draw_single, 'paramL': (baseDir, '/EQL1/NSL/WXS/results/CNA',genome), 'paramH': {}, 'logPostFix': '', 'logExistsFn': lambda x: True, 'outFilePostFix': [], 'clean': False, 'rerun': False }, ]
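## seg2gene_batch maps per-segment log2 ratios onto genes using the refFlat
## table passed above; its exact rules (partial overlaps, ties) are not
## visible here. The helper below is only a minimal illustration of the idea:
## give each gene the ratio of the segment that overlaps its span the most.
def seg_to_gene(segL, geneL):
    # segL:  [(chrom, start, end, log2ratio)]
    # geneL: [(gene_sym, chrom, start, end)]
    cnH = {}
    for (gene, chrom, gSta, gEnd) in geneL:
        best = (0, None)
        for (sChrom, sSta, sEnd, ratio) in segL:
            if sChrom != chrom:
                continue
            overlap = min(gEnd, sEnd) - max(gSta, sSta)
            if overlap > best[0]:
                best = (overlap, ratio)
        if best[1] is not None:
            cnH[gene] = best[1]
    return cnH

# seg_to_gene([('chr7', 55000000, 56000000, 2.8)],
#             [('EGFR', 'chr7', 55086714, 55324313)])  # coordinates illustrative
# -> {'EGFR': 2.8}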
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/fastq','NGS/align','NGS/splice_gsnap/ei_junc']) import bam2fastq_batch2, gsnap_splice_batch, ei_junc_batch## MODULES return [ ## PARAMETERS # { # 'name': 'bam to fastq', # 'desc': 'bam -> fastq', # 'fun': bam2fastq_batch2.bam2fastq_batch2, # 'paramL':(baseDir, baseDir, 'UNCID_[0-9]{7}\.(.*)\.sorted_.*'), # 'paramH': {}, # 'logPostFix': 'fastq.log', # 'logExistsFn': lambda x: len(x)>0 and 'Samples' in x[-1], # 'outFilePostFix': ['fastq'], # 'clean': False, # 'rerun': False # }, # # { # 'name': 'Align', # 'desc': 'fastq -> splice.gsnap', # 'fun': gsnap_splice_batch.align, # 'paramL':(baseDir, baseDir, 6, False, False), # 'paramH': {}, # 'logPostFix': 'gsnap.qlog', # 'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1], # 'outFilePostFix': ['splice.gsnap'], # 'clean': False, # 'rerun': False # }, { 'name': 'Filter eiJunc', 'desc': 'splice.gsnap.gz -> ei.dat', 'fun': ei_junc_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.ei.qlog', 'logExistsFn': lambda x: len(x)>0 and 'Finished' in x[-1], 'outFilePostFix': ['ei.dat'], 'clean': False, 'rerun': False }, ]
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/mutation']) import mut_clonality_batch return [ ## PARAMETERS { 'name': 'determine mutation clonality', 'desc': 'mutect -> mutect_cl.dat', 'fun': mut_clonality_batch.main, 'paramL': (baseDir, baseDir, mysetting.cnaBaseDir, False, server), 'paramH': {}, 'logPostFix': '.mutect_cl.log', 'logExistsFn': lambda x: len(x)==0, 'outFilePostFix': ['mutect_cl.dat'], 'clean': False, 'rerun': False }, ]
def main(datFileN, server='smc1', dbN='CancerSCAN'): mybasic.add_module_path(['NGS/mutation', 'Integration']) import vep_batch, makeDB_mutation_rxsq print mysetting.CSmutDir + '/*CS' vep_batch.main(glob(mysetting.CSmutDir + '/*CS'), postfixL=[ '.mutect_filter.vcf', '.mutect_single_filter.vcf', '.indels_filter.vcf', '.indels_single_filter.vcf' ], fork=True) os.system( 'cat %s/*CS/*filter_vep.dat | /usr/bin/python %s/Integration/prepDB_mutation_cancerscan.py > %s' % (mysetting.CSmutDir, mysetting.SRC_HOME, datFileN)) mymysql.reset_table(tableN='mutation_cs', dataFileN=datFileN, user=mysetting.mysqlH[server]['user'], passwd=mysetting.mysqlH[server]['passwd'], db=dbN, host=mysetting.mysqlH[server]['host']) (con, cursor) = mymysql.connectDB(user=mysetting.mysqlH[server]['user'], passwd=mysetting.mysqlH[server]['passwd'], db=dbN, host=mysetting.mysqlH[server]['host']) sampNL = filter(lambda x: os.path.isdir(mysetting.CSmutDir + '/' + x), os.listdir(mysetting.CSmutDir)) for sampN in sampNL: id = '_'.join(sampN.split('_')[:-2]) postfix = sampN.split('_')[-2] if postfix == 'B': continue if postfix != 'T': id = '%s_%s' % (id, postfix) cursor.execute( '''DELETE FROM sample_tag WHERE samp_id="%s" AND tag="XSeq_CS"''' % id) cursor.execute( '''INSERT INTO sample_tag SET samp_id="%s",tag="XSeq_CS"''' % id)
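## The DELETE/INSERT above splice sample ids into the SQL with % formatting.
## Assuming the cursor returned by mymysql.connectDB is a normal DB-API
## (MySQLdb-style) cursor, the same statements can also be written with bound
## parameters, which avoids quoting problems in ids; tag_sample is a
## hypothetical helper, not part of this repository:
def tag_sample(cursor, samp_id, tag='XSeq_CS'):
    cursor.execute('DELETE FROM sample_tag WHERE samp_id=%s AND tag=%s',
                   (samp_id, tag))
    cursor.execute('INSERT INTO sample_tag SET samp_id=%s, tag=%s',
                   (samp_id, tag))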
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/mutation','NGS/loh','NGS/purity']) import mutScan_loh_batch, delta_baf_mutscan_batch, delta_baf_seg_batch, calcCN_LOH_batch, loh2gene_batch, calcNormalF_loh_batch, peakFrac_batch, dbaf_cn_plot_batch ## MODULES return [ ## PARAMETERS { 'name': 'MutScan for the tumor sample', 'desc': 'pileup_proc -> loh.mutscan', 'fun': mutScan_loh_batch.main, 'paramL': (baseDir, baseDir, False, 10, 0, 0), 'paramH': {}, 'logPostFix': '.loh.mutscan.log', 'logExistsFn': lambda x: len(x)>0 and 'Success' in x[-1], 'outFilePostFix': ['loh.mutscan'], 'clean': False, 'rerun': False }, { 'name': 'delta B-allele frequencies calculation', 'desc': 'calculate tumor delta BAF for all positions genotyped as heterozygous in the normal sample', 'fun': delta_baf_mutscan_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.dbaf.log', 'logExistsFn': lambda x: len(x)==0, 'outFilePostFix': ['dbaf.txt'], 'clean': False, 'rerun': False }, { 'name': 'delta BAF segmentation', 'desc': 'segment delta BAF', 'fun': delta_baf_seg_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.dbaf.seg.log', 'logExistsFn': lambda x: len(x)>0 and 'Analyzing' in x[-1], 'outFilePostFix': ['seg'], 'clean': False, 'rerun': False }, { 'name': 'Plotting', 'desc': 'Generate deltaBAF/CN trajectory plot', 'fun': dbaf_cn_plot_batch.main, 'paramL': (baseDir, baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.traj_plot.log', 'logExistsFn': lambda x: len(x)>0 and 'Done' in x[-1], 'outFilePostFix': ['dBAF_CNA_traj.pdf'], 'clean': False, 'rerun': False }, { 'name': 'CNLOH/LOH determination', 'desc': 'calculate average copy number of LOH segments to determine CNLOH/LOH', 'fun': calcCN_LOH_batch.main, 'paramL': (baseDir, baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.loh_cn.log', 'logExistsFn': lambda x: len(x)>0 and 'Setting' in x[-1], 'outFilePostFix': ['loh_cn.txt'], 'clean': False, 'rerun': False }, { 'name': 'gene LOH', 'desc': 'loh_cn.txt -> loh_gene.dat', 'fun': loh2gene_batch.main, 'paramL': (baseDir, baseDir, False, mysetting.refFlatH[server][genome]), 'paramH': {}, 'logPostFix': '.loh_gene.log', 'logExistsFn': lambda x: len(x)==0, 'outFilePostFix': ['loh_gene.dat'], 'clean': False, 'rerun': False }, { 'name': 'Normal contamination calculation', 'desc': 'calculate normal contamination levels at heterozygous germline SNPs in LOH regions', 'fun': calcNormalF_loh_batch.main, 'paramL': (baseDir, baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.nfrac_all.log', 'logExistsFn': lambda x: len(x)==0, 'outFilePostFix': ['nFrac_all.txt'], 'clean': False, 'rerun': False }, { 'name': 'Tumor fraction estimation', 'desc': 'estimate tumor fraction', 'fun': peakFrac_batch.main, 'paramL': (baseDir, baseDir,False), 'paramH': {}, 'logPostFix': '.tfrac.log', 'logExistsFn': lambda x: len(x)>0 and 'Done' in x[-1], 'outFilePostFix': ['tumor_frac.txt'], 'clean': False, 'rerun': False }, ]
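## delta_baf_mutscan_batch's exact formula is not shown in this spec; the
## usual definition, assumed here, is: at sites heterozygous in the normal,
## BAF = alt / (alt + ref) in the tumor and dBAF = |BAF - 0.5|, so balanced
## segments sit near 0 and LOH segments drift toward 0.5 as purity rises.
def delta_baf(ref_count, alt_count):
    baf = float(alt_count) / (ref_count + alt_count)
    return abs(baf - 0.5)

# balanced heterozygous site: delta_baf(50, 50) -> 0.0
# LOH site in a fairly pure tumor: delta_baf(80, 20) -> 0.3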
#!/usr/bin/python ## postprocessing for RNA-Seq pipelines: rsq2skip, rsq2fusion, rsq2eiJunc ## handles all 3 pipelines at the same time: no need to run it 3 times, once after each pipeline from glob import glob import sys, os import mymysql, mypipe, mybasic from mysetting import mysqlH from datetime import datetime from warnings import filterwarnings from warnings import resetwarnings mybasic.add_module_path(['NGS/splice_gsnap/skipping','NGS/splice_gsnap/fusion','NGS/splice_gsnap/ei_junc','Integration']) import makeDB_splice_AF import prepDB_splice_normal, exonSkip_summarize, prepDB_splice_skip import fusion_summarize, prepDB_splice_fusion import ei_junc_filter, prepDB_splice_eiJunc BASE='/EQL1/NSL/RNASeq/results' RSQPattern=('(.*)_RSq','') def post_rsq2skip(dirN, server='smc1', dbN='ihlee_test', sampL=[]): (con, cursor) = mymysql.connectDB(user=mysqlH[server]['user'],passwd=mysqlH[server]['passwd'],db=dbN,host=mysqlH[server]['host']) cursor.execute('ALTER TABLE splice_normal CHANGE COLUMN samp_id samp_id char(63)') cursor.execute('ALTER TABLE splice_normal_loc1 CHANGE COLUMN samp_id samp_id char(63)') cursor.execute('ALTER TABLE splice_normal_loc2 CHANGE COLUMN samp_id samp_id char(63)') cursor.execute('CREATE TEMPORARY TABLE splice_normal_tmp LIKE splice_normal') sampNL = filter(lambda x: os.path.isdir(dirN + '/' + x), os.listdir(dirN)) for sampN in sampNL: baseDir = dirN + '/' + sampN sid = sampN[:-4].replace('.','_').replace('-','_') ## RNASeq sample has '***_RSq'
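## The sid derivation above (sampN[:-4] plus '.'/'-' -> '_') is the id rule
## shared by the per-sample post-processing scripts below. A stand-alone
## restatement of that rule (the sample name is invented):
def rsq_dir_to_sid(sampN):
    return sampN[:-4].replace('.', '_').replace('-', '_')

# rsq_dir_to_sid('IRCR-GBM.14-123_RSq') -> 'IRCR_GBM_14_123'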
#!/usr/bin/python import sys, os, glob, getopt import mybasic, mysetting mybasic.add_module_path(['utils']) import link_fqgz_hj # linking link_fqgz_hj.link('/EQL1/NSL/WXS/fastq','/EQL1/NSL/WXS/exome_20130529', '.*([0-9]{3})[ITN].*') # listing directories dir_list = glob.glob('/EQL1/NSL/WXS/exome_20130529/*') #for dir_name in dir_list def main(pbs=False): print dir_list, len(dir_list) projectName = 'heejin_20' os.system('mkdir /var/www/html/pipeline_logs/%s' % projectName) for single_dir in dir_list: sampN = single_dir.split('/')[-1] # if sampN not in ['S012_T_SS']: # continue
annotH = {} for line in inFile: colL = line.rstrip().split('\t') rm = re.match('(chr[^:]*):([0-9]*)~([0-9]*)', colL[idxH['locus']]) (chr,chrSta,chrEnd) = rm.groups() ref = colL[idxH['ref']] alt = colL[idxH['alt']] if (chr,chrSta,chrEnd,ref,alt) not in annotH: annotH[(chr,chrSta,chrEnd,ref,alt)] = {} for col in ['gene_symL','ch_dna','ch_aa','ch_type','cosmic','mutsig']: annotH[(chr,chrSta,chrEnd,ref,alt)][col] = colL[idxH[col]] return annotH ### until it is merged into pipeline import mybasic mybasic.add_module_path(['NGS/pipeline']) import mypipe #bamDirL = mysetting.wxsBamDirL #trioH = mypipe.read_trio(bamDirL=bamDirL) #pairH = {} #for tid in trioH: # if tid not in ['37']: # continue # if trioH[tid]['recur_id'] != []: # print tid, trioH[tid]['prim_id'] # print tid, trioH[tid]['recur_id'] # pid = re.match('(.*)_T.{,2}_[TS]{2}', trioH[tid]['prim_id'][0]).group(1) # pairH[pid] = map(lambda x: re.match('(.*)_T.{,2}_[TS]{2}',x).group(1), trioH[tid]['recur_id']) inDir = '/EQL3/pipeline/somatic_mutect/' outDir = '/EQL1/PrimRecur/phylogeny'
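## The locus column is parsed with the regex above ('chrN:start~end'). A
## stand-alone check of that pattern (the coordinates are arbitrary):
import re

def parse_locus(locus):
    rm = re.match('(chr[^:]*):([0-9]*)~([0-9]*)', locus)
    return rm.groups() if rm else None

# parse_locus('chr9:21971111~21971120') -> ('chr9', '21971111', '21971120')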
#!/usr/bin/python import sys, os, re, getopt, glob import mybasic mybasic.add_module_path(["NGS/align", "NGS/mutation"]) import bwa_batch, markDuplicates_batch, realign_batch, pileup_batch def wxs_seq(baseDir, projectName): current_files_list = [] compared_files_list = [] current_files_list = glob.glob(baseDir + "/*") # compose log string html_head_string = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><head></head><body>' # prep html file html_path = "/var/www/html/pipeline_logs/" + projectName + "/" file_name_split = baseDir.split("/S") sample_name = "S" + file_name_split[1] file_name = "pipeline1_log_" + sample_name + ".html" # create .html file with open(os.path.join(html_path, file_name), "wb") as log_file: log_file.write(html_head_string) log_file.close() # change mode and open log_file again os.system("chmod 755 %s%s" % (html_path, file_name))
def genSpec_CS(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/coverage','NGS/expression','NGS/copynumber']) import bam2sortedBed_batch, degSeq_batch, rpkm2cn_batch, exon2gene_batch, drawCNATraj_batch return [ ## PARAMETERS { 'name': 'Format Conversion and sorting', 'desc': 'bam -> sort -> sorted.bed', 'fun': bam2sortedBed_batch.sam2bed_batch, 'paramL': (baseDir, baseDir, 'recal', False), 'paramH': {}, 'logPostFix': '.sorted.bed.qlog', 'logExistsFn': lambda x: len(x)==0, 'outFilePostFix': ['sorted.bed'], 'clean': False, 'rerun': False }, { 'name': 'RPKMgen', 'desc': 'sorted.bed -> rpkm', 'fun': degSeq_batch.main, 'paramL': (baseDir, baseDir, '/data1/Sequence/ucsc_hg19/annot/refFlat_exon.txt', False), 'paramH': {}, 'logPostFix': '.degSeq.qlog', 'logExistsFn': lambda x: len(x)>0 and 'omitted' in x[-1], 'outFilePostFix': ['rpkm'], 'clean': False, 'rerun': False }, { 'name': 'Calculate a log2 rpkm ratio for all exons', 'desc': 'log2(tumor rpkm / normal rpkm)', 'fun': rpkm2cn_batch.main_pool, 'paramL': (baseDir, baseDir, 10, mysetting.poolB_CS_rpkm, False), 'paramH': {}, 'logPostFix': '.cn.log', 'logExistsFn': lambda x: len(x)==0, 'outFilePostFix': ['copynumber'], 'clean': False, 'rerun': False }, { 'name': 'Calculate gene copy number from log2 rpkm ratios', 'desc': 'copynumber -> cn_gene.dat', 'fun': exon2gene_batch.main, 'paramL': (baseDir, baseDir, mysetting.refFlatH[server][genome],mysetting.cs_gene,False), 'paramH': {}, 'logPostFix': '.cn_gene.log', 'logExistsFn': lambda x: len(x)>0 and 'VHL' in x[-1], 'outFilePostFix': ['cn_gene.dat'], 'clean': False, 'rerun': False }, { 'name': 'Draw Plot', 'desc': 'seg->plot', # 'fun' : drawCNATraj_batch.batch, 'fun' : drawCNATraj_batch.draw_single, 'paramL': (baseDir, '/EQL1/NSL/WXS/results/CNA',genome), 'paramH': {}, 'logPostFix': '', 'logExistsFn': lambda x: True, 'outFilePostFix': [], 'clean': False, 'rerun': False }, ]
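## rpkm2cn_batch reports a per-exon log2(tumor rpkm / normal rpkm) ratio, with
## the pooled normals (mysetting.poolB_CS_rpkm) standing in for a matched
## normal. Its pseudocount handling is not visible here, so the helper below
## simply assumes a small epsilon to keep zero-coverage exons finite; it is an
## illustration of the quantity, not the batch code:
import math

def log2_rpkm_ratio(tumor_rpkm, normal_rpkm, eps=0.1):
    return math.log((tumor_rpkm + eps) / (normal_rpkm + eps), 2)

# roughly balanced exon: log2_rpkm_ratio(12.0, 11.8) -> ~0.02
# amplified exon:        log2_rpkm_ratio(48.0, 12.0) -> ~2.0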
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/fastq', 'NGS/align', 'NGS/mutation']) import bwa_batch, markDuplicates_batch, realign_batch, pileup_batch, procPileup_split_batch, mutScan_batch, mutscan_snp_cosmic_batch ## MODULES import fastqc_batch, annotate_mutscan_batch, annotate_join_cosmic_batch, vep_mutscan_batch, mutect_batch, somaticindeldetector_batch return [ ## PARAMETERS { 'name': 'FastQC', 'desc': 'QC for fastq', 'fun': fastqc_batch.fastqc_batch, 'paramL': (baseDir, '(.*)\.[12]\.fq\.gz', baseDir, baseDir), 'paramH': {}, 'logPostFix': '.fastqc.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'Analysis complete' in x[-1], 'outFilePostFix': ['_fastqc.zip'], 'outLinkPostFix': ['_fastqc/fastqc_report.html'], 'clean': False, 'rerun': False }, { 'name': 'BWA', 'desc': 'fq -> sam -> bam -> sorted.bam', 'fun': bwa_batch.align, 'paramL': (baseDir, baseDir, '(.*)\.[12]\.fq.gz', 10, 5000000000, False, mysetting.bwaIndexH[server][genome], True), 'paramH': {}, 'logPostFix': '.bwa.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'bam_sort_core' in x[-1], 'outFilePostFix': ['sorted.bam'], 'clean': False, 'rerun': False }, { 'name': 'MarkDuplicate/ReadGroup', 'desc': 'sorted.bam -> dedup.bam -> RG.bam', 'fun': markDuplicates_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.dedup.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'totalMemory()' in x[-1], 'outFilePostFix': ['RG.bam'], 'clean': False, 'rerun': False }, { 'name': 'Realign', 'desc': 'RG.bam -> realign.bam -> recal.bam', 'fun': realign_batch.main, 'paramL': (baseDir, baseDir, False, mysetting.ucscRefH[server][genome], mysetting.dbsnpH[server][genome]), 'paramH': {}, 'logPostFix': '.realign.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'Uploaded run' in x[-1], 'outFilePostFix': ['recal.bam'], 'clean': False, 'rerun': False }, # { # 'name': 'Pileup', # 'desc': 'recal.bam -> pileup', # 'fun': pileup_batch.main, # 'paramL': (baseDir, baseDir, False, mysetting.ucscRefH[server][genome]), # 'paramH': {}, # 'logPostFix': '.pileup.log', # 'logExistsFn': lambda x: len(x)>0 and 'Set max' in x[-1], # 'outFilePostFix': ['pileup'], # 'clean': False, # 'rerun': False # }, { 'name': 'Pileup_proc', 'desc': 'recal.bam -> pileup -> pileup_proc', 'fun': procPileup_split_batch.main, 'paramL': (baseDir, baseDir, mysetting.ucscRefH[server][genome], False), 'paramH': {}, 'logPostFix': '.pileup_proc.log', 'logExistsFn': lambda x: len(x) > 0 and 'Success' in x[-1], 'outFilePostFix': ['pileup_proc', 'pileup.gz'], 'clean': False, 'rerun': False }, { 'name': 'MutScan', 'desc': 'pileup_proc -> mutscan', 'fun': mutScan_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.mutscan.log', 'logExistsFn': lambda x: len(x) > 0 and 'Success' in x[-1], 'outFilePostFix': ['mutscan'], 'clean': False, 'rerun': False }, # temporarily off # { # 'name': 'MuTect', # 'desc': 'recal.bam -> .vcf', # 'fun': mutect_batch.mutect_PON, # 'paramL': (baseDir, genome, server, False), # 'paramH': {}, # 'logPostFix': '.mutect_single.log', # 'logExistsFn': lambda x: 'done' in x[-9], # 'outFilePostFix': ['.mutect.vcf','.mutect'], # 'clean': False, # 'rerun': False # }, # # { # 'name': 'SomaticIndelDetector', # 'desc': 'recal.bam -> indels.vcf -> indels_filter.vcf', # 'fun': somaticindeldetector_batch.single_mode, # 'paramL': (baseDir, baseDir, 'SS', genome, server, False), # 'paramH': {}, # 'logPostFix': '.somaticindeldetector.log', # 'logExistsFn': lambda x: ('chrX' in x[-1] or 'chrX' in x[-2]), # 
'outFilePostFix': ['indels_filter.vcf','indels_filter.out'], # 'clean': False, # 'rerun': False # }, # {## old cosmic join # 'name': 'mutscan_snp_cosmic', # 'desc': 'mutscan -> cosmic.dat', # 'fun': mutscan_snp_cosmic_batch.main, # 'paramL': (baseDir, server), # 'paramH': {}, # 'logPostFix': '.cosmic.log', # 'logExistsFn': lambda x: len(x) == 0, # 'outFilePostFix': ['cosmic.dat'], # 'clean': False, # 'rerun': False # }, # # { # 'name': 'VEP annotation', # 'desc': 'Annotate mutscan output', # 'fun': vep_mutscan_batch.main, # 'paramL': ([baseDir]), # 'paramH': {}, # 'logPostFix': '.mutscan_vep.log', # 'logExistsFn': lambda x: len(x)>0 and 'Finished!' in x[-1], # 'outFilePostFix': ['mutscan_vep_out.vcf'], # 'clean': False, # 'rerun': False # }, ## join cosmic # { # 'name': 'Join Cosmic', # 'desc': 'Join annotated mutscan output with COSMIC', # 'fun': annotate_join_cosmic_batch.main, # 'paramL': (baseDir, '(.*)\.vep$', baseDir), # 'paramH': {}, # 'logPostFix': '_splice.mutscan.cosmic.log', # 'logExistsFn': lambda x: len(x)==0, # 'outFilePostFix': ['_cosmic.dat'], # 'clean': False, # 'rerun': False # }, # { # 'name': 'Cleanup', # 'desc': 'remove all, but logs and designated result file', # 'fun': cleanup.main, # 'paramL': (baseDir,), # 'paramH': {}, # 'logPostFix': 'cleanup.qlog', # 'logExistsFn': lambda x: False, # 'outFilePostFix': ['pileup'] # }, ]
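## Several steps above key off the fastq naming regex '(.*)\.[12]\.fq\.gz'.
## A stand-alone illustration of how that pattern groups a sample's two mate
## files (the file names are invented):
import re

def group_fastq(fileNL):
    pairH = {}
    for fileN in fileNL:
        rm = re.match('(.*)\.[12]\.fq\.gz', fileN)
        if rm:
            pairH.setdefault(rm.group(1), []).append(fileN)
    return pairH

# group_fastq(['S001_T_SS.1.fq.gz', 'S001_T_SS.2.fq.gz'])
# -> {'S001_T_SS': ['S001_T_SS.1.fq.gz', 'S001_T_SS.2.fq.gz']}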
#!/usr/bin/python ## integration into DB (per sample) import sys, os import mymysql, mypipe, mybasic from mysetting import mysqlH mybasic.add_module_path(['NGS/expression','Integration']) import rpkm_process, prepDB_rpkm_gene_expr, boxplot_expr_cs_gene def post_s_rsq2expr(baseDir, server='smc1', dbN='ihlee_test'): sampN = baseDir.split('/')[-1] sid = sampN[:-4].replace('-','_').replace('.','_') ##drop '_RSq' if dbN in ['ihlee_test','ircr1']: gctFileN = '/EQL1/NSL/RNASeq/results/expression/%s.gct' % sampN datFileN = '/EQL1/NSL/RNASeq/results/expression/%s.dat' % sampN else: gctFileN = '%s/%s.gct' % (baseDir, sampN) datFileN = '%s/%s.dat' % (baseDir, sampN) print sampN, gctFileN rpkm_process.rpkm_process(inputDirN=baseDir, filePattern='*.rpkm', sampRegex='(.*)_RSq\.rpkm', outputFileN=gctFileN) ## prep prepDB_rpkm_gene_expr.main(inGctFileName=gctFileN, geneList=[], samplePrefix='', outDatFileName=datFileN) ## import (con, cursor) = mymysql.connectDB(user=mysqlH[server]['user'],passwd=mysqlH[server]['passwd'],db=dbN,host=mysqlH[server]['host']) cursor.execute('DELETE FROM rpkm_gene_expr WHERE samp_id="%s"' % sid) cursor.execute('LOAD DATA LOCAL INFILE "%s" INTO TABLE rpkm_gene_expr' % datFileN) cursor.execute('DROP VIEW IF EXISTS rpkm_gene_expr_lg2') cursor.execute('CREATE VIEW rpkm_gene_expr_lg2 AS SELECT samp_id,gene_sym,log2(rpkm+1) AS lg2_rpkm FROM rpkm_gene_expr') ## make sure to update sample_tag that this sample has RNA-Seq cursor.execute('SELECT * FROM sample_tag WHERE samp_id="%s" AND tag="RNA-Seq"' % sid)
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/align','NGS/splice_gsnap/fusion']) import gsnap_splice_batch, fusion_filter_transloc_batch, fusion_filter_annot1_batch, fusion_proc_sort_batch, fusion_proc_annot_batch ## MODULES return [ ## PARAMETERS # { # 'name': 'Align', # 'desc': 'fastq -> splice.gsnap', # 'fun': gsnap_splice_batch.align, # 'paramL':(baseDir, baseDir, 6, False), # 'paramH': {}, # 'logPostFix': '.gsnap.qlog', # 'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1], # 'outFilePostFix': ['splice.gsnap'], # 'clean': False, # 'rerun': False # }, # { 'name': 'Filter transloc', 'desc': 'splice.gsnap.gz -> splice_transloc.gsnap', 'fun': fusion_filter_transloc_batch.fusion_filter_batch, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.ft_tloc.qlog', 'logExistsFn': lambda x: len(x)>0 and 'Results' in x[-1], 'outFilePostFix': ['splice_transloc.gsnap'], 'clean': False, 'rerun': False }, { 'name': 'annotate', 'desc': 'splice_transloc.gsnap -> splice_transloc_annot1.gsnap', 'fun': fusion_filter_annot1_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.annot.qlog', 'logExistsFn': lambda x: len(x)>1 and 'Results' in x[-1], 'outFilePostFix': ['splice_transloc_annot1.gsnap'], 'clean': False, 'rerun': False }, { 'name': 'sort', 'desc': 'splice_transloc_annot1.gsnap -> splice_transloc_annot1.sorted.gsnap and generate report.txt', 'fun': fusion_proc_sort_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.sort.qlog', 'logExistsFn': lambda x: len(x)==0, 'outFilePostFix': ['splice_transloc_annot1.sorted.gsnap','splice_transloc_annot1.report.txt'], 'clean': False, 'rerun': False }, { 'name': 'annotate report', 'desc': 'report.txt -> report_annot.txt', 'fun': fusion_proc_annot_batch.fusion_proc_annot_batch, 'paramL': (baseDir, baseDir, None, False), 'paramH': {}, 'logPostFix': '.report_annot.qlog', 'logExistsFn': lambda x: len(x)==0, 'outFilePostFix': ['splice_transloc_annot1.report_annot.txt'], 'clean': False, 'rerun': False }, # { # 'name': 'Summarize', # 'desc': '', # 'fun': , # 'paramL': (baseDir, baseDir, False), # 'paramH': {}, # 'logPostFix': 'realign.qlog', # 'logExistsFn': lambda x: len(x)>0 and 'Uploaded run' in x[-1], # 'outFilePostFix': ['realign.bam', 'recal.bam'], # 'clean': False, # 'rerun': False # }, ]
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/fastq', 'NGS/align', 'NGS/mutation']) import bwa_batch, markDuplicates_batch, realign_batch, procPileup_split_batch, mutScan_batch ## MODULES import fastqc_batch, vep_mutect_batch, mutect_batch, somaticindeldetector_batch return [ ## PARAMETERS { 'name': 'FastQC', 'desc': 'QC for fastq', 'fun': fastqc_batch.fastqc_batch, 'paramL': (baseDir, '(.*)\.[12]\.fq\.gz', baseDir, baseDir), 'paramH': {}, 'logPostFix': '.fastqc.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'Analysis complete' in x[-1], 'outFilePostFix': ['_fastqc.zip'], 'outLinkPostFix': ['_fastqc/fastqc_report.html'], 'clean': False, 'rerun': False }, { 'name': 'BWA', 'desc': 'fq -> sam -> bam -> sorted.bam', 'fun': bwa_batch.align, 'paramL': (baseDir, baseDir, '(.*)\.[12]\.fq.gz', 10, 5000000000, False, mysetting.bwaIndexH[server][genome], True), 'paramH': {}, 'logPostFix': '.bwa.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'bam_sort_core' in x[-1], 'outFilePostFix': ['sorted.bam'], 'clean': True, 'rerun': False }, { 'name': 'MarkDuplicate/ReadGroup', 'desc': 'sorted.bam -> dedup.bam', 'fun': markDuplicates_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.dedup.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'totalMemory()' in x[-1], 'outFilePostFix': ['dedup.bam'], 'clean': False, 'rerun': False }, { 'name': 'Realign', 'desc': 'dedup.bam -> realign.bam -> recal.bam', 'fun': realign_batch.main, 'paramL': (baseDir, baseDir, False, mysetting.ucscRefH[server][genome], mysetting.dbsnpH[server][genome]), 'paramH': {}, 'logPostFix': '.realign.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'Uploaded run' in x[-1], 'outFilePostFix': ['recal.bam'], 'clean': False, 'rerun': False }, { 'name': 'MuTect', 'desc': 'recal.bam -> .vcf', 'fun': mutect_batch.mutect_PON, 'paramL': (baseDir, genome, server, False), 'paramH': {}, 'logPostFix': '.mutect_single.log', 'logExistsFn': lambda x: 'done' in x[-9], 'outFilePostFix': ['.mutect.vcf', '.mutect_filter.vcf', '.mutect'], 'clean': False, 'rerun': False }, { 'name': 'SomaticIndelDetector', 'desc': 'recal.bam -> indels.vcf -> indels_filter.vcf', 'fun': somaticindeldetector_batch.single_mode, 'paramL': (baseDir, baseDir, 'CS', genome, server, False), 'paramH': {}, 'logPostFix': '.somaticindeldetector.log', 'logExistsFn': lambda x: ('chrX' in x[-1] or 'chrX' in x[-2]), 'outFilePostFix': ['indels_filter.vcf', 'indels_filter.out'], 'clean': False, 'rerun': False }, # { ## keep dying while trying to fork (when using PBS, even with --fork 2). It's better to annotate in a single batch (take it to post- pipeline?) # 'name': 'VEP', # 'desc': '.vcf -> .dat', # 'fun': vep_mutect_batch.main, # 'paramL': ([baseDir], False), # 'paramH': {}, # 'logPostFix': 'mutect_vep.log', # 'logExistsFn': lambda x: len(x) > 0 and 'Finished!' in x[-1], # 'outFilePostFix': ['_vep.dat'], # 'clean': False, # 'rerun': False # } # { # 'name': 'Cleanup', # 'desc': 'remove all, but logs and designated result file', # 'fun': cleanup.main, # 'paramL': (baseDir,), # 'paramH': {}, # 'logPostFix': 'cleanup.qlog', # 'logExistsFn': lambda x: False, # 'outFilePostFix': ['pileup'] # }, ]
#!/usr/bin/python from glob import glob import sys, os, re import mysetting, mymysql, mypipe, mybasic mybasic.add_module_path(['Integration', 'NGS/mutation']) import prepDB_mutation_normal, makeDB_mutation_rxsq, vep_batch def prep_single(outFileN, server='smc1', dbN='ircr1'): (con, cursor) = mymysql.connectDB(user=mysetting.mysqlH[server]['user'], passwd=mysetting.mysqlH[server]['passwd'], db=dbN, host=mysetting.mysqlH[server]['host']) cosmicL = [] for dir in mysetting.wxsMutscanDirL: cosmicL += filter( lambda x: '_B_' not in x, glob('%s/*/*cosmic.dat' % dir) + glob('%s/*cosmic.dat' % dir)) cursor.execute( 'SELECT DISTINCT samp_id FROM sample_tag WHERE tag LIKE "XSeq_%%"') results = cursor.fetchall() sidL = [] for res in results: sidL.append(res[0]) for cosmic in cosmicL: (sid, postfix, platform) = re.match('(.*)_([XT].{,2})_([STKN]{2})_cosmic.dat', os.path.basename(cosmic)).groups()
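## prep_single recovers the sample id, tissue postfix and platform from each
## *_cosmic.dat file name with the regex above. A stand-alone check (the file
## name is invented):
import re

rm = re.match('(.*)_([XT].{,2})_([STKN]{2})_cosmic.dat', 'S034_T2_SS_cosmic.dat')
print(rm.groups())  # ('S034', 'T2', 'SS')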
def genSpec(baseDir, server="smc1", genome="hg19"): mybasic.add_module_path(["NGS/align", "NGS/splice_gsnap/skipping"]) import gsnap_splice_batch, exonSkip_filter_batch, exonSkip_filter_normal_batch, exonSkip_sort_batch, exonSkip_normal_sort_batch, exonSkip_proc_annot_batch ## MODULES return [ ## PARAMETERS # { # 'name': 'Align', # 'desc': 'fastq -> splice.gsnap', # 'fun': gsnap_splice_batch.align, # 'paramL':(baseDir, baseDir, 6, False), # 'paramH': {}, # 'logPostFix': 'gsnap.qlog', # 'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1], # 'outFilePostFix': ['splice.gsnap'], # 'clean': False, # 'rerun': False # }, { "name": "Filter exonskip", "desc": "splice.gsnap.gz -> splice_exonSkip.gsnap", "fun": exonSkip_filter_batch.exonSkip_filter_batch, "paramL": (baseDir, baseDir, False), "paramH": {}, "logPostFix": ".exonSkip.qlog", "logExistsFn": lambda x: len(x) > 0 and "Results" in x[-1], "outFilePostFix": ["splice_exonSkip.gsnap"], "clean": False, "rerun": False, }, { "name": "Filter normal exonskip", "desc": "splice.gsnap -> splice_exonSkip_normal.gsnap.gz", "fun": exonSkip_filter_normal_batch.exonSkip_filter_batch, "paramL": (baseDir, baseDir, False), "paramH": {}, "logPostFix": ".exonSkip_normal.qlog", "logExistsFn": lambda x: len(x) > 0 and "Results" in x[-1], "outFilePostFix": ["splice_exonSkip_normal.gsnap.gz"], "clean": False, "rerun": False, }, { "name": "sort", "desc": "splice_exonSkip.gsnap -> splice_exonSkip_report.txt", "fun": exonSkip_sort_batch.main, "paramL": (baseDir, baseDir, False), "paramH": {}, "logPostFix": ".sort.qlog", "logExistsFn": lambda x: len(x) == 0, "outFilePostFix": ["splice_exonSkip_report.txt"], "clean": False, "rerun": False, }, { "name": "sort-normal", "desc": "splice_exonSkip_normal.gsnap.gz -> splice_exonSkip_normal_report.txt", "fun": exonSkip_normal_sort_batch.main, "paramL": (baseDir, baseDir, False), "paramH": {}, "logPostFix": ".sort_normal.qlog", "logExistsFn": lambda x: len(x) == 0, "outFilePostFix": ["splice_exonSkip_normal_report.txt"], "clean": False, "rerun": False, }, { "name": "annotate report", "desc": "report.txt -> report_annot.txt", "fun": exonSkip_proc_annot_batch.exonSkip_proc_annot_batch, "paramL": (baseDir, baseDir, None, False), "paramH": {}, "logPostFix": ".skip_annot.qlog", "logExistsFn": lambda x: len(x) == 0, "outFilePostFix": ["splice_exonSkip_report_annot.txt"], "clean": False, "rerun": False, }, # { # 'name': 'link', # 'desc': 'put all report_annot.txt files in a directory', # 'fun': exonSkip_link.link, # 'paramL': (baseDir, '/EQL1/NSL/RNASeq/results/exonSkip'), # 'paramH': {}, # 'logPostFix': 'link.qlog', # 'logExistsFn': lambda x: len(x)==0, # 'outFilePostFix': ['splice_exonSkip_report_annot.txt'], # 'clean': False, # 'rerun': False # }, # # { # 'name': 'link-normal', # 'desc': 'put all report_normal.txt files in a directory', # 'fun': exonSkip_link_normal.link, # 'paramL': (baseDir, '/EQL1/NSL/RNASeq/results/exonSkip_normal'), # 'paramH': {}, # 'logPostFix': 'link_normal.qlog', # 'logExistsFn': lambda x: len(x)==0, # 'outFilePostFix': ['splice_exonSkip_normal_report.txt'], # 'clean': False, # 'rerun': False # }, ]
#!/usr/bin/python import sys, os, re import mysetting, mybasic mybasic.add_module_path(['NGS/pipeline','NGS/mutation']) import mutect_batch, somaticindeldetector_batch import mypipe bamDirL = mysetting.wxsBamDirL trioH = mypipe.read_trio('/EQL1/NSL/clinical/trio_info.txt', bamDirL) #for tid in sorted(trioH.keys()): # if tid not in ['59','60','61']: # continue # print tid, trioH[tid]['prim_id'], trioH[tid]['recur_id'] # for role in ['Normal','Primary','Recurrent']: # print role,trioH[tid][role] #sys.exit(1) outDir='/EQL3/pipeline/somatic_mutect' ## assume 1 primary & normal per trio for tid in trioH: if trioH[tid]['norm_id'] == []: continue if tid not in ['63']: continue norm = trioH[tid]['norm_id'][0]
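## read_trio's return value is only partially visible here (per-trio lists
## under 'norm_id', 'prim_id' and 'recur_id'). Assuming that shape, the loop
## below sketches how tumor-normal pairs for MuTect could be enumerated per
## trio; it is an illustration, not the code that actually follows:
def tumor_normal_pairs(trioH):
    pairL = []
    for tid in sorted(trioH):
        if not trioH[tid]['norm_id']:
            continue
        norm = trioH[tid]['norm_id'][0]
        for tumor in trioH[tid]['prim_id'] + trioH[tid]['recur_id']:
            pairL.append((tid, tumor, norm))
    return pairL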
#!/usr/bin/python import sys, os import mymysql, mypipe, mybasic from mysetting import mysqlH mybasic.add_module_path(["Integration"]) import prepDB_mutscan, makeDB_mutation_rxsq def post_s_rsq2mut(baseDir, server="smc1", dbN="ihlee_test"): sampN = baseDir.split("/")[-1] sid = sampN[:-4].replace(".", "_").replace("-", "_") print sampN, sid cosmicDatFileN = "%s/%s_splice_cosmic.dat" % (baseDir, sampN) if dbN in ["ihlee_test", "ircr1"]: datFileN = "/EQL1/NSL/RNASeq/results/mutation/%s.dat" % sampN else: datFileN = "%s/%s.dat" % (baseDir, sampN) if os.path.isfile(cosmicDatFileN): prepDB_mutscan.main(sampNamePat=("(.*)_(RSq)", ""), geneList=[], inFileN=cosmicDatFileN, outFileN=datFileN) ## import (con, cursor) = mymysql.connectDB( user=mysqlH[server]["user"], passwd=mysqlH[server]["passwd"], db=dbN, host=mysqlH[server]["host"] ) cursor.execute('DELETE FROM mutation_rsq WHERE samp_id="%s"' % sid) cursor.execute('LOAD DATA LOCAL INFILE "%s" INTO TABLE mutation_rsq' % datFileN) ## make sure to update sample_tag that this sample has RNA-Seq cursor.execute('SELECT * FROM sample_tag WHERE samp_id="%s" AND tag="RNA-Seq"' % sid)
annotH = {} for line in inFile: colL = line.rstrip().split('\t') rm = re.match('(chr[^:]*):([0-9]*)~([0-9]*)', colL[idxH['locus']]) (chr,chrSta,chrEnd) = rm.groups() ref = colL[idxH['ref']] alt = colL[idxH['alt']] if (chr,chrSta,chrEnd,ref,alt) not in annotH: annotH[(chr,chrSta,chrEnd,ref,alt)] = {} for col in ['gene_symL','ch_dna','ch_aa','ch_type','cosmic','mutsig']: annotH[(chr,chrSta,chrEnd,ref,alt)][col] = colL[idxH[col]] return annotH ### until it is merged into pipeline import mybasic mybasic.add_module_path(['NGS/pipeline']) import mypipe trioH = mypipe.read_trio(bamDirL=mysetting.wxsBamDirL) pairH = {} for tid in trioH: if trioH[tid]['recur_id'] != []: pid = trioH[tid]['prim_id'][0][:-5] pairH[pid] = map(lambda x: x[:-5], trioH[tid]['recur_id']) #### #(con,cursor) = mymysql.connectDB(db='ircr1') #tag = 'pair_R:%' #cursor.execute('select distinct samp_id from sample_tag where tag like "%s"' % tag) #sIdL_p = [x for (x,) in cursor.fetchall()] # #tag = 'XSeq%%,N' #cursor.execute('select distinct samp_id from sample_tag where tag like "%s"' % tag)
#!/usr/bin/python import sys, os, re, getopt, glob import mybasic mybasic.add_module_path(['NGS/align','NGS/mutation']) import mybasic, procPileup_split_batch, mutScan_batch, mutscan_snp_cosmic_batch def main(baseDir, projectName): current_files_list = [] compared_files_list = [] current_files_list = glob.glob(baseDir+'/*') outDir = baseDir + '/pileup_proc' # compose log string html_head_string = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><head></head><body>' # prep html file html_path = '/var/www/html/pipeline_logs/' + projectName + '/' file_name_split = baseDir.split('/S') sample_name = 'S' + file_name_split[1] file_name = 'pipeline2_log_' + sample_name + '.html' # create .html file with open(os.path.join(html_path, file_name), 'wb') as log_file: log_file.write(html_head_string) log_file.close() # change mode and open log_file again
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/align','NGS/fastq','NGS/coverage','NGS/expression']) import trim_batch, gsnap_sam_batch, bam2sortedBed_batch, sortedBed2tdf_batch, degSeq_batch ## MODULES import fastqc_batch return [ ## PARAMETERS { 'name': 'FastQC', 'desc': 'QC for fastq', 'fun': fastqc_batch.fastqc_batch, 'paramL': (baseDir, '(.*)\.[12]\.fq\.gz', baseDir, baseDir), 'paramH': {}, 'logPostFix': '.fastqc.qlog', 'logExistsFn': lambda x: len(x)>0 and 'Analysis complete' in x[-1], 'outFilePostFix': ['_fastqc.zip'], 'outLinkPostFix': ['_fastqc/fastqc_report.html'], 'clean': False, 'rerun': False }, { 'name': 'Trim', 'desc': 'fq.gz -> trim -> fq', 'fun': trim_batch.trim_batch, 'paramL': (baseDir, '(.*)\.[12]\.fq\.gz', baseDir, 30), 'paramH': {}, 'logPostFix': '.trim.log', 'logExistsFn': lambda x: len(x)==0, 'outFilePostFix': ['t1.fq.gz', 't2.fq.gz'], 'clean': False, 'rerun': False }, { 'name': 'Mapping', 'desc': 'fq -> bam', 'fun': gsnap_sam_batch.align, 'paramL': (baseDir, baseDir, False, 'sanger', '%s_nh' % (genome)), 'paramH': {}, 'logPostFix': '.gsnap.qlog', 'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1], 'outFilePostFix': ['bam'], 'clean': False, 'rerun': False }, { 'name': 'Format Conversion and sorting', 'desc': 'bam -> sort -> sorted.bed', 'fun': bam2sortedBed_batch.sam2bed_batch, 'paramL': (baseDir, baseDir, '', False), 'paramH': {}, 'logPostFix': '.sorted.bed.qlog', 'logExistsFn': lambda x: len(x)==0, 'outFilePostFix': ['sorted.bed'], 'clean': False, 'rerun': False }, { 'name': 'TDFgen', 'desc': 'sorted.bed -> bedgraph -> tdf', 'fun': sortedBed2tdf_batch.main, 'paramL': (baseDir, baseDir, False, '%s/chromsizes_%s.txt' % (mysetting.ucscSeqDir[server][genome], genome), genome), 'paramH': {}, 'logPostFix': '.tdf.qlog', 'logExistsFn': lambda x: len(x)>0 and 'Done' in x[-9], 'outFilePostFix': ['bedgraph','tdf'], 'clean': False, 'rerun': False }, { 'name': 'RPKMgen', 'desc': 'sorted.bed -> rpkm', 'fun': degSeq_batch.main, 'paramL': (baseDir, baseDir, mysetting.refFlatH[server][genome], False), 'paramH': {}, 'logPostFix': '.degSeq.qlog', 'logExistsFn': lambda x: len(x)>0 and 'ZZZ3' in x[-1], 'outFilePostFix': ['rpkm'], 'clean': False, 'rerun': False }, # { # 'name': 'Cleanup', # 'desc': 'remove all, but logs and designated result file', # 'fun': cleanup.main, # 'paramL': (baseDir,), # 'paramH': {}, # 'logPostFix': 'cleanup.qlog', # 'logExistsFn': lambda x: False, # 'outFilePostFix': ['pileup'] # }, ]
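## degSeq_batch reports per-gene RPKM from the sorted.bed coverage. This
## pipeline is assumed to follow the standard definition,
## RPKM = reads_on_gene * 1e9 / (total_mapped_reads * exonic_length_bp):
def rpkm(reads_on_gene, total_mapped_reads, exon_length_bp):
    return reads_on_gene * 1.0e9 / (total_mapped_reads * exon_length_bp)

# 500 reads on a 2 kb transcript in a 50M-read library:
# rpkm(500, 50e6, 2000) -> 5.0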
#!/usr/bin/python import sys, os import mymysql, mypipe, mybasic from mysetting import mysqlH mybasic.add_module_path(['Integration']) import prepDB_mutscan, makeDB_mutation_rxsq def post_s_rsq2mut(baseDir, server='smc1', dbN='ihlee_test'): sampN = baseDir.split('/')[-1] sid = sampN[:-4].replace('.', '_').replace('-', '_') print sampN, sid cosmicDatFileN = '%s/%s_splice_cosmic.dat' % (baseDir, sampN) if dbN in ['ihlee_test', 'ircr1']: datFileN = '/EQL1/NSL/RNASeq/results/mutation/%s.dat' % sampN else: datFileN = '%s/%s.dat' % (baseDir, sampN) if os.path.isfile(cosmicDatFileN): prepDB_mutscan.main(sampNamePat=('(.*)_(RSq)', ''), geneList=[], inFileN=cosmicDatFileN, outFileN=datFileN) ## import (con, cursor) = mymysql.connectDB(user=mysqlH[server]['user'], passwd=mysqlH[server]['passwd'], db=dbN, host=mysqlH[server]['host']) cursor.execute('DELETE FROM mutation_rsq WHERE samp_id="%s"' % sid)
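## --- Hedged note on the DELETE above: assuming the cursor returned by
## mymysql.connectDB is a standard MySQLdb/DB-API cursor (not verified from
## this file), the same statement can be issued with a driver-side
## placeholder, which avoids quoting problems in samp_id values:
# cursor.execute('DELETE FROM mutation_rsq WHERE samp_id=%s', (sid,))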
def genSpec_CS(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path( ['NGS/coverage', 'NGS/expression', 'NGS/copynumber']) import bam2sortedBed_batch, degSeq_batch, rpkm2cn_batch, exon2gene_batch, drawCNATraj_batch return [ ## PARAMETERS { 'name': 'Format Conversion and sorting', 'desc': 'bam -> sort -> sorted.bed', 'fun': bam2sortedBed_batch.sam2bed_batch, 'paramL': (baseDir, baseDir, 'recal', False), 'paramH': {}, 'logPostFix': '.sorted.bed.qlog', 'logExistsFn': lambda x: len(x) == 0, 'outFilePostFix': ['sorted.bed'], 'clean': False, 'rerun': False }, { 'name': 'RPKMgen', 'desc': 'sorted.bed -> rpkm', 'fun': degSeq_batch.main, 'paramL': (baseDir, baseDir, '/data1/Sequence/ucsc_hg19/annot/refFlat_exon.txt', False), 'paramH': {}, 'logPostFix': '.degSeq.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'omitted' in x[-1], 'outFilePostFix': ['rpkm'], 'clean': False, 'rerun': False }, { 'name': 'Calculate a log2 rpkm ratio for all exons', 'desc': 'log2(tumor rpkm/normal rpkm)', 'fun': rpkm2cn_batch.main_pool, 'paramL': (baseDir, baseDir, 10, mysetting.poolB_CS_rpkm, False), 'paramH': {}, 'logPostFix': '.cn.log', 'logExistsFn': lambda x: len(x) == 0, 'outFilePostFix': ['copynumber'], 'clean': False, 'rerun': False }, { 'name': 'Calculate gene copy number from log2 rpkm ratios', 'desc': 'copynumber -> cn_gene.dat', 'fun': exon2gene_batch.main, 'paramL': (baseDir, baseDir, mysetting.refFlatH[server][genome], mysetting.cs_gene, False), 'paramH': {}, 'logPostFix': '.cn_gene.log', 'logExistsFn': lambda x: len(x) > 0 and 'VHL' in x[-1], 'outFilePostFix': ['cn_gene.dat'], 'clean': False, 'rerun': False }, { 'name': 'Draw Plot', 'desc': 'seg->plot', # 'fun' : drawCNATraj_batch.batch, 'fun': drawCNATraj_batch.draw_single, 'paramL': (baseDir, '/EQL1/NSL/WXS/results/CNA', genome), 'paramH': {}, 'logPostFix': '', 'logExistsFn': lambda x: True, 'outFilePostFix': [], 'clean': False, 'rerun': False }, ]
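## --- Illustrative sketch: the 'Calculate a log2 rpkm ratio for all exons'
## step above presumably produces a per-exon value along these lines.
## rpkm2cn_batch's real implementation (column layout, pooling of normals,
## pseudocount) is not shown here, so the function below is an assumption for
## illustration only.
import math

def log2_rpkm_ratio(tumorRpkm, normalRpkm, pseudo=0.1):
    # A small pseudocount keeps zero-coverage exons from producing
    # division-by-zero or log(0) errors.
    return math.log((tumorRpkm + pseudo) / (normalRpkm + pseudo), 2)

# e.g. log2_rpkm_ratio(20.0, 10.0) ~= 0.99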
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/align', 'NGS/splice_gsnap/fusion']) import gsnap_splice_batch, fusion_filter_transloc_batch, fusion_filter_annot1_batch, fusion_proc_sort_batch, fusion_proc_annot_batch ## MODULES return [ ## PARAMETERS # { # 'name': 'Align', # 'desc': 'fastq -> splice.gsnap', # 'fun': gsnap_splice_batch.align, # 'paramL':(baseDir, baseDir, 6, False), # 'paramH': {}, # 'logPostFix': '.gsnap.qlog', # 'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1], # 'outFilePostFix': ['splice.gsnap'], # 'clean': False, # 'rerun': False # }, # { 'name': 'Filter transloc', 'desc': 'splice.gsnap.gz -> splice_transloc.gsnap', 'fun': fusion_filter_transloc_batch.fusion_filter_batch, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.ft_tloc.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'Results' in x[-1], 'outFilePostFix': ['splice_transloc.gsnap'], 'clean': False, 'rerun': False }, { 'name': 'annotate', 'desc': 'splice_transloc.gsnap -> splice_transloc_annot1.gsnap', 'fun': fusion_filter_annot1_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.annot.qlog', 'logExistsFn': lambda x: len(x) > 1 and 'Results' in x[-1], 'outFilePostFix': ['splice_transloc_annot1.gsnap'], 'clean': False, 'rerun': False }, { 'name': 'sort', 'desc': 'splice_transloc_annot1.gsnap -> splice_transloc_annot1.sorted.gsnap and generate report.txt', 'fun': fusion_proc_sort_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.sort.qlog', 'logExistsFn': lambda x: len(x) == 0, 'outFilePostFix': [ 'splice_transloc_annot1.sorted.gsnap', 'splice_transloc_annot1.report.txt' ], 'clean': False, 'rerun': False }, { 'name': 'annotate report', 'desc': 'report.txt -> report_annot.txt', 'fun': fusion_proc_annot_batch.fusion_proc_annot_batch, 'paramL': (baseDir, baseDir, None, False), 'paramH': {}, 'logPostFix': '.report_annot.qlog', 'logExistsFn': lambda x: len(x) == 0, 'outFilePostFix': ['splice_transloc_annot1.report_annot.txt'], 'clean': False, 'rerun': False }, # { # 'name': 'Summarize', # 'desc': '', # 'fun': , # 'paramL': (baseDir, baseDir, False), # 'paramH': {}, # 'logPostFix': 'realign.qlog', # 'logExistsFn': lambda x: len(x)>0 and 'Uploaded run' in x[-1], # 'outFilePostFix': ['realign.bam', 'recal.bam'], # 'clean': False, # 'rerun': False # }, ]
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/copynumber']) import ngCGH_batch, cgh2seg_batch, seg2gene_batch, drawCNATraj_batch return [ ## PARAMETERS { 'name': 'run ngCGH for pairs of bam', 'desc': 'bam -> .ngCGH', 'fun': ngCGH_batch.main, 'paramL': (baseDir, baseDir, 1000, False), 'paramH': {}, 'logPostFix': '.cn_ngCGH.log', 'logExistsFn': lambda x: len(x) > 0 and 'finalizers' in x[-1], 'outFilePostFix': ['ngCGH'], 'clean': False, 'rerun': False }, { 'name': 'Segmentation', 'desc': 'ngCGH -> seg', 'fun': cgh2seg_batch.cgh2seg, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.seg.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'Centrality parameter' in x[-1], 'outFilePostFix': ['ngCGH.seg'], 'clean': False, 'rerun': False }, { 'name': 'Calculate gene copy number from segments', 'desc': 'seg -> cn_gene.dat', 'fun': seg2gene_batch.main, 'paramL': (baseDir, baseDir, mysetting.refFlatH[server][genome], [], False), 'paramH': {}, 'logPostFix': '.cn_gene.log', 'logExistsFn': lambda x: len(x) > 0 and 'ZZZ3' in x[-1], 'outFilePostFix': ['cn_gene.dat'], 'clean': False, 'rerun': False }, { 'name': 'Draw Plot', 'desc': 'seg->plot', # 'fun' : drawCNATraj_batch.batch, 'fun': drawCNATraj_batch.draw_single, 'paramL': (baseDir, '/EQL1/NSL/WXS/results/CNA', genome), 'paramH': {}, 'logPostFix': '', 'logExistsFn': lambda x: True, 'outFilePostFix': [], 'clean': False, 'rerun': False }, ]
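## --- Illustrative sketch: the 'Calculate gene copy number from segments'
## step above maps refFlat genes onto segments. seg2gene_batch's actual rule
## is not reproduced here; the largest-overlap assignment below is an
## assumption for illustration only.
def assign_gene_cn(geneL, segL):
    # geneL: list of (geneName, chrom, start, end)
    # segL:  list of (chrom, start, end, log2ratio)
    cnH = {}
    for (geneN, chrom, gStart, gEnd) in geneL:
        best = None
        bestOverlap = 0
        for (sChrom, sStart, sEnd, ratio) in segL:
            if sChrom != chrom:
                continue
            # overlap is positive only when the gene and segment intersect
            overlap = min(gEnd, sEnd) - max(gStart, sStart)
            if overlap > bestOverlap:
                best, bestOverlap = ratio, overlap
        if best is not None:
            cnH[geneN] = best
    return cnH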
#!/usr/bin/python import sys, os, re, getopt, glob import mybasic mybasic.add_module_path(['NGS/align', 'NGS/mutation']) import bwa_batch, markDuplicates_batch, realign_batch, pileup_batch def wxs_seq(baseDir, projectName): compared_files_list = [] current_files_list = glob.glob(baseDir + '/*') # compose log string html_head_string = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><head></head><body>' # prep html file html_path = '/var/www/html/pipeline_logs/' + projectName + '/' file_name_split = baseDir.split('/S') sample_name = 'S' + file_name_split[1] file_name = 'pipeline1_log_' + sample_name + '.html' # create .html file (the with block closes it automatically) with open(os.path.join(html_path, file_name), 'wb') as log_file: log_file.write(html_head_string) # change mode and open log_file again os.system('chmod 755 %s%s' % (html_path, file_name))
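## --- Hedged note: sample_name above is recovered by splitting the path on
## '/S', which assumes 'S' never appears earlier in baseDir. If the sample
## directory is the last path component (an assumption, not verified against
## the real directory layout), os.path.basename is a more robust way to get
## it:
# sample_name = os.path.basename(baseDir.rstrip('/'))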
#!/usr/bin/python from glob import glob import sys,os,re import mysetting, mymysql, mypipe, mybasic mybasic.add_module_path(['Integration','NGS/mutation']) import prepDB_mutation_normal, makeDB_mutation_rxsq, vep_batch def prep_single(outFileN, server='smc1', dbN='ircr1'): (con, cursor) = mymysql.connectDB(user=mysetting.mysqlH[server]['user'],passwd=mysetting.mysqlH[server]['passwd'],db=dbN,host=mysetting.mysqlH[server]['host']) cosmicL = [] for dir in mysetting.wxsMutscanDirL: cosmicL += filter(lambda x: '_B_' not in x, glob('%s/*/*cosmic.dat' % dir) + glob('%s/*cosmic.dat' % dir)) cursor.execute('SELECT DISTINCT samp_id FROM sample_tag WHERE tag LIKE "XSeq_%%"') results = cursor.fetchall() sidL = [] for res in results: sidL.append(res[0]) for cosmic in cosmicL: (sid, postfix, platform) = re.match('(.*)_([XT].{,2})_([STKN]{2})_cosmic.dat', os.path.basename(cosmic)).groups() if postfix not in ['T', 'RSq']: sid = '%s_%s' % (sid, postfix) if sid not in sidL: print sid, cosmic tag = 'XSeq_%s' % platform cursor.execute('INSERT INTO sample_tag SET samp_id="%s", tag="%s"' % (sid, tag)) cmd = 'cat %s | /usr/bin/python %s/Integration/prepDB_mutscan.py > %s' % (' '.join(cosmicL), mysetting.SRC_HOME, outFileN) os.system(cmd)
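## --- Illustrative example of the cosmic-file name regex used above, run on
## a made-up file name (the real naming convention may differ):
import re
exampleN = 'IRCR_GBM14_123_T_SS_cosmic.dat'  # hypothetical name for illustration
m = re.match('(.*)_([XT].{,2})_([STKN]{2})_cosmic.dat', exampleN)
print m.groups()  # ('IRCR_GBM14_123', 'T', 'SS') -> sid, postfix, platform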
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/fastq', 'NGS/align', 'NGS/mutation']) import fastqc_batch, gsnap_splice_bam_batch, gsnap_splice_bam_sort_batch, markDuplicates_batch, realignTargetFilter_batch, realignWithFtTarget_batch, unifiedGeno_batch, vcf2mutScan_batch, mutscan_snp_cosmic_batch, annotate_mutscan_batch, annotate_join_cosmic_batch ## MODULES specL = [ ## PARAMETERS { 'name': 'Align', 'desc': '.fq.gz -> .bam', 'fun': gsnap_splice_bam_batch.align, 'paramL': (baseDir, baseDir, False, genome), 'paramH': {}, 'logPostFix': '.gsnap.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'Processed' in x[-1], 'outFilePostFix': ['splice.bam'], 'clean': False, 'rerun': False }, { 'name': 'Sort', 'desc': 'bam -> sorted.bam', 'fun': gsnap_splice_bam_sort_batch.main, 'paramL': (baseDir, baseDir, 10000000000), 'paramH': {}, 'logPostFix': '_splice.sort.qlog', 'logExistsFn': lambda x: len(x) < 1 or 'merging' in x[-1], 'outFilePostFix': ['sorted.bam'], 'clean': False, 'rerun': False }, { 'name': 'MarkDuplicate/ReadGroup', 'desc': 'sorted.bam -> dedup.bam', 'fun': markDuplicates_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '_splice.dedup.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'totalMemory()' in x[-1], 'outFilePostFix': ['dedup.bam'], 'clean': False, 'rerun': False }, { 'name': 'RealignTarget', 'desc': 'dedup.bam -> realigner.intervals -> realigner_ft.intervals', 'fun': realignTargetFilter_batch.main, 'paramL': (baseDir, baseDir, False, mysetting.ucscRefH[server][genome], mysetting.dbsnpH[server][genome]), 'paramH': {}, 'logPostFix': '_splice.interval.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'Uploaded run' in x[-1], 'outFilePostFix': ['realigner.intervals', 'realigner_ft.intervals'], 'clean': False, 'rerun': False }, { 'name': 'Realign/Recalibrate', 'desc': 'dedup.bam -> realign.bam -> recal.bam', 'fun': realignWithFtTarget_batch.main, 'paramL': (baseDir, baseDir, False, mysetting.ucscRefH[server][genome], mysetting.dbsnpH[server][genome]), 'paramH': {}, 'logPostFix': '_splice.realign.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'Uploaded run' in x[-1], 'outFilePostFix': ['realign.bam', 'recal.bam'], 'clean': False, 'rerun': False }, { 'name': 'UnifiedGenotype', 'desc': 'recal.bam -> vcf', 'fun': unifiedGeno_batch.main, 'paramL': (baseDir, baseDir, server, genome, False), 'paramH': {}, 'logPostFix': '_splice.gatk.log', 'logExistsFn': lambda x: len(x) > 0 and any('Total runtime' in s for s in x[-10:]), 'outFilePostFix': ['vcf'], 'clean': False, 'rerun': False }, # { # 'name': 'MutScan', # 'desc': 'vcf -> mutscan', # 'fun': vcf2mutScan_batch.main, # 'paramL': (baseDir, baseDir, False), # 'paramH': {}, # 'logPostFix': '_splice.mutscan.log', # 'logExistsFn': lambda x: len(x)==0, # 'outFilePostFix': ['_splice.mutscan'], # 'clean': False, # 'rerun': False # }, ### annotate mutscan using VEP # { # 'name': 'VEP annotation', # 'desc': 'Annotate mutscan output', # 'fun': annotate_mutscan_batch.annotate_mutscan_batch, # 'paramL': (baseDir, '(.*)\.mutscan$', baseDir), # 'paramH': {}, # 'logPostFix': '_splice.vep.log', # 'logExistsFn': lambda x: len(x)>0 and 'Finished!' in x[-1], # 'outFilePostFix': ['vep'], # 'clean': False, # 'rerun': False # }, ## join cosmic # { # 'name': 'Join Cosmic', # 'desc': 'Join annotated mutscan output with COSMIC', # 'fun': annotate_join_cosmic_batch.main, # 'paramL': (baseDir, '(.*)\.vep$', baseDir), # 'paramH': {}, # 'logPostFix': '_splice.mutscan.cosmic.log', # 'logExistsFn': lambda x: len(x)==0, # 'outFilePostFix': ['_cosmic.dat'], # 'clean': False, # 'rerun': False # }, # { ## old joinCosmic # 'name': 'JoinCosmic', # 'desc': 'mutscan -> cosmic.dat', # 'fun': mutscan_snp_cosmic_batch.main, # 'paramL': (baseDir,), # 'paramH': {}, # 'logPostFix': '_splice.cosmic.log', # 'logExistsFn': lambda x: len(x)==0, # 'outFilePostFix': ['dat'], # 'clean': False, # 'rerun': False # }, ## { ## 'name': 'Cleanup', ## 'desc': 'remove all but logs and designated result file', ## 'fun': cleanup.main, ## 'paramL': (baseDir,), ## 'paramH': {}, ## 'logPostFix': 'cleanup.qlog', ## 'logExistsFn': lambda x: False, ## 'outFilePostFix': ['pileup'] ## }, ] # if server == 'smc2': # return specL[-1] # else: # return specL return specL
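## --- Illustrative check (not from the pipeline source): the UnifiedGenotype
## logExistsFn above scans the last ten log lines for 'Total runtime'. The
## made-up log tail below shows how it evaluates; the real GATK log wording
## may differ.
doneFn = lambda x: len(x) > 0 and any('Total runtime' in s for s in x[-10:])
exampleLog = ['...', 'INFO  Total runtime 123.45 secs', '']  # hypothetical lines
print doneFn(exampleLog)  # True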
def genSpec(baseDir, server='smc1', genome='hg19'): mybasic.add_module_path(['NGS/align', 'NGS/splice_gsnap/skipping']) import gsnap_splice_batch, exonSkip_filter_batch, exonSkip_filter_normal_batch, exonSkip_sort_batch, exonSkip_normal_sort_batch, exonSkip_proc_annot_batch ## MODULES return [ ## PARAMETERS # { # 'name': 'Align', # 'desc': 'fastq -> splice.gsnap', # 'fun': gsnap_splice_batch.align, # 'paramL':(baseDir, baseDir, 6, False), # 'paramH': {}, # 'logPostFix': 'gsnap.qlog', # 'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1], # 'outFilePostFix': ['splice.gsnap'], # 'clean': False, # 'rerun': False # }, { 'name': 'Filter exonskip', 'desc': 'splice.gsnap.gz -> splice_exonSkip.gsnap', 'fun': exonSkip_filter_batch.exonSkip_filter_batch, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.exonSkip.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'Results' in x[-1], 'outFilePostFix': ['splice_exonSkip.gsnap'], 'clean': False, 'rerun': False }, { 'name': 'Filter normal exonskip', 'desc': 'splice.gsnap -> splice_exonSkip_normal.gsnap.gz', 'fun': exonSkip_filter_normal_batch.exonSkip_filter_batch, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.exonSkip_normal.qlog', 'logExistsFn': lambda x: len(x) > 0 and 'Results' in x[-1], 'outFilePostFix': ['splice_exonSkip_normal.gsnap.gz'], 'clean': False, 'rerun': False }, { 'name': 'sort', 'desc': 'splice_exonSkip.gsnap -> splice_exonSkip_report.txt', 'fun': exonSkip_sort_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.sort.qlog', 'logExistsFn': lambda x: len(x) == 0, 'outFilePostFix': ['splice_exonSkip_report.txt'], 'clean': False, 'rerun': False }, { 'name': 'sort-normal', 'desc': 'splice_exonSkip_normal.gsnap.gz -> splice_exonSkip_normal_report.txt', 'fun': exonSkip_normal_sort_batch.main, 'paramL': (baseDir, baseDir, False), 'paramH': {}, 'logPostFix': '.sort_normal.qlog', 'logExistsFn': lambda x: len(x) == 0, 'outFilePostFix': ['splice_exonSkip_normal_report.txt'], 'clean': False, 'rerun': False }, { 'name': 'annotate report', 'desc': 'report.txt -> report_annot.txt', 'fun': exonSkip_proc_annot_batch.exonSkip_proc_annot_batch, 'paramL': (baseDir, baseDir, None, False), 'paramH': {}, 'logPostFix': '.skip_annot.qlog', 'logExistsFn': lambda x: len(x) == 0, 'outFilePostFix': ['splice_exonSkip_report_annot.txt'], 'clean': False, 'rerun': False }, # { # 'name': 'link', # 'desc': 'put all report_annot.txt files in a directory', # 'fun': exonSkip_link.link, # 'paramL': (baseDir, '/EQL1/NSL/RNASeq/results/exonSkip'), # 'paramH': {}, # 'logPostFix': 'link.qlog', # 'logExistsFn': lambda x: len(x)==0, # 'outFilePostFix': ['splice_exonSkip_report_annot.txt'], # 'clean': False, # 'rerun': False # }, # # { # 'name': 'link-normal', # 'desc': 'put all report_normal.txt files in a directory', # 'fun': exonSkip_link_normal.link, # 'paramL': (baseDir, '/EQL1/NSL/RNASeq/results/exonSkip_normal'), # 'paramH': {}, # 'logPostFix': 'link_normal.qlog', # 'logExistsFn': lambda x: len(x)==0, # 'outFilePostFix': ['splice_exonSkip_normal_report.txt'], # 'clean': False, # 'rerun': False # }, ]