Пример #1
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Return the step specifications for the phylogenetic-tree pipeline.

	The result is a list of two step dicts, in list order 'merge counts' and
	'build tree'.  Each dict carries the callable ('fun'), its positional and
	keyword arguments ('paramL'/'paramH'), the log-file suffix, a predicate
	evaluated on the log contents ('logExistsFn'; presumably the list of log
	lines), the output-file suffixes and the 'clean'/'rerun' flags.

	baseDir -- passed as the first two positional arguments of both steps
	server  -- not used by this spec
	genome  -- not used by this spec
	"""
	mybasic.add_module_path(['NGS/phylotree'])
	import merge_count, make_phylotree

	def lastEntryHasDone(logL):
		# Indexing [-1] on an empty log raises IndexError; treat an empty log
		# as "not done", matching the len(x)>0 guard used by the other specs.
		return len(logL) > 0 and 'done' in logL[-1]

	return [
		{
		'name': 'merge counts',
		'desc': 'merge mutation loci and allele counts',
		'fun': merge_count.merge_count,
		'paramL': (baseDir, baseDir, 5, 0.05, 5),
		# alternative thresholds kept for reference:
		# 'paramL': (baseDir, baseDir, 20, 0.2, 5),
		'paramH': {},
		'logPostFix': '.merge_count.log',
		'logExistsFn': lastEntryHasDone,
		'outFilePostFix': ['.mutations','.filtered'],
		'clean': False,
		'rerun': False
		},

		{
		'name': 'build tree',
		'desc': 'make phylogenetic tree',
		'fun': make_phylotree.main,
		'paramL': (baseDir, baseDir),
		'paramH': {},
		'logPostFix': '.make_phylotree.log',
		'logExistsFn': lastEntryHasDone,
		'outFilePostFix': ['.infile', '.outfile','.tree','.pars_tree.pdf','.outfile_report.txt'],
		'clean': False,
		'rerun': False
		},
	]
Пример #2
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the step specifications for the paired somatic-mutation pipeline.

    The result is a list of two step dicts: 'Run MuTect' and
    'somaticindeldetector'.  Each dict carries the callable ('fun'), its
    positional and keyword arguments ('paramL'/'paramH'), the log-file
    suffix, a predicate evaluated on the log contents ('logExistsFn';
    presumably the list of log lines), the output-file suffixes and the
    'clean'/'rerun' flags.

    baseDir -- passed as the first two positional arguments of both steps
    server  -- forwarded to both steps
    genome  -- forwarded to both steps
    """
    mybasic.add_module_path(['NGS/mutation'])
    import mutect_batch, somaticindeldetector_batch

    def mutectLogDone(logL):
        # 'done' is looked up in the 9th entry from the end; a log with fewer
        # than nine entries counts as unfinished instead of raising IndexError.
        return len(logL) >= 9 and 'done' in logL[-9]

    def indelLogDone(logL):
        # 'chrX' must appear in one of the last two entries.  Slicing never
        # raises, so logs with fewer than two entries are handled as well.
        return any('chrX' in entry for entry in logL[-2:])

    return [  ## PARAMETERS
        {
            'name': 'Run MuTect',
            'desc': '.recal.bam -> .mutect, mutect.vcf',
            'fun': mutect_batch.mutect_pair,
            'paramL': (baseDir, baseDir, genome, server, False),
            'paramH': {},
            'logPostFix': '.mutect_pair.log',
            'logExistsFn': mutectLogDone,
            'outFilePostFix': ['.mutect', '.mutect_pair.vcf'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'somaticindeldetector',
            'desc': '.recal.bam -> indels_filter.vcf',
            'fun': somaticindeldetector_batch.paired_mode,
            'paramL': (baseDir, baseDir, 'SS', genome, server, False),
            'paramH': {},
            'logPostFix': '.somaticindeldetector_pair.log',
            'logExistsFn': indelLogDone,
            'outFilePostFix':
            ['indels_pair_filter.vcf', 'indels_pair_filter.out'],
            'clean': False,
            'rerun': False
        },
    ]
Пример #3
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Assemble the step list for the corrected copy-number (corr.ngCGH) pipeline.

	Returns a list of four step dicts, in this order: copy number correction,
	segmentation, gene-level copy number, and plotting.

	baseDir -- handed to every step as its first two positional arguments
	server  -- forwarded to the correction step and used, together with
	           genome, to select mysetting.refFlatH[server][genome]
	genome  -- see server; also forwarded to the plotting step
	"""
	mybasic.add_module_path(['NGS/copynumber'])
	import cn_corr_batch
	import corrcgh2seg_batch
	import drawCNATraj_batch
	import corrseg2gene_batch

	def logIsEmpty(logL):
		return len(logL) == 0

	def lastEntryHas(token):
		# Predicate factory: non-empty log whose last entry contains token.
		return lambda logL: len(logL) > 0 and token in logL[-1]

	def alwaysTrue(logL):
		return True

	correctionStep = {
		'name': 'copy number correction',
		'desc': 'ngCGH -> corr.ngCGH',
		'fun': cn_corr_batch.main,
		'paramL': (baseDir, baseDir, False, server),
		'paramH': {},
		'logPostFix': '.cn_corr.qlog',
		'logExistsFn': logIsEmpty,
		'outFilePostFix': ['corr.ngCGH'],
		'clean': False,
		'rerun': False,
	}

	segmentStep = {
		'name': 'Segmenation',
		'desc': 'corr.ngCGH -> corr.ngCGH.seg',
		'fun': corrcgh2seg_batch.cgh2seg,
		'paramL': (baseDir, baseDir, False),
		'paramH': {},
		'logPostFix': '.corr.seg.qlog',
		'logExistsFn': lastEntryHas('Centrality parameter'),
		'outFilePostFix': ['corr.ngCGH.seg'],
		'clean': False,
		'rerun': False,
	}

	geneStep = {
		'name': 'Calculate gene copy number from segments',
		'desc': 'corr.seg -> corr.cn_gene.dat',
		'fun': corrseg2gene_batch.main,
		'paramL': (baseDir, baseDir, mysetting.refFlatH[server][genome], [], False),
		'paramH': {},
		'logPostFix': '.corr.cn_gene.log',
		'logExistsFn': lastEntryHas('ZZZ3'),
		'outFilePostFix': ['corr.cn_gene.dat'],
		'clean': False,
		'rerun': False,
	}

	# Earlier variant of the plot step, kept for reference:
	#   'fun': drawCNATraj.main, 'paramL': (baseDir, baseDir)
	plotStep = {
		'name': 'Plot corrected segmentation',
		'desc': 'Plot segmentations for corrected copy number profile',
		'fun': drawCNATraj_batch.draw_single,
		'paramL': (baseDir, baseDir, genome),
		'paramH': {},
		'logPostFix': '',
		'logExistsFn': alwaysTrue,
		'outFilePostFix': [],
		'clean': False,
		'rerun': False,
	}

	return [correctionStep, segmentStep, geneStep, plotStep]
Пример #4
0
def genSpec_single(baseDir, server="smc1", genome="hg19"):
    """Return the step specifications for the single-sample mutation pipeline.

    The result is a list of two step dicts: "Run MuTect (single)" and
    "somaticindeldetector".  Each dict carries the callable ("fun"), its
    positional and keyword arguments ("paramL"/"paramH"), the log-file
    suffix, a predicate evaluated on the log contents ("logExistsFn";
    presumably the list of log lines), the output-file suffixes and the
    "clean"/"rerun" flags.

    baseDir -- first positional argument of both steps (passed twice to the
               indel step)
    server  -- forwarded to both steps
    genome  -- forwarded to both steps
    """
    mybasic.add_module_path(["NGS/mutation"])
    import mutect_batch, somaticindeldetector_batch

    def mutectLogDone(logL):
        # "done" is looked up in the 9th entry from the end; a log with fewer
        # than nine entries counts as unfinished instead of raising IndexError.
        return len(logL) >= 9 and "done" in logL[-9]

    def indelLogDone(logL):
        # "chrX" must appear in one of the last two entries.  Slicing never
        # raises, so logs with fewer than two entries are handled as well.
        return any("chrX" in entry for entry in logL[-2:])

    return [  ## PARAMETERS
        {
            "name": "Run MuTect (single)",
            "desc": ".recal.bam -> .mutect, mutect_single_filter.vcf",
            "fun": mutect_batch.mutect_PON,
            "paramL": (baseDir, genome, server, False),
            "paramH": {},
            "logPostFix": ".mutect_single.log",
            "logExistsFn": mutectLogDone,
            "outFilePostFix": ["mutect_single_filter.vcf"],
            "clean": False,
            "rerun": False,
        },
        {
            "name": "somaticindeldetector",
            "desc": ".recal.bam -> indels_single_filter.vcf",
            "fun": somaticindeldetector_batch.single_mode,
            "paramL": (baseDir, baseDir, "SS", genome, server, False),
            "paramH": {},
            "logPostFix": ".somaticindeldetector_single.log",
            "logExistsFn": indelLogDone,
            "outFilePostFix": ["indels_single_filter.vcf", "indels_single_filter.out"],
            "clean": False,
            "rerun": False,
        },
    ]
Пример #5
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Assemble the step list for the ngCGH copy-number pipeline.

	Returns a list of four step dicts, in this order: ngCGH on bam pairs,
	segmentation, gene-level copy number, and plotting.

	baseDir -- first positional argument of every step
	server  -- used, together with genome, to select
	           mysetting.refFlatH[server][genome]
	genome  -- see server; also forwarded to the plotting step
	"""
	mybasic.add_module_path(['NGS/copynumber'])
	import ngCGH_batch
	import cgh2seg_batch
	import seg2gene_batch
	import drawCNATraj_batch

	def lastEntryHas(token):
		# Predicate factory: non-empty log whose last entry contains token.
		return lambda logL: len(logL) > 0 and token in logL[-1]

	def alwaysTrue(logL):
		return True

	ngcghStep = {
		'name': 'run ngCGH for pairs of bam',
		'desc': 'bam -> .ngCGH',
		'fun': ngCGH_batch.main,
		'paramL': (baseDir, baseDir, 1000, False),
		'paramH': {},
		'logPostFix': '.cn_ngCGH.log',
		'logExistsFn': lastEntryHas('finalizers'),
		'outFilePostFix': ['ngCGH'],
		'clean': False,
		'rerun': False,
	}

	segmentStep = {
		'name': 'Segmenation',
		'desc': 'ngCGH -> seg',
		'fun': cgh2seg_batch.cgh2seg,
		'paramL': (baseDir, baseDir, False),
		'paramH': {},
		'logPostFix': '.seg.qlog',
		'logExistsFn': lastEntryHas('Centrality parameter'),
		'outFilePostFix': ['ngCGH.seg'],
		'clean': False,
		'rerun': False,
	}

	geneStep = {
		'name': 'Calculate gene copy number from segments',
		'desc': 'seg -> cn_gene.dat',
		'fun': seg2gene_batch.main,
		'paramL': (baseDir, baseDir, mysetting.refFlatH[server][genome], [], False),
		'paramH': {},
		'logPostFix': '.cn_gene.log',
		'logExistsFn': lastEntryHas('ZZZ3'),
		'outFilePostFix': ['cn_gene.dat'],
		'clean': False,
		'rerun': False,
	}

	# Earlier variant of the plot callable, kept for reference:
	#   'fun': drawCNATraj_batch.batch
	plotStep = {
		'name': 'Draw Plot',
		'desc': 'seg->plot',
		'fun': drawCNATraj_batch.draw_single,
		'paramL': (baseDir, '/EQL1/NSL/WXS/results/CNA', genome),
		'paramH': {},
		'logPostFix': '',
		'logExistsFn': alwaysTrue,
		'outFilePostFix': [],
		'clean': False,
		'rerun': False,
	}

	return [ngcghStep, segmentStep, geneStep, plotStep]
Пример #6
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Assemble the step list for the exon-intron junction (eiJunc) pipeline.

	Only the 'Filter eiJunc' step is active; the bam-to-fastq and alignment
	steps are kept below as commented-out reference entries.

	baseDir -- passed as the first two positional arguments of the step
	server  -- not used by this spec
	genome  -- not used by this spec
	"""
	mybasic.add_module_path(['NGS/fastq','NGS/align','NGS/splice_gsnap/ei_junc'])
	## MODULES
	import bam2fastq_batch2
	import gsnap_splice_batch
	import ei_junc_batch

	# Disabled steps, kept for reference:
	#	{
	#	'name': 'bam to fastq',
	#	'desc': 'bam -> fastq',
	#	'fun': bam2fastq_batch2.bam2fastq_batch2,
	#	'paramL':(baseDir, baseDir, 'UNCID_[0-9]{7}\.(.*)\.sorted_.*'),
	#	'paramH': {},
	#	'logPostFix': 'fastq.log',
	#	'logExistsFn': lambda x: len(x)>0 and 'Samples' in x[-1],
	#	'outFilePostFix': ['fastq'],
	#	'clean': False,
	#	'rerun': False
	#	},
	#
	#	{
	#	'name': 'Align',
	#	'desc': 'fastq -> splice.gsnap',
	#	'fun': gsnap_splice_batch.align,
	#	'paramL':(baseDir, baseDir, 6, False, False),
	#	'paramH': {},
	#	'logPostFix': 'gsnap.qlog',
	#	'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1],
	#	'outFilePostFix': ['splice.gsnap'],
	#	'clean': False,
	#	'rerun': False
	#	},

	def logFinished(logL):
		# Non-empty log whose last entry contains 'Finished'.
		if len(logL) > 0:
			return 'Finished' in logL[-1]
		return False

	## PARAMETERS
	filterStep = {
		'name': 'Filter eiJunc',
		'desc': 'splice.gsnap.gz -> ei.dat',
		'fun': ei_junc_batch.main,
		'paramL': (baseDir, baseDir, False),
		'paramH': {},
		'logPostFix': '.ei.qlog',
		'logExistsFn': logFinished,
		'outFilePostFix': ['ei.dat'],
		'clean': False,
		'rerun': False,
	}

	return [filterStep]
Пример #7
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Assemble the step list for the exon-intron junction (eiJunc) pipeline.

    Only the 'Filter eiJunc' step is active; the bam-to-fastq and alignment
    steps are kept below as commented-out reference entries.

    baseDir -- passed as the first two positional arguments of the step
    server  -- not used by this spec
    genome  -- not used by this spec
    """
    modulePathL = ['NGS/fastq', 'NGS/align', 'NGS/splice_gsnap/ei_junc']
    mybasic.add_module_path(modulePathL)
    ## MODULES
    import bam2fastq_batch2
    import gsnap_splice_batch
    import ei_junc_batch

    # Disabled steps, kept for reference:
    #   {
    #   'name': 'bam to fastq',
    #   'desc': 'bam -> fastq',
    #   'fun': bam2fastq_batch2.bam2fastq_batch2,
    #   'paramL':(baseDir, baseDir, 'UNCID_[0-9]{7}\.(.*)\.sorted_.*'),
    #   'paramH': {},
    #   'logPostFix': 'fastq.log',
    #   'logExistsFn': lambda x: len(x)>0 and 'Samples' in x[-1],
    #   'outFilePostFix': ['fastq'],
    #   'clean': False,
    #   'rerun': False
    #   },
    #
    #   {
    #   'name': 'Align',
    #   'desc': 'fastq -> splice.gsnap',
    #   'fun': gsnap_splice_batch.align,
    #   'paramL':(baseDir, baseDir, 6, False, False),
    #   'paramH': {},
    #   'logPostFix': 'gsnap.qlog',
    #   'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1],
    #   'outFilePostFix': ['splice.gsnap'],
    #   'clean': False,
    #   'rerun': False
    #   },

    def logFinished(logL):
        # Non-empty log whose last entry contains 'Finished'.
        if len(logL) > 0:
            return 'Finished' in logL[-1]
        return False

    ## PARAMETERS
    filterStep = {
        'name': 'Filter eiJunc',
        'desc': 'splice.gsnap.gz -> ei.dat',
        'fun': ei_junc_batch.main,
        'paramL': (baseDir, baseDir, False),
        'paramH': {},
        'logPostFix': '.ei.qlog',
        'logExistsFn': logFinished,
        'outFilePostFix': ['ei.dat'],
        'clean': False,
        'rerun': False,
    }

    return [filterStep]
Пример #8
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Assemble the single-step list for the mutation-clonality pipeline.

	baseDir -- passed as the first two positional arguments of the step
	server  -- forwarded to the step as its last positional argument
	genome  -- not used by this spec
	"""
	mybasic.add_module_path(['NGS/mutation'])
	import mut_clonality_batch

	def logIsEmpty(logL):
		return len(logL) == 0

	## PARAMETERS
	clonalityStep = {
		'name': 'determine mutation clonality',
		'desc': 'mutect -> mutect_cl.dat',
		'fun': mut_clonality_batch.main,
		'paramL': (baseDir, baseDir, mysetting.cnaBaseDir, False, server),
		'paramH': {},
		'logPostFix': '.mutect_cl.log',
		'logExistsFn': logIsEmpty,
		'outFilePostFix': ['mutect_cl.dat'],
		'clean': False,
		'rerun': False,
	}

	return [clonalityStep]
Пример #9
0
def main(datFileN, server='smc1', dbN='CancerSCAN'):
    """Annotate the *CS sample directories, reload 'mutation_cs' and tag samples.

    Steps, in order:
      1. run vep_batch.main over every path matching <CSmutDir>/*CS;
      2. pipe all *filter_vep.dat files through prepDB_mutation_cancerscan.py
         into datFileN (through the shell);
      3. call mymysql.reset_table for table 'mutation_cs' with datFileN;
      4. for each directory under CSmutDir, replace its 'XSeq_CS' row in
         sample_tag (directories whose type token is 'B' are skipped).

    datFileN -- path of the data file written in step 2 and loaded in step 3
    server   -- key into mysetting.mysqlH selecting the DB credentials
    dbN      -- database name
    """
    mybasic.add_module_path(['NGS/mutation', 'Integration'])

    import vep_batch, makeDB_mutation_rxsq

    mutDir = mysetting.CSmutDir
    print(mutDir + '/*CS')

    vcfPostfixL = [
        '.mutect_filter.vcf', '.mutect_single_filter.vcf',
        '.indels_filter.vcf', '.indels_single_filter.vcf'
    ]
    vep_batch.main(glob(mutDir + '/*CS'), postfixL=vcfPostfixL, fork=True)

    prepCmd = 'cat %s/*CS/*filter_vep.dat | /usr/bin/python %s/Integration/prepDB_mutation_cancerscan.py > %s' % (
        mutDir, mysetting.SRC_HOME, datFileN)
    os.system(prepCmd)

    # Same connection settings are used for the table reload and the cursor.
    credH = mysetting.mysqlH[server]
    dbArgH = {
        'user': credH['user'],
        'passwd': credH['passwd'],
        'db': dbN,
        'host': credH['host'],
    }
    mymysql.reset_table(tableN='mutation_cs', dataFileN=datFileN, **dbArgH)

    con, cursor = mymysql.connectDB(**dbArgH)

    sampNL = [
        entry for entry in os.listdir(mutDir)
        if os.path.isdir(mutDir + '/' + entry)
    ]
    for sampN in sampNL:
        tokenL = sampN.split('_')
        # Second-to-last '_'-separated token selects how the sample is tagged.
        sampType = tokenL[-2]
        if sampType == 'B':
            continue
        sampId = '_'.join(tokenL[:-2])
        if sampType != 'T':
            sampId = '%s_%s' % (sampId, sampType)
        # NOTE(review): SQL is built by string formatting from directory
        # names; consider driver-side parameter binding once the driver's
        # parameter style is confirmed.
        deleteSql = '''DELETE FROM sample_tag WHERE samp_id="%s" AND tag="XSeq_CS"''' % sampId
        insertSql = '''INSERT INTO sample_tag SET samp_id="%s",tag="XSeq_CS"''' % sampId
        cursor.execute(deleteSql)
        cursor.execute(insertSql)
Пример #10
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the step specifications for the phylogenetic-tree pipeline.

    The result is a list of two step dicts, in list order 'merge counts' and
    'build tree'.  Each dict carries the callable ('fun'), its positional and
    keyword arguments ('paramL'/'paramH'), the log-file suffix, a predicate
    evaluated on the log contents ('logExistsFn'; presumably the list of log
    lines), the output-file suffixes and the 'clean'/'rerun' flags.

    baseDir -- passed as the first two positional arguments of both steps
    server  -- not used by this spec
    genome  -- not used by this spec
    """
    mybasic.add_module_path(['NGS/phylotree'])
    import merge_count, make_phylotree

    def lastEntryHasDone(logL):
        # Indexing [-1] on an empty log raises IndexError; treat an empty log
        # as "not done" instead.
        return len(logL) > 0 and 'done' in logL[-1]

    return [
        {
            'name': 'merge counts',
            'desc': 'merge mutation loci and allele counts',
            'fun': merge_count.merge_count,
            'paramL': (baseDir, baseDir, 5, 0.05, 5),
            # alternative thresholds kept for reference:
            # 'paramL': (baseDir, baseDir, 20, 0.2, 5),
            'paramH': {},
            'logPostFix': '.merge_count.log',
            'logExistsFn': lastEntryHasDone,
            'outFilePostFix': ['.mutations', '.filtered'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'build tree',
            'desc': 'make phylogenetic tree',
            'fun': make_phylotree.main,
            'paramL': (baseDir, baseDir),
            'paramH': {},
            'logPostFix': '.make_phylotree.log',
            'logExistsFn': lastEntryHasDone,
            'outFilePostFix': [
                '.infile', '.outfile', '.tree', '.pars_tree.pdf',
                '.outfile_report.txt'
            ],
            'clean': False,
            'rerun': False
        },
    ]
Пример #11
0
def main(datFileN, server='smc1', dbN='CancerSCAN'):
	"""Annotate the *CS sample directories, reload 'mutation_cs' and tag samples.

	Steps, in order:
	  1. run vep_batch.main over every path matching <CSmutDir>/*CS;
	  2. pipe all *filter_vep.dat files through prepDB_mutation_cancerscan.py
	     into datFileN (through the shell);
	  3. call mymysql.reset_table for table 'mutation_cs' with datFileN;
	  4. for each directory under CSmutDir, replace its 'XSeq_CS' row in
	     sample_tag (directories whose type token is 'B' are skipped).

	datFileN -- path of the data file written in step 2 and loaded in step 3
	server   -- key into mysetting.mysqlH selecting the DB credentials
	dbN      -- database name
	"""
	mybasic.add_module_path(['NGS/mutation','Integration'])

	import vep_batch, makeDB_mutation_rxsq

	mutDir = mysetting.CSmutDir
	print(mutDir+'/*CS')

	vcfPostfixL = ['.mutect_filter.vcf','.mutect_single_filter.vcf','.indels_filter.vcf','.indels_single_filter.vcf']
	vep_batch.main(glob(mutDir+'/*CS'), postfixL=vcfPostfixL, fork=True)

	prepCmd = 'cat %s/*CS/*filter_vep.dat | /usr/bin/python %s/Integration/prepDB_mutation_cancerscan.py > %s' % (mutDir, mysetting.SRC_HOME, datFileN)
	os.system(prepCmd)

	# Same connection settings are used for the table reload and the cursor.
	credH = mysetting.mysqlH[server]
	dbArgH = {
		'user': credH['user'],
		'passwd': credH['passwd'],
		'db': dbN,
		'host': credH['host'],
	}
	mymysql.reset_table(tableN='mutation_cs', dataFileN=datFileN, **dbArgH)

	con, cursor = mymysql.connectDB(**dbArgH)

	sampNL = [entry for entry in os.listdir(mutDir) if os.path.isdir(mutDir+'/'+entry)]
	for sampN in sampNL:
		tokenL = sampN.split('_')
		# Second-to-last '_'-separated token selects how the sample is tagged.
		sampType = tokenL[-2]
		if sampType == 'B':
			continue
		sampId = '_'.join(tokenL[:-2])
		if sampType != 'T':
			sampId = '%s_%s' % (sampId, sampType)
		# NOTE(review): SQL is built by string formatting from directory
		# names; consider driver-side parameter binding once the driver's
		# parameter style is confirmed.
		deleteSql = '''DELETE FROM sample_tag WHERE samp_id="%s" AND tag="XSeq_CS"''' % sampId
		insertSql = '''INSERT INTO sample_tag SET samp_id="%s",tag="XSeq_CS"''' % sampId
		cursor.execute(deleteSql)
		cursor.execute(insertSql)
Пример #12
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Assemble the step list for the LOH / purity pipeline.

	Returns a list of eight step dicts, in this order: tumor MutScan, delta
	BAF calculation, delta BAF segmentation, plotting, CNLOH/LOH
	determination, gene LOH, normal contamination, tumor fraction.

	baseDir -- handed to every step (two or three times, see each paramL)
	server  -- used, together with genome, to select
	           mysetting.refFlatH[server][genome] for the gene LOH step
	genome  -- see server
	"""
	mybasic.add_module_path(['NGS/mutation','NGS/loh','NGS/purity'])
	## MODULES
	import mutScan_loh_batch
	import delta_baf_mutscan_batch
	import delta_baf_seg_batch
	import calcCN_LOH_batch
	import loh2gene_batch
	import calcNormalF_loh_batch
	import peakFrac_batch
	import dbaf_cn_plot_batch

	def logIsEmpty(logL):
		return len(logL) == 0

	def lastEntryHas(token):
		# Predicate factory: non-empty log whose last entry contains token.
		return lambda logL: len(logL) > 0 and token in logL[-1]

	def makeStep(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
		# All steps share an empty keyword set and clean/rerun switched off.
		return {
			'name': name,
			'desc': desc,
			'fun': fun,
			'paramL': paramL,
			'paramH': {},
			'logPostFix': logPostFix,
			'logExistsFn': logExistsFn,
			'outFilePostFix': outFilePostFix,
			'clean': False,
			'rerun': False,
		}

	## PARAMETERS
	stepL = []

	stepL.append(makeStep(
		'MutScan for the tumor sample',
		'pileup_proc -> loh.mutscan',
		mutScan_loh_batch.main,
		(baseDir, baseDir, False, 10, 0, 0),
		'.loh.mutscan.log',
		lastEntryHas('Success'),
		['loh.mutscan']))

	stepL.append(makeStep(
		'delta B-allele frequencies calculation',
		'calculate tumor delta BAF for all positions genotyped as heterozygous in the normal sample',
		delta_baf_mutscan_batch.main,
		(baseDir, baseDir, False),
		'.dbaf.log',
		logIsEmpty,
		['dbaf.txt']))

	stepL.append(makeStep(
		'delta BAF segmentation',
		'segment delta BAF',
		delta_baf_seg_batch.main,
		(baseDir, baseDir, False),
		'.dbaf.seg.log',
		lastEntryHas('Analyzing'),
		['seg']))

	stepL.append(makeStep(
		'Plotting',
		'Generate deltaBAF/CN trajectory plot',
		dbaf_cn_plot_batch.main,
		(baseDir, baseDir, baseDir, False),
		'.traj_plot.log',
		lastEntryHas('Done'),
		['dBAF_CNA_traj.pdf']))

	stepL.append(makeStep(
		'CNLOH/LOH determination',
		'calculate average copy number of LOH segments to determine CNLOH/LOH',
		calcCN_LOH_batch.main,
		(baseDir, baseDir, baseDir, False),
		'.loh_cn.log',
		lastEntryHas('Setting'),
		['loh_cn.txt']))

	stepL.append(makeStep(
		'gene LOH',
		'loh_cn.txt -> loh_gene.dat',
		loh2gene_batch.main,
		(baseDir, baseDir, False, mysetting.refFlatH[server][genome]),
		'.loh_gene.log',
		logIsEmpty,
		['loh_gene.dat']))

	stepL.append(makeStep(
		'Normal contamiation calculation',
		'calculate normal contamination levels at heterozygous germline SNPs in LOH regions',
		calcNormalF_loh_batch.main,
		(baseDir, baseDir, baseDir, False),
		'.nfrac_all.log',
		logIsEmpty,
		['nFrac_all.txt']))

	stepL.append(makeStep(
		'Tumor fraction estimation',
		'estimate tumor fraction',
		peakFrac_batch.main,
		(baseDir, baseDir, False),
		'.tfrac.log',
		lastEntryHas('Done'),
		['tumor_frac.txt']))

	return stepL
Пример #13
0
#!/usr/bin/python
## postprocessing for RNA-Seq pipelines : rsq2skip, rsq2fusion, rsq2eiJunc
## handles 3 pipeline at the same time: no need to run 3 times after each pipeline

from glob import glob
import sys, os
import mymysql, mypipe, mybasic
from mysetting import mysqlH
from datetime import datetime
from warnings import filterwarnings
from warnings import resetwarnings

# Register the splice_gsnap and Integration sub-directories before importing
# the project-local modules below (presumably add_module_path extends the
# import search path -- TODO confirm).
mybasic.add_module_path(['NGS/splice_gsnap/skipping','NGS/splice_gsnap/fusion','NGS/splice_gsnap/ei_junc','Integration'])
import makeDB_splice_AF
import prepDB_splice_normal, exonSkip_summarize, prepDB_splice_skip
import fusion_summarize, prepDB_splice_fusion
import ei_junc_filter, prepDB_splice_eiJunc

# Hard-coded RNA-Seq results path.
BASE='/EQL1/NSL/RNASeq/results'
# (pattern, string) pair for names ending in '_RSq'; how the pair is consumed
# is not visible in this excerpt.
RSQPattern=('(.*)_RSq','')

def post_rsq2skip(dirN, server='smc1', dbN='ihlee_test', sampL=[]):
	"""Prepare the splice_normal tables and walk the sample directories under dirN.

	Only the opening of this function is visible here; the per-sample work
	appears to continue beyond this excerpt.

	dirN   -- directory whose sub-directories are iterated as samples
	server -- key into mysqlH selecting the MySQL credentials
	dbN    -- database name passed to mymysql.connectDB
	sampL  -- not referenced in the visible part
	          NOTE(review): mutable default argument; confirm it is never mutated
	"""
	(con, cursor) = mymysql.connectDB(user=mysqlH[server]['user'],passwd=mysqlH[server]['passwd'],db=dbN,host=mysqlH[server]['host'])
	# Set the samp_id column type to char(63) in the three splice_normal tables.
	cursor.execute('ALTER TABLE splice_normal CHANGE COLUMN samp_id samp_id char(63)')
	cursor.execute('ALTER TABLE splice_normal_loc1 CHANGE COLUMN samp_id samp_id char(63)')
	cursor.execute('ALTER TABLE splice_normal_loc2 CHANGE COLUMN samp_id samp_id char(63)')
	# Temporary table created with the same definition as splice_normal.
	cursor.execute('CREATE TEMPORARY TABLE splice_normal_tmp LIKE splice_normal')
	# Keep only the entries of dirN that are directories.
	sampNL = filter(lambda x: os.path.isdir(dirN + '/' + x), os.listdir(dirN))
	for sampN in sampNL:
		baseDir = dirN + '/' + sampN
		# Drop the last four characters (the '_RSq' suffix of RNASeq sample
		# names), then replace '.' and '-' with '_'.
		sid = sampN[:-4].replace('.','_').replace('-','_') ## RNASeq sample has '***_RSq'
Пример #14
0
#!/usr/bin/python

import sys, os, glob, getopt
import mybasic, mysetting

mybasic.add_module_path(['utils'])

import link_fqgz_hj

# Linking: runs at import time.  The three arguments are handed straight to
# link_fqgz_hj.link (presumably source dir, destination dir and a sample-name
# regex -- TODO confirm against link_fqgz_hj).
link_fqgz_hj.link('/EQL1/NSL/WXS/fastq','/EQL1/NSL/WXS/exome_20130529', '.*([0-9]{3})[ITN].*')


# Listing directories: every entry directly under exome_20130529, collected
# once at import time and read by main() below.
dir_list = glob.glob('/EQL1/NSL/WXS/exome_20130529/*')

#for dir_name in dir_list

def main(pbs=False):
	"""Print the module-level dir_list and start iterating over its entries.

	Only the opening of this function is visible here; the per-directory work
	appears to continue beyond this excerpt.

	pbs -- not referenced in the visible part of the function
	"""

	print dir_list, len(dir_list)
	projectName = 'heejin_20'
	# Create the per-project log directory through the shell; the return
	# status of mkdir is not checked.
	os.system('mkdir /var/www/html/pipeline_logs/%s' % projectName)

	for single_dir in dir_list:

		# Sample name = last '/'-separated component of the path.
		sampN = single_dir.split('/')[-1]

		# Disabled filter that restricted the run to a single sample:
#		if sampN not in ['S012_T_SS']:
#			continue
Пример #15
0
	annotH = {}
	for line in inFile:
		colL = line.rstrip().split('\t')
		rm = re.match('(chr[^:]*):([0-9]*)~([0-9]*)', colL[idxH['locus']])
		(chr,chrSta,chrEnd) = rm.groups()
		ref = colL[idxH['ref']]
		alt = colL[idxH['alt']]
		if (chr,chrSta,chrEnd,ref,alt) not in annotH:
			annotH[(chr,chrSta,chrEnd,ref,alt)] = {}
			for col in ['gene_symL','ch_dna','ch_aa','ch_type','cosmic','mutsig']:
				annotH[(chr,chrSta,chrEnd,ref,alt)][col] = colL[idxH[col]]
	return annotH

### Temporary module-path setup, needed until this script is merged into the pipeline
import mybasic
mybasic.add_module_path(['NGS/pipeline'])
import mypipe
# Disabled: build pairH (primary sample id -> list of recurrent sample ids)
# from mypipe.read_trio, restricted to trio '37'.  Kept for reference.
#bamDirL = mysetting.wxsBamDirL
#trioH = mypipe.read_trio(bamDirL=bamDirL)
#pairH = {}
#for tid in trioH:
#	if tid not in ['37']:
#		continue
#	if trioH[tid]['recur_id'] != []:
#		print tid, trioH[tid]['prim_id']
#		print tid, trioH[tid]['recur_id']
#		pid = re.match('(.*)_T.{,2}_[TS]{2}', trioH[tid]['prim_id'][0]).group(1)
#		pairH[pid] = map(lambda x: re.match('(.*)_T.{,2}_[TS]{2}',x).group(1), trioH[tid]['recur_id'])

# Hard-coded input and output locations.
inDir = '/EQL3/pipeline/somatic_mutect/'
outDir = '/EQL1/PrimRecur/phylogeny'
Пример #16
0
#!/usr/bin/python

import sys, os, re, getopt, glob
import mybasic

mybasic.add_module_path(["NGS/align", "NGS/mutation"])

import bwa_batch, markDuplicates_batch, realign_batch, pileup_batch


def wxs_seq(baseDir, projectName):
    """Create the per-sample HTML log file for a WXS pipeline run.

    Only the opening of this function is visible here; the pipeline work
    appears to continue beyond this excerpt.

    baseDir     -- sample directory; must contain "/S" so that a sample name
                   can be derived from it
    projectName -- sub-directory of /var/www/html/pipeline_logs/ that
                   receives the log file
    """

    current_files_list = []
    # Not referenced in the visible part of the function.
    compared_files_list = []
    # Snapshot of everything directly under baseDir.
    current_files_list = glob.glob(baseDir + "/*")

    # compose log string
    # NOTE(review): the literal starts with '<DOCTYPE', not '<!DOCTYPE'.
    html_head_string = '<DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><head></head><body>'

    # prep html file
    html_path = "/var/www/html/pipeline_logs/" + projectName + "/"
    # Sample name = "S" + the text between the first and second "/S" in
    # baseDir (or up to the end when "/S" occurs only once); raises
    # IndexError when baseDir contains no "/S".
    file_name_split = baseDir.split("/S")
    sample_name = "S" + file_name_split[1]
    file_name = "pipeline1_log_" + sample_name + ".html"
    # create .html file
    with open(os.path.join(html_path, file_name), "wb") as log_file:
        log_file.write(html_head_string)
    # NOTE(review): redundant -- the with-block above has already closed log_file.
    log_file.close()

    # change mode and open log_file again
    os.system("chmod 755 %s%s" % (html_path, file_name))
Пример #17
0
def genSpec_CS(baseDir, server='smc1', genome='hg19'):
	"""Assemble the step list for the CS copy-number (rpkm ratio) pipeline.

	Returns a list of five step dicts, in this order: bam to sorted bed,
	RPKM generation, log2 rpkm ratio, gene-level copy number, and plotting.

	baseDir -- first positional argument of every step
	server  -- used, together with genome, to select
	           mysetting.refFlatH[server][genome]
	genome  -- see server; also forwarded to the plotting step
	"""
	mybasic.add_module_path(['NGS/coverage','NGS/expression','NGS/copynumber'])
	import bam2sortedBed_batch
	import degSeq_batch
	import rpkm2cn_batch
	import exon2gene_batch
	import drawCNATraj_batch

	def logIsEmpty(logL):
		return len(logL) == 0

	def lastEntryHas(token):
		# Predicate factory: non-empty log whose last entry contains token.
		return lambda logL: len(logL) > 0 and token in logL[-1]

	def alwaysTrue(logL):
		return True

	def makeStep(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
		# All steps share an empty keyword set and clean/rerun switched off.
		return {
			'name': name,
			'desc': desc,
			'fun': fun,
			'paramL': paramL,
			'paramH': {},
			'logPostFix': logPostFix,
			'logExistsFn': logExistsFn,
			'outFilePostFix': outFilePostFix,
			'clean': False,
			'rerun': False,
		}

	## PARAMETERS
	bedStep = makeStep(
		'Format Conversion and sorting',
		'bam -> sort -> sorted.bed',
		bam2sortedBed_batch.sam2bed_batch,
		(baseDir, baseDir, 'recal', False),
		'.sorted.bed.qlog',
		logIsEmpty,
		['sorted.bed'])

	rpkmStep = makeStep(
		'RPKMgen',
		'sorted.bed -> rpkm',
		degSeq_batch.main,
		(baseDir, baseDir, '/data1/Sequence/ucsc_hg19/annot/refFlat_exon.txt', False),
		'.degSeq.qlog',
		lastEntryHas('omitted'),
		['rpkm'])

	ratioStep = makeStep(
		'Calculate a log2 rpkm ratio for all exons',
		'log2(tumor rpkm/normal rpkm',
		rpkm2cn_batch.main_pool,
		(baseDir, baseDir, 10, mysetting.poolB_CS_rpkm, False),
		'.cn.log',
		logIsEmpty,
		['copynumber'])

	geneStep = makeStep(
		'Calculate gene copy number from log2 rpkm ratios',
		'copynumber -> cn_gene.dat',
		exon2gene_batch.main,
		(baseDir, baseDir, mysetting.refFlatH[server][genome], mysetting.cs_gene, False),
		'.cn_gene.log',
		lastEntryHas('VHL'),
		['cn_gene.dat'])

	# Earlier variant of the plot callable, kept for reference:
	#   'fun': drawCNATraj_batch.batch
	plotStep = makeStep(
		'Draw Plot',
		'seg->plot',
		drawCNATraj_batch.draw_single,
		(baseDir, '/EQL1/NSL/WXS/results/CNA', genome),
		'',
		alwaysTrue,
		[])

	return [bedStep, rpkmStep, ratioStep, geneStep, plotStep]
Пример #18
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Assemble the step list for the fastq -> mutscan pipeline.

    Active steps, in list order: FastQC, BWA, MarkDuplicate/ReadGroup,
    Realign, Pileup_proc and MutScan.  Further steps are kept below as
    commented-out reference entries.

    baseDir -- handed to every step (see each 'paramL')
    server  -- used, together with genome, to select entries of
               mysetting.bwaIndexH, mysetting.ucscRefH and mysetting.dbsnpH
    genome  -- see server
    """
    mybasic.add_module_path(['NGS/fastq', 'NGS/align', 'NGS/mutation'])
    ## MODULES
    import bwa_batch
    import markDuplicates_batch
    import realign_batch
    import pileup_batch
    import procPileup_split_batch
    import mutScan_batch
    import mutscan_snp_cosmic_batch
    import fastqc_batch
    import annotate_mutscan_batch
    import annotate_join_cosmic_batch
    import vep_mutscan_batch
    import mutect_batch
    import somaticindeldetector_batch

    def lastEntryHas(token):
        # Predicate factory: non-empty log whose last entry contains token.
        return lambda logL: len(logL) > 0 and token in logL[-1]

    def makeStep(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
        # Regular steps share an empty keyword set and clean/rerun switched off.
        return {
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
            'clean': False,
            'rerun': False,
        }

    ## PARAMETERS
    # FastQC is spelled out because it is the only step with 'outLinkPostFix'.
    fastqcStep = {
        'name': 'FastQC',
        'desc': 'QC for fastq',
        'fun': fastqc_batch.fastqc_batch,
        'paramL': (baseDir, '(.*)\.[12]\.fq\.gz', baseDir, baseDir),
        'paramH': {},
        'logPostFix': '.fastqc.qlog',
        'logExistsFn': lastEntryHas('Analysis complete'),
        'outFilePostFix': ['_fastqc.zip'],
        'outLinkPostFix': ['_fastqc/fastqc_report.html'],
        'clean': False,
        'rerun': False,
    }

    bwaStep = makeStep(
        'BWA',
        'fq -> sam -> bam -> sorted.bam',
        bwa_batch.align,
        (baseDir, baseDir, '(.*)\.[12]\.fq.gz', 10, 5000000000, False,
         mysetting.bwaIndexH[server][genome], True),
        '.bwa.qlog',
        lastEntryHas('bam_sort_core'),
        ['sorted.bam'])

    dedupStep = makeStep(
        'MarkDuplicate/ReadGroup',
        'sorted.bam -> dedup.bam -> RG.bam',
        markDuplicates_batch.main,
        (baseDir, baseDir, False),
        '.dedup.qlog',
        lastEntryHas('totalMemory()'),
        ['RG.bam'])

    realignStep = makeStep(
        'Realign',
        'RG.bam -> realign.bam -> recal.bam',
        realign_batch.main,
        (baseDir, baseDir, False, mysetting.ucscRefH[server][genome],
         mysetting.dbsnpH[server][genome]),
        '.realign.qlog',
        lastEntryHas('Uploaded run'),
        ['recal.bam'])

    # Disabled step, kept for reference:
    #   {
    #   'name': 'Pileup',
    #   'desc': 'recal.bam -> pileup',
    #   'fun': pileup_batch.main,
    #   'paramL': (baseDir, baseDir, False, mysetting.ucscRefH[server][genome]),
    #   'paramH': {},
    #   'logPostFix': '.pileup.log',
    #   'logExistsFn': lambda x: len(x)>0 and 'Set max' in x[-1],
    #   'outFilePostFix': ['pileup'],
    #   'clean': False,
    #   'rerun': False
    #   },

    pileupProcStep = makeStep(
        'Pileup_proc',
        'recal.bam -> pileup -> pileup_proc',
        procPileup_split_batch.main,
        (baseDir, baseDir, mysetting.ucscRefH[server][genome], False),
        '.pileup_proc.log',
        lastEntryHas('Success'),
        ['pileup_proc', 'pileup.gz'])

    mutscanStep = makeStep(
        'MutScan',
        'pileup_proc -> mutscan',
        mutScan_batch.main,
        (baseDir, baseDir, False),
        '.mutscan.log',
        lastEntryHas('Success'),
        ['mutscan'])

    # temporarily off
    #   {
    #   'name': 'MuTect',
    #   'desc': 'recal.bam -> .vcf',
    #   'fun': mutect_batch.mutect_PON,
    #   'paramL': (baseDir, genome, server, False),
    #   'paramH': {},
    #   'logPostFix': '.mutect_single.log',
    #   'logExistsFn': lambda x: 'done' in x[-9],
    #   'outFilePostFix': ['.mutect.vcf','.mutect'],
    #   'clean': False,
    #   'rerun': False
    #   },
    #
    #   {
    #   'name': 'SomaticIndelDetector',
    #   'desc': 'recal.bam -> indels.vcf -> indels_filter.vcf',
    #   'fun': somaticindeldetector_batch.single_mode,
    #   'paramL': (baseDir, baseDir, 'SS', genome, server, False),
    #   'paramH': {},
    #   'logPostFix': '.somaticindeldetector.log',
    #   'logExistsFn': lambda x: ('chrX' in x[-1] or 'chrX' in x[-2]),
    #   'outFilePostFix': ['indels_filter.vcf','indels_filter.out'],
    #   'clean': False,
    #   'rerun': False
    #   },

    #   {## old cosmic join
    #   'name': 'mutscan_snp_cosmic',
    #   'desc': 'mutscan -> cosmic.dat',
    #   'fun': mutscan_snp_cosmic_batch.main,
    #   'paramL': (baseDir, server),
    #   'paramH': {},
    #   'logPostFix': '.cosmic.log',
    #   'logExistsFn': lambda x: len(x) == 0,
    #   'outFilePostFix': ['cosmic.dat'],
    #   'clean': False,
    #   'rerun': False
    #   },
    #
    #   {
    #   'name': 'VEP annotation',
    #   'desc': 'Annotate mutscan output',
    #   'fun': vep_mutscan_batch.main,
    #   'paramL': ([baseDir]),
    #   'paramH': {},
    #   'logPostFix': '.mutscan_vep.log',
    #   'logExistsFn': lambda x: len(x)>0 and 'Finished!' in x[-1],
    #   'outFilePostFix': ['mutscan_vep_out.vcf'],
    #   'clean': False,
    #   'rerun': False
    #   },

    ## join cosmic
    #   {
    #   'name': 'Join Cosmic',
    #   'desc': 'Join annotated mutscan output with COSMIC',
    #   'fun': annotate_join_cosmic_batch.main,
    #   'paramL': (baseDir, '(.*)\.vep$', baseDir),
    #   'paramH': {},
    #   'logPostFix': '_splice.mutscan.cosmic.log',
    #   'logExistsFn': lambda x: len(x)==0,
    #   'outFilePostFix': ['_cosmic.dat'],
    #   'clean': False,
    #   'rerun': False
    #   },

    #   {
    #   'name': 'Cleanup',
    #   'desc': 'remove all, but logs and designated result file',
    #   'fun': cleanup.main,
    #   'paramL': (baseDir,),
    #   'paramH': {},
    #   'logPostFix': 'cleanup.qlog',
    #   'logExistsFn': lambda x: False,
    #   'outFilePostFix': ['pileup']
    #   },

    return [
        fastqcStep,
        bwaStep,
        dedupStep,
        realignStep,
        pileupProcStep,
        mutscanStep,
    ]
Пример #19
0
#!/usr/bin/python
## integration into DB (per sample)

import sys, os
import mymysql, mypipe, mybasic
from mysetting import mysqlH
mybasic.add_module_path(['NGS/expression','Integration'])
import rpkm_process, prepDB_rpkm_gene_expr, boxplot_expr_cs_gene

def post_s_rsq2expr(baseDir, server='smc1', dbN='ihlee_test'):
	"""Build the expression files for one RNA-Seq sample and load them into the DB.

	Only the opening of this function is visible here; the sample_tag update
	appears to continue beyond this excerpt.

	baseDir -- sample directory; its last path component is the sample name
	server  -- key into mysqlH selecting the MySQL credentials
	dbN     -- database name; also decides where the .gct/.dat files are written
	"""
	sampN = baseDir.split('/')[-1]
	# Drop the last four characters ('_RSq') and replace '-' and '.' with '_'.
	sid = sampN[:-4].replace('-','_').replace('.','_') ##drop '_RSq'

	# For these two databases the files go to the shared expression directory;
	# otherwise they are written next to the sample data in baseDir.
	if dbN in ['ihlee_test','ircr1']:
		gctFileN = '/EQL1/NSL/RNASeq/results/expression/%s.gct' % sampN
		datFileN = '/EQL1/NSL/RNASeq/results/expression/%s.dat' % sampN
	else:
		gctFileN = '%s/%s.gct' % (baseDir, sampN)
		datFileN = '%s/%s.dat' % (baseDir, sampN)
	print sampN, gctFileN
	rpkm_process.rpkm_process(inputDirN=baseDir, filePattern='*.rpkm', sampRegex='(.*)_RSq\.rpkm', outputFileN=gctFileN)
	## prep
	prepDB_rpkm_gene_expr.main(inGctFileName=gctFileN, geneList=[], samplePrefix='', outDatFileName=datFileN)
	## import
	(con, cursor) = mymysql.connectDB(user=mysqlH[server]['user'],passwd=mysqlH[server]['passwd'],db=dbN,host=mysqlH[server]['host'])
	# Replace this sample's rows: delete the old ones, then bulk-load datFileN.
	# NOTE(review): the statements below are built by string formatting.
	cursor.execute('DELETE FROM rpkm_gene_expr WHERE samp_id="%s"' % sid)
	cursor.execute('LOAD DATA LOCAL INFILE "%s" INTO TABLE rpkm_gene_expr' % datFileN)
	# Recreate the log2(rpkm+1) view on every call.
	cursor.execute('DROP VIEW IF EXISTS rpkm_gene_expr_lg2')
	cursor.execute('CREATE VIEW rpkm_gene_expr_lg2 AS SELECT samp_id,gene_sym,log2(rpkm+1) AS lg2_rpkm FROM rpkm_gene_expr')
	## make sure to update sample_tag that this sample has RNA-Seq
	cursor.execute('SELECT * FROM sample_tag WHERE samp_id="%s" AND tag="RNA-Seq"' % sid)
Пример #20
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Return the step specifications for the gsnap translocation/fusion post-processing.

	Every step is a dict with the keys name, desc, fun, paramL, paramH, logPostFix,
	logExistsFn, outFilePostFix, clean and rerun.  baseDir is handed to each step
	as both its first and second positional argument; server and genome are
	accepted but not referenced by any step built here.
	"""

	mybasic.add_module_path(['NGS/align','NGS/splice_gsnap/fusion'])
	import gsnap_splice_batch, fusion_filter_transloc_batch, fusion_filter_annot1_batch, fusion_proc_sort_batch, fusion_proc_annot_batch ## MODULES

	def resultsOnLastLine(minLines):
		# log check: at least minLines lines and 'Results' on the last one
		return lambda logL: len(logL)>=minLines and 'Results' in logL[-1]

	def logIsEmpty(logL):
		# log check: the step wrote nothing to its log
		return len(logL)==0

	def makeStep(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
		# all steps of this pipeline share an empty paramH and clean/rerun switched off
		return {
			'name': name,
			'desc': desc,
			'fun': fun,
			'paramL': paramL,
			'paramH': {},
			'logPostFix': logPostFix,
			'logExistsFn': logExistsFn,
			'outFilePostFix': outFilePostFix,
			'clean': False,
			'rerun': False,
			}

	return [ ## PARAMETERS
#		{
#		'name': 'Align',
#		'desc': 'fastq -> splice.gsnap',
#		'fun': gsnap_splice_batch.align,
#		'paramL':(baseDir, baseDir, 6, False),
#		'paramH': {},
#		'logPostFix': '.gsnap.qlog',
#		'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1],
#		'outFilePostFix': ['splice.gsnap'],
#		'clean': False,
#		'rerun': False
#		},
#
		makeStep(
			'Filter transloc',
			'splice.gsnap.gz -> splice_transloc.gsnap',
			fusion_filter_transloc_batch.fusion_filter_batch,
			(baseDir, baseDir, False),
			'.ft_tloc.qlog',
			resultsOnLastLine(1),
			['splice_transloc.gsnap']),

		makeStep(
			'annotate',
			'splice_transloc.gsnap -> splice_transloc_annot1.gsnap',
			fusion_filter_annot1_batch.main,
			(baseDir, baseDir, False),
			'.annot.qlog',
			resultsOnLastLine(2), # this step requires more than one log line
			['splice_transloc_annot1.gsnap']),

		makeStep(
			'sort',
			'splice_transloc_annot1.gsnap -> splice_transloc_annot1.sorted.gsnap and gnerate report.txt',
			fusion_proc_sort_batch.main,
			(baseDir, baseDir, False),
			'.sort.qlog',
			logIsEmpty,
			['splice_transloc_annot1.sorted.gsnap','splice_transloc_annot1.report.txt']),

		makeStep(
			'annotate report',
			'report.txt -> report_annot.txt',
			fusion_proc_annot_batch.fusion_proc_annot_batch,
			(baseDir, baseDir, None, False),
			'.report_annot.qlog',
			logIsEmpty,
			['splice_transloc_annot1.report_annot.txt']),

#		{
#		'name': 'Summarize',
#		'desc': '',
#		'fun': ,
#		'paramL': (baseDir, baseDir, False),
#		'paramH': {},
#		'logPostFix': 'realign.qlog',
#		'logExistsFn': lambda x: len(x)>0 and 'Uploaded run' in x[-1],
#		'outFilePostFix': ['realign.bam', 'recal.bam'],
#		'clean': False,
#		'rerun': False
#		},

		]
Пример #21
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the ordered step specifications for the single-sample
    fastq -> recal.bam -> MuTect / SomaticIndelDetector pipeline.

    baseDir -- sample directory, handed to every step as its input/output dir
    server  -- key used (with genome) to look up paths in mysetting; also
               passed on to the two mutation callers
    genome  -- genome build key used together with server

    Each step is a dict.  'logExistsFn' takes the step's log, which is
    len()'d and indexed here -- presumably a list of lines, confirm against
    mypipe -- and returns a truth value.  Every check is guarded so that an
    empty or short log yields False instead of raising IndexError.
    """

    mybasic.add_module_path(['NGS/fastq', 'NGS/align', 'NGS/mutation'])
    import bwa_batch, markDuplicates_batch, realign_batch, procPileup_split_batch, mutScan_batch  ## MODULES
    import fastqc_batch, vep_mutect_batch, mutect_batch, somaticindeldetector_batch

    return [  ## PARAMETERS
        {
            'name': 'FastQC',
            'desc': 'QC for fastq',
            'fun': fastqc_batch.fastqc_batch,
            # raw string: same pattern text, without invalid-escape warnings
            'paramL': (baseDir, r'(.*)\.[12]\.fq\.gz', baseDir, baseDir),
            'paramH': {},
            'logPostFix': '.fastqc.qlog',
            'logExistsFn': lambda x: len(x) > 0 and 'Analysis complete' in x[-1],
            'outFilePostFix': ['_fastqc.zip'],
            'outLinkPostFix': ['_fastqc/fastqc_report.html'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'BWA',
            'desc': 'fq -> sam -> bam -> sorted.bam',
            'fun': bwa_batch.align,
            # NOTE(review): the dot before 'gz' is unescaped here (FastQC above
            # escapes it); the pattern text is kept exactly as it was.
            'paramL': (baseDir, baseDir, r'(.*)\.[12]\.fq.gz', 10, 5000000000,
                       False, mysetting.bwaIndexH[server][genome], True),
            'paramH': {},
            'logPostFix': '.bwa.qlog',
            'logExistsFn': lambda x: len(x) > 0 and 'bam_sort_core' in x[-1],
            'outFilePostFix': ['sorted.bam'],
            'clean': True,
            'rerun': False
        },
        {
            'name': 'MarkDuplicate/ReadGroup',
            'desc': 'sorted.bam -> dedup.bam',
            'fun': markDuplicates_batch.main,
            'paramL': (baseDir, baseDir, False),
            'paramH': {},
            'logPostFix': '.dedup.qlog',
            'logExistsFn': lambda x: len(x) > 0 and 'totalMemory()' in x[-1],
            'outFilePostFix': ['dedup.bam'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'Realign',
            'desc': 'dedup.bam -> realign.bam -> recal.bam',
            'fun': realign_batch.main,
            'paramL': (baseDir, baseDir, False,
                       mysetting.ucscRefH[server][genome],
                       mysetting.dbsnpH[server][genome]),
            'paramH': {},
            'logPostFix': '.realign.qlog',
            'logExistsFn': lambda x: len(x) > 0 and 'Uploaded run' in x[-1],
            'outFilePostFix': ['recal.bam'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'MuTect',
            'desc': 'recal.bam -> .vcf',
            'fun': mutect_batch.mutect_PON,
            'paramL': (baseDir, genome, server, False),
            'paramH': {},
            'logPostFix': '.mutect_single.log',
            # the marker is looked for on the 9th line from the end; a log
            # shorter than that cannot contain it
            'logExistsFn': lambda x: len(x) >= 9 and 'done' in x[-9],
            'outFilePostFix': ['.mutect.vcf', '.mutect_filter.vcf', '.mutect'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'SomaticIndelDetector',
            'desc': 'recal.bam -> indels.vcf -> indels_filter.vcf',
            'fun': somaticindeldetector_batch.single_mode,
            'paramL': (baseDir, baseDir, 'CS', genome, server, False),
            'paramH': {},
            'logPostFix': '.somaticindeldetector.log',
            # 'chrX' on either of the last two lines; slicing tolerates logs
            # with fewer than two lines
            'logExistsFn': lambda x: any('chrX' in line for line in x[-2:]),
            'outFilePostFix': ['indels_filter.vcf', 'indels_filter.out'],
            'clean': False,
            'rerun': False
        },

        #		{ ## keep dying while trying to fork (when using PBS, even with --fork 2). It's better to annotate in a single batch (take it to post- pipeline?)
        #		'name': 'VEP',
        #		'desc': '.vcf -> .dat',
        #		'fun': vep_mutect_batch.main,
        #		'paramL': ([baseDir], False),
        #		'paramH': {},
        #		'logPostFix': 'mutect_vep.log',
        #		'logExistsFn': lambda x: len(x) > 0 and 'Finished!' in x[-1],
        #		'outFilePostFix': ['_vep.dat'],
        #		'clean': False,
        #		'rerun': False
        #		}

        #		{
        #		'name': 'Cleanup',
        #		'desc': 'remove all, but logs and designated result file',
        #		'fun': cleanup.main,
        #		'paramL': (baseDir,),
        #		'paramH': {},
        #		'logPostFix': 'cleanup.qlog',
        #		'logExistsFn': lambda x: False,
        #		'outFilePostFix': ['pileup']
        #		},
    ]
Пример #22
0
#!/usr/bin/python

import sys, os, glob, getopt
import mybasic, mysetting

mybasic.add_module_path(['utils'])

import link_fqgz_hj

# linking
# NOTE(review): runs at import time with hard-coded source/target directories;
# the third argument is a regex with one capture group (three digits followed by
# I, T or N) -- how link() uses it is not visible here, confirm in link_fqgz_hj.
link_fqgz_hj.link('/EQL1/NSL/WXS/fastq', '/EQL1/NSL/WXS/exome_20130529',
                  '.*([0-9]{3})[ITN].*')

# listing directories
# every entry directly under the link target directory; main() iterates this list
dir_list = glob.glob('/EQL1/NSL/WXS/exome_20130529/*')

#for dir_name in dir_list


def main(pbs=False):
    """Report the module-level dir_list, create the project's log directory
    and walk over every entry of dir_list.

    pbs is not referenced in the portion of the body shown here.
    """

    # same text as the py2 statement `print dir_list, len(dir_list)`
    print('%s %s' % (dir_list, len(dir_list)))
    projectName = 'heejin_20'
    logDirCmd = 'mkdir /var/www/html/pipeline_logs/%s' % projectName
    os.system(logDirCmd)

    for single_dir in dir_list:

        # sample name = last path component of the directory
        sampN = single_dir.rsplit('/', 1)[-1]

        #		if sampN not in ['S012_T_SS']:
        #			continue
Пример #23
0
#!/usr/bin/python

from glob import glob
import sys, os, re
import mysetting, mymysql, mypipe, mybasic
mybasic.add_module_path(['Integration', 'NGS/mutation'])
import prepDB_mutation_normal, makeDB_mutation_rxsq, vep_batch


def prep_single(outFileN, server='smc1', dbN='ircr1'):
    """Collect the *cosmic.dat files of the mutscan directories and the XSeq
    sample ids from the DB, then parse each cosmic file name.

    outFileN -- not referenced in the portion of the body shown here
    server   -- key into mysetting.mysqlH selecting user/passwd/host
    dbN      -- database name to connect to

    NOTE(review): the body visible here stops inside the per-file loop.
    """
    (con,
     cursor) = mymysql.connectDB(user=mysetting.mysqlH[server]['user'],
                                 passwd=mysetting.mysqlH[server]['passwd'],
                                 db=dbN,
                                 host=mysetting.mysqlH[server]['host'])
    # gather cosmic.dat files one and two levels below each mutscan directory,
    # skipping any path that contains '_B_'
    cosmicL = []
    for dir in mysetting.wxsMutscanDirL:
        cosmicL += filter(
            lambda x: '_B_' not in x,
            glob('%s/*/*cosmic.dat' % dir) + glob('%s/*cosmic.dat' % dir))

    # sample ids that carry at least one tag starting with "XSeq_"
    cursor.execute(
        'SELECT DISTINCT samp_id FROM sample_tag WHERE tag LIKE "XSeq_%%"')
    results = cursor.fetchall()
    sidL = []
    for res in results:
        sidL.append(res[0])
    for cosmic in cosmicL:
        # file name layout: <sid>_<X|T + up to 2 chars>_<2 letters of STKN>_cosmic.dat
        # NOTE(review): re.match() returns None for a non-matching name, which
        # would make .groups() raise AttributeError
        (sid, postfix,
         platform) = re.match('(.*)_([XT].{,2})_([STKN]{2})_cosmic.dat',
                              os.path.basename(cosmic)).groups()
Пример #24
0
def genSpec(baseDir, server="smc1", genome="hg19"):
    """Return the step specifications for the gsnap exon-skipping post-processing.

    Every step is a dict with the keys name, desc, fun, paramL, paramH,
    logPostFix, logExistsFn, outFilePostFix, clean and rerun.  baseDir is
    handed to each step as its first two positional arguments; server and
    genome are accepted but not referenced by any step built here.
    """

    mybasic.add_module_path(["NGS/align", "NGS/splice_gsnap/skipping"])
    import gsnap_splice_batch, exonSkip_filter_batch, exonSkip_filter_normal_batch, exonSkip_sort_batch, exonSkip_normal_sort_batch, exonSkip_proc_annot_batch  ## MODULES

    def endsWithResults(logL):
        # log check: non-empty log whose last line mentions "Results"
        return len(logL) > 0 and "Results" in logL[-1]

    def isEmptyLog(logL):
        # log check: the step wrote nothing to its log
        return len(logL) == 0

    dirArgs = (baseDir, baseDir, False)

    # one row per step; columns follow fieldL
    fieldL = ("name", "desc", "fun", "paramL", "logPostFix", "logExistsFn", "outFilePostFix")
    rowL = [  ## PARAMETERS
        # 		{
        # 		'name': 'Align',
        # 		'desc': 'fastq -> splice.gsnap',
        # 		'fun': gsnap_splice_batch.align,
        # 		'paramL':(baseDir, baseDir, 6, False),
        # 		'paramH': {},
        # 		'logPostFix': 'gsnap.qlog',
        # 		'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1],
        # 		'outFilePostFix': ['splice.gsnap'],
        # 		'clean': False,
        # 		'rerun': False
        # 		},
        ("Filter exonskip",
         "splice.gsnap.gz -> splice_exonSkip.gsnap",
         exonSkip_filter_batch.exonSkip_filter_batch,
         dirArgs,
         ".exonSkip.qlog",
         endsWithResults,
         ["splice_exonSkip.gsnap"]),
        ("Filter normal exonskip",
         "splice.gsnap -> splice_exonSkip_normal.gsnap.gz",
         exonSkip_filter_normal_batch.exonSkip_filter_batch,
         dirArgs,
         ".exonSkip_normal.qlog",
         endsWithResults,
         ["splice_exonSkip_normal.gsnap.gz"]),
        ("sort",
         "splice_exonSkip.gsnap -> splice_exonSkip_report.txt",
         exonSkip_sort_batch.main,
         dirArgs,
         ".sort.qlog",
         isEmptyLog,
         ["splice_exonSkip_report.txt"]),
        ("sort-normal",
         "splice_exonSkip_normal.gsnap.gz -> splice_exonSkip_normal_report.txt",
         exonSkip_normal_sort_batch.main,
         dirArgs,
         ".sort_normal.qlog",
         isEmptyLog,
         ["splice_exonSkip_normal_report.txt"]),
        ("annotate report",
         "report.txt -> report_annot.txt",
         exonSkip_proc_annot_batch.exonSkip_proc_annot_batch,
         (baseDir, baseDir, None, False),
         ".skip_annot.qlog",
         isEmptyLog,
         ["splice_exonSkip_report_annot.txt"]),
        # 		{
        # 		'name': 'link',
        # 		'desc': 'put all report_annot.txt files in a directory',
        # 		'fun': exonSkip_link.link,
        # 		'paramL': (baseDir, '/EQL1/NSL/RNASeq/results/exonSkip'),
        # 		'paramH': {},
        # 		'logPostFix': 'link.qlog',
        # 		'logExistsFn': lambda x: len(x)==0,
        # 		'outFilePostFix': ['splice_exonSkip_report_annot.txt'],
        # 		'clean': False,
        # 		'rerun': False
        # 		},
        #
        # 		{
        # 		'name': 'link-normal',
        # 		'desc': 'put all report_normal.txt files in a directory',
        # 		'fun': exonSkip_link_normal.link,
        # 		'paramL': (baseDir, '/EQL1/NSL/RNASeq/results/exonSkip_normal'),
        # 		'paramH': {},
        # 		'logPostFix': 'link_normal.qlog',
        # 		'logExistsFn': lambda x: len(x)==0,
        # 		'outFilePostFix': ['splice_exonSkip_normal_report.txt'],
        # 		'clean': False,
        # 		'rerun': False
        # 		},
    ]

    specL = []
    for row in rowL:
        spec = dict(zip(fieldL, row))
        # settings shared by every step: fresh empty paramH, clean/rerun off
        spec["paramH"] = {}
        spec["clean"] = False
        spec["rerun"] = False
        specL.append(spec)
    return specL
Пример #25
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Return the step specifications for the fastq -> recal.bam -> pileup -> mutscan pipeline.

	baseDir -- sample directory handed to every step
	server  -- together with genome, selects entries of mysetting.bwaIndexH,
	           mysetting.ucscRefH and mysetting.dbsnpH
	genome  -- genome build key

	Every step is a dict with the keys name, desc, fun, paramL, paramH,
	logPostFix, logExistsFn, outFilePostFix, clean and rerun (FastQC additionally
	carries outLinkPostFix).  Disabled steps are kept as comments in the list.
	"""

	mybasic.add_module_path(['NGS/fastq','NGS/align','NGS/mutation'])
	import bwa_batch, markDuplicates_batch, realign_batch, pileup_batch, procPileup_split_batch, mutScan_batch, mutscan_snp_cosmic_batch ## MODULES
	import fastqc_batch, annotate_mutscan_batch, annotate_join_cosmic_batch, vep_mutscan_batch, mutect_batch, somaticindeldetector_batch

	def lastLineHas(marker):
		# log check: non-empty log whose last line contains marker
		return lambda logL: len(logL)>0 and marker in logL[-1]

	def step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix, **extraH):
		# all active steps share an empty paramH and clean/rerun switched off
		specH = {
			'name': name,
			'desc': desc,
			'fun': fun,
			'paramL': paramL,
			'paramH': {},
			'logPostFix': logPostFix,
			'logExistsFn': logExistsFn,
			'outFilePostFix': outFilePostFix,
			'clean': False,
			'rerun': False,
			}
		specH.update(extraH)
		return specH

	fastqcStep = step(
		'FastQC', 'QC for fastq',
		fastqc_batch.fastqc_batch,
		(baseDir, '(.*)\.[12]\.fq\.gz', baseDir, baseDir),
		'.fastqc.qlog', lastLineHas('Analysis complete'),
		['_fastqc.zip'],
		outLinkPostFix=['_fastqc/fastqc_report.html'])

	bwaStep = step(
		'BWA', 'fq -> sam -> bam -> sorted.bam',
		bwa_batch.align,
		(baseDir, baseDir, '(.*)\.[12]\.fq.gz', 10, 5000000000, False, mysetting.bwaIndexH[server][genome], True),
		'.bwa.qlog', lastLineHas('bam_sort_core'),
		['sorted.bam'])

	dedupStep = step(
		'MarkDuplicate/ReadGroup', 'sorted.bam -> dedup.bam -> RG.bam',
		markDuplicates_batch.main,
		(baseDir, baseDir, False),
		'.dedup.qlog', lastLineHas('totalMemory()'),
		['RG.bam'])

	realignStep = step(
		'Realign', 'RG.bam -> realign.bam -> recal.bam',
		realign_batch.main,
		(baseDir, baseDir, False, mysetting.ucscRefH[server][genome], mysetting.dbsnpH[server][genome]),
		'.realign.qlog', lastLineHas('Uploaded run'),
		['recal.bam'])

	pileupProcStep = step(
		'Pileup_proc', 'recal.bam -> pileup -> pileup_proc',
		procPileup_split_batch.main,
		(baseDir, baseDir, mysetting.ucscRefH[server][genome], False),
		'.pileup_proc.log', lastLineHas('Success'),
		['pileup_proc','pileup.gz'])

	mutscanStep = step(
		'MutScan', 'pileup_proc -> mutscan',
		mutScan_batch.main,
		(baseDir, baseDir, False),
		'.mutscan.log', lastLineHas('Success'),
		['mutscan'])

	return [ ## PARAMETERS
		fastqcStep,
		bwaStep,
		dedupStep,
		realignStep,

#		{
#		'name': 'Pileup',
#		'desc': 'recal.bam -> pileup',
#		'fun': pileup_batch.main,
#		'paramL': (baseDir, baseDir, False, mysetting.ucscRefH[server][genome]),
#		'paramH': {},
#		'logPostFix': '.pileup.log',
#		'logExistsFn': lambda x: len(x)>0 and 'Set max' in x[-1],
#		'outFilePostFix': ['pileup'],
#		'clean': False,
#		'rerun': False
#		},

		pileupProcStep,
		mutscanStep,

# temporarily off
#		{
#		'name': 'MuTect',
#		'desc': 'recal.bam -> .vcf',
#		'fun': mutect_batch.mutect_PON,
#		'paramL': (baseDir, genome, server, False),
#		'paramH': {},
#		'logPostFix': '.mutect_single.log',
#		'logExistsFn': lambda x: 'done' in x[-9],
#		'outFilePostFix': ['.mutect.vcf','.mutect'],
#		'clean': False,
#		'rerun': False
#		},
#
#		{
#		'name': 'SomaticIndelDetector',
#		'desc': 'recal.bam -> indels.vcf -> indels_filter.vcf',
#		'fun': somaticindeldetector_batch.single_mode,
#		'paramL': (baseDir, baseDir, 'SS', genome, server, False),
#		'paramH': {},
#		'logPostFix': '.somaticindeldetector.log',
#		'logExistsFn': lambda x: ('chrX' in x[-1] or 'chrX' in x[-2]),
#		'outFilePostFix': ['indels_filter.vcf','indels_filter.out'],
#		'clean': False,
#		'rerun': False
#		},

#		{## old cosmic join
#		'name': 'mutscan_snp_cosmic',
#		'desc': 'mutscan -> cosmic.dat',
#		'fun': mutscan_snp_cosmic_batch.main,
#		'paramL': (baseDir, server),
#		'paramH': {},
#		'logPostFix': '.cosmic.log',
#		'logExistsFn': lambda x: len(x) == 0,
#		'outFilePostFix': ['cosmic.dat'],
#		'clean': False,
#		'rerun': False
#		},
#
#		{
#		'name': 'VEP annotation',
#		'desc': 'Annotate mutscan output',
#		'fun': vep_mutscan_batch.main,
#		'paramL': ([baseDir]),
#		'paramH': {},
#		'logPostFix': '.mutscan_vep.log',
#		'logExistsFn': lambda x: len(x)>0 and 'Finished!' in x[-1],
#		'outFilePostFix': ['mutscan_vep_out.vcf'],
#		'clean': False,
#		'rerun': False
#		},

## join cosmic
#		{
#		'name': 'Join Cosmic',
#		'desc': 'Join annotated mutscan output with COSMIC',
#		'fun': annotate_join_cosmic_batch.main,
#		'paramL': (baseDir, '(.*)\.vep$', baseDir),
#		'paramH': {},
#		'logPostFix': '_splice.mutscan.cosmic.log',
#		'logExistsFn': lambda x: len(x)==0,
#		'outFilePostFix': ['_cosmic.dat'],
#		'clean': False,
#		'rerun': False
#		},

#		{
#		'name': 'Cleanup',
#		'desc': 'remove all, but logs and designated result file',
#		'fun': cleanup.main,
#		'paramL': (baseDir,),
#		'paramH': {},
#		'logPostFix': 'cleanup.qlog',
#		'logExistsFn': lambda x: False,
#		'outFilePostFix': ['pileup']
#		},

		]
Пример #26
0
#!/usr/bin/python

import sys, os, re
import mysetting, mybasic

mybasic.add_module_path(['NGS/pipeline','NGS/mutation'])
import mutect_batch, somaticindeldetector_batch

import mypipe

# trio table read from the clinical trio file, resolved against the WXS bam directories
# NOTE(review): each trioH[tid] is indexed below with 'norm_id' (a list); the
# commented-out debug loop also reads 'prim_id', 'recur_id' and the role names.
bamDirL = mysetting.wxsBamDirL
trioH = mypipe.read_trio('/EQL1/NSL/clinical/trio_info.txt', bamDirL)

#for tid in sorted(trioH.keys()):
#	if tid not in ['59','60','61']:
#		continue
#	print tid, trioH[tid]['prim_id'], trioH[tid]['recur_id']
#	for role in ['Normal','Primary','Recurrent']:
#		print role,trioH[tid][role]
#sys.exit(1)

outDir='/EQL3/pipeline/somatic_mutect'

## assume 1 primary & normal per trio
for tid in trioH:
	# trios without a normal sample are skipped
	if trioH[tid]['norm_id'] == []:
		continue
	# hard-coded filter: only trio '63' is processed
	if tid not in ['63']:
		continue

	# first (assumed only) normal sample id of the trio
	norm = trioH[tid]['norm_id'][0]
Пример #27
0
#!/usr/bin/python

import sys, os
import mymysql, mypipe, mybasic
from mysetting import mysqlH

mybasic.add_module_path(["Integration"])
import prepDB_mutscan, makeDB_mutation_rxsq


def post_s_rsq2mut(baseDir, server="smc1", dbN="ihlee_test"):
    """Prepare the mutation .dat file of one sample directory and reload it into the DB.

    baseDir -- sample directory; its last path component is taken as the sample name
    server  -- key into mysqlH selecting user/passwd/host for the connection
    dbN     -- database name; for "ihlee_test" and "ircr1" the .dat file goes to the
               shared results directory, otherwise it is written into baseDir

    Nothing is done when <baseDir>/<sample>_splice_cosmic.dat does not exist.
    NOTE(review): the portion shown here stops right after the sample_tag SELECT.
    """
    sampN = baseDir.split("/")[-1]
    # DB sample id: sample name minus its last 4 characters, with '.' and '-' mapped to '_'
    sid = sampN[:-4].replace(".", "_").replace("-", "_")
    print sampN, sid

    cosmicDatFileN = "%s/%s_splice_cosmic.dat" % (baseDir, sampN)
    if dbN in ["ihlee_test", "ircr1"]:
        datFileN = "/EQL1/NSL/RNASeq/results/mutation/%s.dat" % sampN
    else:
        datFileN = "%s/%s.dat" % (baseDir, sampN)
    if os.path.isfile(cosmicDatFileN):
        # cosmicDatFileN -> datFileN (the file loaded into the table below)
        prepDB_mutscan.main(sampNamePat=("(.*)_(RSq)", ""), geneList=[], inFileN=cosmicDatFileN, outFileN=datFileN)

        ## import
        (con, cursor) = mymysql.connectDB(
            user=mysqlH[server]["user"], passwd=mysqlH[server]["passwd"], db=dbN, host=mysqlH[server]["host"]
        )
        # replace this sample's rows: delete the old ones, then bulk-load the new .dat file
        # NOTE(review): the statements are built by string interpolation; sid and datFileN derive from baseDir
        cursor.execute('DELETE FROM mutation_rsq WHERE samp_id="%s"' % sid)
        cursor.execute('LOAD DATA LOCAL INFILE "%s" INTO TABLE mutation_rsq' % datFileN)
        ## make sure to update sample_tag that this sample has RNA-Seq
        cursor.execute('SELECT * FROM sample_tag WHERE samp_id="%s" AND tag="RNA-Seq"' % sid)
Пример #28
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Return the ordered step specifications for the single-sample
	fastq -> recal.bam -> MuTect / SomaticIndelDetector pipeline.

	baseDir -- sample directory, handed to every step as its input/output dir
	server  -- key used (with genome) to look up paths in mysetting; also
	           passed on to the two mutation callers
	genome  -- genome build key used together with server

	Each step is a dict.  'logExistsFn' takes the step's log, which is len()'d
	and indexed here -- presumably a list of lines, confirm against mypipe --
	and returns a truth value.  Every check is guarded so that an empty or
	short log yields False instead of raising IndexError.
	"""

	mybasic.add_module_path(['NGS/fastq','NGS/align','NGS/mutation'])
	import bwa_batch, markDuplicates_batch, realign_batch, procPileup_split_batch, mutScan_batch ## MODULES
	import fastqc_batch, vep_mutect_batch, mutect_batch, somaticindeldetector_batch

	return [ ## PARAMETERS
		{
		'name': 'FastQC',
		'desc': 'QC for fastq',
		'fun': fastqc_batch.fastqc_batch,
		# raw string: same pattern text, without invalid-escape warnings
		'paramL': (baseDir, r'(.*)\.[12]\.fq\.gz', baseDir, baseDir),
		'paramH': {},
		'logPostFix': '.fastqc.qlog',
		'logExistsFn': lambda x: len(x)>0 and 'Analysis complete' in x[-1],
		'outFilePostFix': ['_fastqc.zip'],
		'outLinkPostFix': ['_fastqc/fastqc_report.html'],
		'clean': False,
		'rerun': False
		},

		{
		'name': 'BWA',
		'desc': 'fq -> sam -> bam -> sorted.bam',
		'fun': bwa_batch.align,
		# NOTE(review): the dot before 'gz' is unescaped here (FastQC above escapes it); pattern text kept as it was
		'paramL': (baseDir, baseDir, r'(.*)\.[12]\.fq.gz', 10, 5000000000, False, mysetting.bwaIndexH[server][genome], True),
		'paramH': {},
		'logPostFix': '.bwa.qlog',
		'logExistsFn': lambda x: len(x)>0 and 'bam_sort_core' in x[-1],
		'outFilePostFix': ['sorted.bam'],
		'clean': True,
		'rerun': False
		},

		{
		'name': 'MarkDuplicate/ReadGroup',
		'desc': 'sorted.bam -> dedup.bam',
		'fun': markDuplicates_batch.main,
		'paramL': (baseDir, baseDir, False),
		'paramH': {},
		'logPostFix': '.dedup.qlog',
		'logExistsFn': lambda x: len(x)>0 and 'totalMemory()' in x[-1],
		'outFilePostFix': ['dedup.bam'],
		'clean': False,
		'rerun': False
		},

		{
		'name': 'Realign',
		'desc': 'dedup.bam -> realign.bam -> recal.bam',
		'fun': realign_batch.main,
		'paramL': (baseDir, baseDir, False, mysetting.ucscRefH[server][genome], mysetting.dbsnpH[server][genome]),
		'paramH': {},
		'logPostFix': '.realign.qlog',
		'logExistsFn': lambda x: len(x)>0 and 'Uploaded run' in x[-1],
		'outFilePostFix': ['recal.bam'],
		'clean': False,
		'rerun': False
		},

		{
		'name': 'MuTect',
		'desc': 'recal.bam -> .vcf',
		'fun': mutect_batch.mutect_PON,
		'paramL': (baseDir, genome, server, False),
		'paramH': {},
		'logPostFix': '.mutect_single.log',
		# the marker is looked for on the 9th line from the end; a shorter log cannot contain it
		'logExistsFn': lambda x: len(x)>=9 and 'done' in x[-9],
		'outFilePostFix': ['.mutect.vcf','.mutect_filter.vcf','.mutect'],
		'clean': False,
		'rerun': False
		},

		{
		'name': 'SomaticIndelDetector',
		'desc': 'recal.bam -> indels.vcf -> indels_filter.vcf',
		'fun': somaticindeldetector_batch.single_mode,
		'paramL': (baseDir, baseDir, 'CS', genome, server, False),
		'paramH': {},
		'logPostFix': '.somaticindeldetector.log',
		# 'chrX' on either of the last two lines; slicing tolerates logs with fewer than two lines
		'logExistsFn': lambda x: any('chrX' in line for line in x[-2:]),
		'outFilePostFix': ['indels_filter.vcf','indels_filter.out'],
		'clean': False,
		'rerun': False
		},

#		{ ## keep dying while trying to fork (when using PBS, even with --fork 2). It's better to annotate in a single batch (take it to post- pipeline?)
#		'name': 'VEP',
#		'desc': '.vcf -> .dat',
#		'fun': vep_mutect_batch.main,
#		'paramL': ([baseDir], False),
#		'paramH': {},
#		'logPostFix': 'mutect_vep.log',
#		'logExistsFn': lambda x: len(x) > 0 and 'Finished!' in x[-1],
#		'outFilePostFix': ['_vep.dat'],
#		'clean': False,
#		'rerun': False
#		}

#		{
#		'name': 'Cleanup',
#		'desc': 'remove all, but logs and designated result file',
#		'fun': cleanup.main,
#		'paramL': (baseDir,),
#		'paramH': {},
#		'logPostFix': 'cleanup.qlog',
#		'logExistsFn': lambda x: False,
#		'outFilePostFix': ['pileup']
#		},

		]
Пример #29
0
#!/usr/bin/python
## integration into DB (per sample)

import sys, os
import mymysql, mypipe, mybasic
from mysetting import mysqlH
mybasic.add_module_path(['NGS/expression', 'Integration'])
import rpkm_process, prepDB_rpkm_gene_expr, boxplot_expr_cs_gene


def post_s_rsq2expr(baseDir, server='smc1', dbN='ihlee_test'):
    """Build the .gct and .dat expression files for one sample directory.

    baseDir -- sample directory; its last path component is the sample name
    server  -- not referenced in the portion of the body shown here
    dbN     -- 'ihlee_test' and 'ircr1' send the output files to the shared
               results directory, any other name keeps them in baseDir
    """
    sampN = baseDir.rsplit('/', 1)[-1]
    sid = sampN[:-4].replace('-', '_').replace('.', '_')  ## sample name without its 4-char '_RSq' tail

    # pick the output directory once, then derive both file names from it
    if dbN in ('ihlee_test', 'ircr1'):
        outDirN = '/EQL1/NSL/RNASeq/results/expression'
    else:
        outDirN = baseDir
    gctFileN = '%s/%s.gct' % (outDirN, sampN)
    datFileN = '%s/%s.dat' % (outDirN, sampN)

    # same text as the py2 statement `print sampN, gctFileN`
    print('%s %s' % (sampN, gctFileN))

    # *.rpkm files under baseDir -> gctFileN
    rpkmArgH = {
        'inputDirN': baseDir,
        'filePattern': '*.rpkm',
        'sampRegex': '(.*)_RSq\.rpkm',
        'outputFileN': gctFileN,
    }
    rpkm_process.rpkm_process(**rpkmArgH)

    ## prep: gctFileN -> datFileN
    prepArgH = {
        'inGctFileName': gctFileN,
        'geneList': [],
        'samplePrefix': '',
        'outDatFileName': datFileN,
    }
    prepDB_rpkm_gene_expr.main(**prepArgH)
    ## import
Пример #30
0
	annotH = {}
	for line in inFile:
		colL = line.rstrip().split('\t')
		rm = re.match('(chr[^:]*):([0-9]*)~([0-9]*)', colL[idxH['locus']])
		(chr,chrSta,chrEnd) = rm.groups()
		ref = colL[idxH['ref']]
		alt = colL[idxH['alt']]
		if (chr,chrSta,chrEnd,ref,alt) not in annotH:
			annotH[(chr,chrSta,chrEnd,ref,alt)] = {}
			for col in ['gene_symL','ch_dna','ch_aa','ch_type','cosmic','mutsig']:
				annotH[(chr,chrSta,chrEnd,ref,alt)][col] = colL[idxH[col]]
	return annotH

### until it is merged into pipeline
import mybasic
mybasic.add_module_path(['NGS/pipeline'])
import mypipe
trioH = mypipe.read_trio(bamDirL=mysetting.wxsBamDirL)
# pairH: primary id -> list of recurrent ids, each with its last 5 characters removed;
# trios without any recurrent sample are left out
pairH = {}
for tid in trioH:
	if trioH[tid]['recur_id'] == []:
		continue
	pid = trioH[tid]['prim_id'][0][:-5]
	pairH[pid] = map(lambda recurId: recurId[:-5], trioH[tid]['recur_id'])
####
#(con,cursor) = mymysql.connectDB(db='ircr1')
#tag = 'pair_R:%'
#cursor.execute('select distinct samp_id from sample_tag where tag like "%s"' % tag)
#sIdL_p = [x for (x,) in cursor.fetchall()]
#
#tag = 'XSeq%%,N'
#cursor.execute('select distinct samp_id from sample_tag where tag like "%s"' % tag)
Пример #31
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the step specifications for the LOH / tumor-purity pipeline.

    baseDir -- sample directory handed to every step
    server  -- together with genome, selects the mysetting.refFlatH entry
               given to the 'gene LOH' step
    genome  -- genome build key

    Every step is a dict with the keys name, desc, fun, paramL, paramH,
    logPostFix, logExistsFn, outFilePostFix, clean and rerun.
    """

    mybasic.add_module_path(['NGS/mutation', 'NGS/loh', 'NGS/purity'])
    import mutScan_loh_batch, delta_baf_mutscan_batch, delta_baf_seg_batch, calcCN_LOH_batch, loh2gene_batch, calcNormalF_loh_batch, peakFrac_batch, dbaf_cn_plot_batch  ## MODULES

    def lastLineHas(marker):
        # log check: non-empty log whose last line contains marker
        return lambda logL: len(logL) > 0 and marker in logL[-1]

    def emptyLog(logL):
        # log check: the step wrote nothing to its log
        return len(logL) == 0

    def step(name, desc, fun, paramL, logPostFix, logExistsFn, outFilePostFix):
        # all steps share an empty paramH and clean/rerun switched off
        return dict(name=name,
                    desc=desc,
                    fun=fun,
                    paramL=paramL,
                    paramH={},
                    logPostFix=logPostFix,
                    logExistsFn=logExistsFn,
                    outFilePostFix=outFilePostFix,
                    clean=False,
                    rerun=False)

    twoDirs = (baseDir, baseDir, False)
    threeDirs = (baseDir, baseDir, baseDir, False)

    specL = []  ## PARAMETERS
    specL.append(step(
        'MutScan for the tumor sample',
        'pileup_proc -> loh.mutscan',
        mutScan_loh_batch.main,
        (baseDir, baseDir, False, 10, 0, 0),
        '.loh.mutscan.log', lastLineHas('Success'),
        ['loh.mutscan']))
    specL.append(step(
        'delta B-allele frequencies calculation',
        'calculate tumor delta BAF for all positions genotyped as heterozygous in the normal sample',
        delta_baf_mutscan_batch.main,
        twoDirs,
        '.dbaf.log', emptyLog,
        ['dbaf.txt']))
    specL.append(step(
        'delta BAF segmentation',
        'segment delta BAF',
        delta_baf_seg_batch.main,
        twoDirs,
        '.dbaf.seg.log', lastLineHas('Analyzing'),
        ['seg']))
    specL.append(step(
        'Plotting',
        'Generate deltaBAF/CN trajectory plot',
        dbaf_cn_plot_batch.main,
        threeDirs,
        '.traj_plot.log', lastLineHas('Done'),
        ['dBAF_CNA_traj.pdf']))
    specL.append(step(
        'CNLOH/LOH determination',
        'calculate average copy number of LOH segments to determine CNLOH/LOH',
        calcCN_LOH_batch.main,
        threeDirs,
        '.loh_cn.log', lastLineHas('Setting'),
        ['loh_cn.txt']))
    specL.append(step(
        'gene LOH',
        'loh_cn.txt -> loh_gene.dat',
        loh2gene_batch.main,
        (baseDir, baseDir, False, mysetting.refFlatH[server][genome]),
        '.loh_gene.log', emptyLog,
        ['loh_gene.dat']))
    specL.append(step(
        'Normal contamiation calculation',
        'calculate normal contamination levels at heterozygous germline SNPs in LOH regions',
        calcNormalF_loh_batch.main,
        threeDirs,
        '.nfrac_all.log', emptyLog,
        ['nFrac_all.txt']))
    specL.append(step(
        'Tumor fraction estimation',
        'estimate tumor fraction',
        peakFrac_batch.main,
        twoDirs,
        '.tfrac.log', lastLineHas('Done'),
        ['tumor_frac.txt']))
    return specL
Пример #32
0
#!/usr/bin/python

import sys, os, re, getopt, glob

import mybasic
mybasic.add_module_path(['NGS/align','NGS/mutation'])

import mybasic, procPileup_split_batch, mutScan_batch, mutscan_snp_cosmic_batch

def main(baseDir, projectName):
	"""Start the per-sample HTML pipeline log for the sample directory baseDir.

	baseDir     -- sample directory; the sample name is 'S' plus whatever follows
	               the first '/S' in this path
	projectName -- sub-directory of /var/www/html/pipeline_logs/ that receives
	               the log file (it must already exist)

	NOTE(review): the portion shown here stops after the log file has been
	created; the rest of the body is not visible.
	"""

	compared_files_list = []
	current_files_list = glob.glob(baseDir+'/*')

	outDir = baseDir + '/pileup_proc'

	# compose log string
	# a document type declaration starts with '<!DOCTYPE'; the '!' was missing
	html_head_string = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><head></head><body>'

	# prep html file
	html_path = '/var/www/html/pipeline_logs/' + projectName + '/'
	# NOTE(review): takes the text after the FIRST '/S' in baseDir, so a parent
	# directory whose name starts with 'S' would be picked up instead of the sample
	file_name_split = baseDir.split('/S')
	sample_name = 'S' + file_name_split[1]
	file_name = 'pipeline2_log_' + sample_name + '.html'
	# create .html file (the with-block closes it; no explicit close() needed)
	with open(os.path.join(html_path, file_name), 'wb') as log_file:
		log_file.write(html_head_string)

	# change mod and open log_file again
Пример #33
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Return the ordered list of step specifications for this pipeline.

	Steps, in order: FastQC, Trim, Mapping, bam -> sorted.bed conversion,
	TDFgen and RPKMgen. Every step is a dict with the keys 'name', 'desc',
	'fun' (callable), 'paramL' (positional arguments), 'paramH' (keyword
	arguments), 'logPostFix', 'logExistsFn', 'outFilePostFix', 'clean' and
	'rerun'; FastQC additionally carries 'outLinkPostFix'.

	'logExistsFn' receives a sequence that is indexed from the end and
	searched for a marker string -- presumably the lines of the step's log
	file (the consumer is not visible here; confirm against the runner).

	baseDir -- passed to every step as its directory argument(s).
	server  -- first key into mysetting.ucscSeqDir / mysetting.refFlatH.
	genome  -- second key into those tables; also forms the '<genome>_nh'
	           argument of the Mapping step.
	"""

	mybasic.add_module_path(['NGS/align','NGS/fastq','NGS/coverage','NGS/expression'])
	import trim_batch, gsnap_sam_batch, bam2sortedBed_batch, sortedBed2tdf_batch, degSeq_batch ## MODULES
	import fastqc_batch

	return [ ## PARAMETERS
		{
		'name': 'FastQC',
		'desc': 'QC for fastq',
		'fun': fastqc_batch.fastqc_batch,
		# raw string: same pattern text, without relying on unknown escapes
		'paramL': (baseDir, r'(.*)\.[12]\.fq\.gz', baseDir, baseDir),
		'paramH': {},
		'logPostFix': '.fastqc.qlog',
		'logExistsFn': lambda x: len(x)>0 and 'Analysis complete' in x[-1],
		'outFilePostFix': ['_fastqc.zip'],
		'outLinkPostFix': ['_fastqc/fastqc_report.html'],
		'clean': False,
		'rerun': False
		},

		{
		'name': 'Trim',
		'desc': 'fq.gz -> trim -> fq',
		'fun': trim_batch.trim_batch,
		'paramL': (baseDir, r'(.*)\.[12]\.fq\.gz', baseDir, 30),
		'paramH': {},
		'logPostFix': '.trim.log',
		'logExistsFn': lambda x: len(x)==0,
		'outFilePostFix': ['t1.fq.gz', 't2.fq.gz'],
		'clean': False,
		'rerun': False
		},

		{
		'name': 'Mapping',
		'desc': 'fq -> bam',
		'fun': gsnap_sam_batch.align,
		'paramL': (baseDir, baseDir, False, 'sanger', '%s_nh' % (genome)),
		'paramH': {},
		'logPostFix': '.gsnap.qlog',
		'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1],
		'outFilePostFix': ['bam'],
		'clean': False,
		'rerun': False
		},

		{
		'name': 'Formet Conversion and sorting',
		'desc': 'bam -> sort -> sorted.bed',
		'fun': bam2sortedBed_batch.sam2bed_batch,
		'paramL': (baseDir, baseDir, '', False),
		'paramH': {},
		'logPostFix': '.sorted.bed.qlog',
		'logExistsFn': lambda x: len(x)==0,
		'outFilePostFix': ['sorted.bed'],
		'clean': False,
		'rerun': False
		},

		{
		'name': 'TDFgen',
		'desc': 'sorted.bed -> bedgraph -> tdf',
		'fun': sortedBed2tdf_batch.main,
		'paramL': (baseDir, baseDir, False, '%s/chromsizes_%s.txt' % (mysetting.ucscSeqDir[server][genome], genome), genome),
		'paramH': {},
		'logPostFix': '.tdf.qlog',
		# The marker is looked up nine entries from the end, so at least nine
		# entries are required; the former len(x)>0 guard let a log of 1-8
		# entries raise IndexError instead of reporting "not finished".
		'logExistsFn': lambda x: len(x)>=9 and 'Done' in x[-9],
		'outFilePostFix': ['bedgraph','tdf'],
		'clean': False,
		'rerun': False
		},

		{
		'name': 'RPKMgen',
		'desc': 'sorted.bed -> rpkm',
		'fun': degSeq_batch.main,
		'paramL': (baseDir, baseDir, mysetting.refFlatH[server][genome], False),
		'paramH': {},
		'logPostFix': '.degSeq.qlog',
		'logExistsFn': lambda x: len(x)>0 and 'ZZZ3' in x[-1],
		'outFilePostFix': ['rpkm'],
		'clean': False,
		'rerun': False
		},

#		{
#		'name': 'Cleanup',
#		'desc': 'remove all, but logs and designated result file',
#		'fun': cleanup.main,
#		'paramL': (baseDir,),
#		'paramH': {},
#		'logPostFix': 'cleanup.qlog',
#		'logExistsFn': lambda x: False,
#		'outFilePostFix': ['pileup']
#		},

		]
Пример #34
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the ordered list of step specifications for this pipeline.

    Steps, in order: FastQC, Trim, Mapping, bam -> sorted.bed conversion,
    TDFgen and RPKMgen. Every step is a dict with the keys 'name', 'desc',
    'fun' (callable), 'paramL' (positional arguments), 'paramH' (keyword
    arguments), 'logPostFix', 'logExistsFn', 'outFilePostFix', 'clean' and
    'rerun'; FastQC additionally carries 'outLinkPostFix'.

    'logExistsFn' receives a sequence that is indexed from the end and
    searched for a marker string -- presumably the lines of the step's log
    file (the consumer is not visible here; confirm against the runner).

    baseDir -- passed to every step as its directory argument(s).
    server  -- first key into mysetting.ucscSeqDir / mysetting.refFlatH.
    genome  -- second key into those tables; also forms the '<genome>_nh'
               argument of the Mapping step.
    """

    mybasic.add_module_path(
        ['NGS/align', 'NGS/fastq', 'NGS/coverage', 'NGS/expression'])
    import trim_batch, gsnap_sam_batch, bam2sortedBed_batch, sortedBed2tdf_batch, degSeq_batch  ## MODULES
    import fastqc_batch

    return [  ## PARAMETERS
        {
            'name': 'FastQC',
            'desc': 'QC for fastq',
            'fun': fastqc_batch.fastqc_batch,
            # raw string: same pattern text, without relying on unknown escapes
            'paramL': (baseDir, r'(.*)\.[12]\.fq\.gz', baseDir, baseDir),
            'paramH': {},
            'logPostFix': '.fastqc.qlog',
            'logExistsFn':
            lambda x: len(x) > 0 and 'Analysis complete' in x[-1],
            'outFilePostFix': ['_fastqc.zip'],
            'outLinkPostFix': ['_fastqc/fastqc_report.html'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'Trim',
            'desc': 'fq.gz -> trim -> fq',
            'fun': trim_batch.trim_batch,
            'paramL': (baseDir, r'(.*)\.[12]\.fq\.gz', baseDir, 30),
            'paramH': {},
            'logPostFix': '.trim.log',
            'logExistsFn': lambda x: len(x) == 0,
            'outFilePostFix': ['t1.fq.gz', 't2.fq.gz'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'Mapping',
            'desc': 'fq -> bam',
            'fun': gsnap_sam_batch.align,
            'paramL': (baseDir, baseDir, False, 'sanger', '%s_nh' % (genome)),
            'paramH': {},
            'logPostFix': '.gsnap.qlog',
            'logExistsFn': lambda x: len(x) > 0 and 'Processed' in x[-1],
            'outFilePostFix': ['bam'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'Formet Conversion and sorting',
            'desc': 'bam -> sort -> sorted.bed',
            'fun': bam2sortedBed_batch.sam2bed_batch,
            'paramL': (baseDir, baseDir, '', False),
            'paramH': {},
            'logPostFix': '.sorted.bed.qlog',
            'logExistsFn': lambda x: len(x) == 0,
            'outFilePostFix': ['sorted.bed'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'TDFgen',
            'desc': 'sorted.bed -> bedgraph -> tdf',
            'fun': sortedBed2tdf_batch.main,
            'paramL': (baseDir, baseDir, False, '%s/chromsizes_%s.txt' %
                       (mysetting.ucscSeqDir[server][genome], genome), genome),
            'paramH': {},
            'logPostFix': '.tdf.qlog',
            # The marker is looked up nine entries from the end, so at least
            # nine entries are required; the former len(x) > 0 guard let a
            # log of 1-8 entries raise IndexError instead of reporting
            # "not finished".
            'logExistsFn': lambda x: len(x) >= 9 and 'Done' in x[-9],
            'outFilePostFix': ['bedgraph', 'tdf'],
            'clean': False,
            'rerun': False
        },
        {
            'name': 'RPKMgen',
            'desc': 'sorted.bed -> rpkm',
            'fun': degSeq_batch.main,
            'paramL':
            (baseDir, baseDir, mysetting.refFlatH[server][genome], False),
            'paramH': {},
            'logPostFix': '.degSeq.qlog',
            'logExistsFn': lambda x: len(x) > 0 and 'ZZZ3' in x[-1],
            'outFilePostFix': ['rpkm'],
            'clean': False,
            'rerun': False
        },

        #		{
        #		'name': 'Cleanup',
        #		'desc': 'remove all, but logs and designated result file',
        #		'fun': cleanup.main,
        #		'paramL': (baseDir,),
        #		'paramH': {},
        #		'logPostFix': 'cleanup.qlog',
        #		'logExistsFn': lambda x: False,
        #		'outFilePostFix': ['pileup']
        #		},
    ]
Пример #35
0
#!/usr/bin/python

import sys, os
import mymysql, mypipe, mybasic
from mysetting import mysqlH
mybasic.add_module_path(['Integration'])
import prepDB_mutscan, makeDB_mutation_rxsq


def post_s_rsq2mut(baseDir, server='smc1', dbN='ihlee_test'):
    sampN = baseDir.split('/')[-1]
    sid = sampN[:-4].replace('.', '_').replace('-', '_')
    print sampN, sid

    cosmicDatFileN = '%s/%s_splice_cosmic.dat' % (baseDir, sampN)
    if dbN in ['ihlee_test', 'ircr1']:
        datFileN = '/EQL1/NSL/RNASeq/results/mutation/%s.dat' % sampN
    else:
        datFileN = '%s/%s.dat' % (baseDir, sampN)
    if os.path.isfile(cosmicDatFileN):
        prepDB_mutscan.main(sampNamePat=('(.*)_(RSq)', ''),
                            geneList=[],
                            inFileN=cosmicDatFileN,
                            outFileN=datFileN)

        ## import
        (con, cursor) = mymysql.connectDB(user=mysqlH[server]['user'],
                                          passwd=mysqlH[server]['passwd'],
                                          db=dbN,
                                          host=mysqlH[server]['host'])
        cursor.execute('DELETE FROM mutation_rsq WHERE samp_id="%s"' % sid)
Пример #36
0
def genSpec(baseDir, server='smc1', genome='hg19'):
	"""Build the step list: Align, Sort, MarkDuplicate/ReadGroup,
	RealignTarget, Realign/Recalibrate, UnifiedGenotype.

	Returns a list of dicts, one per step, each with the keys 'name', 'desc',
	'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
	'outFilePostFix', 'clean' and 'rerun'.

	baseDir -- passed to every step as input and output directory argument.
	server  -- first key into mysetting.ucscRefH / mysetting.dbsnpH; also
	           handed to the UnifiedGenotype step.
	genome  -- second key into those tables; also handed to the Align and
	           UnifiedGenotype steps.
	"""

	mybasic.add_module_path(['NGS/fastq','NGS/align','NGS/mutation'])
	# Some of these modules are referenced only by the disabled steps listed below.
	import fastqc_batch, gsnap_splice_bam_batch, gsnap_splice_bam_sort_batch, markDuplicates_batch, realignTargetFilter_batch, realignWithFtTarget_batch, unifiedGeno_batch, vcf2mutScan_batch, mutscan_snp_cosmic_batch, annotate_mutscan_batch, annotate_join_cosmic_batch ## MODULES

	def lastEntryHas(marker):
		# Predicate: non-empty input whose last entry contains marker.
		return lambda x: len(x)>0 and marker in x[-1]

	refSeq = mysetting.ucscRefH[server][genome]
	dbsnp = mysetting.dbsnpH[server][genome]

	alignStep = {
		'name': 'Align',
		'desc': '.fq.gz -> .bam',
		'fun': gsnap_splice_bam_batch.align,
		'paramL': (baseDir, baseDir, False, genome),
		'paramH': {},
		'logPostFix': '.gsnap.qlog',
		'logExistsFn': lastEntryHas('Processed'),
		'outFilePostFix': ['splice.bam'],
		'clean': False,
		'rerun': False,
	}

	sortStep = {
		'name': 'Sort',
		'desc': 'bam -> sorted.bam',
		'fun': gsnap_splice_bam_sort_batch.main,
		'paramL': (baseDir, baseDir, 10000000000),
		'paramH': {},
		'logPostFix': '_splice.sort.qlog',
		# Unlike the other steps, an empty input also counts here.
		'logExistsFn': lambda x: len(x)<1 or 'merging' in x[-1],
		'outFilePostFix': ['sorted.bam'],
		'clean': False,
		'rerun': False,
	}

	dedupStep = {
		'name': 'MarkDuplicate/ReadGroup',
		'desc': 'sorted.bam -> dedup.bam',
		'fun': markDuplicates_batch.main,
		'paramL': (baseDir, baseDir, False),
		'paramH': {},
		'logPostFix': '_splice.dedup.qlog',
		'logExistsFn': lastEntryHas('totalMemory()'),
		'outFilePostFix': ['dedup.bam'],
		'clean': False,
		'rerun': False,
	}

	realignTargetStep = {
		'name': 'RealignTarget',
		'desc': 'dedup.bam -> realigner.intervals -> realigner_ft.intervals',
		'fun': realignTargetFilter_batch.main,
		'paramL': (baseDir, baseDir, False, refSeq, dbsnp),
		'paramH': {},
		'logPostFix': '_splice.interval.qlog',
		'logExistsFn': lastEntryHas('Uploaded run'),
		'outFilePostFix': ['realigner.intervals','realigner_ft.intervals'],
		'clean': False,
		'rerun': False,
	}

	realignStep = {
		'name': 'Realign/Recalibrate',
		'desc': 'dedup.bam -> realign.bam -> recal.bam',
		'fun': realignWithFtTarget_batch.main,
		'paramL': (baseDir, baseDir, False, refSeq, dbsnp),
		'paramH': {},
		'logPostFix': '_splice.realign.qlog',
		'logExistsFn': lastEntryHas('Uploaded run'),
		'outFilePostFix': ['realign.bam', 'recal.bam'],
		'clean': False,
		'rerun': False,
	}

	genotypeStep = {
		'name': 'UnifiedGenotype',
		'desc': 'recal.bam -> vcf',
		'fun': unifiedGeno_batch.main,
		'paramL': (baseDir, baseDir, server, genome, False),
		'paramH': {},
		'logPostFix': '_splice.gatk.log',
		# The marker may appear anywhere in the last ten entries.
		'logExistsFn': lambda x: len(x)>0 and any('Total runtime' in s for s in x[-10:]),
		'outFilePostFix': ['vcf'],
		'clean': False,
		'rerun': False,
	}

	# Disabled steps, formerly kept here as commented-out specs (not part of the result):
	#   MutScan          vcf2mutScan_batch.main(baseDir, baseDir, False); log '_splice.mutscan.log', predicate len(x)==0; out '_splice.mutscan'
	#   VEP annotation   annotate_mutscan_batch.annotate_mutscan_batch(baseDir, '(.*)\.mutscan$', baseDir); log '_splice.vep.log', 'Finished!' in last entry; out 'vep'
	#   Join Cosmic      annotate_join_cosmic_batch.main(baseDir, '(.*)\.vep$', baseDir); log '_splice.mutscan.cosmic.log', predicate len(x)==0; out '_cosmic.dat'
	#   JoinCosmic (old) mutscan_snp_cosmic_batch.main(baseDir,); log '_splice.cosmic.log', predicate len(x)==0; out 'dat'
	#   Cleanup          cleanup.main(baseDir,); log 'cleanup.qlog', predicate always False; out 'pileup'
	specL = [alignStep, sortStep, dedupStep, realignTargetStep, realignStep, genotypeStep]

	# A disabled variant returned only specL[-1] when server == 'smc2'.
	return specL
Пример #37
0
def genSpec_CS(baseDir, server='smc1', genome='hg19'):
    """Build the step list: bam -> sorted.bed, RPKM, exon log2 ratio,
    gene copy number, plot.

    Returns a list of five step dicts, each with the keys 'name', 'desc',
    'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
    'outFilePostFix', 'clean' and 'rerun'.

    baseDir -- passed to every step as its directory argument(s).
    server, genome -- keys into mysetting.refFlatH for the gene-level step;
                      genome is also handed to the plotting step.
    """
    mybasic.add_module_path(
        ['NGS/coverage', 'NGS/expression', 'NGS/copynumber'])
    import bam2sortedBed_batch, degSeq_batch, rpkm2cn_batch, exon2gene_batch, drawCNATraj_batch

    def last_entry_has(marker):
        # Predicate: non-empty input whose last entry contains marker.
        return lambda x: len(x) > 0 and marker in x[-1]

    def is_empty(x):
        return len(x) == 0

    to_sorted_bed = {
        'name': 'Format Conversion and sorting',
        'desc': 'bam -> sort -> sorted.bed',
        'fun': bam2sortedBed_batch.sam2bed_batch,
        'paramL': (baseDir, baseDir, 'recal', False),
        'paramH': {},
        'logPostFix': '.sorted.bed.qlog',
        'logExistsFn': is_empty,
        'outFilePostFix': ['sorted.bed'],
        'clean': False,
        'rerun': False,
    }

    rpkm = {
        'name': 'RPKMgen',
        'desc': 'sorted.bed -> rpkm',
        'fun': degSeq_batch.main,
        # NOTE(review): fixed annotation path; unlike the gene-level step it
        # does not vary with server/genome -- confirm this is intended.
        'paramL': (baseDir, baseDir,
                   '/data1/Sequence/ucsc_hg19/annot/refFlat_exon.txt', False),
        'paramH': {},
        'logPostFix': '.degSeq.qlog',
        'logExistsFn': last_entry_has('omitted'),
        'outFilePostFix': ['rpkm'],
        'clean': False,
        'rerun': False,
    }

    exon_ratio = {
        'name': 'Calculate a log2 rpkm ratio for all exons',
        'desc': 'log2(tumor rpkm/normal rpkm',
        'fun': rpkm2cn_batch.main_pool,
        'paramL': (baseDir, baseDir, 10, mysetting.poolB_CS_rpkm, False),
        'paramH': {},
        'logPostFix': '.cn.log',
        'logExistsFn': is_empty,
        'outFilePostFix': ['copynumber'],
        'clean': False,
        'rerun': False,
    }

    gene_cn = {
        'name': 'Calculate gene copy number from log2 rpkm ratios',
        'desc': 'copynumber -> cn_gene.dat',
        'fun': exon2gene_batch.main,
        'paramL': (baseDir, baseDir, mysetting.refFlatH[server][genome],
                   mysetting.cs_gene, False),
        'paramH': {},
        'logPostFix': '.cn_gene.log',
        'logExistsFn': last_entry_has('VHL'),
        'outFilePostFix': ['cn_gene.dat'],
        'clean': False,
        'rerun': False,
    }

    plot = {
        'name': 'Draw Plot',
        'desc': 'seg->plot',
        # drawCNATraj_batch.batch is the commented-out alternative callable.
        'fun': drawCNATraj_batch.draw_single,
        'paramL': (baseDir, '/EQL1/NSL/WXS/results/CNA', genome),
        'paramH': {},
        # Empty log postfix, always-true predicate, no declared outputs.
        'logPostFix': '',
        'logExistsFn': lambda x: True,
        'outFilePostFix': [],
        'clean': False,
        'rerun': False,
    }

    return [to_sorted_bed, rpkm, exon_ratio, gene_cn, plot]  ## PARAMTERS
Пример #38
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Build the step list: filter transloc, annotate, sort, annotate report.

    Returns a list of four step dicts, each with the keys 'name', 'desc',
    'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
    'outFilePostFix', 'clean' and 'rerun'.

    baseDir -- passed to every step as input and output directory argument.
    server, genome -- accepted but not used by any active step.
    """

    mybasic.add_module_path(['NGS/align', 'NGS/splice_gsnap/fusion'])
    # gsnap_splice_batch is referenced only by the disabled 'Align' step.
    import gsnap_splice_batch, fusion_filter_transloc_batch, fusion_filter_annot1_batch, fusion_proc_sort_batch, fusion_proc_annot_batch  ## MODULES

    def is_empty(x):
        return len(x) == 0

    # Disabled first step, formerly a commented-out spec:
    #   Align  gsnap_splice_batch.align(baseDir, baseDir, 6, False); log '.gsnap.qlog',
    #          'Processed' in last entry; out 'splice.gsnap'
    filter_transloc = {
        'name': 'Filter transloc',
        'desc': 'splice.gsnap.gz -> splice_transloc.gsnap',
        'fun': fusion_filter_transloc_batch.fusion_filter_batch,
        'paramL': (baseDir, baseDir, False),
        'paramH': {},
        'logPostFix': '.ft_tloc.qlog',
        'logExistsFn': lambda x: len(x) > 0 and 'Results' in x[-1],
        'outFilePostFix': ['splice_transloc.gsnap'],
        'clean': False,
        'rerun': False,
    }

    annotate = {
        'name': 'annotate',
        'desc': 'splice_transloc.gsnap -> splice_transloc_annot1.gsnap',
        'fun': fusion_filter_annot1_batch.main,
        'paramL': (baseDir, baseDir, False),
        'paramH': {},
        'logPostFix': '.annot.qlog',
        # Requires at least two entries, not one as in the previous step.
        'logExistsFn': lambda x: len(x) > 1 and 'Results' in x[-1],
        'outFilePostFix': ['splice_transloc_annot1.gsnap'],
        'clean': False,
        'rerun': False,
    }

    sort = {
        'name': 'sort',
        'desc': 'splice_transloc_annot1.gsnap -> splice_transloc_annot1.sorted.gsnap and gnerate report.txt',
        'fun': fusion_proc_sort_batch.main,
        'paramL': (baseDir, baseDir, False),
        'paramH': {},
        'logPostFix': '.sort.qlog',
        'logExistsFn': is_empty,
        'outFilePostFix': [
            'splice_transloc_annot1.sorted.gsnap',
            'splice_transloc_annot1.report.txt',
        ],
        'clean': False,
        'rerun': False,
    }

    annotate_report = {
        'name': 'annotate report',
        'desc': 'report.txt -> report_annot.txt',
        'fun': fusion_proc_annot_batch.fusion_proc_annot_batch,
        'paramL': (baseDir, baseDir, None, False),
        'paramH': {},
        'logPostFix': '.report_annot.qlog',
        'logExistsFn': is_empty,
        'outFilePostFix': ['splice_transloc_annot1.report_annot.txt'],
        'clean': False,
        'rerun': False,
    }

    # Disabled last step, formerly a commented-out spec with no callable set:
    #   Summarize  paramL (baseDir, baseDir, False); log 'realign.qlog',
    #              'Uploaded run' in last entry; out 'realign.bam', 'recal.bam'
    return [filter_transloc, annotate, sort, annotate_report]  ## PARAMETERS
Пример #39
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Build the step list: ngCGH, segmentation, gene copy number, plot.

    Returns a list of four step dicts, each with the keys 'name', 'desc',
    'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
    'outFilePostFix', 'clean' and 'rerun'.

    baseDir -- passed to every step as its directory argument(s).
    server, genome -- keys into mysetting.refFlatH for the gene-level step;
                      genome is also handed to the plotting step.
    """
    mybasic.add_module_path(['NGS/copynumber'])
    import ngCGH_batch, cgh2seg_batch, seg2gene_batch, drawCNATraj_batch

    def last_entry_has(marker):
        # Predicate: non-empty input whose last entry contains marker.
        return lambda x: len(x) > 0 and marker in x[-1]

    ngcgh = {
        'name': 'run ngCGH for pairs of bam',
        'desc': 'bam -> .ngCGH',
        'fun': ngCGH_batch.main,
        'paramL': (baseDir, baseDir, 1000, False),
        'paramH': {},
        'logPostFix': '.cn_ngCGH.log',
        'logExistsFn': last_entry_has('finalizers'),
        'outFilePostFix': ['ngCGH'],
        'clean': False,
        'rerun': False,
    }

    segmentation = {
        'name': 'Segmenation',
        'desc': 'ngCGH -> seg',
        'fun': cgh2seg_batch.cgh2seg,
        'paramL': (baseDir, baseDir, False),
        'paramH': {},
        'logPostFix': '.seg.qlog',
        'logExistsFn': last_entry_has('Centrality parameter'),
        'outFilePostFix': ['ngCGH.seg'],
        'clean': False,
        'rerun': False,
    }

    gene_cn = {
        'name': 'Calculate gene copy number from segments',
        'desc': 'seg -> cn_gene.dat',
        'fun': seg2gene_batch.main,
        'paramL': (baseDir, baseDir, mysetting.refFlatH[server][genome], [],
                   False),
        'paramH': {},
        'logPostFix': '.cn_gene.log',
        'logExistsFn': last_entry_has('ZZZ3'),
        'outFilePostFix': ['cn_gene.dat'],
        'clean': False,
        'rerun': False,
    }

    plot = {
        'name': 'Draw Plot',
        'desc': 'seg->plot',
        # drawCNATraj_batch.batch is the commented-out alternative callable.
        'fun': drawCNATraj_batch.draw_single,
        'paramL': (baseDir, '/EQL1/NSL/WXS/results/CNA', genome),
        'paramH': {},
        # Empty log postfix, always-true predicate, no declared outputs.
        'logPostFix': '',
        'logExistsFn': lambda x: True,
        'outFilePostFix': [],
        'clean': False,
        'rerun': False,
    }

    return [ngcgh, segmentation, gene_cn, plot]  ## PARAMETERS
Пример #40
0
#!/usr/bin/python

import sys, os, re, getopt, glob
import mybasic

mybasic.add_module_path(['NGS/align', 'NGS/mutation'])

import bwa_batch, markDuplicates_batch, realign_batch, pileup_batch


def wxs_seq(baseDir, projectName):
    """Create the per-sample HTML pipeline log, write its header, set its mode.

    baseDir     -- sample directory. It must contain '/S': the sample name is
                   rebuilt as 'S' plus the second piece of
                   baseDir.split('/S'), so a path without '/S' raises
                   IndexError.
    projectName -- sub-directory of /var/www/html/pipeline_logs/ in which the
                   log file 'pipeline1_log_<sample>.html' is created.

    Raises OSError if the mode of the new file cannot be changed (the
    previous shell 'chmod' call ignored such failures).
    """

    # Files present in the sample directory at start-up.
    current_files_list = glob.glob(baseDir + '/*')
    compared_files_list = []

    # compose log string
    # A document type declaration starts with '<!DOCTYPE'; the previous
    # '<DOCTYPE' was emitted as a bogus tag instead of a doctype.
    html_head_string = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><head></head><body>'

    # prep html file
    html_path = '/var/www/html/pipeline_logs/' + projectName + '/'
    file_name_split = baseDir.split('/S')
    sample_name = 'S' + file_name_split[1]
    file_name = 'pipeline1_log_' + sample_name + '.html'
    log_path = os.path.join(html_path, file_name)
    # create .html file (the with-block closes it; no explicit close needed)
    with open(log_path, 'wb') as log_file:
        log_file.write(html_head_string)

    # change mode and open log_file again
    # os.chmod avoids passing a path built from the arguments through a shell.
    os.chmod(log_path, 0o755)
Пример #41
0
#!/usr/bin/python

from glob import glob
import sys,os,re
import mysetting, mymysql, mypipe, mybasic
mybasic.add_module_path(['Integration','NGS/mutation'])
import prepDB_mutation_normal, makeDB_mutation_rxsq, vep_batch

def prep_single(outFileN, server='smc1', dbN='ircr1'):
	(con, cursor) = mymysql.connectDB(user=mysetting.mysqlH[server]['user'],passwd=mysetting.mysqlH[server]['passwd'],db=dbN,host=mysetting.mysqlH[server]['host'])
	cosmicL = []
	for dir in mysetting.wxsMutscanDirL:
		cosmicL += filter(lambda x: '_B_' not in x, glob('%s/*/*cosmic.dat' % dir) + glob('%s/*cosmic.dat' % dir))

	cursor.execute('SELECT DISTINCT samp_id FROM sample_tag WHERE tag LIKE "XSeq_%%"')
	results = cursor.fetchall()
	sidL = []
	for res in results:
		sidL.append(res[0])
	for cosmic in cosmicL:
		(sid, postfix, platform) = re.match('(.*)_([XT].{,2})_([STKN]{2})_cosmic.dat', os.path.basename(cosmic)).groups()
		if postfix not in ['T', 'RSq']:
			sid = '%s_%s' % (sid, postfix)
		if sid not in sidL:
			print sid, cosmic
			tag = 'XSeq_%s' % platform
			cursor.execute('INSERT INTO sample_tag SET samp_id="%s", tag="%s"' % (sid, tag))

	cmd = 'cat %s | /usr/bin/python %s/Integration/prepDB_mutscan.py > %s' % (' '.join(cosmicL), mysetting.SRC_HOME, outFileN)
	os.system(cmd)
Пример #42
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Build the step list: Align, Sort, MarkDuplicate/ReadGroup,
    RealignTarget, Realign/Recalibrate, UnifiedGenotype.

    Returns a list of dicts, one per step, each with the keys 'name', 'desc',
    'fun', 'paramL', 'paramH', 'logPostFix', 'logExistsFn',
    'outFilePostFix', 'clean' and 'rerun'.

    baseDir -- passed to every step as input and output directory argument.
    server  -- first key into mysetting.ucscRefH / mysetting.dbsnpH; also
               handed to the UnifiedGenotype step.
    genome  -- second key into those tables; also handed to the Align and
               UnifiedGenotype steps.
    """

    mybasic.add_module_path(['NGS/fastq', 'NGS/align', 'NGS/mutation'])
    # Some of these modules are referenced only by the disabled steps listed
    # near the end of this function.
    import fastqc_batch, gsnap_splice_bam_batch, gsnap_splice_bam_sort_batch, markDuplicates_batch, realignTargetFilter_batch, realignWithFtTarget_batch, unifiedGeno_batch, vcf2mutScan_batch, mutscan_snp_cosmic_batch, annotate_mutscan_batch, annotate_join_cosmic_batch  ## MODULES

    def last_entry_has(marker):
        # Predicate: non-empty input whose last entry contains marker.
        return lambda x: len(x) > 0 and marker in x[-1]

    ref_seq = mysetting.ucscRefH[server][genome]
    dbsnp = mysetting.dbsnpH[server][genome]

    specL = []  ## PARAMETERS

    specL.append({
        'name': 'Align',
        'desc': '.fq.gz -> .bam',
        'fun': gsnap_splice_bam_batch.align,
        'paramL': (baseDir, baseDir, False, genome),
        'paramH': {},
        'logPostFix': '.gsnap.qlog',
        'logExistsFn': last_entry_has('Processed'),
        'outFilePostFix': ['splice.bam'],
        'clean': False,
        'rerun': False,
    })

    specL.append({
        'name': 'Sort',
        'desc': 'bam -> sorted.bam',
        'fun': gsnap_splice_bam_sort_batch.main,
        'paramL': (baseDir, baseDir, 10000000000),
        'paramH': {},
        'logPostFix': '_splice.sort.qlog',
        # Unlike the other steps, an empty input also counts here.
        'logExistsFn': lambda x: len(x) < 1 or 'merging' in x[-1],
        'outFilePostFix': ['sorted.bam'],
        'clean': False,
        'rerun': False,
    })

    specL.append({
        'name': 'MarkDuplicate/ReadGroup',
        'desc': 'sorted.bam -> dedup.bam',
        'fun': markDuplicates_batch.main,
        'paramL': (baseDir, baseDir, False),
        'paramH': {},
        'logPostFix': '_splice.dedup.qlog',
        'logExistsFn': last_entry_has('totalMemory()'),
        'outFilePostFix': ['dedup.bam'],
        'clean': False,
        'rerun': False,
    })

    specL.append({
        'name': 'RealignTarget',
        'desc': 'dedup.bam -> realigner.intervals -> realigner_ft.intervals',
        'fun': realignTargetFilter_batch.main,
        'paramL': (baseDir, baseDir, False, ref_seq, dbsnp),
        'paramH': {},
        'logPostFix': '_splice.interval.qlog',
        'logExistsFn': last_entry_has('Uploaded run'),
        'outFilePostFix': ['realigner.intervals', 'realigner_ft.intervals'],
        'clean': False,
        'rerun': False,
    })

    specL.append({
        'name': 'Realign/Recalibrate',
        'desc': 'dedup.bam -> realign.bam -> recal.bam',
        'fun': realignWithFtTarget_batch.main,
        'paramL': (baseDir, baseDir, False, ref_seq, dbsnp),
        'paramH': {},
        'logPostFix': '_splice.realign.qlog',
        'logExistsFn': last_entry_has('Uploaded run'),
        'outFilePostFix': ['realign.bam', 'recal.bam'],
        'clean': False,
        'rerun': False,
    })

    specL.append({
        'name': 'UnifiedGenotype',
        'desc': 'recal.bam -> vcf',
        'fun': unifiedGeno_batch.main,
        'paramL': (baseDir, baseDir, server, genome, False),
        'paramH': {},
        'logPostFix': '_splice.gatk.log',
        # The marker may appear anywhere in the last ten entries.
        'logExistsFn':
        lambda x: len(x) > 0 and any('Total runtime' in s for s in x[-10:]),
        'outFilePostFix': ['vcf'],
        'clean': False,
        'rerun': False,
    })

    # Disabled steps, formerly kept here as commented-out specs (not part of
    # the result):
    #   MutScan          vcf2mutScan_batch.main(baseDir, baseDir, False);
    #                    log '_splice.mutscan.log', predicate len(x)==0;
    #                    out '_splice.mutscan'
    #   VEP annotation   annotate_mutscan_batch.annotate_mutscan_batch(
    #                    baseDir, '(.*)\.mutscan$', baseDir);
    #                    log '_splice.vep.log', 'Finished!' in last entry;
    #                    out 'vep'
    #   Join Cosmic      annotate_join_cosmic_batch.main(
    #                    baseDir, '(.*)\.vep$', baseDir);
    #                    log '_splice.mutscan.cosmic.log', predicate
    #                    len(x)==0; out '_cosmic.dat'
    #   JoinCosmic (old) mutscan_snp_cosmic_batch.main(baseDir,);
    #                    log '_splice.cosmic.log', predicate len(x)==0;
    #                    out 'dat'
    #   Cleanup          cleanup.main(baseDir,); log 'cleanup.qlog',
    #                    predicate always False; out 'pileup'

    # A disabled variant returned only specL[-1] when server == 'smc2'.
    return specL
Пример #43
0
def genSpec(baseDir, server='smc1', genome='hg19'):
    """Return the step specification for the GSNAP exon-skipping pipeline.

    Every enabled step is given ``baseDir`` as both of its first two
    positional parameters. ``server`` and ``genome`` are accepted so the
    signature matches the other ``genSpec`` variants in this file, but they
    are not used by this variant.

    Returns:
        A list of dicts, one per enabled step, in the order listed below.
        Each dict carries the keys 'name', 'desc', 'fun', 'paramL', 'paramH',
        'logPostFix', 'logExistsFn', 'outFilePostFix', 'clean' and 'rerun'.
    """
    # The batch modules live outside the default search path, so the path has
    # to be extended before they can be imported.
    mybasic.add_module_path(['NGS/align', 'NGS/splice_gsnap/skipping'])

    ## MODULES
    import gsnap_splice_batch  # only referenced by the disabled 'Align' step
    import exonSkip_filter_batch
    import exonSkip_filter_normal_batch
    import exonSkip_sort_batch
    import exonSkip_normal_sort_batch
    import exonSkip_proc_annot_batch

    def _log_ends_with_results(x):
        # x is presumably the list of log lines -- verify against the runner.
        return len(x) > 0 and 'Results' in x[-1]

    def _log_is_empty(x):
        # These steps are treated as finished when their log has no content.
        return len(x) == 0

    def _step(name, desc, fun, paramL, logPostFix, logExistsFn,
              outFilePostFix):
        # All enabled steps share an empty 'paramH' and have 'clean' and
        # 'rerun' switched off; only the remaining fields vary.
        return {
            'name': name,
            'desc': desc,
            'fun': fun,
            'paramL': paramL,
            'paramH': {},
            'logPostFix': logPostFix,
            'logExistsFn': logExistsFn,
            'outFilePostFix': outFilePostFix,
            'clean': False,
            'rerun': False,
        }

    # Positional parameters common to the filter and sort steps.
    commonParamL = (baseDir, baseDir, False)

    ## PARAMETERS
    stepL = [
        # Disabled step, kept for reference:
        # {
        #     'name': 'Align',
        #     'desc': 'fastq -> splice.gsnap',
        #     'fun': gsnap_splice_batch.align,
        #     'paramL': (baseDir, baseDir, 6, False),
        #     'paramH': {},
        #     'logPostFix': 'gsnap.qlog',
        #     'logExistsFn': lambda x: len(x)>0 and 'Processed' in x[-1],
        #     'outFilePostFix': ['splice.gsnap'],
        #     'clean': False,
        #     'rerun': False
        # },
        _step(
            name='Filter exonskip',
            desc='splice.gsnap.gz -> splice_exonSkip.gsnap',
            fun=exonSkip_filter_batch.exonSkip_filter_batch,
            paramL=commonParamL,
            logPostFix='.exonSkip.qlog',
            logExistsFn=_log_ends_with_results,
            outFilePostFix=['splice_exonSkip.gsnap'],
        ),
        _step(
            name='Filter normal exonskip',
            desc='splice.gsnap -> splice_exonSkip_normal.gsnap.gz',
            fun=exonSkip_filter_normal_batch.exonSkip_filter_batch,
            paramL=commonParamL,
            logPostFix='.exonSkip_normal.qlog',
            logExistsFn=_log_ends_with_results,
            outFilePostFix=['splice_exonSkip_normal.gsnap.gz'],
        ),
        _step(
            name='sort',
            desc='splice_exonSkip.gsnap -> splice_exonSkip_report.txt',
            fun=exonSkip_sort_batch.main,
            paramL=commonParamL,
            logPostFix='.sort.qlog',
            logExistsFn=_log_is_empty,
            outFilePostFix=['splice_exonSkip_report.txt'],
        ),
        _step(
            name='sort-normal',
            desc='splice_exonSkip_normal.gsnap.gz -> splice_exonSkip_normal_report.txt',
            fun=exonSkip_normal_sort_batch.main,
            paramL=commonParamL,
            logPostFix='.sort_normal.qlog',
            logExistsFn=_log_is_empty,
            outFilePostFix=['splice_exonSkip_normal_report.txt'],
        ),
        _step(
            name='annotate report',
            desc='report.txt -> report_annot.txt',
            fun=exonSkip_proc_annot_batch.exonSkip_proc_annot_batch,
            paramL=(baseDir, baseDir, None, False),
            logPostFix='.skip_annot.qlog',
            logExistsFn=_log_is_empty,
            outFilePostFix=['splice_exonSkip_report_annot.txt'],
        ),

        # Disabled steps, kept for reference. Re-enabling them would also
        # require exonSkip_link / exonSkip_link_normal, which are not
        # imported in this function.
        # {
        #     'name': 'link',
        #     'desc': 'put all report_annot.txt files in a directory',
        #     'fun': exonSkip_link.link,
        #     'paramL': (baseDir, '/EQL1/NSL/RNASeq/results/exonSkip'),
        #     'paramH': {},
        #     'logPostFix': 'link.qlog',
        #     'logExistsFn': lambda x: len(x)==0,
        #     'outFilePostFix': ['splice_exonSkip_report_annot.txt'],
        #     'clean': False,
        #     'rerun': False
        # },
        #
        # {
        #     'name': 'link-normal',
        #     'desc': 'put all report_normal.txt files in a directory',
        #     'fun': exonSkip_link_normal.link,
        #     'paramL': (baseDir, '/EQL1/NSL/RNASeq/results/exonSkip_normal'),
        #     'paramH': {},
        #     'logPostFix': 'link_normal.qlog',
        #     'logExistsFn': lambda x: len(x)==0,
        #     'outFilePostFix': ['splice_exonSkip_normal_report.txt'],
        #     'clean': False,
        #     'rerun': False
        # },
    ]
    return stepL