def json_somatic(workflow, input_dict, **kwargs):
    """
    Input file is a json of the following format:

    [
        {
            "chunk": "001",
            "library": "LIB-1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "rgid": "BC18-06-2013",
            "sample_name": "BC18-06-2013LyT_S5_L001",
            "pair": "1",
            "path": "/path/to/fastq.gz",
            "sample_type": "normal or tumor"
        },
        {..}
    ]
    """
    with open(input_dict, 'r') as fh:
        input_json = json.load(fh)

    inputs = [INPUT(name='fastq.gz', path=i['path'], fmt='fastq.gz', tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline_Somatic(),
        configure(wga_settings),
        add_run(workflow)
    )
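# A minimal, hypothetical input file for json_somatic: one normal/tumor pair
# for a single sample (all paths and IDs below are placeholders).
#
# [
#     {"chunk": "001", "library": "LIB-1", "platform": "ILLUMINA",
#      "platform_unit": "C0MR3ACXX.001", "rgid": "RG-1", "sample_name": "SAMPLE-1",
#      "pair": "1", "path": "/data/normal_R1.fastq.gz", "sample_type": "normal"},
#     {"chunk": "001", "library": "LIB-1", "platform": "ILLUMINA",
#      "platform_unit": "C0MR3ACXX.001", "rgid": "RG-1", "sample_name": "SAMPLE-1",
#      "pair": "1", "path": "/data/tumor_R1.fastq.gz", "sample_type": "tumor"}
# ]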
def json_(workflow, input_dict, **kwargs):
    """
    Input file is a json of the following format:

    [
        {
            "chunk": "001",
            "library": "LIB-1216301779A",
            "sample_name": "1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "pair": 0,            # 0 or 1
            "path": "/path/to/fastq"
        },
        {..}
    ]
    """
    with open(input_dict, 'r') as fh:
        input_json = json.load(fh)

    inputs = [INPUT(name='fastq.gz', path=i['path'], fmt='fastq.gz', tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline(),
        configure(wga_settings),
        add_run(workflow)
    )
def json_local(workflow, input_dict, **kwargs):
    """
    Input is a folder where each file is a json of the following format:

    [
        {
            "library": "LIB-1216301779A",
            "sample_name": "1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "pair": 1,
            "path": "/path/to/fastq"
        },
        {
            "library": "LIB-1216301779A",
            "sample_name": "1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "pair": 2,
            "path": "/path/to/fastq"
        }
    ]
    """
    for f in os.listdir(input_dict):
        json_path = os.path.join(input_dict, f)
        print(json_path)
        with open(json_path, 'r') as fh:
            input_json = json.load(fh)

        inputs = [INPUT(name='fastq.gz', path=i['path'], fmt='fastq.gz', tags=i,
                        stage_name='Load Input Fastqs')
                  for i in input_json]
        for i in inputs:
            print(i)

        DAG(ignore_stage_name_collisions=True).sequence_(
            add_(inputs),
            Pipeline_local(),
            configure(wga_settings),
            add_run(workflow)
        )
def downdbs(workflow, **kwargs):
    """
    Download all annotation databases
    """
    DAG().sequence_(
        add_([annovarext.DownDB(tags={'build': 'hg19', 'dbname': db})
              for db in annovarext.get_db_names()]),
        configure(wga_settings),
        add_run(workflow)
    )
def gunzip(workflow, input_dir, **kwargs):
    """
    Gunzips all gz files in directory

    $ genomekey gunzip -n 'Gunzip' /path/to/dir
    """
    DAG().sequence_(
        add_([INPUT(f, tags={'i': i})
              for i, f in enumerate(glob.glob(os.path.join(input_dir, '*.gz')))]),
        map_(unix.Gunzip),
        add_run(workflow)
    )
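# Illustration only: a minimal sketch of what a tool like unix.Gunzip could
# look like under the classic ezflow Tool interface (inputs/outputs class
# attributes plus a cmd() returning a shell command template). The real
# unix.Gunzip implementation is not shown here and may well differ.
from cosmos.lib.ezflow.tool import Tool


class GunzipSketch(Tool):
    name = 'Gunzip'
    inputs = ['gz']  # consumes the .gz files loaded by the INPUTs above

    def cmd(self, i, s, p):
        # decompress the single .gz input of this task in place
        return 'gunzip {i[gz][0]}'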
def fastq_(workflow, input_dict, output_dict, output_json, **kwargs):
    json_fastq_to_split = json_creator.json_out(input_dict, output_dict)
    with open(json_fastq_to_split, 'r') as fh:
        input_json = json.load(fh)

    inputs = [INPUT(name='fastq.gz', path=i['gz_path'], fmt='fastq.gz', tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline_split(),
        configure(wga_settings),
        add_run(workflow)
    )
def upload_(workflow, bucket, project, out_dict, **kwargs):
    project_folder = join(out_dict, project.replace(" ", "_"))
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)

    json_fastq_to_upload = s3_Bucket.getList(bucket, project, out_dict)
    with open(json_fastq_to_upload, 'r') as fh:
        input_json = json.load(fh)

    inputs = [INPUT(name='fastq.gz', path=i['gz_path'], fmt='fastq.gz', tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline_upload(),
        configure(wga_settings),
        add_run(workflow)
    )
def CteamPipeline(input_bams):
    bam_seq = None
    bam_dup = []  # to check for duplicate input files
    for b in input_bams:
        # extract genome_id from the file name and add it as a tag
        genome_id = os.path.basename(b).partition('.')[0]
        if genome_id in bam_dup:
            print('\n\nERROR: "%s" was already included in the input file list.\n' % b)
            sys.exit()
        else:
            bam_dup.append(genome_id)

        # TEMPORARILY, use genome_id as the RG ID, too
        s = sequence_(add_([INPUT(b, tags={'rg': genome_id})], stage_name="Load Input"))

        # append to the sequence
        if bam_seq is None:
            bam_seq = s
        else:
            bam_seq = sequence_(bam_seq, s, combine=True)

    nInput = len(input_bams)
    nNodes = settings.settings['nNode']
    # floor division: at least 16 and at most 256 splits
    nSplit = min(256, 16 * max(nNodes // nInput, 1))
    settings.settings['nSplit'] = nSplit

    chrom = ('chrom', list(range(1, 23)) + ['X', 'Y', 'MT'])
    split = ('split', list(range(1, nSplit + 1)))

    return sequence_(
        bam_seq,
        map_(pipes.CteamSortSplitBam),                  # sort bam by read name (== shuffling)
        split_([split], pipes.CteamTrimReadGroup),
        map_(pipes.CteamBwaAln),                        # bwa aln
        map_(pipes.CteamBwaSampe),                      # bwa sampe
        reduce_(['rg'], pipes.CteamSplitByChromosome),  # merge split files and (re)split by chromosome
        split_([chrom], pipes.CteamRmDup_BuildIndex),   # samtools rmdup + index
        map_(pipes.CteamRealignTarget),                 # gatk indel realign target creator
        map_(pipes.CteamIndelRealigner),                # gatk indel realigner
        map_(pipes.CteamUnifiedGenotyper)               # gatk unifiedGenotyper
        # map_(pipes.CteamVariantFiltration)            # gatk variantFilter
    )
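# Worked example for the nSplit formula above (note the integer division):
#   nNodes = 64, nInput = 4  ->  min(256, 16 * max(64 // 4, 1)) = min(256, 256) = 256 splits
#   nNodes = 2,  nInput = 8  ->  min(256, 16 * max(2 // 8, 1))  = min(256, 16)  = 16 splits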
def anno(workflow, input_file, input_file_list, file_format='vcf', **kwargs):
    """
    Annotates all files in input_files

    $ genomekey anno -n 'My Annotation Workflow #1' file1.vcf file2.vcf
    """
    input_files = input_file_list.read().strip().split('\n') if input_file_list else []
    if input_file:
        input_files.append(input_file.name)
    print('annotating {0}'.format(', '.join(input_files)), file=sys.stderr)

    DAG().sequence_(
        add_([INPUT(input_file, tags={'vcf': i})
              for i, input_file in enumerate(input_files)]),
        massive_annotation,
        configure(wga_settings),
        add_run(workflow)
    )
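# input_file_list above is an open file handle whose contents are one VCF path
# per line, e.g. (hypothetical paths):
#
#   /data/run1/sample_A.vcf
#   /data/run1/sample_B.vcf
#
# Each listed path becomes one INPUT tagged with its position in the list.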
def Bam2Fastq(workflow, dag, settings, input_bams):
    if len(input_bams) == 0:
        raise WorkflowException('At least 1 BAM input required')

    dag.sequence_(
        sequence_(
            *[sequence_(
                add_([INPUT(input_bam, tags={'input': os.path.basename(input_bam)})],
                     stage_name="Load Input Bams"),
                split_([('rgid', _inputbam2rgids(input_bam))], pipes.FilterBamByRG_To_FastQ)
              ) for input_bam in input_bams],
            combine=True
        ),
        split_([('pair', [1, 2])], genomekey_scripts.SplitFastq),
        configure(settings),
        add_run(workflow, finish=False),
    ).add_(list(_splitfastq2inputs(dag)))
    return dag
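# _inputbam2rgids is defined elsewhere in the package. For illustration, a
# minimal sketch of how read-group IDs can be read from a BAM header with
# pysam (an assumption -- the project's actual helper may shell out to
# `samtools view -H` instead):
import pysam


def _inputbam2rgids_sketch(bam_path):
    """Return the list of RG IDs declared in a BAM header."""
    with pysam.AlignmentFile(bam_path, 'rb') as bam:
        header = bam.header.to_dict()  # requires a reasonably recent pysam
        return [rg['ID'] for rg in header.get('RG', [])]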
from cosmos.Workflow.models import Workflow
from cosmos.lib.ezflow.dag import DAG, split_, add_, map_, reduce_
from tools import ECHO, CAT, WC, PASTE

####################
# Workflow
####################

dag = DAG().sequence_(
    add_([ECHO(tags={'word': 'hello'}), ECHO(tags={'word': 'world'})]),
    split_([('i', [1, 2])], CAT),
    reduce_([], PASTE),
    map_(WC)
)
dag.create_dag_img('/tmp/ex.svg')

#################
# Run Workflow
#################

WF = Workflow.start('Example 2', restart=True, delete_intermediates=True)
dag.add_to_workflow(WF)
WF.run()
def pipeline(bams, test_bam=False, chromosome_only_split=False):
    # split_ tuples
    #chrom = ('chrom', range(1, 23) + ['X', 'Y', 'MT'])
    chrom = ('chrom', range(1, 23))
    glm = ('glm', ['SNP', 'INDEL'])
    dbnames = ('dbname', [
        'dbSNP135', 'CytoBand', 'Target_Scan', 'mirBase', 'Self_Chain',
        'Repeat_Masker', 'TFBS', 'Segmental_Duplications', 'SIFT', 'COSMIC',
        'PolyPhen2', 'Mutation_Taster', 'GERP', 'PhyloP', 'LRT', 'Mce46way',
        'Complete_Genomics_69', 'The_1000g_Febuary_all', 'The_1000g_April_all',
        'NHLBI_Exome_Project_euro', 'NHLBI_Exome_Project_aa',
        'NHLBI_Exome_Project_all', 'ENCODE_DNaseI_Hypersensitivity',
        'ENCODE_Transcription_Factor', 'UCSC_Gene', 'Refseq_Gene',
        'Ensembl_Gene', 'CCDS_Gene', 'HGMD_INDEL', 'HGMD_SNP', 'GWAS_Catalog'
    ])

    bam_seq = None
    for b in bams:
        header = _getHeaderInfo(b)
        sn = _getSeqName(header)
        rgid = [h[0] for h in header['rg']]

        # restrict output for testing
        if test_bam:
            sn = ['chr1']
            chrom = ('chrom', [1])
            glm = ('glm', ['SNP'])
            skip_VQSR = ('skip_VQSR', [True])
        else:
            skip_VQSR = ('skip_VQSR', [False])

        # if seqName is empty, then let's assume that the input is an unaligned bam;
        # use everything before the extension as part of the tag
        sample_name = os.path.splitext(os.path.basename(b))[0]

        if chromosome_only_split:
            # stop splitting by rgId
            bam_bwa_split = [('prevSn', sn), ('chromosome_only_split', [True])]
            indelrealign_reduce = ['bam']
        else:
            bam_bwa_split = [('rgId', rgid), ('prevSn', sn),
                             ('chromosome_only_split', [False])]
            indelrealign_reduce = ['bam', 'rgId']

        s = sequence_(
            add_([INPUT(b, tags={'bam': sample_name})], stage_name="Load BAMs"),
            split_(bam_bwa_split, pipes.Bam_To_BWA))

        if bam_seq is None:
            bam_seq = s
        else:
            bam_seq = sequence_(bam_seq, s, combine=True)

    # Previous pipeline
    pr_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.ReduceReads),
        reduce_split_(['chrom'], [glm], pipes.UnifiedGenotyper),
        reduce_(['glm'], pipes.VariantQualityScoreRecalibration, tag={'vcf': 'main'}),
        reduce_(['vcf'], pipes.CombineVariants, "Merge VCF"),
        map_(pipes.Vcf2Anno_in),
        split_([dbnames], pipes.Annotate, tag={'build': 'hg19'}),
        reduce_(['vcf'], pipes.MergeAnnotations)
    )

    # HaplotypeCaller pipeline: official for GATK 3.0
    hc_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.HaplotypeCaller),
        reduce_(['chrom'], pipes.GenotypeGVCFs),
        split_([glm, skip_VQSR], pipes.VariantQualityScoreRecalibration,
               tag={'vcf': 'main'})
    )

    return hc_pipeline
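# To make the fan-out above concrete: reduce_split_(['bam', 'rgId'], [chrom], ...)
# groups tasks by (bam, rgId) and then splits each group over the 22 autosomes.
# The sketch below (hypothetical read-group IDs) mirrors the resulting tag
# combinations for one sample with two read groups: 2 * 22 = 44 tasks.
import itertools

rg_ids = ['rg1', 'rg2']  # hypothetical read-group IDs
combos = [{'rgId': r, 'chrom': c} for r, c in itertools.product(rg_ids, range(1, 23))]
assert len(combos) == 44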
from cosmos.Workflow.models import Workflow
from cosmos.lib.ezflow.dag import DAG, add_, split_
from tools import ECHO, CAT

####################
# Workflow
####################

dag = DAG().sequence_(
    add_([ECHO(tags={'word': 'hello'}), ECHO(tags={'word': 'world'})]),
    split_([('i', [1, 2])], CAT)
)
dag.create_dag_img('/tmp/ex.svg')

#################
# Run Workflow
#################

WF = Workflow.start('Example 1', restart=True)
dag.add_to_workflow(WF)
WF.run()
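# The DAG above fans out: each of the 2 ECHO tasks is split on i in [1, 2],
# so 4 CAT tasks run in total, each inheriting its parent's 'word' tag plus
# its own 'i' value, e.g. {'word': 'hello', 'i': 1}.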
from cosmos.Workflow.models import Workflow
from cosmos.lib.ezflow.dag import DAG, split_, add_, map_, reduce_
from tools import ECHO, CAT, WC, PASTE, Sleep

####################
# Workflow
####################

dag = DAG().sequence_(
    add_([ECHO(tags={'word': 'hello'}), ECHO(tags={'word': 'world'})]),
    map_(Sleep),
    split_([('i', [1, 2])], CAT),
    reduce_([], PASTE),
    map_(WC),
)
dag.create_dag_img('/tmp/ex.svg')

#################
# Run Workflow
#################

WF = Workflow.start('Example 3', restart=True, delete_intermediates=True)
dag.add_to_workflow(WF)
WF.run()