from __future__ import print_function

import json
import os
import sys
from os.path import join

# DAG, sequence_, add_, split_, configure, add_run, INPUT, the Pipeline*
# factories, wga_settings, annovarext, genomekey_scripts, pipes, json_creator,
# s3_Bucket, WorkflowException and _splitfastq2inputs come from this package's
# other modules and its workflow framework (import paths not shown here).


def json_(workflow, input_dict, **kwargs):
    """
    Input file is a json of the following format:

    [
        {
            'chunk': '001',
            'library': 'LIB-1216301779A',
            'sample_name': '1216301779A',
            'platform': 'ILLUMINA',
            'platform_unit': 'C0MR3ACXX.001',
            'pair': 0,  # 0 or 1
            'path': '/path/to/fastq'
        },
        {..}
    ]
    """
    input_json = json.load(open(input_dict, 'r'))
    inputs = [INPUT(name='fastq.gz', path=i['path'], fmt='fastq.gz', tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline(),
        configure(wga_settings),
        add_run(workflow))
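
# Hedged example of a single record accepted by json_() above; all values are
# illustrative placeholders. Note that 'chunk' must be a quoted string in the
# actual file: a bare 001 is not valid JSON.
EXAMPLE_FASTQ_RECORD = {
    'chunk': '001',
    'library': 'LIB-1216301779A',
    'sample_name': '1216301779A',
    'platform': 'ILLUMINA',
    'platform_unit': 'C0MR3ACXX.001',
    'pair': 0,  # 0 or 1, per the docstring above
    'path': '/path/to/fastq',
}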
def bam(workflow, input_bam, input_bam_list, **kwargs):
    """
    Input file is a bam with properly annotated readgroups.

    *** Note that this workflow assumes the bam header is ***
    *** also properly annotated with the correct readgroups! ***

    Example usage:
        $ genomekey bam -n 'Bam to VCF Workflow 1' input_bam.bam

        $ echo "dir/sample1.bam" > /tmp/bam.list
        $ echo "dir/sample2.bam" >> /tmp/bam.list
        $ genomekey bam -n 'Bam to VCF 2' -li /tmp/bam.list
    """
    # capture and pedigree_file are used in main()
    input_bams = input_bam_list.read().strip().split('\n') if input_bam_list else []
    if input_bam:
        input_bams.append(input_bam.name)

    dag = DAG(ignore_stage_name_collisions=True)
    Bam2Fastq(workflow, dag, wga_settings, input_bams)
    dag.sequence_(
        Pipeline(),
        configure(wga_settings),
        add_run(workflow))
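
def _bam_has_readgroups(bam_path):
    """Hedged helper sketch (not part of the original module): verify that a
    BAM header carries @RG lines before running the bam workflow, since the
    workflow above assumes properly annotated readgroups. Assumes samtools is
    on PATH; the pipeline itself does not call this."""
    import subprocess
    header = subprocess.check_output(['samtools', 'view', '-H', bam_path]).decode()
    return any(line.startswith('@RG') for line in header.splitlines())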
def json_somatic(workflow, input_dict, **kwargs):
    """
    Input file is a json of the following format:

    [
        {
            "chunk": "001",
            "library": "LIB-1216301779A",
            "platform": "ILLUMINA",
            "platform_unit": "C0MR3ACXX.001",
            "rgid": "BC18-06-2013",
            "sample_name": "BC18-06-2013LyT_S5_L001",
            "pair": "1",
            "path": "/path/to/fastq.gz",
            "sample_type": "normal or tumor"
        },
        {..}
    ]
    """
    input_json = json.load(open(input_dict, 'r'))
    inputs = [INPUT(name='fastq.gz', path=i['path'], fmt='fastq.gz', tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline_Somatic(),
        configure(wga_settings),
        add_run(workflow))
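
def _check_sample_types(records):
    """Hedged sketch (illustrative, not called by json_somatic): fail fast if
    any record's sample_type is not the 'normal' or 'tumor' value the somatic
    pipeline expects."""
    bad = [r for r in records if r.get('sample_type') not in ('normal', 'tumor')]
    if bad:
        raise ValueError('sample_type must be "normal" or "tumor": {0!r}'.format(bad))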
def json_local(workflow, input_dict, **kwargs):
    """
    Input is a folder where each file is a json of the following format:

    [
        {
            'library': 'LIB-1216301779A',
            'sample_name': '1216301779A',
            'platform': 'ILLUMINA',
            'platform_unit': 'C0MR3ACXX.001',
            'pair': 1,
            'path': '/path/to/fastq'
        },
        {
            'library': 'LIB-1216301779A',
            'sample_name': '1216301779A',
            'platform': 'ILLUMINA',
            'platform_unit': 'C0MR3ACXX.001',
            'pair': 2,
            'path': '/path/to/fastq'
        }
    ]
    """
    for filename in os.listdir(input_dict):
        # join() instead of raw concatenation, so input_dict does not need a
        # trailing slash
        json_path = os.path.join(input_dict, filename)
        print(json_path)
        input_json = json.load(open(json_path, 'r'))
        inputs = [INPUT(name='fastq.gz', path=i['path'], fmt='fastq.gz', tags=i,
                        stage_name='Load Input Fastqs')
                  for i in input_json]
        for i in inputs:
            print(i)

        DAG(ignore_stage_name_collisions=True).sequence_(
            add_(inputs),
            Pipeline_local(),
            configure(wga_settings),
            add_run(workflow))
def downdbs(workflow, **kwargs):
    """
    Download all annotation databases
    """
    DAG().sequence_(
        add_([annovarext.DownDB(tags={'build': 'hg19', 'dbname': db})
              for db in annovarext.get_db_names()]),
        configure(wga_settings),
        add_run(workflow))
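
def downdb_one(workflow, dbname, **kwargs):
    """Hedged variant sketch (not in the original CLI): download a single
    annotation database instead of all of them. dbname (e.g. 'refGene') is
    illustrative and should be one of annovarext.get_db_names()."""
    DAG().sequence_(
        add_([annovarext.DownDB(tags={'build': 'hg19', 'dbname': dbname})]),
        configure(wga_settings),
        add_run(workflow))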
def fastq_(workflow, input_dict, output_dict, output_json, **kwargs):
    # json_creator.json_out writes a manifest whose records carry the fastq
    # location under 'gz_path' (not 'path' as in the other json_* commands)
    json_fastq_to_split = json_creator.json_out(input_dict, output_dict)
    input_json = json.load(open(json_fastq_to_split, 'r'))
    inputs = [INPUT(name='fastq.gz', path=i['gz_path'], fmt='fastq.gz', tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline_split(),
        configure(wga_settings),
        add_run(workflow))
def upload_(workflow, bucket, project, out_dict, **kwargs):
    project_folder = join(out_dict, project.replace(" ", "_"))
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)

    # s3_Bucket.getList writes a manifest of the bucket's fastqs; as in
    # fastq_(), records use 'gz_path' for the file location
    json_fastq_to_upload = s3_Bucket.getList(bucket, project, out_dict)
    input_json = json.load(open(json_fastq_to_upload, 'r'))
    inputs = [INPUT(name='fastq.gz', path=i['gz_path'], fmt='fastq.gz', tags=i,
                    stage_name='Load Input Fastqs')
              for i in input_json]

    DAG(ignore_stage_name_collisions=True).sequence_(
        add_(inputs),
        Pipeline_upload(),
        configure(wga_settings),
        add_run(workflow))
def anno(workflow, input_file, input_file_list, file_format='vcf', **kwargs):
    """
    Annotates all input files

    $ genomekey anno -n 'My Annotation Workflow #1' file1.vcf file2.vcf
    """
    input_files = input_file_list.read().strip().split('\n') if input_file_list else []
    if input_file:
        input_files.append(input_file.name)

    print('annotating {0}'.format(', '.join(input_files)), file=sys.stderr)

    DAG().sequence_(
        add_([INPUT(input_file, tags={'vcf': i})
              for i, input_file in enumerate(input_files)]),
        massive_annotation,
        configure(wga_settings),
        add_run(workflow))
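
# Hedged usage sketch for anno() with a list file, mirroring the bam()
# examples (the -li flag is assumed to match the bam subcommand; verify
# against the CLI definition before relying on it):
#     $ echo "dir/sample1.vcf" > /tmp/vcf.list
#     $ echo "dir/sample2.vcf" >> /tmp/vcf.list
#     $ genomekey anno -n 'Annotate VCFs' -li /tmp/vcf.list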
def Bam2Fastq(workflow, dag, settings, input_bams):
    if len(input_bams) == 0:
        raise WorkflowException('At least 1 BAM input required')

    dag.sequence_(
        sequence_(
            *[sequence_(
                add_([INPUT(input_bam, tags={'input': os.path.basename(input_bam)})],
                     stage_name="Load Input Bams"),
                split_([('rgid', _inputbam2rgids(input_bam))],
                       pipes.FilterBamByRG_To_FastQ))
              for input_bam in input_bams],
            combine=True),
        split_([('pair', [1, 2])], genomekey_scripts.SplitFastq),
        configure(settings),
        add_run(workflow, finish=False),
    ).add_(list(_splitfastq2inputs(dag)))
    return dag
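
def _inputbam2rgids_sketch(input_bam):
    """Hedged sketch of the _inputbam2rgids helper used above (the real
    implementation lives elsewhere in genomekey and may differ): extract
    read-group IDs from a BAM header, one ID per @RG line. Assumes samtools
    is on PATH."""
    import subprocess
    header = subprocess.check_output(['samtools', 'view', '-H', input_bam]).decode()
    return [field[3:]
            for line in header.splitlines() if line.startswith('@RG')
            for field in line.split('\t') if field.startswith('ID:')]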