def create_destruct_workflow(
    bam_filenames,
    breakpoint_table,
    breakpoint_library_table,
    breakpoint_read_table,
    config,
    ref_data_dir,
    raw_data_dir=None,
):
    """Build the top-level destruct workflow starting from bam files.

    Extracts discordant reads per library from each bam and hands them to
    the fastq-based subworkflow that does the breakpoint calling.

    Args:
        bam_filenames: dict of library id -> bam filename.
        breakpoint_table: output breakpoint table filename.
        breakpoint_library_table: output per-library breakpoint table filename.
        breakpoint_read_table: output supporting-read table filename.
        config: destruct configuration dict (merged with defaults below).
        ref_data_dir: reference data directory.
        raw_data_dir: optional directory; when given, per-library extracted
            reads/stats are persisted there so a rerun can reuse them instead
            of re-extracting from the bams.

    Returns:
        pypeliner.workflow.Workflow
    """
    # Per-library intermediates: persist under raw_data_dir when caching is
    # requested, otherwise keep them as pipeline temporaries.
    basenames = (
        'stats.txt',
        'reads1.fq.gz',
        'reads2.fq.gz',
        'sample1.fq.gz',
        'sample2.fq.gz',
    )
    if raw_data_dir is not None:
        per_library_files = [
            mgd.File(os.path.join(raw_data_dir, '{bylibrary}_' + basename), 'bylibrary')
            for basename in basenames
        ]
    else:
        per_library_files = [
            mgd.TempFile(basename, 'bylibrary')
            for basename in basenames
        ]
    stats_file, reads_1_file, reads_2_file, sample_1_file, sample_2_file = per_library_files

    config = destruct.defaultconfig.get_config(ref_data_dir, config)

    workflow = pypeliner.workflow.Workflow()

    # One 'bylibrary' axis chunk per input bam.
    workflow.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(bam_filenames.keys()),
    )

    # Extract discordant reads, read stats, and a random read sample from
    # each bam file.
    workflow.commandline(
        name='bamdisc',
        axes=('bylibrary', ),
        ctx={'io': 1, 'mem': 8},
        args=(
            'destruct_bamdiscordantfastq',
            '-r',
            '-c', config['bam_max_soft_clipped'],
            '-f', config['bam_max_fragment_length'],
            '-b', mgd.InputFile('bam', 'bylibrary', fnames=bam_filenames),
            '-s', stats_file.as_output(),
            '--fastq1', reads_1_file.as_output(),
            '--fastq2', reads_2_file.as_output(),
            '-t', mgd.TempSpace('bamdisc.tempspace', 'bylibrary'),
            '-n', config['num_read_samples'],
            '--sample1', sample_1_file.as_output(),
            '--sample2', sample_2_file.as_output(),
        ),
    )

    # Delegate breakpoint calling to the fastq-based workflow.
    workflow.subworkflow(
        name='destruct_fastq',
        func=create_destruct_fastq_workflow,
        args=(
            reads_1_file.as_input(),
            reads_2_file.as_input(),
            sample_1_file.as_input(),
            sample_2_file.as_input(),
            stats_file.as_input(),
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
            mgd.OutputFile(breakpoint_read_table),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_data_dir,
        },
    )

    return workflow
def create_destruct_fastq_workflow(
    fastq1_filenames,
    fastq2_filenames,
    sample1_filenames,
    sample2_filenames,
    stats_filenames,
    breakpoint_table,
    breakpoint_library_table,
    breakpoint_read_table,
    config,
    ref_data_dir,
    raw_data_dir=None,
):
    """Create the core destruct workflow operating on discordant-read fastqs.

    Realigns per-library discordant reads, clusters mate pairs per chromosome
    pair, predicts breakpoints from split reads, resolves multi-mapping reads
    via set cover, and tabulates the final results.

    Args:
        fastq1_filenames: dict of library id -> discordant reads fastq 1.
        fastq2_filenames: dict of library id -> discordant reads fastq 2.
        sample1_filenames: dict of library id -> sampled reads fastq 1.
        sample2_filenames: dict of library id -> sampled reads fastq 2.
        stats_filenames: dict of library id -> read stats file.
        breakpoint_table: output breakpoint table filename.
        breakpoint_library_table: output per-library breakpoint table filename.
        breakpoint_read_table: output supporting-read table filename.
        config: destruct configuration dict.
        ref_data_dir: reference data directory.
        raw_data_dir: NOTE(review): accepted but not referenced anywhere in
            this function body — confirm whether it is still needed.

    Returns:
        pypeliner.workflow.Workflow
    """
    workflow = pypeliner.workflow.Workflow()

    # Set the library ids: one 'bylibrary' axis chunk per input library.
    workflow.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(fastq1_filenames.keys()),
    )

    # Compute per-library fragment length statistics from the stats files.
    workflow.transform(
        name='readstats',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.read_stats',
        ret=mgd.TempOutputObj('stats', 'bylibrary'),
        args=(
            mgd.InputFile('stats.txt', 'bylibrary', fnames=stats_filenames),
            config['fragment_length_num_stddevs'],
        ),
    )

    # Align a sample of reads and calculate alignment statistics
    workflow.transform(
        name='prepseed_sample',
        axes=('bylibrary', ),
        ctx=medmem,
        func='destruct.tasks.prepare_seed_fastq',
        args=(
            mgd.InputFile('sample1.fq.gz', 'bylibrary', fnames=sample1_filenames),
            mgd.InputFile('sample2.fq.gz', 'bylibrary', fnames=sample2_filenames),
            36,  # seed length in bp — presumably; confirm against prepare_seed_fastq
            mgd.TempOutputFile('sample.seed', 'bylibrary'),
        ),
    )

    # Bowtie-align the sampled seeds, then score true alignments to derive
    # alignment score statistics for this library.
    workflow.commandline(
        name='bwtrealign_sample',
        axes=('bylibrary', ),
        ctx=medmem,
        args=(
            'bowtie',
            config['genome_fasta'],
            mgd.TempInputFile('sample.seed', 'bylibrary'),
            '--chunkmbs', '512',
            '-k', '1000',
            '-m', '1000',
            '--strata',
            '--best',
            '-S',
            '|',
            'destruct_aligntrue',
            '-a', '-',
            '-1', mgd.InputFile('sample1.fq.gz', 'bylibrary', fnames=sample1_filenames),
            '-2', mgd.InputFile('sample2.fq.gz', 'bylibrary', fnames=sample2_filenames),
            '-r', config['genome_fasta'],
            '-g', config['gap_score'],
            '-x', config['mismatch_score'],
            '-m', config['match_score'],
            '--flmin', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
            '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '-s', mgd.TempOutputFile('samples.align.true', 'bylibrary'),
        ),
    )

    # Summarize the true-alignment scores into per-library score statistics.
    workflow.transform(
        name='scorestats',
        axes=('bylibrary', ),
        ctx=medmem,
        func='destruct.score_stats.create_score_stats',
        args=(
            mgd.TempInputFile('samples.align.true', 'bylibrary'),
            config['match_score'],
            mgd.TempOutputFile('score.stats', 'bylibrary'),
        ),
    )

    # Split discordant fastqs and align
    workflow.transform(
        name='splitfastq1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.split_fastq',
        args=(
            mgd.InputFile('reads1.fq.gz', 'bylibrary', fnames=fastq1_filenames),
            int(config['reads_per_split']),
            mgd.TempOutputFile('reads1', 'bylibrary', 'byread'),
        ),
    )

    workflow.transform(
        name='splitfastq2',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.split_fastq',
        args=(
            mgd.InputFile('reads2.fq.gz', 'bylibrary', fnames=fastq2_filenames),
            int(config['reads_per_split']),
            # axes_origin=[] so the 'byread' axis is defined by splitfastq1
            # and reused here rather than created twice.
            mgd.TempOutputFile('reads2', 'bylibrary', 'byread', axes_origin=[]),
        ),
    )

    workflow.transform(
        name='prepseed',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        func='destruct.tasks.prepare_seed_fastq',
        args=(
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            36,  # seed length in bp — presumably; confirm against prepare_seed_fastq
            mgd.TempOutputFile('reads.seed', 'bylibrary', 'byread'),
        ),
    )

    # Realign each chunk of discordant reads, producing spanning and split
    # alignment files per (library, read chunk).
    workflow.commandline(
        name='bwtrealign',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        args=(
            'bowtie',
            config['genome_fasta'],
            mgd.TempInputFile('reads.seed', 'bylibrary', 'byread'),
            '--chunkmbs', '512',
            '-k', '1000',
            '-m', '1000',
            '--strata',
            '--best',
            '-S',
            '|',
            'destruct_realign2',
            '-l', mgd.TempInputObj('library_id', 'bylibrary'),
            '-a', '-',
            '-1', mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            '-2', mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            '-r', config['genome_fasta'],
            '-g', config['gap_score'],
            '-x', config['mismatch_score'],
            '-m', config['match_score'],
            '--flmin', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
            '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '--tchimer', config['chimeric_threshold'],
            '--talign', config['alignment_threshold'],
            '--pchimer', config['chimeric_prior'],
            '--tvalid', config['readvalid_threshold'],
            '-z', mgd.TempInputFile('score.stats', 'bylibrary'),
            '--span', mgd.TempOutputFile('spanning.alignments', 'bylibrary', 'byread'),
            '--split', mgd.TempOutputFile('split.alignments', 'bylibrary', 'byread'),
        ),
    )

    # Concatenate per-chunk spanning alignments into one file per library.
    workflow.transform(
        name='merge_spanning_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_files_by_line',
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('spanning.alignments_1', 'bylibrary'),
        ),
    )

    # Filter reads (e.g. against satellite regions) per library.
    workflow.commandline(
        name='filterreads',
        axes=('bylibrary', ),
        ctx=lowmem,
        args=(
            'destruct_filterreads',
            '-n', '2',
            '-a', mgd.TempInputFile('spanning.alignments_1', 'bylibrary'),
            '-r', config['satellite_regions'],
            '>', mgd.TempOutputFile('spanning.alignments', 'bylibrary'),
        ),
    )

    # Concatenate per-chunk split alignments into one file per library.
    workflow.transform(
        name='merge_split_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_files_by_line',
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('split.alignments', 'bylibrary'),
        ),
    )

    # Merge libraries into single spanning/split alignment files, tagged by
    # library id.
    workflow.transform(
        name='merge_spanning_2',
        ctx=lowmem,
        func='destruct.tasks.merge_alignment_files',
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary'),
            mgd.TempOutputFile('spanning.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_split_2',
        ctx=lowmem,
        func='destruct.tasks.merge_alignment_files',
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary'),
            mgd.TempOutputFile('split.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    # Cluster spanning reads
    # One 'bychromarg' axis chunk per chromosome-pair argument.
    workflow.setobj(
        obj=mgd.TempOutputObj('chrom.args', 'bychromarg'),
        value=destruct.tasks.generate_chromosome_args(config['chromosomes']),
    )

    # Write per-library stats to a table consumed by the clusterer.
    workflow.transform(
        name='write_stats_table',
        ctx=lowmem,
        func='destruct.tasks.write_stats_table',
        args=(
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.TempInputObj('stats', 'bylibrary'),
            mgd.TempOutputFile('libstats.tsv'),
        ),
    )

    workflow.commandline(
        name='cluster',
        axes=('bychromarg', ),
        ctx=medmem,
        args=(
            'destruct_mclustermatepairs',
            '-a', mgd.TempInputFile('spanning.alignments'),
            '-s', mgd.TempInputFile('libstats.tsv'),
            '-c', mgd.TempOutputFile('clusters', 'bychromarg'),
            mgd.TempInputObj('chrom.args', 'bychromarg'),
            '--clustmin', config['cluster_readcount_threshold'],
            '--fragmax', config['fragment_length_max'],
        ),
    )

    # Predict breakpoints from split reads
    workflow.transform(
        name='predict_breaks',
        axes=('bychromarg', ),
        ctx=medmem,
        func='destruct.predict_breaks.predict_breaks',
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('spanning.alignments'),
            mgd.TempInputFile('split.alignments'),
            mgd.TempOutputFile('breakpoints_2', 'bychromarg'),
        ),
    )

    # Merge per-chromosome clusters and breakpoints into single files.
    workflow.transform(
        name='merge_clusters',
        ctx=lowmem,
        func='destruct.tasks.merge_clusters',
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('breakpoints_2', 'bychromarg'),
            mgd.TempOutputFile('clusters'),
            mgd.TempOutputFile('breakpoints_2'),
            mgd.TempOutputFile('merge_clusters.debug'),
        ),
    )

    # Realign reads to breakpoints
    workflow.commandline(
        name='realigntobreaks',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        args=(
            'destruct_realigntobreaks2',
            '-r', config['genome_fasta'],
            '-b', mgd.TempInputFile('breakpoints_2'),
            '-c', mgd.TempInputFile('clusters'),
            '-g', config['gap_score'],
            '-x', config['mismatch_score'],
            '-m', config['match_score'],
            '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '--span', mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            '-1', mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            '-2', mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            '--realignments', mgd.TempOutputFile('realignments', 'bylibrary', 'byread'),
        ),
    )

    # Calculate likelihoods based on realignments
    workflow.transform(
        name='calculate_realignment_likelihoods',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        func='destruct.predict_breaks.calculate_realignment_likelihoods',
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempInputFile('realignments', 'bylibrary', 'byread'),
            mgd.TempInputFile('score.stats', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary', 'byread'),
            config['match_score'],
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_mean'),
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_stddev'),
        ),
    )

    # Merge likelihoods, keeping files sorted on column 1 (the '1' argument).
    workflow.transform(
        name='merge_likelihoods_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_sorted_files_by_line',
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary', 'byread'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary'),
            mgd.TempSpace('merge_likelihoods_1_temp', 'bylibrary'),
            '1',
        ),
    )

    workflow.transform(
        name='merge_likelihoods_2',
        ctx=lowmem,
        func='destruct.tasks.merge_sorted_files_by_line',
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2'),
            mgd.TempSpace('merge_likelihoods_2_temp'),
            '1',
        ),
    )

    # Set cover for multi mapping reads
    workflow.transform(
        name='calc_weights',
        ctx=medmem,
        func='destruct.predict_breaks.calculate_cluster_weights',
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('cluster_weights'),
        ),
    )

    workflow.commandline(
        name='setcover',
        ctx=medmem,
        args=(
            'destruct_setcover',
            '-c', mgd.TempInputFile('clusters'),
            '-w', mgd.TempInputFile('cluster_weights'),
            '-a', mgd.TempOutputFile('clusters_setcover'),
        ),
    )

    # Select cluster based on setcover
    workflow.transform(
        name='select_clusters',
        ctx=medmem,
        func='destruct.predict_breaks.select_clusters',
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('breakpoints_1'),
            mgd.TempInputFile('likelihoods_2'),
            mgd.TempOutputFile('likelihoods_1'),
        ),
    )

    # Select prediction based on max likelihood
    workflow.transform(
        name='select_predictions',
        ctx=himem,
        func='destruct.predict_breaks.select_predictions',
        args=(
            mgd.TempInputFile('breakpoints_1'),
            mgd.TempOutputFile('breakpoints'),
            mgd.TempInputFile('likelihoods_1'),
            mgd.TempOutputFile('likelihoods'),
            config['mate_score_threshold'],
            config['template_length_min_threshold'],
            config['min_alignment_log_likelihood'],
        ),
    )

    # Optionally tabulate supporting reads
    workflow.transform(
        name='tabreads',
        ctx=medmem,
        func='destruct.tasks.tabulate_reads',
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.InputFile('reads1.fq.gz', 'bylibrary', fnames=fastq1_filenames),
            mgd.InputFile('reads2.fq.gz', 'bylibrary', fnames=fastq2_filenames),
            mgd.TempOutputFile('breakreads.table.unsorted'),
        ),
    )

    # Numeric sort of the supporting-read table into the final output.
    workflow.commandline(
        name='sortreads',
        ctx=medmem,
        args=(
            'sort', '-n',
            mgd.TempInputFile('breakreads.table.unsorted'),
            '>', mgd.OutputFile(breakpoint_read_table),
        ),
    )

    # Tabulate results
    workflow.transform(
        name='tabulate',
        ctx=himem,
        func='destruct.tasks.tabulate_results',
        args=(
            mgd.TempInputFile('breakpoints'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            config['genome_fasta'],
            config['gtf_filename'],
            config['dgv_filename'],
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
        ),
    )

    return workflow
def generate_bam(
    simulation_params,
    chromosomes,
    include_nonchromosomal,
    simulated_bam_filename,
    genome_fasta_filename,
    simulated_table_filename,
    raw_data_dir,
):
    """Build a workflow that simulates breakpoints and produces an aligned bam.

    Creates a genome subset, simulates breakpoint reads, concatenates
    concordant and discordant fastqs, aligns them with bwa, and sorts/indexes
    the resulting bam.

    Args:
        simulation_params: dict of breakpoint-simulation parameters.
        chromosomes: chromosomes to include in the generated genome.
        include_nonchromosomal: whether to include non-chromosomal sequences.
        simulated_bam_filename: output sorted/indexed bam filename.
        genome_fasta_filename: output genome fasta filename.
        simulated_table_filename: output table of simulated breakpoints.
        raw_data_dir: directory receiving intermediate fasta/fastq files.

    Returns:
        pypeliner.workflow.Workflow
    """
    # Intermediate file locations under the raw data directory.
    simulated_fasta_path = os.path.join(raw_data_dir, 'simulated.fasta')
    simulated_fastq_1_path = os.path.join(raw_data_dir, 'simulated.1.fastq')
    simulated_fastq_2_path = os.path.join(raw_data_dir, 'simulated.2.fastq')

    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    # Expose the simulation inputs as pipeline objects.
    for obj_name, obj_value in (
        ('simulation.params', simulation_params),
        ('chromosomes', chromosomes),
        ('include_nonchromosomal', include_nonchromosomal),
    ):
        workflow.setobj(mgd.TempOutputObj(obj_name), obj_value)

    # Build the genome fasta from the requested chromosomes.
    workflow.transform(
        name='create_genome',
        func=destruct.benchmark.destruct_test.create_genome,
        args=(
            mgd.TempInputObj('chromosomes'),
            mgd.TempInputObj('include_nonchromosomal'),
            mgd.OutputFile(genome_fasta_filename),
        ),
    )

    # Simulate breakpoints, emitting a rearranged fasta, the truth table,
    # and concordant/discordant read pairs.
    workflow.transform(
        name='create_sim',
        func=destruct.benchmark.create_breakpoint_simulation.create,
        args=(
            mgd.TempInputObj('simulation.params'),
            mgd.InputFile(genome_fasta_filename),
            mgd.OutputFile(simulated_fasta_path),
            mgd.OutputFile(simulated_table_filename),
            mgd.TempOutputFile('concordant.1.fastq'),
            mgd.TempOutputFile('concordant.2.fastq'),
            mgd.TempOutputFile('discordant.1.fastq'),
            mgd.TempOutputFile('discordant.2.fastq'),
        ),
    )

    # Combine concordant and discordant reads into one fastq per end.
    workflow.commandline(
        name='cat1',
        args=(
            'cat',
            mgd.TempInputFile('concordant.1.fastq'),
            mgd.TempInputFile('discordant.1.fastq'),
            '>',
            mgd.OutputFile(simulated_fastq_1_path),
        ),
    )

    workflow.commandline(
        name='cat2',
        args=(
            'cat',
            mgd.TempInputFile('concordant.2.fastq'),
            mgd.TempInputFile('discordant.2.fastq'),
            '>',
            mgd.OutputFile(simulated_fastq_2_path),
        ),
    )

    # Align the simulated reads with bwa.
    workflow.subworkflow(
        name='bwa_align',
        func=destruct.benchmark.align.bwa.workflow.bwa_align_workflow,
        args=(
            mgd.InputFile(genome_fasta_filename),
            mgd.InputFile(simulated_fastq_1_path),
            mgd.InputFile(simulated_fastq_2_path),
            mgd.TempOutputFile('simulated.unsorted.bam'),
        ),
    )

    # Sort and index the final bam.
    workflow.transform(
        name='samtools_sort_index',
        func=destruct.benchmark.destruct_test.samtools_sort_index,
        args=(
            mgd.TempInputFile('simulated.unsorted.bam'),
            mgd.OutputFile(simulated_bam_filename),
        ),
    )

    return workflow
mgd.TempOutputFile('tumour.unspiked.bam'), 0.5, ), ) workflow.commandline( name='simulate', args=( 'destruct_bamextractsimreads', '-b', mgd.InputFile(source_bam), '-r', mgd.InputFile(genome_fasta), '-s', mgd.InputFile(os.path.join(args['results_dir'], 'simulated.fa')), '-f', mgd.TempInputObj('simulation.params').extract( lambda a: a['coverage_fraction']), '-1', mgd.OutputFile( os.path.join(args['results_dir'], 'simulated.1.fastq')), '-2', mgd.OutputFile( os.path.join(args['results_dir'], 'simulated.2.fastq')), ), ) workflow.subworkflow( name='bwa_align', func=destruct.benchmark.align.bwa.workflow.bwa_align_workflow, args=(
def create_bwa_mappability_workflow(config, ref_data_dir, **kwargs):
    """Build a workflow computing a genome mappability track via bwa.

    Generates all k-mers from the genome, aligns them back with bwa aln/samse
    in parallel chunks, converts the alignments to bedgraph, and merges the
    chunks into the final mappability file.

    Args:
        config: remixt configuration dict.
        ref_data_dir: reference data directory.
        **kwargs: accepted for interface compatibility; unused here.

    Returns:
        pypeliner.workflow.Workflow
    """
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 8})

    # Resolve parameters and filenames from the remixt config.
    kmer_length = remixt.config.get_param(config, 'mappability_length')
    genome_fasta_filename = remixt.config.get_filename(config, ref_data_dir, 'genome_fasta')
    mappability_path = remixt.config.get_filename(config, ref_data_dir, 'mappability')

    # Number of k-mers per parallel alignment chunk.
    kmers_per_split = 4000000

    # Enumerate every k-mer of the configured length from the genome.
    workflow.transform(
        name='create_kmers',
        func=remixt.mappability.tasks.create_kmers,
        args=(
            mgd.InputFile(genome_fasta_filename),
            kmer_length,
            mgd.TempOutputFile('kmers'),
        ),
    )

    # Split the k-mer list into chunks along the 'bykmer' axis.
    workflow.transform(
        name='split_kmers',
        func=remixt.mappability.tasks.split_file_byline,
        args=(
            mgd.TempInputFile('kmers'),
            kmers_per_split,
            mgd.TempOutputFile('kmers', 'bykmer'),
        ),
    )

    # bwa aln then samse per chunk to produce SAM alignments.
    workflow.commandline(
        name='bwa_aln_kmers',
        axes=('bykmer', ),
        args=(
            'bwa', 'aln',
            mgd.InputFile(genome_fasta_filename),
            mgd.TempInputFile('kmers', 'bykmer'),
            '>',
            mgd.TempOutputFile('sai', 'bykmer'),
        ),
    )

    workflow.commandline(
        name='bwa_samse_kmers',
        axes=('bykmer', ),
        args=(
            'bwa', 'samse',
            mgd.InputFile(genome_fasta_filename),
            mgd.TempInputFile('sai', 'bykmer'),
            mgd.TempInputFile('kmers', 'bykmer'),
            '>',
            mgd.TempOutputFile('alignments', 'bykmer'),
        ),
    )

    # Convert each chunk's alignments into a bedgraph.
    workflow.transform(
        name='create_bedgraph',
        axes=('bykmer', ),
        func=remixt.mappability.tasks.create_bedgraph,
        args=(
            mgd.TempInputFile('alignments', 'bykmer'),
            mgd.TempOutputFile('bedgraph', 'bykmer'),
        ),
    )

    # Concatenate the per-chunk bedgraphs into the final mappability file.
    workflow.transform(
        name='merge_bedgraph',
        func=remixt.mappability.tasks.merge_files_by_line,
        args=(
            mgd.TempInputFile('bedgraph', 'bykmer'),
            mgd.OutputFile(mappability_path),
        ),
    )

    return workflow
def bwa_align_workflow(
    genome_fasta_filename,
    fastq_1_filename,
    fastq_2_filename,
    bam_filename,
    reads_per_job=int(1e6),
    read_group_str=None,
):
    """Build a workflow aligning paired-end fastqs with bwa mem.

    Splits the fastqs into chunks of reads_per_job reads, aligns each chunk
    in parallel, converts to bam via samtools, and merges the chunks.

    Args:
        genome_fasta_filename: reference genome fasta; a bwa index
            (genome_fasta_filename + '.bwt') and a '.fai' must already exist.
        fastq_1_filename: end-1 reads fastq.
        fastq_2_filename: end-2 reads fastq.
        bam_filename: output merged bam filename.
        reads_per_job: reads per parallel alignment chunk.
        read_group_str: bwa -R read group string; defaults to a generic
            sample_1 read group when None.

    Returns:
        pypeliner.workflow.Workflow

    Raises:
        FileNotFoundError: if the bwa '.bwt' index for the genome is missing.
    """
    if read_group_str is None:
        # Escaped backslashes so bwa receives a literal '\t' separator.
        read_group_str = '@RG\\tID:S1\\tSM:sample_1'

    lowmem = {'mem': 1}
    himem = {'mem': 8}

    # Fail fast before building the workflow rather than letting every align
    # job die later.  FileNotFoundError replaces the previous generic
    # Exception; callers catching Exception are unaffected.
    if not os.path.exists(genome_fasta_filename + '.bwt'):
        raise FileNotFoundError('No index for ' + genome_fasta_filename)

    workflow = pypeliner.workflow.Workflow()

    # Split each fastq into chunks along the 'byread' axis.
    workflow.transform(
        name='split1',
        ctx=lowmem,
        func=destruct.benchmark.align.bwa.tasks.split_fastq,
        args=(
            mgd.InputFile(fastq_1_filename),
            reads_per_job,
            mgd.TempOutputFile('fastq1', 'byread'),
        ),
    )

    workflow.transform(
        name='split2',
        ctx=lowmem,
        func=destruct.benchmark.align.bwa.tasks.split_fastq,
        args=(
            mgd.InputFile(fastq_2_filename),
            reads_per_job,
            # axes_origin=[] reuses the 'byread' axis defined by split1.
            mgd.TempOutputFile('fastq2', 'byread', axes_origin=[]),
        ),
    )

    # Align each chunk with bwa mem and pipe straight into samtools to bam.
    workflow.commandline(
        name='align',
        axes=('byread', ),
        ctx=himem,
        args=(
            'bwa', 'mem',
            '-R', read_group_str,
            genome_fasta_filename,
            mgd.TempInputFile('fastq1', 'byread'),
            mgd.TempInputFile('fastq2', 'byread'),
            '|',
            'samtools', 'view', '-bt',
            genome_fasta_filename + '.fai',
            '-',
            '-o', mgd.TempOutputFile('bam', 'byread'),
        ),
    )

    # Merge the per-chunk bams into the final output.
    workflow.transform(
        name='merge_bams',
        ctx=lowmem,
        func=destruct.benchmark.align.bwa.tasks.merge_bam,
        args=(
            mgd.TempInputFile('bam', 'byread'),
            mgd.OutputFile(bam_filename),
        ),
    )

    return workflow