Example #1
def make_variant_dict_parallel(vcf_files, bam_files, sample_names, bed_region,
                               nthreads):

    dirname = tempfile.gettempdir()  # temp directory for intermediate files (could also be os.curdir)
    partial_regions = split_regions.split(
        bed_region, os.path.join(dirname,
                                 uuid.uuid4().hex + '.bed'), nthreads)

    pool = multiprocessing.Pool(nthreads)

    map_args = map(lambda bed_i: (vcf_files, bed_i), partial_regions)
    partitioned_vcf_files = pool.map_async(intersect_multiple_vcf_files,
                                           map_args).get()

    map_args = map(
        lambda partial_vcf_files: (partial_vcf_files, bam_files, sample_names),
        partitioned_vcf_files)
    variant_dictionaries = pool.map_async(make_variant_dict, map_args).get()

    pool.close()

    # partitioned_vcf_files is a list of lists of VCF files; flatten it so the
    # temporary per-region files can be removed below.
    vcfs_to_be_deleted = []
    for list_of_vcfs in partitioned_vcf_files:
        vcfs_to_be_deleted.extend(list_of_vcfs)

    for file_i in partial_regions + vcfs_to_be_deleted:
        os.remove(file_i)

    return variant_dictionaries
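
A minimal usage sketch for the function above, assuming split_regions, intersect_multiple_vcf_files, and make_variant_dict come from the surrounding package; all file paths and sample names below are hypothetical:

# Hypothetical inputs -- point these at real VCF/BAM files.
vcf_files = ['mutect2.vcf', 'strelka2.vcf']
bam_files = ['tumor.bam', 'normal.bam']
sample_names = ['TUMOR', 'NORMAL']

# Fan the work out over 4 worker processes across regions.bed.
variant_dicts = make_variant_dict_parallel(vcf_files, bam_files, sample_names,
                                           'regions.bed', nthreads=4)
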
Example #2
def splitRegions(nthreads, outfiles, bed=None, fai=None):

    assert bed or fai
    if fai and not bed:
        bed = split_bed.fai2bed(fai, outfiles)

    writtenBeds = split_bed.split(bed, outfiles, nthreads)

    return writtenBeds
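
A short usage sketch, assuming split_bed is importable as in the function above; either a BED file or a FASTA .fai index must be supplied (paths are hypothetical):

# Split targets.bed into 8 roughly equal pieces written next to th.bed.
beds = splitRegions(8, '/tmp/work/th.bed', bed='targets.bed')

# Or derive the whole-genome BED from a FASTA index first.
beds = splitRegions(8, '/tmp/work/th.bed', fai='GRCh38.fa.fai')
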
Example #3
def splitRegions(input_parameters):

    fai = input_parameters['genome_reference'] + '.fai'

    tempdir = os.path.join(TMPDIR, uuid.uuid4().hex)
    os.makedirs(tempdir, exist_ok=True)
    bed = split_bed.fai2bed(
        fai, os.path.join(input_parameters['output_directory'], 'genome.bed'))
    writtenBeds = split_bed.split(bed, os.path.join(tempdir, 'th.bed'),
                                  input_parameters['threads'])

    return writtenBeds
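
This variant reads everything from a parameter dictionary and relies on a module-level TMPDIR constant and the split_bed module; a minimal sketch with hypothetical values:

input_parameters = {
    'genome_reference': 'GRCh38.fa',   # GRCh38.fa.fai must sit next to it
    'output_directory': '/tmp/results',
    'threads': 8,
}
per_thread_beds = splitRegions(input_parameters)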

Example #4
##########################################################

if __name__ == '__main__':
    
    workflowArguments = run()
    
    if workflowArguments['inclusion_region']:
        bed_file = workflowArguments['inclusion_region']
        
    else:
        bed_file = workflowArguments['output_directory'] + os.sep + 'genome.bed'
        split_bed.fai2bed(workflowArguments['genome_reference'] + '.fai', bed_file)
    
    split_bed.split(bed_file, workflowArguments['output_directory'] + os.sep + 'bed', workflowArguments['threads'])

    os.makedirs(workflowArguments['output_directory'] + os.sep + 'logs', exist_ok=True)
    
    for thread_i in range(1, workflowArguments['threads']+1):
        
        if workflowArguments['threads'] > 1:
            
            perThreadParameter = copy(workflowArguments)
            
            # Add OUTDIR/thread_i for each thread
            perThreadParameter['output_directory'] = workflowArguments['output_directory'] + os.sep + str(thread_i)
            perThreadParameter['inclusion_region'] = '{}/{}.bed'.format(perThreadParameter['output_directory'], thread_i)
            
            os.makedirs(perThreadParameter['output_directory'] + os.sep + 'logs', exist_ok=True)
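
The truncated loop above boils down to the following per-thread bookkeeping; a self-contained sketch of the same pattern, with a hypothetical workflowArguments dictionary:

import os
from copy import copy

workflowArguments = {'output_directory': '/tmp/results', 'threads': 4}

for thread_i in range(1, workflowArguments['threads'] + 1):
    perThreadParameter = copy(workflowArguments)
    # Each thread works in OUTDIR/<i>/ on <i>.bed, with its own logs/.
    perThreadParameter['output_directory'] = os.path.join(
        workflowArguments['output_directory'], str(thread_i))
    perThreadParameter['inclusion_region'] = os.path.join(
        perThreadParameter['output_directory'], '{}.bed'.format(thread_i))
    os.makedirs(os.path.join(perThreadParameter['output_directory'], 'logs'),
                exist_ok=True)
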
Example #5
def make_workflow(args, workflowArguments):

    logger.info(
        'Create SomaticSeq Workflow Scripts: ' +
        ', '.join(['{}={}'.format(i,
                                  vars(args)[i]) for i in vars(args)]))

    ts = re.sub(r'[:-]', '.',
                datetime.now().isoformat(sep='.', timespec='milliseconds'))
    workflow_tasks = {
        'caller_jobs': [],
        'somaticseq_jobs': [],
        'merging_jobs': []
    }

    ################# TUMOR-NORMAL RUNS #################
    if workflowArguments['which'] == 'paired':

        if workflowArguments['inclusion_region']:
            bed_file = workflowArguments['inclusion_region']

        else:
            split_bed.fai2bed(
                workflowArguments['genome_reference'] + '.fai',
                workflowArguments['output_directory'] + os.sep + 'genome.bed')
            bed_file = workflowArguments[
                'output_directory'] + os.sep + 'genome.bed'

        split_bed.split(bed_file,
                        workflowArguments['output_directory'] + os.sep + 'bed',
                        workflowArguments['threads'])

        os.makedirs(workflowArguments['output_directory'] + os.sep + 'logs',
                    exist_ok=True)

        # Unparallelizables
        if workflowArguments['run_jointsnvmix2']:
            import utilities.dockered_pipelines.somatic_mutations.JointSNVMix2 as JointSNVMix2
            input_arguments = copy(workflowArguments)
            input_arguments['script'] = 'jsm2.{}.cmd'.format(ts)
            jointsnvmix2_job = JointSNVMix2.tumor_normal(
                input_arguments, args.container_tech)
            workflow_tasks['caller_jobs'].append(jointsnvmix2_job)

        if workflowArguments['run_somaticsniper']:
            import utilities.dockered_pipelines.somatic_mutations.SomaticSniper as SomaticSniper
            input_arguments = copy(workflowArguments)
            input_arguments['script'] = 'somaticsniper.{}.cmd'.format(ts)
            somaticsniper_job = SomaticSniper.tumor_normal(
                input_arguments, args.container_tech)
            workflow_tasks['caller_jobs'].append(somaticsniper_job)

        # Parallelizables
        to_create_merging_script = True
        for thread_i in range(1, workflowArguments['threads'] + 1):

            if workflowArguments['threads'] > 1:

                perThreadParameter = copy(workflowArguments)

                # Add OUTDIR/thread_i for each thread
                perThreadParameter['output_directory'] = workflowArguments[
                    'output_directory'] + os.sep + str(thread_i)
                perThreadParameter['inclusion_region'] = '{}/{}.bed'.format(
                    perThreadParameter['output_directory'], str(thread_i))

                os.makedirs(perThreadParameter['output_directory'] + os.sep +
                            'logs',
                            exist_ok=True)

                # Move 1.bed, 2.bed, ..., n.bed to each thread's subdirectory
                move(
                    '{}/{}.bed'.format(workflowArguments['output_directory'],
                                       thread_i),
                    '{}/{}.bed'.format(perThreadParameter['output_directory'],
                                       thread_i))

                # Results combiner
                if to_create_merging_script:
                    input_arguments = copy(workflowArguments)
                    input_arguments['script'] = 'mergeResults.{}.cmd'.format(
                        ts)
                    merging_job = tumor_normal.merge_results(
                        input_arguments, args.container_tech)
                    workflow_tasks['merging_jobs'].append(merging_job)
                    to_create_merging_script = False

            else:
                perThreadParameter = copy(workflowArguments)
                perThreadParameter['inclusion_region'] = bed_file

            # Invoke parallelizable callers one by one:
            if workflowArguments['run_mutect2']:
                import utilities.dockered_pipelines.somatic_mutations.MuTect2 as MuTect2
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'mutect2.{}.cmd'.format(ts)
                mutect2_job = MuTect2.tumor_normal(input_arguments,
                                                   args.container_tech)
                workflow_tasks['caller_jobs'].append(mutect2_job)

            if workflowArguments['run_scalpel']:
                import utilities.dockered_pipelines.somatic_mutations.Scalpel as Scalpel
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'scalpel.{}.cmd'.format(ts)
                scalpel_job = Scalpel.tumor_normal(input_arguments,
                                                   args.container_tech)
                workflow_tasks['caller_jobs'].append(scalpel_job)

            if workflowArguments['run_vardict']:
                import utilities.dockered_pipelines.somatic_mutations.VarDict as VarDict
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'vardict.{}.cmd'.format(ts)
                vardict_job = VarDict.tumor_normal(input_arguments,
                                                   args.container_tech)
                workflow_tasks['caller_jobs'].append(vardict_job)

            if workflowArguments['run_varscan2']:
                import utilities.dockered_pipelines.somatic_mutations.VarScan2 as VarScan2
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'varscan2.{}.cmd'.format(ts)
                varscan2_job = VarScan2.tumor_normal(input_arguments,
                                                     args.container_tech)
                workflow_tasks['caller_jobs'].append(varscan2_job)

            if workflowArguments['run_lofreq']:
                import utilities.dockered_pipelines.somatic_mutations.LoFreq as LoFreq

                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'lofreq.{}.cmd'.format(ts)

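                # LoFreq needs a bgzipped, tabix-indexed dbSNP VCF; if a plain
                # .vcf is given, expect the .gz and .tbi to sit next to it.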
                if input_arguments['dbsnp_vcf'].endswith('.vcf.gz'):
                    input_arguments['dbsnp_gz'] = input_arguments['dbsnp_vcf']
                elif input_arguments['dbsnp_vcf'].endswith('.vcf'):
                    input_arguments[
                        'dbsnp_gz'] = input_arguments['dbsnp_vcf'] + '.gz'
                    assert os.path.exists(input_arguments['dbsnp_gz'])
                    assert os.path.exists(input_arguments['dbsnp_gz'] + '.tbi')
                else:
                    raise Exception(
                        'LoFreq has no properly bgzipped dbsnp file.')

                lofreq_job = LoFreq.tumor_normal(input_arguments,
                                                 args.container_tech)
                workflow_tasks['caller_jobs'].append(lofreq_job)

            if workflowArguments['run_muse']:
                import utilities.dockered_pipelines.somatic_mutations.MuSE as MuSE

                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'muse.{}.cmd'.format(ts)

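                # MuSE has the same bgzipped-dbSNP requirement as LoFreq.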
                if input_arguments['dbsnp_vcf'].endswith('.vcf.gz'):
                    input_arguments['dbsnp_gz'] = input_arguments['dbsnp_vcf']
                elif input_arguments['dbsnp_vcf'].endswith('.vcf'):
                    input_arguments[
                        'dbsnp_gz'] = input_arguments['dbsnp_vcf'] + '.gz'
                    assert os.path.exists(input_arguments['dbsnp_gz'])
                    assert os.path.exists(input_arguments['dbsnp_gz'] + '.tbi')
                else:
                    raise Exception(
                        'MuSE has no properly bgzipped dbsnp file.')

                muse_job = MuSE.tumor_normal(input_arguments,
                                             args.container_tech)
                workflow_tasks['caller_jobs'].append(muse_job)

            if workflowArguments['run_strelka2']:
                import utilities.dockered_pipelines.somatic_mutations.Strelka2 as Strelka2
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'strelka.{}.cmd'.format(ts)
                strelka2_job = Strelka2.tumor_normal(input_arguments,
                                                     args.container_tech)
                workflow_tasks['caller_jobs'].append(strelka2_job)

            if workflowArguments['run_somaticseq']:
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'somaticSeq.{}.cmd'.format(ts)
                somaticseq_job = tumor_normal.run_SomaticSeq(
                    input_arguments, args.container_tech)
                workflow_tasks['somaticseq_jobs'].append(somaticseq_job)

    ################# TUMOR-ONLY RUNS #################
    elif workflowArguments['which'] == 'single':

        if workflowArguments['inclusion_region']:
            bed_file = workflowArguments['inclusion_region']

        else:
            split_bed.fai2bed(
                workflowArguments['genome_reference'] + '.fai',
                workflowArguments['output_directory'] + os.sep + 'genome.bed')
            bed_file = workflowArguments[
                'output_directory'] + os.sep + 'genome.bed'

        split_bed.split(bed_file,
                        workflowArguments['output_directory'] + os.sep + 'bed',
                        workflowArguments['threads'])

        os.makedirs(workflowArguments['output_directory'] + os.sep + 'logs',
                    exist_ok=True)

        # Parallelizables
        to_create_merging_script = True
        for thread_i in range(1, workflowArguments['threads'] + 1):

            if workflowArguments['threads'] > 1:

                perThreadParameter = copy(workflowArguments)

                # Add OUTDIR/thread_i for each thread
                perThreadParameter['output_directory'] = workflowArguments[
                    'output_directory'] + os.sep + str(thread_i)
                perThreadParameter['inclusion_region'] = '{}/{}.bed'.format(
                    perThreadParameter['output_directory'], str(thread_i))

                os.makedirs(perThreadParameter['output_directory'] + os.sep +
                            'logs',
                            exist_ok=True)

                # Move 1.bed, 2.bed, ..., n.bed to each thread's subdirectory
                move(
                    '{}/{}.bed'.format(workflowArguments['output_directory'],
                                       thread_i),
                    '{}/{}.bed'.format(perThreadParameter['output_directory'],
                                       thread_i))

                # Results combiner
                if to_create_merging_script:
                    input_arguments = copy(workflowArguments)
                    input_arguments['script'] = 'mergeResults.{}.cmd'.format(
                        ts)
                    merging_job = tumor_only.merge_results(
                        input_arguments, args.container_tech)
                    workflow_tasks['merging_jobs'].append(merging_job)
                    to_create_merging_script = False

            else:
                perThreadParameter = copy(workflowArguments)
                perThreadParameter['inclusion_region'] = bed_file

            # Invoke parallelizable callers one by one:
            if workflowArguments['run_mutect2']:
                import utilities.dockered_pipelines.somatic_mutations.MuTect2 as MuTect2
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'mutect2.{}.cmd'.format(ts)
                mutect2_job = MuTect2.tumor_only(input_arguments,
                                                 args.container_tech)
                workflow_tasks['caller_jobs'].append(mutect2_job)

            if workflowArguments['run_scalpel']:
                import utilities.dockered_pipelines.somatic_mutations.Scalpel as Scalpel
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'scalpel.{}.cmd'.format(ts)
                scalpel_job = Scalpel.tumor_only(input_arguments,
                                                 args.container_tech)
                workflow_tasks['caller_jobs'].append(scalpel_job)

            if workflowArguments['run_vardict']:
                import utilities.dockered_pipelines.somatic_mutations.VarDict as VarDict
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'vardict.{}.cmd'.format(ts)
                vardict_job = VarDict.tumor_only(input_arguments,
                                                 args.container_tech)
                workflow_tasks['caller_jobs'].append(vardict_job)

            if workflowArguments['run_varscan2']:
                import utilities.dockered_pipelines.somatic_mutations.VarScan2 as VarScan2
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'varscan2.{}.cmd'.format(ts)
                varscan2_job = VarScan2.tumor_only(input_arguments,
                                                   args.container_tech)
                workflow_tasks['caller_jobs'].append(varscan2_job)

            if workflowArguments['run_lofreq']:
                import utilities.dockered_pipelines.somatic_mutations.LoFreq as LoFreq

                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'lofreq.{}.cmd'.format(ts)

                if input_arguments['dbsnp_vcf'].endswith('.vcf.gz'):
                    input_arguments['dbsnp_gz'] = input_arguments['dbsnp_vcf']
                elif input_arguments['dbsnp_vcf'].endswith('.vcf'):
                    input_arguments[
                        'dbsnp_gz'] = input_arguments['dbsnp_vcf'] + '.gz'
                    assert os.path.exists(input_arguments['dbsnp_gz'])
                    assert os.path.exists(input_arguments['dbsnp_gz'] + '.tbi')
                else:
                    raise Exception(
                        'LoFreq has no properly bgzipped dbsnp file.')

                lofreq_job = LoFreq.tumor_only(input_arguments,
                                               args.container_tech)
                workflow_tasks['caller_jobs'].append(lofreq_job)

            if workflowArguments['run_strelka2']:
                import utilities.dockered_pipelines.somatic_mutations.Strelka2 as Strelka2
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'strelka.{}.cmd'.format(ts)
                strelka2_job = Strelka2.tumor_only(input_arguments,
                                                   args.container_tech)
                workflow_tasks['caller_jobs'].append(strelka2_job)

            if workflowArguments['run_somaticseq']:
                input_arguments = copy(perThreadParameter)
                input_arguments['script'] = 'somaticSeq.{}.cmd'.format(ts)
                somaticseq_job = tumor_only.run_SomaticSeq(
                    input_arguments, args.container_tech)
                workflow_tasks['somaticseq_jobs'].append(somaticseq_job)

    ##### Log the scripts created #####
    for script_type in workflow_tasks:

        line_i = '{} {} scripts created: '.format(
            len(workflow_tasks[script_type]), script_type)
        logger.info(line_i)

        for i, script_i in enumerate(workflow_tasks[script_type], start=1):
            logger.info('{}) {}'.format(i, script_i))

    ########## Execute the workflow ##########
    if args.run_workflow_locally:
        import utilities.dockered_pipelines.run_workflows as run_workflows
        run_workflows.run_workflows(
            (workflow_tasks['caller_jobs'], workflow_tasks['somaticseq_jobs'],
             workflow_tasks['merging_jobs']), args.threads)
        logger.info(
            'SomaticSeq Workflow Done. Check your results. You may remove the {} sub-directories.'
            .format(args.threads))

    return workflow_tasks
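
A hedged invocation sketch for make_workflow. The attribute names on args and the dictionary keys mirror the usage visible above; the concrete values are hypothetical:

import argparse

args = argparse.Namespace(container_tech='docker', threads=4,
                          run_workflow_locally=False)

workflowArguments = {
    'which': 'paired',                # or 'single' for tumor-only runs
    'genome_reference': 'GRCh38.fa',  # GRCh38.fa.fai must exist
    'output_directory': '/tmp/results',
    'inclusion_region': None,         # None -> derive genome.bed from the .fai
    'threads': args.threads,
    'dbsnp_vcf': 'dbsnp.vcf.gz',
    # Caller switches: run only MuTect2 plus the SomaticSeq consensus step.
    'run_mutect2': True, 'run_somaticseq': True,
    'run_jointsnvmix2': False, 'run_somaticsniper': False,
    'run_scalpel': False, 'run_vardict': False, 'run_varscan2': False,
    'run_lofreq': False, 'run_muse': False, 'run_strelka2': False,
}

workflow_tasks = make_workflow(args, workflowArguments)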