Пример #1
0
def sambamba(inbams,
             outbam,
             tech='docker',
             input_parameters=None,
             remove_inbams=False):
    """Write a job script that merges BAM files with ``sambamba merge`` and submit it.

    The script is written to ``<output_directory>/logs/<script>`` and then
    passed to the command named by ``input_parameters['action']``.

    Args:
        inbams: Iterable of input BAM paths (host paths).
        outbam: Path of the merged BAM to create (host path).
        tech: Container technology name forwarded to
            ``container.container_params``.
        input_parameters: Optional dict of settings. Missing keys are filled
            in from the module-level ``DEFAULT_PARAMS`` (the dict passed in is
            updated in place). Keys read here: ``output_directory``,
            ``script``, ``sambamba_image``, ``extra_docker_options``, ``MEM``,
            ``threads`` and ``action``.
        remove_inbams: When true, the script deletes the input BAMs after the
            merge command.

    Returns:
        The path of the generated script.
    """
    # A fresh dict per call: a ``{}`` default would be shared between calls
    # and polluted by the defaults written into it below.
    if input_parameters is None:
        input_parameters = {}

    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    # Materialize once so a one-shot iterable can be traversed several times.
    inbams = list(inbams)

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])
    os.makedirs(logdir, exist_ok=True)

    merge_line, fileDict = container.container_params(
        input_parameters['sambamba_image'],
        tech=tech,
        files=inbams + [outbam],
        extra_args=input_parameters['extra_docker_options'])

    # Paths as seen from inside the container.
    mounted_outbam = fileDict[outbam]['mount_path']
    infile_string = ' '.join(
        fileDict[file_i]['mount_path'] for file_i in inbams)

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n'
        )  # Do not change this: picard_fractional uses this to end the copying.

        out.write(f'{merge_line} \\\n')
        out.write('sambamba merge -t {} {} {}\n\n'.format(
            input_parameters['threads'], mounted_outbam, infile_string))

        if remove_inbams:
            # Removal runs on the host, hence the un-mounted paths.
            out.write('rm {}\n\n'.format(' '.join(inbams)))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #2
0
def run_SomaticSeq(input_parameters, tech='docker'):
    """Write a job script that runs SomaticSeq in single-sample mode and submit it.

    The script is written to
    ``<output_directory>/<somaticseq_directory>/logs/<script>`` and then passed
    to the command named by ``input_parameters['action']``.

    Args:
        input_parameters: Dict of settings; missing optional keys are filled in
            place from the local ``DEFAULT_PARAMS``. Keys without a default
            that are read here: ``bam``, ``genome_reference``, ``script``,
            ``threads``, ``extra_docker_options`` and the ``run_<caller>``
            switches (``run_mutect2``, ``run_varscan2``, ``run_vardict``,
            ``run_lofreq``, ``run_scalpel``, ``run_strelka2``).
        tech: Container technology name forwarded to
            ``container.container_params``.

    Returns:
        The path of the generated script.
    """
    DEFAULT_PARAMS = {'MEM': '4G', 'inclusion_region': None, 'exclusion_region': None, 'output_directory' : os.curdir, 'somaticseq_directory': 'SomaticSeq', 'action': 'echo', 'dbsnp' : None, 'cosmic': None, 'snv_classifier': None, 'indel_classifier': None, 'truth_snv': None, 'truth_indel': None, 'somaticseq_arguments': '', 'train_somaticseq': False, 'somaticseq_algorithm': 'xgboost'}

    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    # Every path that has to be visible inside the container; unset ones are skipped.
    path_keys = ('bam', 'genome_reference', 'output_directory',
                 'inclusion_region', 'exclusion_region', 'dbsnp', 'cosmic',
                 'snv_classifier', 'indel_classifier', 'truth_snv',
                 'truth_indel')
    candidate_paths = [input_parameters[key_i] for key_i in path_keys]
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params( f'lethalfang/somaticseq:{VERSION}', tech=tech, files=all_paths, extra_args=input_parameters['extra_docker_options'] )

    def mounted(key):
        """Return the in-container path of the file stored under *key*."""
        return fileDict[ input_parameters[key] ]['mount_path']

    # Mounted paths for the required input files and the output directory:
    mounted_genome_reference = mounted('genome_reference')
    mounted_tumor_bam        = mounted('bam')
    mounted_outdir           = mounted('output_directory')

    outdir  = os.path.join(input_parameters['output_directory'], input_parameters['somaticseq_directory'])
    logdir  = os.path.join(outdir, 'logs')
    outfile = os.path.join(logdir, input_parameters['script'] )

    # (switch in input_parameters, SomaticSeq option, caller output under the mounted output directory)
    caller_vcfs = (
        ('run_mutect2',  '--mutect2-vcf', 'MuTect2.vcf'),
        ('run_varscan2', '--varscan-vcf', 'VarScan2.vcf'),
        ('run_vardict',  '--vardict-vcf', 'VarDict.vcf'),
        ('run_lofreq',   '--lofreq-vcf',  'LoFreq.vcf'),
        ('run_scalpel',  '--scalpel-vcf', 'Scalpel.vcf'),
        ('run_strelka2', '--strelka-vcf', 'Strelka/results/variants/variants.vcf.gz'),
    )

    os.makedirs(logdir, exist_ok=True)
    with open(outfile, 'w') as out:

        out.write( "#!/bin/bash\n\n" )

        out.write(f'#$ -o {logdir}\n' )
        out.write(f'#$ -e {logdir}\n' )
        out.write( '#$ -S /bin/bash\n' )
        out.write( '#$ -l h_vmem={}\n'.format( input_parameters['MEM'] ) )
        out.write( 'set -e\n\n' )

        out.write( 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n' )

        out.write(f'{container_line} \\\n' )
        out.write( '/opt/somaticseq/somaticseq/run_somaticseq.py \\\n' )

        # Training is only requested here for single-threaded runs.
        if input_parameters['train_somaticseq'] and input_parameters['threads'] == 1:
            out.write( '--somaticseq-train --algorithm {} \\\n'.format(input_parameters['somaticseq_algorithm']) )

        out.write( '--output-directory {} \\\n'.format( os.path.join(mounted_outdir, input_parameters['somaticseq_directory']) ) )
        out.write( '--genome-reference {} \\\n'.format(mounted_genome_reference) )

        if input_parameters['inclusion_region']:
            out.write( '--inclusion-region {} \\\n'.format( mounted('inclusion_region') ) )

        if input_parameters['exclusion_region']:
            # Must be the mounted path: the host path does not exist inside the container.
            out.write( '--exclusion-region {} \\\n'.format( mounted('exclusion_region') ) )

        if input_parameters['cosmic']:
            out.write( '--cosmic-vcf {} \\\n'.format( mounted('cosmic') ) )

        if input_parameters['dbsnp']:
            # Use the mounted dbSNP path; there is no 'dbsnp_vcf' parameter.
            out.write( '--dbsnp-vcf {} \\\n'.format( mounted('dbsnp') ) )

        if input_parameters['snv_classifier'] or input_parameters['indel_classifier']:
            out.write( '--algorithm {} \\\n'.format(input_parameters['somaticseq_algorithm']) )

            if input_parameters['snv_classifier']:
                out.write( '--classifier-snv {} \\\n'.format( mounted('snv_classifier') ) )

            if input_parameters['indel_classifier']:
                out.write( '--classifier-indel {} \\\n'.format( mounted('indel_classifier') ) )

        if input_parameters['truth_snv']:
            out.write( '--truth-snv {} \\\n'.format( mounted('truth_snv') ) )

        if input_parameters['truth_indel']:
            out.write( '--truth-indel {} \\\n'.format( mounted('truth_indel') ) )

        # NOTE(review): --algorithm may already have been written above, so the
        # flag can appear up to three times with the same value. Presumably
        # harmless to the option parser -- confirm before de-duplicating.
        if input_parameters['somaticseq_algorithm']:
            out.write( '--algorithm {} \\\n'.format(input_parameters['somaticseq_algorithm']) )

        if input_parameters['somaticseq_arguments']:
            out.write( '{} \\\n'.format(input_parameters['somaticseq_arguments']) )

        out.write( 'single \\\n' )
        out.write( '--bam-file  {} \\\n'.format(mounted_tumor_bam) )

        for switch_i, option_i, vcf_i in caller_vcfs:
            if input_parameters[switch_i]:
                out.write( '{} {}/{} \\\n'.format(option_i, mounted_outdir, vcf_i) )

        # The leading newline terminates the backslash-continued command above.
        out.write( '\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n' )

    # "Run" the script that was generated
    command_line = '{} {}'.format( input_parameters['action'], outfile )
    subprocess.call( command_line, shell=True )

    return outfile
Пример #3
0
def _write_concat_command(out, container_line, infiles, outfile, bgzip=True):
    """Write one containerized ``concat.py`` call merging *infiles* into *outfile*."""
    mode = '--bgzip-output ' if bgzip else ''
    out.write(f'{container_line} \\\n' )
    out.write(f'concat.py {mode}-infiles \\\n' )

    for infile_i in infiles:
        out.write( infile_i + ' ' )

    out.write( '\\\n' )
    out.write(f'-outfile {outfile}\n\n' )


def merge_results(input_parameters, tech='docker'):
    """Write a job script that concatenates per-thread results and submit it.

    Per-thread results are expected under ``<output_directory>/<i>/`` for
    ``i`` in ``1..threads``; merged files are written directly into
    ``<output_directory>``. The script is written to
    ``<output_directory>/logs/<script>`` and then passed to the command named
    by ``input_parameters['action']``.

    Args:
        input_parameters: Dict of settings; missing optional keys are filled in
            place from the local ``DEFAULT_PARAMS``. Keys without a default
            that are read here: ``genome_reference``, ``extra_docker_options``,
            ``threads`` (only when something is merged), ``run_somaticseq`` and
            the ``run_<caller>`` switches.
        tech: Container technology name forwarded to
            ``container.container_params``.

    Returns:
        The path of the generated script.
    """
    DEFAULT_PARAMS = {'MEM': '4G', 'output_directory': os.curdir, 'somaticseq_directory': 'SomaticSeq', 'action': 'echo', 'script': 'mergeResults.{}.cmd'.format(ts), 'snv_classifier': None, 'indel_classifier': None, 'truth_snv': None, 'truth_indel': None, 'somaticseq_arguments': '', 'train_somaticseq': False, 'somaticseq_algorithm': 'xgboost'}

    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    path_keys = ('genome_reference', 'output_directory', 'snv_classifier',
                 'indel_classifier', 'truth_snv', 'truth_indel')
    candidate_paths = [input_parameters[key_i] for key_i in path_keys]
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params( f'lethalfang/somaticseq:{VERSION}', tech=tech, files=all_paths, extra_args=input_parameters['extra_docker_options'] )

    # Mounted path of the output directory:
    mounted_outdir = fileDict[ input_parameters['output_directory'] ]['mount_path']

    prjdir  = input_parameters['output_directory']
    logdir  = os.path.join(prjdir, 'logs')
    outfile = os.path.join(logdir, input_parameters['script'] )

    somaticdir = input_parameters['somaticseq_directory']

    # (switch in input_parameters, per-thread file, merged file name)
    caller_outputs = (
        ('run_mutect2',  'MuTect2.vcf',  'MuTect2.vcf'),
        ('run_varscan2', 'VarScan2.vcf', 'VarScan2.vcf'),
        ('run_vardict',  'VarDict.vcf',  'VarDict.vcf'),
        ('run_lofreq',   'LoFreq.vcf',   'LoFreq.vcf'),
        ('run_scalpel',  'Scalpel.vcf',  'Scalpel.vcf'),
        ('run_strelka2', 'Strelka/results/variants/variants.vcf.gz', 'Strelka.vcf'),
    )

    def per_thread(relative_path):
        """List *relative_path* inside each per-thread directory (1..threads)."""
        # 'threads' is read lazily so it is only required when something is merged.
        return [ f'{mounted_outdir}/{i}/{relative_path}' for i in range(1, input_parameters['threads']+1) ]

    def merge(per_thread_name, merged_name, bgzip=True):
        """Concatenate one per-thread file into the top-level output directory."""
        _write_concat_command(out, container_line, per_thread(per_thread_name), f'{mounted_outdir}/{merged_name}', bgzip=bgzip)

    os.makedirs(logdir, exist_ok=True)
    with open(outfile, 'w') as out:

        out.write( "#!/bin/bash\n\n" )

        out.write(f'#$ -o {logdir}\n' )
        out.write(f'#$ -e {logdir}\n' )
        out.write( '#$ -S /bin/bash\n' )
        out.write( '#$ -l h_vmem={}\n'.format( input_parameters['MEM'] ) )
        out.write( 'set -e\n\n' )

        out.write( 'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n' )

        ###### Individual callers #####
        for switch_i, thread_file_i, merged_file_i in caller_outputs:
            if input_parameters[switch_i]:
                merge(thread_file_i, merged_file_i)

        ###### SomaticSeq #####
        if input_parameters['run_somaticseq']:

            # Ensemble.sSNV.tsv and Ensemble.sINDEL.tsv (plain text, not bgzipped)
            for variant_i in ('sSNV', 'sINDEL'):
                merge(f'{somaticdir}/Ensemble.{variant_i}.tsv', f'Ensemble.{variant_i}.tsv', bgzip=False)

            # If asked to create classifier, do it here when TSV files are combined
            for variant_i, truth_key_i in ( ('sSNV', 'truth_snv'), ('sINDEL', 'truth_indel') ):
                if input_parameters['train_somaticseq'] and input_parameters[truth_key_i]:
                    out.write(f'{container_line} \\\n' )
                    if input_parameters['somaticseq_algorithm'] == 'ada':
                        out.write( 'ada_model_builder_ntChange.R {}/Ensemble.{}.tsv\n\n'.format(mounted_outdir, variant_i) )
                    else:
                        out.write( 'somatic_xgboost.py train -threads {} -tsvs {}/Ensemble.{}.tsv\n\n'.format(input_parameters['threads'], mounted_outdir, variant_i) )

            # Prediction mode merges SSeq.Classified.* (vcf and tsv);
            # consensus mode merges Consensus.*.vcf instead.
            for variant_i, classifier_key_i in ( ('sSNV', 'snv_classifier'), ('sINDEL', 'indel_classifier') ):
                if input_parameters[classifier_key_i]:
                    # NOTE(review): the classified .tsv is also merged with
                    # --bgzip-output, as in the original -- confirm intended.
                    for extension_i in ('vcf', 'tsv'):
                        classified_i = f'SSeq.Classified.{variant_i}.{extension_i}'
                        merge(f'{somaticdir}/{classified_i}', classified_i)
                else:
                    consensus_i = f'Consensus.{variant_i}.vcf'
                    merge(f'{somaticdir}/{consensus_i}', consensus_i)

        out.write( '\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n' )

    # "Run" the script that was generated
    command_line = '{} {}'.format( input_parameters['action'], outfile )
    subprocess.call( command_line, shell=True )

    return outfile
Пример #4
0
def bwa(input_parameters, tech='docker'):
    """Write a job script that aligns FASTQ reads with ``bwa mem`` and submit it.

    The generated script pipes ``bwa mem`` into ``samtools view`` and
    ``samtools sort`` and then indexes the sorted BAM. It is written to
    ``<output_directory>/logs/<script>`` and then passed to the command named
    by ``input_parameters['action']``.

    Args:
        input_parameters: Dict of settings; missing keys are filled in place
            from the module-level ``DEFAULT_PARAMS``. Keys read here:
            ``output_directory``, ``script``, ``genome_reference``,
            ``in_fastq1``, ``in_fastq2`` (falsy for single-end), ``bwa_image``,
            ``extra_docker_options``, ``MEM``, ``threads``, ``bam_header``,
            ``extra_bwa_arguments``, ``out_bam`` and ``action``.
            ``MEM`` and ``threads`` are used arithmetically, so they must be
            numbers.
        tech: Container technology name forwarded to
            ``container.container_params``.

    Returns:
        The path of the generated script.
    """
    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    # Decided after the defaults are applied so a missing key can fall back to them.
    paired_end = bool(input_parameters['in_fastq2'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])
    os.makedirs(logdir, exist_ok=True)

    candidate_paths = (input_parameters['output_directory'],
                       input_parameters['genome_reference'],
                       input_parameters['in_fastq1'],
                       input_parameters['in_fastq2'])
    all_paths = [path_i for path_i in candidate_paths if path_i]

    bwa_line, fileDict = container.container_params(
        input_parameters['bwa_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_fq1 = fileDict[input_parameters['in_fastq1']]['mount_path']

    # The second FASTQ is only mounted (and thus only present in fileDict)
    # for paired-end input.
    mounted_fq2 = None
    if paired_end:
        mounted_fq2 = fileDict[input_parameters['in_fastq2']]['mount_path']

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        # Requested memory is the product of MEM and the thread count.
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM'] *
                                              input_parameters['threads']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # The whole pipeline runs inside one container via `bash -c "..."`.
        out.write(f'{bwa_line} bash -c \\\n')
        out.write('"bwa mem \\\n')
        out.write('-R \'{}\' \\\n'.format(input_parameters['bam_header']))
        out.write('-M {} -t {} \\\n'.format(
            input_parameters['extra_bwa_arguments'],
            input_parameters['threads']))
        out.write('{} \\\n'.format(mounted_reference))
        out.write('{} \\\n'.format(mounted_fq1))

        if paired_end:
            out.write('{} \\\n'.format(mounted_fq2))

        out.write('| samtools view -Sbh - \\\n')
        # samtools sort gets half of MEM and half of the threads, rounded up.
        out.write(
            '| samtools sort -m {MEM}G --threads {THREADS} -o {DIR}/{OUTFILE}"\n\n'
            .format(MEM=math.ceil(input_parameters['MEM'] / 2),
                    THREADS=math.ceil(input_parameters['threads'] / 2),
                    DIR=mounted_outdir,
                    OUTFILE=input_parameters['out_bam']))

        out.write(f'{bwa_line} \\\n')
        out.write('samtools index -@{} {}\n'.format(
            input_parameters['threads'],
            os.path.join(mounted_outdir, input_parameters['out_bam'])))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #5
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a job script that runs the Strelka2 somatic workflow and submit it.

    The script optionally bgzips and tabix-indexes the inclusion BED file,
    configures the Strelka somatic workflow and then runs it locally. It is
    written to ``<output_directory>/logs/<script>`` and then passed to the
    command named by ``input_parameters['action']``.

    Args:
        input_parameters: Dict of settings; missing keys are filled in place
            from the module-level ``DEFAULT_PARAMS``. Keys read here:
            ``normal_bam``, ``tumor_bam``, ``genome_reference``,
            ``output_directory``, ``inclusion_region``, ``script``,
            ``strelka2_image``, ``extra_docker_options``, ``MEM`` (gigabytes,
            optionally suffixed with ``G``), ``exome``,
            ``strelka_config_arguments``, ``outdir_name``,
            ``strelka_run_arguments`` and ``action``.
        tech: Container technology name forwarded to
            ``container.container_params``.

    Returns:
        The path of the generated script.

    Raises:
        AssertionError: If the normal BAM, tumor BAM or genome reference does
            not exist.
    """
    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])
    os.makedirs(logdir, exist_ok=True)

    inclusion_region = input_parameters['inclusion_region']

    candidate_paths = (input_parameters['normal_bam'],
                       input_parameters['tumor_bam'],
                       input_parameters['genome_reference'],
                       input_parameters['output_directory'],
                       inclusion_region)
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params(
        input_parameters['strelka2_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']

    # Name of the compressed BED written into the output directory; stays
    # None when no inclusion region was given, in which case the region
    # restriction is skipped instead of failing on an undefined name.
    bed_gz = None
    if inclusion_region:
        bed_gz = fileDict[inclusion_region]['filename'] + '.gz'

    # MEM is given in gigabytes (e.g. '4G'); parsed numerically rather than
    # with eval(), and converted to whole megabytes.
    mem_gb = float(str(input_parameters['MEM']).rstrip('G'))
    call_mem_mb = int(mem_gb * 1024)

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        if bed_gz:
            # Make .bed.gz out of .bed files using tabix:
            tabix_line, tabixDict = container.container_params(
                'lethalfang/tabix:1.7', tech, all_paths)
            tabix_selector = tabixDict[inclusion_region]['mount_path']
            tabix_outdir = tabixDict[
                input_parameters['output_directory']]['mount_path']

            out.write(
                '{DOCKER_LINE} bash -c "cat {SELECTOR} | bgzip > {OUTDIR}/{BEDGZ}"\n'
                .format(DOCKER_LINE=tabix_line,
                        SELECTOR=tabix_selector,
                        OUTDIR=tabix_outdir,
                        BEDGZ=bed_gz))
            out.write('{DOCKER_LINE} tabix -f {OUTDIR}/{BEDGZ}\n\n'.format(
                DOCKER_LINE=tabix_line, OUTDIR=tabix_outdir, BEDGZ=bed_gz))

        out.write(f'{container_line} \\\n')
        out.write('/opt/strelka/bin/configureStrelkaSomaticWorkflow.py \\\n')
        out.write('--tumorBam={} \\\n'.format(mounted_tumor_bam))
        out.write('--normalBam={} \\\n'.format(mounted_normal_bam))
        out.write('--referenceFasta={} \\\n'.format(mounted_genome_reference))
        out.write('--callMemMb={} \\\n'.format(call_mem_mb))

        if bed_gz:
            out.write('--callRegions={}/{} \\\n'.format(mounted_outdir, bed_gz))

        if input_parameters['exome']:
            out.write('--exome \\\n')

        if input_parameters['strelka_config_arguments']:
            out.write('{} \\\n'.format(
                input_parameters['strelka_config_arguments']))

        out.write('--runDir={}/{}\n\n'.format(mounted_outdir,
                                              input_parameters['outdir_name']))

        # The configure step above creates runWorkflow.py in the run directory.
        out.write(f'{container_line} \\\n')
        out.write('{}/{}/runWorkflow.py -m local -j 1 {}\n'.format(
            mounted_outdir, input_parameters['outdir_name'],
            input_parameters['strelka_run_arguments']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #6
0
def alienTrimmer(input_parameters, tech='docker'):
    """Write a job script that trims FASTQ reads with AlienTrimmer and submit it.

    Gzipped inputs are first decompressed into the output directory, the
    reads are trimmed, the trimmed files are bgzipped under their final names,
    and the intermediate files are removed. The script is written to
    ``<output_directory>/logs/<script>`` and then passed to the command named
    by ``input_parameters['action']``.

    Args:
        input_parameters: Dict of settings; missing keys are filled in place
            from the module-level ``DEFAULT_PARAMS``. Keys read here:
            ``output_directory``, ``script``, ``in_fastq1``, ``in_fastq2``
            (falsy for single-end), ``alienTrimmerImage``,
            ``extra_docker_options``, ``MEM``, ``threads``, ``adapter``,
            ``minimum_length``, ``out_fastq1_name``, ``out_fastq2_name``,
            ``out_singleton_name``, ``remove_untrimmed`` and ``action``.
        tech: Container technology name forwarded to
            ``container.container_params``.

    Returns:
        The path of the generated script.
    """
    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    # Decided after the defaults are applied so a missing key can fall back to them.
    paired_end = bool(input_parameters['in_fastq2'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])
    os.makedirs(logdir, exist_ok=True)

    candidate_paths = (input_parameters['output_directory'],
                       input_parameters['in_fastq1'],
                       input_parameters['in_fastq2'])
    all_paths = [path_i for path_i in candidate_paths if path_i]

    trim_line, fileDict = container.container_params(
        input_parameters['alienTrimmerImage'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']

    # The tabix image provides gunzip/bgzip. It is needed for the final
    # compression step even when the inputs are not gzipped, so it is set up
    # unconditionally.
    tabix_line, tabixDict = container.container_params(
        'lethalfang/tabix:1.7', tech, all_paths)
    tabix_outdir = tabixDict[
        input_parameters['output_directory']]['mount_path']

    # File names (relative to the output directory) to delete at the end.
    temporary_files = []
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # AlienTrimmer does not do bgzipped fastq files, unfortunately:
        if input_parameters['in_fastq1'].endswith('.gz'):

            out_fastq_1 = uuid.uuid4().hex + '.fastq'
            tabix_fq1 = tabixDict[input_parameters['in_fastq1']]['mount_path']

            out.write(f'{tabix_line} bash -c \\\n')
            out.write('"gunzip -c {} > {}/{}"\n\n'.format(
                tabix_fq1, tabix_outdir, out_fastq_1))
            mounted_fq1 = os.path.join(mounted_outdir, out_fastq_1)

            temporary_files.append(out_fastq_1)

            if paired_end:
                out_fastq_2 = uuid.uuid4().hex + '.fastq'
                tabix_fq2 = tabixDict[
                    input_parameters['in_fastq2']]['mount_path']

                out.write(f'{tabix_line} bash -c \\\n')
                out.write('"gunzip -c {} > {}/{}"\n\n'.format(
                    tabix_fq2, tabix_outdir, out_fastq_2))
                mounted_fq2 = os.path.join(mounted_outdir, out_fastq_2)

                temporary_files.append(out_fastq_2)

        else:
            mounted_fq1 = fileDict[input_parameters['in_fastq1']]['mount_path']

            if paired_end:
                mounted_fq2 = fileDict[
                    input_parameters['in_fastq2']]['mount_path']

        out.write(f'{trim_line} \\\n')
        out.write('/opt/AlienTrimmer_0.4.0/src/AlienTrimmer \\\n')

        if paired_end:
            trimmed_fq1 = uuid.uuid4().hex + '.fastq'
            trimmed_fq2 = uuid.uuid4().hex + '.fastq'
            singleton = uuid.uuid4().hex + '.fastq'

            out.write('-if {} -ir {} \\\n'.format(mounted_fq1, mounted_fq2))
            out.write('-of {}/{} -or {}/{} \\\n'.format(
                mounted_outdir, trimmed_fq1, mounted_outdir, trimmed_fq2))
            out.write('-os {}/{} \\\n'.format(mounted_outdir, singleton))

            temporary_files.extend([trimmed_fq1, trimmed_fq2, singleton])

        else:
            trimmed_fq1 = uuid.uuid4().hex + '.fastq'
            out.write('-i {} \\\n'.format(mounted_fq1))
            out.write('-o {}/{} \\\n'.format(mounted_outdir, trimmed_fq1))

            temporary_files.append(trimmed_fq1)

        out.write('-c {} \\\n'.format(input_parameters['adapter']))
        out.write('-l {}\n\n'.format(input_parameters['minimum_length']))

        # Compress the trimmed reads into their final, user-named files.
        out.write(f'{tabix_line} bash -c \\\n')
        out.write('"cat {}/{} | bgzip -@{} > {}/{}"\n'.format(
            tabix_outdir, trimmed_fq1, input_parameters['threads'],
            tabix_outdir, input_parameters['out_fastq1_name']))

        if paired_end:
            out.write(f'{tabix_line} bash -c \\\n')
            out.write('"cat {}/{} | bgzip -@{} > {}/{}"\n'.format(
                tabix_outdir, trimmed_fq2, input_parameters['threads'],
                tabix_outdir, input_parameters['out_fastq2_name']))

            out.write(f'{tabix_line} bash -c \\\n')
            out.write('"cat {}/{} | bgzip -@{} > {}/{}"\n'.format(
                tabix_outdir, singleton, input_parameters['threads'],
                tabix_outdir, input_parameters['out_singleton_name']))

        # Clean-up runs on the host, hence the un-mounted output directory.
        out.write('\n')
        for file_i in temporary_files:
            out.write('rm {}\n'.format(
                os.path.join(input_parameters['output_directory'], file_i)))

        # Remove untrimmed files:
        if input_parameters['remove_untrimmed']:
            out.write('\n')
            out.write('rm {}\n'.format(input_parameters['in_fastq1']))

            if input_parameters['in_fastq2']:
                out.write('rm {}\n'.format(input_parameters['in_fastq2']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #7
0
def tumor_normal(input_parameters, tech='docker'):
    """Write a tumor-normal VarDict job script and pass it to the configured action.

    Keys missing from ``input_parameters`` are filled in place from the
    module-level ``DEFAULT_PARAMS``. The generated script optionally splits
    the BED file into smaller regions, runs VarDict inside a container, and
    converts the raw output to VCF. Finally ``<action> <script>`` is executed
    through the shell.

    Args:
        input_parameters: dict of settings. Keys read here: 'normal_bam',
            'tumor_bam', 'genome_reference', 'output_directory',
            'inclusion_region', 'vardict_image', 'extra_docker_options',
            'minimum_VAF', 'process_bed', 'vardict_arguments', 'MEM',
            'script', 'outfile' and 'action'.
        tech: container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script (``<output_directory>/logs/<script>``).

    Raises:
        AssertionError: if either BAM file or the genome reference is missing.
    """

    # Fill in whatever the caller did not supply (mutates the caller's dict).
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only paths that are actually set are handed to the container helper.
    candidate_paths = (
        input_parameters['normal_bam'],
        input_parameters['tumor_bam'],
        input_parameters['genome_reference'],
        input_parameters['output_directory'],
        input_parameters['inclusion_region'],
    )
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params(
        input_parameters['vardict_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']

    minVAF = input_parameters['minimum_VAF']

    total_bases = 0
    num_lines = 0

    if input_parameters['inclusion_region']:

        bed_file = input_parameters['inclusion_region']

        # Path of the BED file as seen from inside the VarDict container.
        vardict_bed = fileDict[bed_file]['mount_path']

        with open(bed_file) as bed:
            for line_i in bed:
                line_i = line_i.rstrip()
                # Skip blank lines and BED header lines instead of stopping
                # (or crashing) at the first one.
                if not line_i or line_i.startswith(('track', 'browser', '#')):
                    continue
                item = line_i.split('\t')
                total_bases += int(item[2]) - int(item[1])
                num_lines += 1

    else:

        # No region given: derive a whole-genome BED from the .fai index.
        fai_file = input_parameters['genome_reference'] + '.fai'
        bed_file = os.path.join(input_parameters['output_directory'],
                                'genome.bed')

        # genome.bed lives in the output directory, so inside the VarDict
        # container it sits under the mounted output directory.
        vardict_bed = '{}/genome.bed'.format(mounted_outdir)

        with open(fai_file) as fai, open(bed_file, 'w') as wgs_bed:
            for line_i in fai:
                item = line_i.split('\t')
                total_bases += int(item[1])
                num_lines += 1
                wgs_bed.write('{}\t{}\t{}\n'.format(item[0], '0', item[1]))

    # However the "bed_file" is defined here, create a dockered line and mount dictionary for it:
    bed_split_line, bedDict = container.container_params(
        'lethalfang/somaticseq:{}'.format(VERSION), tech,
        (bed_file, input_parameters['output_directory']))
    mounted_bed = bedDict[bed_file]['mount_path']

    # Guard the average against an empty BED (num_lines == 0).
    regions_too_large = bool(num_lines) and total_bases / num_lines > 50000

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # Decide if Bed file needs to be "split" such that each line has a small enough region
        if input_parameters['process_bed'] or regions_too_large:
            out.write(f'{bed_split_line} \\\n')
            out.write(
                '/opt/somaticseq/somaticseq/utilities/split_mergedBed.py \\\n')
            out.write('-infile {} -outfile {}/split_regions.bed\n\n'.format(
                mounted_bed,
                bedDict[input_parameters['output_directory']]['mount_path']))

            vardict_bed = '{}/split_regions.bed'.format(mounted_outdir)

        out.write(f'{container_line} bash -c \\\n')
        out.write('"/opt/VarDict-1.7.0/bin/VarDict \\\n')

        if input_parameters['vardict_arguments']:
            out.write('{} \\\n'.format(input_parameters['vardict_arguments']))

        out.write('-G {} \\\n'.format(mounted_genome_reference))
        out.write('-f {} -h \\\n'.format(minVAF))
        out.write('-b \'{}|{}\' \\\n'.format(mounted_tumor_bam,
                                             mounted_normal_bam))
        # Always a container-side path: VarDict runs inside the container.
        out.write('-Q 1 -c 1 -S 2 -E 3 -g 4 {} \\\n'.format(vardict_bed))
        out.write('> {}/vardict.var"\n\n'.format(mounted_outdir))

        out.write('\n')

        out.write(f'{container_line} \\\n')
        out.write(
            'bash -c "cat {}/vardict.var | awk \'NR!=1\' | /opt/VarDict/testsomatic.R | /opt/VarDict/var2vcf_paired.pl -N \'TUMOR|NORMAL\' -f {} \\\n'
            .format(mounted_outdir, minVAF))
        out.write('> {}/{}"\n\n'.format(mounted_outdir,
                                        input_parameters['outfile']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #8
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a tumor-normal ``lofreq somatic`` job script and pass it to the action.

    Keys missing from ``input_parameters`` are filled in place from the
    module-level ``DEFAULT_PARAMS``. After the script is written,
    ``<action> <script>`` is executed through the shell.

    Args:
        input_parameters: dict of settings. Keys read here: 'tumor_bam',
            'normal_bam', 'genome_reference', 'dbsnp_gz',
            'output_directory', 'inclusion_region', 'lofreq_image',
            'extra_docker_options', 'out_prefix', 'lofreq_arguments', 'MEM',
            'script' and 'action'.
        tech: container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script (``<output_directory>/logs/<script>``).

    Raises:
        AssertionError: if a BAM, the reference, the dbSNP file or its
            ``.tbi`` index is missing.
    """

    # Fill in whatever the caller did not supply.
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['genome_reference'])
    assert os.path.exists(input_parameters['dbsnp_gz'])
    assert os.path.exists(input_parameters['dbsnp_gz'] + '.tbi')

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    inclusion_region = input_parameters['inclusion_region']

    # Only paths that are actually set are handed to the container helper.
    candidate_paths = (
        input_parameters['tumor_bam'],
        input_parameters['normal_bam'],
        input_parameters['genome_reference'],
        input_parameters['output_directory'],
        inclusion_region,
        input_parameters['dbsnp_gz'],
    )
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params(
        input_parameters['lofreq_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_dbsnp_gz = fileDict[input_parameters['dbsnp_gz']]['mount_path']

    # An unset inclusion region is never added to all_paths, so looking it up
    # unconditionally would raise KeyError; resolve it only when it is given.
    mounted_inclusion = None
    if inclusion_region:
        mounted_inclusion = fileDict[inclusion_region]['mount_path']

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{container_line} \\\n')
        out.write('lofreq somatic \\\n')
        out.write('-t {} \\\n'.format(mounted_tumor_bam))
        out.write('-n {} \\\n'.format(mounted_normal_bam))
        out.write('--call-indels \\\n')

        # Region restriction is only emitted when a region file was supplied.
        if mounted_inclusion:
            out.write('-l {} \\\n'.format(mounted_inclusion))

        out.write('-f {} \\\n'.format(mounted_genome_reference))
        out.write('-o {}/{} \\\n'.format(mounted_outdir,
                                         input_parameters['out_prefix']))

        if input_parameters['lofreq_arguments']:
            out.write('{} \\\n'.format(input_parameters['lofreq_arguments']))

        out.write('-d {}\n'.format(mounted_dbsnp_gz))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #9
0
def tumor_normal(input_parameters, tech='docker'):
    """Write a tumor-normal Scalpel job script and pass it to the configured action.

    Keys missing from ``input_parameters`` are filled in place from the
    module-level ``DEFAULT_PARAMS``. The script runs ``scalpel-discovery``
    followed by ``scalpel-export`` in one container invocation, then sorts
    the VCF with ``vcfsorter.pl``. Finally ``<action> <script>`` is executed
    through the shell.

    Args:
        input_parameters: dict of settings. Keys read here: 'normal_bam',
            'tumor_bam', 'genome_reference', 'reference_dict',
            'output_directory', 'inclusion_region', 'scalpel_image',
            'extra_docker_options', 'scalpel_two_pass',
            'scalpel_discovery_arguments', 'scalpel_export_arguments', 'MEM',
            'script', 'outfile' and 'action'. 'inclusion_region' must be set:
            its mounted path is looked up unconditionally for ``--bed``.
        tech: container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script (``<output_directory>/logs/<script>``).

    Raises:
        AssertionError: if a BAM, the reference or the reference dict is
            missing.
    """

    # Fill in whatever the caller did not supply (mutates the caller's dict).
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])
    assert os.path.exists(input_parameters['reference_dict'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only paths that are actually set are handed to the container helper.
    candidate_paths = (
        input_parameters['normal_bam'],
        input_parameters['tumor_bam'],
        input_parameters['genome_reference'],
        input_parameters['output_directory'],
        input_parameters['inclusion_region'],
        input_parameters['reference_dict'],
    )
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params(
        input_parameters['scalpel_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_reference_dict = fileDict[
        input_parameters['reference_dict']]['mount_path']
    mounted_inclusion = fileDict[
        input_parameters['inclusion_region']]['mount_path']

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{container_line} bash -c \\\n')
        out.write('"/opt/scalpel/scalpel-discovery --somatic \\\n')
        out.write('--ref {} \\\n'.format(mounted_genome_reference))
        out.write('--bed {} \\\n'.format(mounted_inclusion))
        out.write('--normal {} \\\n'.format(mounted_normal_bam))
        out.write('--tumor {} \\\n'.format(mounted_tumor_bam))
        out.write('--window 600 \\\n')

        if input_parameters['scalpel_two_pass']:
            out.write('--two-pass \\\n')

        # The '{}' field is positional, so the value must be passed
        # positionally (a keyword argument raises IndexError).
        if input_parameters['scalpel_discovery_arguments']:
            out.write('{} \\\n'.format(
                input_parameters['scalpel_discovery_arguments']))

        out.write('--dir {}/scalpel && \\\n'.format(mounted_outdir))
        out.write('/opt/scalpel/scalpel-export --somatic \\\n')
        out.write(
            '--db {}/scalpel/main/somatic.db.dir \\\n'.format(mounted_outdir))
        out.write('--ref {} \\\n'.format(mounted_genome_reference))
        out.write('--bed {} \\\n'.format(mounted_inclusion))

        # Skip unset export arguments so that None never ends up in the
        # command as the literal text "None".
        if input_parameters['scalpel_export_arguments']:
            out.write('{} \\\n'.format(
                input_parameters['scalpel_export_arguments']))

        out.write('> {}/scalpel/scalpel.vcf"\n\n'.format(mounted_outdir))

        out.write(f'{container_line} bash -c \\\n')
        out.write(
            '"cat {}/scalpel/scalpel.vcf | /opt/vcfsorter.pl {} - \\\n'.format(
                mounted_outdir, mounted_reference_dict))
        out.write('> {}/{}\"\n'.format(mounted_outdir,
                                       input_parameters['outfile']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #10
0
def picard(inbams,
           outbam,
           tech='docker',
           input_parameters=None,
           remove_inbams=False):
    """Write a Picard MergeSamFiles job script and pass it to the configured action.

    Keys missing from ``input_parameters`` are filled in place from the
    module-level ``DEFAULT_PARAMS``. The script merges ``inbams`` into
    ``outbam``, optionally removes the inputs, and renames the index file
    from the name obtained by replacing a trailing "m" of ``outbam`` with
    "i" to ``outbam`` plus ".bai"/".crai". Finally ``<action> <script>`` is
    executed through the shell.

    Args:
        inbams: iterable of input alignment file paths.
        outbam: output path; must end in ".bam" or ".cram".
        tech: container technology, forwarded to
            ``container.container_params``.
        input_parameters: dict of settings, or None for defaults only. Keys
            read here: 'output_directory', 'script', 'picard_image',
            'extra_docker_options', 'MEM', 'extra_picard_arguments' and
            'action'.
        remove_inbams: when true, the script deletes the input files after
            merging.

    Returns:
        Path of the generated script (``<output_directory>/logs/<script>``).

    Raises:
        Exception: if ``outbam`` ends in neither ".bam" nor ".cram".
    """

    # A fresh dict per call: a shared mutable default would be filled in
    # place below and leak settings into later calls.
    if input_parameters is None:
        input_parameters = {}

    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    all_paths = list(inbams) + [
        outbam,
    ]
    merge_line, fileDict = container.container_params(
        input_parameters['picard_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    mounted_outbam = fileDict[outbam]['mount_path']

    # One "I=<path> " token per input; the trailing space is kept.
    infile_string = ''.join(
        'I={} '.format(fileDict[file_i]['mount_path']) for file_i in inbams)

    # Index name derived from outbam by swapping the final "m" for "i".
    picard_index_file = re.sub(r'm$', 'i', outbam)

    if outbam.endswith('.bam'):
        samtools_index_file = outbam + '.bai'
    elif outbam.endswith('.cram'):
        samtools_index_file = outbam + '.crai'
    else:
        raise Exception('Output file {} seems wrong.'.format(outbam))

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n'
        )  # Do not change this: picard_fractional uses this to end the copying.

        out.write(f'{merge_line} \\\n')
        out.write(
            'java -Xmx{}G -jar /opt/picard.jar MergeSamFiles {} {} ASSUME_SORTED=true CREATE_INDEX=true O={}\n\n'
            .format(input_parameters['MEM'], infile_string,
                    input_parameters['extra_picard_arguments'],
                    mounted_outbam))

        if remove_inbams:
            out.write('rm {}\n\n'.format(' '.join(inbams)))

        out.write('mv {} {}\n\n'.format(picard_index_file,
                                        samtools_index_file))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #11
0
def picard(input_parameters, tech='docker'):
    """Write a Picard MarkDuplicatesWithMateCigar job script and pass it to the action.

    Keys missing from ``input_parameters`` are filled in place from the
    module-level ``DEFAULT_PARAMS``. The script creates a uniquely named
    temporary directory under the output directory, marks duplicates,
    optionally indexes the result with samtools, and removes the temporary
    directory. Finally ``<action> <script>`` is executed through the shell.

    Args:
        input_parameters: dict of settings. Keys read here:
            'output_directory', 'in_bam', 'out_bam', 'script',
            'picard_image', 'samtools_image', 'extra_docker_options', 'MEM',
            'threads', 'index_bam' and 'action'.
        tech: container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script (``<output_directory>/logs/<script>``).
    """

    # Fill in whatever the caller did not supply (mutates the caller's dict).
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only paths that are actually set are handed to the container helper.
    all_paths = [
        path_i for path_i in (input_parameters['output_directory'],
                              input_parameters['in_bam']) if path_i
    ]

    markdup_line, fileDict = container.container_params(
        input_parameters['picard_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])
    samtools_line, stDict = container.container_params(
        input_parameters['samtools_image'], tech, [
            input_parameters['output_directory'],
        ], input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_inbam = fileDict[input_parameters['in_bam']]['mount_path']

    # Metrics file name: input name minus a trailing .bam/.cram, plus
    # ".markdup". The pattern is anchored so that ".bam"/".cram" occurring
    # elsewhere in the name is left alone.
    metrics_name = re.sub(
        r'\.(bam|cram)$', '',
        fileDict[input_parameters['in_bam']]['filename']) + '.markdup'

    tempdir = uuid.uuid4().hex
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n'
        )  # Do not change this: fractional uses this to end the copying.

        out.write('mkdir -p {}/{}\n\n'.format(
            input_parameters['output_directory'], tempdir))

        out.write(f'{markdup_line} \\\n')
        out.write(
            'java -Xmx{}G -jar /opt/picard.jar MarkDuplicatesWithMateCigar \\\n'
            .format(input_parameters['MEM']))
        out.write('I={} \\\n'.format(mounted_inbam))
        out.write('M={}/{} \\\n'.format(mounted_outdir, metrics_name))
        out.write('ASSUME_SORT_ORDER=coordinate \\\n')
        out.write('TMP_DIR={}/{} \\\n'.format(mounted_outdir, tempdir))
        out.write('MINIMUM_DISTANCE=1000 \\\n')
        out.write('O={}/{}\n\n'.format(mounted_outdir,
                                       input_parameters['out_bam']))

        if input_parameters['index_bam']:
            out.write(f'{samtools_line} \\\n')
            out.write('samtools index -@{} {}/{}\n\n'.format(
                input_parameters['threads'],
                stDict[input_parameters['output_directory']]['mount_path'],
                input_parameters['out_bam']))

        # The temporary directory is removed via its host path.
        out.write('rm -r {}/{}\n'.format(input_parameters['output_directory'],
                                         tempdir))

        out.write(
            '\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n'
        )  # Do not change this: fractional uses this to end the copying.

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #12
0
def fractional(bed, input_parameters, tech='docker'):
    """Write a job script that de-duplicates only the reads overlapping ``bed``.

    Keys missing from ``input_parameters`` are filled in place from the
    module-level ``DEFAULT_PARAMS``. The generated script extracts the region
    with ``sambamba view`` into a temporary BAM next to ``bed``, then embeds
    the command section of a helper script produced by ``picard`` or
    ``sambamba`` (everything from its "Start" echo up to, but excluding, its
    "Done" echo), and removes the temporary BAM. Finally
    ``<action> <script>`` is executed through the shell.

    Args:
        bed: path of the BED file; its parent directory is the output
            directory.
        input_parameters: dict of settings. Keys read here: 'software'
            ('picard' or 'sambamba'), 'in_bam', 'sambamba_image',
            'extra_docker_options', 'MEM' and 'action'; a copy with
            overridden output settings is passed to the helper.
        tech: container technology, forwarded to
            ``container.container_params`` and to the helper.

    Returns:
        Tuple ``(script_path, deduped_bam_path)``.

    Raises:
        ValueError: if 'software' is neither 'picard' nor 'sambamba'.
        RuntimeError: if the helper script has no "Start" marker line.
    """

    # Fill in whatever the caller did not supply (mutates the caller's dict).
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # Fail before anything is written: an unknown tool would otherwise leave
    # no helper script to copy from.
    software = input_parameters['software']
    if software not in ('picard', 'sambamba'):
        raise ValueError(
            'Unsupported de-duplication software: {!r}'.format(software))

    outdir = str(Path(bed).absolute().parent)

    logdir = os.path.join(outdir, 'logs')
    outfile = os.path.join(logdir, 'markdup_fractional.{}.cmd'.format(ts))
    os.makedirs(logdir, exist_ok=True)

    sambam_line, stDict = container.container_params(
        input_parameters['sambamba_image'], tech, [
            input_parameters['in_bam'],
            bed,
        ], input_parameters['extra_docker_options'])

    mounted_inbam = stDict[input_parameters['in_bam']]['mount_path']
    mounted_bed = stDict[bed]['mount_path']
    mounted_outdir = stDict[bed]['mount_dir']

    temp_split_bam = uuid.uuid4().hex + '.bam'
    split_deduped_bam = uuid.uuid4().hex + '.bam'
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{sambam_line} \\\n')
        out.write('sambamba view -L {} -t {} -f bam -o {} {}\n\n'.format(
            mounted_bed, 1, os.path.join(mounted_outdir, temp_split_bam),
            mounted_inbam))

        # Settings for the helper script that de-duplicates the split BAM.
        fractional_parameters = copy(input_parameters)
        fractional_parameters['output_directory'] = outdir
        fractional_parameters['in_bam'] = os.path.join(outdir, temp_split_bam)
        fractional_parameters['out_bam'] = split_deduped_bam
        fractional_parameters['script'] = 'to_be_deleted.{}.cmd'.format(ts)
        fractional_parameters['index_bam'] = False

        if software == 'picard':
            picard(fractional_parameters, tech)
        else:
            fractional_parameters['threads'] = 2
            sambamba(fractional_parameters, tech)

        with open(os.path.join(logdir,
                               fractional_parameters['script'])) as dedup:

            # Skip the helper's header up to its "Start" echo; stop at EOF
            # rather than looping forever if the marker is absent.
            line_i = dedup.readline()
            while line_i and not line_i.startswith('echo -e "Start'):
                line_i = dedup.readline()

            if not line_i:
                raise RuntimeError(
                    'No "Start" marker found in {}'.format(dedup.name))

            # Copy from the "Start" line up to (excluding) the "Done" echo.
            while line_i and not line_i.startswith('echo -e "Done'):
                out.write(line_i)
                line_i = dedup.readline()

        out.write('rm {}\n'.format(os.path.join(outdir, temp_split_bam)))
        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile, os.path.join(outdir, split_deduped_bam)
Пример #13
0
def sambamba(input_parameters, tech='docker'):
    """Write a ``sambamba markdup`` job script and pass it to the configured action.

    Settings absent from ``input_parameters`` are taken from the module-level
    ``DEFAULT_PARAMS`` and stored in the caller's dict. The script makes a
    uniquely named temporary directory under the output directory, runs
    ``sambamba markdup`` in a container, and removes the temporary directory.
    Afterwards ``<action> <script>`` is executed through the shell.

    Args:
        input_parameters: dict of settings. Keys read here:
            'output_directory', 'in_bam', 'out_bam', 'script',
            'sambamba_image', 'extra_docker_options', 'MEM', 'threads' and
            'action'.
        tech: container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script (``<output_directory>/logs/<script>``).
    """
    for key, default_value in DEFAULT_PARAMS.items():
        input_parameters.setdefault(key, default_value)

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only non-empty paths are handed to the container helper.
    all_paths = [
        path for path in (input_parameters['output_directory'],
                          input_parameters['in_bam']) if path
    ]

    markdup_line, mounts = container.container_params(
        input_parameters['sambamba_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Container-side locations of the output directory and the input BAM.
    mounted_outdir = mounts[input_parameters['output_directory']]['mount_path']
    mounted_inbam = mounts[input_parameters['in_bam']]['mount_path']

    scratch_name = uuid.uuid4().hex
    with open(outfile, 'w') as script:
        emit = script.write

        emit("#!/bin/bash\n\n")

        emit('#$ -o {}\n'.format(logdir))
        emit('#$ -e {}\n'.format(logdir))
        emit('#$ -S /bin/bash\n')
        emit(f"#$ -l h_vmem={input_parameters['MEM']}G\n")
        emit('set -e\n\n')

        # Do not change this: fractional uses this to end the copying.
        emit('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        emit(f"mkdir -p {input_parameters['output_directory']}/{scratch_name}\n\n")

        emit('{} \\\n'.format(markdup_line))
        threads = input_parameters['threads']
        mounted_scratch = os.path.join(mounted_outdir, scratch_name)
        mounted_outbam = os.path.join(mounted_outdir,
                                      input_parameters['out_bam'])
        emit(f'sambamba markdup -t {threads} --tmpdir {mounted_scratch} '
             f'{mounted_inbam} {mounted_outbam}\n\n')

        emit(f"rm -r {input_parameters['output_directory']}/{scratch_name}\n")

        # Do not change this: fractional uses this to end the copying.
        emit('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    subprocess.call(input_parameters['action'] + ' ' + outfile
                    if isinstance(input_parameters['action'], str) else
                    '{} {}'.format(input_parameters['action'], outfile),
                    shell=True)

    return outfile
Пример #14
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a tumor-normal MuSE job script and pass it to the configured action.

    Keys missing from ``input_parameters`` are filled in place from the
    module-level ``DEFAULT_PARAMS``. The script reduces the inclusion BED to
    its first three columns, runs ``MuSE call`` and then ``MuSE sump`` in a
    container. Finally ``<action> <script>`` is executed through the shell.

    Args:
        input_parameters: dict of settings. Keys read here: 'normal_bam',
            'tumor_bam', 'genome_reference', 'dbsnp_gz', 'output_directory',
            'inclusion_region', 'muse_image', 'extra_docker_options',
            'exome', 'muse_arguments', 'MEM', 'script', 'outfile' and
            'action'.
        tech: container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script (``<output_directory>/logs/<script>``).

    Raises:
        AssertionError: if a BAM, the reference, the dbSNP file or its
            ``.tbi`` index is missing.
    """

    # Fill in whatever the caller did not supply.
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])
    assert os.path.exists(input_parameters['dbsnp_gz'])
    assert os.path.exists(input_parameters['dbsnp_gz'] + '.tbi')

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only paths that are actually set are handed to the container helper.
    candidate_paths = (
        input_parameters['normal_bam'],
        input_parameters['tumor_bam'],
        input_parameters['genome_reference'],
        input_parameters['output_directory'],
        input_parameters['inclusion_region'],
        input_parameters['dbsnp_gz'],
    )
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params(
        input_parameters['muse_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_dbsnp_gz = fileDict[input_parameters['dbsnp_gz']]['mount_path']

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # Runs outside the container, hence host paths; keeps only the first
        # three BED columns.
        out.write(
            'cat {} | awk -F "\\t" \'{{print $1 "\\t" $2 "\\t" $3}}\' > {}/bed_3columns.bed\n\n'
            .format(input_parameters['inclusion_region'],
                    input_parameters['output_directory']))

        out.write(f'{container_line} \\\n')
        out.write('MuSEv1.0rc_submission_c039ffa call \\\n')
        out.write('-O {}/MuSE \\\n'.format(mounted_outdir))
        out.write('-l {}/bed_3columns.bed \\\n'.format(mounted_outdir))
        out.write('-f {} \\\n'.format(mounted_genome_reference))
        out.write('{} \\\n'.format(mounted_tumor_bam))
        out.write('{}\n\n'.format(mounted_normal_bam))

        out.write(f'{container_line} \\\n')
        out.write('MuSEv1.0rc_submission_c039ffa sump \\\n')
        out.write('-I {}/MuSE.MuSE.txt \\\n'.format(mounted_outdir))

        if input_parameters['exome']:
            out.write('-E \\\n')
        else:
            out.write('-G \\\n')

        # The '{}' field is positional, so the value must be passed
        # positionally (a keyword argument raises IndexError).
        if input_parameters['muse_arguments']:
            out.write('{} \\\n'.format(input_parameters['muse_arguments']))

        out.write('-O {}/{} \\\n'.format(mounted_outdir,
                                         input_parameters['outfile']))
        out.write('-D {}\n'.format(mounted_dbsnp_gz))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #15
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a tumor-normal VarScan2 job script and pass it to the configured action.

    Keys missing from ``input_parameters`` are filled in place from the
    module-level ``DEFAULT_PARAMS``. The script creates normal and tumor
    pileups with ``samtools mpileup``, runs VarScan ``somatic``,
    ``processSomatic`` and ``somaticFilter``, then deletes the pileups.
    Finally ``<action> <script>`` is executed through the shell.

    Args:
        input_parameters: dict of settings. Keys read here: 'normal_bam',
            'tumor_bam', 'genome_reference', 'output_directory',
            'inclusion_region', 'varscan2_image', 'extra_docker_options',
            'minimum_VAF', 'min_MQ', 'min_BQ', 'varscan_pileup_arguments',
            'varscan_arguments', 'MEM', 'script', 'outfile' and 'action'.
        tech: container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script (``<output_directory>/logs/<script>``).

    Raises:
        AssertionError: if either BAM file or the genome reference is missing.
    """

    # Fill in whatever the caller did not supply.
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only paths that are actually set are handed to the container helper.
    candidate_paths = (
        input_parameters['normal_bam'],
        input_parameters['tumor_bam'],
        input_parameters['genome_reference'],
        input_parameters['output_directory'],
        input_parameters['inclusion_region'],
    )
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params(
        input_parameters['varscan2_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])
    mpileup_line, plDict = container.container_params(
        'lethalfang/samtools:1.7',
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Output directory as seen from the VarScan container.
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']

    # Mounted paths for mpileup dockers
    pl_genome_reference = plDict[
        input_parameters['genome_reference']]['mount_path']
    pl_tumor_bam = plDict[input_parameters['tumor_bam']]['mount_path']
    pl_normal_bam = plDict[input_parameters['normal_bam']]['mount_path']
    pl_outdir = plDict[input_parameters['output_directory']]['mount_path']

    if input_parameters['inclusion_region']:
        selector_text = '-l {}'.format(
            plDict[input_parameters['inclusion_region']]['mount_path'])
    else:
        selector_text = ''

    # Emit --min-var-freq only when a value is configured, so an unset value
    # never reaches VarScan as the literal text "None". A value of 0 is kept.
    min_vaf = input_parameters['minimum_VAF']
    if min_vaf is None or min_vaf == '':
        min_vaf_text = ''
    else:
        min_vaf_text = ' --min-var-freq {}'.format(min_vaf)

    # Output prefix: the outfile name without its final alphabetic extension.
    outname = re.sub(r'\.[a-zA-Z]+$', '', input_parameters['outfile'])

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # Same mpileup stanza for the normal sample, then the tumor sample.
        for pl_bam, sample in ((pl_normal_bam, 'normal'),
                               (pl_tumor_bam, 'tumor')):
            out.write(f'{mpileup_line} bash -c \\\n')
            out.write('"samtools mpileup \\\n')
            out.write(
                '-B -q {minMQ} -Q {minBQ} {extra_pileup_arguments} {selector_text} -f \\\n'
                .format(minMQ=input_parameters['min_MQ'],
                        minBQ=input_parameters['min_BQ'],
                        extra_pileup_arguments=input_parameters[
                            'varscan_pileup_arguments'],
                        selector_text=selector_text))
            out.write('{} \\\n'.format(pl_genome_reference))
            out.write('{} \\\n'.format(pl_bam))
            out.write('> {}/{}.pileup"\n\n'.format(pl_outdir, sample))

        out.write(f'{container_line} \\\n')
        out.write('java -Xmx{} -jar /VarScan2.3.7.jar somatic \\\n'.format(
            input_parameters['MEM']))
        out.write('{}/normal.pileup \\\n'.format(mounted_outdir))
        out.write('{}/tumor.pileup \\\n'.format(mounted_outdir))
        out.write('{}/{} {} --output-vcf 1{}\n\n'.format(
            mounted_outdir, outname, input_parameters['varscan_arguments'],
            min_vaf_text))

        out.write(f'{container_line} \\\n')
        out.write(
            'java -Xmx{} -jar /VarScan2.3.7.jar processSomatic \\\n'.format(
                input_parameters['MEM']))
        out.write('{}/{}.snp.vcf\n\n'.format(mounted_outdir, outname))

        out.write(f'{container_line} \\\n')
        out.write(
            'java -Xmx{} -jar /VarScan2.3.7.jar somaticFilter \\\n'.format(
                input_parameters['MEM']))
        out.write('{}/{}.snp.Somatic.hc.vcf \\\n'.format(
            mounted_outdir, outname))
        out.write('-indel-file {}/{}.indel.vcf \\\n'.format(
            mounted_outdir, outname))
        out.write('-output-file {}/{}.snp.Somatic.hc.filter.vcf\n\n'.format(
            mounted_outdir, outname))

        # The pileups are removed via their host paths.
        out.write('rm {}/normal.pileup\n'.format(
            input_parameters['output_directory']))
        out.write('rm {}/tumor.pileup\n'.format(
            input_parameters['output_directory']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #16
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a bash script that runs JointSNVMix2 on a tumor/normal pair.

    The generated script runs ``jsm.py train joint_snv_mix_two``, writes a
    VCF header, runs ``jsm.py classify joint_snv_mix_two`` and appends the
    awk-reformatted, sorted calls to that VCF. When ``threads`` is greater
    than 1 it additionally intersects the VCF with ``<i>/<i>.bed`` for every
    ``i`` from 1 to ``threads``. After the script is written it is passed to
    the ``action`` command through the shell.

    Args:
        input_parameters: Dict of options; any key missing from it is copied
            in from ``DEFAULT_PARAMS`` (the dict is modified in place).
        tech: Forwarded to ``container.container_params``.

    Returns:
        Path of the generated script, ``<output_directory>/logs/<script>``.
    """
    # Copy in every default the caller did not supply.
    for option, fallback in DEFAULT_PARAMS.items():
        input_parameters.setdefault(option, fallback)

    # These inputs must already exist on disk.
    for required_key in ('normal_bam', 'tumor_bam', 'genome_reference',
                         'reference_dict'):
        assert os.path.exists(input_parameters[required_key])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Paths handed to the container helper; empty entries are skipped.
    path_keys = ('normal_bam', 'tumor_bam', 'genome_reference',
                 'output_directory', 'reference_dict')
    all_paths = [
        input_parameters[key] for key in path_keys if input_parameters[key]
    ]

    container_line, mounts = container.container_params(
        input_parameters['jsm2_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    def in_container(key):
        """Return the mount path recorded for the path stored under *key*."""
        return mounts[input_parameters[key]]['mount_path']

    mounted_genome_reference = in_container('genome_reference')
    mounted_tumor_bam = in_container('tumor_bam')
    mounted_normal_bam = in_container('normal_bam')
    mounted_outdir = in_container('output_directory')
    mounted_reference_dict = in_container('reference_dict')

    # Header of the output VCF, echoed line by line into the host-side file.
    # The doubled backslashes keep a literal "\t" for `echo -e` to expand.
    vcf_header_lines = (
        '##fileformat=VCFv4.1',
        '##INFO=<ID=AAAB,Number=1,Type=Float,Description="Probability of Joint Genotype AA in Normal and AB in Tumor">',
        '##INFO=<ID=AABB,Number=1,Type=Float,Description="Probability of Joint Genotype AA in Normal and BB in Tumor">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)">',
        '##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)">',
        '#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tNORMAL\\tTUMOR',
    )

    with open(outfile, 'w') as out:
        emit = out.write

        # Interpreter line, `#$` directives and start timestamp.
        out.writelines([
            "#!/bin/bash\n\n",
            f'#$ -o {logdir}\n',
            f'#$ -e {logdir}\n',
            '#$ -S /bin/bash\n',
            '#$ -l h_vmem={}\n'.format(input_parameters['MEM']),
            'set -e\n\n',
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n',
            '\n',
        ])

        # Step 1: train the model; parameters land in jsm.parameter.cfg.
        out.writelines([
            f'{container_line} \\\n',
            '/opt/JointSNVMix-0.7.5/build/scripts-2.7/jsm.py train joint_snv_mix_two \\\n',
            '--convergence_threshold {} \\\n'.format(
                input_parameters['converge_threshold']),
            '--skip_size {} \\\n'.format(input_parameters['skip_size']),
        ])

        train_extra = input_parameters['jsm_train_arguments']
        if train_extra:
            emit(f'{train_extra} \\\n')

        out.writelines([
            f'{mounted_genome_reference} \\\n',
            f'{mounted_normal_bam} \\\n',
            f'{mounted_tumor_bam} \\\n',
            '/opt/JointSNVMix-0.7.5/config/joint_priors.cfg \\\n',
            '/opt/JointSNVMix-0.7.5/config/joint_params.cfg \\\n',
            f'{mounted_outdir}/jsm.parameter.cfg\n',
            '\n',
        ])

        # Step 2: start the VCF with its header (first line truncates the
        # file, the remaining lines append).
        host_vcf = '{}/{}'.format(input_parameters['output_directory'],
                                  input_parameters['outfile'])
        redirect = '>'
        for header_line in vcf_header_lines:
            emit(f"echo -e '{header_line}' {redirect} {host_vcf}\n")
            redirect = '>>'
        emit('\n')

        # Step 3: classify, filter/reformat with awk, sort, append to the VCF.
        emit(f'{container_line} bash -c \\\n')
        emit(
            '"/opt/JointSNVMix-0.7.5/build/scripts-2.7/jsm.py classify joint_snv_mix_two \\\n'
        )

        classify_extra = input_parameters['jsm_classify_arguments']
        if classify_extra:
            emit(f'{classify_extra} \\\n')

        out.writelines([
            f'{mounted_genome_reference} \\\n',
            f'{mounted_normal_bam} \\\n',
            f'{mounted_tumor_bam} \\\n',
            f'{mounted_outdir}/jsm.parameter.cfg \\\n',
        ])
        emit(
            '/dev/stdout | awk -F \'\\t\' \'NR!=1 && \\$4!=\\"N\\" && \\$10+\\$11>=0.95\' | \\\n'
        )
        emit(
            'awk -F \'\\t\' \'{print \\$1 \\"\\t\\" \\$2 \\"\\t.\\t\\" \\$3 \\"\\t\\" \\$4 \\"\\t.\\t.\\tAAAB=\\" \\$10 \\";AABB=\\" \\$11 \\"\\tRD:AD\\t\\" \\$5 \\":\\" \\$6 \\"\\t\\" \\$7 \\":\\" \\$8}\' \\\n'
        )
        vcf_name = input_parameters['outfile']
        emit(
            f'| /opt/vcfsorter.pl {mounted_reference_dict} - >> {mounted_outdir}/{vcf_name}"\n\n'
        )

        # Step 4 (only with more than one thread): intersect the VCF with
        # each numbered BED file via a bash while-loop.
        thread_count = input_parameters['threads']
        if thread_count > 1:

            bedtool_line, bed_mounts = container.container_params(
                'lethalfang/bedtools:2.26.0', tech,
                (input_parameters['output_directory'], ))
            mounted_bed_outdir = bed_mounts[
                input_parameters['output_directory']]['mount_path']
            bed_vcf_name = input_parameters['outfile']

            emit('\n\ni=1\n')
            emit(f'while [[ $i -le {thread_count} ]]\n')
            emit('do\n')
            emit(
                f'    {bedtool_line} bash -c "bedtools intersect -a {mounted_bed_outdir}/{bed_vcf_name} -b {mounted_bed_outdir}/${{i}}/${{i}}.bed -header | uniq > {mounted_bed_outdir}/${{i}}/{bed_vcf_name}"\n'
            )
            emit('    i=$(( $i + 1 ))\n')
            emit('done\n')

        emit('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # Hand the finished script to the configured action command; the exit
    # status of that command is not inspected.
    action = input_parameters['action']
    subprocess.call(f'{action} {outfile}', shell=True)

    return outfile
Пример #17
0
def trimmomatic(input_parameters, tech='docker'):
    """Write a bash script that runs Trimmomatic in a container, then launch it.

    The script runs Trimmomatic in ``PE`` mode when ``in_fastq2`` is set and
    in ``SE`` mode otherwise, optionally removes the untrimmed input FASTQ
    files afterwards, and is finally passed to the ``action`` command
    through the shell.

    Args:
        input_parameters: Dict of options; any key missing from it is copied
            in from ``DEFAULT_PARAMS`` (the dict is modified in place).
            Keys read here: ``output_directory``, ``script``, ``in_fastq1``,
            ``in_fastq2`` (optional / may be empty for single-end data),
            ``trimmomaticImage``, ``extra_docker_options``, ``MEM``,
            ``threads``, ``out_fastq1_name``, ``out_fastq2_name`` (paired-end
            only), ``adapter``, ``minimum_length``, ``remove_untrimmed`` and
            ``action``.
        tech: Forwarded to ``container.container_params``.

    Returns:
        Path of the generated script, ``<output_directory>/logs/<script>``.
    """
    # Merge defaults first so that optional keys such as 'in_fastq2' may be
    # left out by the caller without triggering a KeyError below.
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    output_directory = input_parameters['output_directory']
    in_fastq1 = input_parameters['in_fastq1']
    in_fastq2 = input_parameters.get('in_fastq2')
    paired_end = bool(in_fastq2)

    logdir = os.path.join(output_directory, 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only non-empty paths are handed to the container helper.
    all_paths = [
        path_i for path_i in (output_directory, in_fastq1, in_fastq2)
        if path_i
    ]

    trim_line, fileDict = container.container_params(
        input_parameters['trimmomaticImage'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[output_directory]['mount_path']
    mounted_fq1 = fileDict[in_fastq1]['mount_path']
    # The second FASTQ was only passed to the container helper for
    # paired-end runs, so it must not be looked up in single-end mode.
    mounted_fq2 = fileDict[in_fastq2]['mount_path'] if paired_end else None

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{trim_line} \\\n')
        out.write(
            'java -Xmx{}G -jar /opt/Trimmomatic/trimmomatic.jar \\\n'.format(
                input_parameters['MEM']))

        if paired_end:
            out.write('PE -threads {} -phred33 \\\n'.format(
                input_parameters['threads']))
            out.write(
                '{FQ1} {FQ2} {DIR}/{PAIR1} {DIR}/{UNPAIR1} {DIR}/{PAIR2} {DIR}/{UNPAIR2} \\\n'
                .format(
                    FQ1=mounted_fq1,
                    FQ2=mounted_fq2,
                    DIR=mounted_outdir,
                    PAIR1=input_parameters['out_fastq1_name'],
                    PAIR2=input_parameters['out_fastq2_name'],
                    UNPAIR1='unpaired.' + input_parameters['out_fastq1_name'],
                    UNPAIR2='unpaired.' + input_parameters['out_fastq2_name']))

        else:
            out.write('SE -threads {} -phred33 \\\n'.format(
                input_parameters['threads']))
            out.write('{FQ1} {DIR}/{PAIR1} \\\n'.format(
                FQ1=mounted_fq1,
                DIR=mounted_outdir,
                PAIR1=input_parameters['out_fastq1_name']))

        out.write(
            'ILLUMINACLIP:{ADAPTER}:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:{MINLEN}\n'
            .format(ADAPTER=input_parameters['adapter'],
                    MINLEN=input_parameters['minimum_length']))

        # Remove untrimmed files (host-side paths, outside the container):
        if input_parameters['remove_untrimmed']:
            out.write('\n')
            out.write('rm {}\n'.format(in_fastq1))

            if paired_end:
                out.write('rm {}\n'.format(in_fastq2))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated; the exit status of the action
    # command is not inspected.
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Пример #18
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a bash script that runs GATK Mutect2 on a tumor/normal pair.

    The generated script first extracts the tumor and normal sample names
    from the ``@RG`` header lines of the two BAM files with samtools, then
    runs ``Mutect2`` followed by ``FilterMutectCalls``. After the script is
    written it is passed to the ``action`` command through the shell.

    Args:
        input_parameters: Dict of options; any key missing from it is copied
            in from ``DEFAULT_PARAMS`` (the dict is modified in place).
        tech: Forwarded to ``container.container_params``.

    Returns:
        Path of the generated script, ``<output_directory>/logs/<script>``.
    """
    # Copy in every default the caller did not supply.
    for option, fallback in DEFAULT_PARAMS.items():
        input_parameters.setdefault(option, fallback)

    # These inputs must already exist on disk.
    for required_key in ('normal_bam', 'tumor_bam', 'genome_reference'):
        assert os.path.exists(input_parameters[required_key])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Paths handed to the container helper; empty entries (for example an
    # unset inclusion region) are skipped.
    path_keys = ('normal_bam', 'tumor_bam', 'genome_reference',
                 'output_directory', 'inclusion_region')
    all_paths = [
        input_parameters[key] for key in path_keys if input_parameters[key]
    ]

    container_line, mounts = container.container_params(
        input_parameters['mutect2_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])
    # Separate samtools containers are used to read each BAM header.
    tumor_name_line, tumor_mounts = container.container_params(
        'lethalfang/samtools:1.7', tech, (input_parameters['tumor_bam'], ))
    normal_name_line, normal_mounts = container.container_params(
        'lethalfang/samtools:1.7', tech, (input_parameters['normal_bam'], ))

    def in_container(key):
        """Return the mount path recorded for the path stored under *key*."""
        return mounts[input_parameters[key]]['mount_path']

    mounted_genome_reference = in_container('genome_reference')
    mounted_tumor_bam = in_container('tumor_bam')
    mounted_normal_bam = in_container('normal_bam')
    mounted_outdir = in_container('output_directory')

    inclusion_region = input_parameters['inclusion_region']
    if inclusion_region:
        mounted_inclusion = mounts[inclusion_region]['mount_path']

    def sample_name_assignment(shell_var, samtools_line, bam_path):
        """Build the shell line storing a BAM's SM tag value in *shell_var*."""
        return f'{shell_var}=`{samtools_line} samtools view -H {bam_path} | egrep -w \'^@RG\' | grep -Po \'SM:[^\\t$]+\' | sed \'s/SM://\' | uniq | sed -e \'s/[[:space:]]*$//\'`\n'

    with open(outfile, 'w') as out:
        emit = out.write

        # Interpreter line, `#$` directives and start timestamp.
        out.writelines([
            "#!/bin/bash\n\n",
            f'#$ -o {logdir}\n',
            f'#$ -e {logdir}\n',
            '#$ -S /bin/bash\n',
            '#$ -l h_vmem={}\n'.format(input_parameters['MEM']),
            'set -e\n\n',
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n',
        ])

        # Shell variables ${tumor_name} / ${normal_name} used by Mutect2.
        tumor_bam_path = tumor_mounts[
            input_parameters['tumor_bam']]['mount_path']
        emit(
            sample_name_assignment('tumor_name', tumor_name_line,
                                   tumor_bam_path))

        normal_bam_path = normal_mounts[
            input_parameters['normal_bam']]['mount_path']
        emit(
            sample_name_assignment('normal_name', normal_name_line,
                                   normal_bam_path))

        emit('\n')

        # Mutect2 call; writes unfiltered.<outfile> into the output directory.
        java_heap = input_parameters['MEM']
        emit(f'{container_line} \\\n')
        emit(f'java -Xmx{java_heap} -jar /gatk/gatk.jar Mutect2 \\\n')
        emit(f'--reference {mounted_genome_reference} \\\n')

        if input_parameters['inclusion_region']:
            emit(f'--intervals {mounted_inclusion} \\\n')

        out.writelines([
            f'--input {mounted_tumor_bam} \\\n',
            f'--input {mounted_normal_bam} \\\n',
            '--normal-sample ${normal_name} \\\n',
            '--tumor-sample ${tumor_name} \\\n',
            '--native-pair-hmm-threads 1 \\\n',
        ])

        mutect2_extra = input_parameters['mutect2_arguments']
        if mutect2_extra:
            emit(f'{mutect2_extra} \\\n')

        vcf_name = input_parameters['outfile']
        emit(f'--output {mounted_outdir}/unfiltered.{vcf_name}\n\n')

        # FilterMutectCalls turns the unfiltered VCF into the final output.
        filter_heap = input_parameters['MEM']
        emit(f'{container_line} \\\n')
        emit(f'java -Xmx{filter_heap} -jar /gatk/gatk.jar FilterMutectCalls \\\n')
        emit('--variant {}/unfiltered.{} \\\n'.format(
            mounted_outdir, input_parameters['outfile']))

        filter_extra = input_parameters['mutect2_filter_arguments']
        if filter_extra:
            emit(f'{filter_extra} \\\n')

        emit('--output {}/{}\n'.format(mounted_outdir,
                                       input_parameters['outfile']))

        emit('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # Hand the finished script to the configured action command; the exit
    # status of that command is not inspected.
    action = input_parameters['action']
    subprocess.call(f'{action} {outfile}', shell=True)

    return outfile
Пример #19
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a bash script that runs bam-somaticsniper on a tumor/normal pair.

    The generated script calls ``bam-somaticsniper`` with VCF output and,
    when ``threads`` is greater than 1, intersects the resulting VCF with
    ``<i>/<i>.bed`` for every ``i`` from 1 to ``threads``. After the script
    is written it is passed to the ``action`` command through the shell.

    Args:
        input_parameters: Dict of options; any key missing from it is copied
            in from ``DEFAULT_PARAMS`` (the dict is modified in place).
        tech: Forwarded to ``container.container_params``.

    Returns:
        Path of the generated script, ``<output_directory>/logs/<script>``.
    """
    # Copy in every default the caller did not supply.
    for option, fallback in DEFAULT_PARAMS.items():
        input_parameters.setdefault(option, fallback)

    # These inputs must already exist on disk.
    for required_key in ('normal_bam', 'tumor_bam', 'genome_reference'):
        assert os.path.exists(input_parameters[required_key])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Paths handed to the container helper; empty entries are skipped.
    path_keys = ('normal_bam', 'tumor_bam', 'genome_reference',
                 'output_directory', 'inclusion_region')
    all_paths = [
        input_parameters[key] for key in path_keys if input_parameters[key]
    ]

    container_line, mounts = container.container_params(
        input_parameters['somaticsniper_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    def in_container(key):
        """Return the mount path recorded for the path stored under *key*."""
        return mounts[input_parameters[key]]['mount_path']

    mounted_genome_reference = in_container('genome_reference')
    mounted_tumor_bam = in_container('tumor_bam')
    mounted_normal_bam = in_container('normal_bam')
    mounted_outdir = in_container('output_directory')

    # The inclusion region is resolved here but is not written into the
    # bam-somaticsniper command below.
    inclusion_region = input_parameters['inclusion_region']
    if inclusion_region:
        mounted_inclusion = mounts[inclusion_region]['mount_path']

    with open(outfile, 'w') as out:
        emit = out.write

        # Interpreter line, `#$` directives and start timestamp.
        out.writelines([
            "#!/bin/bash\n\n",
            f'#$ -o {logdir}\n',
            f'#$ -e {logdir}\n',
            '#$ -S /bin/bash\n',
            '#$ -l h_vmem={}\n'.format(input_parameters['MEM']),
            'set -e\n\n',
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n',
        ])

        # The somatic caller itself.
        min_mapq = input_parameters['min_MQ']
        somatic_score = input_parameters['somatic_score']
        prior = input_parameters['prior']
        sniper_extra = input_parameters['somaticsniper_arguments']
        vcf_name = input_parameters['outfile']
        out.writelines([
            f'{container_line} \\\n',
            '/opt/somatic-sniper/build/bin/bam-somaticsniper \\\n',
            f'-q {min_mapq} -Q {somatic_score} -s {prior} -F vcf {sniper_extra} \\\n',
            f'-f {mounted_genome_reference} \\\n',
            f'{mounted_tumor_bam} \\\n',
            f'{mounted_normal_bam} \\\n',
            f'{mounted_outdir}/{vcf_name}\n',
        ])

        # Only with more than one thread: intersect the VCF with each
        # numbered BED file via a bash while-loop.
        thread_count = input_parameters['threads']
        if thread_count > 1:

            bedtool_line, bed_mounts = container.container_params(
                'lethalfang/bedtools:2.26.0', tech,
                (input_parameters['output_directory'], ))
            mounted_bed_outdir = bed_mounts[
                input_parameters['output_directory']]['mount_path']
            bed_vcf_name = input_parameters['outfile']

            emit('\n\ni=1\n')
            emit(f'while [[ $i -le {thread_count} ]]\n')
            emit('do\n')
            emit(
                f'    {bedtool_line} bash -c "bedtools intersect -a {mounted_bed_outdir}/{bed_vcf_name} -b {mounted_bed_outdir}/${{i}}/${{i}}.bed -header | uniq > {mounted_bed_outdir}/${{i}}/{bed_vcf_name}"\n'
            )
            emit('    i=$(( $i + 1 ))\n')
            emit('done\n')

        emit('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # Hand the finished script to the configured action command; the exit
    # status of that command is not inspected.
    action = input_parameters['action']
    subprocess.call(f'{action} {outfile}', shell=True)

    return outfile