Example #1
def principal_component_analysis(exp):

    out_dir = make_folder(f'{exp.scratch}PCA/')

    bigwigs = {
        sample: exp.sample_files[sample]['bw']
        for sample in exp.samples if len(exp.sample_files[sample]['bw']) != 0
    }
    multibw_command = f"multiBigwigSummary bins -b {' '.join(list(bigwigs.values()))} -l {' '.join(list(bigwigs.keys()))} -p 4 --chromosomesToSkip chrM,chrX,chrY -o {out_dir}{exp.name}_bwsummary.npz"

    correlation_command = f'plotCorrelation --corData {out_dir}{exp.name}_bwsummary.npz --corMethod pearson --whatToPlot heatmap --skipZeros --plotTitle "{exp.name} Binned Pearson Correlation Heatmap" --plotFileFormat png --outFileCorMatrix {out_dir}{exp.name}_CorMatrix.tab --colorMap Purples -o {out_dir}{exp.name}_CorHeatmap.png'

    pca_command = f'plotPCA --corData {out_dir}{exp.name}_bwsummary.npz --plotTitle "{exp.name} PCA Plot" --plotFileFormat png --outFileNameData {out_dir}{exp.name}_PCA_data.tab --log2 -o {out_dir}{exp.name}_PCA_Plot.png'

    command_list = [
        submission_prepend(), multibw_command, correlation_command, pca_command
    ]

    exp.job_id.append(
        send_job(command_list=command_list,
                 job_name=f"{exp.name}_Cor_PCA",
                 job_log_folder=exp.job_folder,
                 q='general',
                 mem=4000,
                 log_file=exp.log_file,
                 project=exp.project,
                 cores=5,
                 run_main=exp.run_main))

    exp.tasks_complete.append('PCA')

    return exp
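
All of these examples lean on a small set of pipeline helpers (make_folder, submission_prepend, send_job, job_wait, output) that are defined elsewhere in chrome_chip. As a rough sketch of the contract they appear to satisfy, inferred only from the call sites in these examples, a local stand-in that runs commands serially instead of submitting to an LSF cluster might look like this:

import os
import subprocess
from datetime import datetime


def make_folder(path):
    # Create the folder if needed and return it with a trailing slash.
    os.makedirs(path, exist_ok=True)
    return path if path.endswith('/') else path + '/'


def output(text, log_file=None, run_main=False):
    # Print and optionally append to a log file.
    print(text)
    if log_file:
        with open(log_file, 'a') as log:
            log.write(f'{text}\n')


def submission_prepend(command='', source=None):
    # Placeholder for the scheduler/environment preamble; here it simply
    # returns the command unchanged.
    return command


def send_job(command_list, job_name, job_log_folder, q, mem, log_file,
             project, cores=1, submit=True, run_main=False):
    # Local stand-in for LSF submission: run the commands serially and
    # return a fake job id.
    for command in command_list:
        if command:
            subprocess.run(command, shell=True, check=False)
    return f'{job_name}_{datetime.now():%Y%m%d%H%M%S}'


def job_wait(job_ids, log_file, run_main=False):
    # No-op locally; on a cluster this would poll the scheduler until done.
    pass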
Example #2
def preseq(exp):

    output(
        '\nRunning QC plots: library complexity extrapolation, signal correlation and PCA plots.',
        log_file=exp.log_file,
        run_main=exp.run_main)

    for sample in exp.samples:

        out_dir = make_folder(f'{exp.scratch}QC/preseq/{sample}/')

        command_list = [
            submission_prepend(
                f'preseq lc_extrap -bam -output {out_dir}{sample}_preseq.txt {exp.sample_files[sample]["bam"]}'
            )
        ]

        exp.job_id.append(
            send_job(command_list=command_list,
                     job_name=f"{sample}_preseq",
                     job_log_folder=exp.job_folder,
                     q='general',
                     mem=5000,
                     log_file=exp.log_file,
                     project=exp.project,
                     cores=1,
                     run_main=exp.run_main))

    exp.tasks_complete.append('preseq')

    return exp
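
preseq lc_extrap writes a tab-delimited extrapolation table to {sample}_preseq.txt. A minimal sketch for inspecting it, assuming the usual lc_extrap column names (TOTAL_READS, EXPECTED_DISTINCT; verify against your preseq version) and a hypothetical sample path:

import pandas as pd
import matplotlib.pyplot as plt

# Hypothetical path following the naming used above.
preseq_table = pd.read_csv('QC/preseq/sample1/sample1_preseq.txt', sep='\t')

# Plot expected distinct reads against sequenced reads; a flattening curve
# indicates the library is approaching saturation.
plt.plot(preseq_table['TOTAL_READS'], preseq_table['EXPECTED_DISTINCT'])
plt.xlabel('Total reads (extrapolated)')
plt.ylabel('Expected distinct reads')
plt.title('Library complexity extrapolation')
plt.savefig('sample1_preseq.png')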
Example #3
def fastq_screen(exp):
    '''
    Checks fastq files for contamination with alternative genomes using Bowtie2
    '''

    output(
        f'Screening for contamination during sequencing: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
        log_file=exp.log_file,
        run_main=exp.run_main)

    # Make QC folder
    exp.qc_folder = make_folder(f'{exp.scratch}QC/')

    cwd = val_folder(os.getcwd())
    os.chdir(exp.data_folder)

    samples = [
        file for file in exp.sample_df.Scratch_File1.tolist() if is_fastq(file)
    ]

    # Submit fastqc and fastq_screen jobs for each sample
    for sample in samples:
        command_list = [
            submission_prepend(
                f'fastq_screen --threads 4 --aligner bowtie2 {sample}')
        ]

        exp.job_id.append(
            send_job(command_list=command_list,
                     job_name=f'{sample.split("/")[-1]}_fastq_screen',
                     job_log_folder=exp.job_folder,
                     q='general',
                     mem=3000,
                     log_file=exp.log_file,
                     project=exp.project,
                     cores=2,
                     run_main=exp.run_main))
        time.sleep(1)

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    # move to qc folder
    screen_files = glob.glob(f'{exp.data_folder}*screen*')
    for f in screen_files:
        copy2(f, exp.qc_folder)
        os.remove(f)

    # return to the original working directory
    os.chdir(cwd)

    exp.tasks_complete.append('Fastq_screen')
    output(f'Screening complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file,
           run_main=exp.run_main)

    return exp
Example #4
def fastqc(exp):
    '''
    Performs fastq quality analysis with FastQC
    '''
    output('Assessing fastq quality. \n',
           log_file=exp.log_file,
           run_main=exp.run_main)

    # Make QC folder
    exp.qc_folder = make_folder(f'{exp.scratch}QC/')

    all_samples = (exp.sample_df.Scratch_File1.tolist()
                   + exp.sample_df.Scratch_File2.tolist())
    samples = [file for file in all_samples if is_fastq(file)]

    for sample in samples:
        command_list = [submission_prepend(f'fastqc {sample}')]

        exp.job_id.append(
            send_job(command_list=command_list,
                     job_name=f'{sample.split("/")[-1]}_fastqc',
                     job_log_folder=exp.job_folder,
                     q='general',
                     mem=5000,
                     log_file=exp.log_file,
                     project=exp.project,
                     run_main=exp.run_main))

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    # move to qc folder
    fastqc_files = glob.glob(f'{exp.data_folder}*.zip')
    fastqc_files = fastqc_files + glob.glob(f'{exp.data_folder}*.html')
    for f in fastqc_files:
        copy2(f, exp.qc_folder)
        os.remove(f)

    exp.tasks_complete.append('FastQC')
    output(f'FastQC complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file,
           run_main=exp.run_main)

    return exp
Example #5
def trim(exp):
    '''
    Trimming based on standard UM SCCC Core NextSeq 500 technical errors.
    Cutadapt can hard clip both ends, but the 3' clip may be dropped in the future.
    '''

    output(f'Beginning fastq trimming: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file,
           run_main=exp.run_main)

    for sample_dict in exp.sample_df[[
            'Scratch_File1', 'Scratch_File2', 'Sequencer', 'Sample_Name'
    ]].to_dict(orient='records'):

        quality = ('--nextseq-trim=20'
                   if sample_dict['Sequencer'].lower() == 'nextseq'
                   else '-q 20')
        seq_type = 'single' if sample_dict['Scratch_File2'] == 'none' else 'paired'

        sample = sample_dict["Sample_Name"]

        paired = f'{exp.data_folder}{sample}_trim_R2.fastq.gz'
        # Match the single-end output name written below so re-runs are skipped.
        single = f'{exp.data_folder}{sample}_trim_R1.fastq.gz'
        data_files = glob.glob(f'{exp.data_folder}*.gz')

        if (single in data_files) or (paired in data_files):
            continue
        else:
            output(f'Trimming {sample}: ',
                   log_file=exp.log_file,
                   run_main=exp.run_main)

            if seq_type == 'paired':
                cutadapt = f'cutadapt -j 4 -a AGATCGGAAGAGC -A AGATCGGAAGAGC {quality} -m 18 '
                cutadapt += f'-o {exp.data_folder}{sample}_trim_R1.fastq.gz -p {exp.data_folder}{sample}_trim_R2.fastq.gz '
                cutadapt += f'{sample_dict["Scratch_File1"]} {sample_dict["Scratch_File2"]}'
            elif seq_type == 'single':
                cutadapt = f'cutadapt -j 4 -a AGATCGGAAGAGC {quality} -m 18 '
                cutadapt += f'-o {exp.data_folder}{sample}_trim_R1.fastq.gz {sample_dict["Scratch_File1"]}'

            command_list = [submission_prepend(cutadapt)]

            exp.job_id.append(
                send_job(command_list=command_list,
                         job_name=f"{sample}_trim",
                         job_log_folder=exp.job_folder,
                         q='general',
                         mem=5000,
                         log_file=exp.log_file,
                         project=exp.project,
                         cores=2,
                         run_main=exp.run_main))

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file, exp.run_main)

    # move logs to qc folder
    output(
        '\nTrimming logs are found in stdout files from bsub.  Cutadapt does not handle log files in multi-core mode.',
        log_file=exp.log_file,
        run_main=exp.run_main)

    exp.tasks_complete.append('Trim')
    output(f'Trimming complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n',
           log_file=exp.log_file,
           run_main=exp.run_main)

    return exp
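
The paired/single branching above is easier to test in isolation if the command assembly is pulled into a pure function. A minimal sketch, with hypothetical sample names and paths (the AGATCGGAAGAGC adapter and -m 18 minimum length are taken from the commands above):

def build_cutadapt_command(sample, file1, file2, quality, data_folder):
    # file2 == 'none' signals single-end, matching the sample sheet convention above.
    if file2 == 'none':
        return (f'cutadapt -j 4 -a AGATCGGAAGAGC {quality} -m 18 '
                f'-o {data_folder}{sample}_trim_R1.fastq.gz {file1}')
    return (f'cutadapt -j 4 -a AGATCGGAAGAGC -A AGATCGGAAGAGC {quality} -m 18 '
            f'-o {data_folder}{sample}_trim_R1.fastq.gz '
            f'-p {data_folder}{sample}_trim_R2.fastq.gz {file1} {file2}')


# Hypothetical usage:
print(build_cutadapt_command('sample1', 'sample1_R1.fastq.gz', 'none',
                             '-q 20', '/scratch/data/'))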
Example #6
def spike(exp):
    '''
    Align reads that did not map to the target genome against the drosophila
    spike-in genome. If calling from jupyter, change the plotting backend as needed.
    '''
    import pandas as pd

    if len(exp.spike_samples) == 0:
        output('Not processing Spike-ins',
               log_file=exp.log_file,
               run_main=exp.run_main)
        exp.tasks_complete.append('Spike')
        return exp

    # Make spike folder
    spike_folder = make_folder(f'{exp.scratch}spike/')
    output('Processing samples with drosophila-spike in chromatin.',
           log_file=exp.log_file,
           run_main=exp.run_main)

    for sample in exp.spike_samples:
        bam = exp.sample_files[sample]['bam']

        spike_command = [
            submission_prepend(),
            f'samtools view -b -f 4 {bam} | samtools sort -n - | samtools fastq - > {spike_folder}{sample}.bwa_unaligned.fastq',
            f'bowtie2 -p 8 -x {exp.genome_indicies["spike_index"]} -U {spike_folder}{sample}.bwa_unaligned.fastq -S {spike_folder}{sample}.BDGP6.sam --very-sensitive-local -k 1 --no-unal',
            f'samtools view -b -F 4 {spike_folder}{sample}.BDGP6.sam | samtools sort - > {spike_folder}{sample}.BDGP6.bam',
            f'picard MarkDuplicates I={spike_folder}{sample}.BDGP6.bam O={spike_folder}{sample}.BDGP6.nodup.bam M={spike_folder}{sample}.BDGP6.nodups.markdups.qc ASSUME_SORTED=TRUE VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=true',
            f'samtools flagstat {spike_folder}{sample}.BDGP6.nodup.bam > {spike_folder}{sample}.unique_drosophila.flagstat.qc',
            f'rm {spike_folder}{sample}.BDGP6.sam {spike_folder}{sample}.BDGP6.nodup.bam {spike_folder}{sample}*.fastq'
        ]

        exp.job_id.append(
            send_job(command_list=spike_command,
                     job_name=f"{sample}_spike",
                     job_log_folder=exp.job_folder,
                     q='general',
                     mem=10000,
                     log_file=exp.log_file,
                     project=exp.project,
                     cores=2,
                     run_main=exp.run_main))

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file, exp.run_main)

    spike_reads = pd.DataFrame(index=['spike_reads', 'genome_reads'])

    for sample in exp.spike_samples:
        qc_file = f'{spike_folder}{sample}.unique_drosophila.flagstat.qc'
        exp.sample_files[sample]['drosophila'] = qc_file

        with open(qc_file, 'r') as fp:
            spike_number = fp.read().split(' ')[0]

        with open(exp.sample_files[sample]['nodup_flagstat']) as fp:
            target_number = fp.read().split(' ')[0]

        spike_reads[sample] = [spike_number, target_number]

    exp.spike_reads = spike_reads.T
    condition_dict = pd.Series(exp.sample_df.Condition.values,
                               index=exp.sample_df.Sample_Name).to_dict()

    exp.spike_reads['Replicate'] = [
        x.split('_')[-1] for x in exp.spike_reads.index.tolist()
    ]
    exp.spike_reads['Condition'] = [
        condition_dict[x] for x in exp.spike_reads.index.tolist()
    ]

    for name, spike_conditions in exp.spike_comparisons.items():
        out_dir = make_folder(f'{exp.scratch}spike/{name}')
        plot = spike_in_plot(exp.spike_reads, spike_conditions, name, out_dir)
        out_result(plot,
                   f'{name.replace("_", " ")} Spike-In Comparison',
                   run_main=exp.run_main)
        output(
            f'Spike-in comparison {name.replace("_", " ")} can be found here: {plot.replace(exp.scratch, "")}',
            log_file=exp.log_file,
            run_main=exp.run_main)

    output(f'Spike-in counts:\n {spike_reads.T}',
           log_file=exp.log_file,
           run_main=exp.run_main)

    output('Spike-in alignment jobs finished.',
           log_file=exp.log_file,
           run_main=exp.run_main)

    output(
        f"Spike-in processing complete: {datetime.now():%Y-%m-%d %H:%M:%S}\n",
        log_file=exp.log_file,
        run_main=exp.run_main)

    exp.tasks_complete.append('Spike')
    return exp
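
One common downstream use of exp.spike_reads is a per-sample scaling factor so that coverage tracks are comparable across samples. This normalization scheme is an assumption, not part of the pipeline above; note the pipeline stores the counts as strings read from flagstat, so cast to int first. A minimal sketch with made-up counts:

import pandas as pd

# Made-up counts in the shape produced above (after .T, rows are samples).
spike_reads = pd.DataFrame(
    {'spike_reads': [250_000, 300_000],
     'genome_reads': [20_000_000, 18_000_000]},
    index=['WT_1', 'KO_1'])

# Fraction of reads mapping to the spike-in genome per sample.
spike_fraction = spike_reads['spike_reads'] / spike_reads['genome_reads']

# Scale every sample relative to the sample with the smallest spike fraction.
scale_factors = spike_fraction.min() / spike_fraction
print(scale_factors)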
Example #7
def UMI(exp):

    IPs = exp.IPs

    for experiment in IPs.Condition.unique().tolist():
        UMI = 'yes' in IPs[IPs.Condition == experiment]['UMI'].tolist()

        if not UMI:
            return exp

        else:
            out_dir = make_folder(f'{exp.scratch}UMI/')
            output('Deduplicating bam files using UMIs with UMI-tools.',
                   log_file=exp.log_file,
                   run_main=exp.run_main)

            for index in IPs[IPs.Condition == experiment].index.tolist():
                sample = IPs.loc[index, 'Sample_Name']
                input_sample = IPs.loc[index, 'Background_Name']

                bam = exp.sample_files[sample]['bam']
                input_bam = exp.sample_files[input_sample]['bam']
                nodup_bam = f'{out_dir}{sample}.UMI.dedup.bam'
                nodup_input = f'{out_dir}{input_sample}.UMI.dedup.bam'

                umi_string = ('umi_tools dedup --umi-separator=":" '
                              '--output-stats={out_dir}{sample}.deduplicated.qc '
                              '-I {inbam} -S {outbam} -L {out_dir}{sample}.UMI.log')

                # seq_type must be a string here; the original boolean could
                # never equal 'paired', so --paired was never appended.
                seq_type = ('single' if 'none' in
                            IPs[IPs.Condition == experiment]['Scratch_File2'].tolist()
                            else 'paired')
                if seq_type == 'paired':
                    umi_string += ' --paired'

                command_list = [
                    submission_prepend(), f'samtools index {bam}',
                    f'samtools index {input_bam}',
                    umi_string.format(inbam=bam,
                                      outbam=nodup_bam,
                                      sample=sample,
                                      out_dir=out_dir),
                    umi_string.format(inbam=input_bam,
                                      outbam=nodup_input,
                                      sample=input_sample,
                                      out_dir=out_dir)
                ]

                exp.job_id.append(
                    send_job(command_list=command_list,
                             job_name=f"{sample}_UMI_dedup",
                             job_log_folder=exp.job_folder,
                             q='bigmem',
                             mem=40000,
                             log_file=exp.log_file,
                             project=exp.project,
                             cores=1,
                             run_main=exp.run_main))

                exp.sample_files[sample]['nodup_bam'] = nodup_bam
                exp.sample_files[input_sample]['nodup_bam'] = nodup_input

    job_wait(exp.job_id, exp.log_file)

    output(
        'Deduplication complete. Submitting deduplicated files for the remainder of processing.',
        log_file=exp.log_file,
        run_main=exp.run_main)
    exp.tasks_complete.append('UMI')

    return encode3(exp)
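
umi_tools dedup with --umi-separator=":" assumes the UMI was moved into the read name (the token after the last colon) during extraction. A quick sanity check with pysam (an extra dependency, not used by the pipeline above; the BAM path is hypothetical):

import pysam

# Hypothetical path following the naming used above.
with pysam.AlignmentFile('UMI/sample1.UMI.dedup.bam', 'rb') as bam:
    for read in bam.head(5):
        # With --umi-separator=":", the UMI is the token after the last colon.
        print(read.query_name, '-> UMI:', read.query_name.split(':')[-1])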
Example #8
def encode3(exp):

    if 'Stage' not in exp.tasks_complete:
        output('Files not staged.\n', log_file=exp.log_file)
        exp = stage(exp)

    output('Running alignment and peak calling using ENCODE3 standards.',
           log_file=exp.log_file,
           run_main=exp.run_main)
    output('ENCODE3 cromwell pipeline.',
           log_file=exp.log_file,
           run_main=exp.run_main)

    out_dir = make_folder(f'{exp.scratch}ENCODE3/')

    IPs = exp.IPs

    end_types = {'q.gz': 'fastq', '.bam': 'bam'}

    for experiment in IPs.Condition.unique().tolist():

        exp_dir = make_folder(f'{out_dir}{experiment}/')

        IP_sample_indicies = list(
            enumerate(IPs[IPs.Condition == experiment].index.tolist(), start=1))

        if len(IP_sample_indicies) > 6:
            raise IOError('Pipeline cannot handle more than 6 replicates.')

        seq_type = 'none' not in IPs[IPs.Condition == experiment]['File2'].tolist()
        final_stage = ('align' if 'align' in
                       IPs[IPs.Condition == experiment]['Final Stage'].tolist()
                       else 'all')

        UMI_list = [
            x.lower()
            for x in IPs[IPs.Condition == experiment]['UMI'].unique().tolist()
        ]
        if len(set(UMI_list)) > 1:
            raise IOError(
                'All samples must be UMI processed or not for each condition.')
        UMI = UMI_list[0] == 'yes'

        try:
            file_type = end_types[exp.sample_df[exp.sample_df.Condition ==
                                                experiment]
                                  ['Scratch_File1'].tolist()[0][-4:]]
        except KeyError:
            output(
                f"{exp.sample_df[exp.sample_df.Condition == experiment]['Scratch_File1'].tolist()[0]} not a valid file type for this pipeline.",
                log_file=exp.log_file,
                run_main=exp.run_main)
            # Without re-raising, file_type would be undefined below.
            raise

        genome = IPs[IPs.Condition == experiment]['Genome'].unique().tolist()
        if len(genome) > 1:
            raise IOError(
                'Cannot align to more than one genome per condition.')

        chip_type = IPs[IPs.Condition ==
                        experiment]['ChIP Type'].unique().tolist()
        if len(chip_type) > 1:
            raise IOError(
                'Cannot have more than one chip type (histone or TF) for a condition.'
            )
        chip_type = 'histone' if chip_type[0].lower() == 'histone' else 'tf'

        json_file = {
            'chip.pipeline_type': chip_type,
            'chip.paired_end': seq_type,
            'chip.genome_tsv': exp.genome_indicies['encode_tsv'][genome[0]],
            'chip.bwa.mem_mb': 30000,
            'chip.macs2_mem_mb': 30000,
            'chip.peak_caller': 'macs2',
            "chip.true_rep_only": False,
            "chip.dup_marker": "picard",
            "chip.mapq_thresh": 30,
            "chip.regex_filter_reads": "chrM",
            "chip.subsample_reads": 0,
            "chip.ctl_subsample_reads": 0,
            "chip.xcor_subsample_reads": 15000000,
            "chip.keep_irregular_chr_in_bfilt_peak": False,
            "chip.always_use_pooled_ctl": False,
            "chip.ctl_depth_ratio": 1.2,
            "chip.macs2_cap_num_peak": 500000,
            "chip.pval_thresh": 0.01,
            "chip.idr_thresh": 0.05,
            "chip.bwa_cpu": 4,
            "chip.bwa_mem_mb": 20000,
            "chip.bwa_time_hr": 48,
            "chip.filter_cpu": 2,
            "chip.filter_mem_mb": 20000,
            "chip.filter_time_hr": 24,
            "chip.bam2ta_cpu": 2,
            "chip.bam2ta_mem_mb": 10000,
            "chip.bam2ta_time_hr": 6,
            "chip.fingerprint_cpu": 2,
            "chip.fingerprint_mem_mb": 12000,
            "chip.fingerprint_time_hr": 6,
            "chip.xcor_cpu": 2,
            "chip.xcor_mem_mb": 16000,
            "chip.xcor_time_hr": 24,
            "chip.macs2_time_hr": 24,
            "chip.spr_mem_mb": 16000
        }
        bams = []
        ctl_bams = []

        for rep, index in IP_sample_indicies:
            sample = exp.sample_df.loc[index, 'Sample_Name']
            input_sample = IPs.loc[index, 'Background_Name']

            if file_type == 'fastq':
                json_file[f'chip.fastqs_rep{rep}_R1'] = [
                    f'{exp.data_folder}{sample}_trim_R1.fastq.gz'
                ]
                json_file[f'chip.ctl_fastqs_rep{rep}_R1'] = [
                    f'{exp.data_folder}{input_sample}_trim_R1.fastq.gz'
                ]
                if seq_type:
                    json_file[f'chip.fastqs_rep{rep}_R2'] = [
                        f'{exp.data_folder}{sample}_trim_R2.fastq.gz'
                    ]
                    json_file[f'chip.ctl_fastqs_rep{rep}_R2'] = [
                        f'{exp.data_folder}{input_sample}_trim_R2.fastq.gz'
                    ]
            else:
                bams.append(f'{exp.data_folder}{sample}.bam')
                ctl_bams.append(f'{exp.data_folder}{input_sample}.bam')

        if file_type == 'bam':
            json_file['chip.bams'] = bams
            json_file['chip.ctl_bams'] = ctl_bams

        # Preserve the UMI-based setting unless the final stage forces align-only.
        json_file['chip.align_only'] = UMI and file_type == 'fastq'
        if final_stage == 'align':
            json_file['chip.align_only'] = True

        json_file['chip.no_dup_removal'] = UMI
        json_file['chip.title'] = (f'{experiment}_postUMI_dedup'
                                   if UMI and file_type == 'bam' else experiment)
        json_file['chip.description'] = (
            f"Cromwell ENCODE3 {experiment}: "
            f"{'paired-end' if seq_type else 'single-end'} {chip_type}.")

        encode_file = f'{exp_dir}{experiment}_ENCODE3.json'
        with open(encode_file, 'w') as file:
            json.dump(json_file, file, indent=4, sort_keys=True)

        pythonpath = shutil.which('python')
        miniconda = [x for x in pythonpath.split('/') if 'miniconda' in x]
        # Use the matched path component, not the list itself, in the regex.
        cromwell_jar = re.sub(
            r'{}/.*'.format(miniconda[0]),
            '{}/envs/chrome_chip/share/cromwell/cromwell.jar'.format(
                miniconda[0]), pythonpath) if miniconda else ''
        jar = cromwell_jar if os.path.isfile(
            cromwell_jar
        ) else '~/miniconda3/envs/chrome_chip/share/cromwell/cromwell.jar'

        command_list = [
            submission_prepend(source='encode-chip-seq-pipeline'),
            f'cd {exp_dir}',
            f'java -jar -Dconfig.file={exp.encode3_folder}backends/backend.conf -Dbackend.default=Local {jar} run {exp.encode3_folder}chip.wdl -i {encode_file}'
        ]

        sent_job = send_job(command_list=command_list,
                            job_name=f"{experiment}_ENCODE3",
                            job_log_folder=exp.job_folder,
                            q='bigmem',
                            mem=35000,
                            log_file=exp.log_file,
                            project=exp.project,
                            cores=1,
                            run_main=exp.run_main)

        exp.job_id.append(sent_job)
        job_pending(sent_job, exp.log_file)

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    exp = encode_results(exp)

    exp.tasks_complete.append('ENCODE3')

    return exp
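
The cromwell jar lookup above rewrites the python executable's path with a regex. An equivalent, arguably clearer sketch using pathlib, under the same assumed miniconda/chrome_chip layout:

import shutil
from pathlib import Path


def find_cromwell_jar():
    # Walk up from the python executable until a miniconda root is found,
    # then look for the jar inside the chrome_chip env.
    python_path = Path(shutil.which('python') or '')
    for parent in python_path.parents:
        if 'miniconda' in parent.name:
            candidate = parent / 'envs/chrome_chip/share/cromwell/cromwell.jar'
            if candidate.is_file():
                return str(candidate)
    # Conventional fallback, as in the function above.
    return '~/miniconda3/envs/chrome_chip/share/cromwell/cromwell.jar'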
Example #9
def encode3(exp):

    if 'Stage' not in exp.tasks_complete:
        output('Files not staged.\n', log_file=exp.log_file)
        exp = stage(exp)

    output('Running alignment and peak calling using ENCODE3 standards.',
           log_file=exp.log_file,
           run_main=exp.run_main)
    output('ENCODE3 cromwell pipeline.',
           log_file=exp.log_file,
           run_main=exp.run_main)

    out_dir = make_folder(f'{exp.scratch}ENCODE3/')

    IPs = exp.IPs

    end_types = {'q.gz': 'fastq', '.bam': 'bam'}

    for experiment in IPs.Condition.unique().tolist():

        exp_dir = make_folder(f'{out_dir}{experiment}/')

        IP_sample_indicies = list(
            enumerate(IPs[IPs.Condition == experiment].index.tolist(), start=1))

        if len(IP_sample_indicies) > 6:
            raise IOError('Pipeline cannot handle more than 6 replicates.')

        seq_type = 'none' not in IPs[IPs.Condition == experiment]['File2'].tolist()

        aligner = IPs[IPs.Condition == experiment]['Aligner'].unique().tolist()
        if len(aligner) != 1:
            raise IOError(
                'All replicates must be aligned with the same aligner, which must be specified.'
            )
        aligner = aligner[0]

        peak_caller = IPs[IPs.Condition ==
                          experiment]['Peak Caller'].unique().tolist()
        if len(peak_caller) != 1:
            raise IOError(
                'All replicates must use the same peak calling strategy (or none).'
            )
        peak_caller = peak_caller[0]

        UMI_list = [
            x.lower()
            for x in IPs[IPs.Condition == experiment]['UMI'].unique().tolist()
        ]
        if len(set(UMI_list)) > 1:
            raise IOError(
                'All samples must be UMI processed or not for each condition.')
        UMI = UMI_list[0] == 'yes'

        try:
            file_type = end_types[exp.sample_df[exp.sample_df.Condition ==
                                                experiment]
                                  ['Scratch_File1'].tolist()[0][-4:]]
        except KeyError:
            output(
                f"{exp.sample_df[exp.sample_df.Condition == experiment]['Scratch_File1'].tolist()[0]} not a valid file type for this pipeline.",
                log_file=exp.log_file,
                run_main=exp.run_main)
            # Without re-raising, file_type would be undefined below.
            raise

        file_type = 'bam' if (UMI is True) & (
            'UMI' in exp.tasks_complete) else file_type

        genome = IPs[IPs.Condition == experiment]['Genome'].unique().tolist()
        if len(genome) > 1:
            raise IOError(
                'Cannot align to more than one genome per condition.')

        chip_type = IPs[IPs.Condition ==
                        experiment]['ChIP Type'].unique().tolist()
        if len(chip_type) > 1:
            raise IOError(
                'Cannot have more than one chip type (histone or TF) for a condition.'
            )
        chip_type = 'histone' if chip_type[0].lower() == 'histone' else 'tf'

        json_file = {
            'chip.pipeline_type': chip_type,
            'chip.paired_end': seq_type,
            'chip.genome_tsv': exp.genome_indicies['encode_tsv'][genome[0]],
            'chip.align_mem_mb': 30000,
            "chip.true_rep_only": False,
            "chip.dup_marker": "picard",
            "chip.mapq_thresh": 30,
            "chip.filter_chrs": ["chrM"],
            "chip.subsample_reads": 0,
            "chip.ctl_subsample_reads": 0,
            "chip.xcor_subsample_reads": 15000000,
            "chip.always_use_pooled_ctl": False,
            "chip.ctl_depth_ratio": 1.2,
            "chip.cap_num_peak_macs2": 500000,
            "chip.pval_thresh": 0.01,
            "chip.idr_thresh": 0.05,
            "chip.align_cpu": 4,
            "chip.align_time_hr": 48,
            "chip.filter_cpu": 2,
            "chip.filter_mem_mb": 20000,
            "chip.filter_time_hr": 24,
            "chip.bam2ta_cpu": 2,
            "chip.bam2ta_mem_mb": 10000,
            "chip.bam2ta_time_hr": 6,
            "chip.jsd_cpu": 2,
            "chip.jsd_mem_mb": 12000,
            "chip.jsd_time_hr": 6,
            "chip.xcor_cpu": 2,
            "chip.xcor_mem_mb": 16000,
            "chip.xcor_time_hr": 24,
            "chip.align_time_hr": 24,
            "chip.spr_mem_mb": 16000,
            "chip.enable_count_signal_track": True,
        }

        if peak_caller == 'macs2':
            json_file['chip.peak_caller'] = 'macs2'

        if aligner != 'none':
            json_file['chip.aligner'] = aligner

        bams = []
        ctl_bams = []

        for rep, index in IP_sample_indicies:
            sample = exp.sample_df.loc[index, 'Sample_Name']
            input_sample = IPs.loc[index, 'Background_Name']

            if file_type == 'fastq':
                json_file[f'chip.fastqs_rep{rep}_R1'] = [
                    f'{exp.data_folder}{sample}_trim_R1.fastq.gz'
                ]
                json_file[f'chip.ctl_fastqs_rep{rep}_R1'] = [
                    f'{exp.data_folder}{input_sample}_trim_R1.fastq.gz'
                ]
                if seq_type:
                    json_file[f'chip.fastqs_rep{rep}_R2'] = [
                        f'{exp.data_folder}{sample}_trim_R2.fastq.gz'
                    ]
                    json_file[f'chip.ctl_fastqs_rep{rep}_R2'] = [
                        f'{exp.data_folder}{input_sample}_trim_R2.fastq.gz'
                    ]
            else:
                bams.append(f'{exp.data_folder}{sample}.bam')
                ctl_bams.append(f'{exp.data_folder}{input_sample}.bam')

        if file_type == 'bam':
            json_file['chip.bams'] = bams
            json_file['chip.ctl_bams'] = ctl_bams

        json_file['chip.align_only'] = UMI and file_type == 'fastq'
        if peak_caller == 'none':
            json_file['chip.align_only'] = True

        json_file['chip.no_dup_removal'] = UMI
        json_file['chip.title'] = (f'{experiment}_postUMI_dedup'
                                   if UMI and file_type == 'bam' else experiment)
        json_file['chip.description'] = (
            f"Cromwell ENCODE3 {experiment}: "
            f"{'paired-end' if seq_type else 'single-end'} {chip_type}.")

        encode_file = f'{exp_dir}{experiment}_ENCODE3.json'
        with open(encode_file, 'w') as file:
            json.dump(json_file, file, indent=4, sort_keys=True)

        pythonpath = shutil.which('python')
        miniconda = [x for x in pythonpath.split('/') if 'miniconda' in x]
        # Use the matched path component, not the list itself, in the regex.
        cromwell_jar = re.sub(
            r'{}/.*'.format(miniconda[0]),
            '{}/envs/chrome_chip/share/cromwell/cromwell.jar'.format(
                miniconda[0]), pythonpath) if miniconda else ''
        jar = cromwell_jar if os.path.isfile(
            cromwell_jar
        ) else '~/miniconda3/envs/chrome_chip/share/cromwell/cromwell.jar'

        command_list = [
            submission_prepend(source='encode-chip-seq-pipeline'),
            f'cd {exp_dir}',
            f'java -jar -Dconfig.file={exp.encode3_folder}backends/backend.conf -Dbackend.default=Local {jar} run {exp.encode3_folder}chip.wdl -i {encode_file}'
        ]

        sent_job = send_job(command_list=command_list,
                            job_name=f"{experiment}_ENCODE3",
                            job_log_folder=exp.job_folder,
                            q='bigmem',
                            mem=35000,
                            log_file=exp.log_file,
                            project=exp.project,
                            cores=1,
                            run_main=exp.run_main)

        exp.job_id.append(sent_job)
        job_pending(sent_job, exp.log_file)

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    # Check fraglength and resubmit with set 200 fraglen for macs2 if xcor error
    for experiment in exp.IPs.Condition.unique().tolist():
        rep_number = len(exp.IPs[exp.IPs.Condition == experiment])

        frag_list = []

        for rep in range(rep_number):
            file = glob_check(
                f'{exp.scratch}ENCODE3/{experiment}/cromwell-executions/chip/*/call-xcor/shard-{rep}/execution/*fraglen.txt'
            )
            with open(file, 'r') as f:
                frag_list.append(f.read().split()[0])

        if '-' in [x[0] for x in frag_list]:
            output(
                f'Xcor failed for {experiment}. Resubmitting with fragment length set to 200 for the failed sample(s).',
                log_file=exp.log_file,
                run_main=exp.run_main)

            frag_list = [x if x[0] != '-' else '200' for x in frag_list]
            exp_dir = f'{exp.scratch}ENCODE3/{experiment}/'
            encode_file = f'{exp_dir}{experiment}_ENCODE3.json'

            with open(encode_file, 'r') as file:
                json_file = json.load(file)

            json_file["chip.fraglen"] = frag_list

            resubmit_file = f'{exp_dir}{experiment}_ENCODE3_setfraglength.json'
            with open(resubmit_file, 'w') as file:
                json.dump(json_file, file, indent=4, sort_keys=True)

            pythonpath = shutil.which('python')
            miniconda = [x for x in pythonpath.split('/') if 'miniconda' in x]
            # Use the matched path component, not the list itself, in the regex.
            cromwell_jar = re.sub(
                r'{}/.*'.format(miniconda[0]),
                '{}/envs/chrome_chip/share/cromwell/cromwell.jar'.format(
                    miniconda[0]), pythonpath) if miniconda else ''
            jar = cromwell_jar if os.path.isfile(
                cromwell_jar
            ) else '~/miniconda3/envs/chrome_chip/share/cromwell/cromwell.jar'

            command_list = [
                submission_prepend(source='encode-chip-seq-pipeline'),
                f'cd {exp_dir}',
                f'java -jar -Dconfig.file={exp.encode3_folder}backends/backend.conf -Dbackend.default=Local {jar} run {exp.encode3_folder}chip.wdl -i {resubmit_file}'
            ]

            sent_job = send_job(command_list=command_list,
                                job_name=f"{experiment}_ENCODE3_resubmission",
                                job_log_folder=exp.job_folder,
                                q='bigmem',
                                mem=35000,
                                log_file=exp.log_file,
                                project=exp.project,
                                cores=1,
                                run_main=exp.run_main)

            exp.job_id.append(sent_job)
            job_pending(sent_job, exp.log_file)

    # Wait for jobs to finish
    job_wait(exp.job_id, exp.log_file)

    exp = encode_results(exp)

    exp.tasks_complete.append('ENCODE3')

    return exp
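
The fragment-length check above is a natural candidate for a small helper. A sketch using plain glob in place of glob_check (whose exact behavior is assumed here to be "return the single matching path"):

import glob


def xcor_fraglens(scratch, experiment, rep_number):
    # Collect each replicate's estimated fragment length from its xcor shard,
    # substituting 200 where the estimate is negative (xcor failure).
    fraglens = []
    for rep in range(rep_number):
        pattern = (f'{scratch}ENCODE3/{experiment}/cromwell-executions/chip/'
                   f'*/call-xcor/shard-{rep}/execution/*fraglen.txt')
        matches = glob.glob(pattern)
        if not matches:
            raise FileNotFoundError(pattern)
        with open(matches[0]) as f:
            value = f.read().split()[0]
        fraglens.append('200' if value.startswith('-') else value)
    return fraglens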
Example #10
        cmd = f'python chrome_chip -f {args.experimental_file}'
        if args.template_notebook:
            cmd += f' -t {args.template_notebook}'
            if args.out_notebook:
                cmd += f' -o {args.out_notebook}'

        submission_header = [submission_prepend(cmd)]

        job_name = args.experimental_file.split('.')[0]

        send_job(command_list=submission_header,
                 job_name=job_name,
                 job_log_folder=f'{os.getcwd()}/',
                 q='general',
                 mem=3000,
                 log_file=f'bsub_{job_name}.log',
                 project=args.project,
                 cores=1,
                 submit=True,
                 run_main=False
                 )

else:
    if args.template_notebook:
        if not os.path.isfile(args.template_notebook):
            raise IOError('Location of template notebook not found. Use the -t option.')
        else:
            import papermill as pm

            out_notebook = args.out_notebook if args.out_notebook else args.experimental_file.replace('yml', 'ipynb')
            pm.execute_notebook(args.template_notebook, out_notebook, parameters=dict(yaml_file=args.experimental_file), log_output=True, report_mode=True)
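
papermill injects the parameters dict into a copy of the template notebook by overriding a cell tagged "parameters", so the template needs such a cell. A minimal example of what that cell might contain (the default value is hypothetical):

# Cell tagged "parameters" in the template notebook:
yaml_file = 'experiment.yml'  # placeholder; overwritten by papermill with args.experimental_file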