예제 #1
0
def process(sample, pipe_manager, args):
    """
    Run the ChIP-seq processing steps for a single sample.

    Starting from the unmapped BAM file(s) listed in ``sample.data_path``,
    the steps are: merge technical replicates, FastQC, BAM to FASTQ
    conversion, adapter trimming (trimmomatic or skewer, depending on
    ``pipe_manager.config.parameters.trimmer``), Bowtie2 mapping, read
    filtering, BAM indexing, bigWig track creation, insert size plot
    (paired-end samples only), genome-wide coverage and NSC/RSC
    cross-correlation metrics.

    :param sample: Sample object providing the input and output locations
        read below (``paths``, ``data_path``, ``mapped``, ``filtered``, ...).
    :param pipe_manager: Pipeline manager used to run the commands
        (``run``), register files via ``clean_add`` and report
        statistics and figures.
    :param args: Parsed command-line arguments; only ``args.cores`` is read.
    :return: The ``pipe_manager`` that was passed in.
    :raises OSError: If one of the sample's directories cannot be created.
    """
    print("Start processing ChIP-seq sample '{}'.".format(sample.name))

    # Make sure every directory registered in sample.paths exists.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # The lookup/existence check rejected this entry; skip it.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError as e:
                # An except clause must name an exception *class*; catch the
                # error and re-raise it with the offending path, keeping the
                # original errno for callers that inspect it.
                raise OSError(
                    e.errno,
                    "Cannot create '%s' path: %s" % (path, sample.paths[path]))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # (data_path holds space-separated paths in that case)
    replicate_bams = sample.data_path.split(" ")
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=replicate_bams,  # this is a list of sample paths
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    fastqc_zip = os.path.join(sample.paths.sample_root,
                              sample.sample_name + "_fastqc.zip")
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    else:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # Trimming statistics are only parsed for the skewer log
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genomes,
                            sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(inputBam=sample.mapped,
                         outputBam=sample.filtered,
                         metricsFile=sample.dups_metrics,
                         paired=sample.paired,
                         cpus=args.cores,
                         Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # The bigWig output directory may not be part of sample.paths
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.config.resources.chromosome_sizes,
                            sample.genome),
        genome=sample.genome,
        # by default make extended tracks
        tagmented=pipe_manager.config.parameters.tagmented,
        normalize=pipe_manager.config.parameters.normalize_tracks,
        norm_factor=pipe_manager.config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution (skipped when the plot already exists)
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows,
                              sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(inputBam=sample.filtered,
                       output=sample.qc,
                       plot=sample.qc_plot,
                       cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    print("Finished processing sample '{}'.".format(sample.name))
    return pipe_manager
예제 #2
0
def process(sample, pipe_manager, args):
    """
    Run the RNA-seq processing steps for a single sample.

    Starting from the unmapped BAM file(s) listed in ``sample.data_path``,
    the steps are: merge technical replicates, FastQC, BAM to FASTQ
    conversion, adapter trimming (trimmomatic or skewer, depending on
    ``pipe_manager.config.parameters.trimmer``) and expression
    quantification with Kallisto. Finally the collected statistics are
    printed and ``pipe_manager.stop_pipeline()`` is called.

    :param sample: Sample object providing the input and output locations
        read below (``paths``, ``data_path``, ``trimmed``, ...).
    :param pipe_manager: Pipeline manager used to run the commands
        (``run``), register files via ``clean_add`` and report statistics.
    :param args: Parsed command-line arguments; only ``args.cores`` is read.
    :return: None.
    :raises OSError: If one of the sample's directories cannot be created.
    """
    print("Start processing RNA-seq sample %s." % sample.sample_name)

    # Make sure every directory registered in sample.paths exists.
    # list() is required: on Python 3 dict.keys() is a view and cannot be
    # concatenated to a list.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # The lookup/existence check rejected this entry; skip it.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError as e:
                # An except clause must name an exception *class*; catch the
                # error and re-raise it with the offending path, keeping the
                # original errno for callers that inspect it.
                raise OSError(
                    e.errno,
                    "Cannot create '%s' path: %s" % (path, sample.paths[path]))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # (data_path holds space-separated paths in that case)
    replicate_bams = sample.data_path.split(" ")
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            # this is a list of sample paths
            input_bams=replicate_bams,
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    fastqc_zip = os.path.join(sample.paths.sample_root,
                              sample.sample_name + "_fastqc.zip")
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    else:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        # Trimming statistics are only parsed for the skewer log
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Quantify gene expression
    pipe_manager.timestamp("Quantifying expression with Kallisto")
    cmd = kallisto(
        fastq_files=[sample.trimmed1, sample.trimmed2]
        if sample.paired else [sample.trimmed],
        kallisto_index=getattr(pipe_manager.config.resources.kallisto_index,
                               sample.genome),
        read_type=sample.read_type,
        output_dir=sample.kallisto_output_dir,
        threads=args.cores,
        bootstrap_number=pipe_manager.config.parameters.bootstrap_number,
        fragment_size=pipe_manager.config.parameters.fragment_size,
        fragment_std=pipe_manager.config.parameters.fragment_std)
    pipe_manager.run(cmd, sample.kallisto_quantification, shell=True)
    report_dict(pipe_manager,
                parse_kallisto_stats(sample.kallisto_quantification))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
예제 #3
0
def process(sample, pipe_manager, args):
    """
    Run the ChIP-seq processing steps for a single sample.

    Starting from the unmapped BAM file(s) listed in ``sample.data_path``,
    the steps are: merge technical replicates, FastQC, BAM to FASTQ
    conversion, adapter trimming (trimmomatic or skewer, depending on
    ``pipe_manager.config.parameters.trimmer``), Bowtie2 mapping, read
    filtering, BAM indexing, reporting of the overall read efficiency,
    bigWig track creation, insert size plot (paired-end samples only),
    genome-wide coverage and NSC/RSC cross-correlation metrics.

    :param sample: Sample object providing the input and output locations
        read below (``paths``, ``data_path``, ``mapped``, ``filtered``, ...).
    :param pipe_manager: Pipeline manager used to run the commands
        (``run``), register files via ``clean_add`` and report
        statistics and figures.
    :param args: Parsed command-line arguments; only ``args.cores`` is read.
    :return: The ``pipe_manager`` that was passed in.
    :raises OSError: If one of the sample's directories cannot be created.
    """
    print("Start processing ChIP-seq sample '{}'.".format(sample.name))

    # Make sure every directory registered in sample.paths exists.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # The lookup/existence check rejected this entry; skip it.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError as e:
                # An except clause must name an exception *class*; catch the
                # error and re-raise it with the offending path, keeping the
                # original errno for callers that inspect it.
                raise OSError(
                    e.errno,
                    "Cannot create '%s' path: %s" % (path, sample.paths[path]))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # (data_path holds space-separated paths in that case)
    replicate_bams = sample.data_path.split(" ")
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=replicate_bams,  # this is a list of sample paths
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    fastqc_zip = os.path.join(sample.paths.sample_root, sample.sample_name + "_fastqc.zip")
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    else:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog
        )
        pipe_manager.run(cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters
        )
        pipe_manager.run(cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # Trimming statistics are only parsed for the skewer log
        report_dict(pipe_manager, parse_trim_stats(sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genome_index, sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores
    )
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(pipe_manager, parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality
    )
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # The bigWig output directory may not be part of sample.paths; create it
    # once here (it is needed by the track step below).
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Report total efficiency: filtered reads (paired ends counted as half
    # each) as a percentage of the reads counted by FastQC.
    usable = (
        float(pipe_manager.stats_dict["filtered_single_ends"]) +
        (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    total = float(pipe_manager.stats_dict['fastqc_total_pass_filter_reads'])
    report_dict(
        pipe_manager,
        {"total_efficiency": (usable / total) * 100})

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome=sample.genome,
        normalization_method="RPGC")
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution (skipped when the plot already exists)
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata
        )
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows, sample.genome),
        output=sample.coverage
    )
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores
    )
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    print("Finished processing sample '{}'.".format(sample.name))
    return pipe_manager
예제 #4
0
def process(sample, pipe_manager, args):
    """
    Run the ATAC-seq processing steps for a single sample.

    Starting from the unmapped BAM file(s) listed in ``sample.data_path``,
    the steps are: merge technical replicates, FastQC, BAM to FASTQ
    conversion, adapter trimming (trimmomatic or skewer, depending on
    ``pipe_manager.config.parameters.trimmer``), Bowtie2 mapping,
    mitochondrial read statistics, read filtering, read shifting (only when
    ``sample.tagmented`` is true), BAM indexing, bigWig track creation,
    insert size plot (paired-end samples only), genome-wide coverage,
    NSC/RSC cross-correlation metrics, MACS2 peak calling, peak filtering
    against blacklisted regions (when configured for the genome) and FRiP
    calculation on the sample's peaks and, when configured, on an oracle
    peak list. Finally the collected statistics are printed and
    ``pipe_manager.stop_pipeline()`` is called.

    :param sample: Sample object providing the input and output locations
        read below (``paths``, ``data_path``, ``mapped``, ``filtered``, ...).
    :param pipe_manager: Pipeline manager used to run the commands
        (``run``), register files via ``clean_add`` and report
        statistics and figures.
    :param args: Parsed command-line arguments; only ``args.cores`` is read.
    :return: None.
    :raises OSError: If one of the sample's directories cannot be created.
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    # Make sure every directory registered in sample.paths exists.
    # list() is required: on Python 3 dict.keys() is a view and cannot be
    # concatenated to a list.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # The lookup/existence check rejected this entry; skip it.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError as e:
                # An except clause must name an exception *class*; catch the
                # error and re-raise it with the offending path, keeping the
                # original errno for callers that inspect it.
                raise OSError(
                    e.errno,
                    "Cannot create '%s' path: %s" % (path, sample.paths[path]))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # (data_path holds space-separated paths in that case)
    replicate_bams = sample.data_path.split(" ")
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=replicate_bams,  # this is a list of sample paths
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    fastqc_zip = os.path.join(sample.paths.sample_root,
                              sample.sample_name + "_fastqc.zip")
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    else:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired
            if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired
            if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)

    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped,
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd,
                         sample.trimmed1 if sample.paired else sample.trimmed,
                         shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        # Trimming statistics are only parsed for the skewer log
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog,
                             prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genomes,
                            sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Get mitochondrial reads
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(bam_file=sample.mapped,
                                     output=sample.mitochondrial_stats,
                                     cpus=args.cores)
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    # The mitochondrial stats file is read with the duplicate-stats parser
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(inputBam=sample.mapped,
                         outputBam=sample.filtered,
                         metricsFile=sample.dups_metrics,
                         paired=sample.paired,
                         cpus=args.cores,
                         Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Shift reads
    if sample.tagmented:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shiftReads(inputBam=sample.filtered,
                            genome=sample.genome,
                            outputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)
    if sample.tagmented:
        cmd = tk.indexBam(inputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    # The bigWig output directory may not be part of sample.paths
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.config.resources.chromosome_sizes,
                            sample.genome),
        genome=sample.genome,
        # by default make extended tracks
        tagmented=pipe_manager.config.parameters.tagmented,
        normalize=pipe_manager.config.parameters.normalize_tracks,
        norm_factor=pipe_manager.config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution (skipped when the plot already exists)
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows,
                              sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(inputBam=sample.filtered,
                       output=sample.qc,
                       plot=sample.qc_plot,
                       cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)

    cmd = tk.macs2CallPeaksATACSeq(treatmentBam=sample.filtered,
                                   outputDir=sample.paths.peaks,
                                   sampleName=sample.sample_name,
                                   genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Filter peaks (only when a blacklist is configured for this genome)
    if hasattr(pipe_manager.config.resources.blacklisted_regions,
               sample.genome):
        pipe_manager.timestamp("Filtering peaks from blacklisted regions")
        cmd = filter_peaks(
            peaks=sample.peaks,
            exclude=getattr(pipe_manager.config.resources.blacklisted_regions,
                            sample.genome),
            filtered_peaks=sample.filtered_peaks)
        pipe_manager.run(cmd, sample.filtered_peaks, shell=True)
        report_dict(
            pipe_manager,
            parse_peak_number(sample.filtered_peaks, prefix="filtered_"))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    # on the sample's peaks
    cmd = tk.calculate_FRiP(inputBam=sample.filtered,
                            inputBed=sample.peaks,
                            output=sample.frip,
                            cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    # Denominator: filtered reads, with paired ends counted as half each
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_FRiP(sample.frip, total))

    # on an oracle peak list (only when one is configured for this genome)
    if hasattr(pipe_manager.config.resources.oracle_peak_regions,
               sample.genome):
        cmd = tk.calculate_FRiP(
            inputBam=sample.filtered,
            inputBed=getattr(pipe_manager.config.resources.oracle_peak_regions,
                             sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(pipe_manager,
                    parse_FRiP(sample.oracle_frip, total, prefix="oracle_"))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
예제 #5
0
def process(sample, pipe_manager, args):
    """
    Process one Hi-C sample from unmapped BAM file(s) to contact maps, peaks
    and loops.

    Steps, in order: create the sample's directories, merge technical
    replicates, run FastQC, convert BAM to FASTQ, run HiC-Pro (as a whole or
    step by step), report HiC-Pro statistics, convert the valid pairs to
    Juicebox ".hic", pairix-indexed BEDPE and cooler files, balance the cooler
    matrices, call peaks with MACS2 and call loops with cLoops and hichipper.

    :param sample: Sample object. Its names, genome, ``paired`` flag and file
        paths are read; ``data_path``, ``paths.hicpro_input`` and
        ``paths.hicpro_output`` are (re)assigned here.
    :param pipe_manager: Pipeline manager used to timestamp and run commands;
        also provides ``config`` and ``stats_dict``.
    :param args: Parsed arguments; ``args.cores`` and ``args.serial`` are read.
    :raises OSError: If one of the sample directories cannot be created.
    """
    import shutil
    import textwrap

    print("Start processing Hi-C sample %s." % sample.sample_name)

    # Create the sample directories. "sample_root" goes first because
    # os.mkdir is not recursive (the others are presumably nested in it).
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry is not a usable path value: nothing to create.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError as error:
                # An except clause must name an exception class; build the
                # descriptive error here instead of in the clause itself.
                raise OSError("Cannot create '%s' path: %s (%s)" %
                              (path, sample.paths[path], error))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # (data_path then holds several space-separated paths)
    replicate_bams = sample.data_path.split(" ")
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(input_bams=replicate_bams,
                            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring read quality with Fastqc")
    fastqc_zip = os.path.join(sample.paths.sample_root,
                              sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(input_bam=sample.data_path,
                           output_dir=sample.paths.sample_root,
                           sample_name=sample.sample_name)
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(cmd,
                     sample.fastq1 if sample.paired else sample.fastq,
                     shell=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)
    else:
        pipe_manager.clean_add(sample.fastq, conditional=True)

    # HiC-Pro pipeline
    # make dir with linked fastq files for HiC-Pro
    sample.paths.hicpro_input = os.path.join(sample.paths.unmapped,
                                             sample.name)
    if not os.path.exists(sample.paths.hicpro_input):
        os.makedirs(sample.paths.hicpro_input)

    for source_fastq, mate in [(sample.fastq1, "_R1"), (sample.fastq2, "_R2")]:
        link = os.path.join(sample.paths.hicpro_input,
                            sample.name + mate + ".fastq")
        if not os.path.exists(link):
            # The target is the symlink itself: it is what the command creates.
            pipe_manager.run("ln -s {} {}".format(source_fastq, link),
                             target=link)

    # edit config: fill the job name into the HiC-Pro template
    with open(pipe_manager.config.parameters.hicpro_template_config,
              'r') as handle:
        hicpro_config = handle.read()
    with open(sample.hicpro_config, 'w') as handle:
        handle.write(
            hicpro_config.replace("\nJOB_NAME = \n",
                                  "\nJOB_NAME = {}\n".format(sample.name)))

    # run
    sample.paths.hicpro_output = os.path.join(sample.paths.sample_root,
                                              "hic-pro_output")
    hicpro = pipe_manager.config.tools.hicpro
    hicpro_out = sample.paths.hicpro_output
    bowtie_dir = os.path.join(hicpro_out, "bowtie_results")
    data_dir = os.path.join(hicpro_out, "hic_results", "data")
    matrix_dir = os.path.join(hicpro_out, "hic_results", "matrix",
                              sample.name)
    valid_pairs = os.path.join(data_dir, sample.name,
                               sample.name + "_allValidPairs")

    if args.serial:
        # run the whole HiC-Pro pipeline at once
        pipe_manager.run("""{} -i {} -o {} -c {}""".format(
            hicpro, sample.paths.hicpro_input, hicpro_out,
            sample.hicpro_config),
                         target=valid_pairs)
    else:
        # run each step in sequence
        def run_step(step, input_dir, target, **run_kwargs):
            """Run a single HiC-Pro step through the pipeline manager."""
            pipe_manager.run(
                "{} -s {} -i {} -o {} -c {}".format(
                    hicpro, step, input_dir, hicpro_out,
                    sample.hicpro_config),
                target=target, **run_kwargs)

        run_step("mapping", sample.paths.unmapped,
                 os.path.join(
                     bowtie_dir, "bwt2_global", sample.name, sample.name +
                     "_R2_{}.bwt2glob.bam".format(sample.genome)))

        run_step("proc_hic", os.path.join(bowtie_dir, "bwt2"),
                 os.path.join(
                     bowtie_dir, "bwt2", sample.name, sample.name +
                     "_{}.bwt2pairs.bam".format(sample.genome)))

        run_step("quality_checks", sample.paths.unmapped,
                 os.path.join(
                     hicpro_out, "hic_results", "pic", sample.name,
                     "plotMappingPairing_" + sample.name + ".pdf"),
                 nofail=True)

        run_step("merge_persample", data_dir,
                 os.path.join(data_dir, sample.name,
                              sample.name + "_allValidPairs.mergestat"))

        run_step("build_contact_maps", data_dir,
                 os.path.join(matrix_dir, "raw", "1000",
                              sample.name + "_1000.matrix"))

        # The normalized matrix sits next to the raw one, under "iced";
        # this follows the same naming pattern as the raw target above.
        run_step("ice_norm", os.path.join(matrix_dir, "raw"),
                 os.path.join(matrix_dir, "iced", "1000",
                              sample.name + "_1000_iced.matrix"))

    # Report stats
    stats = get_hicpro_stats(sample)
    report_dict(pipe_manager, stats.to_dict())

    # # Conversions
    hic_file = os.path.join(hicpro_out, sample.name + "_allValidPairs.hic")
    bedpe = os.path.join(hicpro_out, sample.name + "_allValidPairs.bed.gz")
    multi_cool = os.path.join(hicpro_out,
                              sample.name + "_allValidPairs.multi.cool")

    # # # HiC-Pro output to Juicebox ".hic"
    # NOTE(review): restriction fragments are read from config.parameters
    # here but from config.resources for hichipper below - confirm which.
    pipe_manager.run("{} -i {} -g {} -j {} -r {} -o {}".format(
        pipe_manager.config.tools.hicpro2juicebox,
        valid_pairs,
        pipe_manager.config.resources.chromosome_sizes[sample.genome],
        pipe_manager.config.tools.juicertools,
        pipe_manager.config.parameters.hicpro_restriction_fragments,
        hicpro_out),
                     target=hic_file)

    # # # make pairix indexed BEDPE
    pipe_manager.run(
        "awk -v OFS='\\t' '{{print $2,$3,$3+75,$5,$6,$6+75,\".\",\".\",$4,$7}}' {} | sort -k1,1V -k4,4V -k2,2n -k5,5n | bgzip -@ {} > {}"
        .format(valid_pairs, args.cores, bedpe),
        target=bedpe)
    pipe_manager.run(
        "pairix  -s 1 -d 4 -b 2 -e 3 -u 5 -v 6 {}".format(bedpe),
        target=bedpe + ".px2")

    # # # make cool
    pipe_manager.run("hic2cool {} {}".format(
        hic_file,
        os.path.join(hicpro_out, sample.name + "_allValidPairs.cool")),
                     target=multi_cool)

    # add balanced normalizations to cooler file (resolutions in kb)
    for resolution in [1, 5, 10, 25, 100, 250, 500, 1000]:
        pipe_manager.run(
            "cooler balance -p {} --blacklist {} {}::/resolutions/{}".format(
                args.cores,
                pipe_manager.config.resources.blacklisted_regions[
                    sample.genome],
                multi_cool,
                resolution * 1000),
            lock_name="cooler.balance.{}kb".format(resolution),
            nofail=True)

    # Call peaks with MACS2
    # # TODO: optimize parameters further
    peaks_dir = os.path.join(hicpro_out, "hic_results", "peaks")
    narrow_peaks = os.path.join(peaks_dir, sample.name + "_peaks.narrowPeak")
    pipe_manager.run(
        "macs2 callpeak -t {} -f BEDPE --keep-dup auto --nomodel --extsize 147 -g hs -n {} --outdir {}"
        .format(bedpe, sample.name, peaks_dir),
        target=narrow_peaks,
        nofail=True)

    # Call loops
    # # # with cLoops
    cloops_dir = os.path.join(hicpro_out, "hic_results", "cLoops")
    if not os.path.exists(cloops_dir):
        os.makedirs(cloops_dir)
    pipe_manager.run(
        "cLoops -f {} -o {} -m 4 -eps 5000,7500,10000 -minPts 10,20,30,40,50 -p {} -w -j -s -hic"
        .format(bedpe, os.path.join(cloops_dir, sample.name), args.cores),
        target=os.path.join(cloops_dir, sample.name + ".loop"),
        nofail=True)

    # # # with hichipper
    # # # # make hichipper config file
    hichipper_yaml = textwrap.dedent("""
    peaks:
     - {}
    resfrags:
     - {}
    hicpro_output:
     - {}""".format(
        narrow_peaks,
        pipe_manager.config.resources.hicpro_restriction_fragments,
        hicpro_out))

    # Remove the output of any previous hichipper run before running again.
    hichipper_dir = os.path.join(sample.paths.sample_root, "hichipper")
    if os.path.exists(hichipper_dir):
        shutil.rmtree(hichipper_dir)

    hichipper_config = os.path.join(sample.paths.sample_root,
                                    "hichipper_config.yaml")
    with open(hichipper_config, 'w') as handle:
        handle.write(hichipper_yaml)
    # # # # run
    pipe_manager.run(  # TODO: I think this command has to be run from sample.paths.sample_root, needs testing
        "hichipper --out {} {}".format(hichipper_dir, hichipper_config),
        target=os.path.join(hichipper_dir,
                            sample.name + ".filt.intra.loop_counts.bedpe"),
        nofail=True)
    # or target to os.path.join(sample.paths.hicpro_output, "hic_results", "hichipper", "qcReport_make.html")

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
예제 #6
0
def process(sample, pipe_manager, args):
    """
    Process one Hi-C sample from unmapped BAM file(s) to contact maps, peaks
    and loops.

    Steps, in order: create the sample's directories, merge technical
    replicates, run FastQC, convert BAM to FASTQ, run HiC-Pro (as a whole or
    step by step), report HiC-Pro statistics, convert the valid pairs to
    Juicebox ".hic", pairix-indexed BEDPE and cooler files, balance the cooler
    matrices, call peaks with MACS2 and call loops with cLoops and hichipper.

    :param sample: Sample object. Its names, genome, ``paired`` flag and file
        paths are read; ``data_path``, ``paths.hicpro_input`` and
        ``paths.hicpro_output`` are (re)assigned here.
    :param pipe_manager: Pipeline manager used to timestamp and run commands;
        also provides ``config`` and ``stats_dict``.
    :param args: Parsed arguments; ``args.cores`` and ``args.serial`` are read.
    :raises OSError: If one of the sample directories cannot be created.
    """
    import shutil
    import textwrap

    print("Start processing Hi-C sample %s." % sample.sample_name)

    # Create the sample directories. "sample_root" goes first because
    # os.mkdir is not recursive (the others are presumably nested in it).
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry is not a usable path value: nothing to create.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError as error:
                # An except clause must name an exception class; build the
                # descriptive error here instead of in the clause itself.
                raise OSError(
                    "Cannot create '%s' path: %s (%s)" % (path, sample.paths[path], error))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # (data_path then holds several space-separated paths)
    replicate_bams = sample.data_path.split(" ")
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=replicate_bams,
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring read quality with Fastqc")
    fastqc_zip = os.path.join(sample.paths.sample_root, sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)
    else:
        pipe_manager.clean_add(sample.fastq, conditional=True)

    # HiC-Pro pipeline
    # make dir with linked fastq files for HiC-Pro
    sample.paths.hicpro_input = os.path.join(sample.paths.unmapped, sample.name)
    if not os.path.exists(sample.paths.hicpro_input):
        os.makedirs(sample.paths.hicpro_input)

    for source_fastq, mate in [(sample.fastq1, "_R1"), (sample.fastq2, "_R2")]:
        link = os.path.join(sample.paths.hicpro_input, sample.name + mate + ".fastq")
        if not os.path.exists(link):
            # The target is the symlink itself: it is what the command creates.
            pipe_manager.run("ln -s {} {}".format(source_fastq, link), target=link)

    # edit config: fill the job name into the HiC-Pro template
    with open(pipe_manager.config.parameters.hicpro_template_config, 'r') as handle:
        hicpro_config = handle.read()
    with open(sample.hicpro_config, 'w') as handle:
        handle.write(hicpro_config.replace("\nJOB_NAME = \n", "\nJOB_NAME = {}\n".format(sample.name)))

    # run
    sample.paths.hicpro_output = os.path.join(sample.paths.sample_root, "hic-pro_output")
    hicpro = pipe_manager.config.tools.hicpro
    hicpro_out = sample.paths.hicpro_output
    bowtie_dir = os.path.join(hicpro_out, "bowtie_results")
    data_dir = os.path.join(hicpro_out, "hic_results", "data")
    matrix_dir = os.path.join(hicpro_out, "hic_results", "matrix", sample.name)
    valid_pairs = os.path.join(data_dir, sample.name, sample.name + "_allValidPairs")

    if args.serial:
        # run the whole HiC-Pro pipeline at once
        pipe_manager.run(
            """{} -i {} -o {} -c {}""".format(
                hicpro, sample.paths.hicpro_input, hicpro_out, sample.hicpro_config),
            target=valid_pairs)
    else:
        # run each step in sequence
        def run_step(step, input_dir, target, **run_kwargs):
            """Run a single HiC-Pro step through the pipeline manager."""
            pipe_manager.run(
                "{} -s {} -i {} -o {} -c {}".format(
                    hicpro, step, input_dir, hicpro_out, sample.hicpro_config),
                target=target, **run_kwargs)

        run_step(
            "mapping", sample.paths.unmapped,
            os.path.join(
                bowtie_dir, "bwt2_global", sample.name,
                sample.name + "_R2_{}.bwt2glob.bam".format(sample.genome)))

        run_step(
            "proc_hic", os.path.join(bowtie_dir, "bwt2"),
            os.path.join(
                bowtie_dir, "bwt2", sample.name,
                sample.name + "_{}.bwt2pairs.bam".format(sample.genome)))

        run_step(
            "quality_checks", sample.paths.unmapped,
            os.path.join(
                hicpro_out, "hic_results", "pic", sample.name,
                "plotMappingPairing_" + sample.name + ".pdf"),
            nofail=True)

        run_step(
            "merge_persample", data_dir,
            os.path.join(data_dir, sample.name, sample.name + "_allValidPairs.mergestat"))

        run_step(
            "build_contact_maps", data_dir,
            os.path.join(matrix_dir, "raw", "1000", sample.name + "_1000.matrix"))

        # The normalized matrix sits next to the raw one, under "iced";
        # this follows the same naming pattern as the raw target above.
        run_step(
            "ice_norm", os.path.join(matrix_dir, "raw"),
            os.path.join(matrix_dir, "iced", "1000", sample.name + "_1000_iced.matrix"))

    # Report stats
    stats = get_hicpro_stats(sample)
    report_dict(pipe_manager, stats.to_dict())

    ## Conversions
    hic_file = os.path.join(hicpro_out, sample.name + "_allValidPairs.hic")
    bedpe = os.path.join(hicpro_out, sample.name + "_allValidPairs.bed.gz")
    multi_cool = os.path.join(hicpro_out, sample.name + "_allValidPairs.multi.cool")

    ### HiC-Pro output to Juicebox ".hic"
    # NOTE(review): restriction fragments are read from config.parameters
    # here but from config.resources for hichipper below - confirm which.
    pipe_manager.run(
        "{} -i {} -g {} -j {} -r {} -o {}".format(
            pipe_manager.config.tools.hicpro2juicebox,
            valid_pairs,
            pipe_manager.config.resources.chromosome_sizes[sample.genome],
            pipe_manager.config.tools.juicertools,
            pipe_manager.config.parameters.hicpro_restriction_fragments,
            hicpro_out),
        target=hic_file)

    ### make pairix indexed BEDPE
    pipe_manager.run(
        "awk -v OFS='\\t' '{{print $2,$3,$3+75,$5,$6,$6+75,\".\",\".\",$4,$7}}' {} | sort -k1,1V -k4,4V -k2,2n -k5,5n | bgzip -@ {} > {}".format(
            valid_pairs, args.cores, bedpe),
        target=bedpe)
    pipe_manager.run(
        "pairix  -s 1 -d 4 -b 2 -e 3 -u 5 -v 6 {}".format(bedpe),
        target=bedpe + ".px2")

    ### make cool
    pipe_manager.run(
        "hic2cool {} {}".format(
            hic_file,
            os.path.join(hicpro_out, sample.name + "_allValidPairs.cool")),
        target=multi_cool)

    # add balanced normalizations to cooler file (resolutions in kb)
    for resolution in [1, 5, 10, 25, 100, 250, 500, 1000]:
        pipe_manager.run(
            "cooler balance -p {} --blacklist {} {}::/resolutions/{}".format(
                args.cores,
                pipe_manager.config.resources.blacklisted_regions[sample.genome],
                multi_cool,
                resolution * 1000),
            lock_name="cooler.balance.{}kb".format(resolution), nofail=True)

    # Call peaks with MACS2
    ## TODO: optimize parameters further
    peaks_dir = os.path.join(hicpro_out, "hic_results", "peaks")
    narrow_peaks = os.path.join(peaks_dir, sample.name + "_peaks.narrowPeak")
    pipe_manager.run(
        "macs2 callpeak -t {} -f BEDPE --keep-dup auto --nomodel --extsize 147 -g hs -n {} --outdir {}".format(
            bedpe, sample.name, peaks_dir),
        target=narrow_peaks, nofail=True)

    # Call loops
    ### with cLoops
    cloops_dir = os.path.join(hicpro_out, "hic_results", "cLoops")
    if not os.path.exists(cloops_dir):
        os.makedirs(cloops_dir)
    pipe_manager.run(
        "cLoops -f {} -o {} -m 4 -eps 5000,7500,10000 -minPts 10,20,30,40,50 -p {} -w -j -s -hic".format(
            bedpe, os.path.join(cloops_dir, sample.name), args.cores),
        target=os.path.join(cloops_dir, sample.name + ".loop"), nofail=True)

    ### with hichipper
    #### make hichipper config file
    hichipper_yaml = textwrap.dedent("""
    peaks:
     - {}
    resfrags:
     - {}
    hicpro_output:
     - {}""".format(
        narrow_peaks,
        pipe_manager.config.resources.hicpro_restriction_fragments,
        hicpro_out))

    # Remove the output of any previous hichipper run before running again.
    hichipper_dir = os.path.join(sample.paths.sample_root, "hichipper")
    if os.path.exists(hichipper_dir):
        shutil.rmtree(hichipper_dir)

    hichipper_config = os.path.join(sample.paths.sample_root, "hichipper_config.yaml")
    with open(hichipper_config, 'w') as handle:
        handle.write(hichipper_yaml)
    #### run
    pipe_manager.run(  # TODO: I think this command has to be run from sample.paths.sample_root, needs testing
        "hichipper --out {} {}".format(hichipper_dir, hichipper_config),
        target=os.path.join(hichipper_dir, sample.name + ".filt.intra.loop_counts.bedpe"), nofail=True)
    # or target to os.path.join(sample.paths.hicpro_output, "hic_results", "hichipper", "qcReport_make.html")

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
예제 #7
0
def process(sample, pipe_manager, args):
    """
    Process one RNA-seq sample from unmapped BAM file(s) to expression
    estimates.

    Steps, in order: create the sample's directories, merge technical
    replicates, run FastQC, convert BAM to FASTQ, trim adapters with the
    configured trimmer ("trimmomatic" or "skewer") and quantify expression
    with Kallisto. Trimming statistics are reported only for skewer.

    :param sample: Sample object. Its names, genome, ``paired`` flag,
        ``read_type`` and file paths are read; ``data_path`` is reassigned
        when replicates are merged.
    :param pipe_manager: Pipeline manager used to timestamp and run commands;
        also provides ``config`` and ``stats_dict``.
    :param args: Parsed arguments; ``args.cores`` is read.
    :raises OSError: If one of the sample directories cannot be created.
    """
    print("Start processing RNA-seq sample %s." % sample.sample_name)

    # Create the sample directories. "sample_root" goes first because
    # os.mkdir is not recursive (the others are presumably nested in it).
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry is not a usable path value: nothing to create.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError as error:
                # An except clause must name an exception class; build the
                # descriptive error here instead of in the clause itself.
                raise OSError(
                    "Cannot create '%s' path: %s (%s)" % (path, sample.paths[path], error))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    # (data_path then holds several space-separated paths)
    replicate_bams = sample.data_path.split(" ")
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=replicate_bams,
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    fastqc_zip = os.path.join(
        sample.paths.sample_root, sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)
    else:
        pipe_manager.clean_add(sample.fastq, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    trimmer = pipe_manager.config.parameters.trimmer
    if trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if sample.paired:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed, conditional=True)

    elif trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(
                sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if sample.paired:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed, conditional=True)

        report_dict(pipe_manager, parse_trim_stats(
            sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Quantify gene expression
    pipe_manager.timestamp("Quantifying expression with Kallisto")
    if sample.paired:
        trimmed_fastqs = [sample.trimmed1, sample.trimmed2]
    else:
        trimmed_fastqs = [sample.trimmed]
    cmd = kallisto(
        fastq_files=trimmed_fastqs,
        kallisto_index=getattr(pipe_manager.config.resources.kallisto_index, sample.genome),
        read_type=sample.read_type,
        output_dir=sample.kallisto_output_dir,
        threads=args.cores,
        bootstrap_number=pipe_manager.config.parameters.bootstrap_number,
        fragment_size=pipe_manager.config.parameters.fragment_size,
        fragment_std=pipe_manager.config.parameters.fragment_std)
    pipe_manager.run(cmd, sample.kallisto_quantification, shell=True)
    report_dict(pipe_manager, parse_kallisto_stats(sample.kallisto_quantification))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)