def process(sample, pipe_manager, args):
    """
    Run the ChIP-seq processing steps on a single sample.

    Starting from the sample's unmapped BAM input, the steps executed here are:
    optional merging of technical replicates, FastQC, BAM-to-FASTQ conversion,
    adapter trimming, Bowtie2 mapping, read filtering, BAM indexing, insert
    size plotting (paired-end only), genome-wide coverage and SPP-based
    signal/noise QC. If the sample names a comparison (control) sample, the
    function then waits for that sample's filtered BAM, calls peaks, computes
    FRiP and writes a bigWig track; otherwise the pipeline is stopped right
    after the QC step.

    :param Sample sample: individual Sample object to process
    :param pypiper.PipelineManager pipe_manager: PipelineManager to use
        during Sample processing
    :param argparse.Namespace args: binding between command-line option and
        argument, for specifying values various pipeline parameters
    :raises OSError: if one of the sample's output folders cannot be created
    """
    print("Start processing ChIP-seq sample %s." % sample.name)

    # Make sure every folder registered on the sample exists.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry is not something os.path can handle; nothing to create.
            continue
        if exists:
            continue
        try:
            os.mkdir(sample.paths[path])
        except OSError as err:
            # An except clause must name an exception class; the informative
            # message is attached to the error raised from the handler.
            raise OSError("Cannot create '%s' path: %s (%s)"
                          % (path, sample.paths[path], err))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    paired = sample.paired

    # Merge Bam files if more than one technical replicate
    if len(sample.input_file_paths) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(input_bams=sample.input_file_paths,
                            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(file=sample.data_source,
                    output_dir=sample.paths.sample_root)
    pipe_manager.run(cmd, sample.fastqc_initial_output, shell=False)
    # Rename the FastQC output to the name the sample expects.
    if os.path.exists(sample.fastqc_initial_output):
        os.rename(sample.fastqc_initial_output, sample.fastqc)
    report_dict(pipe_manager, parse_fastqc(sample.fastqc, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if paired else sample.fastq,
        output_fastq2=sample.fastq2 if paired else None,
        unpaired_fastq=sample.fastq_unpaired if paired else None)
    pipe_manager.run(cmd, sample.fastq1 if paired else sample.fastq,
                     shell=True)
    # Register the intermediate FASTQ files for conditional cleanup.
    if not paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    trimmer = pipe_manager.config.parameters.trimmer
    if trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if paired else sample.fastq,
            input_fastq2=sample.fastq2 if paired else None,
            output_fastq1=sample.trimmed1 if paired else sample.trimmed,
            output_fastq1_unpaired=(
                sample.trimmed1_unpaired if paired else None),
            output_fastq2=sample.trimmed2 if paired else None,
            output_fastq2_unpaired=(
                sample.trimmed2_unpaired if paired else None),
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd, sample.trimmed1 if paired else sample.trimmed,
                         shell=True)
        if not paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if paired else sample.fastq,
            input_fastq2=sample.fastq2 if paired else None,
            output_prefix=os.path.join(sample.paths.unmapped,
                                       sample.sample_name),
            output_fastq1=sample.trimmed1 if paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(cmd, sample.trimmed1 if paired else sample.trimmed,
                         shell=True)
        if not paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        # NOTE(review): trim statistics are reported in the skewer branch
        # only; confirm this matches the intended nesting of this call.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog, prefix="trim_",
                             paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(pipe_manager.config.resources.genome_index,
                             sample.genome),
        max_insert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(
        input_bam=sample.mapped,
        output_bam=sample.filtered,
        metrics_file=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(bam=sample.filtered,
                                     plot=sample.insertplot,
                                     output_csv=sample.insertdata)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genome_wide_coverage(
        input_bam=sample.filtered,
        genome_windows=getattr(pipe_manager.config.resources.genome_windows,
                               sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(input_bam=sample.filtered, output=sample.qc,
                     plot=sample.qc_plot, cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))

    # If the sample is a control, we're finished.
    # The type/value for the comparison Sample in this case should be either
    # absent or a null-indicative/-suggestive value.
    comparison = getattr(sample, CHIP_COMPARE_COLUMN, None)
    if comparison in [None, "", "NA"]:
        pipe_manager.stop_pipeline()
        print("Finished processing sample {}".format(sample.name))
        return

    # The control's filtered BAM path is derived from this sample's path by
    # swapping the sample name for the comparison sample's name.
    control_file = sample.filtered.replace(sample.name, comparison)

    # The pipeline will now wait for the comparison sample file to be completed
    pipe_manager._wait_for_file(control_file)

    # Call peaks.
    broad_mode = sample.broad
    peaks_folder = sample.paths.peaks
    treatment_file = sample.filtered
    if not os.path.exists(peaks_folder):
        os.makedirs(peaks_folder)
    # TODO: include the filepaths as caller-neutral positionals/keyword args
    # TODO (cont.) once NGSTK API is tweaked.
    peak_call_kwargs = {
        "output_dir": peaks_folder,
        "broad": broad_mode,
        "qvalue": args.qvalue,
    }
    if args.peak_caller == "macs2":
        cmd = tk.macs2_call_peaks(
            treatment_bams=treatment_file,
            control_bams=control_file,
            sample_name=sample.name,
            pvalue=args.pvalue,
            genome=sample.genome,
            paired=sample.paired,
            **peak_call_kwargs)
    else:
        cmd = tk.spp_call_peaks(
            treatment_bam=treatment_file,
            control_bam=control_file,
            treatment_name=sample.name,
            control_name=comparison,
            # Every other step reads the core count from args.cores; honor
            # an explicit args.cpus if present, else fall back to cores.
            cpus=getattr(args, "cpus", args.cores),
            **peak_call_kwargs)
    pipe_manager.run(cmd, target=sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Do plotting as desired.
    if args.peak_caller == "macs2" and not broad_mode:
        pipe_manager.timestamp("Plotting MACS2 model")
        model_files_base = sample.name + "_model"

        # Create the command to run the model script.
        name_model_script = model_files_base + ".r"
        path_model_script = os.path.join(peaks_folder, name_model_script)
        exec_model_script = "{} {}".format(
            pipe_manager.config.tools.Rscript, path_model_script)

        # Create the command to create and rename the model plot.
        # The plot is looked for in the current working directory and moved
        # into the peaks folder.
        plot_name = model_files_base + ".pdf"
        src_plot_path = os.path.join(os.getcwd(), plot_name)
        dst_plot_path = os.path.join(peaks_folder, plot_name)
        rename_model_plot = "mv {} {}".format(src_plot_path, dst_plot_path)

        # Run the model script and rename the model plot.
        pipe_manager.run([exec_model_script, rename_model_plot],
                         target=dst_plot_path, shell=True, nofail=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_frip(input_bam=sample.filtered,
                            input_bed=sample.peaks,
                            output=sample.frip,
                            cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    # Denominator for FRiP: single-end count plus half the paired-end count.
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_frip(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions,
               sample.genome):
        # NOTE(review): called as a bare name rather than tk.calculate_frip;
        # presumably a module-level helper -- confirm it is in scope.
        cmd = calculate_frip(
            input_bam=sample.filtered,
            input_bed=getattr(
                pipe_manager.config.resources.oracle_peak_regions,
                sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(pipe_manager,
                    parse_frip(sample.oracle_frip, total, prefix="oracle_"))

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    # NOTE(review): bare bam_to_bigwig (not a tk method here); presumably a
    # module-level helper -- confirm it is in scope.
    cmd = bam_to_bigwig(input_bam=sample.filtered,
                        output_bigwig=sample.bigwig,
                        genome=sample.genome,
                        normalization_method="RPGC")
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    print("Finished processing sample %s." % sample.name)
    pipe_manager.stop_pipeline()
def process(sample, pipe_manager, args):
    """
    Run the STARR-seq processing steps on a single sample.

    Starting from the sample's unmapped BAM input, the steps executed here are:
    optional merging of technical replicates, FastQC, BAM-to-FASTQ conversion,
    adapter trimming, Bowtie2 mapping, read filtering, BAM indexing, bigWig
    track creation, insert size plotting (paired-end only), genome-wide
    coverage, SPP-based signal/noise QC, MACS2 peak calling and FRiP
    calculation.

    :param sample: sample object to process; its attributes supply the input
        and output file paths used by each step
    :param pipe_manager: pipeline manager used to run and track each command
    :param args: parsed command-line arguments; ``args.cores`` is read for
        the multi-threaded steps
    :raises OSError: if one of the sample's output folders cannot be created
    """
    print("Start processing STARR-seq sample %s." % sample.sample_name)

    # Make sure every folder registered on the sample exists.
    # list() is required: on Python 3 a dict view cannot be added to a list.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry is not something os.path can handle; nothing to create.
            continue
        if exists:
            continue
        try:
            os.mkdir(sample.paths[path])
        except OSError as err:
            # An except clause must name an exception class; the informative
            # message is attached to the error raised from the handler.
            raise OSError("Cannot create '%s' path: %s (%s)"
                          % (path, sample.paths[path], err))

    # Create NGSTk instance
    # NOTE(review): several calls below use camelCase NGSTk names
    # (mergeBams, plotInsertSizesFit, calculateFRiP) and read settings from
    # pipe_manager.parameters / pipe_manager.resources -- confirm these match
    # the installed pypiper version.
    tk = NGSTk(pm=pipe_manager)

    paired = sample.paired

    # Merge Bam files if more than one technical replicate
    if len(sample.data_source.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.mergeBams(
            # data_source holds space-separated paths; split into a list
            input_bams=sample.data_source.split(" "),
            output_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(sample.data_source, output_dir=sample.paths.sample_root)
    pipe_manager.run(
        cmd,
        os.path.join(sample.paths.sample_root,
                     sample.sample_name + "_fastqc.zip"),
        shell=True)

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if paired else sample.fastq,
        output_fastq2=sample.fastq2 if paired else None,
        unpaired_fastq=sample.fastq_unpaired if paired else None)
    pipe_manager.run(cmd, sample.fastq1 if paired else sample.fastq,
                     shell=True)
    # Register the intermediate FASTQ files for conditional cleanup.
    if not paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    trimmer = pipe_manager.parameters.trimmer
    if trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if paired else sample.fastq,
            input_fastq2=sample.fastq2 if paired else None,
            output_fastq1=sample.trimmed1 if paired else sample.trimmed,
            output_fastq1_unpaired=(
                sample.trimmed1_unpaired if paired else None),
            output_fastq2=sample.trimmed2 if paired else None,
            output_fastq2_unpaired=(
                sample.trimmed2_unpaired if paired else None),
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(cmd, sample.trimmed1 if paired else sample.trimmed,
                         shell=True)
        if not paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if paired else sample.fastq,
            input_fastq2=sample.fastq2 if paired else None,
            output_prefix=os.path.join(sample.paths.unmapped,
                                       sample.sample_name),
            output_fastq1=sample.trimmed1 if paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if paired else None,
            trim_log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters)
        pipe_manager.run(cmd, sample.trimmed1 if paired else sample.trimmed,
                         shell=True)
        if not paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(pipe_manager.resources.genomes, sample.genome),
        max_insert=pipe_manager.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(
        input_bam=sample.mapped,
        output_bam=sample.filtered,
        metrics_file=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = tk.bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome_sizes=getattr(pipe_manager.resources.chromosome_sizes,
                             sample.genome),
        genome=sample.genome,
        tagmented=False,  # by default make extended tracks
        normalize=True)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plotInsertSizesFit(bam=sample.filtered,
                              plot=sample.insertplot,
                              outputCSV=sample.insertdata)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genome_wide_coverage(
        input_bam=sample.filtered,
        genome_windows=getattr(pipe_manager.resources.genome_windows,
                               sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(input_bam=sample.filtered, output=sample.qc,
                     plot=sample.qc_plot, cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)
    cmd = tk.macs2_call_peaks_atacseq(
        treatment_bam=sample.filtered,
        output_dir=sample.paths.peaks,
        sample_name=sample.sample_name,
        genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculateFRiP(input_bam=sample.filtered,
                           input_bed=sample.peaks,
                           output=sample.frip)
    pipe_manager.run(cmd, sample.frip, shell=True)

    print("Finished processing sample %s." % sample.sample_name)
    pipe_manager.stop_pipeline()