def process(sample, pipe_manager, args):
    """
    Quantify expression for one RNA-seq sample, starting from unmapped BAM.

    Steps, in order: create the sample's output directories, merge
    technical replicates (when ``sample.data_path`` holds several
    space-separated paths), run FastQC, convert BAM to FASTQ, trim adapters
    with the configured trimmer ("trimmomatic" or "skewer") and quantify
    expression with Kallisto. Parsed statistics are passed to
    ``report_dict`` and the pipeline manager is stopped at the end.

    :param sample: sample to process; its name, paths, pairedness, genome
        and input/output file locations are read
    :param pipe_manager: pipeline manager used to run, time and report
        each step; also provides ``config.parameters`` / ``config.resources``
    :param args: parsed command-line arguments; ``args.cores`` is used
    :raises OSError: if one of the sample's directories cannot be created
    """
    print("Start processing RNA-seq sample %s." % sample.sample_name)

    # Make sure every directory registered for the sample exists.
    # list(...) is needed because a Python 3 keys view cannot be
    # concatenated to a list.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry cannot be looked up or is not path-like; skip it.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError as e:
                # `except` needs an exception class. The previous code passed
                # an OSError *instance*, which Python 3 rejects with a
                # TypeError while matching, hiding the real failure.
                raise OSError(
                    "Cannot create '%s' path: %s (%s)"
                    % (path, sample.paths[path], e))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            # this is a list of sample paths
            input_bams=sample.data_path.split(" "),
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        # Downstream steps read the merged file from here on.
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name)
    pipe_manager.run(
        cmd,
        os.path.join(sample.paths.sample_root,
                     sample.sample_name + "_fastqc.zip"),
        shell=True)
    report_dict(
        pipe_manager,
        parse_fastqc(
            os.path.join(sample.paths.sample_root,
                         sample.sample_name + "_fastqc.zip"),
            prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    # Register the intermediate FASTQ files with the manager's cleanup list.
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=(
                sample.trimmed1_unpaired if sample.paired else None),
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=(
                sample.trimmed2_unpaired if sample.paired else None),
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(
                sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        report_dict(
            pipe_manager,
            parse_trim_stats(
                sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Quantify gene expression
    pipe_manager.timestamp("Quantifying expression with Kallisto")
    cmd = kallisto(
        fastq_files=(
            [sample.trimmed1, sample.trimmed2]
            if sample.paired else [sample.trimmed]),
        kallisto_index=getattr(
            pipe_manager.config.resources.kallisto_index, sample.genome),
        read_type=sample.read_type,
        output_dir=sample.kallisto_output_dir,
        threads=args.cores,
        bootstrap_number=pipe_manager.config.parameters.bootstrap_number,
        fragment_size=pipe_manager.config.parameters.fragment_size,
        fragment_std=pipe_manager.config.parameters.fragment_std)
    pipe_manager.run(cmd, sample.kallisto_quantification, shell=True)
    report_dict(
        pipe_manager, parse_kallisto_stats(sample.kallisto_quantification))

    # Finish up
    print(pipe_manager.stats_dict)

    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
def process(sample, pipe_manager, args):
    """
    Process one STARR-seq sample from unmapped BAM to peaks and FRiP.

    Steps, in order: create the sample's output directories, merge
    technical replicates, run FastQC, convert BAM to FASTQ, trim adapters
    with the configured trimmer ("trimmomatic" or "skewer"), map with
    Bowtie2, filter reads, index the BAM files, make a bigWig track, plot
    the insert-size distribution (paired-end only), compute genome-wide
    coverage, assess signal/noise, call peaks with MACS2 and calculate the
    fraction of reads in peaks. The pipeline manager is stopped at the end.

    :param sample: sample to process; its name, paths, pairedness, genome
        and input/output file locations are read
    :param pipe_manager: pipeline manager used to run, time and report
        each step; also provides ``parameters`` and ``resources``
    :param args: parsed command-line arguments; ``args.cores`` is used
    :raises OSError: if one of the sample's directories cannot be created
    """
    print("Start processing STARR-seq sample %s." % sample.sample_name)

    # Make sure every directory registered for the sample exists.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry cannot be looked up or is not path-like; skip it.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError as e:
                # `except` needs an exception class. The previous code passed
                # an OSError *instance*, which Python 3 rejects with a
                # TypeError while matching, hiding the real failure.
                raise OSError(
                    "Cannot create '%s' path: %s (%s)"
                    % (path, sample.paths[path], e))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.mergeBams(
            inputBams=sample.data_path.split(" "),  # list of sample paths
            outputBam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        # Downstream steps read the merged file from here on.
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(
        inputBam=sample.data_path,
        outputDir=sample.paths.sample_root,
        sampleName=sample.sample_name)
    pipe_manager.run(
        cmd,
        os.path.join(sample.paths.sample_root,
                     sample.sample_name + "_fastqc.zip"),
        shell=True)

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    # Register the intermediate FASTQ files with the manager's cleanup list.
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=(
                sample.trimmed1_unpaired if sample.paired else None),
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=(
                sample.trimmed2_unpaired if sample.paired else None),
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(
                sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.resources.adapters)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.resources.genomes, sample.genome),
        maxInsert=pipe_manager.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = tk.bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(
            pipe_manager.resources.chromosome_sizes, sample.genome),
        genome=sample.genome,
        tagmented=False,  # by default make extended tracks
        normalize=True)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plotInsertSizesFit(
            bam=sample.filtered,
            plot=sample.insertplot,
            outputCSV=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(
            pipe_manager.resources.genome_windows, sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores)
    # nofail=True: this QC step is run in the manager's non-fatal mode.
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)

    cmd = tk.macs2CallPeaksATACSeq(
        treatmentBam=sample.filtered,
        outputDir=sample.paths.peaks,
        sampleName=sample.sample_name,
        genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculateFRiP(
        inputBam=sample.filtered,
        inputBed=sample.peaks,
        output=sample.frip)
    pipe_manager.run(cmd, sample.frip, shell=True)

    print("Finished processing sample %s." % sample.sample_name)
    pipe_manager.stop_pipeline()
def process(sample, pipe_manager, args):
    """
    Process one ChIP-seq sample from unmapped BAM to peaks, FRiP and track.

    Steps, in order: create the sample's output directories, merge
    technical replicates, run FastQC, convert BAM to FASTQ, trim adapters
    with the configured trimmer ("trimmomatic" or "skewer"), map with
    Bowtie2, filter reads, index the BAM files, plot the insert-size
    distribution (paired-end only), compute genome-wide coverage and assess
    signal/noise. If the sample has no comparison (control) sample the
    pipeline stops there. Otherwise it waits for the comparison sample's
    filtered BAM, calls peaks (MACS2 or SPP), optionally plots the MACS2
    model, calculates FRiP and writes a bigWig track.

    :param Sample sample: individual Sample object to process
    :param pypiper.PipelineManager pipe_manager: PipelineManager to use
        during Sample processing
    :param argparse.Namespace args: binding between command-line option and
        argument, for specifying values of various pipeline parameters
    :raises OSError: if one of the sample's directories cannot be created
    """
    print("Start processing ChIP-seq sample %s." % sample.name)

    # Make sure every directory registered for the sample exists.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry cannot be looked up or is not path-like; skip it.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError as e:
                # `except` needs an exception class. The previous code passed
                # an OSError *instance*, which Python 3 rejects with a
                # TypeError while matching, hiding the real failure.
                raise OSError(
                    "Cannot create '%s' path: %s (%s)"
                    % (path, sample.paths[path], e))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.input_file_paths) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.input_file_paths,
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        # Downstream steps read the merged file from here on.
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc(
        file=sample.data_source,
        output_dir=sample.paths.sample_root)
    pipe_manager.run(cmd, sample.fastqc_initial_output, shell=False)
    # rename output
    if os.path.exists(sample.fastqc_initial_output):
        os.rename(sample.fastqc_initial_output, sample.fastqc)
    report_dict(pipe_manager, parse_fastqc(sample.fastqc, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if sample.paired else sample.fastq,
        output_fastq2=sample.fastq2 if sample.paired else None,
        unpaired_fastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    # Register the intermediate FASTQ files with the manager's cleanup list.
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_fastq1=(
                sample.trimmed1 if sample.paired else sample.trimmed),
            output_fastq1_unpaired=(
                sample.trimmed1_unpaired if sample.paired else None),
            output_fastq2=sample.trimmed2 if sample.paired else None,
            output_fastq2_unpaired=(
                sample.trimmed2_unpaired if sample.paired else None),
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_prefix=os.path.join(
                sample.paths.unmapped, sample.sample_name),
            output_fastq1=(
                sample.trimmed1 if sample.paired else sample.trimmed),
            output_fastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        report_dict(
            pipe_manager,
            parse_trim_stats(
                sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if sample.paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(
            pipe_manager.config.resources.genome_index, sample.genome),
        max_insert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(
        input_bam=sample.mapped,
        output_bam=sample.filtered,
        metrics_file=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genome_wide_coverage(
        input_bam=sample.filtered,
        genome_windows=getattr(
            pipe_manager.config.resources.genome_windows, sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(
        input_bam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores)
    # nofail=True: this QC step is run in the manager's non-fatal mode.
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))

    # If the sample is a control, we're finished.
    # The type/value for the comparison Sample in this case should be either
    # absent or a null-indicative/-suggestive value.
    comparison = getattr(sample, CHIP_COMPARE_COLUMN, None)
    if comparison in [None, "", "NA"]:
        pipe_manager.stop_pipeline()
        print("Finished processing sample {}".format(sample.name))
        return

    # The pipeline will now wait for the comparison sample file to be
    # completed. Its path is derived by substituting the comparison name for
    # this sample's name in the filtered BAM path.
    pipe_manager._wait_for_file(
        sample.filtered.replace(sample.name, comparison))

    # Call peaks.
    broad_mode = sample.broad
    peaks_folder = sample.paths.peaks
    treatment_file = sample.filtered
    control_file = sample.filtered.replace(sample.name, comparison)
    if not os.path.exists(peaks_folder):
        os.makedirs(peaks_folder)
    # TODO: include the filepaths as caller-neutral positionals/keyword args
    # TODO (cont.) once NGSTK API is tweaked.
    peak_call_kwargs = {
        "output_dir": peaks_folder,
        "broad": broad_mode,
        "qvalue": args.qvalue
    }
    if args.peak_caller == "macs2":
        cmd = tk.macs2_call_peaks(
            treatment_bams=treatment_file,
            control_bams=control_file,
            sample_name=sample.name,
            pvalue=args.pvalue,
            genome=sample.genome,
            paired=sample.paired,
            **peak_call_kwargs)
    else:
        # NOTE(review): every other step uses args.cores; confirm that
        # args.cpus is really defined by the argument parser.
        cmd = tk.spp_call_peaks(
            treatment_bam=treatment_file,
            control_bam=control_file,
            treatment_name=sample.name,
            control_name=comparison,
            cpus=args.cpus,
            **peak_call_kwargs)
    pipe_manager.run(cmd, target=sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Do plotting as desired.
    if args.peak_caller == "macs2" and not broad_mode:
        pipe_manager.timestamp("Plotting MACS2 model")
        model_files_base = sample.name + "_model"

        # Create the command to run the model script.
        name_model_script = model_files_base + ".r"
        path_model_script = os.path.join(peaks_folder, name_model_script)
        exec_model_script = "{} {}".format(
            pipe_manager.config.tools.Rscript, path_model_script)

        # Create the command to create and rename the model plot.
        # The plot is expected in the current working directory and is
        # moved into the peaks folder.
        plot_name = model_files_base + ".pdf"
        src_plot_path = os.path.join(os.getcwd(), plot_name)
        dst_plot_path = os.path.join(peaks_folder, plot_name)
        rename_model_plot = "mv {} {}".format(src_plot_path, dst_plot_path)

        # Run the model script and rename the model plot.
        pipe_manager.run(
            [exec_model_script, rename_model_plot],
            target=dst_plot_path,
            shell=True,
            nofail=True)

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_frip(
        input_bam=sample.filtered,
        input_bed=sample.peaks,
        output=sample.frip,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    # Denominator for FRiP: single-end count plus half the paired-end count,
    # both taken from the statistics reported earlier.
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_frip(sample.frip, total))

    # on an oracle peak list
    if hasattr(pipe_manager.config.resources.oracle_peak_regions,
               sample.genome):
        cmd = calculate_frip(
            input_bam=sample.filtered,
            input_bed=getattr(
                pipe_manager.config.resources.oracle_peak_regions,
                sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(
            pipe_manager,
            parse_frip(sample.oracle_frip, total, prefix="oracle_"))

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome=sample.genome,
        normalization_method="RPGC")
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    print("Finished processing sample %s." % sample.name)
    pipe_manager.stop_pipeline()
def process(sample, pipe_manager, args):
    """
    Preprocess one ChIP-seq sample from unmapped BAM to filtered BAM and QC.

    Steps, in order: create the sample's output directories, merge
    technical replicates, run FastQC, convert BAM to FASTQ, trim adapters
    with the configured trimmer ("trimmomatic" or "skewer"), map with
    Bowtie2, filter reads, index the BAM files, make a bigWig track, plot
    the insert-size distribution (paired-end only), compute genome-wide
    coverage and assess signal/noise. Parsed statistics are passed to
    ``report_dict``. The pipeline manager is not stopped here.

    :param sample: sample to process; its name, paths, pairedness, genome
        and input/output file locations are read
    :param pipe_manager: pipeline manager used to run, time and report
        each step; also provides ``config.parameters`` / ``config.resources``
    :param args: parsed command-line arguments; ``args.cores`` is used
    :return: the ``pipe_manager`` that was passed in
    :raises OSError: if one of the sample's directories cannot be created
    """
    print("Start processing ChIP-seq sample '{}'.".format(sample.name))

    # Make sure every directory registered for the sample exists.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry cannot be looked up or is not path-like; skip it.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError as e:
                # `except` needs an exception class. The previous code passed
                # an OSError *instance*, which Python 3 rejects with a
                # TypeError while matching, hiding the real failure.
                raise OSError(
                    "Cannot create '%s' path: %s (%s)"
                    % (path, sample.paths[path], e))

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_path.split(" "),  # list of sample paths
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        # Downstream steps read the merged file from here on.
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name)
    pipe_manager.run(
        cmd,
        os.path.join(sample.paths.sample_root,
                     sample.sample_name + "_fastqc.zip"),
        shell=True)
    report_dict(
        pipe_manager,
        parse_fastqc(
            os.path.join(sample.paths.sample_root,
                         sample.sample_name + "_fastqc.zip"),
            prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    # Register the intermediate FASTQ files with the manager's cleanup list.
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=(
                sample.trimmed1_unpaired if sample.paired else None),
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=(
                sample.trimmed2_unpaired if sample.paired else None),
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(
                sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
        report_dict(
            pipe_manager,
            parse_trim_stats(
                sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(
            pipe_manager.config.resources.genomes, sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # The track's parent directory must exist before the track is written.
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)

    # Make tracks
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(
            pipe_manager.config.resources.chromosome_sizes, sample.genome),
        genome=sample.genome,
        # by default make extended tracks
        tagmented=pipe_manager.config.parameters.tagmented,
        normalize=pipe_manager.config.parameters.normalize_tracks,
        norm_factor=pipe_manager.config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(
            pipe_manager.config.resources.genome_windows, sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores)
    # nofail=True: this QC step is run in the manager's non-fatal mode.
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    print("Finished processing sample '{}'.".format(sample.name))
    return pipe_manager
def process(sample, pipe_manager, args):
    """
    Run the ChIP-seq processing steps for one sample.

    Starting from unmapped BAM input, the steps run in this order:
    replicate merging (if several inputs), FastQC, BAM to FASTQ conversion,
    adapter trimming, Bowtie2 mapping, read filtering, BAM indexing,
    total-efficiency reporting, bigWig track creation, insert-size plotting
    (paired-end only), genome-wide coverage and NSC/RSC assessment.

    :param sample: Sample object; its path attributes (``data_path``,
        ``paths``, ``fastq*``, ``trimmed*``, ``mapped``, ``filtered``,
        ``bigwig``, ...) are read below and ``data_path`` is overwritten
        when replicates are merged.
    :param pipe_manager: Pipeline manager used to run commands, register
        intermediate files for cleanup and report statistics/figures.
    :param args: Parsed arguments; only ``args.cores`` is read here.
    :returns: The ``pipe_manager`` that was passed in.
    :raises OSError: If a missing sample directory cannot be created.
    """
    print("Start processing ChIP-seq sample '{}'.".format(sample.name))

    # Make sure every sample directory exists before any tool writes to it.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry is not usable as a path: nothing to create.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                # An ``except`` clause must name an exception class, not an
                # instance; report the context and re-raise the real error.
                print("Cannot create '%s' path: %s" % (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    replicate_bams = sample.data_path.split(" ")
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=replicate_bams,
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        # Downstream steps read the merged file from here on.
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    fastqc_zip = os.path.join(
        sample.paths.sample_root, sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters
        )
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        # NOTE(review): trim statistics are reported in the skewer branch
        # only, presuming parse_trim_stats reads skewer's log -- confirm.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genome_index, sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores
    )
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality
    )
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    for bam in (sample.mapped, sample.filtered):
        cmd = tk.indexBam(inputBam=bam)
        pipe_manager.run(cmd, bam + ".bai", shell=True)

    # Report total efficiency: usable filtered reads as a percentage of the
    # reads FastQC counted (paired ends are halved before being added).
    usable = (
        float(pipe_manager.stats_dict["filtered_single_ends"]) +
        (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    total = float(pipe_manager.stats_dict['fastqc_total_pass_filter_reads'])
    report_dict(
        pipe_manager,
        {"total_efficiency": (usable / total) * 100})

    # Make tracks (the output directory is created once, right before use)
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome=sample.genome,
        normalization_method="RPGC")
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution (paired-end only, skipped if plot exists)
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata
        )
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows, sample.genome),
        output=sample.coverage
    )
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC (nofail=True is passed so a failure here is tolerated)
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores
    )
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    print("Finished processing sample '{}'.".format(sample.name))
    return pipe_manager
def process(sample, pipe_manager, args):
    """
    Run the ATAC-seq processing steps for one sample.

    Starting from unmapped BAM input, the steps run in this order:
    replicate merging (if several inputs), FastQC, BAM to FASTQ conversion,
    adapter trimming, Bowtie2 mapping, mitochondrial read statistics, read
    filtering, read shifting (tagmented samples), BAM indexing, bigWig track
    creation, insert-size plotting (paired-end only), genome-wide coverage,
    NSC/RSC assessment, MACS2 peak calling, blacklist filtering of peaks
    and FRiP calculation. The pipeline manager is stopped at the end.

    :param sample: Sample object; its path attributes are read below and
        ``data_path`` is overwritten when replicates are merged.
    :param pipe_manager: Pipeline manager used to run commands, register
        intermediate files for cleanup and report statistics/figures.
    :param args: Parsed arguments; only ``args.cores`` is read here.
    :raises OSError: If a missing sample directory cannot be created.
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    # Make sure every sample directory exists before any tool writes to it.
    # ``list(...)`` is needed because a list cannot be concatenated with a
    # dict keys view on Python 3.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry is not usable as a path: nothing to create.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                # An ``except`` clause must name an exception class, not an
                # instance; report the context and re-raise the real error.
                print("Cannot create '%s' path: %s" % (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    replicate_bams = sample.data_path.split(" ")
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=replicate_bams,
            merged_bam=sample.unmapped)
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        # Downstream steps read the merged file from here on.
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    fastqc_zip = os.path.join(
        sample.paths.sample_root, sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name)
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None)
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters)
        pipe_manager.run(
            cmd, sample.trimmed1 if sample.paired else sample.trimmed, shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        # NOTE(review): trim statistics are reported in the skewer branch
        # only, presuming parse_trim_stats reads skewer's log -- confirm.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2Map(
        inputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        inputFastq2=sample.trimmed2 if sample.paired else None,
        outputBam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genomeIndex=getattr(pipe_manager.config.resources.genomes, sample.genome),
        maxInsert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired))

    # Get mitochondrial reads (nofail=True is passed so a failure is tolerated)
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(
        bam_file=sample.mapped,
        output=sample.mitochondrial_stats,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"))

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filterReads(
        inputBam=sample.mapped,
        outputBam=sample.filtered,
        metricsFile=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality)
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Shift reads
    if sample.tagmented:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shiftReads(
            inputBam=sample.filtered,
            genome=sample.genome,
            outputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.indexBam(inputBam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.indexBam(inputBam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)
    if sample.tagmented:
        cmd = tk.indexBam(inputBam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from bam file")
    cmd = bamToBigWig(
        inputBam=sample.filtered,
        outputBigWig=sample.bigwig,
        genomeSizes=getattr(pipe_manager.config.resources.chromosome_sizes, sample.genome),
        genome=sample.genome,
        # by default make extended tracks
        tagmented=pipe_manager.config.parameters.tagmented,
        normalize=pipe_manager.config.parameters.normalize_tracks,
        norm_factor=pipe_manager.config.parameters.norm_factor)
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    # Plot fragment distribution (paired-end only, skipped if plot exists)
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata)
        pipe_manager.report_figure("insert_sizes", sample.insertplot)

    # Count coverage genome-wide
    pipe_manager.timestamp("Calculating genome-wide coverage")
    cmd = tk.genomeWideCoverage(
        inputBam=sample.filtered,
        genomeWindows=getattr(pipe_manager.config.resources.genome_windows, sample.genome),
        output=sample.coverage)
    pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC (nofail=True is passed so a failure here is tolerated)
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.peakTools(
        inputBam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))
    pipe_manager.report_figure("cross_correlation", sample.qc_plot)

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    if not os.path.exists(sample.paths.peaks):
        os.makedirs(sample.paths.peaks)
    cmd = tk.macs2CallPeaksATACSeq(
        treatmentBam=sample.filtered,
        outputDir=sample.paths.peaks,
        sampleName=sample.sample_name,
        genome=sample.genome)
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Filter peaks (only when a blacklist is configured for this genome)
    if hasattr(pipe_manager.config.resources.blacklisted_regions, sample.genome):
        pipe_manager.timestamp("Filtering peaks from blacklisted regions")
        cmd = filter_peaks(
            peaks=sample.peaks,
            exclude=getattr(pipe_manager.config.resources.blacklisted_regions, sample.genome),
            filtered_peaks=sample.filtered_peaks)
        pipe_manager.run(cmd, sample.filtered_peaks, shell=True)
        report_dict(
            pipe_manager,
            parse_peak_number(sample.filtered_peaks, prefix="filtered_"))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    # on the sample's peaks
    cmd = tk.calculate_FRiP(
        inputBam=sample.filtered,
        inputBed=sample.peaks,
        output=sample.frip,
        cpus=args.cores)
    pipe_manager.run(cmd, sample.frip, shell=True)
    # Paired ends are halved before being added to the single-end count.
    total = (float(pipe_manager.stats_dict["filtered_single_ends"]) +
             (float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.))
    report_dict(pipe_manager, parse_FRiP(sample.frip, total))

    # on an oracle peak list (only when one is configured for this genome)
    if hasattr(pipe_manager.config.resources.oracle_peak_regions, sample.genome):
        cmd = tk.calculate_FRiP(
            inputBam=sample.filtered,
            inputBed=getattr(pipe_manager.config.resources.oracle_peak_regions, sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores)
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(
            pipe_manager,
            parse_FRiP(sample.oracle_frip, total, prefix="oracle_"))

    # Finish up
    print(pipe_manager.stats_dict)
    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
def process(sample, pipe_manager, args):
    """
    Run the ATAC-seq processing steps for one sample.

    Starting from unmapped BAM input, the steps run in this order:
    replicate merging (if several inputs), FastQC, BAM to FASTQ conversion,
    adapter trimming, Bowtie2 mapping, mitochondrial read statistics, read
    filtering, BAM indexing, optional read shifting, TSS enrichment, MACS2
    peak calling, FRiP calculation, insert-size plotting (paired-end only),
    NSC/RSC assessment and bigWig track creation. The pipeline manager is
    stopped at the end.

    :param sample: Sample object; its path attributes are read below and
        ``data_source`` is overwritten when replicates are merged.
    :param pipe_manager: Pipeline manager used to run commands, register
        intermediate files for cleanup and report statistics.
    :param args: Parsed arguments; ``args.cores`` and ``args.shift_reads``
        are read here.
    :raises OSError: If a missing sample directory cannot be created.
    """
    print("Start processing ATAC-seq sample %s." % sample.sample_name)

    # Make sure the sample's output directories exist.
    for path in [
        "sample_root",
        "unmapped_dir",
        "mapped_dir",
        "peaks_dir",
        "coverage_dir",
        "tss_dir",
    ]:
        p = getattr(sample, path)
        try:
            exists = os.path.exists(p)
        except TypeError:
            # Attribute is not usable as a path: nothing to create.
            continue
        if not exists:
            try:
                os.mkdir(p)
            except OSError:
                # An ``except`` clause must name an exception class, not an
                # instance; report the context and re-raise the real error.
                print("Cannot create '{}' path: {}".format(path, p))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    if isinstance(sample.data_source, list) and len(sample.data_source) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=sample.data_source,  # this is a list of sample paths
            merged_bam=sample.unmapped,
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        # Downstream steps read the merged file from here on.
        sample.data_source = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    if not os.path.exists(sample.fastqc):
        cmd = tk.fastqc(file=sample.data_source, output_dir=sample.sample_root)
        pipe_manager.run(cmd, sample.fastqc_initial_output, shell=False)
    # rename output
    if os.path.exists(sample.fastqc_initial_output):
        os.rename(sample.fastqc_initial_output, sample.fastqc)
    report_dict(pipe_manager, parse_fastqc(sample.fastqc, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        input_bam=sample.data_source,
        output_fastq=sample.fastq1 if sample.paired else sample.fastq,
        output_fastq2=sample.fastq2 if sample.paired else None,
        unpaired_fastq=sample.fastq_unpaired if sample.paired else None,
    )
    pipe_manager.run(
        cmd, sample.fastq1 if sample.paired else sample.fastq, shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq1_unpaired=sample.trimmed1_unpaired if sample.paired else None,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            output_fastq2_unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog,
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True,
        )
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            input_fastq1=sample.fastq1 if sample.paired else sample.fastq,
            input_fastq2=sample.fastq2 if sample.paired else None,
            output_prefix=pjoin(sample.unmapped_dir, sample.sample_name),
            output_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            output_fastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True,
        )
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        # NOTE(review): trim statistics are reported in the skewer branch
        # only, presuming parse_trim_stats reads skewer's log -- confirm.
        report_dict(
            pipe_manager,
            parse_trim_stats(sample.trimlog, prefix="trim_", paired_end=sample.paired),
        )

    # Map
    pipe_manager.timestamp("Mapping reads with Bowtie2")
    cmd = tk.bowtie2_map(
        input_fastq1=sample.trimmed1 if sample.paired else sample.trimmed,
        input_fastq2=sample.trimmed2 if sample.paired else None,
        output_bam=sample.mapped,
        log=sample.aln_rates,
        metrics=sample.aln_metrics,
        genome_index=getattr(pipe_manager.config.resources.genome_index, sample.genome),
        max_insert=pipe_manager.config.parameters.max_insert,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.mapped, shell=True)
    report_dict(
        pipe_manager,
        parse_mapping_stats(sample.aln_rates, paired_end=sample.paired),
    )

    # Get mitochondrial reads (nofail=True is passed so a failure is tolerated)
    pipe_manager.timestamp("Getting mitochondrial stats")
    cmd = tk.get_mitochondrial_reads(
        bam_file=sample.mapped,
        output=sample.mitochondrial_stats,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.mitochondrial_stats, shell=True, nofail=True)
    report_dict(
        pipe_manager,
        parse_duplicate_stats(sample.mitochondrial_stats, prefix="MT_"),
    )

    # Filter reads
    pipe_manager.timestamp("Filtering reads for quality")
    cmd = tk.filter_reads(
        input_bam=sample.mapped,
        output_bam=sample.filtered,
        metrics_file=sample.dups_metrics,
        paired=sample.paired,
        cpus=args.cores,
        Q=pipe_manager.config.parameters.read_quality,
    )
    pipe_manager.run(cmd, sample.filtered, shell=True)
    report_dict(pipe_manager, parse_duplicate_stats(sample.dups_metrics))

    # Index bams
    pipe_manager.timestamp("Indexing bamfiles with samtools")
    cmd = tk.index_bam(input_bam=sample.mapped)
    pipe_manager.run(cmd, sample.mapped + ".bai", shell=True)
    cmd = tk.index_bam(input_bam=sample.filtered)
    pipe_manager.run(cmd, sample.filtered + ".bai", shell=True)

    # Shift reads (and index the shifted BAM)
    if args.shift_reads:
        pipe_manager.timestamp("Shifting reads of tagmented sample")
        cmd = tk.shift_reads(
            input_bam=sample.filtered,
            genome=sample.genome,
            output_bam=sample.filteredshifted,
        )
        pipe_manager.run(cmd, sample.filteredshifted, shell=True)
        cmd = tk.index_bam(input_bam=sample.filteredshifted)
        pipe_manager.run(cmd, sample.filteredshifted + ".bai", shell=True)

    # Run TSS enrichment
    tss_enrichment = run_tss_analysis(
        sample=sample,
        bam_file=sample.filtered,
        chrom_file=getattr(pipe_manager.config.resources.chromosome_sizes, sample.genome),
        tss_file=getattr(pipe_manager.config.resources.unique_tss, sample.genome),
    )
    report_dict(pipe_manager, {"tss_enrichment": tss_enrichment})

    # Call peaks
    pipe_manager.timestamp("Calling peaks with MACS2")
    # make dir for output (macs fails if it does not exist)
    peaks_parent = os.path.dirname(sample.peaks)
    if not os.path.exists(peaks_parent):
        os.makedirs(peaks_parent)
    cmd = tk.macs2_call_peaks_atacseq(
        treatment_bam=sample.filtered,
        output_dir=sample.peaks_dir,
        sample_name=sample.sample_name,
        genome=sample.genome,
    )
    pipe_manager.run(cmd, sample.peaks, shell=True)
    report_dict(pipe_manager, parse_peak_number(sample.peaks))

    # Calculate fraction of reads in peaks (FRiP)
    pipe_manager.timestamp("Calculating fraction of reads in peaks (FRiP)")
    cmd = tk.calculate_frip(
        input_bam=sample.filtered,
        input_bed=sample.peaks,
        output=sample.frip,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.frip, shell=True)
    # Paired ends are halved before being added to the single-end count.
    total = float(pipe_manager.stats_dict["filtered_single_ends"]) + (
        float(pipe_manager.stats_dict["filtered_paired_ends"]) / 2.0)
    report_dict(pipe_manager, parse_frip(sample.frip, total))

    # on an oracle peak list (only when one is configured for this genome)
    if hasattr(pipe_manager.config.resources.oracle_peak_regions, sample.genome):
        # NOTE(review): called as a bare name here but as tk.calculate_frip
        # above -- confirm a module-level calculate_frip exists.
        cmd = calculate_frip(
            input_bam=sample.filtered,
            input_bed=getattr(
                pipe_manager.config.resources.oracle_peak_regions, sample.genome),
            output=sample.oracle_frip,
            cpus=args.cores,
        )
        pipe_manager.run(cmd, sample.oracle_frip, shell=True)
        report_dict(
            pipe_manager,
            parse_frip(sample.oracle_frip, total, prefix="oracle_"),
        )

    # Plot fragment distribution (paired-end only, skipped if plot exists)
    if sample.paired and not os.path.exists(sample.insertplot):
        pipe_manager.timestamp("Plotting insert size distribution")
        tk.plot_atacseq_insert_sizes(
            bam=sample.filtered,
            plot=sample.insertplot,
            output_csv=sample.insertdata,
        )

    # # Count coverage genome-wide
    # pipe_manager.timestamp("Calculating genome-wide coverage")
    # cmd = tk.genome_wide_coverage(
    #     input_bam=sample.filtered,
    #     genome_windows=getattr(pipe_manager.config.resources.genome_windows, sample.genome),
    #     output=sample.coverage)
    # pipe_manager.run(cmd, sample.coverage, shell=True)

    # Calculate NSC, RSC (nofail=True is passed so a failure here is tolerated)
    pipe_manager.timestamp("Assessing signal/noise in sample")
    cmd = tk.run_spp(
        input_bam=sample.filtered,
        output=sample.qc,
        plot=sample.qc_plot,
        cpus=args.cores,
    )
    pipe_manager.run(cmd, sample.qc_plot, shell=True, nofail=True)
    report_dict(pipe_manager, parse_nsc_rsc(sample.qc))

    # Make tracks
    track_dir = os.path.dirname(sample.bigwig)
    if not os.path.exists(track_dir):
        os.makedirs(track_dir)
    # right now tracks are only made for bams without duplicates
    pipe_manager.timestamp("Making bigWig tracks from BAM file")
    cmd = bam_to_bigwig(
        input_bam=sample.filtered,
        output_bigwig=sample.bigwig,
        genome=sample.genome,
        normalization_method="RPGC",
    )
    pipe_manager.run(cmd, sample.bigwig, shell=True)

    print(pipe_manager.stats_dict)
    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
def process(sample, pipe_manager, args):
    """
    Run the RNA-seq processing steps for one sample.

    Starting from unmapped BAM input, the steps run in this order:
    replicate merging (if several inputs), FastQC, BAM to FASTQ conversion,
    adapter trimming and expression quantification with Kallisto. The
    pipeline manager is stopped at the end.

    :param sample: Sample object; its path attributes are read below and
        ``data_path`` is overwritten when replicates are merged.
    :param pipe_manager: Pipeline manager used to run commands, register
        intermediate files for cleanup and report statistics.
    :param args: Parsed arguments; only ``args.cores`` is read here.
    :raises OSError: If a missing sample directory cannot be created.
    """
    print("Start processing RNA-seq sample %s." % sample.sample_name)

    # Make sure every sample directory exists before any tool writes to it.
    for path in ["sample_root"] + list(sample.paths.__dict__.keys()):
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            # Entry is not usable as a path: nothing to create.
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError:
                # An ``except`` clause must name an exception class, not an
                # instance; report the context and re-raise the real error.
                print("Cannot create '%s' path: %s" % (path, sample.paths[path]))
                raise

    # Create NGSTk instance
    tk = NGSTk(pm=pipe_manager)

    # Merge Bam files if more than one technical replicate
    replicate_bams = sample.data_path.split(" ")
    if len(replicate_bams) > 1:
        pipe_manager.timestamp("Merging bam files from replicates")
        cmd = tk.merge_bams(
            input_bams=replicate_bams,
            merged_bam=sample.unmapped
        )
        pipe_manager.run(cmd, sample.unmapped, shell=True)
        # Downstream steps read the merged file from here on.
        sample.data_path = sample.unmapped

    # Fastqc
    pipe_manager.timestamp("Measuring sample quality with Fastqc")
    fastqc_zip = os.path.join(
        sample.paths.sample_root, sample.sample_name + "_fastqc.zip")
    cmd = tk.fastqc_rename(
        input_bam=sample.data_path,
        output_dir=sample.paths.sample_root,
        sample_name=sample.sample_name
    )
    pipe_manager.run(cmd, fastqc_zip, shell=True)
    report_dict(pipe_manager, parse_fastqc(fastqc_zip, prefix="fastqc_"))

    # Convert bam to fastq
    pipe_manager.timestamp("Converting to Fastq format")
    cmd = tk.bam2fastq(
        inputBam=sample.data_path,
        outputFastq=sample.fastq1 if sample.paired else sample.fastq,
        outputFastq2=sample.fastq2 if sample.paired else None,
        unpairedFastq=sample.fastq_unpaired if sample.paired else None
    )
    pipe_manager.run(
        cmd,
        sample.fastq1 if sample.paired else sample.fastq,
        shell=True)
    if not sample.paired:
        pipe_manager.clean_add(sample.fastq, conditional=True)
    if sample.paired:
        pipe_manager.clean_add(sample.fastq1, conditional=True)
        pipe_manager.clean_add(sample.fastq2, conditional=True)
        pipe_manager.clean_add(sample.fastq_unpaired, conditional=True)

    # Trim reads
    pipe_manager.timestamp("Trimming adapters from sample")
    if pipe_manager.config.parameters.trimmer == "trimmomatic":
        cmd = tk.trimmomatic(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq1unpaired=sample.trimmed1_unpaired if sample.paired else None,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            outputFastq2unpaired=sample.trimmed2_unpaired if sample.paired else None,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters,
            log=sample.trimlog
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed1_unpaired, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)
            pipe_manager.clean_add(sample.trimmed2_unpaired, conditional=True)
    elif pipe_manager.config.parameters.trimmer == "skewer":
        cmd = tk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(
                sample.paths.unmapped, sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            trimLog=sample.trimlog,
            cpus=args.cores,
            adapters=pipe_manager.config.resources.adapters
        )
        pipe_manager.run(
            cmd,
            sample.trimmed1 if sample.paired else sample.trimmed,
            shell=True)
        if not sample.paired:
            pipe_manager.clean_add(sample.trimmed, conditional=True)
        else:
            pipe_manager.clean_add(sample.trimmed1, conditional=True)
            pipe_manager.clean_add(sample.trimmed2, conditional=True)

        # NOTE(review): trim statistics are reported in the skewer branch
        # only, presuming parse_trim_stats reads skewer's log -- confirm.
        report_dict(pipe_manager, parse_trim_stats(
            sample.trimlog, prefix="trim_", paired_end=sample.paired))

    # Quantify gene expression
    pipe_manager.timestamp("Quantifying expression with Kallisto")
    if sample.paired:
        trimmed_fastqs = [sample.trimmed1, sample.trimmed2]
    else:
        trimmed_fastqs = [sample.trimmed]
    cmd = kallisto(
        fastq_files=trimmed_fastqs,
        kallisto_index=getattr(pipe_manager.config.resources.kallisto_index, sample.genome),
        read_type=sample.read_type,
        output_dir=sample.kallisto_output_dir,
        threads=args.cores,
        bootstrap_number=pipe_manager.config.parameters.bootstrap_number,
        fragment_size=pipe_manager.config.parameters.fragment_size,
        fragment_std=pipe_manager.config.parameters.fragment_std)
    pipe_manager.run(cmd, sample.kallisto_quantification, shell=True)
    report_dict(pipe_manager, parse_kallisto_stats(sample.kallisto_quantification))

    # Finish up
    print(pipe_manager.stats_dict)
    pipe_manager.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)