def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak/broadPeak files from a set of
    ChIP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                    break
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
                    break
        elif dd.get_chip_method(data) == "atac":
            for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info("No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    for data in dd.sample_data_iterator(samples):
        new_samples.append([tz.assoc_in(data, ("peaks_files", "consensus"),
                                        {"main": consensusfile})])
    return new_samples
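
# A minimal, illustrative sketch (not part of the pipeline) of the nested
# `samples` structure these functions operate on: a list of single-element
# lists, each wrapping one sample's data dictionary. The dict layout below
# is a simplified assumption for demonstration only.
def _demo_samples_structure():
    import toolz as tz
    samples = [
        [{"description": "S1",
          "peaks_files": {"macs2": ["S1_peaks.narrowPeak"]}}],
        [{"description": "S2",
          "peaks_files": {"macs2": ["S2_peaks.broadPeak"]}}],
    ]
    # reading a nested key with a default, as call_consensus does above
    peaks = [tz.get_in(("peaks_files", "macs2"), s[0], []) for s in samples]
    assert peaks[0] == ["S1_peaks.narrowPeak"]
    # writing a nested key returns an updated copy of the dict, which is
    # re-wrapped in a single-element list before being handed back
    new_samples = []
    for sample in samples:
        data = tz.assoc_in(sample[0], ("peaks_files", "consensus"),
                           {"main": "consensus/consensus.bed"})
        new_samples.append([data])
    return new_samples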
def combine_spikein(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "spikein")
    dont_combine, to_combine = partition(dd.get_spikein_counts,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(sailfish_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_spikein_counts(data)
            samplename = dd.get_sample_name(data)
            new_df = sailfish._sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_spikein_counts(data, tidy_file)
        updated_samples.append([data])
    return updated_samples
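
# Hedged sketch of the partition helper used above, under the assumed
# semantics that it splits an iterable by a predicate into a (falsy, truthy)
# pair of groups; the pipeline's actual partition signature may differ.
def _demo_partition():
    def partition(pred, items):
        no, yes = [], []
        for item in items:
            (yes if pred(item) else no).append(item)
        return no, yes

    dont_combine, to_combine = partition(lambda d: d.get("spikein_counts"),
                                         [{"spikein_counts": "a.sf"}, {}])
    assert to_combine == [{"spikein_counts": "a.sf"}] and dont_combine == [{}]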
def create_ataqv_report(samples):
    """
    make the ataqv report from a set of ATAC-seq samples
    """
    data = samples[0][0]
    reportdir = os.path.join(dd.get_work_dir(data), "qc", "ataqv")
    sentinel = os.path.join(reportdir, "index.html")
    if utils.file_exists(sentinel):
        ataqv_output = {"base": sentinel,
                        "secondary": get_ataqv_report_files(reportdir)}
        new_data = []
        for data in dd.sample_data_iterator(samples):
            data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
            new_data.append(data)
        return dd.get_samples_from_datalist(new_data)
    mkarv = config_utils.get_program("mkarv", dd.get_config(data))
    ataqv_files = []
    for data in dd.sample_data_iterator(samples):
        qc = dd.get_summary_qc(data)
        ataqv_file = tz.get_in(("ataqv", "base"), qc, None)
        if ataqv_file and utils.file_exists(ataqv_file):
            ataqv_files.append(ataqv_file)
    if not ataqv_files:
        return samples
    ataqv_json_file_string = " ".join(ataqv_files)
    with file_transaction(reportdir) as txreportdir:
        cmd = f"{mkarv} {txreportdir} {ataqv_json_file_string}"
        message = f"Creating ataqv report from {ataqv_json_file_string}."
        do.run(cmd, message)
    new_data = []
    ataqv_output = {"base": sentinel,
                    "secondary": get_ataqv_report_files(reportdir)}
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
        new_data.append(data)
    return dd.get_samples_from_datalist(new_data)
def concatenate_sparse_counts(*samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "tagcounts.mtx")
    if file_exists(out_file):
        return out_file
    files = [dd.get_count_file(data) for data in dd.sample_data_iterator(samples)
             if dd.get_count_file(data)]
    descriptions = [dd.get_sample_name(data) for data in dd.sample_data_iterator(samples)
                    if dd.get_count_file(data)]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_counts(data, out_file)])
    return newsamples
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            # tz.get_in needs a sequence of keys, so a one-element tuple is
            # required here; a bare ("peak_counts") is just a string
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            if bam.is_paired(dd.get_work_bam(data)):
                peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
            else:
                logger.info(f"Creating peak table from full BAM file because "
                            f"{dd.get_work_bam(data)} is single-ended.")
                peakcounts.append(tz.get_in(("peak_counts", "full"), data))
    combined_peaks = count.combine_count_files(peakcounts, out_file, ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    out_file = os.path.join(work_dir, "sailfish", "combined.sf")
    if not file_exists(out_file):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(out_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_combined(data, out_file)
        updated_samples.append([data])
    return updated_samples
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    if not all([file_exists(x) for x in
                [gene_tpm_file, tidy_file, transcript_tpm_file, tx2gene]]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        # DataFrame.pivot arguments are keyword-only in current pandas
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot(index="id", columns="sample", values="tpm").to_csv(
                tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot(index="id", columns="sample", values="tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
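
# A self-contained sketch, with toy data, of the tidy-to-matrix pivot and
# gene-level aggregation performed in combine_sailfish above. Column names
# mirror the tidy sailfish format; note that DataFrame.pivot takes
# keyword-only arguments in current pandas, as used here and above.
def _demo_gene_tpm_pivot():
    import pandas as pd
    df = pd.DataFrame({"id": ["tx1", "tx2", "tx3"] * 2,
                       "sample": ["A"] * 3 + ["B"] * 3,
                       "tpm": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
    pivot = df.pivot(index="id", columns="sample", values="tpm")
    # transcript-to-gene mapping, indexed by transcript id
    tdf = pd.DataFrame({"gene_id": ["g1", "g1", "g2"]},
                       index=["tx1", "tx2", "tx3"])
    # join the mapping, then sum transcript TPMs within each gene
    gene_tpm = pivot.join(tdf).groupby("gene_id").sum()
    assert list(gene_tpm.loc["g1"]) == [3.0, 9.0]  # tx1 + tx2, per sample
    return gene_tpm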
def stringtie_merge(*samples):
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data
                                       in dd.sample_data_iterator(samples)]))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = stringtie.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
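
# filter_missing and flatten are pipeline helpers; under the assumption that
# they drop falsy entries and flatten one level of nesting respectively,
# minimal stand-ins behave like this:
def _demo_filter_missing_flatten():
    def flatten(xs):
        out = []
        for x in xs:
            out.extend(x) if isinstance(x, (list, tuple)) else out.append(x)
        return out

    def filter_missing(xs):
        return [x for x in xs if x]

    assert filter_missing(flatten([["a.gtf"], None, "b.gtf"])) == ["a.gtf", "b.gtf"]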
def cufflinks_merge(*samples):
    to_merge = filter_missing([dd.get_assembled_gtf(data) for data
                               in dd.sample_data_iterator(samples)])
    data = samples[0][0]
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    out_dir = os.path.join(dd.get_work_dir(data), "assembly")
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores,
                                 samples[0][0])
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        # set_assembled_gtf returns an updated copy of the data dict, so its
        # result must be captured and re-wrapped; discarding it left the
        # samples unchanged
        data = dd.set_assembled_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
def run_rnaseq_joint_genotyping(*samples):
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    ref_file = dd.get_ref_file(data)
    out_file = os.path.join(dd.get_work_dir(data, "."), "variation", "combined.vcf")
    if variantcaller and "gatk" in variantcaller:
        vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
        out_file = variation.gatk_joint_calling(data, vrn_files, ref_file, out_file)
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_square_vcf(data, out_file)
            updated_samples.append([data])
        return updated_samples
    return samples
def cufflinks_merge(*samples):
    to_merge = filter_missing([dd.get_assembled_gtf(data) for data
                               in dd.sample_data_iterator(samples)])
    data = samples[0][0]
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    out_dir = os.path.join(dd.get_work_dir(data), "assembly")
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores,
                                 samples[0][0])
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_assembled_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
def detect_fusions(samples):
    """Run fusion detection with a standalone tool specified in the config as
    fusion_caller. If fusion_mode is True and no fusion_caller is specified,
    or fusion_caller == 'aligner', gene fusion detection is assumed to have
    been run at the alignment step.
    """
    fusion_mode = dd.get_in_samples(samples, dd.get_fusion_mode)
    if not fusion_mode:
        return samples
    caller = dd.get_in_samples(samples, dd.get_fusion_caller)
    if not caller or caller == 'aligner':
        logger.info("No standalone fusion caller specified in the config.")
        return samples
    STANDALONE_CALLERS = {
        'ericscript': ericscript.run,
    }
    caller_fn = STANDALONE_CALLERS.get(caller)
    if not caller_fn:
        logger.warning("Gene fusion detection with %s is not supported. "
                       "Supported callers: %s"
                       % (caller, ', '.join(STANDALONE_CALLERS.keys())))
        return samples
    logger.info("Running gene fusion detection with %s" % caller)
    return [[caller_fn(s)] for s in dd.sample_data_iterator(samples)]
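
# A minimal illustration of the dispatch-table pattern used above, assuming
# each caller function takes one sample data dict and returns it updated;
# _demo_caller is hypothetical and stands in for e.g. ericscript.run.
def _demo_caller_dispatch():
    def _demo_caller(data):
        data["fusion_caller_ran"] = True
        return data

    callers = {"demo": _demo_caller}
    caller_fn = callers.get("demo")
    assert caller_fn is not None
    return [[caller_fn(s)] for s in [{"description": "S1"}]]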
def scrnaseq_concatenate_metadata(samples):
    """
    Create a file with the same dimensions as mtx.colnames, holding metadata
    and sample names, to help in the creation of the SC object.
    """
    barcodes = {}
    counts = ""
    metadata = {}
    for sample in dd.sample_data_iterator(samples):
        with open(dd.get_sample_barcodes(sample)) as inh:
            for line in inh:
                cols = line.strip().split(",")
                if len(cols) == 1:
                    # assign a placeholder sample name if it is missing
                    # from the barcodes file
                    cols.append("NaN")
                barcodes[cols[0]] = cols[1:]
        counts = dd.get_combined_counts(sample)
        meta = map(str, list(sample["metadata"].values()))
        meta_cols = list(sample["metadata"].keys())
        meta = ["NaN" if not v else v for v in meta]
        metadata[dd.get_sample_name(sample)] = meta
    metadata_fn = counts + ".metadata"
    if not file_exists(metadata_fn):
        with open(metadata_fn, 'w') as outh:
            outh.write(",".join(["sample"] + meta_cols) + '\n')
            with open(counts + ".colnames") as inh:
                for line in inh:
                    sample = line.split(":")[0]
                    barcode = sample.split("-")[1]
                    outh.write(",".join(barcodes[barcode] + metadata[sample]) + '\n')
    return samples
def rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples):
    """
    organizes RNA-seq and small-RNAseq samples, converting from BAM if
    necessary and trimming if necessary
    """
    pipeline = dd.get_in_samples(samples, dd.get_analysis)
    trim_reads_set = any([tz.get_in(["algorithm", "trim_reads"], d)
                          for d in dd.sample_data_iterator(samples)])
    resources = ["picard"]
    needs_trimming = (_is_smallrnaseq(pipeline) or trim_reads_set)
    if needs_trimming:
        resources.append("atropos")
    with prun.start(_wres(parallel, resources), samples, config, dirs, "trimming",
                    max_multicore=1 if not needs_trimming else None) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples",
                                   [[dirs, config, run_info_yaml,
                                     [x[0]["description"] for x in samples]]])
            samples = run_parallel("prepare_sample", samples)
        if needs_trimming:
            with profile.report("adapter trimming", dirs):
                if _is_smallrnaseq(pipeline):
                    samples = run_parallel("trim_srna_sample", samples)
                else:
                    samples = run_parallel("trim_sample", samples)
    return samples
def load_summarizedexperiment(samples):
    """
    create a SummarizedExperiment rds object;
    fails with n_samples = 1
    """
    # using r36 (R 4.0) - will eventually drop R 3.5
    rcmd = Rscript_cmd("r36")
    se_script = os.path.join(os.path.dirname(__file__), os.pardir,
                             "scripts", "R", "bcbio2se.R")
    data = samples[0][0]
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "salmon")
    summarized_experiment = os.path.join(out_dir, "bcbio-se.rds")
    if not file_exists(summarized_experiment):
        with file_transaction(summarized_experiment) as tx_out_file:
            cmd = f"{rcmd} --vanilla {se_script} {work_dir} {tx_out_file}"
            message = "Loading SummarizedExperiment."
            try:
                do.run(cmd, message)
            except Exception:
                logger.error("SE creation failed")
    if file_exists(summarized_experiment):
        try:
            se_qc_report = generate_se_qc_report(work_dir)
        except Exception:
            se_qc_report = None
            logger.error("SE QC failed")
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_summarized_experiment(data, summarized_experiment)
            updated_samples.append([data])
        return updated_samples
    else:
        return samples
def run_salmon_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        salmon_dir = os.path.join(work_dir, "salmon")
        gtf_file = dd.get_transcriptome_gtf(data, dd.get_gtf_file(data))
        salmon_index(gtf_file, data, salmon_dir)
    return samples
def scrnaseq_concatenate_metadata(samples):
    """
    Create a file with the same dimensions as mtx.colnames, holding metadata
    and sample names, to help in the creation of the SC object.
    """
    barcodes = {}
    counts = ""
    metadata = {}
    for sample in dd.sample_data_iterator(samples):
        with open(dd.get_sample_barcodes(sample)) as inh:
            for line in inh:
                cols = line.strip().split(",")
                if len(cols) == 1:
                    # assign a placeholder sample name if it is missing
                    # from the barcodes file
                    cols.append("NaN")
                barcodes[(dd.get_sample_name(sample), cols[0])] = cols[1:]
        counts = dd.get_combined_counts(sample)
        meta = map(str, list(sample["metadata"].values()))
        meta_cols = list(sample["metadata"].keys())
        meta = ["NaN" if not v else v for v in meta]
        metadata[dd.get_sample_name(sample)] = meta
    metadata_fn = counts + ".metadata"
    if not file_exists(metadata_fn):
        with open(metadata_fn, 'w') as outh:
            outh.write(",".join(["sample"] + meta_cols) + '\n')
            with open(counts + ".colnames") as inh:
                for line in inh:
                    sample = line.split(":")[0]
                    barcode = sample.split("-")[1]
                    outh.write(",".join(barcodes[(sample, barcode)] +
                                        metadata[sample]) + '\n')
    return samples
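
# Sketch of the .colnames parsing above, assuming entries of the form
# "<samplename>-<barcode>:<suffix>"; this format is inferred from the split
# logic rather than documented, so treat it as an assumption.
def _demo_colnames_parsing():
    line = "S1-ACGT:1"
    sample = line.split(":")[0]      # "S1-ACGT" keys the metadata dict
    barcode = sample.split("-")[1]   # "ACGT" is the cell barcode component
    return sample, barcode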
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table
    with all samples
    """
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    dexseq_gff = dd.get_dexseq_gff(samples[0][0])
    # combine featureCounts files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)
    # combine Cufflinks files
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXSeq files
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0])
                                        for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data,
                                          express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak files from a set of ChIP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
        elif dd.get_chip_method(data) == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
            else:
                logger.info(f"Using peaks from full fraction since "
                            f"{dd.get_work_bam(data)} is single-ended.")
                for fn in tz.get_in(("peaks_files", "full", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info("No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    if not utils.file_exists(consensusfile):
        logger.warning("No consensus peaks found.")
        return samples
    # the SAF file is written alongside the BED file for downstream counting
    saffile = consensus_to_saf(consensusfile,
                               os.path.splitext(consensusfile)[0] + ".saf")
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peaks_files", "consensus"),
                           {"main": consensusfile})
        new_samples.append([data])
    return new_samples
def get_samples_by_batch(samples):
    batch_samples = defaultdict(list)
    for data in dd.sample_data_iterator(samples):
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if isinstance(batch, list):
            batch = tuple(batch)
        batch_samples[batch].append(data)
    return batch_samples
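
# Usage sketch for get_samples_by_batch with illustrative inputs; the inline
# accessors stand in for dd.get_batch and dd.get_sample_name.
def _demo_samples_by_batch():
    from collections import defaultdict
    datas = [{"description": "tumor1", "metadata": {"batch": "b1"}},
             {"description": "normal1", "metadata": {"batch": "b1"}},
             {"description": "solo", "metadata": {}}]
    batches = defaultdict(list)
    for data in datas:
        batch = data["metadata"].get("batch") or data["description"]
        if isinstance(batch, list):
            # a sample can belong to several batches; lists are unhashable,
            # so they are converted to tuples before use as dict keys
            batch = tuple(batch)
        batches[batch].append(data)
    return batches  # {"b1": [tumor1, normal1], "solo": [solo]}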
def run_rnaseq_joint_genotyping(*samples):
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller:
        return samples
    if "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    if variantcaller and "gatk" in variantcaller:
        vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
        out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
        vrn_file = vcfanno.run_vcfanno(out_file, "rnaedit", data)
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_square_vcf(data, vrn_file)
            updated_samples.append([data])
        return updated_samples
    return samples
def run_salmon_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        salmon_dir = os.path.join(work_dir, "salmon")
        gtf_file = dd.get_gtf_file(data)
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        salmon_index(gtf_file, fasta_file, data, salmon_dir)
    return samples
def concatenate_cb_histograms(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "cb-histogram.txt")
    files = [dd.get_histogram_counts(data) for data in dd.sample_data_iterator(samples)
             if dd.get_histogram_counts(data)]
    files = " ".join(files)
    # write into the transactional temp file so file_transaction can move it
    # into place on success
    cmd = "cat {files} > {tx_out_file}"
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            message = "Concatenating cellular barcode histograms."
            do.run(cmd.format(**locals()), message)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_histogram(data, out_file)])
    return newsamples
def run_rnaseq_joint_genotyping(*samples):
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller:
        return samples
    if "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    if variantcaller and "gatk" in variantcaller:
        vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
        out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
        vrn_file = vcfanno.run_vcfanno(out_file, ["rnaedit"], data)
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_square_vcf(data, vrn_file)
            updated_samples.append([data])
        return updated_samples
    return samples
def run_rapmap_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        rapmap_dir = os.path.join(work_dir, "rapmap")
        gtf_file = dd.get_gtf_file(data)
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        rapmap_index(gtf_file, fasta_file, "quasi", data, rapmap_dir)
    return samples
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(work_dir, "sailfish", "combined.sf")
    transcript_tpm_file = os.path.join(work_dir, "sailfish",
                                       "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(work_dir, "sailfish", "combined.gene.sf.tpm")
    if not all([file_exists(x) for x in
                [gene_tpm_file, tidy_file, transcript_tpm_file]]):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        # DataFrame.pivot arguments are keyword-only in current pandas;
        # index=None keeps the existing index
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot(index=None, columns="sample", values="tpm").to_csv(
                tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot(index=None, columns="sample", values="tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        updated_samples.append([data])
    return updated_samples
def run_kallisto_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        kallisto_dir = os.path.join(work_dir, "kallisto")
        gtf_file = dd.get_gtf_file(data)
        assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
    return samples
def combine_express(samples, combined):
    """Combine tpm, effective counts and fpkm from eXpress results"""
    if not combined:
        return None
    to_combine = [dd.get_express_counts(x) for x in dd.sample_data_iterator(samples)
                  if dd.get_express_counts(x)]
    gtf_file = dd.get_gtf_file(samples[0][0])
    isoform_to_gene_file = os.path.join(os.path.dirname(combined),
                                        "isoform_to_gene.txt")
    isoform_to_gene_file = express.isoform_to_gene_name(
        gtf_file, isoform_to_gene_file, next(dd.sample_data_iterator(samples)))
    if len(to_combine) > 0:
        eff_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_counts"
        eff_counts_combined = count.combine_count_files(to_combine,
                                                        eff_counts_combined_file,
                                                        ext=".counts")
        to_combine = [dd.get_express_tpm(x) for x in dd.sample_data_iterator(samples)
                      if dd.get_express_tpm(x)]
        tpm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_tpm"
        tpm_counts_combined = count.combine_count_files(to_combine,
                                                        tpm_counts_combined_file)
        to_combine = [dd.get_express_fpkm(x) for x in dd.sample_data_iterator(samples)
                      if dd.get_express_fpkm(x)]
        fpkm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_fpkm"
        fpkm_counts_combined = count.combine_count_files(to_combine,
                                                         fpkm_counts_combined_file,
                                                         ext=".fpkm")
        return {'counts': eff_counts_combined,
                'tpm': tpm_counts_combined,
                'fpkm': fpkm_counts_combined,
                'isoform_to_gene': isoform_to_gene_file}
    return {}
def concatenate_sparse_matrices(samples, deduped=True):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    if deduped:
        out_file = os.path.join(umi_dir, "tagcounts.mtx")
    else:
        out_file = os.path.join(umi_dir, "tagcounts-dupes.mtx")
    if file_exists(out_file):
        if deduped:
            newsamples = []
            for data in dd.sample_data_iterator(samples):
                newsamples.append([dd.set_combined_counts(data, out_file)])
            return newsamples
        else:
            return samples
    files = [dd.get_count_file(data) for data in dd.sample_data_iterator(samples)
             if dd.get_count_file(data)]
    if not deduped:
        files = [os.path.splitext(x)[0] + "-dupes.mtx" for x in files]
    files = [fn for fn in files if file_exists(fn)]
    descriptions = [dd.get_sample_name(data) for data in dd.sample_data_iterator(samples)
                    if dd.get_count_file(data)]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    if deduped:
        for data in dd.sample_data_iterator(samples):
            newsamples.append([dd.set_combined_counts(data, out_file)])
        return newsamples
    return samples
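
# An illustrative sketch of the column-wise concatenation idea behind
# SparseMatrix.cat (SparseMatrix is a pipeline class; scipy.sparse is used
# here only to show the shape bookkeeping). Each per-sample matrix shares
# the gene rows and contributes sample-prefixed cell columns.
def _demo_sparse_concatenation():
    import scipy.sparse as sp
    a = sp.random(5, 3, density=0.3, format="csc", random_state=0)  # sample 1
    b = sp.random(5, 2, density=0.3, format="csc", random_state=1)  # sample 2
    combined = sp.hstack([a, b]).tocsc()  # genes x (cells_a + cells_b)
    colnames = ["S1:%d" % i for i in range(3)] + ["S2:%d" % i for i in range(2)]
    assert combined.shape == (5, 5) and len(colnames) == 5
    return combined, colnames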
def concatenate_cb_histograms(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "cb-histogram.txt")
    files = [dd.get_histogram_counts(data) for data in dd.sample_data_iterator(samples)
             if dd.get_histogram_counts(data)]
    files = " ".join(files)
    # write into the transactional temp file so file_transaction can move it
    # into place on success
    cmd = "cat {files} > {tx_out_file}"
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            message = "Concat cellular barcode histograms: %s." % files
            do.run(cmd.format(**locals()), message)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_histogram(data, out_file)])
    return newsamples
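
# A minimal stand-in for the transactional-write pattern used throughout:
# the assumed semantics of file_transaction are to hand back a temporary
# path and promote it to the final location only on success.
def _demo_file_transaction():
    import contextlib
    import os
    import shutil
    import tempfile

    @contextlib.contextmanager
    def file_transaction(out_file):
        tmpdir = tempfile.mkdtemp()
        tx_out_file = os.path.join(tmpdir, os.path.basename(out_file))
        try:
            yield tx_out_file
            if os.path.exists(tx_out_file):
                shutil.move(tx_out_file, out_file)
        finally:
            shutil.rmtree(tmpdir, ignore_errors=True)

    return file_transaction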
def run_sailfish_index(*samples):
    fq1, _ = dd.get_input_sequence_files(samples[0][0])
    kmer_size = estimate_kmer_size(fq1)
    Build = namedtuple('Build', ['build', 'ref', 'gtf'])
    builds = {Build(get_build_string(x), dd.get_ref_file(x), dd.get_gtf_file(x))
              for x in dd.sample_data_iterator(samples)}
    data = samples[0][0]
    indexdirs = {}
    for build in builds:
        indexdirs[build.build] = sailfish_index(build.ref, build.gtf, data,
                                                build.build, kmer_size)
    return samples
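
# Sketch of the namedtuple-in-a-set deduplication used above: namedtuples
# hash by value, so samples that share a (build, ref, gtf) triple collapse
# to a single entry and each genome build is indexed only once.
def _demo_build_dedup():
    from collections import namedtuple
    Build = namedtuple("Build", ["build", "ref", "gtf"])
    builds = {Build("hg38", "hg38.fa", "hg38.gtf"),
              Build("hg38", "hg38.fa", "hg38.gtf"),  # duplicate collapses
              Build("mm10", "mm10.fa", "mm10.gtf")}
    assert len(builds) == 2
    return builds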
def combine_express(samples, combined):
    """Combine tpm, effective counts and fpkm from eXpress results"""
    to_combine = [dd.get_express_counts(x) for x in dd.sample_data_iterator(samples)
                  if dd.get_express_counts(x)]
    gtf_file = dd.get_gtf_file(samples[0][0])
    isoform_to_gene_file = os.path.join(os.path.dirname(combined),
                                        "isoform_to_gene.txt")
    isoform_to_gene_file = express.isoform_to_gene_name(gtf_file,
                                                        isoform_to_gene_file)
    if len(to_combine) > 0:
        eff_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_counts"
        eff_counts_combined = count.combine_count_files(to_combine,
                                                        eff_counts_combined_file)
        to_combine = [dd.get_express_tpm(x) for x in dd.sample_data_iterator(samples)
                      if dd.get_express_tpm(x)]
        tpm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_tpm"
        tpm_counts_combined = count.combine_count_files(to_combine,
                                                        tpm_counts_combined_file)
        to_combine = [dd.get_express_fpkm(x) for x in dd.sample_data_iterator(samples)
                      if dd.get_express_fpkm(x)]
        fpkm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_fpkm"
        fpkm_counts_combined = count.combine_count_files(to_combine,
                                                         fpkm_counts_combined_file)
        return {'counts': eff_counts_combined,
                'tpm': tpm_counts_combined,
                'fpkm': fpkm_counts_combined,
                'isoform_to_gene': isoform_to_gene_file}
    return {}
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            # tz.get_in needs a sequence of keys, so a one-element tuple is
            # required here; a bare ("peak_counts") is just a string
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
    combined_peaks = count.combine_count_files(peakcounts, out_file, ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
def estimate_expression(samples, run_parallel):
    samples = run_parallel("generate_transcript_counts", samples)
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files)
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    samples = run_parallel("run_express", samples)
    express_counts_combined = combine_express(samples, combined)
    samples = run_parallel("run_cufflinks", samples)
    # gene-level FPKM
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    # isoform-level FPKM
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                      fpkm_isoform_combined_file,
                                                      ".isoform.fpkm")
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0])
                                        for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
    else:
        dexseq_combined = None
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            # the isoform setter must receive the isoform table, not the
            # gene-level fpkm_combined table
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table
    with all samples
    """
    data = samples[0][0]
    # prefer the supplied transcriptome GTF file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)
    # combine featureCounts files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)
    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)
    # combine Cufflinks files
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXSeq files
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0])
                                        for data in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data,
                                          express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table
    with all samples
    """
    data = samples[0][0]
    # prefer the supplied transcriptome GTF file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)
    # combine featureCounts files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)
    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)
    # combine Cufflinks files
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXSeq files
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0])
                                        for data in samples])
    if to_combine_dexseq:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data,
                                          express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples
def _is_trim_set(samples):
    # only the first sample is inspected; trim_reads is assumed to be set
    # uniformly across the run
    for sample in dd.sample_data_iterator(samples):
        return utils.get_in(sample, ["algorithm", "trim_reads"])
    return None