def tagcount(data):
    """Count reads per cell barcode and transcript with ``umis tagcount``.

    Writes a sparse matrix (.mtx plus .rownames/.colnames) under
    work_dir/umis/<sample>/ and records it via dd.set_count_file.
    Returns the bcbio-style nested sample list ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_file = os.path.join(sample_dir, dd.get_sample_name(data) + ".mtx")
    # re-entrancy: skip counting if the matrix already exists
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if gtf_file:
        # use the basename of the GTF: joining the raw splitext result would
        # discard the annotation directory whenever gtf_file is an absolute
        # path (os.path.join resets on absolute components)
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation",
            os.path.basename(os.path.splitext(gtf_file)[0]) + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""
    message = "Counting alignments of transcripts in %s." % bam
    # NOTE: cmd is filled with format(**locals()), so the local variable
    # names above must match the placeholders exactly
    cmd = ("{umis} tagcount {positional} --cb_cutoff {cutoff} --sparse "
           "{gene_map_flag}"
           "--cb_histogram {cb_histogram} {bam} {tx_out_file}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
def create_combined_tx2gene(data):
    """Concatenate per-genome tx2gene CSV maps into one combined tx2gene.csv.

    Handles disambiguated samples by generating (or reusing) one
    transcript-to-gene CSV per genome build, then cat-ing them together
    under work_dir/inputs/transcriptome/. Returns the combined file path.
    """
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    tx2gene_files = []
    for item in disambiguate.split([data]):
        sub_data = item[0]
        # prefer an explicitly supplied transcriptome GTF, falling back to
        # the installed genome annotation
        gtf_file = dd.get_transcriptome_gtf(sub_data)
        if not gtf_file:
            gtf_file = dd.get_gtf_file(sub_data)
        genome_csv = os.path.join(
            out_dir, dd.get_genome_build(sub_data) + "-tx2gene.csv")
        if not file_exists(genome_csv):
            genome_csv = gtf.tx2genefile(gtf_file, genome_csv, tsv=False)
        tx2gene_files.append(genome_csv)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file
    file_list = " ".join(tx2gene_files)
    with file_transaction(data, combined_file) as tx_out_file:
        do.run("cat {} > {}".format(file_list, tx_out_file),
               "Combining tx2gene CSV files.")
    return combined_file
def tagcount(data):
    """Count reads per cell barcode and transcript with ``umis fasttagcount``.

    Produces a sparse count matrix (.mtx with .rownames/.colnames) under
    work_dir/umis/<sample>/ and, when enabled, a parallel "-dupes" UMI
    matrix. Records the count file on the sample and returns ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    # re-entrancy: reuse an existing matrix rather than recounting
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    # choose the annotation source: installed genome GTF vs supplied
    # transcriptome GTF
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)
    if gtf_file:
        # basename keeps the join inside the annotation directory even when
        # gtf_file is an absolute path
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation",
            os.path.basename(os.path.splitext(gtf_file)[0]) + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""
    message = "Counting alignments of transcripts in %s." % bam
    # NOTE: cmd placeholders are resolved with format(**locals()); the local
    # variable names in this function are part of the command contract
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file, umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    if has_umi_matrix(data):
        umi_matrix_flag = " --umi_matrix {tx_umi_matrix_full} "
    else:
        umi_matrix_flag = ""
    cmd += umi_matrix_flag
    cmd += " {bam} {tx_out_file_full}"
    with file_transaction(out_files) as tx_out_files:
        # fasttagcount writes dense ".full" outputs first; they are converted
        # to sparse format in a second pass below
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        do.run(cmd.format(**locals()), message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
        if has_umi_matrix(data):
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
def tagcount(data):
    """Count reads per cell barcode and transcript with ``umis fasttagcount``.

    Produces a sparse count matrix (.mtx with .rownames/.colnames) under
    work_dir/umis/<sample>/ and, when enabled, a parallel "-dupes" UMI
    matrix. Records the count file on the sample and returns ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    # re-entrancy: reuse an existing matrix rather than recounting
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)
    if gtf_file:
        # use the basename of the GTF: joining the raw splitext result would
        # discard the annotation directory whenever gtf_file is an absolute
        # path (os.path.join resets on absolute components)
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation",
            os.path.basename(os.path.splitext(gtf_file)[0]) + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""
    message = "Counting alignments of transcripts in %s." % bam
    # NOTE: cmd placeholders are resolved with format(**locals()); the local
    # variable names in this function are part of the command contract
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file, umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    if has_umi_matrix(data):
        umi_matrix_flag = " --umi_matrix {tx_umi_matrix_full} "
    else:
        umi_matrix_flag = ""
    cmd += umi_matrix_flag
    cmd += " {bam} {tx_out_file_full}"
    with file_transaction(out_files) as tx_out_files:
        # fasttagcount writes dense ".full" outputs first; they are converted
        # to sparse format in a second pass below
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        do.run(cmd.format(**locals()), message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
        if has_umi_matrix(data):
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
def combine_sailfish(samples):
    """Merge per-sample sailfish quantifications into combined tables.

    Builds a tidy long-format table, transcript-level and gene-level TPM
    matrices, and a tx2gene map under work_dir/sailfish/, then attaches the
    resulting file paths to every sample. Returns the updated nested sample
    list (or ``samples`` unchanged when nothing needs combining).
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    # split into samples with/without sailfish output; only the latter group
    # feeds the combined tables
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    # only rebuild when any of the four outputs is missing
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file, tx2gene]]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            # wide matrix: transcripts x samples, values = TPM
            df.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot("id", "sample", "tpm")
            # map transcript ids to gene ids and sum TPM per gene
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
def combine_sailfish(samples):
    """Merge per-sample sailfish quantifications into combined tables.

    Builds a tidy long-format table, transcript- and gene-level TPM matrices,
    and a tx2gene map under work_dir/sailfish/, attaching the resulting file
    paths to every sample. Returns the updated nested sample list, or the
    input unchanged when no sample has sailfish output to combine.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    targets = [gene_tpm_file, tidy_file, transcript_tpm_file, tx2gene]
    if not all(file_exists(target) for target in targets):
        logger.info("Combining count files into %s." % tidy_file)
        merged = pd.DataFrame()
        for data in to_combine:
            parsed = _sailfish_expression_parser(dd.get_sailfish(data),
                                                 dd.get_sample_name(data))
            merged = parsed if merged.empty else rbind([merged, parsed])
        merged["id"] = merged.index
        # some annotation versions repeat transcript entries; keep one row
        # per (transcript, sample) pair
        merged = merged.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            merged.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            merged.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            by_transcript = merged.pivot("id", "sample", "tpm")
            gene_map = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                              orient="index")
            gene_map.columns = ["gene_id"]
            by_gene = by_transcript.join(gene_map).groupby("gene_id").agg(np.sum)
            by_gene.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
def create_combined_tx2gene(data):
    """Build one combined tx2gene.csv from per-genome transcript-gene maps.

    For disambiguated samples, generates (or reuses) a transcript-to-gene
    CSV per genome build under work_dir/inputs/transcriptome/ and cats them
    into a single tx2gene.csv, returning its path.
    """
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    tx2gene_files = []
    for item in disambiguate.split([data]):
        sub_data = item[0]
        gtf_file = dd.get_gtf_file(sub_data)
        genome_csv = os.path.join(
            out_dir, dd.get_genome_build(sub_data) + "-tx2gene.csv")
        if not file_exists(genome_csv):
            genome_csv = gtf.tx2genefile(gtf_file, genome_csv, tsv=False)
        tx2gene_files.append(genome_csv)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file
    file_list = " ".join(tx2gene_files)
    with file_transaction(data, combined_file) as tx_out_file:
        do.run("cat {} > {}".format(file_list, tx_out_file),
               "Combining tx2gene CSV files.")
    return combined_file
def combine_files(samples):
    """Combine per-sample quantitation (counts/FPKM/TPM/DEXseq) into
    single tables covering all samples.

    Writes combined featureCounts, eXpress, Cufflinks FPKM (gene and
    isoform), DEXseq and spike-in tables, plus a tx2gene map, and records
    each resulting file on every sample. Returns the updated nested
    sample list.
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)
    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    # add tx2gene file (comma-separated: tsv=False matches the .csv suffix)
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation",
                                "tx2gene.csv")
    if gtf_file:
        tx2gene_file = tx2genefile(gtf_file, tx2gene_file, tsv=False)
    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)
    # combine Cufflinks files
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined = count.combine_count_files(fpkm_files,
                                                  fpkm_combined_file)
    else:
        fpkm_combined = None
    fpkm_isoform_combined_file = os.path.splitext(
        combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing(
        [dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined = count.combine_count_files(
            isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing(
        [dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file,
                                                    ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    # attach each combined file to every sample, skipping outputs that
    # could not be produced
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data,
                                         express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(
                data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples
def combine_files(samples):
    """Combine per-sample quantitation (counts/FPKM/TPM/DEXseq) into
    single tables covering all samples.

    Writes combined featureCounts, eXpress, Cufflinks FPKM (gene and
    isoform), DEXseq and spike-in tables, plus a tx2gene map, and records
    each resulting file on every sample. Returns the updated nested
    sample list.
    """
    data = samples[0][0]
    gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)
    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    # add tx2gene file; tsv=False so the output is comma-separated,
    # matching the .csv suffix and the other tx2gene CSV writers
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation",
                                "tx2gene.csv")
    if gtf_file:
        tx2gene_file = tx2genefile(gtf_file, tx2gene_file, tsv=False)
    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)
    # combine Cufflinks files
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined = count.combine_count_files(fpkm_files,
                                                  fpkm_combined_file)
    else:
        fpkm_combined = None
    fpkm_isoform_combined_file = os.path.splitext(
        combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing(
        [dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined = count.combine_count_files(
            isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing(
        [dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file,
                                                    ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    # attach each combined file to every sample, skipping outputs that
    # could not be produced
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data,
                                         express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(
                data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples