def concatenate_sparse_counts(*samples):
    """Merge per-sample sparse UMI count matrices into one tagcounts.mtx.

    Each sample's matrix is read with the sample name as a column prefix and
    concatenated column-wise; the combined file is then recorded on every
    sample. Samples with no count file are skipped; if no sample has a count
    file the samples are returned unchanged.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "tagcounts.mtx")
    if file_exists(out_file):
        # BUG FIX: previously returned the bare path string on re-runs,
        # unlike every other return from this function; record the existing
        # combined file on the samples instead so callers always get samples.
        return [[dd.set_combined_counts(data, out_file)]
                for data in dd.sample_data_iterator(samples)]
    files = [dd.get_count_file(data)
             for data in dd.sample_data_iterator(samples)
             if dd.get_count_file(data)]
    descriptions = [dd.get_sample_name(data)
                    for data in dd.sample_data_iterator(samples)
                    if dd.get_count_file(data)]
    if not files:
        return samples
    # seed the combined matrix with one sample, then concatenate the rest
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_counts(data, out_file)])
    return newsamples
def _detect_rRNA(data):
    """Sum the counts assigned to rRNA genes from the sample's count file.

    Returns a dict with a single 'rRNA' key holding the total count (0 when
    the GTF annotates no rRNA features).
    """
    gtf_file = dd.get_gtf_file(data)
    count_file = dd.get_count_file(data)
    rrna_features = gtf.get_rRNA(gtf_file)
    genes = [x[0] for x in rrna_features if x]
    if not genes:
        # no annotated rRNA features -> a count of 0, not a lookup error
        return {'rRNA': 0}
    count_table = pd.read_csv(count_file, sep="\t", names=["id", "counts"])
    # BUG FIX: the old `.ix[genes]` label-indexed a default integer index
    # with gene-name strings ("id" is a column, not the index) and `.ix` is
    # removed in modern pandas; select the matching rows explicitly.
    rrna = count_table[count_table["id"].isin(genes)]["counts"]
    return {'rRNA': sum(rrna)}
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table
    with all samples

    Returns the samples with the combined-file paths recorded on each one.
    """
    # annotation files come from the first sample; all samples share them
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    dexseq_gff = dd.get_dexseq_gff(samples[0][0])
    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)
    # combine Cufflinks files (gene-level FPKM)
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    # isoform-level FPKM
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    # fold in spike-in quantitation before recording combined outputs
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    # record each combined file on every sample so downstream steps can find it;
    # optional outputs (fpkm, dexseq, ...) are only set when they were produced
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
def _maybe_add_scrnaseq(algorithm, sample, out):
    """Add single-cell count matrix outputs to the upload list.

    The sparse matrix travels with its .rownames and .colnames sidecar
    files; when the sample has no count file, *out* is returned unchanged.
    """
    matrix = dd.get_count_file(sample)
    if not matrix:
        return out
    for suffix, ftype in (("", "mtx"),
                          (".rownames", "rownames"),
                          (".colnames", "colnames")):
        out.append({"path": matrix + suffix, "type": ftype})
    return out
def _maybe_add_barcode_histogram(algorithm, sample, out):
    """Add the cellular-barcode histogram that sits beside the count file.

    Skipped entirely when the sample has no count file.
    """
    if not dd.get_count_file(sample):
        return out
    counts_dir = os.path.dirname(sample["count_file"])
    histogram = os.path.join(counts_dir, "cb-histogram.txt")
    out.append({"path": histogram, "type": "tsv", "ext": "barcodes"})
    return out
def concatenate_sparse_matrices(samples, deduped=True):
    """Concatenate per-sample sparse UMI matrices into a single matrix.

    With deduped=True the deduplicated tagcounts.mtx files are merged and
    each sample is pointed at the combined file; with deduped=False the
    matching -dupes.mtx files are merged and the samples are returned
    unchanged. Samples without count files are skipped.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    if deduped:
        out_file = os.path.join(umi_dir, "tagcounts.mtx")
    else:
        out_file = os.path.join(umi_dir, "tagcounts-dupes.mtx")
    if file_exists(out_file):
        if deduped:
            return [[dd.set_combined_counts(data, out_file)]
                    for data in dd.sample_data_iterator(samples)]
        return samples
    files = [dd.get_count_file(data)
             for data in dd.sample_data_iterator(samples)
             if dd.get_count_file(data)]
    descriptions = [dd.get_sample_name(data)
                    for data in dd.sample_data_iterator(samples)
                    if dd.get_count_file(data)]
    if not deduped:
        # BUG FIX: filter (file, description) pairs together. Previously only
        # the file list was filtered for existence, so a missing -dupes.mtx
        # shifted the alignment and mislabelled every later sample's columns.
        pairs = [(os.path.splitext(fn)[0] + "-dupes.mtx", desc)
                 for fn, desc in zip(files, descriptions)]
        pairs = [(fn, desc) for fn, desc in pairs if file_exists(fn)]
        files = [fn for fn, _ in pairs]
        descriptions = [desc for _, desc in pairs]
    if not files:
        return samples
    # seed the combined matrix with one sample, then concatenate the rest
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    if deduped:
        return [[dd.set_combined_counts(data, out_file)]
                for data in dd.sample_data_iterator(samples)]
    return samples
def _detect_rRNA(data):
    """Compute rRNA count and rate for a sample from its count file.

    Returns {'rRNA': ..., 'rRNA_rate': ...} as strings, or "NA" values when
    the annotation has no rRNA features (or no reads were counted at all).
    """
    gtf_file = dd.get_gtf_file(data)
    count_file = dd.get_count_file(data)
    rrna_features = gtf.get_rRNA(gtf_file)
    genes = [x[0] for x in rrna_features if x]
    if not genes:
        return {'rRNA': "NA", "rRNA_rate": "NA"}
    count_table = pd.read_csv(count_file, sep="\t", names=["id", "counts"])
    rrna = sum(count_table[count_table["id"].isin(genes)]["counts"])
    total = sum(count_table["counts"])
    # BUG FIX: guard against ZeroDivisionError when no reads were assigned
    if not total:
        return {'rRNA': str(rrna), 'rRNA_rate': "NA"}
    rrna_rate = float(rrna) / total
    return {'rRNA': str(rrna), 'rRNA_rate': str(rrna_rate)}
def _maybe_add_counts(algorithm, sample, out):
    """Add the sample's count table, plus its .stats sidecar when present."""
    if not dd.get_count_file(sample):
        return out
    out.append({"path": sample["count_file"], "type": "counts", "ext": "ready"})
    stats = "%s.stats" % os.path.splitext(sample["count_file"])[0]
    if utils.file_exists(stats):
        out.append({"path": stats, "type": "count_stats", "ext": "ready"})
    return out
def _maybe_add_counts(algorithm, sample, out):
    """Upload hook: include the count table and, if it exists, its stats file."""
    if not dd.get_count_file(sample):
        return out
    base, _ = os.path.splitext(sample["count_file"])
    entries = [{"path": sample["count_file"], "type": "counts", "ext": "ready"}]
    stats_file = base + ".stats"
    if utils.file_exists(stats_file):
        entries.append({"path": stats_file, "type": "count_stats", "ext": "ready"})
    out.extend(entries)
    return out
def concatenate_sparse_matrices(samples, deduped=True):
    """Merge per-sample sparse UMI count matrices into one combined matrix.

    deduped=True merges the deduplicated tagcounts.mtx files and records the
    combined file on every sample; deduped=False merges the -dupes.mtx files
    and returns the samples unchanged.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    if deduped:
        out_file = os.path.join(umi_dir, "tagcounts.mtx")
    else:
        out_file = os.path.join(umi_dir, "tagcounts-dupes.mtx")
    if file_exists(out_file):
        if deduped:
            newsamples = []
            for data in dd.sample_data_iterator(samples):
                newsamples.append([dd.set_combined_counts(data, out_file)])
            return newsamples
        return samples
    files = [dd.get_count_file(data)
             for data in dd.sample_data_iterator(samples)
             if dd.get_count_file(data)]
    descriptions = [dd.get_sample_name(data)
                    for data in dd.sample_data_iterator(samples)
                    if dd.get_count_file(data)]
    if not deduped:
        # BUG FIX: keep files and sample names paired while filtering for
        # existing -dupes.mtx files; filtering the file list on its own let
        # the two lists drift apart, attaching the wrong sample prefix to
        # columns whenever a dupes matrix was missing.
        kept = [(os.path.splitext(fn)[0] + "-dupes.mtx", name)
                for fn, name in zip(files, descriptions)]
        kept = [(fn, name) for fn, name in kept if file_exists(fn)]
        files = [fn for fn, _ in kept]
        descriptions = [name for _, name in kept]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    if deduped:
        newsamples = []
        for data in dd.sample_data_iterator(samples):
            newsamples.append([dd.set_combined_counts(data, out_file)])
        return newsamples
    return samples
def _maybe_add_scrnaseq(algorithm, sample, out):
    """Add single-cell count matrices (deduped and, if present, with-dupes).

    Each matrix contributes three entries: the .mtx file itself plus its
    .rownames and .colnames sidecars. Returns *out* unchanged when the
    sample has no count file.
    """
    count_file = dd.get_count_file(sample)
    if not count_file:
        return out

    def matrix_entries(mtx):
        # one upload entry per matrix component, in the conventional order
        return [{"path": mtx, "type": "mtx"},
                {"path": mtx + ".rownames", "type": "rownames"},
                {"path": mtx + ".colnames", "type": "colnames"}]

    out.extend(matrix_entries(count_file))
    umi_file = os.path.splitext(count_file)[0] + "-dupes.mtx"
    if utils.file_exists(umi_file):
        out.extend(matrix_entries(umi_file))
    return out
def concatenate_sparse_counts(*samples):
    """Combine every sample's sparse UMI count matrix into tagcounts.mtx.

    Column-concatenates the matrices with sample names as column prefixes,
    writes the combined matrix under work_dir/umis and records it on each
    sample. If no sample has a count file, the samples come back unchanged.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "tagcounts.mtx")
    if file_exists(out_file):
        # BUG FIX: this branch used to return the path string while all other
        # exits return sample lists; attach the existing file to the samples
        # so the caller gets a consistent return type on re-runs.
        newsamples = []
        for data in dd.sample_data_iterator(samples):
            newsamples.append([dd.set_combined_counts(data, out_file)])
        return newsamples
    files = [dd.get_count_file(data)
             for data in dd.sample_data_iterator(samples)
             if dd.get_count_file(data)]
    descriptions = [dd.get_sample_name(data)
                    for data in dd.sample_data_iterator(samples)
                    if dd.get_count_file(data)]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_counts(data, out_file)])
    return newsamples
def estimate_expression(samples, run_parallel):
    """Run counting, eXpress and Cufflinks, then combine results per sample.

    Executes the quantitation steps through run_parallel, combines the
    per-sample outputs into shared tables and records each combined file on
    every sample. Returns the updated samples.
    """
    samples = run_parallel("generate_transcript_counts", samples)
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files)
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    samples = run_parallel("run_express", samples)
    express_counts_combined = combine_express(samples, combined)
    samples = run_parallel("run_cufflinks", samples)
    # gene-level FPKM
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    # isoform-level FPKM
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                      fpkm_isoform_combined_file,
                                                      ".isoform.fpkm")
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
    else:
        dexseq_combined = None
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            # BUG FIX: previously stored the gene-level fpkm_combined file
            # under the isoform key; use the isoform-level table instead.
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table
    with all samples

    Returns the samples with each combined-file path recorded on them.
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)
    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)
    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)
    # combine Cufflinks files (gene-level FPKM); only when something to combine
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    # isoform-level FPKM
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    # fold in spike-in quantitation before recording combined outputs
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    # record each combined file on every sample; optional outputs are only
    # set when they were actually produced above
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table
    with all samples

    Returns the samples with each combined-file path recorded on them.
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)
    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)
    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)
    # combine Cufflinks files (gene-level FPKM)
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    # isoform-level FPKM
    isoform_files = filter_missing(
        [dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined_file = os.path.splitext(
            combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(
            isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    to_combine_dexseq = filter_missing(
        [dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file,
                                                    ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    # fold in spike-in quantitation before recording combined outputs
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    # record each combined file on every sample; optional outputs are only
    # set when they were actually produced above
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(
                data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples