def estimate_expression(samples, run_parallel):
    samples = run_parallel("generate_transcript_counts", samples)
    combined = count.combine_count_files([x[0]["count_file"] for x in samples
                                          if "count_file" in x[0]])
    gtf_file = get_in(samples[0][0], ('genome_resources', 'rnaseq', 'transcripts'),
                      None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    samples = run_parallel("run_cufflinks", samples)
    # gene
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    to_combine = [x[0]["fpkm"] for x in samples if "fpkm" in x[0]]
    fpkm_combined = count.combine_count_files(to_combine, fpkm_combined_file)
    # isoform
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    to_combine_isoform = [x[0]["fpkm_isoform"] for x in samples
                          if "fpkm_isoform" in x[0]]
    fpkm_isoform_combined = count.combine_count_files(to_combine_isoform,
                                                      fpkm_isoform_combined_file,
                                                      ".isoform.fpkm")
    for x in samples:
        x[0]["combined_counts"] = combined
        if annotated:
            x[0]["annotated_combined_counts"] = annotated
        if fpkm_combined:
            x[0]["combined_fpkm"] = fpkm_combined
        if fpkm_isoform_combined:
            x[0]["combined_fpkm_isoform"] = fpkm_isoform_combined
    return samples
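# count.combine_count_files is used throughout these excerpts but not defined here.
# A minimal sketch of the behaviour implied by the call sites — signature
# combine_count_files(files, out_file=None, ext=".counts") — assuming each input is
# a two-column <id><TAB><value> table and that sample names come from stripping
# `ext` off the file name; the real bcbio implementation may differ:
import os
import pandas as pd

def combine_count_files(files, out_file=None, ext=".counts"):
    """Merge per-sample two-column count tables into one table, one column per sample."""
    if not out_file:
        out_file = os.path.join(os.path.dirname(files[0]), "combined.counts")
    names = [os.path.basename(fname).replace(ext, "") for fname in files]
    frames = []
    for fname, name in zip(files, names):
        # each file: feature id in the first column, the sample's value in the second
        frames.append(pd.read_csv(fname, sep="\t", header=None,
                                  names=["id", name], index_col="id"))
    combined = pd.concat(frames, axis=1)
    combined.to_csv(out_file, sep="\t", index_label="id")
    return out_file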
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples
    """
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    dexseq_gff = dd.get_dexseq_gff(samples[0][0])

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None

    # combine DEXseq files
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0])
                                        for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None

    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
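# filter_missing is relied on above but not shown in this excerpt. A minimal sketch,
# assuming it simply drops entries that are None or that do not exist on disk
# (the actual helper may only drop falsy entries):
import os

def filter_missing(files):
    """Remove None entries and paths that do not point to existing files."""
    return [fname for fname in files if fname and os.path.exists(fname)]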
def combine_express(samples, combined):
    """Combine tpm, effective counts and fpkm from express results"""
    to_combine = [x[0]["eff_counts"] for x in samples if "eff_counts" in x[0]]
    if len(to_combine) > 0:
        eff_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.eff_counts"
        eff_counts_combined = count.combine_count_files(to_combine,
                                                        eff_counts_combined_file)
        to_combine = [x[0]["tpm_counts"] for x in samples if "tpm_counts" in x[0]]
        tpm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.tpm"
        tpm_counts_combined = count.combine_count_files(to_combine,
                                                        tpm_counts_combined_file)
        to_combine = [x[0]["fpkm_counts"] for x in samples if "fpkm_counts" in x[0]]
        fpkm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.eff_fpkm"
        fpkm_counts_combined = count.combine_count_files(to_combine,
                                                         fpkm_counts_combined_file)
        return {'counts': eff_counts_combined, 'tpm': tpm_counts_combined,
                'fpkm': fpkm_counts_combined}
    return None
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            # key path must be a sequence of keys for tz.get_in
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            if bam.is_paired(dd.get_work_bam(data)):
                peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
            else:
                logger.info(f"Creating peak table from full BAM file because "
                            f"{dd.get_work_bam(data)} is single-ended.")
                peakcounts.append(tz.get_in(("peak_counts", "full"), data))
    combined_peaks = count.combine_count_files(peakcounts, out_file, ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
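# A quick, standalone illustration (not part of the pipeline) of why the key path
# passed to toolz.get_in above must be a tuple or list: get_in iterates over its
# keys argument, so a bare string would be walked character by character and the
# lookup would fall back to the default instead of returning the stored file path.
import toolz as tz

data = {"peak_counts": {"NF": "sample1-NF.counts", "full": "sample1-full.counts"}}
assert tz.get_in(["peak_counts", "NF"], data) == "sample1-NF.counts"
assert tz.get_in("peak_counts", data) is None  # string walked per character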
def run(self, config, config_file, parallel, dirs, samples):
    with prun.start(_wres(parallel, ["picard"]),
                    samples, config, dirs, "trimming") as run_parallel:
        samples = run_parallel("process_lane", samples)
        samples = run_parallel("trim_lane", samples)
    with prun.start(_wres(parallel, ["aligner"],
                          ensure_mem={"tophat": 8, "tophat2": 8, "star": 30}),
                    samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        samples = disambiguate.split(samples)
        samples = run_parallel("process_alignment", samples)
        samples = disambiguate.resolve(samples, run_parallel)
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        samples = rnaseq.estimate_expression(samples, run_parallel)
        #samples = rnaseq.detect_fusion(samples, run_parallel)
        combined = combine_count_files([x[0].get("count_file") for x in samples])
        gtf_file = utils.get_in(samples[0][0],
                                ('genome_resources', 'rnaseq', 'transcripts'), None)
        annotated = annotate_combined_count_file(combined, gtf_file)
        for x in samples:
            x[0]["combined_counts"] = combined
            if annotated:
                x[0]["annotated_combined_counts"] = annotated
    with prun.start(_wres(parallel, ["picard", "fastqc", "rnaseqc"]),
                    samples, config, dirs, "persample") as run_parallel:
        samples = qcsummary.generate_parallel(samples, run_parallel)
    return samples
def test_dexseq_combine(self):
    count_files = test_data.DEXSEQ_COUNT_FILES
    test_file = os.path.join(self.out_dir, "dexseq-combined.txt")
    out_file = count.combine_count_files(count_files, out_file=test_file,
                                         ext=".txt")
    self.assertTrue(file_exists(out_file))
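# file_exists in the assertion above comes from bcbio's utility module; a minimal
# sketch of the assumed behaviour (the real helper may differ in detail):
import os

def file_exists(fname):
    """Return True if fname is set, exists on disk and is non-empty."""
    return bool(fname) and os.path.exists(fname) and os.path.getsize(fname) > 0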
def run(self, config, config_file, parallel, dirs, samples):
    with prun.start(parallel, samples, config, dirs, "trimming") as run_parallel:
        samples = run_parallel("trim_lane", samples)
    with prun.start(_wprogs(parallel, ["aligner"],
                            {"tophat": 8, "tophat2": 8, "star": 30}),
                    samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        samples = disambiguate.split(samples)
        samples = run_parallel("process_alignment", samples)
        samples = disambiguate.resolve(samples, run_parallel)
    with prun.start(_wprogs(parallel, ["samtools", "gatk", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        samples = rnaseq.estimate_expression(samples, run_parallel)
        # samples = rnaseq.detect_fusion(samples, run_parallel)
        combined = combine_count_files([x[0].get("count_file") for x in samples])
        organism = utils.get_in(samples[0][0],
                                ("genome_resources", "aliases", "ensembl"), None)
        annotated = annotate_combined_count_file(combined, organism)
        for x in samples:
            x[0]["combined_counts"] = combined
            x[0]["annotated_combined_counts"] = annotated
    with prun.start(_wprogs(parallel, ["picard", "fastqc", "rnaseqc"]),
                    samples, config, dirs, "persample") as run_parallel:
        samples = qcsummary.generate_parallel(samples, run_parallel)
    return samples
def estimate_expression(samples, run_parallel):
    samples = run_parallel("generate_transcript_counts", samples)
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files)
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    samples = run_parallel("run_express", samples)
    express_counts_combined = combine_express(samples, combined)
    samples = run_parallel("run_cufflinks", samples)
    # gene
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    # isoform
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                      fpkm_isoform_combined_file,
                                                      ".isoform.fpkm")
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0])
                                        for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
    else:
        dexseq_combined = None
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
def combine_express(samples, combined):
    """Combine tpm, effective counts and fpkm from express results"""
    if not combined:
        return None
    to_combine = [dd.get_express_counts(x) for x in dd.sample_data_iterator(samples)
                  if dd.get_express_counts(x)]
    gtf_file = dd.get_gtf_file(samples[0][0])
    isoform_to_gene_file = os.path.join(os.path.dirname(combined),
                                        "isoform_to_gene.txt")
    isoform_to_gene_file = express.isoform_to_gene_name(
        gtf_file, isoform_to_gene_file, next(dd.sample_data_iterator(samples)))
    if len(to_combine) > 0:
        eff_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_counts"
        eff_counts_combined = count.combine_count_files(to_combine,
                                                        eff_counts_combined_file,
                                                        ext=".counts")
        to_combine = [dd.get_express_tpm(x) for x in dd.sample_data_iterator(samples)
                      if dd.get_express_tpm(x)]
        tpm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_tpm"
        tpm_counts_combined = count.combine_count_files(to_combine,
                                                        tpm_counts_combined_file)
        to_combine = [dd.get_express_fpkm(x) for x in dd.sample_data_iterator(samples)
                      if dd.get_express_fpkm(x)]
        fpkm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_fpkm"
        fpkm_counts_combined = count.combine_count_files(to_combine,
                                                         fpkm_counts_combined_file,
                                                         ext=".fpkm")
        return {'counts': eff_counts_combined, 'tpm': tpm_counts_combined,
                'fpkm': fpkm_counts_combined,
                'isoform_to_gene': isoform_to_gene_file}
    return {}
def estimate_expression(samples, run_parallel):
    samples = run_parallel("generate_transcript_counts", samples)
    combined = count.combine_count_files([x[0]["count_file"] for x in samples
                                          if "count_file" in x[0]])
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    samples = run_parallel("run_cufflinks", samples)
    # gene
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    to_combine = [x[0]["fpkm"] for x in samples if "fpkm" in x[0]]
    fpkm_combined = count.combine_count_files(to_combine, fpkm_combined_file)
    # isoform
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    to_combine_isoform = [x[0]["fpkm_isoform"] for x in samples
                          if "fpkm_isoform" in x[0]]
    fpkm_isoform_combined = count.combine_count_files(to_combine_isoform,
                                                      fpkm_isoform_combined_file,
                                                      ".isoform.fpkm")
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    # drop samples without DEXSeq counts; use a list so the truth test works
    to_combine_dexseq = [dd.get_dexseq_counts(data[0]) for data in samples]
    to_combine_dexseq = [x for x in to_combine_dexseq if x]
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
    else:
        dexseq_combined = None
    for x in samples:
        x[0]["combined_counts"] = combined
        if annotated:
            x[0]["annotated_combined_counts"] = annotated
        if fpkm_combined:
            x[0]["combined_fpkm"] = fpkm_combined
        if fpkm_isoform_combined:
            x[0]["combined_fpkm_isoform"] = fpkm_isoform_combined
        if dexseq_combined:
            x[0] = dd.set_dexseq_counts(x[0], dexseq_combined_file)
    return samples
def combine_express(samples, combined):
    """Combine tpm, effective counts and fpkm from express results"""
    to_combine = [dd.get_express_counts(x) for x in dd.sample_data_iterator(samples)
                  if dd.get_express_counts(x)]
    gtf_file = dd.get_gtf_file(samples[0][0])
    isoform_to_gene_file = os.path.join(os.path.dirname(combined),
                                        "isoform_to_gene.txt")
    isoform_to_gene_file = express.isoform_to_gene_name(gtf_file,
                                                        isoform_to_gene_file)
    if len(to_combine) > 0:
        eff_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_counts"
        eff_counts_combined = count.combine_count_files(to_combine,
                                                        eff_counts_combined_file)
        to_combine = [dd.get_express_tpm(x) for x in dd.sample_data_iterator(samples)
                      if dd.get_express_tpm(x)]
        tpm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_tpm"
        tpm_counts_combined = count.combine_count_files(to_combine,
                                                        tpm_counts_combined_file)
        to_combine = [dd.get_express_fpkm(x) for x in dd.sample_data_iterator(samples)
                      if dd.get_express_fpkm(x)]
        fpkm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_fpkm"
        fpkm_counts_combined = count.combine_count_files(to_combine,
                                                         fpkm_counts_combined_file)
        return {'counts': eff_counts_combined, 'tpm': tpm_counts_combined,
                'fpkm': fpkm_counts_combined,
                'isoform_to_gene': isoform_to_gene_file}
    return {}
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
    lane_items = run_parallel("trim_lane", lane_items)
    samples = disambiguate.split(lane_items)
    samples = run_parallel("process_alignment", samples)
    samples = disambiguate.resolve(samples, run_parallel)
    samples = run_parallel("generate_transcript_counts", samples)
    combined = combine_count_files([x[0].get("count_file") for x in samples])
    for x in samples:
        x[0]["combined_counts"] = combined
    samples = qcsummary.generate_parallel(samples, run_parallel)
    #run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    return samples
def estimate_expression(samples, run_parallel):
    samples = run_parallel("generate_transcript_counts", samples)
    combined = count.combine_count_files([x[0]["count_file"] for x in samples
                                          if "count_file" in x[0]])
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    samples = run_parallel("run_cufflinks", samples)
    # gene
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    to_combine = [x[0]["fpkm"] for x in samples if "fpkm" in x[0]]
    fpkm_combined = count.combine_count_files(to_combine, fpkm_combined_file)
    # isoform
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    to_combine_isoform = [x[0]["fpkm_isoform"] for x in samples
                          if "fpkm_isoform" in x[0]]
    fpkm_isoform_combined = count.combine_count_files(to_combine_isoform,
                                                      fpkm_isoform_combined_file,
                                                      ".isoform.fpkm")
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    # drop samples without DEXSeq counts; use a list so the truth test works
    to_combine_dexseq = [dd.get_dexseq_counts(data[0]) for data in samples]
    to_combine_dexseq = [x for x in to_combine_dexseq if x]
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
    else:
        dexseq_combined = None
    for x in samples:
        x[0]["combined_counts"] = combined
        if annotated:
            x[0]["annotated_combined_counts"] = annotated
        if fpkm_combined:
            x[0]["combined_fpkm"] = fpkm_combined
        if fpkm_isoform_combined:
            x[0]["combined_fpkm_isoform"] = fpkm_isoform_combined
        if dexseq_combined:
            x[0] = dd.set_dexseq_counts(x[0], dexseq_combined_file)
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    with prun.start(_wres(parallel, ["picard", "AlienTrimmer"]),
                    samples, config, dirs, "trimming") as run_parallel:
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("process_lane", samples)
            samples = run_parallel("trim_lane", samples)
    with prun.start(_wres(parallel, ["aligner", "picard"],
                          ensure_mem={"tophat": 8, "tophat2": 8, "star": 40}),
                    samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment", dirs):
            samples = disambiguate.split(samples)
            samples = run_parallel("process_alignment", samples)
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("estimate expression", dirs):
            samples = rnaseq.estimate_expression(samples, run_parallel)
            combined = combine_count_files([x[0].get("count_file") for x in samples])
            gtf_file = utils.get_in(samples[0][0],
                                    ('genome_resources', 'rnaseq', 'transcripts'),
                                    None)
            annotated = annotate_combined_count_file(combined, gtf_file)
            for x in samples:
                x[0]["combined_counts"] = combined
                if annotated:
                    x[0]["annotated_combined_counts"] = annotated
    with prun.start(_wres(parallel, ["picard", "fastqc", "rnaseqc"]),
                    samples, config, dirs, "persample") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
    lane_items = run_parallel("trim_lane", lane_items)
    samples = disambiguate.split(lane_items)
    samples = run_parallel("process_alignment", samples)
    samples = disambiguate.resolve(samples, run_parallel)
    samples = rnaseq.estimate_expression(samples, run_parallel)
    #samples = rnaseq.detect_fusion(samples, run_parallel)
    combined = combine_count_files([x[0].get("count_file") for x in samples])
    organism = utils.get_in(samples[0][0],
                            ('genome_resources', 'aliases', 'ensembl'), None)
    annotated = annotate_combined_count_file(combined, organism)
    for x in samples:
        x[0]["combined_counts"] = combined
        x[0]["annotated_combined_counts"] = annotated
    samples = qcsummary.generate_parallel(samples, run_parallel)
    #run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    return samples
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            # key path must be a sequence of keys for tz.get_in
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
    combined_peaks = count.combine_count_files(peakcounts, out_file, ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None

    # combine DEXseq files
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0])
                                        for data in samples])
    if to_combine_dexseq:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None

    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data,
                                          express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None

    # combine DEXseq files
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0])
                                        for data in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None

    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data,
                                          express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples
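# For reference, the tx2gene.csv written above is assumed to be a plain two-column
# transcript-to-gene map with no header (the format tximport-style tools expect);
# the IDs below are illustrative Ensembl identifiers:
#
#   ENST00000456328,ENSG00000223972
#   ENST00000450305,ENSG00000223972
#
# Reading it back into a dict, under that assumption:
import csv

def read_tx2gene(tx2gene_file):
    """Map transcript IDs to gene IDs from a headerless two-column CSV."""
    with open(tx2gene_file) as in_handle:
        return {tx: gene for tx, gene in csv.reader(in_handle)}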