Пример #1
0
def estimate_expression(samples, run_parallel):
    """Quantify expression and attach the combined tables to every sample.

    Runs the "generate_transcript_counts" and "run_cufflinks" steps through
    ``run_parallel``, combines the per-sample count, gene FPKM and isoform
    FPKM files, and stores the combined file paths on the first item of each
    sample. Returns the samples produced by the "run_cufflinks" step.
    """
    samples = run_parallel("generate_transcript_counts", samples)
    count_files = [item[0]["count_file"] for item in samples
                   if "count_file" in item[0]]
    combined = count.combine_count_files(count_files)
    transcripts_key = ('genome_resources', 'rnaseq', 'transcripts')
    gtf_file = get_in(samples[0][0], transcripts_key, None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    samples = run_parallel("run_cufflinks", samples)
    # All FPKM tables share the combined count file's name minus extension.
    stem = os.path.splitext(combined)[0]

    # gene-level FPKM
    gene_fpkm_files = [item[0]["fpkm"] for item in samples if "fpkm" in item[0]]
    fpkm_combined = count.combine_count_files(gene_fpkm_files, stem + ".fpkm")

    # isoform-level FPKM
    isoform_fpkm_files = [item[0]["fpkm_isoform"] for item in samples
                          if "fpkm_isoform" in item[0]]
    fpkm_isoform_combined = count.combine_count_files(isoform_fpkm_files,
                                                      stem + ".isoform.fpkm",
                                                      ".isoform.fpkm")

    # Optional outputs are only recorded when they were actually produced.
    optional_outputs = (
        ("annotated_combined_counts", annotated),
        ("combined_fpkm", fpkm_combined),
        ("combined_fpkm_isoform", fpkm_isoform_combined),
    )
    for item in samples:
        info = item[0]
        info["combined_counts"] = combined
        for key, path in optional_outputs:
            if path:
                info[key] = path
    return samples
Пример #2
0
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples

    Combines the per-sample featureCounts, eXpress, Cufflinks (gene and
    isoform FPKM) and DEXSeq outputs, passes the samples through
    ``spikein.combine_spikein`` and records the combined file paths on every
    sample.

    The Cufflinks and DEXSeq tables are named after the combined count file,
    so they are skipped when no combined count file is available instead of
    failing inside ``os.path.splitext``.

    Returns a list of single-item lists, one per item yielded by
    ``dd.sample_data_iterator``.
    """
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    dexseq_gff = dd.get_dexseq_gff(samples[0][0])

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    # NOTE(review): presumably falsy when there was nothing to combine --
    # confirm against count.combine_count_files.
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        # Only annotate when a combined DEXSeq table was actually produced.
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
Пример #3
0
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples

    Combines the per-sample featureCounts, eXpress, Cufflinks (gene and
    isoform FPKM) and DEXSeq outputs, passes the samples through
    ``spikein.combine_spikein`` and records the combined file paths on every
    sample.

    Every derived table takes its name from the combined count file. When
    that file is missing the derived tables are skipped rather than letting
    ``os.path.splitext`` raise on a non-path value.

    Returns a list of single-item lists, one per item yielded by
    ``dd.sample_data_iterator``.
    """
    first_data = samples[0][0]
    gtf_file = dd.get_gtf_file(first_data, None)
    dexseq_gff = dd.get_dexseq_gff(first_data)

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    # NOTE(review): presumably falsy when there was nothing to combine --
    # confirm against count.combine_count_files.
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_combined = None
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)

    fpkm_isoform_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")

    # combine DEXseq files
    dexseq_combined = None
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(x[0]) for x in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        # Only annotate when a combined DEXSeq table was actually produced.
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)

    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
Пример #4
0
def combine_express(samples, combined):
    """Combine the per-sample eXpress outputs into combined tables.

    Gathers the "eff_counts", "tpm_counts" and "fpkm_counts" entries from the
    first item of each sample and combines each set into a file named after
    ``combined``. Returns a dict with "counts", "tpm" and "fpkm" entries, or
    None when no sample carries "eff_counts".
    """
    def _collect(key):
        # Per-sample files stored under ``key``; samples without it are skipped.
        return [x[0][key] for x in samples if key in x[0]]

    eff_counts = _collect("eff_counts")
    if not eff_counts:
        return None

    stem = os.path.splitext(combined)[0]
    eff_counts_combined = count.combine_count_files(
        eff_counts, stem + ".isoform.eff_counts")
    tpm_counts_combined = count.combine_count_files(
        _collect("tpm_counts"), stem + ".isoform.tpm")
    fpkm_counts_combined = count.combine_count_files(
        _collect("fpkm_counts"), stem + ".isoform.eff_fpkm")
    return {
        'counts': eff_counts_combined,
        'tpm': tpm_counts_combined,
        'fpkm': fpkm_counts_combined,
    }
Пример #5
0
def create_peaktable(samples):
    """Build one combined peak-count table and attach it to every sample.

    The ChIP method of the first sample decides which per-sample count entry
    is collected: "peak_counts" for "chip"; for "atac" the ("peak_counts",
    "NF") entry when the work BAM is paired, otherwise ("peak_counts",
    "full"). The combined table is written under the "consensus" folder of
    the work directory and stored at ("peak_counts", "peaktable") of each
    sample. Returns the samples rebuilt from the updated data items.
    """
    first = dd.get_data_from_sample(samples[0])
    out_file = os.path.join(dd.get_work_dir(first), "consensus",
                            "consensus-counts.tsv")
    peakcounts = []
    if dd.get_chip_method(first) == "chip":
        # NOTE(review): ("peak_counts") is a plain string, not a 1-tuple, so
        # the key sequence handed to get_in is the string itself. Confirm
        # this matches how the value is stored before changing it.
        peakcounts = [tz.get_in(("peak_counts"), item)
                      for item in dd.sample_data_iterator(samples)]
    elif dd.get_chip_method(first) == "atac":
        for item in dd.sample_data_iterator(samples):
            if bam.is_paired(dd.get_work_bam(item)):
                counts_key = "NF"
            else:
                logger.info(f"Creating peak table from full BAM file because "
                            f"{dd.get_work_bam(item)} is single-ended.")
                counts_key = "full"
            peakcounts.append(tz.get_in(("peak_counts", counts_key), item))

    combined_peaks = count.combine_count_files(peakcounts, out_file, ext=".counts")
    new_data = [tz.assoc_in(item, ("peak_counts", "peaktable"), combined_peaks)
                for item in dd.sample_data_iterator(samples)]
    return dd.get_samples_from_datalist(new_data)
Пример #6
0
    def run(self, config, config_file, parallel, dirs, samples):
        """Run the pipeline stages in order and return the processed samples.

        Stages: lane processing and trimming, alignment with disambiguation,
        expression estimation, combining and annotating the count files, and
        per-sample quality control.
        """
        trim_resources = _wres(parallel, ["picard"])
        with prun.start(trim_resources, samples, config, dirs,
                        "trimming") as run_parallel:
            samples = run_parallel("process_lane", samples)
            samples = run_parallel("trim_lane", samples)

        align_resources = _wres(parallel, ["aligner"],
                                ensure_mem={"tophat": 8, "tophat2": 8, "star": 30})
        align_multiplier = alignprep.parallel_multiplier(samples)
        with prun.start(align_resources, samples, config, dirs, "multicore",
                        multiplier=align_multiplier) as run_parallel:
            samples = disambiguate.split(samples)
            samples = run_parallel("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_parallel)

        count_resources = _wres(parallel, ["samtools", "cufflinks"])
        with prun.start(count_resources, samples, config, dirs,
                        "rnaseqcount") as run_parallel:
            samples = rnaseq.estimate_expression(samples, run_parallel)
            # fusion detection is currently disabled:
            #samples = rnaseq.detect_fusion(samples, run_parallel)

        count_files = [sample[0].get("count_file") for sample in samples]
        combined = combine_count_files(count_files)
        transcripts_key = ('genome_resources', 'rnaseq', 'transcripts')
        gtf_file = utils.get_in(samples[0][0], transcripts_key, None)
        annotated = annotate_combined_count_file(combined, gtf_file)
        for sample in samples:
            info = sample[0]
            info["combined_counts"] = combined
            # the annotated table is only recorded when it was produced
            if annotated:
                info["annotated_combined_counts"] = annotated

        qc_resources = _wres(parallel, ["picard", "fastqc", "rnaseqc"])
        with prun.start(qc_resources, samples, config, dirs,
                        "persample") as run_parallel:
            samples = qcsummary.generate_parallel(samples, run_parallel)
        return samples
Пример #7
0
 def test_dexseq_combine(self):
     """Combining the DEXSeq count files should yield an existing file."""
     inputs = test_data.DEXSEQ_COUNT_FILES
     target = os.path.join(self.out_dir, "dexseq-combined.txt")
     combined_path = count.combine_count_files(inputs, out_file=target, ext=".txt")
     self.assertTrue(file_exists(combined_path))
Пример #8
0
    def run(self, config, config_file, parallel, dirs, samples):
        """Run the pipeline stages in order and return the processed samples.

        Stages: trimming, alignment with disambiguation, expression
        estimation, combining and annotating the count files, and per-sample
        quality control.
        """
        with prun.start(parallel, samples, config, dirs, "trimming") as run_parallel:
            samples = run_parallel("trim_lane", samples)

        align_programs = _wprogs(parallel, ["aligner"],
                                 {"tophat": 8, "tophat2": 8, "star": 30})
        align_multiplier = alignprep.parallel_multiplier(samples)
        with prun.start(align_programs, samples, config, dirs, "multicore",
                        multiplier=align_multiplier) as run_parallel:
            samples = disambiguate.split(samples)
            samples = run_parallel("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_parallel)

        count_programs = _wprogs(parallel, ["samtools", "gatk", "cufflinks"])
        with prun.start(count_programs, samples, config, dirs,
                        "rnaseqcount") as run_parallel:
            samples = rnaseq.estimate_expression(samples, run_parallel)
            # fusion detection is currently disabled:
            # samples = rnaseq.detect_fusion(samples, run_parallel)

        count_files = [sample[0].get("count_file") for sample in samples]
        combined = combine_count_files(count_files)
        organism = utils.get_in(samples[0][0],
                                ("genome_resources", "aliases", "ensembl"), None)
        annotated = annotate_combined_count_file(combined, organism)
        for sample in samples:
            info = sample[0]
            info["combined_counts"] = combined
            # stored unconditionally, whatever the annotation step returned
            info["annotated_combined_counts"] = annotated

        qc_programs = _wprogs(parallel, ["picard", "fastqc", "rnaseqc"])
        with prun.start(qc_programs, samples, config, dirs,
                        "persample") as run_parallel:
            samples = qcsummary.generate_parallel(samples, run_parallel)
        return samples
Пример #9
0
def estimate_expression(samples, run_parallel):
    """Quantify expression and attach the combined tables to every sample.

    Runs the "generate_transcript_counts", "run_express" and "run_cufflinks"
    steps through ``run_parallel`` and combines the per-sample count, eXpress,
    gene FPKM, isoform FPKM and DEXSeq files. The combined file paths are
    recorded on every data item through the ``dd`` setters.

    Returns a list of single-item lists, one per item yielded by
    ``dd.sample_data_iterator``.
    """
    samples = run_parallel("generate_transcript_counts", samples)
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files)
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    samples = run_parallel("run_express", samples)
    express_counts_combined = combine_express(samples, combined)

    samples = run_parallel("run_cufflinks", samples)
    #gene
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    #isoform
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                      fpkm_isoform_combined_file,
                                                      ".isoform.fpkm")
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
    else:
        dexseq_combined = None

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            # Store the isoform table here; the gene-level table was
            # previously recorded under the isoform key by mistake.
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
Пример #10
0
def combine_express(samples, combined):
    """Combine tpm, effective counts and fpkm from express results

    Builds an isoform-to-gene file next to ``combined`` and, when at least
    one sample has eXpress counts, combines the per-sample eXpress counts,
    TPM and FPKM files into tables named after ``combined``.

    Returns None when ``combined`` is falsy, an empty dict when no sample has
    eXpress counts, and otherwise a dict with the keys 'counts', 'tpm',
    'fpkm' and 'isoform_to_gene'.
    """
    if not combined:
        return None
    to_combine = [
        dd.get_express_counts(x) for x in dd.sample_data_iterator(samples)
        if dd.get_express_counts(x)
    ]
    gtf_file = dd.get_gtf_file(samples[0][0])
    isoform_to_gene_file = os.path.join(os.path.dirname(combined),
                                        "isoform_to_gene.txt")
    # The builtin next() works on Python 2.6+ and Python 3; the iterator's
    # .next() method only exists on Python 2.
    first_data = next(dd.sample_data_iterator(samples))
    isoform_to_gene_file = express.isoform_to_gene_name(
        gtf_file, isoform_to_gene_file, first_data)
    if not to_combine:
        return {}

    eff_counts_combined_file = os.path.splitext(
        combined)[0] + ".isoform.express_counts"
    eff_counts_combined = count.combine_count_files(
        to_combine, eff_counts_combined_file, ext=".counts")
    to_combine = [
        dd.get_express_tpm(x) for x in dd.sample_data_iterator(samples)
        if dd.get_express_tpm(x)
    ]
    tpm_counts_combined_file = os.path.splitext(
        combined)[0] + ".isoform.express_tpm"
    tpm_counts_combined = count.combine_count_files(
        to_combine, tpm_counts_combined_file)
    to_combine = [
        dd.get_express_fpkm(x) for x in dd.sample_data_iterator(samples)
        if dd.get_express_fpkm(x)
    ]
    fpkm_counts_combined_file = os.path.splitext(
        combined)[0] + ".isoform.express_fpkm"
    fpkm_counts_combined = count.combine_count_files(
        to_combine, fpkm_counts_combined_file, ext=".fpkm")
    return {
        'counts': eff_counts_combined,
        'tpm': tpm_counts_combined,
        'fpkm': fpkm_counts_combined,
        'isoform_to_gene': isoform_to_gene_file
    }
Пример #11
0
def estimate_expression(samples, run_parallel):
    """Quantify expression and attach the combined tables to every sample.

    Runs the "generate_transcript_counts" and "run_cufflinks" steps through
    ``run_parallel``, combines the per-sample count, gene FPKM, isoform FPKM
    and DEXSeq files, and records the combined file paths on the first item
    of each sample. Returns the samples from the "run_cufflinks" step.
    """
    samples = run_parallel("generate_transcript_counts", samples)
    combined = count.combine_count_files(
        [x[0]["count_file"] for x in samples if "count_file" in x[0]])
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    samples = run_parallel("run_cufflinks", samples)
    #gene
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    to_combine = [x[0]["fpkm"] for x in samples if "fpkm" in x[0]]
    fpkm_combined = count.combine_count_files(to_combine, fpkm_combined_file)
    #isoform
    fpkm_isoform_combined_file = os.path.splitext(
        combined)[0] + ".isoform.fpkm"
    to_combine_isoform = [
        x[0]["fpkm_isoform"] for x in samples if "fpkm_isoform" in x[0]
    ]
    fpkm_isoform_combined = count.combine_count_files(
        to_combine_isoform, fpkm_isoform_combined_file, ".isoform.fpkm")
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    # Build a real list: on Python 3 filter() returns an iterator, which is
    # always truthy, so the emptiness check below would never take the else
    # branch.
    dexseq_counts = [dd.get_dexseq_counts(data[0]) for data in samples]
    to_combine_dexseq = [path for path in dexseq_counts if path]
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file,
                                                    ".dexseq")
    else:
        dexseq_combined = None

    for x in samples:
        x[0]["combined_counts"] = combined
        if annotated:
            x[0]["annotated_combined_counts"] = annotated
        if fpkm_combined:
            x[0]["combined_fpkm"] = fpkm_combined
        if fpkm_isoform_combined:
            x[0]["combined_fpkm_isoform"] = fpkm_isoform_combined
        if dexseq_combined:
            x[0] = dd.set_dexseq_counts(x[0], dexseq_combined_file)

    return samples
Пример #12
0
def combine_express(samples, combined):
    """Combine the per-sample eXpress counts, TPM and FPKM files.

    An isoform-to-gene file is always built next to ``combined``. When at
    least one sample has eXpress counts, the three sets of per-sample files
    are combined into tables named after ``combined`` and returned in a dict
    with the keys 'counts', 'tpm', 'fpkm' and 'isoform_to_gene'. Otherwise an
    empty dict is returned.
    """
    def _present(getter):
        # Values of ``getter`` over all data items, dropping falsy ones.
        return [getter(item) for item in dd.sample_data_iterator(samples)
                if getter(item)]

    express_counts = _present(dd.get_express_counts)
    gtf_file = dd.get_gtf_file(samples[0][0])
    isoform_to_gene_file = express.isoform_to_gene_name(
        gtf_file,
        os.path.join(os.path.dirname(combined), "isoform_to_gene.txt"))
    if not express_counts:
        return {}

    stem = os.path.splitext(combined)[0]
    eff_counts_combined = count.combine_count_files(
        express_counts, stem + ".isoform.express_counts")
    tpm_counts_combined = count.combine_count_files(
        _present(dd.get_express_tpm), stem + ".isoform.express_tpm")
    fpkm_counts_combined = count.combine_count_files(
        _present(dd.get_express_fpkm), stem + ".isoform.express_fpkm")
    return {
        'counts': eff_counts_combined,
        'tpm': tpm_counts_combined,
        'fpkm': fpkm_counts_combined,
        'isoform_to_gene': isoform_to_gene_file,
    }
Пример #13
0
    def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
        """Trim, align, count and QC the lanes; return the resulting samples.

        The combined count file is stored under "combined_counts" on the
        first item of every sample before quality control runs.
        """
        trimmed = run_parallel("trim_lane", lane_items)
        split_samples = disambiguate.split(trimmed)
        aligned = run_parallel("process_alignment", split_samples)
        resolved = disambiguate.resolve(aligned, run_parallel)
        samples = run_parallel("generate_transcript_counts", resolved)

        count_files = [sample[0].get("count_file") for sample in samples]
        combined = combine_count_files(count_files)
        for sample in samples:
            sample[0]["combined_counts"] = combined

        # bigwig generation is currently disabled:
        #run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
        return qcsummary.generate_parallel(samples, run_parallel)
Пример #14
0
def estimate_expression(samples, run_parallel):
    """Quantify expression and attach the combined tables to every sample.

    Runs the "generate_transcript_counts" and "run_cufflinks" steps through
    ``run_parallel``, combines the per-sample count, gene FPKM, isoform FPKM
    and DEXSeq files, and records the combined file paths on the first item
    of each sample. Returns the samples from the "run_cufflinks" step.
    """
    samples = run_parallel("generate_transcript_counts", samples)
    combined = count.combine_count_files([x[0]["count_file"] for x in samples
                                          if "count_file" in x[0]])
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    samples = run_parallel("run_cufflinks", samples)
    #gene
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    to_combine = [x[0]["fpkm"] for x in samples if "fpkm" in x[0]]
    fpkm_combined = count.combine_count_files(to_combine, fpkm_combined_file)
    #isoform
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    to_combine_isoform = [x[0]["fpkm_isoform"] for x in samples if "fpkm_isoform" in x[0]]
    fpkm_isoform_combined = count.combine_count_files(to_combine_isoform,
                                                      fpkm_isoform_combined_file,
                                                      ".isoform.fpkm")
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    # A list comprehension keeps the emptiness test below meaningful: on
    # Python 3 filter() yields an iterator object that is truthy even when it
    # would produce no items.
    to_combine_dexseq = [counts for counts in
                         (dd.get_dexseq_counts(data[0]) for data in samples)
                         if counts]
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
    else:
        dexseq_combined = None

    for x in samples:
        x[0]["combined_counts"] = combined
        if annotated:
            x[0]["annotated_combined_counts"] = annotated
        if fpkm_combined:
            x[0]["combined_fpkm"] = fpkm_combined
        if fpkm_isoform_combined:
            x[0]["combined_fpkm_isoform"] = fpkm_isoform_combined
        if dexseq_combined:
            x[0] = dd.set_dexseq_counts(x[0], dexseq_combined_file)

    return samples
Пример #15
0
    def run(self, config, config_file, parallel, dirs, samples):
        """Run the pipeline stages in order and return the processed samples.

        Each stage is wrapped in ``profile.report`` under its own label:
        adapter trimming, alignment, disambiguation, expression estimation
        and quality control. Between expression estimation and quality
        control the count files are combined and annotated.
        """
        trim_resources = _wres(parallel, ["picard", "AlienTrimmer"])
        with prun.start(trim_resources, samples, config, dirs,
                        "trimming") as run_parallel:
            with profile.report("adapter trimming", dirs):
                samples = run_parallel("process_lane", samples)
                samples = run_parallel("trim_lane", samples)

        align_resources = _wres(parallel, ["aligner", "picard"],
                                ensure_mem={"tophat": 8, "tophat2": 8, "star": 40})
        align_multiplier = alignprep.parallel_multiplier(samples)
        with prun.start(align_resources, samples, config, dirs, "multicore",
                        multiplier=align_multiplier) as run_parallel:
            with profile.report("alignment", dirs):
                samples = disambiguate.split(samples)
                samples = run_parallel("process_alignment", samples)

        count_resources = _wres(parallel, ["samtools", "cufflinks"])
        with prun.start(count_resources, samples, config, dirs,
                        "rnaseqcount") as run_parallel:
            with profile.report("disambiguation", dirs):
                samples = disambiguate.resolve(samples, run_parallel)
            with profile.report("estimate expression", dirs):
                samples = rnaseq.estimate_expression(samples, run_parallel)

        count_files = [sample[0].get("count_file") for sample in samples]
        combined = combine_count_files(count_files)
        transcripts_key = ('genome_resources', 'rnaseq', 'transcripts')
        gtf_file = utils.get_in(samples[0][0], transcripts_key, None)
        annotated = annotate_combined_count_file(combined, gtf_file)
        for sample in samples:
            info = sample[0]
            info["combined_counts"] = combined
            # the annotated table is only recorded when it was produced
            if annotated:
                info["annotated_combined_counts"] = annotated

        qc_resources = _wres(parallel, ["picard", "fastqc", "rnaseqc"])
        with prun.start(qc_resources, samples, config, dirs,
                        "persample") as run_parallel:
            with profile.report("quality control", dirs):
                samples = qcsummary.generate_parallel(samples, run_parallel)
        logger.info("Timing: finished")
        return samples
Пример #16
0
    def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
        """Trim, align, quantify and QC the lanes; return the resulting samples.

        The combined count file and its annotated version are stored on the
        first item of every sample before quality control runs.
        """
        trimmed = run_parallel("trim_lane", lane_items)
        split_samples = disambiguate.split(trimmed)
        aligned = run_parallel("process_alignment", split_samples)
        resolved = disambiguate.resolve(aligned, run_parallel)
        samples = rnaseq.estimate_expression(resolved, run_parallel)
        # fusion detection is currently disabled:
        #samples = rnaseq.detect_fusion(samples, run_parallel)

        count_files = [sample[0].get("count_file") for sample in samples]
        combined = combine_count_files(count_files)
        organism = utils.get_in(samples[0][0],
                                ('genome_resources', 'aliases', 'ensembl'), None)
        annotated = annotate_combined_count_file(combined, organism)
        for sample in samples:
            info = sample[0]
            info["combined_counts"] = combined
            # stored unconditionally, whatever the annotation step returned
            info["annotated_combined_counts"] = annotated

        # bigwig generation is currently disabled:
        #run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
        return qcsummary.generate_parallel(samples, run_parallel)
Пример #17
0
def create_peaktable(samples):
    """Build one combined peak-count table and attach it to every sample.

    The ChIP method of the first sample selects the per-sample count entry:
    "peak_counts" for "chip" and ("peak_counts", "NF") for "atac". The
    combined table is written under the "consensus" folder of the work
    directory and stored at ("peak_counts", "peaktable") of each sample.
    Returns the samples rebuilt from the updated data items.
    """
    first = dd.get_data_from_sample(samples[0])
    out_file = os.path.join(dd.get_work_dir(first), "consensus",
                            "consensus-counts.tsv")
    if dd.get_chip_method(first) == "chip":
        # NOTE(review): ("peak_counts") is a plain string, not a 1-tuple, so
        # the key sequence handed to get_in is the string itself. Confirm
        # this matches how the value is stored before changing it.
        lookup = ("peak_counts")
    elif dd.get_chip_method(first) == "atac":
        lookup = ("peak_counts", "NF")
    else:
        lookup = None

    peakcounts = []
    if lookup is not None:
        peakcounts = [tz.get_in(lookup, item)
                      for item in dd.sample_data_iterator(samples)]
    combined_peaks = count.combine_count_files(peakcounts, out_file, ext=".counts")
    new_data = [tz.assoc_in(item, ("peak_counts", "peaktable"), combined_peaks)
                for item in dd.sample_data_iterator(samples)]
    return dd.get_samples_from_datalist(new_data)
Пример #18
0
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples

    Combines the per-sample featureCounts, eXpress, Cufflinks (gene and
    isoform FPKM) and DEXSeq outputs, creates the tx2gene file when a GTF is
    available, passes the samples through ``spikein.combine_spikein`` and
    records the resulting file paths on every sample.

    The Cufflinks and DEXSeq tables are named after the combined count file,
    so they are skipped when that file is missing; the final loop already
    treats ``combined`` as optional.

    Returns a list of single-item lists, one per item yielded by
    ``dd.sample_data_iterator``.
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation",
                                "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files,
                                                  fpkm_combined_file)
    else:
        fpkm_combined = None
    isoform_files = filter_missing(
        [dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(
            combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(
            isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    to_combine_dexseq = filter_missing(
        [dd.get_dexseq_counts(x[0]) for x in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file,
                                                    ".dexseq")
        # Only annotate when a combined DEXSeq table was actually produced.
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data,
                                         express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(
                data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples
Пример #19
0
def combine_files(samples):
    """
    Combine the per-sample quantitation outputs (counts/FPKM/TPM/etc) into
    single tables covering all samples.

    Handles featureCounts, eXpress, Cufflinks (gene and isoform FPKM) and
    DEXSeq outputs, creates the tx2gene file when a GTF is available, and
    passes the samples through ``spikein.combine_spikein``. Tables named
    after the combined count file are only built when that file exists.

    Returns a list of single-item lists, one per item yielded by
    ``dd.sample_data_iterator``, with the combined file paths recorded.
    """
    first = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(first, None) or None
    if not gtf_file:
        gtf_file = dd.get_gtf_file(first, None)
    dexseq_gff = dd.get_dexseq_gff(first)

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(s[0]) for s in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(first), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(first)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    def _beside_combined(suffix):
        # Output path sharing the combined count file's name minus extension.
        return os.path.splitext(combined)[0] + suffix

    # combine Cufflinks files
    fpkm_combined = None
    fpkm_files = filter_missing([dd.get_fpkm(s[0]) for s in samples])
    if fpkm_files and combined:
        fpkm_combined = count.combine_count_files(fpkm_files,
                                                  _beside_combined(".fpkm"))

    fpkm_isoform_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(s[0]) for s in samples])
    if isoform_files and combined:
        fpkm_isoform_combined = count.combine_count_files(
            isoform_files, _beside_combined(".isoform.fpkm"), ".isoform.fpkm")

    # combine DEXseq files
    dexseq_combined = None
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(s[0]) for s in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = _beside_combined(".dexseq")
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)

    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for item in dd.sample_data_iterator(samples):
        if combined:
            item = dd.set_combined_counts(item, combined)
        if annotated:
            item = dd.set_annotated_combined_counts(item, annotated)
        if fpkm_combined:
            item = dd.set_combined_fpkm(item, fpkm_combined)
        if fpkm_isoform_combined:
            item = dd.set_combined_fpkm_isoform(item, fpkm_isoform_combined)
        if express_counts_combined:
            item = dd.set_express_counts(item, express_counts_combined['counts'])
            item = dd.set_express_tpm(item, express_counts_combined['tpm'])
            item = dd.set_express_fpkm(item, express_counts_combined['fpkm'])
            item = dd.set_isoform_to_gene(item, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            # the requested output path is recorded, not the returned value
            item = dd.set_dexseq_counts(item, dexseq_combined_file)
        if gtf_file:
            item = dd.set_tx2gene(item, tx2gene_file)
        updated_samples.append([item])
    return updated_samples
Пример #20
0
 def test_dexseq_combine(self):
     """The combined DEXSeq count table should exist after combining."""
     dexseq_inputs = test_data.DEXSEQ_COUNT_FILES
     expected_path = os.path.join(self.out_dir, "dexseq-combined.txt")
     result_path = count.combine_count_files(
         dexseq_inputs, out_file=expected_path, ext=".txt")
     self.assertTrue(file_exists(result_path))