示例#1
0
def concatenate_sparse_counts(*samples):
    """Merge the per-sample sparse count matrices into one ``tagcounts.mtx``.

    The combined matrix goes to ``<work_dir>/umis/tagcounts.mtx`` and that
    path is recorded on every sample with ``dd.set_combined_counts``.

    Returns a list of single-item ``[data]`` lists carrying the combined
    counts path, or ``samples`` unchanged when no sample has a count file.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    out_file = os.path.join(work_dir, "umis", "tagcounts.mtx")
    if not file_exists(out_file):
        # Keep each count file next to its sample name so the two can never
        # drift out of step.
        to_merge = [(dd.get_count_file(data), dd.get_sample_name(data))
                    for data in dd.sample_data_iterator(samples)
                    if dd.get_count_file(data)]
        if not to_merge:
            return samples
        # The last sample seeds the matrix and the others are appended in
        # their original order, which keeps the historical column order.
        seed_file, seed_name = to_merge.pop()
        counts = SparseMatrix()
        counts.read(filename=seed_file, colprefix=seed_name)
        for filename, description in to_merge:
            newcounts = SparseMatrix()
            newcounts.read(filename=filename, colprefix=description)
            counts.cat(newcounts)
        counts.write(out_file)
    # A pre-existing output used to short-circuit with the bare path string;
    # return the updated samples in that case too so both paths agree.
    return [[dd.set_combined_counts(data, out_file)]
            for data in dd.sample_data_iterator(samples)]
示例#2
0
def _detect_rRNA(data):
    """Sum the counts assigned to rRNA features for one sample.

    Reads the sample's tab-separated count file as two columns (``id`` and
    ``counts``) and adds up the rows whose ``id`` matches the first element
    of a feature returned by ``gtf.get_rRNA`` (presumably the gene id).

    Returns a dictionary with a single ``rRNA`` key holding that total.
    """
    gtf_file = dd.get_gtf_file(data)
    count_file = dd.get_count_file(data)
    rrna_features = gtf.get_rRNA(gtf_file)
    genes = [x[0] for x in rrna_features if x]
    count_table = pd.read_csv(count_file, sep="\t", names=["id", "counts"])
    # Filter on the "id" column: the table keeps its default integer index,
    # so a label lookup by gene id cannot match, and `.ix` no longer exists
    # in pandas >= 1.0.
    is_rrna = count_table["id"].isin(genes)
    return {'rRNA': count_table.loc[is_rrna, "counts"].sum()}
示例#3
0
def _detect_rRNA(data):
    """Return the total count falling on rRNA features for a sample.

    The count file is parsed as a headerless, tab-separated table with the
    columns ``id`` and ``counts``. Rows are kept when their ``id`` equals the
    first element of a feature reported by ``gtf.get_rRNA`` for the sample's
    GTF file.

    Returns ``{'rRNA': <summed counts>}``.
    """
    gtf_file = dd.get_gtf_file(data)
    count_file = dd.get_count_file(data)
    rrna_ids = [feature[0] for feature in gtf.get_rRNA(gtf_file) if feature]
    count_table = pd.read_csv(count_file, sep="\t", names=["id", "counts"])
    # The removed `.ix` indexer looked ids up in the row index, which is just
    # 0..n-1 here; match against the "id" column instead.
    rrna_rows = count_table[count_table["id"].isin(rrna_ids)]
    return {'rRNA': rrna_rows["counts"].sum()}
示例#4
0
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples

    Returns a list of single-item ``[data]`` lists; each ``data`` is annotated
    with the combined files that could actually be produced.
    """
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    dexseq_gff = dd.get_dexseq_gff(samples[0][0])

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # The FPKM and DEXSeq tables are named after the combined count file, so
    # they are only built when that file exists (os.path.splitext fails on an
    # empty/None path).

    # combine Cufflinks files
    fpkm_combined = None
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    fpkm_isoform_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    # combine DEXseq files
    dexseq_combined = None
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(x[0]) for x in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        # only annotate a table that was really written
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
示例#5
0
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples

    Each sample comes back wrapped in its own one-element list, annotated with
    whichever combined tables were produced.
    """
    first_sample = samples[0][0]
    gtf_file = dd.get_gtf_file(first_sample, None)
    dexseq_gff = dd.get_dexseq_gff(first_sample)

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    # Output names derive from the combined count file; without one there is
    # nothing to derive them from, so the step is skipped instead of crashing
    # in os.path.splitext.
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(x[0]) for x in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        # skip the annotation when no combined DEXSeq table came back
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        # do not record an empty combined count file on the sample
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
示例#6
0
def _maybe_add_scrnaseq(algorithm, sample, out):
    """Register the sample's count matrix and its name files in ``out``.

    When the sample has a count file, three entries are appended: the matrix
    itself (type ``mtx``) plus the ``.rownames`` and ``.colnames`` companions.
    ``out`` is returned in every case; ``algorithm`` is not used.
    """
    matrix_file = dd.get_count_file(sample)
    if matrix_file:
        for suffix, kind in (("", "mtx"),
                             (".rownames", "rownames"),
                             (".colnames", "colnames")):
            out.append({"path": matrix_file + suffix, "type": kind})
    return out
示例#7
0
def _maybe_add_barcode_histogram(algorithm, sample, out):
    """Append the ``cb-histogram.txt`` entry that sits next to the count file.

    Nothing is added when the sample has no count file. ``out`` is returned
    either way; ``algorithm`` is not used.
    """
    if dd.get_count_file(sample):
        count_dir = os.path.dirname(sample["count_file"])
        out.append({
            "path": os.path.join(count_dir, "cb-histogram.txt"),
            "type": "tsv",
            "ext": "barcodes",
        })
    return out
示例#8
0
def _maybe_add_barcode_histogram(algorithm, sample, out):
    """Add the barcode histogram stored beside the sample's count file.

    The histogram is expected at ``cb-histogram.txt`` in the directory of
    ``sample["count_file"]``. Samples without a count file leave ``out``
    untouched. Returns ``out``.
    """
    has_counts = dd.get_count_file(sample)
    if not has_counts:
        return out
    parent_dir = os.path.dirname(sample["count_file"])
    entry = {"path": os.path.join(parent_dir, "cb-histogram.txt"),
             "type": "tsv",
             "ext": "barcodes"}
    out.append(entry)
    return out
示例#9
0
def concatenate_sparse_matrices(samples, deduped=True):
    """Concatenate the per-sample sparse matrices into one combined matrix.

    With ``deduped=True`` the samples' count files are merged into
    ``<work_dir>/umis/tagcounts.mtx`` and that path is recorded on every
    sample. With ``deduped=False`` the sibling ``-dupes.mtx`` files are merged
    into ``tagcounts-dupes.mtx`` and the samples are returned untouched.

    Returns a list of ``[data]`` lists when ``deduped`` and a matrix is
    available, otherwise ``samples`` as passed in.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    out_name = "tagcounts.mtx" if deduped else "tagcounts-dupes.mtx"
    out_file = os.path.join(work_dir, "umis", out_name)
    if not file_exists(out_file):
        # Collect (matrix file, sample name) pairs in one pass. Filtering the
        # files separately from the names let a missing file shift every
        # later matrix onto the wrong sample name.
        to_merge = []
        for data in dd.sample_data_iterator(samples):
            count_file = dd.get_count_file(data)
            if not count_file:
                continue
            if not deduped:
                count_file = os.path.splitext(count_file)[0] + "-dupes.mtx"
            if file_exists(count_file):
                to_merge.append((count_file, dd.get_sample_name(data)))
        if not to_merge:
            return samples
        # Seed with the last sample, then append the rest in order; this is
        # the column order the function has always produced.
        seed_file, seed_name = to_merge.pop()
        counts = SparseMatrix()
        counts.read(filename=seed_file, colprefix=seed_name)
        for filename, description in to_merge:
            newcounts = SparseMatrix()
            newcounts.read(filename=filename, colprefix=description)
            counts.cat(newcounts)
        counts.write(out_file)
    if not deduped:
        return samples
    return [[dd.set_combined_counts(data, out_file)]
            for data in dd.sample_data_iterator(samples)]
示例#10
0
def _detect_rRNA(data):
    """Report the rRNA count and its share of all counts for a sample.

    Both values are returned as strings under the keys ``rRNA`` and
    ``rRNA_rate``; both are ``"NA"`` when ``gtf.get_rRNA`` yields no usable
    features for the sample's GTF file.
    """
    rrna_ids = [feature[0]
                for feature in gtf.get_rRNA(dd.get_gtf_file(data))
                if feature]
    # resolved up front, although only needed once rRNA ids are known
    count_file = dd.get_count_file(data)
    if not rrna_ids:
        return {'rRNA': "NA", "rRNA_rate": "NA"}
    table = pd.read_csv(count_file, sep="\t", names=["id", "counts"])
    is_rrna = table["id"].isin(rrna_ids)
    rrna_total = sum(table.loc[is_rrna, "counts"])
    all_total = sum(table["counts"])
    return {'rRNA': str(rrna_total),
            'rRNA_rate': str(float(rrna_total) / all_total)}
示例#11
0
def _maybe_add_scrnaseq(algorithm, sample, out):
    """Append the count matrix plus its rownames/colnames files to ``out``.

    A sample without a count file contributes nothing. ``out`` is returned in
    both cases and ``algorithm`` is ignored.
    """
    mtx_path = dd.get_count_file(sample)
    if not mtx_path:
        return out
    entries = [
        {"path": mtx_path, "type": "mtx"},
        {"path": mtx_path + ".rownames", "type": "rownames"},
        {"path": mtx_path + ".colnames", "type": "colnames"},
    ]
    for entry in entries:
        out.append(entry)
    return out
示例#12
0
def _maybe_add_counts(algorithm, sample, out):
    """Append the sample's count file, and its ``.stats`` file if present.

    The stats file is looked for next to the count file, with the extension
    replaced by ``.stats``. Returns ``out``, unchanged when the sample has no
    count file. ``algorithm`` is not used.
    """
    if dd.get_count_file(sample):
        counts_path = sample["count_file"]
        out.append({"path": counts_path, "type": "counts", "ext": "ready"})
        stats_path = os.path.splitext(sample["count_file"])[0] + ".stats"
        if utils.file_exists(stats_path):
            out.append({"path": stats_path, "type": "count_stats", "ext": "ready"})
    return out
示例#13
0
def _maybe_add_counts(algorithm, sample, out):
    """Register the count file and optional count statistics for a sample.

    Adds a ``counts`` entry for ``sample["count_file"]`` and, when a file with
    the same base name and a ``.stats`` extension exists, a ``count_stats``
    entry as well. Returns ``out``.
    """
    if not dd.get_count_file(sample):
        return out

    def _ready(path, kind):
        # every entry added here uses the "ready" extension tag
        return {"path": path, "type": kind, "ext": "ready"}

    out.append(_ready(sample["count_file"], "counts"))
    base, _ = os.path.splitext(sample["count_file"])
    stats_file = base + ".stats"
    if utils.file_exists(stats_file):
        out.append(_ready(stats_file, "count_stats"))
    return out
示例#14
0
def concatenate_sparse_matrices(samples, deduped=True):
    """Build the combined sparse count matrix for a set of samples.

    ``deduped=True`` merges each sample's count file into ``tagcounts.mtx``
    under ``<work_dir>/umis`` and stores that path on the samples.
    ``deduped=False`` merges the matching ``-dupes.mtx`` files into
    ``tagcounts-dupes.mtx`` and leaves the samples as they are.

    Returns the updated ``[[data], ...]`` list for the deduped case, and the
    original ``samples`` otherwise or when there is nothing to merge.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    if deduped:
        out_file = os.path.join(umi_dir, "tagcounts.mtx")
    else:
        out_file = os.path.join(umi_dir, "tagcounts-dupes.mtx")

    def _with_combined_counts():
        # the dupes matrix is written but never recorded on the samples
        if not deduped:
            return samples
        return [[dd.set_combined_counts(data, out_file)]
                for data in dd.sample_data_iterator(samples)]

    if file_exists(out_file):
        return _with_combined_counts()

    # Pair every matrix with its sample name before dropping missing files;
    # filtering only the file list would misalign the column prefixes.
    inputs = []
    for data in dd.sample_data_iterator(samples):
        fn = dd.get_count_file(data)
        if fn and not deduped:
            fn = os.path.splitext(fn)[0] + "-dupes.mtx"
        if fn and file_exists(fn):
            inputs.append((fn, dd.get_sample_name(data)))
    if not inputs:
        return samples
    # last sample first, then the others in order (unchanged column order)
    last_file, last_name = inputs.pop()
    counts = SparseMatrix()
    counts.read(filename=last_file, colprefix=last_name)
    for filename, description in inputs:
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    return _with_combined_counts()
示例#15
0
def _maybe_add_scrnaseq(algorithm, sample, out):
    """Append the count matrix, and the ``-dupes`` matrix if it exists.

    Each matrix contributes three entries: the ``.mtx`` file and its
    ``.rownames`` / ``.colnames`` companions. Samples without a count file add
    nothing. Returns ``out``; ``algorithm`` is not used.
    """
    count_file = dd.get_count_file(sample)
    if not count_file:
        return out

    def _add_matrix(mtx_path):
        # one entry for the matrix and one for each of its name files
        out.append({"path": mtx_path, "type": "mtx"})
        out.append({"path": mtx_path + ".rownames", "type": "rownames"})
        out.append({"path": mtx_path + ".colnames", "type": "colnames"})

    _add_matrix(count_file)
    dupes_file = os.path.splitext(count_file)[0] + "-dupes.mtx"
    if utils.file_exists(dupes_file):
        _add_matrix(dupes_file)
    return out
示例#16
0
def concatenate_sparse_counts(*samples):
    """Combine all per-sample sparse count matrices into ``tagcounts.mtx``.

    The result is stored in the ``umis`` folder of the work directory and its
    path is attached to each sample through ``dd.set_combined_counts``.

    Returns one ``[data]`` list per sample with the combined counts set. When
    no sample provides a count file, ``samples`` is returned as received.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "tagcounts.mtx")

    def _attach_combined():
        return [[dd.set_combined_counts(data, out_file)]
                for data in dd.sample_data_iterator(samples)]

    if file_exists(out_file):
        # Used to return the bare path here, unlike the freshly-built case;
        # hand back the updated samples so reruns behave the same.
        return _attach_combined()
    # build file/name pairs together so they stay aligned
    inputs = [(dd.get_count_file(data), dd.get_sample_name(data))
              for data in dd.sample_data_iterator(samples)
              if dd.get_count_file(data)]
    if not inputs:
        return samples
    # the final sample starts the matrix; the rest follow in input order
    last_file, last_name = inputs.pop()
    counts = SparseMatrix()
    counts.read(filename=last_file, colprefix=last_name)
    for filename, description in inputs:
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    return _attach_combined()
示例#17
0
def estimate_expression(samples, run_parallel):
    """Run the expression estimators and attach the combined tables.

    Runs ``generate_transcript_counts``, ``run_express`` and ``run_cufflinks``
    through ``run_parallel``, merges the per-sample outputs into combined
    files and records their paths on every sample.

    Returns a list of single-item ``[data]`` lists.
    """
    samples = run_parallel("generate_transcript_counts", samples)
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files)
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    samples = run_parallel("run_express", samples)
    express_counts_combined = combine_express(samples, combined)

    samples = run_parallel("run_cufflinks", samples)
    # gene level FPKM
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    # isoform level FPKM
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                      fpkm_isoform_combined_file,
                                                      ".isoform.fpkm")
    # DEXSeq counts are optional
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(x[0]) for x in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
    else:
        dexseq_combined = None

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            # store the isoform table itself; the gene level table was being
            # recorded here by mistake
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
示例#18
0
def _maybe_add_scrnaseq(algorithm, sample, out):
    """Add the single-cell count matrices of a sample to ``out``.

    The sample's count matrix is always added when present; the sibling
    ``-dupes.mtx`` matrix is added only if that file exists. Every matrix
    comes with its ``.rownames`` and ``.colnames`` files. Returns ``out``.
    """
    count_file = dd.get_count_file(sample)
    if not count_file:
        return out
    name_files = ((".rownames", "rownames"), (".colnames", "colnames"))

    out.append({"path": count_file, "type": "mtx"})
    for suffix, kind in name_files:
        out.append({"path": count_file + suffix, "type": kind})

    umi_file = os.path.splitext(count_file)[0] + "-dupes.mtx"
    if not utils.file_exists(umi_file):
        return out
    out.append({"path": umi_file, "type": "mtx"})
    for suffix, kind in name_files:
        out.append({"path": umi_file + suffix, "type": kind})
    return out
示例#19
0
def combine_files(samples):
    """
    merge the per-sample quantitation outputs (counts, FPKM, eXpress, DEXSeq)
    into combined tables and record the resulting files on every sample
    """
    first = samples[0][0]
    # a supplied transcriptome gtf takes precedence over the default one
    gtf_file = dd.get_transcriptome_gtf(first, None) or dd.get_gtf_file(first, None)
    dexseq_gff = dd.get_dexseq_gff(first)

    # featureCounts tables
    per_sample_counts = filter_missing([dd.get_count_file(s[0]) for s in samples])
    combined = count.combine_count_files(per_sample_counts, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # transcript to gene mapping
    tx2gene_file = os.path.join(dd.get_work_dir(first), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(first)

    # eXpress tables
    express_counts_combined = combine_express(samples, combined)

    # Cufflinks gene level FPKM
    fpkm_combined = None
    gene_fpkm_files = filter_missing([dd.get_fpkm(s[0]) for s in samples])
    if gene_fpkm_files and combined:
        fpkm_combined = count.combine_count_files(
            gene_fpkm_files, os.path.splitext(combined)[0] + ".fpkm")

    # Cufflinks isoform level FPKM
    fpkm_isoform_combined = None
    isoform_fpkm_files = filter_missing([dd.get_fpkm_isoform(s[0]) for s in samples])
    if isoform_fpkm_files and combined:
        fpkm_isoform_combined = count.combine_count_files(
            isoform_fpkm_files,
            os.path.splitext(combined)[0] + ".isoform.fpkm",
            ".isoform.fpkm")

    # DEXSeq tables
    dexseq_combined = None
    dexseq_files = filter_missing([dd.get_dexseq_counts(s[0]) for s in samples])
    if dexseq_files and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(dexseq_files,
                                                    dexseq_combined_file, ".dexseq")
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)

    samples = spikein.combine_spikein(samples)

    def _annotate(sample):
        # attach each combined file that was produced, in a fixed order
        if combined:
            sample = dd.set_combined_counts(sample, combined)
        if annotated:
            sample = dd.set_annotated_combined_counts(sample, annotated)
        if fpkm_combined:
            sample = dd.set_combined_fpkm(sample, fpkm_combined)
        if fpkm_isoform_combined:
            sample = dd.set_combined_fpkm_isoform(sample, fpkm_isoform_combined)
        if express_counts_combined:
            sample = dd.set_express_counts(sample, express_counts_combined['counts'])
            sample = dd.set_express_tpm(sample, express_counts_combined['tpm'])
            sample = dd.set_express_fpkm(sample, express_counts_combined['fpkm'])
            sample = dd.set_isoform_to_gene(sample,
                                            express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            sample = dd.set_dexseq_counts(sample, dexseq_combined_file)
        if gtf_file:
            sample = dd.set_tx2gene(sample, tx2gene_file)
        return sample

    return [[_annotate(sample)] for sample in dd.sample_data_iterator(samples)]
示例#20
0
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples

    Returns a list of single-item ``[data]`` lists; each ``data`` carries the
    paths of the combined files that were produced.
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation",
                                "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # The loop below already allows for an empty `combined`; the derived
    # tables need the same guard because their names come from
    # os.path.splitext(combined), which fails when there is no path.

    # combine Cufflinks files
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files,
                                                  fpkm_combined_file)
    else:
        fpkm_combined = None
    isoform_files = filter_missing(
        [dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(
            combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(
            isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    to_combine_dexseq = filter_missing(
        [dd.get_dexseq_counts(x[0]) for x in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file,
                                                    ".dexseq")
        # annotate only when a combined DEXSeq table was produced
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data,
                                         express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(
                data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples