Пример #1
0
def rmats2sashimi(infile, designfile, FDR, outfile):
    '''Draw sashimi plots for the significant events of an rMATS result.

    Builds a shell statement and hands it to P.run in the "splicing"
    conda environment. The statement first filters `infile` into
    `<infile>_sig.txt`: lines containing the text "NA" are dropped and
    only lines whose tested column is below `FDR` are kept, which avoids
    unnecessary compute and memory use. rmats2sashimiplot is then called
    on the filtered table, with its output redirected to
    `<outfile>/<event>.log`.

    The design must define exactly two groups. If the groups differ in
    size, both are cut down to the size of the smaller one and a message
    is logged.

    Arguments
    ---------
    infile: string
        path to rMATS results file (can be one of five types). A path
        containing "MXE" is filtered on column 22, any other path on
        column 20.
    designfile: string
        path to design file
    FDR: string
        FDR threshold for drawing plots
    outfile: string
        directory path for sashimiplot output; its last path component
        is passed to rmats2sashimiplot as the event type (-t)

    Raises
    ------
    ValueError
        if the design does not define exactly two groups
    '''

    Design = Expression.ExperimentalDesign(designfile)
    if len(Design.groups) != 2:
        raise ValueError("Please specify exactly 2 groups per experiment.")

    group1name, group2name = Design.groups[0], Design.groups[1]
    g1 = Design.getSamplesInGroup(group1name)
    g2 = Design.getSamplesInGroup(group2name)

    if len(g1) != len(g2):
        # keep only as many samples per group as the smaller group has
        shortest = min(len(g1), len(g2))
        g1, g2 = g1[:shortest], g2[:shortest]
        E.info("The two groups compared were of unequal size. For  "
               "visual display using sashimi they have been truncated "
               "to the same length")

    # each sample name is turned into "<sample>.bam"
    group1 = ",".join("%s.bam" % sample for sample in g1)
    group2 = ",".join("%s.bam" % sample for sample in g2)

    # the event type is the name of the output directory
    event = os.path.basename(os.path.normpath(outfile))

    # the column tested against the threshold depends on the file name
    column = "22" if "MXE" in infile else "20"

    # the placeholders below are filled from the local variables
    statement = '''cat
    %(infile)s|grep -v NA|
    awk '$%(column)s < %(FDR)s' > %(infile)s_sig.txt;
    rmats2sashimiplot
    --b1 %(group1)s
    --b2 %(group2)s
    -t %(event)s
    -e %(infile)s_sig.txt
    --l1 %(group1name)s
    --l2 %(group2name)s
    -o %(outfile)s
    > %(outfile)s/%(event)s.log
    ''' % locals()

    P.run(statement, job_condaenv="splicing")
Пример #2
0
def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
    '''Build and run the rMATS statement for the first two design groups.

    The BAM file names ("<sample>.bam") of the first and second group of
    the design are written, comma-separated, to `<outdir>/b1.txt` and
    `<outdir>/b2.txt`. The read length handed to rMATS is estimated from
    the BAM file of the first sample in the design only, and that same
    file decides whether "-t paired" is added. The statement is run via
    P.run in the "splicing" conda environment.

    Arguments
    ---------
    gtffile: string
        path to :term:`gtf` file; it is decompressed with gunzip on the
        fly
    designfile: string
        path to design file
    pvalue: string
        threshold for FDR testing (passed as --cstat)
    strand: string
        strandedness option: can be 'fr-unstranded', 'fr-firststrand',
        or 'fr-secondstrand'
    outdir: string
        directory path for rMATS results
    permute : 1 or 0
        option to activate random shuffling of sample groups
    '''

    design = Expression.ExperimentalDesign(designfile)

    if permute == 1:
        # randomly reassign the existing group labels to the samples
        shuffled = design.table.group.tolist()
        random.shuffle(shuffled)
        design.table.group = shuffled

    # one comma-separated BAM list per group: b1.txt, then b2.txt
    for position, listname in enumerate(("b1.txt", "b2.txt")):
        members = design.getSamplesInGroup(design.groups[position])
        bams = ",".join("%s.bam" % sample for sample in members)
        with open(outdir + "/" + listname, "w") as handle:
            handle.write(bams)

    first_bam = design.samples[0] + ".bam"
    readlength = BamTools.estimateTagSize(first_bam)

    statement = '''rMATS
    --b1 %(outdir)s/b1.txt
    --b2 %(outdir)s/b2.txt
    --gtf <(gunzip -c %(gtffile)s)
    --od %(outdir)s
    --readLength %(readlength)s
    --cstat %(pvalue)s
    --libType %(strand)s
    ''' % locals()

    # if Paired End Reads
    if BamTools.is_paired(first_bam):
        statement += '''-t paired'''

    # NOTE(review): these placeholders are not filled in here; presumably
    # P.run interpolates them from this function's variables - confirm
    statement += '''
    > %(outdir)s/%(designfile)s.log
    '''

    P.run(statement, job_condaenv="splicing")
Пример #3
0
def makeExpressionSummaryPlots(counts_inf, design_inf, logfile):
    '''Make summary plots for a counts table using the Counts plot methods.

    The counts are restricted to the samples in the design and
    log10-transformed with a pseudocount of 0.1. The plot file names are
    `logfile` with its ".log" suffix replaced by:

    * _pairwise_correlations.png
    * _pca_variance.png
    * _pc1_pc2.png
    * _pc3_pc4.png
    * _heatmap.png (the 500 rows with the highest mean log10 value)

    One line per plot is written to `logfile`.

    Arguments
    ---------
    counts_inf: string
        path to a tab-separated counts table with a "transcript_id"
        column
    design_inf: string
        path to design file
    logfile: string
        path to the log file; must end in ".log"
    '''

    with iotools.openFile(logfile, "w") as log:

        plot_prefix = P.snip(logfile, ".log")

        # need to manually read in data as index column is not the first
        # column. set_index returns a new frame rather than modifying the
        # table in place, so the result has to be assigned back.
        counts = Counts.Counts(pd.read_table(counts_inf, sep="\t"))
        counts.table = counts.table.set_index(["transcript_id"])

        design = Expression.ExperimentalDesign(design_inf)

        # make certain counts table only include samples in design
        counts.restrict(design)

        cor_outfile = plot_prefix + "_pairwise_correlations.png"
        pca_var_outfile = plot_prefix + "_pca_variance.png"
        pca1_outfile = plot_prefix + "_pc1_pc2.png"
        pca2_outfile = plot_prefix + "_pc3_pc4.png"
        heatmap_outfile = plot_prefix + "_heatmap.png"

        counts_log10 = counts.log(base=10, pseudocount=0.1, inplace=False)

        # keep the 500 rows with the highest mean value for the heatmap;
        # "order" is a temporary helper column that is dropped again
        counts_highExp = counts_log10.clone()
        counts_highExp.table['order'] = counts_highExp.table.apply(
            np.mean, axis=1)
        # DataFrame.sort no longer exists in pandas; sort_values is the
        # replacement
        counts_highExp.table.sort_values(
            "order", ascending=False, inplace=True)
        counts_highExp.table = counts_highExp.table.iloc[0:500, :]
        counts_highExp.table.drop("order", axis=1, inplace=True)

        log.write("plot correlations: %s\n" % cor_outfile)
        counts_log10.plotPairwiseCorrelations(cor_outfile, subset=1000)

        log.write("plot pc1,pc2: %s\n" % pca1_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca1_outfile,
                             x_axis="PC1", y_axis="PC2",
                             colour="group", shape="group")

        log.write("plot pc3,pc4: %s\n" % pca2_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca2_outfile,
                             x_axis="PC3", y_axis="PC4",
                             colour="group", shape="group")

        log.write("plot heatmap: %s\n" % heatmap_outfile)
        counts_highExp.heatmap(heatmap_outfile)
Пример #4
0
def runSleuthAll(samples, base_dir, counts, tpm):
    ''' run sleuth for all samples to obtain counts and tpm tables

    A placeholder design is built for the run: samples are assigned
    alternately to groups 0 and 1, every sample is included and all
    samples share pair 0. Sleuth is then run with the model "~group" and
    dummy_run=True.

    Note: all samples in the design table must also
    have a directory with the same name in `base_dir` with kallisto
    results in a file called abundance.h5

    Arguments
    ---------
    samples: list
        sample names; used as the index of the design table
    base_dir: string
        directory holding one sub-directory per sample
    counts: string
        passed to sleuth as `counts` (output for the counts table)
    tpm: string
        passed to sleuth as `tpm` (output for the tpm table)
    '''

    n_samples = len(samples)

    # alternate 0, 1, 0, 1, ... Integer division is required here: a
    # list cannot be multiplied by the float that "/" returns on python 3.
    design = pd.DataFrame({
        "group": ([0, 1] * ((n_samples + 1) // 2))[0:n_samples],
        "include": [1, ] * n_samples,
        "pair": [0, ] * n_samples})

    design.index = samples

    Design = Expression.ExperimentalDesign(design)
    exp = Expression.DEExperiment_Sleuth()

    # the return value is not used by this function
    exp.run(Design, base_dir, counts=counts, tpm=tpm,
            model="~group", dummy_run=True)
Пример #5
0
def runSleuth(design, base_dir, model, contrasts, outfile, counts, tpm,
              fdr, lrt=False, reduced_model=None):
    ''' run sleuth and write the result table plus per-contrast plots.

    An MA plot and a volcano plot are drawn for every distinct value in
    the "contrast" column of the result table, using `outfile` without
    its ".tsv" suffix as the file name prefix. The result table is
    written tab-separated to `outfile`.

    Note: all samples in the design table must also
    have a directory with the same name in `base_dir` with kallisto
    results in a file called abundance.h5
    '''

    plot_prefix = P.snip(outfile, ".tsv")

    experimental_design = Expression.ExperimentalDesign(design)
    sleuth = Expression.DEExperiment_Sleuth()

    results = sleuth.run(
        experimental_design, base_dir, model, contrasts, plot_prefix,
        counts, tpm, fdr, lrt, reduced_model)
    results.getResults(fdr)

    # one pair of plots per contrast present in the results
    tested_contrasts = set(results.table['contrast'])
    for tested in tested_contrasts:
        results.plotMA(tested, plot_prefix)
        results.plotVolcano(tested, plot_prefix)

    results.table.to_csv(outfile, sep="\t", index=False)
Пример #6
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads the experimental design and runs the differential expression
    method selected with --method. For "sleuth" and "dexseq" the count
    data is read by the method itself from a directory; for the other
    methods a tag count table is read from --tag-tsv-file (or stdin),
    restricted to the samples in the design and optionally filtered.

    The result table is written to stdout. Volcano, MA, p-value
    histogram and p-value QQ plots are drawn per contrast, and one
    "<prefix>_<test_group>_summary.tsv" file is written per entry of the
    result summary. If --Rhistory or --Rimage are given, the R history
    and/or image are saved at the end.

    Raises
    ------
    ValueError
        if edger or sleuth is requested without a reference group, if
        a Wald test is requested for a contrast with more than two
        levels (deseq2 and sleuth), or if the selected method has no
        implementation in this script.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--tag-tsv-file",
                      dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option("-d",
                      "--design-tsv-file",
                      dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("ttest", "sleuth", "edger", "deseq2", "mock",
                               "dexseq"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq2-dispersion-method",
                      dest="deseq2_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq2 [default=%default].")

    parser.add_option("--deseq2-fit-type",
                      dest="deseq2_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq2 [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion",
                      type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f",
                      "--fdr",
                      dest="fdr",
                      type="float",
                      help="fdr to apply [default=%default].")

    # currently not implemented
    # parser.add_option("-R", "--output-R-code", dest="save_r_environment",
    #                  type="string",
    #                  help="save R environment to loc [default=%default]")

    # "%default" (not "$default") is the placeholder that gets expanded
    # in help texts
    parser.add_option("-r",
                      "--reference-group",
                      dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--reduced-model",
                      dest="reduced_model",
                      type="string",
                      help=("reduced model for LRT"))

    parser.add_option("--contrast",
                      dest="contrast",
                      type="string",
                      help=("contrast for differential expression testing"))

    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing expression estimates "
                            "from sleuth. Sleuth expects counts "
                            "files to be called abundance.h5"))

    parser.add_option("--dexseq-counts-dir",
                      dest="dexseq_counts_dir",
                      type="string",
                      help=("directory containing counts for dexseq. DEXSeq "
                            "expects counts files to be called .txt and "
                            "to be generated by the DEXSeq_counts.py script"))

    parser.add_option("--dexseq-flattened-file",
                      dest="dexseq_flattened_file",
                      type="string",
                      help=("directory containing flat gtf for dexseq. DEXSeq "
                            "expects this to be generated by the "
                            "DEXSeq_prepare_annotations.py script"))

    parser.add_option(
        "--outfile-sleuth-count",
        dest="outfile_sleuth_count",
        type="string",
        help=("outfile for full count table generated by sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option(
        "--sleuth-genewise",
        dest="sleuth_genewise",
        action="store_true",
        help=("run genewise, rather than transcript level testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of ensemble gene biomart"))

    parser.add_option("--de-test",
                      dest="DEtest",
                      type="choice",
                      choices=("wald", "lrt"),
                      help=("Differential expression test"))

    parser.add_option("--Rhistory",
                      dest="Rhistory",
                      type="string",
                      help=("Outfile for R history"))

    parser.add_option("--Rimage",
                      dest="Rimage",
                      type="string",
                      help=("Outfile for R image"))

    parser.set_defaults(input_filename_tags="-",
                        input_filename_design=None,
                        output_filename=sys.stdout,
                        method="deseq2",
                        fdr=0.1,
                        deseq2_dispersion_method="pooled",
                        deseq2_fit_type="parametric",
                        edger_dispersion=0.4,
                        ref_group=False,
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        spike_foldchange_max=4.0,
                        spike_expression_max=5.0,
                        spike_expression_bin_width=0.5,
                        spike_foldchange_bin_width=0.5,
                        spike_max_counts_per_bin=50,
                        model=None,
                        contrast=None,
                        output_filename_pattern=None,
                        sleuth_counts_dir=None,
                        dexseq_counts_dir=None,
                        dexseq_flattened_file=None,
                        outfile_sleuth_count=None,
                        outfile_sleuth_tpm=None,
                        use_ihw=False,
                        sleuth_genewise=False,
                        gene_biomart=None,
                        DEtest="wald",
                        reduced_model=None,
                        Rhistory=None,
                        Rimage=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # an R session with history is only needed if history or image are
    # to be saved at the end
    RH = None
    if options.Rhistory or options.Rimage:
        RH = R.R_with_History()

    outfile_prefix = options.output_filename_pattern

    # Expression.py currently expects a reference group for edgeR and
    # sleuth, regardless of which test is used.
    # The method is compared by value: "is" tests object identity and is
    # not a reliable way to compare strings.
    if not options.ref_group and options.method in ("edger", "sleuth"):
        raise ValueError(
            "Must provide a reference group ('--reference-group')")

    # create Design object
    design = expression.ExperimentalDesign(
        pd.read_csv(iotools.open_file(options.input_filename_design, "r"),
                    sep="\t",
                    index_col=0,
                    comment="#"))

    # the number of levels can only be checked if a contrast was given;
    # without this guard the column lookup fails when --contrast is unset
    if (options.contrast is not None and
            len(set(design.table[options.contrast])) > 2):

        if options.method == "deseq2" or options.method == "sleuth":
            if options.DEtest == "wald":
                raise ValueError(
                    "Factor must have exactly two levels for Wald Test. "
                    "If you have more than two levels in your factor, "
                    "consider LRT")
        else:
            # the levels are read from the design table, as in the
            # condition above
            E.info('''There are more than 2 levels for the contrast
            specified" "(%s:%s). The log2fold changes in the results table
            and MA plots will be for the first two levels in the
            contrast. The p-value will be the p-value for the overall
            significance of the contrast. Hence, some genes will have a
            signficant p-value but 0-fold change between the first two
            levels''' % (options.contrast,
                         set(design.table[options.contrast])))

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files "
            " (--sleuth-counts-dir)")

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart,
                                 DE_test=options.DEtest,
                                 ref_group=options.ref_group,
                                 reduced_model=options.reduced_model)

    # DEXSeq reads in data itself
    elif options.method == "dexseq":
        assert options.dexseq_counts_dir, (
            "need to specify the location of the .txt counts files")

        # create Design object
        # NOTE(review): this re-reads the same design file that was
        # already loaded above - confirm whether it is still needed
        design = expression.ExperimentalDesign(
            pd.read_csv(iotools.open_file(options.input_filename_design, "r"),
                        sep="\t",
                        index_col=0,
                        comment="#"))

        # validate design against counts and model
        # design.validate(model=options.model)

        experiment = expression.DEExperiment_DEXSeq()
        results = experiment.run(design,
                                 base_dir=options.dexseq_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 ref_group=options.ref_group,
                                 outfile_prefix=outfile_prefix,
                                 flattenedfile=options.dexseq_flattened_file,
                                 fdr=options.fdr)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(
                pd.io.parsers.read_csv(sys.stdin,
                                       sep="\t",
                                       index_col=0,
                                       comment="#"))
        else:
            counts = Counts.Counts(
                pd.io.parsers.read_csv(iotools.open_file(
                    options.input_filename_tags, "r"),
                                       sep="\t",
                                       index_col=0,
                                       comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     ref_group=options.ref_group,
                                     fdr=options.fdr,
                                     dispersion=options.edger_dispersion)

        elif options.method == "deseq2":

            experiment = expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group,
                                     DEtest=options.DEtest,
                                     R=RH)

        else:
            # the option parser accepts methods (e.g. "mock") that have
            # no implementation here; fail with a clear message instead
            # of an unbound "results" further down
            raise ValueError(
                "method '%s' is not implemented" % options.method)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotMA(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueHist(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueQQ(contrast, outfile_prefix=outfile_prefix, R=RH)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in list(results.Summary.keys()):
        summary_filename = "_".join(
            [outfile_prefix, test_group, "summary.tsv"])
        # the context manager closes the file even if writing fails
        with iotools.open_file(summary_filename, "w") as outf:
            outf.write("category\tcounts\n%s\n" %
                       results.Summary[test_group].asTable())

    if options.Rhistory:
        RH.saveHistory(options.Rhistory)
    if options.Rimage:
        RH.saveImage(options.Rimage)

    E.stop()
Пример #7
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--design-tsv-file",
                      dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("filter", "spike", "normalize"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this numer   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--spike-change-bin-min",
                      dest="min_cbin",
                      type="float",
                      help="minimum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-max",
                      dest="max_cbin",
                      type="float",
                      help="maximum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-width",
                      dest="width_cbin",
                      type="float",
                      help="bin width for change bins [default=%default].")

    parser.add_option("--spike-initial-bin-min",
                      dest="min_ibin",
                      type="float",
                      help="minimum bin for initial bins[default=%default].")

    parser.add_option("--spike-initial-bin-max",
                      dest="max_ibin",
                      type="float",
                      help="maximum bin for intitial bins[default=%default].")

    parser.add_option("--spike-initial-bin-width",
                      dest="width_ibin",
                      type="float",
                      help="bin width intitial bins[default=%default].")

    parser.add_option(
        "--spike-minimum",
        dest="min_spike",
        type="int",
        help="minimum number of spike-ins required within each bin\
                      [default=%default].")

    parser.add_option(
        "--spike-maximum",
        dest="max_spike",
        type="int",
        help="maximum number of spike-ins allowed within each bin\
                      [default=%default].")

    parser.add_option("--spike-difference-method",
                      dest="difference",
                      type="choice",
                      choices=("relative", "logfold", "abs_logfold"),
                      help="method to use for calculating difference\
                      [default=%default].")

    parser.add_option("--spike-iterations",
                      dest="iterations",
                      type="int",
                      help="number of iterations to generate spike-ins\
                      [default=%default].")

    parser.add_option("--spike-cluster-maximum-distance",
                      dest="cluster_max_distance",
                      type="int",
                      help="maximum distance between adjacent loci in cluster\
                      [default=%default].")

    parser.add_option("--spike-cluster-minimum-size",
                      dest="cluster_min_size",
                      type="int",
                      help="minimum number of loci required per cluster\
                      [default=%default].")

    parser.add_option("--spike-type",
                      dest="spike_type",
                      type="choice",
                      choices=("row", "cluster"),
                      help="spike in type [default=%default].")

    parser.add_option("--spike-subcluster-min-size",
                      dest="min_sbin",
                      type="int",
                      help="minimum size of subcluster\
                      [default=%default].")

    parser.add_option("--spike-subcluster-max-size",
                      dest="max_sbin",
                      type="int",
                      help="maximum size of subcluster\
                      [default=%default].")

    parser.add_option("--spike-subcluster-bin-width",
                      dest="width_sbin",
                      type="int",
                      help="bin width for subcluster size\
                      [default=%default].")

    parser.add_option("--spike-output-method",
                      dest="output_method",
                      type="choice",
                      choices=("append", "seperate"),
                      help="defines whether the spike-ins should be appended\
                      to the original table or seperately [default=%default].")

    parser.add_option("--spike-shuffle-column-suffix",
                      dest="shuffle_suffix",
                      type="string",
                      help="the suffix of the columns which are to be shuffled\
                      [default=%default].")

    parser.add_option("--spike-keep-column-suffix",
                      dest="keep_suffix",
                      type="string",
                      help="a list of suffixes for the columns which are to be\
                      keep along with the shuffled columns[default=%default].")

    parser.add_option("--normalization-method",
                      dest="normalization_method",
                      type="choice",
                      choices=("deseq-size-factors", "total-count",
                               "total-column", "total-row"),
                      help="normalization method to apply [%default]")

    parser.add_option("-t",
                      "--tags-tsv-file",
                      dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.set_defaults(input_filename_tags="-",
                        method="filter",
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        output_method="seperate",
                        difference="logfold",
                        spike_type="row",
                        min_cbin=0,
                        max_cbin=100,
                        width_cbin=100,
                        min_ibin=0,
                        max_ibin=100,
                        width_ibin=100,
                        max_spike=100,
                        min_spike=None,
                        iterations=1,
                        cluster_max_distance=100,
                        cluster_min_size=10,
                        min_sbin=1,
                        max_sbin=1,
                        width_sbin=1,
                        shuffle_suffix=None,
                        keep_suffix=None,
                        normalization_method="deseq-size-factors")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # load
    if options.keep_suffix:
        # if using suffix, loadTagDataPandas will throw an error as it
        # looks for column names which exactly match the design
        # "tracks" need to write function in Counts.py to handle
        # counts table and design table + suffix
        counts = pd.read_csv(options.stdin, sep="\t", comment="#")
        inf = iotools.open_file(options.input_filename_design)
        design = pd.read_csv(inf, sep="\t", index_col=0)
        inf.close()
        design = design[design["include"] != 0]

        if options.method in ("filter", "spike"):
            if options.input_filename_design is None:
                raise ValueError("method '%s' requires a design file" %
                                 options.method)
    else:
        # create Counts object
        # TS if spike type is cluster, need to keep "contig" and "position"
        # columns out of index
        if options.spike_type == "cluster":
            index = None,
        else:
            index = 0
        if options.input_filename_tags == "-":
            counts = Counts.Counts(
                pd.io.parsers.read_csv(options.stdin,
                                       sep="\t",
                                       index_col=index,
                                       comment="#"))
        else:
            counts = Counts.Counts(iotools.open_file(
                options.input_filename_tags, "r"),
                                   sep="\t",
                                   index_col=index,
                                   comment="#")

        # TS normalization doesn't require a design table
        if not options.method == "normalize":

            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            # create Design object
            design = Expression.ExperimentalDesign(
                pd.read_csv(iotools.open_file(options.input_filename_design,
                                              "r"),
                            sep="\t",
                            index_col=0,
                            comment="#"))

    if options.method == "filter":

        assert (options.filter_min_counts_per_sample is not None or
                options.filter_min_counts_per_row is not None or
                options.filter_percentile_rowsums is not None), \
            "no filtering parameters have been suplied"

        # filter
        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        nobservations, nsamples = counts.table.shape

        if nobservations == 0:
            E.warn("no observations remaining after filtering- no output")
            return

        if nsamples == 0:
            E.warn("no samples remain after filtering - no output")
            return

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "normalize":

        counts.normalise(method=options.normalization_method,
                         row_title="total")

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "spike":
        # check parameters are sensible and set parameters where they
        # are not explicitly set
        if not options.min_spike:
            E.info("setting minimum number of spikes per bin to equal"
                   "maximum number of spikes per bin (%s)" % options.max_spike)
            options.min_spike = options.max_spike

        if options.spike_type == "cluster":

            assert options.max_sbin <= options.cluster_min_size, \
                ("max size of subscluster: %s is greater than min size of"
                 "cluster: %s" % (options.max_sbin, options.cluster_min_size))

            counts_columns = set(counts.table.columns.values.tolist())

            assert ("contig" in counts_columns and
                    "position" in counts_columns), \
                ("cluster analysis requires columns named 'contig' and"
                 "'position' in the dataframe")

            counts.sort(sort_columns=["contig", "position"], reset_index=True)

        # restrict design table to first pair only

        design.firstPairOnly()

        # get dictionaries to map group members to column names
        # use different methods depending on whether suffixes are supplied
        if options.keep_suffix:
            g_to_keep_tracks, g_to_spike_tracks = design.mapGroupsSuffix(
                options.shuffle_suffix, options.keep_suffix)
        else:
            # if no suffixes supplied, spike and keep tracks are the same
            g_to_track = design.getGroups2Samples()
            g_to_spike_tracks, g_to_keep_tracks = (g_to_track, g_to_track)

        # set up numpy arrays for change and initial values
        change_bins = np.arange(options.min_cbin, options.max_cbin,
                                options.width_cbin)
        initial_bins = np.arange(options.min_ibin, options.max_ibin,
                                 options.width_ibin)

        E.info("Column boundaries are: %s" % str(change_bins))
        E.info("Row boundaries are: %s" % str(initial_bins))

        # shuffle rows/clusters
        if options.spike_type == "cluster":
            E.info("looking for clusters...")
            clusters_dict = Counts.findClusters(counts_sort,
                                                options.cluster_max_distance,
                                                options.cluster_min_size,
                                                g_to_spike_tracks, groups)
            if len(clusters_dict) == 0:
                raise Exception("no clusters were found, check parameters")

            E.info("shuffling subcluster regions...")
            output_indices, counts = Counts.shuffleCluster(
                initial_bins, change_bins, g_to_spike_tracks, groups,
                options.difference, options.max_spike, options.iterations,
                clusters_dict, options.max_sbin, options.min_sbin,
                options.width_sbin)

        elif options.spike_type == "row":

            E.info("shuffling rows...")
            output_indices, bin_counts = counts.shuffleRows(
                options.min_cbin, options.max_cbin, options.width_cbin,
                options.min_ibin, options.max_ibin, options.width_ibin,
                g_to_spike_tracks, design.groups, options.difference,
                options.max_spike, options.iterations)

        filled_bins = Counts.thresholdBins(output_indices, bin_counts,
                                           options.min_spike)

        assert len(filled_bins) > 0, "No bins contained enough spike-ins"

        # write out
        counts.outputSpikes(filled_bins,
                            g_to_keep_tracks,
                            design.groups,
                            output_method=options.output_method,
                            spike_type=options.spike_type,
                            min_cbin=options.min_cbin,
                            width_cbin=options.width_cbin,
                            max_cbin=options.max_cbin,
                            min_ibin=options.min_ibin,
                            width_ibin=options.width_ibin,
                            max_ibin=options.max_ibin,
                            min_sbin=options.min_sbin,
                            width_sbin=options.width_sbin,
                            max_sbin=options.max_sbin)

    E.stop()
Пример #8
0
def runSleuth(infiles, outfiles, design_name, quantifier):
    '''Run sleuth to identify differentially expressed transcripts/genes.

    Builds and runs a ``counts2table --method=sleuth`` statement for the
    transcript-level test and, if ``PARAMS['sleuth_genewise']`` is set,
    a second statement for the gene-level test.

    Arguments
    ---------
    infiles: tuple
        ``(counts, design)``. Only ``design`` (path to the design file)
        is inserted into the command; the counts are located through
        the ``<quantifier>.dir`` directory passed to counts2table.
    outfiles: tuple
        ``(transcript_out, gene_out)`` - paths (ending in ``.tsv``)
        receiving the transcript-level and gene-level results. The
        matching ``.log`` files are derived from these paths.
    design_name: string
        suffix identifying the design; lower-cased and used to look up
        the ``sleuth_model``, ``sleuth_reduced_model``,
        ``sleuth_contrast`` and ``sleuth_refgroup`` parameters and the
        ``design<design_name>.tsv`` file.
    quantifier: string
        name of the quantification tool; selects the
        ``<quantifier>_bootstrap`` parameter and the
        ``<quantifier>.dir`` counts directory.

    Raises
    ------
    AssertionError
        if gene-wise testing is requested without
        ``PARAMS['sleuth_gene_biomart']`` being set.
    '''

    design_name = design_name.lower()
    # the count files themselves are not referenced in the statements
    # below; counts2table is pointed at the <quantifier>.dir directory
    _, design = infiles
    transcript_out, gene_out = outfiles

    transcript_prefix = P.snip(transcript_out, ".tsv")
    transcript_log = transcript_prefix + ".log"

    gene_prefix = P.snip(gene_out, ".tsv")
    gene_log = gene_prefix + ".log"

    model = PARAMS['sleuth_model%s' % design_name]
    E.info(model)
    reduced_model = PARAMS['sleuth_reduced_model%s' % design_name]

    contrast = PARAMS['sleuth_contrast%s' % design_name]
    refgroup = PARAMS['sleuth_refgroup%s' % design_name]
    detest = PARAMS['sleuth_detest']
    transcripts_fasta = os.path.join(
        "geneset.dir", P.snip(PARAMS['geneset'], ".gtf.gz") + ".fa")

    # to estimate sleuth memory, we need to know the number of
    # samples, transcripts and bootstraps
    with iotools.open_file(transcripts_fasta, "r") as inf:
        # one fasta header line (">") per transcript
        number_transcripts = sum(1 for line in inf if line.startswith(">"))

    Design = Expression.ExperimentalDesign("design%s.tsv" % design_name)
    number_samples = sum(Design.table['include'])
    number_bootstraps = PARAMS["%s_bootstrap" % quantifier]

    # NOTE(review): job_memory is not used explicitly; presumably P.run
    # picks it up from this function's local scope - confirm.
    job_memory = rnaseq.estimateSleuthMemory(
        number_bootstraps, number_samples, number_transcripts)

    statement = '''
    python -m cgatpipelines.tasks.counts2table
    --design-tsv-file=%(design)s
    --output-filename-pattern=%(transcript_prefix)s
    --log=%(transcript_log)s
    --method=sleuth
    --fdr=%(sleuth_fdr)s
    --model=%(model)s
    --contrast=%(contrast)s
    --sleuth-counts-dir=%(quantifier)s.dir
    --reference-group=%(refgroup)s
    --de-test=%(detest)s
    '''
    # the reduced model is only passed for the likelihood ratio test
    if detest == "lrt":
        statement += '''
        --reduced-model=%(reduced_model)s
        '''
    statement += '''
    -v 0
    >%(transcript_out)s
    '''

    P.run(statement)

    if PARAMS['sleuth_genewise']:

        assert PARAMS['sleuth_gene_biomart'], (
            "Must provide a biomart (see pipeline.yml)")

        # gene-wise sleuth seems to be even more memory hungry!
        # The estimate is inflated by doubling the sample count.
        job_memory = rnaseq.estimateSleuthMemory(
            number_bootstraps, 2 * number_samples, number_transcripts)

        statement = '''
        python -m cgatpipelines.tasks.counts2table
        --design-tsv-file=%(design)s
        --output-filename-pattern=%(gene_prefix)s
        --log=%(gene_log)s
        --method=sleuth
        --fdr=%(sleuth_fdr)s
        --model=%(model)s
        --contrast=%(contrast)s
        --sleuth-genewise
        --sleuth-counts-dir=%(quantifier)s.dir
        --reference-group=%(refgroup)s
        --gene-biomart=%(sleuth_gene_biomart)s
        --de-test=%(detest)s
        '''
        if detest == "lrt":
            statement += '''
            --reduced-model=%(reduced_model)s
            '''
        # write gene-level results to gene_out; redirecting to
        # transcript_out would overwrite the transcript-level results
        statement += '''
        -v 0
        >%(gene_out)s
        '''

        P.run(statement)