Example #1
    def setUp(self):

        self.counts = Counts.Counts(
            pandas.DataFrame({
                'sample1': [0, 1, 2],
                'sample2': [2, 4, 3]
            }))
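
    # A hedged sketch of a test method for the fixture above. It only uses
    # the Counts.log method that appears in the other examples on this page
    # (base/pseudocount/inplace); that the log-transformed table keeps its
    # shape is an assumption for illustration, not documented behaviour.
    def test_log_keeps_shape(self):
        logged = self.counts.log(base=10, pseudocount=0.1, inplace=False)
        self.assertEqual(logged.table.shape, self.counts.table.shape)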
Example #2
def makeExpressionSummaryPlots(counts_inf, design_inf, logfile):
    ''' use the plotting methods for Counts object to make summary plots'''

    with IOTools.openFile(logfile, "w") as log:

        plot_prefix = P.snip(logfile, ".log")

        # need to manually read in data as index column is not the first column
        counts = Counts.Counts(pd.read_table(counts_inf, sep="\t"))
        counts.table.set_index("transcript_id", inplace=True)

        design = Expression.ExperimentalDesign(design_inf)

        # make certain counts table only include samples in design
        counts.restrict(design)

        cor_outfile = plot_prefix + "_pairwise_correlations.png"
        pca_var_outfile = plot_prefix + "_pca_variance.png"
        pca1_outfile = plot_prefix + "_pc1_pc2.png"
        pca2_outfile = plot_prefix + "_pc3_pc4.png"
        heatmap_outfile = plot_prefix + "_heatmap.png"

        counts_log10 = counts.log(base=10, pseudocount=0.1, inplace=False)

        counts_highExp = counts_log10.clone()
        counts_highExp.table['order'] = counts_highExp.table.mean(axis=1)
        counts_highExp.table.sort_values("order", ascending=False,
                                         inplace=True)
        counts_highExp.table = counts_highExp.table.iloc[0:500, :]
        counts_highExp.table.drop("order", axis=1, inplace=True)

        log.write("plot correlations: %s\n" % cor_outfile)
        counts_log10.plotPairwiseCorrelations(cor_outfile, subset=1000)

        log.write("plot pc3,pc4: %s\n" % pca1_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile,
                             pca1_outfile,
                             x_axis="PC1",
                             y_axis="PC2",
                             colour="group",
                             shape="group")

        log.write("plot pc3,pc4: %s\n" % pca2_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile,
                             pca2_outfile,
                             x_axis="PC3",
                             y_axis="PC4",
                             colour="group",
                             shape="group")

        log.write("plot heatmap: %s\n" % heatmap_outfile)
        counts_highExp.heatmap(heatmap_outfile)
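
# A minimal usage sketch for makeExpressionSummaryPlots; the file names are
# hypothetical placeholders. It assumes a tab-separated counts table with a
# "transcript_id" column and a matching design table, as required above.
if __name__ == "__main__":
    makeExpressionSummaryPlots(counts_inf="counts.tsv",
                               design_inf="design.tsv",
                               logfile="summary_plots.log")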
Example #3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "--result-tsv-file", dest="input_filename_result",
        type="string",
        help="input file with results (for plotdetagstats) "
        "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("sleuth", "edger", "deseq2", "mock"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type", type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion", type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=$default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--contrasts",
                      dest="contrasts",
                      action="append",
                      help=("contrasts for post-hoc testing writen as comma "
                            "seperated list `condition,replicate` etc"))

    parser.add_option("--deseq2-fit-type",
                      dest="deseq2_fit_type",
                      type="string",
                      help=("fit type used for observed dispersion mean "
                            "relationship in deseq2"))

    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing counts for sleuth. Sleuth "
                            "expects counts files to be called abundance.h5"))

    parser.add_option("--outfile-sleuth-count",
                      dest="outfile_sleuth_count",
                      type="string",
                      help=("outfile for full count table generated by sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option("--sleuth-genewise",
                      dest="sleuth_genewise",
                      action="store_true",
                      help=("run genewise, rather than transcript level testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of ensemble gene biomart"))

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq2",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=False,
        save_r_environment=None,
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
        contrasts=None,
        output_filename_pattern=None,
        deseq2_fit_type="parametric",
        sleuth_counts_dir=None,
        outfile_sleuth_count=None,
        outfile_sleuth_tpm=None,
        use_ihw=False,
        sleuth_genewise=False,
        gene_biomart=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    outfile_prefix = options.output_filename_pattern + "_" + options.method

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files")

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrasts=options.contrasts,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(pd.io.parsers.read_csv(
                sys.stdin, sep="\t", index_col=0, comment="#"))
        else:
            counts = Counts.Counts(pd.io.parsers.read_csv(
                IOTools.openFile(options.input_filename_tags, "r"),
                sep="\t", index_col=0, comment="#"))

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     dispersion=options.edger_dispersion,
                                     ref_group=options.ref_group,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix)

        elif options.method == "deseq2":

            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix)
        results.plotMA(contrast, outfile_prefix=outfile_prefix)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in results.Summary.keys():
        outf = IOTools.openFile("_".join(
            [outfile_prefix, test_group, "summary.tsv"]), "w")
        outf.write("category\tcounts\n%s\n"
                   % results.Summary[test_group].asTable())
        outf.close()

    E.Stop()
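
# A hedged invocation sketch for the main() above. Option names come from the
# parser it defines; the script name, file paths and model/contrast values are
# hypothetical, and --output-filename-pattern is assumed to be the option
# added by E.Start(add_output_options=True).
if __name__ == "__main__":
    sys.exit(main([
        "counts2table.py",                     # hypothetical script name
        "--method=deseq2",
        "--tags-tsv-file=counts.tsv",
        "--design-tsv-file=design.tsv",
        "--model=~group",
        "--contrasts=group",
        "--fdr=0.05",
        "--output-filename-pattern=deseq2_out",
    ]))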
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--tag-tsv-file",
                      dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option("-d",
                      "--design-tsv-file",
                      dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("ttest", "sleuth", "edger", "deseq2", "mock",
                               "dexseq"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq2-dispersion-method",
                      dest="deseq2_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq2 [default=%default].")

    parser.add_option("--deseq2-fit-type",
                      dest="deseq2_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq2 [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion",
                      type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f",
                      "--fdr",
                      dest="fdr",
                      type="float",
                      help="fdr to apply [default=%default].")

    # currently not implemented
    # parser.add_option("-R", "--output-R-code", dest="save_r_environment",
    #                  type="string",
    #                  help="save R environment to loc [default=%default]")

    parser.add_option("-r",
                      "--reference-group",
                      dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=$default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--reduced-model",
                      dest="reduced_model",
                      type="string",
                      help=("reduced model for LRT"))

    parser.add_option("--contrast",
                      dest="contrast",
                      type="string",
                      help=("contrast for differential expression testing"))

    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing expression estimates"
                            "from sleuth. Sleuth expects counts"
                            "files to be called abundance.h5"))

    parser.add_option("--dexseq-counts-dir",
                      dest="dexseq_counts_dir",
                      type="string",
                      help=("directory containing counts for dexseq. DEXSeq "
                            "expects counts files to be called .txt and"
                            "to be generated by the DEXSeq_counts.py script"))

    parser.add_option("--dexseq-flattened-file",
                      dest="dexseq_flattened_file",
                      type="string",
                      help=("directory containing flat gtf for dexseq. DEXSeq "
                            "expects this to be generated by the"
                            "DEXSeq_prepare_annotations.py script"))

    parser.add_option(
        "--outfile-sleuth-count",
        dest="outfile_sleuth_count",
        type="string",
        help=("outfile for full count table generated by sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option(
        "--sleuth-genewise",
        dest="sleuth_genewise",
        action="store_true",
        help=("run genewise, rather than transcript level testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of ensemble gene biomart"))

    parser.add_option("--de-test",
                      dest="DEtest",
                      type="choice",
                      choices=("wald", "lrt"),
                      help=("Differential expression test"))

    parser.add_option("--Rhistory",
                      dest="Rhistory",
                      type="string",
                      help=("Outfile for R history"))

    parser.add_option("--Rimage",
                      dest="Rimage",
                      type="string",
                      help=("Outfile for R image"))

    parser.set_defaults(input_filename_tags="-",
                        input_filename_design=None,
                        output_filename=sys.stdout,
                        method="deseq2",
                        fdr=0.1,
                        deseq2_dispersion_method="pooled",
                        deseq2_fit_type="parametric",
                        edger_dispersion=0.4,
                        ref_group=False,
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        spike_foldchange_max=4.0,
                        spike_expression_max=5.0,
                        spike_expression_bin_width=0.5,
                        spike_foldchange_bin_width=0.5,
                        spike_max_counts_per_bin=50,
                        model=None,
                        contrast=None,
                        output_filename_pattern=None,
                        sleuth_counts_dir=None,
                        dexseq_counts_dir=None,
                        dexseq_flattened_file=None,
                        outfile_sleuth_count=None,
                        outfile_sleuth_tpm=None,
                        use_ihw=False,
                        sleuth_genewise=False,
                        gene_biomart=None,
                        DEtest="wald",
                        reduced_model=None,
                        Rhistory=None,
                        Rimage=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    RH = None
    if options.Rhistory or options.Rimage:
        RH = R.R_with_History()

    outfile_prefix = options.output_filename_pattern

    # Expression.py currently expects a reference group for edgeR and
    # sleuth, regardless of which test is used
    if not options.ref_group and options.method in ("edger", "sleuth"):
        raise ValueError(
            "Must provide a reference group ('--reference-group')")

    # create Design object
    design = Expression.ExperimentalDesign(
        pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t",
                    index_col=0,
                    comment="#"))

    if len(set(design.table[options.contrast])) > 2:

        if options.method == "deseq2" or options.method == "sleuth":
            if options.DEtest == "wald":
                raise ValueError(
                    "Factor must have exactly two levels for Wald Test. "
                    "If you have more than two levels in your factor, "
                    "consider LRT")
        else:
            E.info('''There are more than 2 levels for the contrast
            specified (%s:%s). The log2fold changes in the results table
            and MA plots will be for the first two levels in the
            contrast. The p-value will be the p-value for the overall
            significance of the contrast. Hence, some genes will have a
            significant p-value but 0-fold change between the first two
            levels''' % (options.contrast, set(design.table[options.contrast])))

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files "
            " (--sleuth-counts-dir)")

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart,
                                 DE_test=options.DEtest,
                                 ref_group=options.ref_group,
                                 reduced_model=options.reduced_model)

    # DEXSeq reads in data itself
    elif options.method == "dexseq":
        assert options.dexseq_counts_dir, (
            "need to specify the location of the .txt counts files")

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t",
                        index_col=0,
                        comment="#"))

        # validate design against counts and model
        # design.validate(model=options.model)

        experiment = Expression.DEExperiment_DEXSeq()
        results = experiment.run(design,
                                 base_dir=options.dexseq_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 ref_group=options.ref_group,
                                 outfile_prefix=outfile_prefix,
                                 flattenedfile=options.dexseq_flattened_file,
                                 fdr=options.fdr)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(
                pd.io.parsers.read_csv(sys.stdin,
                                       sep="\t",
                                       index_col=0,
                                       comment="#"))
        else:
            counts = Counts.Counts(
                pd.io.parsers.read_csv(IOTools.openFile(
                    options.input_filename_tags, "r"),
                                       sep="\t",
                                       index_col=0,
                                       comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     ref_group=options.ref_group,
                                     fdr=options.fdr,
                                     dispersion=options.edger_dispersion)

        elif options.method == "deseq2":

            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group,
                                     DEtest=options.DEtest,
                                     R=RH)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotMA(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueHist(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueQQ(contrast, outfile_prefix=outfile_prefix, R=RH)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in list(results.Summary.keys()):
        outf = IOTools.openFile(
            "_".join([outfile_prefix, test_group, "summary.tsv"]), "w")
        outf.write("category\tcounts\n%s\n" %
                   results.Summary[test_group].asTable())
        outf.close()

    if options.Rhistory:
        RH.saveHistory(options.Rhistory)
    if options.Rimage:
        RH.saveImage(options.Rimage)

    E.Stop()
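
# A hedged invocation sketch for the sleuth branch of the main() above; the
# script name, directory and group names are placeholders. Sleuth requires
# --reference-group here, and --output-filename-pattern is assumed to be the
# option added by E.Start(add_output_options=True).
if __name__ == "__main__":
    sys.exit(main([
        "counts2table.py",                      # hypothetical script name
        "--method=sleuth",
        "--design-tsv-file=design.tsv",
        "--sleuth-counts-dir=kallisto.dir",     # must contain abundance.h5 files
        "--model=~group",
        "--contrast=group",
        "--reference-group=control",
        "--de-test=wald",
        "--fdr=0.05",
        "--output-filename-pattern=sleuth_out",
    ]))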
Example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--design-tsv-file",
                      dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("filter", "spike", "normalize"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this numer   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--spike-change-bin-min",
                      dest="min_cbin",
                      type="float",
                      help="minimum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-max",
                      dest="max_cbin",
                      type="float",
                      help="maximum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-width",
                      dest="width_cbin",
                      type="float",
                      help="bin width for change bins [default=%default].")

    parser.add_option("--spike-initial-bin-min",
                      dest="min_ibin",
                      type="float",
                      help="minimum bin for initial bins[default=%default].")

    parser.add_option("--spike-initial-bin-max",
                      dest="max_ibin",
                      type="float",
                      help="maximum bin for intitial bins[default=%default].")

    parser.add_option("--spike-initial-bin-width",
                      dest="width_ibin",
                      type="float",
                      help="bin width intitial bins[default=%default].")

    parser.add_option(
        "--spike-minimum",
        dest="min_spike",
        type="int",
        help="minimum number of spike-ins required within each bin\
                      [default=%default].")

    parser.add_option(
        "--spike-maximum",
        dest="max_spike",
        type="int",
        help="maximum number of spike-ins allowed within each bin\
                      [default=%default].")

    parser.add_option("--spike-difference-method",
                      dest="difference",
                      type="choice",
                      choices=("relative", "logfold", "abs_logfold"),
                      help="method to use for calculating difference\
                      [default=%default].")

    parser.add_option("--spike-iterations",
                      dest="iterations",
                      type="int",
                      help="number of iterations to generate spike-ins\
                      [default=%default].")

    parser.add_option("--spike-cluster-maximum-distance",
                      dest="cluster_max_distance",
                      type="int",
                      help="maximum distance between adjacent loci in cluster\
                      [default=%default].")

    parser.add_option("--spike-cluster-minimum-size",
                      dest="cluster_min_size",
                      type="int",
                      help="minimum number of loci required per cluster\
                      [default=%default].")

    parser.add_option("--spike-type",
                      dest="spike_type",
                      type="choice",
                      choices=("row", "cluster"),
                      help="spike in type [default=%default].")

    parser.add_option("--spike-subcluster-min-size",
                      dest="min_sbin",
                      type="int",
                      help="minimum size of subcluster\
                      [default=%default].")

    parser.add_option("--spike-subcluster-max-size",
                      dest="max_sbin",
                      type="int",
                      help="maximum size of subcluster\
                      [default=%default].")

    parser.add_option("--spike-subcluster-bin-width",
                      dest="width_sbin",
                      type="int",
                      help="bin width for subcluster size\
                      [default=%default].")

    parser.add_option("--spike-output-method",
                      dest="output_method",
                      type="choice",
                      choices=("append", "seperate"),
                      help="defines whether the spike-ins should be appended\
                      to the original table or seperately [default=%default].")

    parser.add_option("--spike-shuffle-column-suffix",
                      dest="shuffle_suffix",
                      type="string",
                      help="the suffix of the columns which are to be shuffled\
                      [default=%default].")

    parser.add_option("--spike-keep-column-suffix",
                      dest="keep_suffix",
                      type="string",
                      help="a list of suffixes for the columns which are to be\
                      keep along with the shuffled columns[default=%default].")

    parser.add_option("--normalization-method",
                      dest="normalization_method",
                      type="choice",
                      choices=("deseq-size-factors", "total-count",
                               "total-column", "total-row"),
                      help="normalization method to apply [%default]")

    parser.add_option("-t",
                      "--tags-tsv-file",
                      dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.set_defaults(input_filename_tags="-",
                        method="filter",
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        output_method="seperate",
                        difference="logfold",
                        spike_type="row",
                        min_cbin=0,
                        max_cbin=100,
                        width_cbin=100,
                        min_ibin=0,
                        max_ibin=100,
                        width_ibin=100,
                        max_spike=100,
                        min_spike=None,
                        iterations=1,
                        cluster_max_distance=100,
                        cluster_min_size=10,
                        min_sbin=1,
                        max_sbin=1,
                        width_sbin=1,
                        shuffle_suffix=None,
                        keep_suffix=None,
                        normalization_method="deseq-size-factors")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # load
    if options.keep_suffix:
        # if using suffix, loadTagDataPandas will throw an error as it
        # looks for column names which exactly match the design
        # "tracks" need to write function in Counts.py to handle
        # counts table and design table + suffix
        counts = pd.read_csv(options.stdin, sep="\t", comment="#")
        inf = IOTools.openFile(options.input_filename_design)
        design = pd.read_csv(inf, sep="\t", index_col=0)
        inf.close()
        design = design[design["include"] != 0]

        if options.method in ("filter", "spike"):
            if options.input_filename_design is None:
                raise ValueError("method '%s' requires a design file" %
                                 options.method)
    else:
        # create Counts object
        # TS if spike type is cluster, need to keep "contig" and "position"
        # columns out of index
        if options.spike_type == "cluster":
            index = None
        else:
            index = 0
        if options.input_filename_tags == "-":
            counts = Counts.Counts(
                pd.io.parsers.read_csv(options.stdin,
                                       sep="\t",
                                       index_col=index,
                                       comment="#"))
        else:
            counts = Counts.Counts(pd.io.parsers.read_csv(
                IOTools.openFile(options.input_filename_tags, "r"),
                sep="\t",
                index_col=index,
                comment="#"))

        # TS normalization doesn't require a design table
        if not options.method == "normalize":

            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            # create Design object
            design = Expression.ExperimentalDesign(
                pd.read_csv(IOTools.openFile(options.input_filename_design,
                                             "r"),
                            sep="\t",
                            index_col=0,
                            comment="#"))

    if options.method == "filter":

        assert (options.filter_min_counts_per_sample is not None or
                options.filter_min_counts_per_row is not None or
                options.filter_percentile_rowsums is not None), \
            "no filtering parameters have been suplied"

        # filter
        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        nobservations, nsamples = counts.table.shape

        if nobservations == 0:
            E.warn("no observations remaining after filtering- no output")
            return

        if nsamples == 0:
            E.warn("no samples remain after filtering - no output")
            return

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "normalize":

        counts.normalise(method=options.normalization_method,
                         row_title="total")

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "spike":
        # check parameters are sensible and set parameters where they
        # are not explicitly set
        if not options.min_spike:
            E.info("setting minimum number of spikes per bin to equal"
                   "maximum number of spikes per bin (%s)" % options.max_spike)
            options.min_spike = options.max_spike

        if options.spike_type == "cluster":

            assert options.max_sbin <= options.cluster_min_size, \
                ("max size of subscluster: %s is greater than min size of"
                 "cluster: %s" % (options.max_sbin, options.cluster_min_size))

            counts_columns = set(counts.table.columns.values.tolist())

            assert ("contig" in counts_columns and
                    "position" in counts_columns), \
                ("cluster analysis requires columns named 'contig' and"
                 "'position' in the dataframe")

            counts.sort(sort_columns=["contig", "position"], reset_index=True)

        # restrict design table to first pair only

        design.firstPairOnly()

        # get dictionaries to map group members to column names
        # use different methods depending on whether suffixes are supplied
        if options.keep_suffix:
            g_to_keep_tracks, g_to_spike_tracks = design.mapGroupsSuffix(
                options.shuffle_suffix, options.keep_suffix)
        else:
            # if no suffixes supplied, spike and keep tracks are the same
            g_to_track = design.getGroups2Samples()
            g_to_spike_tracks, g_to_keep_tracks = (g_to_track, g_to_track)

        # set up numpy arrays for change and initial values
        change_bins = np.arange(options.min_cbin, options.max_cbin,
                                options.width_cbin)
        initial_bins = np.arange(options.min_ibin, options.max_ibin,
                                 options.width_ibin)

        E.info("Column boundaries are: %s" % str(change_bins))
        E.info("Row boundaries are: %s" % str(initial_bins))

        # shuffle rows/clusters
        if options.spike_type == "cluster":
            E.info("looking for clusters...")
            # counts was sorted in place above; group labels come from the design
            clusters_dict = Counts.findClusters(counts,
                                                options.cluster_max_distance,
                                                options.cluster_min_size,
                                                g_to_spike_tracks,
                                                design.groups)
            if len(clusters_dict) == 0:
                raise Exception("no clusters were found, check parameters")

            E.info("shuffling subcluster regions...")
            output_indices, bin_counts = Counts.shuffleCluster(
                initial_bins, change_bins, g_to_spike_tracks, design.groups,
                options.difference, options.max_spike, options.iterations,
                clusters_dict, options.max_sbin, options.min_sbin,
                options.width_sbin)

        elif options.spike_type == "row":

            E.info("shuffling rows...")
            output_indices, bin_counts = counts.shuffleRows(
                options.min_cbin, options.max_cbin, options.width_cbin,
                options.min_ibin, options.max_ibin, options.width_ibin,
                g_to_spike_tracks, design.groups, options.difference,
                options.max_spike, options.iterations)

        filled_bins = Counts.thresholdBins(output_indices, bin_counts,
                                           options.min_spike)

        assert len(filled_bins) > 0, "No bins contained enough spike-ins"

        # write out
        counts.outputSpikes(filled_bins,
                            g_to_keep_tracks,
                            design.groups,
                            output_method=options.output_method,
                            spike_type=options.spike_type,
                            min_cbin=options.min_cbin,
                            width_cbin=options.width_cbin,
                            max_cbin=options.max_cbin,
                            min_ibin=options.min_ibin,
                            width_ibin=options.width_ibin,
                            max_ibin=options.max_ibin,
                            min_sbin=options.min_sbin,
                            width_sbin=options.width_sbin,
                            max_sbin=options.max_sbin)

    E.Stop()
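
# A hedged sketch for the "normalize" branch of the main() above: it needs no
# design file and reads the counts table from stdin, writing the normalised
# table to stdout. The script name is a placeholder.
if __name__ == "__main__":
    sys.exit(main([
        "counts2counts.py",                     # hypothetical script name
        "--method=normalize",
        "--normalization-method=deseq-size-factors",
    ]))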
Example #6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("filter", "spike", "normalize"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this numer   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--spike-change-bin-min", dest="min_cbin",
                      type="float",
                      help="minimum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-max", dest="max_cbin",
                      type="float",
                      help="maximum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-width", dest="width_cbin",
                      type="float",
                      help="bin width for change bins [default=%default].")

    parser.add_option("--spike-initial-bin-min", dest="min_ibin",
                      type="float",
                      help="minimum bin for initial bins[default=%default].")

    parser.add_option("--spike-initial-bin-max", dest="max_ibin",
                      type="float",
                      help="maximum bin for intitial bins[default=%default].")

    parser.add_option("--spike-initial-bin-width", dest="width_ibin",
                      type="float",
                      help="bin width intitial bins[default=%default].")

    parser.add_option("--spike-minimum", dest="min_spike",
                      type="int",
                      help="minimum number of spike-ins required within each bin\
                      [default=%default].")

    parser.add_option("--spike-maximum", dest="max_spike",
                      type="int",
                      help="maximum number of spike-ins allowed within each bin\
                      [default=%default].")

    parser.add_option("--spike-difference-method", dest="difference",
                      type="choice",
                      choices=("relative", "logfold", "abs_logfold"),
                      help="method to use for calculating difference\
                      [default=%default].")

    parser.add_option("--spike-iterations", dest="iterations", type="int",
                      help="number of iterations to generate spike-ins\
                      [default=%default].")

    parser.add_option("--spike-cluster-maximum-distance",
                      dest="cluster_max_distance", type="int",
                      help="maximum distance between adjacent loci in cluster\
                      [default=%default].")

    parser.add_option("--spike-cluster-minimum-size",
                      dest="cluster_min_size", type="int",
                      help="minimum number of loci required per cluster\
                      [default=%default].")

    parser.add_option("--spike-type",
                      dest="spike_type", type="choice",
                      choices=("row", "cluster"),
                      help="spike in type [default=%default].")

    parser.add_option("--spike-subcluster-min-size",
                      dest="min_sbin", type="int",
                      help="minimum size of subcluster\
                      [default=%default].")

    parser.add_option("--spike-subcluster-max-size",
                      dest="max_sbin", type="int",
                      help="maximum size of subcluster\
                      [default=%default].")

    parser.add_option("--spike-subcluster-bin-width",
                      dest="width_sbin", type="int",
                      help="bin width for subcluster size\
                      [default=%default].")

    parser.add_option("--spike-output-method",
                      dest="output_method", type="choice",
                      choices=("append", "seperate"),
                      help="defines whether the spike-ins should be appended\
                      to the original table or seperately [default=%default].")

    parser.add_option("--spike-shuffle-column-suffix",
                      dest="shuffle_suffix", type="string",
                      help="the suffix of the columns which are to be shuffled\
                      [default=%default].")

    parser.add_option("--spike-keep-column-suffix",
                      dest="keep_suffix", type="string",
                      help="a list of suffixes for the columns which are to be\
                      keep along with the shuffled columns[default=%default].")

    parser.add_option("--normalization-method",
                      dest="normalization_method", type="choice",
                      choices=("deseq-size-factors",
                               "total-count",
                               "total-column",
                               "total-row"),
                      help="normalization method to apply [%default]")

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.set_defaults(
        input_filename_tags="-",
        method="filter",
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        output_method="seperate",
        difference="logfold",
        spike_type="row",
        min_cbin=0,
        max_cbin=100,
        width_cbin=100,
        min_ibin=0,
        max_ibin=100,
        width_ibin=100,
        max_spike=100,
        min_spike=None,
        iterations=1,
        cluster_max_distance=100,
        cluster_min_size=10,
        min_sbin=1,
        max_sbin=1,
        width_sbin=1,
        shuffle_suffix=None,
        keep_suffix=None,
        normalization_method="deseq-size-factors"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # load
    if options.keep_suffix:
        # if using suffix, loadTagDataPandas will throw an error as it
        # looks for column names which exactly match the design
        # "tracks" need to write function in Counts.py to handle
        # counts table and design table + suffix
        counts = pd.read_csv(options.stdin, sep="\t", comment="#")
        inf = IOTools.openFile(options.input_filename_design)
        design = pd.read_csv(inf, sep="\t", index_col=0)
        inf.close()
        design = design[design["include"] != 0]

        if options.method in ("filter", "spike"):
            if options.input_filename_design is None:
                raise ValueError("method '%s' requires a design file" %
                                 options.method)
    else:
        # create Counts object
        # TS if spike type is cluster, need to keep "contig" and "position"
        # columns out of index
        if options.spike_type == "cluster":
            index = None
        else:
            index = 0
        if options.input_filename_tags == "-":
            counts = Counts.Counts(pd.io.parsers.read_csv(
                options.stdin, sep="\t", index_col=index, comment="#"))
        else:
            counts = Counts.Counts(pd.io.parsers.read_csv(
                IOTools.openFile(options.input_filename_tags, "r"),
                sep="\t", index_col=index, comment="#"))

        # TS normalization doesn't require a design table
        if not options.method == "normalize":

            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            # create Design object
            design = Expression.ExperimentalDesign(
                pd.read_csv(
                    IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t", index_col=0, comment="#"))

    if options.method == "filter":

        assert (options.filter_min_counts_per_sample is not None or
                options.filter_min_counts_per_row is not None or
                options.filter_percentile_rowsums is not None), \
            "no filtering parameters have been suplied"

        # filter
        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        nobservations, nsamples = counts.table.shape

        if nobservations == 0:
            E.warn("no observations remaining after filtering- no output")
            return

        if nsamples == 0:
            E.warn("no samples remain after filtering - no output")
            return

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "normalize":

        counts.normalise(method=options.normalization_method,
                         row_title="total")

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "spike":
        # check parameters are sensible and set parameters where they
        # are not explicitly set
        if not options.min_spike:
            E.info("setting minimum number of spikes per bin to equal"
                   "maximum number of spikes per bin (%s)" % options.max_spike)
            options.min_spike = options.max_spike

        if options.spike_type == "cluster":

            assert options.max_sbin <= options.cluster_min_size, \
                ("max size of subscluster: %s is greater than min size of"
                 "cluster: %s" % (options.max_sbin, options.cluster_min_size))

            counts_columns = set(counts.table.columns.values.tolist())

            assert ("contig" in counts_columns and
                    "position" in counts_columns), \
                ("cluster analysis requires columns named 'contig' and"
                 "'position' in the dataframe")

            counts.sort(sort_columns=["contig", "position"], reset_index=True)

        # restrict design table to first pair only

        design.firstPairOnly()

        # get dictionaries to map group members to column names
        # use different methods depending on whether suffixes are supplied
        if options.keep_suffix:
            g_to_keep_tracks, g_to_spike_tracks = design.mapGroupsSuffix(
                options.shuffle_suffix, options.keep_suffix)
        else:
            # if no suffixes supplied, spike and keep tracks are the same
            g_to_track = design.getGroups2Samples()
            g_to_spike_tracks, g_to_keep_tracks = (g_to_track, g_to_track)

        # set up numpy arrays for change and initial values
        change_bins = np.arange(options.min_cbin, options.max_cbin,
                                options.width_cbin)
        initial_bins = np.arange(options.min_ibin, options.max_ibin,
                                 options.width_ibin)

        E.info("Column boundaries are: %s" % str(change_bins))
        E.info("Row boundaries are: %s" % str(initial_bins))

        # shuffle rows/clusters
        if options.spike_type == "cluster":
            E.info("looking for clusters...")
            # counts was sorted in place above; group labels come from the design
            clusters_dict = Counts.findClusters(
                counts, options.cluster_max_distance,
                options.cluster_min_size, g_to_spike_tracks, design.groups)
            if len(clusters_dict) == 0:
                raise Exception("no clusters were found, check parameters")

            E.info("shuffling subcluster regions...")
            output_indices, bin_counts = Counts.shuffleCluster(
                initial_bins, change_bins, g_to_spike_tracks, design.groups,
                options.difference, options.max_spike,
                options.iterations, clusters_dict,
                options.max_sbin, options.min_sbin, options.width_sbin)

        elif options.spike_type == "row":

            E.info("shuffling rows...")
            output_indices, bin_counts = counts.shuffleRows(
                options.min_cbin, options.max_cbin, options.width_cbin,
                options.min_ibin, options.max_ibin, options.width_ibin,
                g_to_spike_tracks, design.groups, options.difference,
                options.max_spike, options.iterations)

        filled_bins = Counts.thresholdBins(output_indices, bin_counts,
                                           options.min_spike)

        assert len(filled_bins) > 0, "No bins contained enough spike-ins"

        # write out
        counts.outputSpikes(
            filled_bins,
            g_to_keep_tracks, design.groups,
            output_method=options.output_method,
            spike_type=options.spike_type,
            min_cbin=options.min_cbin,
            width_cbin=options.width_cbin,
            max_cbin=options.max_cbin,
            min_ibin=options.min_ibin,
            width_ibin=options.width_ibin,
            max_ibin=options.max_ibin,
            min_sbin=options.min_sbin,
            width_sbin=options.width_sbin,
            max_sbin=options.max_sbin)

    E.Stop()
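
# A hedged sketch for the "filter" branch of the main() above; file names are
# placeholders and the counts table is expected on stdin, since
# input_filename_tags defaults to "-". The script name is hypothetical.
if __name__ == "__main__":
    sys.exit(main([
        "counts2counts.py",                     # hypothetical script name
        "--method=filter",
        "--design-tsv-file=design.tsv",
        "--filter-min-counts-per-row=10",
        "--filter-percentile-rowsums=20",
    ]))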