Пример #1
0
def main(argv=None):
    """Run a differential expression analysis from the command line.

    Parses command line options in sys.argv, unless *argv* is given,
    loads the experimental design (and, for every method except sleuth,
    the tag count table), runs the selected differential expression
    method and writes the results table to stdout. Volcano and MA plots
    and one summary table per test group are written to files whose
    names start with ``<output_filename_pattern>_<method>``.

    Arguments
    ---------
    argv : list, optional
        Command line arguments. ``sys.argv`` is used if this is empty
        or None.

    Raises
    ------
    NotImplementedError
        If the selected method is accepted by the option parser but has
        no implementation in this script (currently ``mock``).
    ValueError
        If no design file is given, or if the method is ``sleuth`` and
        no sleuth counts directory is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "--result-tsv-file", dest="input_filename_result",
        type="string",
        help="input file with results (for plotdetagstats) "
        "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    # "ttest" is listed so that the t-test branch below is reachable
    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("ttest", "sleuth", "edger", "deseq2", "mock"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type", type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion", type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    # optparse only expands "%default", not "$default"
    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--contrasts",
                      dest="contrasts",
                      action="append",
                      help=("contrasts for post-hoc testing writen as comma "
                            "seperated list `condition,replicate` etc"))

    parser.add_option("--deseq2-fit-type",
                      dest="deseq2_fit_type",
                      type="string",
                      help=("fit type used for observed dispersion mean "
                            "relationship in deseq2"))

    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing counts for sleuth. Sleuth "
                            "expects counts files to be called abundance.h5"))

    parser.add_option("--outfile-sleuth-count",
                      dest="outfile_sleuth_count",
                      type="string",
                      help=("outfile for full count table generated by sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option("--sleuth-genewise",
                      dest="sleuth_genewise",
                      action="store_true",
                      help=("run genewise, rather than transcript level testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of ensemble gene biomart"))

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq2",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=False,
        save_r_environment=None,
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
        contrasts=None,
        output_filename_pattern=None,
        deseq2_fit_type="parametric",
        sleuth_counts_dir=None,
        outfile_sleuth_count=None,
        outfile_sleuth_tpm=None,
        use_ihw=False,
        sleuth_genewise=False,
        gene_biomart=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # Fail early for methods the parser accepts but this script does not
    # implement; otherwise ``results`` would be unbound further down.
    if options.method not in ("sleuth", "ttest", "edger", "deseq2"):
        raise NotImplementedError(
            "method '%s' is not implemented" % options.method)

    # Explicit exceptions rather than assert: asserts are removed when
    # python runs with -O.
    if options.method == "sleuth" and not options.sleuth_counts_dir:
        raise ValueError(
            "need to specify the location of the abundance.h5 counts files")

    if not options.input_filename_design:
        raise ValueError(
            "need to specify an experimental design file "
            "(--design-tsv-file)")

    outfile_prefix = options.output_filename_pattern + "_" + options.method

    # create Design object (required by every method)
    design = Expression.ExperimentalDesign(
        pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t", index_col=0, comment="#"))

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        # validate design against model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrasts=options.contrasts,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart)

    else:
        # create Counts object; "-" means read the table from stdin
        if options.input_filename_tags == "-":
            tags_file = sys.stdin
        else:
            tags_file = IOTools.openFile(options.input_filename_tags, "r")
        counts = Counts.Counts(pd.io.parsers.read_csv(
            tags_file, sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            # NOTE(review): this keyword was previously spelled
            # "disperion"; confirm DEExperiment_edgeR.run() names its
            # parameter "dispersion".
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     dispersion=options.edger_dispersion,
                                     ref_group=options.ref_group,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix)

        elif options.method == "deseq2":

            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix)
        results.plotMA(contrast, outfile_prefix=outfile_prefix)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in results.Summary.keys():
        outf = IOTools.openFile("_".join(
            [outfile_prefix, test_group, "summary.tsv"]), "w")
        # make sure the handle is closed even if the write fails
        try:
            outf.write("category\tcounts\n%s\n"
                       % results.Summary[test_group].asTable())
        finally:
            outf.close()

    E.Stop()
Пример #2
0
def main(argv=None):
    """Run a differential expression analysis from the command line.

    Parses command line options in sys.argv, unless *argv* is given,
    loads the experimental design (and, for methods other than sleuth
    and dexseq, the tag count table), runs the selected differential
    expression method and writes the results table to stdout. Diagnostic
    plots and one summary table per test group are written to files
    whose names start with the output filename pattern. If requested,
    the R history and/or R image are saved at the end.

    Arguments
    ---------
    argv : list, optional
        Command line arguments. ``sys.argv`` is used if this is empty
        or None.

    Raises
    ------
    NotImplementedError
        If the selected method is accepted by the option parser but has
        no implementation in this script (currently ``mock``).
    ValueError
        If a required option is missing (design file, reference group
        for edgeR/sleuth, counts directory for sleuth/dexseq), or if a
        Wald test is requested for a contrast with more than two levels.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--tag-tsv-file",
                      dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option("-d",
                      "--design-tsv-file",
                      dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("ttest", "sleuth", "edger", "deseq2", "mock",
                               "dexseq"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq2-dispersion-method",
                      dest="deseq2_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq2 [default=%default].")

    parser.add_option("--deseq2-fit-type",
                      dest="deseq2_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq2 [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion",
                      type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f",
                      "--fdr",
                      dest="fdr",
                      type="float",
                      help="fdr to apply [default=%default].")

    # NOTE: -R/--output-R-code (save R environment) is currently not
    # implemented; see --Rhistory and --Rimage below.

    # optparse only expands "%default", not "$default"
    parser.add_option("-r",
                      "--reference-group",
                      dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--reduced-model",
                      dest="reduced_model",
                      type="string",
                      help=("reduced model for LRT"))

    parser.add_option("--contrast",
                      dest="contrast",
                      type="string",
                      help=("contrast for differential expression testing"))

    # Adjacent string literals are concatenated verbatim, so each
    # fragment below ends with an explicit space.
    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing expression estimates "
                            "from sleuth. Sleuth expects counts "
                            "files to be called abundance.h5"))

    parser.add_option("--dexseq-counts-dir",
                      dest="dexseq_counts_dir",
                      type="string",
                      help=("directory containing counts for dexseq. DEXSeq "
                            "expects counts files to be called .txt and "
                            "to be generated by the DEXSeq_counts.py script"))

    parser.add_option("--dexseq-flattened-file",
                      dest="dexseq_flattened_file",
                      type="string",
                      help=("directory containing flat gtf for dexseq. DEXSeq "
                            "expects this to be generated by the "
                            "DEXSeq_prepare_annotations.py script"))

    parser.add_option(
        "--outfile-sleuth-count",
        dest="outfile_sleuth_count",
        type="string",
        help=("outfile for full count table generated by sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option(
        "--sleuth-genewise",
        dest="sleuth_genewise",
        action="store_true",
        help=("run genewise, rather than transcript level testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of ensemble gene biomart"))

    parser.add_option("--de-test",
                      dest="DEtest",
                      type="choice",
                      choices=("wald", "lrt"),
                      help=("Differential expression test"))

    parser.add_option("--Rhistory",
                      dest="Rhistory",
                      type="string",
                      help=("Outfile for R history"))

    parser.add_option("--Rimage",
                      dest="Rimage",
                      type="string",
                      help=("Outfile for R image"))

    parser.set_defaults(input_filename_tags="-",
                        input_filename_design=None,
                        output_filename=sys.stdout,
                        method="deseq2",
                        fdr=0.1,
                        deseq2_dispersion_method="pooled",
                        deseq2_fit_type="parametric",
                        edger_dispersion=0.4,
                        ref_group=False,
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        spike_foldchange_max=4.0,
                        spike_expression_max=5.0,
                        spike_expression_bin_width=0.5,
                        spike_foldchange_bin_width=0.5,
                        spike_max_counts_per_bin=50,
                        model=None,
                        contrast=None,
                        output_filename_pattern=None,
                        sleuth_counts_dir=None,
                        dexseq_counts_dir=None,
                        dexseq_flattened_file=None,
                        outfile_sleuth_count=None,
                        outfile_sleuth_tpm=None,
                        use_ihw=False,
                        sleuth_genewise=False,
                        gene_biomart=None,
                        DEtest="wald",
                        reduced_model=None,
                        Rhistory=None,
                        Rimage=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # Fail early for methods the parser accepts but this script does not
    # implement; otherwise ``results`` would be unbound further down.
    if options.method not in ("sleuth", "dexseq", "ttest", "edger",
                              "deseq2"):
        raise NotImplementedError(
            "method '%s' is not implemented" % options.method)

    # an R session that records history is only needed if it will be saved
    RH = None
    if options.Rhistory or options.Rimage:
        RH = R.R_with_History()

    outfile_prefix = options.output_filename_pattern

    # Expression.py currently expects a reference group for edgeR and
    # sleuth, regardless of which test is used.
    # Compare by value: "is" tests object identity and is not a reliable
    # string comparison.
    if not options.ref_group and options.method in ("edger", "sleuth"):
        raise ValueError(
            "Must provide a reference group ('--reference-group')")

    # Explicit exceptions rather than assert: asserts are removed when
    # python runs with -O.
    if options.method == "sleuth" and not options.sleuth_counts_dir:
        raise ValueError(
            "need to specify the location of the abundance.h5 counts files "
            " (--sleuth-counts-dir)")

    if options.method == "dexseq" and not options.dexseq_counts_dir:
        raise ValueError(
            "need to specify the location of the .txt counts files")

    if not options.input_filename_design:
        raise ValueError(
            "need to specify an experimental design file "
            "(--design-tsv-file)")

    # create Design object (required by every method)
    design = Expression.ExperimentalDesign(
        pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t",
                    index_col=0,
                    comment="#"))

    # The level check only makes sense when a contrast column was named.
    if options.contrast:
        levels = set(design.table[options.contrast])

        if len(levels) > 2:
            # NOTE(review): the else below pairs with the method test,
            # not with the Wald test, so the message is only logged for
            # methods other than deseq2/sleuth. Confirm this is intended.
            if options.method == "deseq2" or options.method == "sleuth":
                if options.DEtest == "wald":
                    raise ValueError(
                        "Factor must have exactly two levels for Wald Test. "
                        "If you have more than two levels in your factor, "
                        "consider LRT")
            else:
                E.info(
                    "There are more than 2 levels for the contrast "
                    "specified (%s:%s). The log2fold changes in the results "
                    "table and MA plots will be for the first two levels in "
                    "the contrast. The p-value will be the p-value for the "
                    "overall significance of the contrast. Hence, some genes "
                    "will have a significant p-value but 0-fold change "
                    "between the first two levels"
                    % (options.contrast, levels))

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        # validate design against model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart,
                                 DE_test=options.DEtest,
                                 ref_group=options.ref_group,
                                 reduced_model=options.reduced_model)

    # DEXSeq reads in data itself
    elif options.method == "dexseq":
        # the design is not validated against the model for dexseq

        experiment = Expression.DEExperiment_DEXSeq()
        results = experiment.run(design,
                                 base_dir=options.dexseq_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 ref_group=options.ref_group,
                                 outfile_prefix=outfile_prefix,
                                 flattenedfile=options.dexseq_flattened_file,
                                 fdr=options.fdr)

    else:
        # create Counts object; "-" means read the table from stdin
        if options.input_filename_tags == "-":
            tags_file = sys.stdin
        else:
            tags_file = IOTools.openFile(options.input_filename_tags, "r")
        counts = Counts.Counts(
            pd.io.parsers.read_csv(tags_file,
                                   sep="\t",
                                   index_col=0,
                                   comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove sample with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     ref_group=options.ref_group,
                                     fdr=options.fdr,
                                     dispersion=options.edger_dispersion)

        elif options.method == "deseq2":

            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group,
                                     DEtest=options.DEtest,
                                     R=RH)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotMA(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueHist(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueQQ(contrast, outfile_prefix=outfile_prefix, R=RH)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in results.Summary.keys():
        outf = IOTools.openFile(
            "_".join([outfile_prefix, test_group, "summary.tsv"]), "w")
        # make sure the handle is closed even if the write fails
        try:
            outf.write("category\tcounts\n%s\n" %
                       results.Summary[test_group].asTable())
        finally:
            outf.close()

    # RH is always set here: it is created whenever either option is given
    if options.Rhistory:
        RH.saveHistory(options.Rhistory)
    if options.Rimage:
        RH.saveImage(options.Rimage)

    E.Stop()