예제 #1
0
 def lm(self, l, h):
     """Fit one R linear model per index in ``l..h`` (inclusive) and print its R^2.

     For every index ``i`` the data frame and model formula string are
     obtained from ``self.mount_reg_params(i)``; the formula string is
     printed, the model is fitted with R's ``lm`` and the ``r.squared``
     entry of its summary is printed.

     l -- first index passed to ``mount_reg_params``.
     h -- last index passed to ``mount_reg_params`` (included).
     """
     for i in range(l, h + 1):
         data_frame, data_model = self.mount_reg_params(i)
         print(data_model)
         # Conversion is switched off so that lm hands back the R object
         # itself, which is what r.summary needs below.
         rpy.set_default_mode(rpy.NO_CONVERSION)
         try:
             linear_model = r.lm(r(data_model), data=data_frame)
         finally:
             # The conversion mode is global rpy state: always restore it,
             # even if the fit raises, so later R calls are not affected.
             rpy.set_default_mode(rpy.BASIC_CONVERSION)
         print(r.summary(linear_model)['r.squared'])
 def _independent_betas_same_sources(self, tag_list, remove_tags_when_bad_regression, n_times_show_summary=3):
     """Fit one independent regression per tag and store per-source coefficient stats.

     For every tag the no-intercept model ``y~X-1`` is fitted in R on
     ``self.y[tag]`` / ``self.X[tag]``: ``lm`` when ``self.regtype`` is
     "Independent Linear", a logit ``glm`` when it is "Independent Logistic".
     Row ``i`` of the summary's coefficient matrix is stored, keyed by
     ``self.sorted_sources[tag][i]``, in ``self.beta[tag]`` as a dict with
     the keys "beta", "stderr", "tstat", "pval" and, when computable,
     "-log10(pval)".

     tag_list -- iterable of tags to fit.
     remove_tags_when_bad_regression -- if true, a tag whose coefficient
         matrix has fewer rows than sources is dropped via
         ``self._remove_tag``; otherwise only the missing sources are skipped.
     n_times_show_summary -- maximum number of regression summaries printed.

     Raises ValueError when ``self.regtype`` is neither of the two supported
     values (previously the result of the preceding tag was silently reused,
     or the name was unbound on the first tag).
     """
     times_showed_summary = 0 # This allows us to print out some summary statistics without producing an overwhelming amount of output.
     SUMMARY_STATS = ["beta", "stderr", "tstat", "pval"]
     for tag in tag_list:
         self._progress("Computing betas for tag %s." % tag, newline=True) # rmme: newline make false
         rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj.
         try:
             data = rc.list(y=self.y[tag],X=self.X[tag])
             model = "y~X-1" # Use -1 because X has an intercept already
             if self.regtype=="Independent Linear":
                 # A failing fit now propagates instead of dropping into the
                 # debugger and continuing with a missing or stale result.
                 result = rc.lm(model,data=data)
             elif self.regtype=="Independent Logistic":
                 result = rc.glm(model,family=rc.binomial("logit"),data=data)
             else:
                 raise ValueError("Unsupported regression type: %r" % (self.regtype,))
         finally:
             # The conversion mode is global rpy state; restore it even on failure.
             rpy.set_default_mode(rpy.BASIC_CONVERSION) # Return to normal conversion mode.
         summary = rc.summary(result,correlation=rc.TRUE)
         self._record_regression_stats(tag, summary)
         beta_dict = dict()
         sorted_sources = self.sorted_sources[tag]
         coeff_matrix = summary["coefficients"]
         for i, source in enumerate(sorted_sources):
             try:
                 cur_source_dict = dict(zip(SUMMARY_STATS,coeff_matrix[i,:]))
             except IndexError:
                 # Fewer coefficient rows than sources.
                 util.info("\tWARNING: Regression for %s didn't end up using all variables." % tag)
                 if remove_tags_when_bad_regression:
                     self._remove_tag(tag)
                     break # Leave the per-source loop only; the per-tag loop is continued below once we see the tag is gone from self.features.
                 continue
             try:
                 cur_source_dict["-log10(pval)"] = -log(cur_source_dict["pval"], 10)
             except (OverflowError, ValueError):
                 # NOTE(review): assumes `log` is math.log, which raises
                 # ValueError (older interpreters: OverflowError) for a
                 # p-value of 0; the derived key is simply omitted then.
                 pass
             beta_dict[source] = cur_source_dict
         if tag not in self.features: # We've removed this tag a few lines above, so skip it.
             continue
         self.beta[tag] = beta_dict
         if times_showed_summary < n_times_show_summary:
             self._print_regression_summary(tag, summary)
             times_showed_summary += 1
예제 #3
0
    ggmod.nma = 1
    ggmod._start_params = np.array([-0.6, 0.1, 0.2, 0.0])
    ggres = ggmod.fit(start_params=np.array([-0.6, 0.1, 0.2, 0.0]),
                      maxiter=1000)
    print 'ggres.params', ggres.params

    g11res = optimize.fmin(
        lambda params: -loglike_GARCH11(params, x - x.mean())[0],
        [0.6, 0.6, 0.2])
    print g11res
    llf = loglike_GARCH11(g11res, x - x.mean())
    print llf[0]

    garchplot(ggmod.errorsest, ggmod.h, title='Garch estimated')
    fit = r.garchFit(f, data=x - x.mean(), include_mean=False, trace=False)
    print r.summary(fit)
'''based on R default simulation
model = list(omega = 1e-06, alpha = 0.1, beta = 0.8)
nobs = 1000
(with nobs=500, gjrgarch doesn't do well

>>> ggres = ggmod.fit(start_params=np.array([-0.6, 0.1, 0.2, 0.0]), maxiter=1000)
Optimization terminated successfully.
         Current function value: -448.861335
         Iterations: 385
         Function evaluations: 690
>>> print 'ggres.params', ggres.params
ggres.params [ -7.75090330e-01   1.57714749e-01  -9.60223930e-02   8.76021411e-07]
rearranged
8.76021411e-07 1.57714749e-01(-9.60223930e-02) 7.75090330e-01
예제 #4
0
    def __init__(self, y, design, model_type=r.lm, **kwds):
        """Set up and estimate R model with data and design.

        The model ``y ~ x.1+...+x.k-1`` is fitted through rpy, where ``k`` is
        the number of columns of ``design``, and the pieces of the fit and of
        its R ``summary`` are copied onto attributes of this object.

        y -- response values; stored as ``np.array(y)``.
        design -- design matrix; stored as ``np.array(design)``.  No
            intercept is added because it is expected to be part of
            the design already.
        model_type -- R fitting function called as
            ``model_type(formula, data=frame, **kwds)``.
        kwds -- extra keyword arguments forwarded to ``model_type``.
        """
        r.library("MASS")  # still needs to be in test, but also here for
        # logical tests at the end not to show an error
        self.y = np.array(y)
        self.design = np.array(design)
        self.model_type = model_type
        self._design_cols = ["x.%d" % (i + 1) for i in range(self.design.shape[1])]
        # Note the '-1' for no intercept - this is included in the design
        self.formula = r("y ~ %s-1" % "+".join(self._design_cols))
        self.frame = r.data_frame(y=y, x=self.design)
        # Fit without conversion so the raw Robj is returned.  The mode is
        # global rpy state, so it is restored even when the fit raises.
        rpy.set_default_mode(rpy.NO_CONVERSION)
        try:
            results = self.model_type(self.formula, data=self.frame, **kwds)
        finally:
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
        self.robj = results  # keep the Robj model so it can be
        # used in the tests
        rsum = r.summary(results)
        self.rsum = rsum
        # Provide compatible interface with scipy models
        self.results = results.as_py()

        #        coeffs = self.results['coefficients']
        #        self.beta0 = np.array([coeffs[c] for c in self._design_cols])
        residuals = self.results["residuals"]
        self.nobs = len(residuals)
        if isinstance(residuals, dict):
            # Keys are 1-based observation labels; place each value at its
            # 0-based position.
            self.resid = np.zeros(len(residuals))
            for label, value in residuals.items():
                self.resid[int(label) - 1] = value
        else:
            self.resid = residuals
        self.fittedvalues = self.results["fitted.values"]
        self.df_resid = self.results["df.residual"]
        coefficients = rsum["coefficients"]
        self.params = coefficients[:, 0]
        self.bse = coefficients[:, 1]
        self.bt = coefficients[:, 2]
        try:
            self.pvalues = coefficients[:, 3]
        except IndexError:
            # The summary has no fourth (p-value) column; as before, the
            # attribute is simply left unset in that case.
            pass
        # setdefault (not get) is kept on purpose: it also records the
        # missing keys as None in self.rsum, exactly as the code always did.
        self.rsquared = rsum.setdefault("r.squared", None)
        self.rsquared_adj = rsum.setdefault("adj.r.squared", None)
        self.aic_R = rsum.setdefault("aic", None)
        self.fvalue = rsum.setdefault("fstatistic", None)
        if self.fvalue and isinstance(self.fvalue, dict):
            self.fvalue = self.fvalue.setdefault("value", None)  # for wls
        df = rsum.setdefault("df", None)
        if df:  # for RLM, works for other models?
            self.df_model = df[0] - 1  # R counts intercept
            self.df_resid = df[1]
        self.bcov_unscaled = rsum.setdefault("cov.unscaled", None)
        self.bcov = rsum.setdefault("cov.scaled", None)
        if "sigma" in rsum:
            self.scale = rsum["sigma"]
        elif "dispersion" in rsum:
            self.scale = rsum["dispersion"]
        else:
            self.scale = None
        self.llf = r.logLik(results)

        # Model-specific extras.
        if model_type == r.glm:
            self.getglm()
        if model_type == r.rlm:
            self.getrlm()
예제 #5
0
        x1 = poly_x_vals[i,0]
        x2 = poly_x_vals[i,1]
        y1 = poly_y_vals[i,0]
        y2 = poly_y_vals[i,1]
        xy = poly_xy_vals[i]
        if poly_values: poly_values = poly_values + ","
        poly_values += "(%s, %f, %f, %f, %f, %f)" % (id, x1, x2, y1, y2, xy)
        i = i+1

    query = query + poly_values
#    print query
    c.execute(query)

    
    model = r.lm(r("delta ~ poly(x, 2) + poly(y, 2) + poly(x*y, 1)"), data=r.data_frame(x=py_x, y=py_y, delta=py_delta), weights=py_wt)
    model_summary = r.summary(model)
    model_coeff = array(model_summary['coefficients'])
    if not model_coeff.shape == (6,4):
        print "Bad model for %s" % exp
        continue
    
    c0 = model_coeff[0][0]
    c0_sigma = model_coeff[0][1]
    cx1 = model_coeff[1][0]
    cx1_sigma = model_coeff[1][1]
    cx2 = model_coeff[2][0]
    cx2_sigma = model_coeff[2][1]
    cy1 = model_coeff[3][0]
    cy1_sigma = model_coeff[3][1]
    cy2 = model_coeff[4][0]
    cy2_sigma = model_coeff[4][1]
예제 #6
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    A tab-separated table is read from stdin: lines starting with ``#`` are
    skipped, the first remaining line is the header (its first field is
    dropped) and in every following row the first field is the taxon name.
    The trees are read from the file given with ``--tree-nh-file``.  For
    each tree the phylip ``contrast`` program is run and every requested
    ``--method`` writes its table to ``options.stdout``:

    * ``pearson`` / ``spearman`` -- phylip's correlation for each column
      pair together with the p-value of a regression through the origin
      fitted in R.
    * ``contrasts`` -- the contrasts reported by phylip.
    * ``compute`` -- contrasts computed here by a post-order traversal
      of the tree.

    Raises ValueError when no tree could be read.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t", "--tree-nh-file", dest="filename_tree",
                      type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--output-with-header", dest="write_header",
                      action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug", dest="debug", action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree", dest="display_tree",
                      action="store_true",
                      help="display the tree")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson",
                               "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    # NOTE(review): argv is not forwarded here, so an explicit *argv* has no
    # visible effect in this function -- confirm against E.Start's signature.
    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        # user supplies 1-based column numbers; store them 0-based
        options.columns = [
            int(field) - 1 for field in options.columns.split(",")]

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    ##########################################################
    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line.startswith("#"):
            continue
        d = line[:-1].strip().split("\t")
        if first:
            # header line: the first field labels the taxon column
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    ##########################################################
    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r"))

    if not nexus:
        raise ValueError("please provide trees with branchlenghts")

    ##########################################################
    ##########################################################
    ##########################################################
    # set up phylip
    phylip_options = []
    # print out contrasts
    phylip_options.append("C")
    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    ##########################################################
    ##########################################################
    ##########################################################
    # main loop
    ##########################################################
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):

                # rpy is only needed for this method, so it is imported
                # lazily here (once, not once per column pair).
                import rpy
                from rpy import r as R

                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                # n = len(result.mContrasts)
                # transpose the contrasts: one list of values per column
                columns = []
                for c in range(ncolumns):
                    columns.append(
                        [contrast[c] for contrast in result.mContrasts])

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        # Various ways to calculate r. It is not
                        # possible to use cor.test or lsfit directly,
                        # as you have to perform a regression through
                        # the origin.

                        # uncomment to check pearson r against
                        # phylip's value r =
                        # calculateCorrelationCoefficient(columns[x],
                        # columns[y])

                        # for significance, use linear regression models in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        try:
                            linear_model = R.lm(
                                R("y ~ x - 1"),
                                data=R.data_frame(x=columns[x],
                                                  y=columns[y]))
                        finally:
                            # the conversion mode is global rpy state;
                            # restore it even if the fit fails
                            rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        # extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write("\t".join(
                            (headers[x], headers[y],
                             options.value_format % phy_r,
                             options.pvalue_format % p,
                             code)) + "\n")

            elif method == "contrasts":

                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    options.stdout.write(
                        "\t".join(
                            options.value_format % v for v in d) + "\n")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2
                variances = [None] * max_index
                # one slot per data column for every node (these rows are
                # indexed by column below, so they must be ncolumns long;
                # sizing them by nrows failed with more columns than taxa)
                values = [[None] * ncolumns for _ in range(max_index)]
                contrasts = [[None] * ncolumns for _ in range(max_index)]
                branchlengths = [None] * max_index

                def update_data(node_id, bl, c1, c2, ):
                    """store value, contrast and branch length of node
                    *node_id* computed from its children *c1* and *c2*."""

                    b1, b2 = branchlengths[c1], branchlengths[c2]
                    rb1 = 1.0 / b1
                    rb2 = 1.0 / b2
                    # compute variance
                    # (square root of the summed branch lengths; it is the
                    # divisor that normalizes the contrast below)
                    variance = math.sqrt(b1 + b2)

                    # extend branch length of this node to create correct
                    # variance for parent
                    branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2)
                    variances[node_id] = variance

                    for c in range(ncolumns):
                        v1, v2 = values[c1][c], values[c2][c]
                        # save ancestral value as weighted mean
                        values[node_id][c] = (
                            (rb1 * v1 + rb2 * v2)) / (rb1 + rb2)
                        # compute normalized contrast
                        contrasts[node_id][c] = (v1 - v2) / variance

                def update_contrasts(node_id):
                    """update contrasts for a node."""
                    node = tree.node(node_id)
                    if node.succ:
                        if len(node.succ) == 2:
                            c1, c2 = node.succ
                            update_data(
                                node_id, node.data.branchlength, c1, c2)
                        else:
                            # only the root may have three children: join
                            # the first two, then combine the root with
                            # the third child in the dummy node
                            assert(node_id == tree.root)
                            assert(len(node.succ) == 3)
                            update_data(
                                node_id, node.data.branchlength,
                                node.succ[0], node.succ[1])
                            update_data(
                                max_index - 1, node.data.branchlength,
                                node_id, node.succ[2])
                    else:
                        # leaf: take the observed values from the table
                        for c in range(ncolumns):
                            values[node_id][c] = float(
                                data[map_node2data[node_id]][c + 1])

                        branchlengths[node_id] = node.data.branchlength

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write(
                    "node_id\tvariance\t%s\n" % "\t".join(headers))
                for node_id in range(max_index):
                    # nodes without a contrast (e.g. leaves) are not reported
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[
                            node_id],
                        "\t".join(
                            options.value_format % v
                            for v in contrasts[node_id]),
                    ))

    E.Stop()
    x = np.asarray(xr)
    ggmod = Garch(x-x.mean())
    ggmod.nar = 1
    ggmod.nma = 1
    ggmod._start_params = np.array([-0.6, 0.1, 0.2, 0.0])
    ggres = ggmod.fit(start_params=np.array([-0.6, 0.1, 0.2, 0.0]), maxiter=1000)
    print 'ggres.params', ggres.params

    g11res = optimize.fmin(lambda params: -loglike_GARCH11(params, x-x.mean())[0], [0.6, 0.6, 0.2])
    print g11res
    llf = loglike_GARCH11(g11res, x-x.mean())
    print llf[0]

    garchplot(ggmod.errorsest, ggmod.h, title='Garch estimated')
    fit = r.garchFit(f, data = x-x.mean(), include_mean=False, trace=False)
    print r.summary(fit)

'''based on R default simulation
model = list(omega = 1e-06, alpha = 0.1, beta = 0.8)
nobs = 1000
(with nobs=500, gjrgarch doesn't do well

>>> ggres = ggmod.fit(start_params=np.array([-0.6, 0.1, 0.2, 0.0]), maxiter=1000)
Optimization terminated successfully.
         Current function value: -448.861335
         Iterations: 385
         Function evaluations: 690
>>> print 'ggres.params', ggres.params
ggres.params [ -7.75090330e-01   1.57714749e-01  -9.60223930e-02   8.76021411e-07]
rearranged
8.76021411e-07 1.57714749e-01(-9.60223930e-02) 7.75090330e-01
예제 #8
0
    ggmod.nma = 1
    ggmod._start_params = np.array([-0.6, 0.1, 0.2, 0.0])
    ggres = ggmod.fit(start_params=np.array([-0.6, 0.1, 0.2, 0.0]),
                      maxiter=1000)
    print('ggres.params', ggres.params)

    g11res = optimize.fmin(
        lambda params: -loglike_GARCH11(params, x - x.mean())[0],
        [0.6, 0.6, 0.2])
    print(g11res)
    llf = loglike_GARCH11(g11res, x - x.mean())
    print(llf[0])

    garchplot(ggmod.errorsest, ggmod.h, title='Garch estimated')
    fit = r.garchFit(f, data=x - x.mean(), include_mean=False, trace=False)
    print(r.summary(fit))
'''based on R default simulation
model = list(omega = 1e-06, alpha = 0.1, beta = 0.8)
nobs = 1000
(with nobs=500, gjrgarch doesn't do well

>>> ggres = ggmod.fit(start_params=np.array([-0.6, 0.1, 0.2, 0.0]), maxiter=1000)
Optimization terminated successfully.
         Current function value: -448.861335
         Iterations: 385
         Function evaluations: 690
>>> print('ggres.params', ggres.params)
ggres.params [ -7.75090330e-01   1.57714749e-01  -9.60223930e-02   8.76021411e-07]
rearranged
8.76021411e-07 1.57714749e-01(-9.60223930e-02) 7.75090330e-01
예제 #9
0
    def __init__(self, y, design, model_type=r.lm, **kwds):
        ''' Set up and estimate R model with data and design.

        The model ``y ~ x.1+...+x.k-1`` is fitted through rpy, where ``k`` is
        the number of columns of ``design``, and the pieces of the fit and of
        its R ``summary`` are copied onto attributes of this object.

        y -- response values; stored as ``np.array(y)``.
        design -- design matrix; stored as ``np.array(design)``.  No
            intercept is added because it is expected to be part of
            the design already.
        model_type -- R fitting function called as
            ``model_type(formula, data=frame, **kwds)``.
        kwds -- extra keyword arguments forwarded to ``model_type``.
        '''
        r.library('MASS')  # still needs to be in test, but also here for
        # logical tests at the end not to show an error
        self.y = np.array(y)
        self.design = np.array(design)
        self.model_type = model_type
        self._design_cols = [
            'x.%d' % (i + 1) for i in range(self.design.shape[1])
        ]
        # Note the '-1' for no intercept - this is included in the design
        self.formula = r('y ~ %s-1' % '+'.join(self._design_cols))
        self.frame = r.data_frame(y=y, x=self.design)
        # Fit without conversion so the raw Robj is returned.  The mode is
        # global rpy state, so it is restored even when the fit raises.
        rpy.set_default_mode(rpy.NO_CONVERSION)
        try:
            results = self.model_type(self.formula, data=self.frame, **kwds)
        finally:
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
        self.robj = results  # keep the Robj model so it can be
        # used in the tests
        rsum = r.summary(results)
        self.rsum = rsum
        # Provide compatible interface with scipy models
        self.results = results.as_py()

        #        coeffs = self.results['coefficients']
        #        self.beta0 = np.array([coeffs[c] for c in self._design_cols])
        residuals = self.results['residuals']
        self.nobs = len(residuals)
        if isinstance(residuals, dict):
            # Keys are 1-based observation labels; place each value at its
            # 0-based position.
            self.resid = np.zeros(len(residuals))
            for label, value in residuals.items():
                self.resid[int(label) - 1] = value
        else:
            self.resid = residuals
        self.fittedvalues = self.results['fitted.values']
        self.df_resid = self.results['df.residual']
        coefficients = rsum['coefficients']
        self.params = coefficients[:, 0]
        self.bse = coefficients[:, 1]
        self.bt = coefficients[:, 2]
        try:
            self.pvalues = coefficients[:, 3]
        except IndexError:
            # The summary has no fourth (p-value) column; as before, the
            # attribute is simply left unset in that case.
            pass
        # setdefault (not get) is kept on purpose: it also records the
        # missing keys as None in self.rsum, exactly as the code always did.
        self.rsquared = rsum.setdefault('r.squared', None)
        self.rsquared_adj = rsum.setdefault('adj.r.squared', None)
        self.aic_R = rsum.setdefault('aic', None)
        self.fvalue = rsum.setdefault('fstatistic', None)
        if self.fvalue and isinstance(self.fvalue, dict):
            self.fvalue = self.fvalue.setdefault('value', None)  # for wls
        df = rsum.setdefault('df', None)
        if df:  # for RLM, works for other models?
            self.df_model = df[0] - 1  # R counts intercept
            self.df_resid = df[1]
        self.bcov_unscaled = rsum.setdefault('cov.unscaled', None)
        self.bcov = rsum.setdefault('cov.scaled', None)
        if 'sigma' in rsum:
            self.scale = rsum['sigma']
        elif 'dispersion' in rsum:
            self.scale = rsum['dispersion']
        else:
            self.scale = None
        self.llf = r.logLik(results)

        # Model-specific extras.
        if model_type == r.glm:
            self.getglm()
        if model_type == r.rlm:
            self.getrlm()
예제 #10
0
    x = np.asarray(xr)
    ggmod = Garch(x-x.mean())
    ggmod.nar = 1
    ggmod.nma = 1
    ggmod._start_params = np.array([-0.6, 0.1, 0.2, 0.0])
    ggres = ggmod.fit(start_params=np.array([-0.6, 0.1, 0.2, 0.0]), maxiter=1000)
    print('ggres.params', ggres.params)

    g11res = optimize.fmin(lambda params: -loglike_GARCH11(params, x-x.mean())[0], [0.6, 0.6, 0.2])
    print(g11res)
    llf = loglike_GARCH11(g11res, x-x.mean())
    print(llf[0])

    garchplot(ggmod.errorsest, ggmod.h, title='Garch estimated')
    fit = r.garchFit(f, data = x-x.mean(), include_mean=False, trace=False)
    print(r.summary(fit))

'''based on R default simulation
model = list(omega = 1e-06, alpha = 0.1, beta = 0.8)
nobs = 1000
(with nobs=500, gjrgarch doesn't do well

>>> ggres = ggmod.fit(start_params=np.array([-0.6, 0.1, 0.2, 0.0]), maxiter=1000)
Optimization terminated successfully.
         Current function value: -448.861335
         Iterations: 385
         Function evaluations: 690
>>> print('ggres.params', ggres.params)
ggres.params [ -7.75090330e-01   1.57714749e-01  -9.60223930e-02   8.76021411e-07]
rearranged
8.76021411e-07 1.57714749e-01(-9.60223930e-02) 7.75090330e-01
예제 #11
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: data2phylocontrasts.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header",
                      dest="add_header",
                      action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--write-header",
                      dest="write_header",
                      action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug",
                      dest="debug",
                      action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree",
                      dest="display_tree",
                      action="store_true",
                      help="display the tree")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        options.columns = map(lambda x: int(x) - 1, options.columns.split(","))

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    ##########################################################
    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line[0] == "#":
            continue
        d = line[:-1].strip().split("\t")
        if first:
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    ##########################################################
    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r"))

    if not nexus:
        raise ValueError("please provide trees with branchlenghts")

    ##########################################################
    ##########################################################
    ##########################################################
    # set up phylip
    phylip_options = []
    # print out contrasts
    phylip_options.append("C")
    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    ##########################################################
    ##########################################################
    ##########################################################
    # main loop
    ##########################################################
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):

                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                n = len(result.mContrasts)
                columns = []
                for c in range(ncolumns):
                    columns.append(map(lambda x: x[c], result.mContrasts))

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        import rpy
                        from rpy import r as R

                        # Various ways to calculate r. It is not possible to use
                        # cor.test or lsfit directly, as you have to perform a
                        # regression through the origin.

                        # uncomment to check pearson r against phylip's value
                        ## r = calculateCorrelationCoefficient( columns[x], columns[y] )

                        # for significance, use linear regression models in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        linear_model = R.lm(R("y ~ x - 1"),
                                            data=R.data_frame(x=columns[x],
                                                              y=columns[y]))
                        rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        # extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write("\t".join(
                            (headers[x], headers[y], options.value_format %
                             phy_r, options.pvalue_format % p, code)) + "\n")

            elif method == "contrasts":

                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    options.stdout.write(
                        "\t".join(map(lambda x: options.value_format % x, d)) +
                        "\n ")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2
                variances = [None] * max_index
                values = [[None] * nrows for x in range(max_index)]
                contrasts = []
                for x in range(max_index):
                    contrasts.append([None] * ncolumns)
                branchlengths = [None] * max_index

                def update_data(node_id, bl, c1, c2):
                    """Combine the two children ``c1`` and ``c2`` into ``node_id``.

                    Reads the branch lengths and per-column values stored
                    for both children and records for ``node_id``:

                    * ``branchlengths``: ``bl`` extended by
                      ``b1 * b2 / (b1 + b2)``,
                    * ``variances``: the normalisation factor
                      ``sqrt(b1 + b2)``,
                    * ``values``: the mean of the child values weighted by
                      the inverse branch lengths,
                    * ``contrasts``: the child difference divided by the
                      normalisation factor.
                    """
                    len1 = branchlengths[c1]
                    len2 = branchlengths[c2]

                    # inverse branch lengths serve as weights of the mean
                    weight1 = 1.0 / len1
                    weight2 = 1.0 / len2
                    weight_sum = weight1 + weight2

                    # factor used to normalise the contrasts of this node
                    scale = math.sqrt(len1 + len2)

                    # lengthen the branch above this node so that the parent
                    # sees the correct variance
                    extended = bl + (len1 * len2) / (len1 + len2)
                    branchlengths[node_id] = extended
                    variances[node_id] = scale

                    node_values = values[node_id]
                    node_contrasts = contrasts[node_id]
                    for col in range(ncolumns):
                        first = values[c1][col]
                        second = values[c2][col]
                        # ancestral value: weighted mean of both children
                        node_values[col] = (
                            weight1 * first + weight2 * second) / weight_sum
                        # normalised contrast between both children
                        node_contrasts[col] = (first - second) / scale

                def update_contrasts(node_id):
                    """Fill in values and branch length for one tree node.

                    A node without successors copies its values from the
                    input table (columns shifted by one) and takes its
                    branch length from the tree. A node with two successors
                    is combined via ``update_data``. Any other node must be
                    the root with exactly three successors: the first two
                    are combined into the root, then the root and the third
                    successor are combined into the extra slot
                    ``max_index - 1``.
                    """
                    node = tree.node(node_id)
                    children = node.succ

                    if not children:
                        # leaf: take the observed values from the data table
                        for col in range(ncolumns):
                            values[node_id][col] = float(
                                data[map_node2data[node_id]][col + 1])
                        branchlengths[node_id] = node.data.branchlength
                        return

                    if len(children) == 2:
                        left, right = children
                        update_data(node_id, node.data.branchlength,
                                    left, right)
                        return

                    # only the root may have three successors
                    assert (node_id == tree.root)
                    assert (len(children) == 3)
                    update_data(node_id, node.data.branchlength,
                                children[0], children[1])
                    update_data(max_index - 1, node.data.branchlength,
                                node_id, children[2])

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write("node_id\tvariance\t%s\n" %
                                     "\t".join(headers))
                for node_id in range(max_index):
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[node_id],
                        "\t".join(
                            map(lambda x: options.value_format % x,
                                contrasts[node_id])),
                    ))

    E.Stop()
예제 #12
0
	def lm_fit(self, lm_instance, go_no2prediction_space, bit_string, curs=None, lm_table=None):
		"""
		Fit one model per go_no with R's glm() and collect coefficients and p-values.

		For every go_no whose prediction space holds more than 50 rows, the rows
		are converted into an R data frame and is_correct is regressed on the
		predictors enabled in bit_string. glm() is called with the binomial
		family when self.logistic is set and without a family argument otherwise.

		Parameters:
			lm_instance, curs, lm_table: accepted but not used in this method.
			go_no2prediction_space: mapping go_no -> list of rows. Columns 0-4 are
				read as p_value, recurrence, connectivity, cluster_size and
				connectivity_2nd; the last column is read as is_correct.
			bit_string: string of '0'/'1' flags, position i enables
				coeff_name_list[i] as a predictor.

		Returns:
			dictionary go_no -> [coeff_list, coeff_p_value_list, 1].
			Both lists have 7 slots: slot 0 is the intercept, slot i+1 belongs to
			bit_string position i. Slots of disabled predictors keep the default
			(0 for coefficients, 1 for p-values). The trailing 1 is the
			score_cut_off, replaced later in get_score_cut_off().

		History:
		02-28-05
			linear model fitting here

		03-08-05
			grouping and accumulating before do linear model fitting, see log of 2005,
			section 'linear model overfitting' for detail.
		03-27-05
			Use glm of R to do logistic regression
		06-30-05
			add cluster_size
			add bit_string to control which parameter should be enabled.
		07-04-05
			add connectivity_2nd
		07-06-05
			add logistic
		11-09-05 extend coeff_list and coeff_p_value_list
			restructure the list, go_no2lm_results[go_no]

			--data_prepare
			--submit
		"""
		sys.stderr.write("Linear Model Fitting...\n")
		go_no2lm_results = {}

		#06-30-05	setup the formula_list based on bit_string
		coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'connectivity_2nd']
		formula_list = [coeff_name_list[i] for i, bit in enumerate(bit_string) if bit == '1']
		#the formula text is the same for every go_no, so build it only once
		formula_text = "is_correct~%s"%'+'.join(formula_list)

		for (go_no,data) in go_no2prediction_space.iteritems():
			sys.stderr.write("%s prediction entries from %s.\n"%(len(data), go_no))

			if len(data)<=50:
				#too few data
				continue
			#convert it to a 2d array
			data = array(data)

			set_default_mode(NO_CONVERSION) #04-07-05
			try:
				data_frame = r.as_data_frame({"p_value":data[:,0], "recurrence":data[:,1], "connectivity":data[:,2], \
					"cluster_size":data[:,3], "connectivity_2nd":data[:,4], "is_correct":data[:,-1]})	#06-30-05	-1 denotes is_correct
				formula = r(formula_text)	#06-30-05 use formula_list
				if self.logistic:
					lm_result = r.glm(formula, data=data_frame, family=r("binomial"))
				else:
					lm_result = r.glm(formula, data=data_frame)
			finally:
				#always restore the global conversion mode, also when R raises,
				#otherwise later rpy calls would keep running in NO_CONVERSION
				set_default_mode(BASIC_CONVERSION) #04-07-05

			#04-07-05 r.summary() requires lm_result in NO_CONVERSION state
			summary_stat = r.summary(lm_result)
			coefficients = summary_stat['coefficients']
			if self.debug:
				sys.stdout.write("everything about coefficients from function %s is\n"%(go_no,))
				sys.stdout.write("%s\n"%(coefficients,))	#p-values of coefficients

			#11-09-05 extend coeff_list and coeff_p_value_list
			coeff_list = [0]*7	#intercept, p_value, recurrence, connectivity, cluster_size
			coeff_p_value_list = [1]*7
			#06-30-05	0 in summary_stat['coefficients'] is intercept
			coeff_list[0] = coefficients[0][0]	#0 is the coefficient
			coeff_p_value_list[0] = coefficients[0][-1]	#-1 is the corresponding p-value
			#06-30-05	fill in other efficients based on bit_string, NOTE i+1
			index = 0	#06-30-05	the pointer for summary_stat
			for i, bit in enumerate(bit_string):
				if bit == '1':
					index += 1
					row = coefficients[index]
					coeff_list[i+1] = row[0]	#0 is the coefficient
					coeff_p_value_list[i+1] = row[-1]	#-1 is the corresponding p-value
			#11-09-05 restructure the following list
			go_no2lm_results[go_no] = [coeff_list, coeff_p_value_list, 1]	#the last entry is score_cut_off, replaced later in get_score_cut_off()
		sys.stderr.write("done.\n")
		return go_no2lm_results
예제 #13
0
                        import rpy
                        from rpy import r as R
                        
                        ## Various ways to calculate r. It is not possible to use
                        ## cor.test or lsfit directly, as you have to perform a
                        ## regression through the origin.
                        
                        ## uncomment to check pearson r against phylip's value
                        ## r = calculateCorrelationCoefficient( columns[x], columns[y] )

                        ## for significance, use linear regression models in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        linear_model = R.lm(R("y ~ x - 1"), data = R.data_frame(x=columns[x], y=columns[y]))
                        rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        ## extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write( "\t".join( (headers[x], headers[y],
                                                          options.value_format % phy_r,