示例#1
0
    def _prune(self, fit, p_max):
        """
        Remove statistically insignificant parameters from a fit.

        The first parameter whose t-statistic p-value exceeds ``p_max`` is
        dropped and the model is re-estimated on ``self.data_frame``; this is
        repeated until no such parameter remains. The intercept is never
        dropped, whatever its p-value.

        Parameters
        ----------
        fit: fm.ols fit object
            Can contain insignificant parameters
        p_max : float
            Maximum allowed probability of the t-statistic

        Returns
        -------
        fit: fm.ols fit object
            Won't contain any insignificant parameters (intercept excepted)

        """

        def insignificant_params(current_fit):
            """Names of the non-intercept parameters with p-value > p_max."""
            exceeding = current_fit.pvalues.where(
                current_fit.pvalues > p_max).dropna().index.tolist()
            # 'Intercept' only shows up here when the intercept itself is
            # insignificant; filtering (instead of list.remove) avoids the
            # ValueError raised when it is absent.
            return [name for name in exceeding if name != 'Intercept']

        # work on copies of the term lists so the incoming fit is not altered
        model_desc = ModelDesc(
            fit.model.formula.lhs_termlist[:], fit.model.formula.rhs_termlist[:])
        to_prune = insignificant_params(fit)

        while to_prune:
            # drop one term per round: p-values change after each re-fit
            model_desc.rhs_termlist.remove(Term([LookupFactor(to_prune[0])]))
            fit = fm.ols(model_desc, data=self.data_frame).fit()
            to_prune = insignificant_params(fit)

        return fit
示例#2
0
def _build_targets(formula, data):
    """Return the response variable of ``formula`` evaluated on ``data`` as a flat array."""
    # a description with an empty right-hand side: only the lhs is evaluated
    lhs_only = ModelDesc(formula.lhs_termlist, [])
    targets, _ = dmatrices(lhs_only, data)

    return np.array(np.ravel(targets))
示例#3
0
    def _do_analysis_no_cross_validation(self):
        """
        Forward-select regressors by BIC.

        Appends every accepted fit to self._list_of_fits and stores the last
        one in self._fit.
        """

        lhs = [Term([LookupFactor(self.y)])]
        # candidates that have not been added to the model yet
        remaining = {
            name: Term([LookupFactor(name)])
            for name in self.list_of_x
        }

        # starting point is the mean: an empty Term is the intercept
        baseline = ModelDesc(lhs, [Term([])])
        self._list_of_fits.append(fm.ols(baseline, data=self.df).fit())

        # keep extending the model until a round brings no improvement
        while remaining:
            ref_fit = self._list_of_fits[-1]
            ref_terms = ref_fit.model.formula.rhs_termlist
            # the fit of the previous round is the one to beat
            champion = ref_fit
            champion_bic = ref_fit.bic
            for name, term in remaining.items():
                candidate_desc = ModelDesc(lhs, ref_terms + [term])
                candidate = fm.ols(candidate_desc, data=self.df).fit()
                if candidate.bic < champion_bic:
                    champion_bic = candidate.bic
                    champion = candidate
                    best_x = name

            # A lower BIC does not guarantee that every parameter is
            # significant: drop the insignificant ones and estimate again.
            champion = self._prune(champion, p_max=self.p_max)

            # stop when the pruned winner has as many terms as the reference
            if len(champion.model.formula.rhs_termlist) == len(ref_terms):
                break

            self._list_of_fits.append(champion)
            remaining.pop(best_x)

        self._fit = self._list_of_fits[-1]
示例#4
0
def create_patsy_model(dependent_variable, independent_variables, transformations=None, interactions=None):
    '''
    Construct and return a patsy ModelDesc (object representation of a formula).

    Parameters
    ----------
    dependent_variable : str or dict
        Variable name, or a dict carrying it under the key 'name'.
    independent_variables : list
        Each entry is a variable name, a {'name': ...} dict, or a list of
        those; a list entry becomes a single multi-factor term.
    transformations : dict, optional
        Maps a variable name to a key of ``transformation_to_format_string``.
        For 'square' the untransformed term is added alongside the
        transformed one.
    interactions : list of lists, optional
        Each inner list (names or {'name': ...} dicts) becomes one
        multi-factor term.

    Returns
    -------
    ModelDesc
        Intercept first, then the independent variables in order, then the
        interaction terms.
    '''

    def _name_of(var):
        # Unwrap only real dicts. A plain `'name' in var` is a substring test
        # on strings, so a variable called e.g. "username" used to end up in
        # `"username"['name']` and raise TypeError.
        if isinstance(var, dict) and 'name' in var:
            return var['name']
        return var

    transformations = transformations or {}
    interactions = interactions or []

    # 1) Handling passing in [{'name': X}] vs [X]
    # Every entry is normalised on its own, so mixed inputs and an empty
    # list of independent variables (intercept-only model) are accepted.
    lhs_var = _name_of(dependent_variable)
    rhs_vars = [
        [_name_of(v) for v in iv] if isinstance(iv, list) else _name_of(iv)
        for iv in (independent_variables or [])
    ]
    rhs_interactions = [
        [_name_of(term) for term in interaction]
        for interaction in interactions
    ]

    # 2) Constructing model
    lhs = [Term([LookupFactor(lhs_var)])]

    rhs = [Term([])]  # empty term is the intercept
    for rhs_var in rhs_vars:
        if isinstance(rhs_var, list):
            rhs.append(Term([LookupFactor(term) for term in rhs_var]))
        elif rhs_var in transformations:
            transformation = transformations[rhs_var]
            if transformation == 'square':
                # keep the linear term next to the squared one
                rhs.append(Term([LookupFactor(rhs_var)]))
            format_string = transformation_to_format_string[transformation]
            rhs.append(Term([EvalFactor(format_string.format(rhs_var))]))
        else:
            rhs.append(Term([LookupFactor(rhs_var)]))

    rhs.extend(
        Term([LookupFactor(term) for term in interaction])
        for interaction in rhs_interactions
    )

    return ModelDesc(lhs, rhs)
示例#5
0
def dict_to_model_desc(dictionary):
    """Build a patsy ModelDesc from its dictionary representation.

    ``dictionary['lhs_termlist'][0]`` is the response name and
    ``dictionary['rhs_termlist']`` the regressor names; an empty string
    stands for the intercept.
    """
    response = Term([LookupFactor(dictionary['lhs_termlist'][0])])
    regressors = [
        Term([]) if name == '' else Term([LookupFactor(name)])
        for name in dictionary['rhs_termlist']
    ]

    return ModelDesc([response], regressors)
示例#6
0
    def _prune(self, fit, p_max):
        """
        Remove statistically insignificant parameters from a fit.

        The first parameter whose t-statistic p-value exceeds ``p_max`` is
        dropped and the model is re-estimated on ``self.df``; this is repeated
        until no such parameter remains. The intercept is never dropped.

        Parameters
        ----------
        fit: fm.ols fit object
            Can contain insignificant parameters
        p_max : float
            Maximum allowed probability of the t-statistic

        Returns
        -------
        fit: fm.ols fit object
            Won't contain any insignificant parameters (intercept excepted)

        """
        def remove_from_model_desc(x, model_desc):
            """
            Return a model_desc without x
            """
            kept_terms = [
                t for t in model_desc.rhs_termlist
                # a term without factors is the intercept and always stays;
                # other terms stay unless their first factor is x
                if not t.factors or not x == t.factors[0]._varname
            ]
            return ModelDesc(model_desc.lhs_termlist, kept_terms)

        def insignificant_pars(current_fit):
            """Names of the non-intercept parameters with p-value > p_max."""
            exceeding = current_fit.pvalues.where(
                current_fit.pvalues > p_max).dropna().index.tolist()
            # filtering replaces the former `try: remove(...) except: pass`,
            # which silently swallowed every exception type
            return [name for name in exceeding if name != 'Intercept']

        corrected_model_desc = ModelDesc(fit.model.formula.lhs_termlist[:],
                                         fit.model.formula.rhs_termlist[:])
        pars_to_prune = insignificant_pars(fit)
        while pars_to_prune:
            # one parameter per round: p-values change after each re-fit
            corrected_model_desc = remove_from_model_desc(
                pars_to_prune[0], corrected_model_desc)
            fit = fm.ols(corrected_model_desc, data=self.df).fit()
            pars_to_prune = insignificant_pars(fit)
        return fit
示例#7
0
    def _modeldesc_from_dict(self, d):
        """Build a patsy ModelDesc from its dictionary representation.

        ``d['lhs_termlist'][0]`` is the response name and ``d['rhs_termlist']``
        the regressor names; an empty string stands for the intercept.
        """
        response = Term([LookupFactor(d['lhs_termlist'][0])])
        regressors = [
            Term([]) if name == '' else Term([LookupFactor(name)])
            for name in d['rhs_termlist']
        ]

        return ModelDesc([response], regressors)
示例#8
0
        def remove_from_model_desc(x, model_desc):
            """
            Build a new ModelDesc equal to model_desc minus the term for x.

            A term is dropped when the variable name of its first factor
            equals x; the intercept (a term without factors) is always kept.
            """
            kept_terms = [
                term for term in model_desc.rhs_termlist
                if not term.factors or not x == term.factors[0]._varname
            ]

            return ModelDesc(model_desc.lhs_termlist, kept_terms)
示例#9
0
def build_model_desc(snps, no_interactions):
    """
    Creates the model description (formula)
    :param snps: The selected snp labels
    :param no_interactions: If true, only main effects are included;
        otherwise every pairwise interaction is added as well
    :return: The model description (empty left-hand side)
    """
    x_terms = []
    for position, label in enumerate(snps):
        # Main effect
        main_factor = EvalFactor(label)
        x_terms.append(Term([main_factor]))

        if no_interactions:
            continue

        # Interaction effects with every later snp
        x_terms.extend(
            Term([main_factor, EvalFactor(partner)])
            for partner in snps[position + 1:]
        )

    return ModelDesc([], x_terms)
示例#10
0
    def _do_analysis_cross_validation(self):
        """
        Find the best model (fit) based on cross-validation (leave one out).

        Starting from the intercept-only model, the candidate from
        self.list_of_x that lowers the mean absolute leave-one-out error the
        most is added in each round, until no candidate improves it. Every
        accepted fit (re-estimated on all datapoints) is appended to
        self._list_of_fits, its error to self.list_of_cverrors, and the last
        fit is stored in self._fit.
        """
        # The check only admits small samples, so the message states that
        # limit (it used to read "Minimum 15 datapoints", the opposite of
        # what the condition enforces).
        assert len(self.data_frame) < 15, \
            "Cross-validation is only implemented for fewer than 15 datapoints"

        def loo_cverror(desc):
            """Mean absolute leave-one-out prediction error for desc."""
            errors = []
            for i in self.data_frame.index:
                # fit without row i, then predict row i
                data_frame_ = self.data_frame.drop(i, axis=0)
                fit = fm.ols(desc, data=data_frame_).fit()
                cross_prediction = self._predict(
                    fit=fit, data_frame=self.data_frame.loc[[i], :])
                errors.append(
                    cross_prediction['predicted'] - cross_prediction[self.dependent_var])
            return np.mean(np.abs(np.array(errors)))

        # initialization: first model is the mean, but compute cv correctly.
        response_term = [Term([LookupFactor(self.dependent_var)])]
        model_desc = ModelDesc(response_term, [Term([])])
        initial_cverror = loo_cverror(model_desc)

        self._list_of_fits = [fm.ols(model_desc, data=self.data_frame).fit()]
        self.list_of_cverrors = [initial_cverror]

        # try to improve the model until no improvements can be found
        all_model_terms_dict = {x: Term([LookupFactor(x)])
                                for x in self.list_of_x}
        while all_model_terms_dict:
            # try each remaining x and remember the best one; only the best
            # of the round is stored, at the end of the round
            better_model_found = False
            best = {
                "fit": self._list_of_fits[-1],
                "cverror": self.list_of_cverrors[-1]
            }
            for value, term in all_model_terms_dict.items():
                model_desc = ModelDesc(
                    response_term, self._list_of_fits[-1].model.formula.rhs_termlist + [term])
                # cross_validation, currently only implemented for monthly data
                cverror = loo_cverror(model_desc)
                # compare the model with the current fit
                if cverror < best['cverror']:
                    # better model, keep it
                    # first, reidentify using all the datapoints
                    best['fit'] = fm.ols(
                        model_desc, data=self.data_frame).fit()
                    best['cverror'] = cverror
                    better_model_found = True
                    best_val = value

            if not better_model_found:
                # if we did not find a better model, exit
                break

            self._list_of_fits.append(best['fit'])
            self.list_of_cverrors.append(best['cverror'])

            # next iteration with the found exog removed
            all_model_terms_dict.pop(best_val)

        self._fit = self._list_of_fits[-1]
示例#11
0
# Run the data-preparation script in this namespace. `execfile` is a Python 2
# builtin. NOTE(review): `rawdf`, `orgfeatures` and `pd` are used below but not
# defined in this snippet - presumably they come from the executed script.
execfile("code/03-DataPrep.py")

# NOTE(review): roc_curve and SelectFromModel are not used in the visible part.
from patsy import dmatrices, ModelDesc, Term, LookupFactor
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.feature_selection import SelectFromModel
import numpy as np

'''
model 1 - logistic regression with L1 regularization
'''
# Formula object: 'rating' on the left, one single-factor term per entry of
# orgfeatures on the right.
formula = ModelDesc([Term([LookupFactor('rating')])], [Term([LookupFactor(c)]) for c in orgfeatures])

# Design matrices as DataFrames; the response is flattened to a 1-D array.
y, x = dmatrices(formula, rawdf, return_type="dataframe")
y = y.values.flatten()

logreg = linear_model.LogisticRegression(C=0.1, penalty='l1', tol=0.01)

# Fit on all rows, then score the same estimator with cross_val_score (cv=5).
logreg.fit(x, y)
scores = cross_val_score(logreg, x, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Pair each design-matrix column with its coefficient and list the columns
# whose coefficient is non-zero.
coeffdf = pd.DataFrame({'feature': x.columns, 'coeff': np.transpose(logreg.coef_).flatten()})
nflist = coeffdf[coeffdf.coeff != 0].feature.values.tolist()
print(len(nflist))

# feature selection using best model from cross validation and get the best features
fslogreg = linear_model.LogisticRegressionCV(penalty='l1', solver='liblinear')
fslogreg.fit(x, y)
示例#12
0
def _group_model(spreadsheet=None,
                 contrastdicts=None,
                 variabledicts=None,
                 subjects=None):
    """Build regressors and contrasts for a group-level design.

    The spreadsheet is loaded with ``loadspreadsheet`` and indexed by the
    column whose variabledict has type "id". Continuous columns are demeaned
    (NaN replaced by 0 afterwards), categorical columns are converted to
    pandas categories, and only the rows for ``subjects`` are kept, in that
    order. The design matrix consists of an intercept plus one term for every
    contrastdict of type "infer".

    Parameters
    ----------
    spreadsheet
        Passed to ``loadspreadsheet``.
    contrastdicts : list of dict
        Entries are read via the keys "type" ("infer" or "t"), "variable",
        and for "t" contrasts also "values" and "name".
    variabledicts : list of dict
        Entries are read via the keys "type" ("id", "continuous" or
        "categorical") and "name".
    subjects : list
        Index labels selecting and ordering the rows.

    Returns
    -------
    regressors : dict
        Design-matrix column name -> list of values.
    contrasts : list
        ``(name, "T", column names, weights)`` tuples and
        ``(name, "F", [t contrast tuples])`` tuples.
    contrast_names : list
        Names of the single-row contrasts followed by the F contrasts.

    When the design has at least as many columns as rows, an intercept-only
    design with a single "mean" contrast is returned instead.
    """
    rawdataframe = loadspreadsheet(spreadsheet)

    # the first variable of type "id" names the subject-id column
    id_column = None
    for variabledict in variabledicts:
        if variabledict["type"] == "id":
            id_column = variabledict["name"]
            break

    assert id_column is not None, "Missing id column, cannot specify model"

    rawdataframe[id_column] = pd.Series(rawdataframe[id_column], dtype=str)
    # NOTE(review): str.replace removes every "sub-" occurrence, not only the
    # leading one - confirm ids never contain it elsewhere.
    if all(str(id).startswith("sub-")
           for id in rawdataframe[id_column]):  # for bids
        rawdataframe[id_column] = [
            str(id).replace("sub-", "") for id in rawdataframe[id_column]
        ]
    rawdataframe = rawdataframe.set_index(id_column)

    # columns_in_order keeps the order in which variabledicts lists them
    continuous_columns = []
    categorical_columns = []
    columns_in_order = []
    for variabledict in variabledicts:
        if variabledict["type"] == "continuous":
            continuous_columns.append(variabledict["name"])
            columns_in_order.append(variabledict["name"])
        elif variabledict["type"] == "categorical":
            categorical_columns.append(variabledict["name"])
            columns_in_order.append(variabledict["name"])

    # separate
    continuous = rawdataframe[continuous_columns]
    categorical = rawdataframe[categorical_columns]

    # only keep subjects that are in this analysis
    # also sets order
    continuous = continuous.loc[subjects, :]
    categorical = categorical.loc[subjects, :]

    # Demean continuous for flameo
    continuous -= continuous.mean()

    # replace np.nan by 0 for demeaned_continuous file and regression models
    continuous = continuous.replace({np.nan: 0})

    # change type first to string then to category
    categorical = categorical.astype(str)
    categorical = categorical.astype("category")

    # merge
    dataframe = categorical.join(continuous, how="outer").loc[subjects, :]

    # maintain order
    dataframe = dataframe[columns_in_order]

    # remove zero variance columns
    columns_var_gt_0 = dataframe.apply(pd.Series.nunique) > 1
    dataframe = dataframe.loc[:, columns_var_gt_0]

    # don't need to specify lhs
    lhs = []

    # generate rhs
    rhs = [Term([])]  # force intercept
    for contrastdict in contrastdicts:
        if contrastdict["type"] == "infer":
            # for every term in the model a contrast of type infer needs to be specified
            rhs.append(
                Term([LookupFactor(name)
                      for name in contrastdict["variable"]]))

    # specify patsy design matrix
    modelDesc = ModelDesc(lhs, rhs)
    dmat = dmatrix(modelDesc, dataframe, return_type="dataframe")
    _check_multicollinearity(dmat)

    # prepare lsmeans
    # reference grid: every combination of the levels of the non-numeric
    # columns, with numeric columns fixed at 0.0
    uniqueValuesForCategorical = [(0.0, ) if pd.api.types.is_numeric_dtype(
        dataframe[f].dtype) else dataframe[f].unique()
                                  for f in dataframe.columns]
    grid = pd.DataFrame(list(product(*uniqueValuesForCategorical)),
                        columns=dataframe.columns)
    refDmat = dmatrix(dmat.design_info, grid, return_type="dataframe")

    # data frame to store contrasts
    contrastMats = []

    # one contrast matrix per model term, constraining all of the term's
    # columns to zero
    for field, columnslice in dmat.design_info.term_name_slices.items():
        constraint = {
            column: 0
            for column in dmat.design_info.column_names[columnslice]
        }
        contrast = dmat.design_info.linear_constraint(constraint)
        assert np.all(contrast.variable_names == dmat.columns)
        contrastMat = pd.DataFrame(contrast.coefs, columns=dmat.columns)
        contrastMats.append((field, contrastMat))

    for contrastdict in contrastdicts:
        if contrastdict["type"] == "t":
            (variable, ) = contrastdict["variable"]
            variableLevels = dataframe[variable].unique()
            # Generate the lsmeans matrix where there is one row for each
            # factor level. Each row is a contrast vector.
            # This contrast vector corresponds to the mean of the dependent
            # variable at the factor level.
            # For example, we would have one row that calculates the mean
            # for patients, and one for controls.
            lsmeans = pd.DataFrame(index=variableLevels, columns=dmat.columns)
            for level in variableLevels:
                lsmeans.loc[level, :] = refDmat.loc[grid[variable] ==
                                                    level, :].mean()
            valueDict = contrastdict["values"]
            # levels named in "values" but absent from the data are ignored
            names = [
                name for name in valueDict.keys() if name in variableLevels
            ]
            values = [valueDict[name] for name in names]
            # If we wish to test the mean of each group against zero,
            # we can simply use these contrasts and be done.
            # To test a linear hypothesis such as patient-control=0,
            # which is expressed here as {"patient":1, "control":-1},
            # we translate it to a contrast vector by taking the linear
            # combination of the lsmeans contrasts.
            contrastVector = lsmeans.loc[names, :].mul(values, axis=0).sum()
            contrastMat = pd.DataFrame([contrastVector], columns=dmat.columns)
            contrastMats.append((contrastdict["name"], contrastMat))

    npts, nevs = dmat.shape

    # as many or more regressors than datapoints: fall back to a design that
    # only models the mean
    if nevs >= npts:
        logger.warning("Reverting to simple intercept only design. \n"
                       f"nevs ({nevs}) >= npts ({npts})")
        return (
            {
                "intercept": [1.0] * len(subjects)
            },
            [["mean", "T", ["intercept"], [1]]],
            ["mean"],
        )

    regressors = {d: dmat[d].tolist() for d in dmat.columns}
    contrasts = []
    contrast_names = []

    # single-row contrast matrices become t contrasts
    for contrastName, contrastMat in contrastMats:  # t contrasts
        if contrastMat.shape[0] == 1:
            contrastVec = contrastMat.squeeze()
            contrasts.append((contrastName, "T", list(contrastVec.keys()),
                              list(contrastVec)))

            contrast_names.append(contrastName)

    # multi-row contrast matrices become f contrasts
    for contrastName, contrastMat in contrastMats:  # f contrasts
        if contrastMat.shape[0] > 1:

            tcontrasts = []  # an f contrast consists of multiple t contrasts
            for i, contrastVec in contrastMat.iterrows():
                tname = f"{contrastName}_{i:d}"
                tcontrasts.append(
                    (tname, "T", list(contrastVec.keys()), list(contrastVec)))

            contrasts.extend(tcontrasts)  # add t contrasts to the model
            contrasts.append(
                (contrastName, "F", tcontrasts))  # then add the f contrast

            contrast_names.append(
                contrastName)  # we only care about the f contrast

    return regressors, contrasts, contrast_names
示例#13
0
    chist = chist[Yhist_gpvars + ['HISTBIN', wgtvar]].groupby(
        Yhist_gpvars + ['HISTBIN'], as_index=False).aggregate(np.sum)
    return chist


if __name__ == "__main__":
    ## Build the regression formula
    catvars = list(cfg.flevels.keys())
    with open("fpaths.json") as fpj:
        FPATHS = json.load(fpj)
    numvar_evals = ["I(YEAR - 2000)", "INCTOT99"]
    catvar_evals = [
        "C(" + cv + ", Treatment, levels=cfg.flevels['" + cv + "'])"
        for cv in catvars
    ]
    desc = ModelDesc([], [Term([EvalFactor(v)]) for v in numvar_evals])
    desc.rhs_termlist += [Term([EvalFactor(v)]) for v in catvar_evals]
    # Interactions
    interact_order = 2
    catvar_interact = ['SEX', 'AGECAT', 'RACE']
    print("Including all order-" + str(interact_order) +
          " interactions of the following variables:\n\t" +
          ", ".join(catvar_interact + numvar_evals))
    interact_evals = numvar_evals + [
        catvar_evals[i] for i in [catvars.index(v) for v in catvar_interact]
    ]
    desc.rhs_termlist += [
        Term([EvalFactor(v) for v in list(comb)])
        for comb in combinations(interact_evals, interact_order)
    ]
    # 'implied decimals'
示例#14
0
def scatterfit(x,
               y,
               method='pearson',
               adjustVars=[],
               labelLookup={},
               plotLine=True,
               annotateFit=True,
               annotatePoints=False,
               returnModel=False,
               lc='gray',
               **kwargs):
    """Scatter plot of x vs. y with a fitted line overlaid.

    Expects x and y as pd.Series but will accept arrays.

    Prints covariate unadjusted AND adjusted rho/pvalues on the figure.
    Plots covariate unadjusted data.

    The Series passed in are not modified: when a name has to be changed
    (missing, or identical for x and y) a renamed copy is used.

    Parameters
    ----------
    x,y : ndarrays or pd.Series
    method : string
        'pearson' draws a Gaussian GLM fit line, 'spearman' joins the sorted
        values; the same value is passed on to partialcorr.
    adjustVars : list
        Covariates; each is concatenated column-wise to the data and passed
        to partialcorr for the adjusted statistics.
    labelLookup : dict
        Optional display names for the axis labels, keyed by series name.
    plotLine : bool
    annotateFit : bool
    annotatePoints : bool
        Label every point with its index value.
    returnModel : bool
    lc : color
        Color of the fitted line.
    kwargs : additional keyword arguments
        Passed to the plot function for the data points. 'mec', 'mfc' and
        'ms' default to 'k', 'k' and 5.

    Returns
    -------
    model : statsmodels GLM object or None
        Only returned when returnModel is True. It is None when no model was
        fitted, i.e. unless method is 'pearson' and plotLine is True."""

    # default marker styling; values supplied by the caller win
    kwargs.setdefault('mec', 'k')
    kwargs.setdefault('mfc', 'k')
    kwargs.setdefault('ms', 5)

    # Try to force X and Y into pandas.Series objects
    if not isinstance(x, pd.Series):
        x = pd.Series(x, name='X')
    if not isinstance(y, pd.Series):
        y = pd.Series(y, name='Y')

    # Unnamed series fall back to 'X'/'Y'; the names are used as column
    # labels below, so they must exist and must differ.
    xlab = 'X' if x.name is None else x.name
    ylab = 'Y' if y.name is None else y.name
    if xlab == ylab:
        ylab = 'y_' + str(ylab)
        xlab = 'x_' + str(xlab)
    # rename copies so that the caller's objects keep their names
    if x.name != xlab:
        x = x.copy()
        x.name = xlab
    if y.name != ylab:
        y = y.copy()
        y.name = ylab

    tmpDf = pd.concat((
        x,
        y,
    ), axis=1, join='inner')
    for av in adjustVars:
        tmpDf = pd.concat((tmpDf, pd.DataFrame(av)), axis=1)
    # Drop any row with a nan in any column
    tmpDf = tmpDf.dropna(axis=0, how='any')

    plt.gca().set_xmargin(0.2)
    plt.gca().set_ymargin(0.2)

    # Print unadjusted AND adjusted rho/pvalues
    # Plot unadjusted data with fit though...
    unrho, unp = partialcorr(tmpDf[xlab], tmpDf[ylab], method=method)

    # stays None unless the pearson branch below fits a GLM; without this,
    # returnModel=True raised a NameError in every other case
    model = None

    if method == 'spearman' and plotLine:
        # monotone line through the sorted values, falling when rho <= 0
        if unrho > 0:
            plt.plot(sorted(tmpDf[xlab]), sorted(tmpDf[ylab]), '-', color=lc)
        else:
            plt.plot(sorted(tmpDf[xlab]),
                     sorted(tmpDf[ylab], reverse=True),
                     '-',
                     color=lc)
    elif method == 'pearson' and plotLine:
        formula_like = ModelDesc(
            [Term([LookupFactor(ylab)])],
            [Term([]), Term([LookupFactor(xlab)])])

        Y, X = dmatrices(formula_like, data=tmpDf, return_type='dataframe')
        model = sm.GLM(Y, X, family=sm.families.Gaussian())
        results = model.fit()
        # the line is drawn between the points with the smallest and the
        # largest x value
        mnmxi = np.array([tmpDf[xlab].idxmin(), tmpDf[xlab].idxmax()])
        plt.plot(tmpDf[xlab][mnmxi],
                 results.fittedvalues[mnmxi],
                 '-',
                 color=lc)

    plt.plot(tmpDf[xlab], tmpDf[ylab], 'o', **kwargs)

    if annotatePoints:
        annotationParams = dict(xytext=(0, 5),
                                textcoords='offset points',
                                size='medium')
        # loop names chosen so that the x/y parameters are not shadowed
        for px, py, lab in zip(tmpDf[xlab], tmpDf[ylab], tmpDf.index):
            plt.annotate(lab, xy=(px, py), **annotationParams)

    if annotateFit:
        if unp > 0.001:
            s = 'p = %1.3f\nrho = %1.2f\nn = %d' % (unp, unrho, tmpDf.shape[0])
        else:
            s = 'p = %1.1e\nrho = %1.2f\nn = %d' % (unp, unrho, tmpDf.shape[0])
        textTL(plt.gca(), s, color='black')

        if len(adjustVars) > 0:
            rho, p = partialcorr(tmpDf[xlab],
                                 tmpDf[ylab],
                                 adjust=adjustVars,
                                 method=method)
            if p > 0.001:
                s = 'adj-p = %1.3f\nadj-rho = %1.2f\nn = %d' % (p, rho,
                                                                tmpDf.shape[0])
            else:
                s = 'adj-p = %1.1e\nadj-rho = %1.2f\nn = %d' % (p, rho,
                                                                tmpDf.shape[0])

            textTR(plt.gca(), s, color='red')

    plt.xlabel(labelLookup.get(xlab, xlab))
    plt.ylabel(labelLookup.get(ylab, ylab))
    if returnModel:
        return model
示例#15
0
    import extensions.patsy
    import common

    from patsy import ModelDesc, Term, dmatrix

    datafile = os.path.join(common.data_dir, "child.iq", 'kidiq.csv')

    kw = {
        'dtype': {
            'mom_hs': 'category',
            'mom_work': 'category',
        }
    }
    df = pd.read_csv(datafile, **kw)
    """
        Create a ModelDesc object capable of displaying readable column names.  Compare 
        with:

            Xu = dmatrix(" ~ mom_hs + C(mom_work,extensions.patsy.FullRankOneHot)", data=df, return_type='dataframe')

    """
    factors = [
        extensions.patsy.LookupFactor('mom_hs'),
        extensions.patsy.EvalFactorRenamed(
            'C(mom_work,extensions.patsy.FullRankOneHot)').set_name('mom_work')
    ]
    terms = [Term([])] + [Term([f]) for f in factors]
    desc = ModelDesc([], terms)

    X = dmatrix(desc, df, return_type='dataframe')
示例#16
0
 def redraw(self):
     """Refit the OLS model for the selected terms and draw a coefficient bar chart.

     The term names come from the interaction list widget when the
     "include all" box is checked, otherwise from the selected list widget.
     The model is fitted on the rows/columns of DS.Raw selected by DS.Ir and
     DS.Ic, stored in DOE.res, and additionally refitted leave-one-out to
     derive jackknife-style coefficient errors. Depending on the checked
     radio button, the bars are annotated with the statsmodels confidence
     intervals or with the leave-one-out based errors.

     Returns an empty tuple after showing an error dialog when no term is
     selected or when there are more terms than observations.

     NOTE(review): within the visible lines the Figure is created and filled
     but not attached to any widget - the method may continue beyond this
     excerpt.
     """
     variables = []
     if self.includeallcheckBox.isChecked():
         for i in range(self.interactionlistWidget.count()):
             variables.append(self.interactionlistWidget.item(i).text())
     else:
         for i in range(self.selectedlistWidget.count()):
             variables.append(self.selectedlistWidget.item(i).text())
     nX = len(variables)
     if nX < 1:
         QtWidgets.QMessageBox.critical(self,'Error',"Too few variables selected!",\
                                        QtWidgets.QMessageBox.Ok)
         return ()
     Yname = self.YcomboBox.currentText()
     # Lc holds the labels of the active columns, Gc is used as a mask that
     # picks the response columns among them
     Lc = DS.Lc[DS.Ic]
     Gc = DS.Gc[DS.Ic]
     Lcy = Lc[Gc]
     # NOTE(review): unary minus on a mask - if Gc is a boolean array, `~Gc`
     # is the supported negation; confirm the dtype of DS.Gc.
     Lcx = Lc[-Gc]
     data = DS.Raw.loc[DS.Ir, DS.Ic]
     Y = data[Lcy]
     X = data[Lcx]
     if nX > X.shape[0]:
         QtWidgets.QMessageBox.critical(self,'Error',"Factors > Observation! \n Reduce factors.",\
                                        QtWidgets.QMessageBox.Ok)
         return ()
     ny = self.YcomboBox.currentIndex()
     Y = Y.values.astype('float')
     X = X.values.astype('float')
     Y = Y[:, ny]
     nr = len(Y)
     basey = [Term([LookupFactor(Yname)])]
     basex = []
     # NOTE(review): `variables` is modified while being iterated; the element
     # following a removed 'Intercept' is skipped by this loop.
     for term in variables:
         if term == 'Intercept':
             basex = [INTERCEPT]
             variables.remove(term)
     # "a:b" becomes an interaction term; "a:a" becomes the squared term
     for term in variables:
         vterm = term.split(':')
         term_lookup = [LookupFactor(x) for x in vterm]
         if len(term_lookup) > 1:
             if vterm[0] == vterm[1]:
                 term_lookup = [EvalFactor(vterm[0] + ' ** 2')]
         basex.append(Term(term_lookup))
     desc = ModelDesc(basey, basex)
     # rebuild a float DataFrame with the X columns followed by the response
     data = np.column_stack((X, Y))
     columns = Lcx.tolist()
     columns.append(Yname)
     data = pd.DataFrame(data, columns=columns)
     y, mx = dmatrices(desc, data, return_type='dataframe')
     # NOTE(review): dism is not used in the visible part of the method.
     dism = np.linalg.inv(np.dot(mx.T.values, mx.values))
     mod = OLS(y, mx)
     DOE.res = mod.fit()
     # calculation of cross-validation
     ypcv = list()
     rcv = list()
     bres = list()
     loo = LeaveOneOut()
     loo.get_n_splits(mx)
     # NOTE(review): the `.ix` indexer used below (and further down) was
     # removed in pandas 1.0.
     for train_index, test_index in loo.split(mx):
         mx_train = mx.ix[train_index, :]
         mx_test = mx.ix[test_index, :]
         y_train = y.ix[train_index, :]
         y_test = y.ix[test_index, :]
         modcv = OLS(y_train, mx_train)
         rescv = modcv.fit()
         ypcv.append(rescv.predict(mx_test).values[0])
         rcv.append(rescv.predict(mx_test).values[0] - y_test.values[0])
         # squared deviation of the leave-one-out coefficients from the
         # full-data coefficients
         bres.append((rescv.params - DOE.res.params).values**2)
     bres = pd.DataFrame(bres)
     bres = bres.sum() * nr / (nr - 1)
     bres = np.sqrt(bres.values)
     tres = np.abs(DOE.res.params.values / bres)
     # NOTE(review): this evaluates `t.pdf` (a density) where a p-value seems
     # intended; presumably `t` is scipy.stats.t and `t.sf` was meant - confirm.
     pt = 2 * t.pdf(tres, nr)
     fig = Figure()
     ax = fig.add_subplot(111)
     if self.coefradioButton.isChecked():
         # bars with the confidence intervals reported by the full fit
         if DOE.res.params.index[0] == 'Intercept':
             # the intercept goes into the title, not into the bar chart
             ind = np.arange(1, len(DOE.res.params))
             vcol = []
             for i in ind:
                 if (DOE.res.pvalues[i] < 0.05): vcol.append('red')
                 else: vcol.append('blue')
             ax.bar(ind, DOE.res.params[1:], align='center', color=vcol)
             ax.set_title('Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.\
             format(DOE.res.conf_int().ix[0,0],DOE.res.params[0],DOE.res.conf_int().ix[0,1]))
             ax.set_xticklabels(DOE.res.params.index[1:],
                                rotation='vertical')
             cmin = DOE.res.params[1:] - DOE.res.conf_int().ix[1:, 0]
             cmax = DOE.res.conf_int().ix[1:, 1] - DOE.res.params[1:]
             ax.errorbar(ind,
                         DOE.res.params[1:],
                         yerr=[cmin.values, cmax.values],
                         fmt='o',
                         ecolor='green')
         else:
             ind = np.arange(1, len(DOE.res.params) + 1)
             ax.bar(ind, DOE.res.params, align='center')
             ax.set_title('Coefficient Value : None Intercept')
             ax.set_xticklabels(DOE.res.params.index[0:],
                                rotation='vertical')
             # NOTE(review): cmin is lower bound minus estimate here, the
             # opposite sign of the intercept branch above - confirm.
             cmin = DOE.res.conf_int().ix[0:, 0] - DOE.res.params[0:]
             cmax = DOE.res.conf_int().ix[0:, 1] - DOE.res.params[0:]
             ax.errorbar(ind,
                         DOE.res.params[0:],
                         yerr=[cmin.values, cmax.values],
                         fmt='o',
                         ecolor='green')
         ax.set_xticks(ind)
         ax.set_xlabel('Coefficient Number (except Intercept)')
         ax.annotate('red bar: significance 5%',
                     xy=(0.75, 0.95),
                     xycoords='figure fraction',
                     fontsize=8)
     elif self.coefpredradioButton.isChecked():
         # bars with the leave-one-out based errors computed above
         if DOE.res.params.index[0] == 'Intercept':
             ind = np.arange(1, len(DOE.res.params))
             vcol = []
             for i in ind:
                 if (pt[i] < 0.05): vcol.append('red')
                 else: vcol.append('blue')
             ax.bar(ind, DOE.res.params[1:], align='center', color=vcol)
             ax.set_title(
                 'Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.
                 format(DOE.res.params[0] - tres[0] * bres[0] / np.sqrt(nr),
                        DOE.res.params[0], DOE.res.params[0] +
                        tres[0] * bres[0] / np.sqrt(nr)))
             ax.set_xticklabels(DOE.res.params.index[1:],
                                rotation='vertical')
             ax.errorbar(ind,
                         DOE.res.params[1:],
                         yerr=tres[1:] * bres[1:] / np.sqrt(nr),
                         fmt='o',
                         ecolor='green')
         else:
             ind = np.arange(1, len(DOE.res.params) + 1)
             ax.bar(ind, DOE.res.params, align='center')
             ax.set_title('Coefficient Value : None Intercept')
             ax.set_xticklabels(DOE.res.params.index[0:],
                                rotation='vertical')
             ax.errorbar(ind,
                         DOE.res.params[0:],
                         yerr=tres[0:] * bres[0:] / np.sqrt(nr),
                         fmt='o',
                         ecolor='green')
         ax.set_xticks(ind)
         ax.set_xlabel('Coefficient Number (except Intercept)')
         ax.annotate('red bar: significance 5%',
                     xy=(0.75, 0.95),
                     xycoords='figure fraction',
                     fontsize=8)
     elif self.fitradioButton.isChecked():
         yf = DOE.res.fittedvalues.tolist()
         resid = DOE.res.resid.tolist()
         ax.scatter(y, yf, color='red', alpha=0.3, marker='o')
         ax.set_ylabel('Fitted Values', color='red')
         ax.tick_params('y', colors='red')
         ax1 = ax.twinx()
         ax1.scatter(y, resid, color='blue', alpha=0.3, marker='o')
         ax1.set_ylabel('Residuals', color='blue')
         ax1.tick_params('y', colors='blue')
         xmin, xmax = ax.get_xlim()
         ax.set_ylim([xmin, xmax])
         df = DOE.res.df_resid
         vares = np.sum(DOE.res.resid**2) / df
         rmsef = np.sqrt(vares)
         vary = np.var(y.values)
         evar = (1 - vares / vary) * 100
         ax.set_title(
             'df {:3.0f};   RMSEF {:6.2f};   Exp.Var.{:5.1f}%'.format(
                 df, rmsef, evar))
         ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
         ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
         ax.set_xlabel('Measured Values')
         if self.VcheckBox.isChecked():
             Lr = DOE.res.model.data.row_labels
             for i, txt in enumerate(Lr):
                 ax.annotate(str(txt), (y.ix[i], yf[i]))
     elif self.predradioButton.isChecked():
         ax.scatter(y, ypcv, color='red', alpha=0.3, marker='o')
         ax.set_ylabel('CV Predicted Values', color='red')
         ax.tick_params('y', colors='red')
         ax1 = ax.twinx()
         ax1.scatter(y, rcv, color='blue', alpha=0.3, marker='o')
         ax1.set_ylabel('CV Residuals', color='blue')
         ax1.tick_params('y', colors='blue')
         xmin, xmax = ax.get_xlim()
         ax.set_ylim([xmin, xmax])
         ax.set_xlabel('Measured Values')
         df = DS.Raw.shape[0]
         varcv = np.sum(np.array(rcv)**2) / df
         rmsecv = np.sqrt(varcv)
         vary = np.var(y.values)
         evar = (1 - varcv / vary) * 100
         ax.set_title(
             'df {:3.0f};   RMSECV {:6.2f};   Exp.Var.{:5.1f}%'.format(
                 df, rmsecv, evar))
         ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
         ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
         if self.VcheckBox.isChecked():
             Lr = DOE.res.model.data.row_labels
             for i, txt in enumerate(Lr):
                 ax.annotate(str(txt), (y.ix[i], ypcv[i]))
     elif self.levradioButton.isChecked():
         Ftable = surtabDlg.launch(None)
         if len(np.shape(Ftable)) == 0: return ()
         if np.argmax(Ftable['X axis'].values) == np.argmax(
                 Ftable['Y axis'].values):
             QtWidgets.QMessageBox.critical(self,'Error',"Two variables on the same axis",\
                                            QtWidgets.QMessageBox.Ok)
             return ()
         fig = plt.figure()
         ax = fig.add_subplot(111)
         npts = 20
         xname = Ftable[(Ftable['X axis'] == True).values].index[0]
         yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
         cname = Ftable[(Ftable['Constant'] == True).values].index.tolist()
         cvalue = Ftable.loc[(Ftable['Constant'] == True).values, 'value']
         zname = Yname
         x = np.linspace(float(Ftable['min'][xname]),
                         float(Ftable['max'][xname]), npts)
         y = np.linspace(float(Ftable['min'][yname]),
                         float(Ftable['max'][yname]), npts)
         px = []
         py = []
         for i in range(npts):
             for j in range(npts):
                 px.append(x[i])
                 py.append(y[j])
         data = pd.DataFrame({xname: px, yname: py, zname: px})
         xtitle = ''
         for i in range(len(cname)):
             xtitle = xtitle + cname[i] + ' = ' + str(
                 cvalue.values.tolist()[i])
             data[cname[i]] = np.ones(npts**2) * float(cvalue[i])
         my, mx = dmatrices(desc, data, return_type='dataframe')
         pz = np.diag(np.dot(np.dot(mx, dism), mx.T))
         px = np.array(px)
         py = np.array(py)
         pz = np.array(pz)
         z = plt.mlab.griddata(px, py, pz, x, y, interp='linear')
         plt.contour(x, y, z, 15, linewidths=0.5, colors='k')
         plt.contourf(x, y, z, 15, cmap=plt.cm.rainbow)
         plt.colorbar()
         ax.set_xlabel(xname)
         ax.set_ylabel(yname)
         ax.set_title(xtitle)
         ax.set_xlim([px.min(), px.max()])
         ax.set_ylim([py.min(), py.max()])
     elif self.surradioButton.isChecked():
         Ftable = surtabDlg.launch(None)
         if len(np.shape(Ftable)) == 0: return ()
         if np.argmax(Ftable['X axis'].values) == np.argmax(
                 Ftable['Y axis'].values):
             QtWidgets.QMessageBox.critical(self,'Error',"Two variables on the same axis",\
                                            QtWidgets.QMessageBox.Ok)
             return ()
         fig = plt.figure()
         ax = fig.add_subplot(111)
         npts = 100
         xname = Ftable[(Ftable['X axis'] == True).values].index[0]
         yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
         cname = Ftable[(Ftable['Constant'] == True).values].index.tolist()
         cvalue = Ftable.loc[(Ftable['Constant'] == True).values, 'value']
         zname = Yname
         x = np.linspace(float(Ftable['min'][xname]),
                         float(Ftable['max'][xname]), npts)
         y = np.linspace(float(Ftable['min'][yname]),
                         float(Ftable['max'][yname]), npts)
         px = []
         py = []
         for i in range(npts):
             for j in range(npts):
                 px.append(x[i])
                 py.append(y[j])
         data = pd.DataFrame({xname: px, yname: py, zname: px})
         xtitle = ''
         for i in range(len(cname)):
             xtitle = xtitle + cname[i] + ' = ' + str(
                 cvalue.values.tolist()[i])
             data[cname[i]] = np.ones(npts**2) * float(cvalue[i])
         my, mx = dmatrices(desc, data, return_type='dataframe')
         pz = DOE.res.predict(mx)
         px = np.array(px)
         py = np.array(py)
         pz = np.array(pz)
         z = plt.mlab.griddata(px, py, pz, x, y, interp='linear')
         plt.contour(x, y, z, 15, linewidths=0.5, colors='k')
         plt.contourf(x, y, z, 15, cmap=plt.cm.rainbow)
         plt.colorbar()
         ax.set_xlabel(xname)
         ax.set_ylabel(yname)
         ax.set_title(xtitle)
         ax.set_xlim([px.min(), px.max()])
         ax.set_ylim([py.min(), py.max()])
     elif self.dismradioButton.isChecked():
         fig = plt.figure()
         ax = fig.add_subplot(111)
         cax = ax.matshow(dism)
         fig.colorbar(cax)
         ax.set_title('Trace = {:10.4f}'.format(np.trace(dism)))
     elif self.inflradioButton.isChecked():
         mxc = preprocessing.scale(mx.values,
                                   with_mean=True,
                                   with_std=False)
         mxc2 = mxc**2
         infl = np.sum(mxc2, axis=0) * np.diag(dism)
         fig = plt.figure()
         ax = fig.add_subplot(111)
         cax = ax.matshow(infl.reshape(1, -1), cmap='gray_r')
         fig.colorbar(cax)
         ax.yaxis.grid(False)
         ax.tick_params(axis='y',
                        which='both',
                        left='off',
                        right='off',
                        labelleft='off')
         ax.set_xlabel('Inlaction Factor')
     if self.XcheckBox.isChecked():
         if self.XlineEdit.text():
             ax.set_xlabel(self.XlineEdit.text())
     else:
         ax.set_xlabel('')
     if self.YcheckBox.isChecked():
         if self.YlineEdit.text():
             ax.set_ylabel(self.YlineEdit.text())
     else:
         ax.set_ylabel('')
     if self.XGcheckBox.isChecked():
         ax.xaxis.grid(True)
     else:
         ax.xaxis.grid(False)
     if self.YGcheckBox.isChecked():
         ax.yaxis.grid(True)
     else:
         ax.yaxis.grid(False)
     if not self.XMcheckBox.isChecked():
         ax.tick_params(axis='x',
                        which='both',
                        bottom='off',
                        top='off',
                        labelbottom='off')
     if not self.YMcheckBox.isChecked():
         ax.tick_params(axis='y',
                        which='both',
                        left='off',
                        right='off',
                        labelleft='off')
     self.rmmpl()
     self.addmpl(fig)