Example #1
import statsmodels.formula.api as smf


def VariableMiningMnlogit(df, y):
    """Searches the columns of df, fitting a one-variable multinomial logistic
    regression for each, to find variables that predict the dependent variable y.

    Args:
        df (DataFrame): DataFrame that holds all the variables.
        y (string): Column name of the dependent variable y.

    Returns:
        variables (list): A list of (pseudo R-squared, variable name) tuples.
    """
    variables = []
    for name in df.columns:
        try:
            # skip (near-)constant variables
            if df[name].var() < 1e-7:
                continue

            formula = '{} ~ '.format(y) + name
            model = smf.mnlogit(formula, data=df)
            nobs = len(model.endog)
            # skip variables that are missing for more than half the rows
            if nobs < len(df) / 2:
                continue

            results = model.fit()
        except Exception:
            # skip variables that cannot be parsed or fit
            continue

        variables.append((results.prsquared, name))

    return variables
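
A minimal usage sketch, assuming df is a pandas DataFrame that contains a categorical target column (hypothetically named 'target' here):

variables = VariableMiningMnlogit(df, 'target')   # 'target' is a hypothetical column name
variables.sort(reverse=True)                      # highest pseudo R-squared first
for prsquared, name in variables[:10]:
    print('%0.3f  %s' % (prsquared, name))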
Example #2
    def fit(self, data: List[pd.DataFrame], fit_method: str = 'ncg', disp: int = 0) -> None:
        # fit one multinomial logit per timestep; 'A' is the outcome column
        num_timesteps = len(data)
        self.models = list()
        for t in range(num_timesteps):
            model = smf.mnlogit('A ~ ' + self.formula,
                                data=data[t].astype({'A': int}))
            model = model.fit(method=fit_method, disp=disp)
            self.models.append(model)
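
The surrounding class is not shown in this excerpt, so usage can only be sketched under assumptions: an instance whose formula attribute holds the right-hand side of the model, fit over a list of per-timestep DataFrames that each contain the outcome column 'A'.

# hypothetical usage; the class definition is not part of this example
# estimator.formula = 'X1 + X2'      # RHS only; 'A ~ ' is prepended inside fit()
# estimator.fit(frames)              # frames: List[pd.DataFrame], one per timestep
# probs_t0 = estimator.models[0].predict(frames[0])   # per-class probabilities at t=0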
Example #3
    def __init__(self, formula=None, data=None, weights=None, **kwargs):

        if formula:
            # keep the patsy design info so that new data can be encoded
            # with exactly the same dummy coding and column order later
            y, X = patsy.dmatrices(formula, data, 1)
            self._y_design_info = y.design_info
            self._X_design_info = X.design_info
            self._model = mnlogit(formula, data, **kwargs)
            self._fit = self._model.fit(maxiter=10000)
            self._betas = self._fit.params
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
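
Storing the patsy design info is what makes later out-of-sample prediction reproducible: new rows can be encoded with exactly the training dummy coding and column order. A minimal sketch of how a predict method might use it, assuming a fitted instance m and a DataFrame new_data with the original predictor columns:

import numpy as np
import patsy

# hypothetical predict helper; m is a fitted instance of the class above
(X_new,) = patsy.build_design_matrices([m._X_design_info], new_data)
probs = m._fit.predict(np.asarray(X_new))   # one probability column per class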
Example #4
    def __init__(self, formula=None, data=None, weights=None, **kwargs):

        # Convert any variable raised to a power to float64. This guards
        # against overflow (and the resulting mis-specified probabilities)
        # when the original variable was stored as a narrower integer/float.
        if isinstance(data, pd.DataFrame):
            power_vars = list(set(re.findall(r'(?<=power\().+?(?=,)',
                                             formula)))
            for var in power_vars:
                data[var] = data[var].astype('float64')

        if formula:
            y, X = patsy.dmatrices(formula, data, 1)
            self._y_design_info = y.design_info
            self._X_design_info = X.design_info
            self._model = mnlogit(formula, data, **kwargs)
            self._fit = self._model.fit(maxiter=10000)
            self._betas = self._fit.params
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
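
The lookbehind/lookahead pattern used above pulls the variable name out of each power(var, k) term in the formula. A quick illustration with a hypothetical formula:

import re

formula = 'y ~ power(x1, 2) + power(x2, 3) + z'           # illustrative formula
print(set(re.findall(r'(?<=power\().+?(?=,)', formula)))  # {'x1', 'x2'}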
Example #5
def Forward_Select(data, response, modeltype, metric):
    """Select variables by forward selection, then build a statsmodels model.

    Parameters:
    -----------
    data : pandas DataFrame with all candidate predictors and the response
    response : string, name of the response column in data
    modeltype : string, either 'Regression' or 'Classification'
    metric : the criterion whose improvement drives selection. Must be an
          attribute of the fitted statsmodels results (e.g. 'rsquared_adj',
          'aic', 'bic').

    Returns:
    --------
    model : an "optimal" fitted statsmodels model with an intercept,
            selected by forward selection and evaluated by the chosen metric.
    selected : list of the variables selected by this algorithm.
    """
    ############################################################################
    #### CAUTION: on scipy 1.0 the line below is required. It is a workaround
    #### for a scipy 1.0 regression: stats.chisqprob was removed, and without
    #### it glm.summary() raises an error.
    stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
    #### On scipy 0.19 or older, you can comment out the line above.
    ############################################################################
    start_time = time.time()
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    maxiter = 1000
    if metric == 'rsquared' or metric == 'rsquared_adj':
        current_score, best_new_score = 0.0, 0.0
    else:
        current_score, best_new_score = np.inf, np.inf
    iterations = 1
    if data[response].dtype == object:
        response_char = 'C(' + response + ')'
        data[response], factors = factorize_class(data[response])
    else:
        response_char = response
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            #print('Variable considered: %s' %candidate)
            if data[candidate].dtype == object:
                ### Categorical variable: wrap it in C().
                ### smf formulas add the intercept automatically; no '+ 1' needed.
                if not selected:
                    formula = "{} ~ {}".format(response_char,
                                               'C('+candidate+')')
                else:
                    formula = "{} ~ {} + {}".format(response_char,
                                            ' + '.join(selected), 'C('+candidate+')')
            else:
                formula = "{} ~ {}".format(response_char,
                                           ' + '.join(selected + [candidate]))
            if modeltype == 'Regression':
                model = smf.ols(formula, data).fit()
            else:
                if len(data[response].value_counts()) > 2:
                    try:
                        model = smf.mnlogit(formula=formula, data=data).fit(maxiter=maxiter, disp=0)
                    except Exception:
                        model = smf.glm(formula=formula, data=data, family=sm.families.Binomial()).fit(
                                        maxiter=maxiter, disp=0)
                else:
                    try:
                        model = smf.logit(formula=formula, data=data).fit(maxiter=maxiter, disp=0)
                    except Exception:
                        model = smf.glm(formula=formula, data=data, family=sm.families.Binomial()).fit(
                                                    maxiter=maxiter, disp=0)
            try:
                score = getattr(model, metric)
            except AttributeError:
                metric = 'aic'
                print('Metric not recognized. Choosing default = %s' %metric)
                score = getattr(model, metric)
            iterations += 1
            scores_with_candidates.append((score, candidate))
        if metric == 'rsquared' or metric == 'rsquared_adj':
            scores_with_candidates.sort(reverse=False)   # pop() -> highest score
        else:
            scores_with_candidates.sort(reverse=True)    # pop() -> lowest AIC/BIC
        best_new_score, best_candidate = scores_with_candidates.pop()
        if metric == 'rsquared' or metric == 'rsquared_adj':
            if current_score < best_new_score:
                remaining.remove(best_candidate)
                selected.append(best_candidate)
                current_score = best_new_score
        else:
            if current_score > best_new_score:
                remaining.remove(best_candidate)
                selected.append(best_candidate)
                current_score = best_new_score
    tempform = []
    print('Time taken for %d iterations (minutes): %0.2f' %(iterations, (time.time()-start_time)/60))
    for eachcol in selected:
        if tempform == []:
            if data[eachcol].dtype == object:
                ### If it is a categorical variable, encode it this way
                tempform = 'C('+eachcol+')'
            else:
                tempform = eachcol
        else:
            if data[eachcol].dtype == object:
                ### If it is a categorical variable, encode it this way
                tempform = "{} + {}".format(tempform, 'C('+eachcol+')')
            else:
                tempform = "{} + {}".format(tempform, eachcol)
    ### when all is done, put the formula together ####
    formula = "{} ~ {} ".format(response_char, tempform)
    if modeltype == 'Regression':
        model = smf.ols(formula, data).fit()
    else:
        if len(data[response].value_counts()) > 2:
            try:
                model = smf.mnlogit(formula=formula, data=data).fit(maxiter=maxiter, disp=0)
            except Exception:
                model = smf.glm(formula=formula, data=data, family=sm.families.Binomial()).fit(
                                        maxiter=maxiter, disp=0)
        else:
            try:
                model = smf.logit(formula, data).fit(maxiter=maxiter, disp=0)
            except Exception:
                model = smf.glm(formula=formula, data=data, family=sm.families.Binomial()).fit(
                                        maxiter=maxiter, disp=0)
    print('Score = %0.2f, Number Selected = %d\nmodel formula: %s' %(current_score,
                                    len(selected), formula))
    print('Time taken for Final Model (minutes): %0.2f' %((time.time()-start_time)/60))
    print(model.summary())
    return model, selected
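
A hedged usage sketch (the column names are illustrative; the function also assumes pandas, numpy, time, scipy.stats, statsmodels.api as sm, statsmodels.formula.api as smf and, for object-typed responses, a factorize_class helper are in scope):

# hypothetical call: regression with adjusted R-squared as the selection criterion
model, selected = Forward_Select(df, response='price',
                                 modeltype='Regression', metric='rsquared_adj')
print(selected)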
Example #6
    ## read the variables
    relevant = MiningReport(variables, n=60)

    ## make model with relevant predictive variables
    model = smf.poisson('numbabes ~ ager + educat + C(race) + totincr',
                        data=join)
    results = model.fit()
    print(results.summary())

    ## predict numbabes
    columns = ['ager', 'race', 'educat', 'totincr']
    new = pd.DataFrame([[35, 1, 16, 14]], columns=columns)
    predict_babes = results.predict(new)
    print('predict_babes:\n', predict_babes)

    ## predict married/divorced
    model = smf.mnlogit('rmarital ~ ager + C(race) + totincr + educat',
                        data=join)
    results = model.fit()
    print(results.summary())

    ## test individual
    new = pd.DataFrame({
        'ager': [25],
        'race': [2],
        'totincr': [11],
        'educat': [12]
    })
    predict = results.predict(new)
    print('predict:\n', predict)
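
For mnlogit, results.predict returns one probability column per outcome category, so the most likely rmarital code can be read off directly; a small sketch under that assumption:

# each row of predict holds per-category probabilities (they sum to 1)
print(predict.idxmax(axis=1))   # most probable rmarital category per row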
Example #7
    def _estimate_gstar_(self, pooled_data, data_to_predict, distribution):
        # Treatment of individual
        f = sm.families.family.Binomial()
        treat_i_model = smf.glm(self._gi_model, pooled_data, family=f).fit()
        pred = treat_i_model.predict(data_to_predict)
        pr_i = np.where(data_to_predict[self.exposure] == 1, pred, 1 - pred)
        if self._verbose_:
            print('==============================================================================')
            print('gstar-model: A')
            print(treat_i_model.summary())

        # Treatment of direct contacts
        if distribution is None:
            f = sm.families.family.Binomial()
            cond_vars = patsy.dmatrix(self._gs_model, pooled_data, return_type='matrix')
            pred_vars = patsy.dmatrix(self._gs_model, data_to_predict, return_type='matrix')
            pr_s = np.array([1.] * data_to_predict.shape[0])

            for c in self._nonparam_cols_:
                # Estimating probability
                treat_s_model = sm.GLM(pooled_data[c], cond_vars, family=f).fit()
                pred = treat_s_model.predict(pred_vars)
                pr_s *= np.where(data_to_predict[c] == 1, pred, 1 - pred)
                if self._verbose_:
                    print('==============================================================================')
                    print('gstar-model: ' + c)
                    print(treat_s_model.summary())

                # Stacking vector to the end of the array
                cond_vars = np.c_[cond_vars, np.array(pooled_data[c])]
                pred_vars = np.c_[pred_vars, np.array(data_to_predict[c])]

        elif distribution == 'normal':
            gs_model = self._gs_measure_ + ' ~ ' + self._gs_model
            treat_s_model = smf.ols(gs_model, pooled_data).fit()
            pred = treat_s_model.predict(data_to_predict)
            pr_s = norm.pdf(data_to_predict[self._gs_measure_], pred, np.sqrt(treat_s_model.mse_resid))
            if self._verbose_:
                print('==============================================================================')
                print('gstar-model: '+self._gs_measure_)
                print(treat_s_model.summary())

        elif distribution == 'poisson':
            f = sm.families.family.Poisson()
            gs_model = self._gs_measure_ + ' ~ ' + self._gs_model
            treat_s_model = smf.glm(gs_model, pooled_data, family=f).fit()
            pred = treat_s_model.predict(data_to_predict)
            pr_s = poisson.pmf(data_to_predict[self._gs_measure_], pred)
            if self._verbose_:
                print('==============================================================================')
                print('gstar-model: '+self._gs_measure_)
                print(treat_s_model.summary())

        elif distribution == 'multinomial':
            gs_model = self._gs_measure_ + ' ~ ' + self._gs_model
            treat_s_model = smf.mnlogit(gs_model, pooled_data).fit(disp=False)
            pred = treat_s_model.predict(data_to_predict)
            values = pd.get_dummies(data_to_predict[self._gs_measure_])
            pr_s = np.array([0.] * data_to_predict.shape[0])   # float accumulator
            for i in data_to_predict[self._gs_measure_].unique():
                pr_s += pred[i] * values[i]

            if self._verbose_:
                print('==============================================================================')
                print('gstar-model: '+self._gs_measure_)
                print(treat_s_model.summary())

        elif distribution == 'threshold':
            f = sm.families.family.Binomial()
            gs_model = self._gs_measure_ + ' ~ ' + self._gs_model
            treat_s_model = smf.glm(gs_model, pooled_data, family=f).fit()
            pred = treat_s_model.predict(data_to_predict)
            pr_s = np.where(data_to_predict[self._gs_measure_] == 1, pred, 1 - pred)
            if self._verbose_:
                print('==============================================================================')
                print('gstar-model: '+self._gs_measure_)
                print(treat_s_model.summary())

        else:
            raise ValueError("Invalid distribution choice")

        # Creating estimated probability
        return pr_i * pr_s
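
The multinomial branch above recovers, for each row, the predicted probability of the outcome actually observed by multiplying the per-category prediction columns with a one-hot encoding. A standalone sketch of the same pattern, assuming a fitted smf.mnlogit result res, a frame new_df, and matching column labels between the predictions and the dummies:

import pandas as pd

# hypothetical: probability assigned to each row's observed outcome
pred = res.predict(new_df)              # one probability column per category
onehot = pd.get_dummies(new_df['y'])    # 'y' is a hypothetical outcome column
pr_obs = (pred * onehot).sum(axis=1).to_numpy()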
Example #8
    df['dyslexia'] = df['{{dyslexia}}']

    # make dyslexia a categorical variable
    df['dyslexia'] = df['dyslexia'].astype('category')

    # remove trials based on comprehension < 2/3
    # --- (D1)
    # just remove trials
    df = df[df.correct_rate > 0.6]

    # --- (D2)
    # drop entire participants
    bad_uuid = set()
    for i, row in df.iterrows():
        if row.correct_rate < 0.6:
            bad_uuid.add(str(row.uuid))
    df = df[~df.uuid.isin(bad_uuid)]

    # --- (F1)
    # fit a linear mixed effects model
    fml = '{{formula}}'
    model = smf.mixedlm(fml, df, groups=df.uuid).fit()
    print(model.summary())

    # --- (F2)
    # fit a multinomial logit model to accuracy
    df['acc'] = 3 - pd.Categorical(df.correct_rate).codes
    fml = 'acc ~ page_condition*dyslexia_bin'
    model = smf.mnlogit(fml, df).fit()
    print(model.summary())
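
The (D2) participant filter above can also be written without iterrows, which may be preferable on large frames; a sketch assuming the same columns:

# equivalent vectorized form of (D2): keep only uuids whose trials all pass
good = df.groupby('uuid')['correct_rate'].min() >= 0.6
df = df[df['uuid'].isin(good[good].index)]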
Example #9
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
from statsmodels.miscmodels.ordinal_model import OrderedModel
from pandas.api.types import CategoricalDtype

data = pd.read_csv(r"D:/书籍资料整理/属性数据分析/钝吻鳄例子.csv")

# 1. Baseline-category logit
data['食物选择'] = data['食物选择'].replace({'I': 3, 'F': 2, 'O': 1})
mdl = smf.mnlogit(formula='食物选择~长度', data=data)
result = mdl.fit()
# Unlike the book, statsmodels takes the first (lowest) numeric value as the
# baseline category. Since the response is nominal, order does not matter, so
# coding O=1 reproduces the book's results.
# The two fitted equations are: invertebrates vs. other, and fish vs. other.
# To compare invertebrates with fish, subtract equation (2) from equation (1).
print(result.summary())

# 2. Cumulative logit model for an ordinal response
data = pd.read_csv(r"D:/书籍资料整理/属性数据分析/政治意识与党派.csv")
# data['意识形态']=data['意识形态'].replace({'很自由':1,'有点自由':2,'中等':3,'有点保守':4,'很保守':5})
data['政治党派'] = data['政治党派'].replace({'民主党人': 1, '共和党人': 0})
# DataFrame.append was removed in pandas 2.0; build the expanded frame from a
# list of replicated rows instead ('值' is the count column)
tmp_rows = []
for i in range(0, 20):
    tmp_rows.extend([data.loc[i]] * data.iloc[i]['值'])
tmp = pd.DataFrame(tmp_rows)
tmp = tmp.reset_index()
del tmp['值']
del tmp['index']
# tmp.to_csv(r'D:/书籍资料整理/属性数据分析/政治意识与党派_整理数据.csv')
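
OrderedModel is imported above but never fit in this excerpt; a hedged sketch of the cumulative-logit fit this section sets up, assuming '意识形态' (ideology) is first mapped to ordered codes 1..5 as in the commented line above:

# illustrative cumulative (proportional-odds) logit fit
tmp['意识形态'] = tmp['意识形态'].replace(
    {'很自由': 1, '有点自由': 2, '中等': 3, '有点保守': 4, '很保守': 5})
mod = OrderedModel(tmp['意识形态'], tmp[['政治党派']], distr='logit')
res = mod.fit(method='bfgs', disp=False)
print(res.summary())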
Example #10
File: script.py  Project: uwdata/boba
    # wrangle education level
    edu_order = [
        'pre-high school', 'high school', 'professional school', 'college',
        'graduate school', 'PhD', 'postdoctoral'
    ]
    tp = pd.CategoricalDtype(categories=edu_order, ordered=True)
    df['edu_level'] = df.education.astype(tp).cat.codes

    # check correlation between IVs
    ivs = df[['img_width', 'num_words', 'page_condition', 'age']]
    print(ivs.corr(), '\n')
    print(pd.crosstab(df.english_native, df.dyslexia, normalize='columns'),
          '\n')
    print(pd.crosstab(df.device, df.dyslexia, normalize='columns'), '\n')

    # fit a multinomial logit model to accuracy
    df['acc'] = 3 - pd.Categorical(df.correct_rate).codes
    print(df.groupby('acc').size(), '\n')
    fml = 'acc ~ page_condition*dyslexia_bin'
    model = smf.mnlogit(fml, df).fit()   # mnlogit takes no 'groups' argument
    print(model.summary(), '\n')

    # remove trials based on comprehension < 2/3
    df = df[df.correct_rate > 0.6]

    # fit a linear mixed effects model
    fml = 'log_speed ~ img_width + num_words + page_condition*dyslexia' \
          '+ age + english_native'
    model = smf.mixedlm(fml, df, groups=df.uuid).fit()
    print(model.summary())
Example #11
chat_up['Successx'] = chat_up['Success'].replace({
    'Get Phone Number': 1,
    'Go Home with Person': 2,
    'No response/Walk Off': 0
})
chat_up['Genderx'] = chat_up['Gender'].replace({'Male': 0, 'Female': 1})

chat_up['Gen_Funny'] = chat_up['Genderx'] * chat_up['Funny']
chat_up['Gen_Sex'] = chat_up['Genderx'] * chat_up['Sex']

# In[26]:

import numpy as np
import statsmodels.formula.api as smf
ml01 = smf.mnlogit(
    'Successx ~ Funny + Sex + Good_Mate + Genderx + Gen_Funny + Gen_Sex',
    chat_up).fit()
print(ml01.summary())

# In[27]:

print(np.exp(ml01.params))

# ## Checking Assumptions

# ### Assumptions of Multicollinearity

# In[29]:

from statsmodels.stats.outliers_influence import variance_inflation_factor
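
A short sketch completing the VIF check, assuming the same main-effect predictors that went into ml01 (interaction terms omitted from the collinearity check):

import pandas as pd
import statsmodels.api as sm

# one VIF per predictor column, including the constant
X = sm.add_constant(chat_up[['Funny', 'Sex', 'Good_Mate', 'Genderx']])
vif = pd.Series([variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
                index=X.columns)
print(vif)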