Пример #1
0
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from statsmodels.api import Logit

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
Train, Test = train_test_split(FullRaw, test_size=0.3, random_state=123)
# Work on an explicit copy: prediction columns are added to Test below, and
# assigning into the object returned by the split can raise pandas'
# SettingWithCopyWarning.
Test = Test.copy()

# 'smoker' is the target; every remaining column is used as a predictor.
Train_X = Train.drop(['smoker'], axis=1)
Train_Y = Train['smoker'].copy()
Test_X = Test.drop(['smoker'], axis=1)
Test_Y = Test['smoker'].copy()

# NOTE(review): no constant column is added here, so the model only has an
# intercept if FullRaw already carries one - confirm.
M1_Model = Logit(Train_Y, Train_X).fit()
M1_Model.summary()

# Predicted probabilities for the held-out rows.
Test_pred = M1_Model.predict(Test_X)

Test['Test_prob'] = Test_pred
# Classify as 1 when the predicted probability is strictly above 0.5.
Test['Test_Class'] = np.where(Test['Test_prob'] > 0.5, 1, 0)

# scikit-learn metrics take (y_true, y_pred): rows of the matrix are the
# actual classes, columns the predicted ones.
Con_Mat = confusion_matrix(Test_Y, Test['Test_Class'])
# Accuracy in percent (diagonal = correctly classified rows).
sum(np.diag(Con_Mat)) / Test_Y.shape[0] * 100

# ROC metrics are computed from the true labels and the predicted
# probabilities; thresholded labels would collapse the curve to one point.
# The variable names are kept for compatibility: ROC holds the AUC value and
# AUC holds the (fpr, tpr, thresholds) tuple returned by roc_curve.
ROC = roc_auc_score(Test_Y, Test['Test_prob'])
AUC = roc_curve(Test_Y, Test['Test_prob'])
Пример #2
0
print("\n*** Model ***")
# append the intercept column explicitly before fitting
dfX_train_const = add_constant(dfX_train)
# fit the logistic regression on the training design matrix
model = Logit(y_train, dfX_train_const).fit()
# report the fitted model
print(model.summary())
print("Done ...")

# ---------------------------------------------------------------
# Classification - predictions on the training data
# (evaluation: accuracy and confusion metrics)
# ---------------------------------------------------------------

# predicted probabilities for the training rows
prob_train = model.predict(dfX_train_const)
# order them ascending so the plot shows the shape of the distribution
sorted_train = sorted(prob_train.values)
index_train = np.arange(len(sorted_train))

# scatter of the sorted probabilities (no regression line is drawn)
plt.figure()
sns.regplot(x=index_train, y=sorted_train, color='b', fit_reg=False,
            scatter_kws=dict(s=5))
plt.xlabel('(sorted by output value)')
plt.ylabel('Probability of Logit function')
plt.title('Train Data: Probability Distribution')
plt.show()
# Remove the predictors that are not used by the models below (in place).
Test_x.drop(columns=['Loan_Amount_Term', 'Self_Employed', 'Gender'], inplace=True)

Train_x.shape
Test_x.shape

from statsmodels.api import Logit
# First model: every remaining training column as a predictor.
Model1 = Logit(Train_y, Train_x).fit()
Model1.summary()

# Second model: same as the first, without the two columns listed here.
col_names = ['ApplicantIncome', 'Dependents']
Model2 = Logit(Train_y, Train_x.drop(columns=col_names)).fit()
Model2.summary()

# Keep the test matrix in line with the columns Model2 was fitted on.
Test_x.drop(columns=col_names, inplace=True)

# Predicted probabilities are stored next to the predictors.
Test_x['Predit'] = Model2.predict(Test_x)
Test_x.columns
Test_x['Predit'].head(6)

import numpy as np
# Class 1 when the predicted probability is at least 0.5, otherwise class 0.
Test_x['Test_class'] = (Test_x['Predit'] >= 0.5).astype(int)

import pandas as pd
# Rows: predicted class, columns: actual class.
confusion_matrix = pd.crosstab(Test_x['Test_class'], Test_y)
confusion_matrix

# Share of rows on the diagonal of the table, in percent.
accuracy = np.diagonal(confusion_matrix).sum() / len(Test_x) * 100
accuracy #82.70

from sklearn.metrics import f1_score, precision_score, recall_score
f1_score(Test_y, Test_x['Test_class'])
Пример #4
0
class PropensityScore:
    """
    Build a propensity score model by stepwise covariate selection.

    First-order covariates are added one at a time for as long as the best
    remaining candidate improves twice the log-likelihood by more than
    ``cutoff_ord1``. Squares and pairwise interactions of all covariates are
    then screened the same way against ``cutoff_ord2``. The final logit model
    provides the propensity score, its log-odds, a trimming range and strata.

    Parameters
    ----------
    outcome : str
        This should be the name of the binary variable to predict.
    test_vars : list
        A list of the variables to test.
    df : DataFrame
        The pandas DataFrame that contains all of the data.
    init_vars : str or list, optional
        Variables to always have included in the propensity score. The default is None.
    add_cons : Boolean, optional
        Select this to add a constant to model. The default is True.
    disp : Boolean, optional
        Display the final model including dropped variables. The default is True.
    cutoff_ord1 : Numeric, optional
        The log gain cutoff for first order covariates. The default is 1.
    cutoff_ord2 : Numeric, optional
        The log gain cutoff for second order covariates. The default is 2.71.
    t_strata : Numeric, optional
        The cutoff for the t-statistic for the calculated strata. The default is 1.
    n_min : {'n_min_strata':int1,'n_min_tc':int2} or 'auto'
        The minimum number of units in each strata or treated/control individuals in strata.
        The default is 'auto' in which case the number per strata is the number of covariates
        tested in the propensity score (just linear ones) + 2 (or K+2)
        while the minimum number of treated and control individuals per strata is 3.
        If not auto, the input needs to be a dictionary that explicitly specifies:
        {'n_min_strata':int1,'n_min_tc':int2}

    Raises
    ------
    ValueError
        If the inputs are improperly defined (wrong types, reserved column
        names, or a covariate that only takes one value).

    Attributes
    ----------
    data : DataFrame
        A new frame holding the outcome, the covariates and the generated
        second order terms (plus ``_cons`` when ``add_cons`` is set).
    dropped_vars : list
        The variables that did not make the cut for singularity reasons.
    model : sm.Logit.fit() model
        This is the raw model on the final set of variables from Statsmodels
    propscore : Series
        This is the propensity score as calculated by model.predict().
        This may not match dimension of data due to dropped missing values,
        but index will align properly.
    strata : Series
        The calculated strata. Missing propensity scores and values outside of
        min of treated group or max of control group are coded as NaN.
    logodds : Series
        The linearized propensity score. Will be the same dimension as propscore.
    test_vars_ord2 : list
        The full list of tested second order variables for reference.
    trim_range : tuple
        The result of calculating the optimal trim min and max propensity score values.
    in_trim : Series (True/False)
        An array where True means that the propensity score falls within the
        trim min/max range.
    """
    def __init__(self,
                 outcome,
                 test_vars,
                 df,
                 init_vars=None,
                 add_cons=True,
                 disp=True,
                 cutoff_ord1=1,
                 cutoff_ord2=2.71,
                 t_strata=1,
                 n_min='auto'):

        # double checking some inputs
        if not isinstance(outcome, str):
            raise ValueError(
                'y must be a string variable name in the DataFrame.')
        if not isinstance(test_vars, list):
            raise ValueError('X must be a list of covariates to test.')

        self.outcome = outcome
        self.test_vars = test_vars
        self.add_cons = add_cons
        self.init_vars = init_vars

        # full list of first order covariates: forced-in ones first
        if init_vars and isinstance(init_vars, str):
            covs = [init_vars] + test_vars
        elif init_vars and isinstance(init_vars, list):
            covs = init_vars + test_vars
        else:
            covs = test_vars

        if n_min == 'auto':
            n_min_strata = len(covs) + 2
            n_min_tc = 3
        else:
            if not isinstance(n_min, dict):
                raise ValueError('n_min must be "auto" or a dictionary')
            if ('n_min_tc' not in n_min) or ('n_min_strata' not in n_min):
                raise ValueError('Must specify both n_min_strata (ex. K+2) '\
                                    'and n_min_tc (ex. 3)')
            n_min_strata = n_min['n_min_strata']
            n_min_tc = n_min['n_min_tc']

        # these two names are used for the Series produced further down
        if 'propscore' in covs + [outcome] or 'logodds' in covs + [outcome]:
            raise ValueError(
                'You cannot have variables labeled "propscore" or "logodds"')

        data = df[[outcome] + covs].copy()

        ord2_vars = []
        dropped_vars = []
        # looping through covariates
        for idx, cc in enumerate(covs):
            # first a gut check to make sure all the variables aren't singular
            if len(data[cc].dropna().unique()) == 1:
                raise ValueError('{} only takes on one value'.format(cc))

            # interaction of this covariate with every later one, so each
            # pair is generated exactly once
            for jj in covs[idx + 1:]:
                name = 'X'.join([cc, jj])
                testvar = data[cc] * data[jj]
                # skip interactions that merely reproduce one of their
                # factors or that are constant
                if (not testvar.equals(data[cc])
                        and not testvar.equals(data[jj])
                        and len(testvar.dropna().unique()) > 1):
                    data.loc[:, name] = testvar
                    ord2_vars.append(name)
                else:
                    dropped_vars.append(name)

            # squared term, unless squaring leaves the column unchanged
            # (e.g. a 0/1 indicator)
            sq_name = '{}_sq'.format(cc)
            squared_col = data[cc]**2
            if not data[cc].equals(squared_col):
                data.loc[:, sq_name] = squared_col
                ord2_vars.append(sq_name)
            else:
                dropped_vars.append(sq_name)

        if add_cons:
            data.loc[:, '_cons'] = 1

        self.data = data
        self.dropped_vars = dropped_vars
        self.test_vars_ord2 = ord2_vars

        # =====================================================================
        # Actually calculating propensity score
        # =====================================================================
        linear = self.model_from_group(self.test_vars,
                                       cutoff=cutoff_ord1,
                                       init_vars=self.init_vars)

        # second order terms are screened on top of the selected linear ones
        squared = self.model_from_group(ord2_vars,
                                        cutoff=cutoff_ord2,
                                        init_vars=linear)

        final_vars = squared + ['_cons'] if add_cons else squared
        self.model = Logit(self.data[self.outcome],
                           self.data[final_vars],
                           missing='drop').fit(disp=False)

        self.logodds = self.model.fittedvalues.rename('logodds')
        self.propscore = Series(self.model.predict(),
                                index=self.logodds.index,
                                name='propscore')
        self.trim_range = self.calc_trim(self.propscore)
        self.in_trim = (
            self.propscore.ge(self.trim_range[0])
            & self.propscore.le(self.trim_range[1])).rename('in_trim')
        self.strata = self.stratify(self.data[self.outcome],
                                    self.logodds,
                                    t_max=t_strata,
                                    n_min_strata=n_min_strata,
                                    n_min_tc=n_min_tc)

        if disp:
            print(self.model.summary())
            print('The following vars were infeasible: {}'.format(', '.join(
                self.dropped_vars)))
            print('Stratification produced {} strata'.format(
                len(self.strata.dropna().unique())))

    def best_in_group(self, newvars, basevars=None):
        """
        Get the best variable for score among a set of new variables.

        Parameters
        ----------
        newvars : list
            Candidate column names; each is added on its own to the baseline.
        basevars : list, optional
            Columns of the baseline model. ``_cons`` is appended when the
            instance was built with ``add_cons``.

        Returns
        -------
        tuple
            ``(name, gain)`` where ``name`` is the candidate with the highest
            log-likelihood and ``gain`` is twice its log-likelihood
            improvement over the baseline model.

        Raises
        ------
        ValueError
            If there is neither a baseline covariate nor a constant.
        """

        if not basevars and self.add_cons:
            basevars = ['_cons']
        elif basevars and self.add_cons:
            basevars = basevars + ['_cons']
        elif not basevars and not self.add_cons:
            raise ValueError(
                'Must specify at least one covariate for baseline model')

        origmod = Logit(self.data[self.outcome],
                        self.data[basevars],
                        missing='drop').fit(disp=False)
        list_llf = []
        for cc in newvars:
            try:
                newmod = Logit(self.data[self.outcome],
                               self.data[basevars + [cc]],
                               missing='drop').fit(disp=False)
            except Exception:
                # the candidate could not be estimated: remember it and give
                # it a gain of zero by scoring it like the baseline
                if cc not in self.dropped_vars:
                    self.dropped_vars.append(cc)
                list_llf.append(origmod.llf)
                continue
            # missing='drop' removes rows where the candidate is missing, so
            # compare the candidate's sample size against the baseline's
            if newmod.nobs / origmod.nobs < .95:
                warnings.warn('Using {} causes more than 5% '\
                              'of the sample to be dropped'.format(cc))
            list_llf.append(newmod.llf)
        idx = list_llf.index(max(list_llf))

        return newvars[idx], 2 * (list_llf[idx] - origmod.llf)

    def model_from_group(self, test_vars, cutoff, init_vars=None):
        """
        Iterate through a list over and over until no more contribution.

        Parameters
        ----------
        test_vars : list
            Candidate column names. The list itself is not modified.
        cutoff : Numeric
            A candidate is only added while its gain (see ``best_in_group``)
            is strictly above this value.
        init_vars : str or list, optional
            Columns that are always part of the model.

        Returns
        -------
        list
            ``init_vars`` followed by the selected candidates, in the order
            they were added.
        """
        remaining = test_vars.copy()

        if init_vars and isinstance(init_vars, str):
            final = [init_vars]
        elif init_vars and isinstance(init_vars, list):
            final = init_vars.copy()
        else:
            final = []

        while remaining:
            temp, gain_add = self.best_in_group(remaining, basevars=final)
            if gain_add <= cutoff:
                break
            final.append(temp)
            remaining.remove(temp)

        return final

    # we will define a static method so that we can call this on any generic series
    @staticmethod
    def stratify(outcome, logodds, n_min_strata, n_min_tc=3, t_max=1):
        """
        Calculate strata from a given outcome variable and log-odds. Specify the cutoff
        for the t-statistic in t_max, or the minimum number of observations for
        each strata in n_min_strata and the number of treated or control observations per
        strata in n_min_tc.

        Parameters
        ----------
        outcome : Series
            Binary variable denoting treatment outcome
        logodds : Series
            The calculated log-odds for that (transformation of propensity score).
        n_min_strata : Int
            The minimum number of observations per strata.
        n_min_tc : Int
            The minimum number of treated or control observations per strata.
            Default is 3.
        t_max : Float
            The maximum t-statistic value acceptable in a strata before splitting.
            Default is 1.

        Returns
        -------
        strata : Series
            The calculated strata. Missing propensity scores and values outside of
            min of treated group or max of control group are coded as NaN.

        Raises
        ------
        ValueError
            If either input is not a pandas Series.
        """

        if not isinstance(outcome, Series) or not isinstance(logodds, Series):
            raise ValueError('Expecting pandas series as inputs')

        # helper function to facilitate indexing
        def above_med(x):
            return (x >= x.median()).astype(int)

        outcome = outcome.rename('outcome').to_frame()
        # the column is addressed by name below, so do not rely on the
        # caller having named the series 'logodds'
        df = outcome.join(logodds.rename('logodds'))
        minmax = df.groupby('outcome')['logodds'].agg(['max', 'min'])
        # keep only the region between the lowest treated and the highest
        # control log-odds; copy so new columns can be added safely
        df = df.loc[df.logodds.ge(minmax.loc[1, 'min'])
                    & df.logodds.le(minmax.loc[0, 'max'])
                    & df.logodds.notnull()].copy()

        # initialize the strata and the potential blocks
        df['strata'] = 0
        df['block'] = 0

        while True:
            # flag, within each current stratum, the rows at or above the
            # stratum median; transform keeps the original row index so the
            # result aligns with df on every pandas version
            df['medgrp'] = df.groupby('strata')['logodds'].transform(above_med)
            for ii in df.strata.unique():
                in_stratum = df.strata.eq(ii)
                sub = df.loc[in_stratum, :]

                # calculate t-stat and a grouper with number of groups
                t_test = ttest(sub.loc[sub.outcome.eq(1), 'logodds'],
                               sub.loc[sub.outcome.eq(0), 'logodds'],
                               nan_policy='omit').statistic
                n = sub.groupby(['medgrp', 'outcome'])['logodds'].count()

                # make new blocks
                # NOTE(review): the signed statistic is compared, so only
                # strata where treated log-odds are higher get split -
                # confirm that abs() is not intended.
                if (t_test > t_max and min(n) >= n_min_tc
                        and min(n.groupby(level='medgrp').sum()) >= n_min_strata):
                    df.loc[in_stratum, 'block'] = df.loc[in_stratum, 'medgrp']

            # no stratum was split in this pass: done
            if df.block.sum() == 0:
                break

            # getting ready for next loop
            df['strata'] = df.groupby(['strata', 'block']).ngroup()
            df['block'] = 0

        # rows filtered out above come back as NaN through the join
        return outcome.join(df.strata).strata

    # we will define a static method so that we can call this on any generic series
    @staticmethod
    def calc_trim(propscore):
        """
        Calculate the lower and upper propensity score trimming bounds.

        With ``y = 1 / (p * (1 - p))``, no trimming is applied (``(0, 1)`` is
        returned) when ``max(y) <= 2 * mean(y)``. Otherwise a grid of 10000
        values from ``max(y)`` down to 0 is scanned for the first ``gamma``
        with ``gamma * mean(y <= gamma) < 2 * mean(y * (y <= gamma))`` and the
        bounds are ``alpha = .5 - sqrt(.25 - 1 / gamma)`` and ``1 - alpha``.

        Parameters
        ----------
        propscore : Series
            Propensity scores.

        Returns
        -------
        tuple
            ``(lower, upper)`` bounds, symmetric around 0.5.
        """
        # NOTE(review): scores of exactly 0 or 1 make y infinite, and a grid
        # that never satisfies the condition ends at gamma == 0 - neither
        # case is handled here.
        y = 1 / (propscore * (1 - propscore))

        if y.max() <= (2 / y.count()) * (y.sum()):
            return 0, 1

        for gamma in linspace(y.max(), 0, 10000):
            lhs_estimand = (gamma / y.count()) * (y.le(gamma).sum())
            rhs_estimand = (2 / y.count()) * ((y.le(gamma) * y).sum())
            if lhs_estimand < rhs_estimand:
                break

        alpha = .5 - ((.25 - (1 / gamma))**.5)

        return alpha, 1 - alpha
Пример #5
0
# Backward elimination: one more column is excluded at each step and the
# model is refitted on the remaining predictors.
Col_To_Drop.append('Academic_Qualification_Undergraduate')
M8 = Logit(Train_Y, Train_X.drop(Col_To_Drop, axis=1)).fit()
M8.summary()

Col_To_Drop.append('Previous_Payment_May')
M9 = Logit(Train_Y, Train_X.drop(Col_To_Drop, axis=1)).fit()
M9.summary()

Col_To_Drop.append('Repayment_Status_April')
M10 = Logit(Train_Y, Train_X.drop(Col_To_Drop, axis=1)).fit()
M10.summary()

# Apply the final column selection to both design matrices so the test
# matrix matches what M10 was fitted on.
Train_X = Train_X.drop(Col_To_Drop, axis=1)
Test_X = Test_X.drop(Col_To_Drop, axis=1)

# Predicted probabilities, thresholded at 0.5 into a 0/1 class.
Test_pred = M10.predict(Test_X)
Test['Test_prob'] = Test_pred
Test['Test_Class'] = np.where(Test['Test_prob'] >= 0.5, 1, 0)
Test['Test_Class'].value_counts()

from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and recall/precision trade places.
Con_Mat = confusion_matrix(Test_Y, Test['Test_Class'])
sum(np.diag(Con_Mat)) / Test_Y.shape[0] * 100  # 81.90%

recall_score(Test_Y, Test['Test_Class']) * 100
precision_score(Test_Y, Test['Test_Class']) * 100
f1_score(Test_Y, Test['Test_Class']) * 100

from sklearn.ensemble import RandomForestClassifier
# Frame that collects the test-set results; `d` is defined outside this
# excerpt.
test_y = pd.DataFrame(data=d)
test_y.head(10)

# In[26]:

from statsmodels.api import Logit

# In[35]:

# NOTE(review): `rm` is presumably the `random` module under an alias -
# confirm; nothing visible in this excerpt draws random numbers afterwards.
rm.seed(123)
# Logistic regression of y_train on x_train exactly as supplied (no constant
# column is added here).
SB_logit = Logit(y_train, x_train).fit()
SB_logit.summary()

# In[36]:

# Model predictions for the test predictors.
SB_pred = SB_logit.predict(x_test)

# In[37]:

# NOTE(review): this assignment aligns on the index; verify that test_y and
# x_test share the same index, otherwise rows end up NaN.
test_y['predicted'] = SB_pred
test_y.head(10)

# In[38]:

# Start with every row labelled 1 ...
test_y['pred_round'] = 1

# In[39]:

# ... then relabel rows whose prediction is below 0.5 as 0.
test_y.loc[test_y.predicted < 0.5, 'pred_round'] = 0
test_y.head(10)
Пример #7
0
def main():
    """
    Build NMF topic features from a word-count matrix and save them.

    The training subset of the population is split further into training
    (about 75%) and validation (about 25%) rows. Counts are TF-IDF
    transformed with IDF weights fitted on the training rows only. For each
    candidate number of topics an NMF model is fitted, a logit model of the
    outcome on the topic weights is estimated, and its validation AUC is
    computed; the search stops at the first candidate that does not improve
    the best AUC by more than ``delta``. The selected NMF model is applied to
    every row and the result is passed to ``SaveFeatures`` together with a
    label per topic.

    All inputs (``population``, ``outcomes_file``, ``words_file``,
    ``counts_file``, ``seed``, ``ntopics``, ``delta``, ``out``, ``manifest``)
    are read from module-level names.
    """

    index = ["RIIPL_ID"]
    pop = CachePopulationSubsets(population, index)
    pop["ROW_ID"] = np.arange(len(pop))
    outcomes = pd.read_csv(outcomes_file)
    words = pd.read_csv(words_file, index_col="WORD_ID")

    # Load counts and convert to CSR sparse matrix for efficient row slicing
    counts = mmread(counts_file).tocsr()

    # Further divide training data into training and validation sets for
    # selecting the optimal number of topics
    training = (pop["SUBSET"] == "TRAINING")
    np.random.seed(seed)
    subset = np.random.choice([True, False], len(training), p=[0.25, 0.75])
    validation = (training & subset)
    training = (training & ~subset)
    print(training.sum(), "training")
    print(validation.sum(), "validation")

    # Create training and validation outcomes
    # NOTE(review): the boolean masks are built on pop's index and applied to
    # outcomes - assumes both frames share the same index; confirm.
    y_train = outcomes.loc[training, "OUTCOME_ANY"].values
    y_validate = outcomes.loc[validation, "OUTCOME_ANY"].values
    print(y_train.sum(), "training outcomes")
    print(y_validate.sum(), "validation outcomes")

    # Transform raw counts to TF-IDF using IDF from the training set
    # (masks become positional row numbers for slicing the sparse matrix)
    training = np.where(training)[0]
    validation = np.where(validation)[0]
    counts_train = counts[training, :]
    tfidf = TfidfTransformer()
    tfidf.fit(counts_train)
    counts = tfidf.transform(counts)
    counts_train = counts[training, :]
    counts_validate = counts[validation, :]

    # Select NMF model with best AUC performance on validation data
    best = 0
    best_auc = 0
    nmfs = []
    for i, n in enumerate(ntopics):
        print(n, "topics:")
        nmf = NMF(n, random_state=seed).fit(counts_train)
        nmfs.append(nmf)
        X_train = pd.DataFrame(nmf.transform(counts_train))
        X_train["intercept"] = 1
        logit = Logit(y_train, X_train).fit(maxiter=1000, method="cg")
        print(logit.summary())
        X_validate = pd.DataFrame(nmf.transform(counts_validate))
        X_validate["intercept"] = 1
        y_pred = logit.predict(X_validate)
        auc = roc_auc_score(y_validate, y_pred)
        print("AUC:", auc)
        # stop at the first candidate that does not beat the best AUC so far
        # by more than delta
        if (auc - best_auc) <= delta:
            break
        best = i
        best_auc = auc
    print("selected", ntopics[best], "topics")

    # Turn best NMF topics into features
    features = pd.DataFrame(nmfs[best].transform(counts))
    features.columns = [
        "MEDICAID_TOPIC_{}".format(i) for i in range(ntopics[best])
    ]
    features["RIIPL_ID"] = pop["RIIPL_ID"]
    features = features.set_index("RIIPL_ID")

    # Use the top 10 words in a topic as its description.
    # argsort is ascending, so [:-11:-1] takes the ten largest weights,
    # highest first (a [-11:-1] slice would skip the single largest one).
    # NOTE(review): assumes WORD_ID equals the column position in counts.
    top10words = [
        " ".join(words.loc[i, "WORD"] for i in topic.argsort()[:-11:-1])
        for topic in nmfs[best].components_
    ]
    descs = [
        "Topic {} ({})".format(i, topic_words)
        for i, topic_words in enumerate(top10words)
    ]
    labels = dict(zip(features.columns, descs))

    SaveFeatures(features, out, manifest, population, labels)