Example #1
def score(df):

    X, y = get_X_y(df)

    vif = variance_inflation_factor
    print('VIF: ')
    for i in range(X.shape[1]):
        print(vif(X, i))

    X = add_constant(X)

    model = Logit(y, X).fit()
    print(model.summary())

    kfold = KFold(n_splits=5)

    accuracies = []
    precisions = []
    recalls = []

    for train_index, test_index in kfold.split(X):
        model = LogisticRegression(solver="lbfgs")
        model.fit(X[train_index], y[train_index])
        y_predict = model.predict(X[test_index])
        y_true = y[test_index]
        accuracies.append(accuracy_score(y_true, y_predict))
        precisions.append(precision_score(y_true, y_predict))
        recalls.append(recall_score(y_true, y_predict))

    print("Accuracy:", np.average(accuracies))
    print("Precision:", np.average(precisions))
    print("Recall:", np.average(recalls))
Example #2
def basic_significance(dataframe, list_to_dummify, target):
    '''
    fits a non-regularized logistic model to target using dataframe predictors
    prints model accuracy and returns significant coefficients ordered by absolute magnitude
    ----------
    list_to_dummify: a list of columns in string format that require dummification before modeling
    '''
    #process the dataframe
    df = dataframe.copy()
    df = dummify_columns(df, list_to_dummify)
    X, y = xy_split(df, target)
    X = add_constant(X)
    #fit the model
    logit = Logit(y, X)
    fitted_logit = logit.fit()
    #store accuracy
    c_mat = confusion_matrix(
        y, np.round(logit.predict(fitted_logit.params)))
    accuracy = sum(c_mat.diagonal()) / np.sum(c_mat)
    print('model train accuracy: %s' % (accuracy))
    #store significant coefs
    coefs = pd.DataFrame(fitted_logit.pvalues[fitted_logit.pvalues < 0.05])
    coefs['coefs'] = fitted_logit.params.filter(items=coefs.index)
    coefs.columns = ['p-values', 'coefs']
    coefs['abs_coefs'] = np.abs(coefs.coefs)
    coefs = coefs.sort_values(by='abs_coefs', ascending=False)
    coefs = coefs.drop('abs_coefs', axis=1)
    return fitted_logit, coefs
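A hedged usage sketch; the DataFrame and column names are illustrative, and dummify_columns/xy_split are the project's own helpers:

fitted, sig_coefs = basic_significance(df, ['region'], 'outcome')
print(sig_coefs)   # significant terms with p-values, sorted by |coefficient|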
Example #3
def SM_logit(X, y):
    """Fit a logistic regression using statsmodels Logit and
    return the coefficient array."""
    logit = Logit(y, X)
    result = logit.fit()
    coeff = result.params
    return coeff
Example #4
	def runAnalysis(self, y):
		log_res = [0 for i in range(0, self.m)]
		for i in range(0, self.m):
			I = [i]
			I.extend([-1])
			x = self.X[:, I]
			lr = LR(y, x)
			res_lr = lr.fit(disp=0)

			if self.params == "Coef":
				log_res[i] = float(res_lr.params[0])
			if self.params == "Odds":
				coef = float(res_lr.params[0])
				log_res[i] = math.exp(coef)
			"""
			if self.params=="pval":
				log_res[i]=;
			if self.params=="logpval":
				pval=;
				if pval>0:
					log_res[i]=-np.log10(pval);
				else:
					log_res[i]=-1.0;
			"""
		
			
		return np.asarray(log_res)
Example #5
def test_attack(n0, n1, numCov, err=.001):
    n = n0 + n1
    x = [rand.randint(0, 2) for i in range(0, n)]
    y = [1 for i in range(0, n)]
    for i in range(0, n0):
        y[i] = 0
    covs = [[rand.randint(0, 1) for i in range(0, n)]
            for j in range(0, numCov)]
    ORs = []
    for i in range(0, numCov):
        print(i)
        ret = [x]
        ret.append(covs[i])
        X = np.asarray(ret).T
        X = AC(X, False)
        lr = LR(y, X)
        res_lr = lr.fit(disp=0)
        OR = math.exp(float(res_lr.params[0]))
        ret.append(attack(y, covs, OR, err))
        ORs.append(OR)
    print "The number of matches is: "
    for r in ret:
        str(len(r))
    print ORs
    for r in ret:
        print r
    print len(ret)
Example #6
def forward_selection(dataframe, target, list_to_dummify, criteria='bic'):
    '''
    runs forward selection process to select best predictor set based on bic or aic
    returns a dictionary with the variable set and aic/bic at each step
    ----------
    criteria: default value bic, otherwise aic is used
    list_to_dummify: a list of columns in string format that require dummification before modeling
    '''
    #create target array, intercept only dataframe, and list of variables to select from
    X = pd.DataFrame()
    y = dataframe[target]
    X['const'] = np.ones(dataframe.shape[0])
    var_list = list(dataframe.columns)
    var_list.remove(target)

    #create empty dictionary to store output of each step
    models = {'model_vars': [], 'scoring_crit': []}

    #define while loop that will run until all variables have been selected
    while len(var_list) > 0:

        #define empty list to store aic/bic values temporarily for step attempt
        crit_vals = []

        #try adding variables one by one to find the lowest-criterion model for the current step
        for var in var_list:
            #create temporary df with all previously selected variables + the new variable being tried
            tempX = pd.concat([X, dataframe[var]], axis=1)
            #dummify the variable if necessary
            if var in list_to_dummify:
                tempX = dummify_columns(tempX, [var])
            #fit the logistic model
            logit = Logit(y, tempX)
            fitted_logit = logit.fit()
            #store aic or bic in a list for each variable attempted
            if criteria == 'bic':
                crit_vals += [fitted_logit.bic]
            else:
                crit_vals += [fitted_logit.aic]

        #find the index of the lowest bic model and store the name of the variable which produced it
        min_crit_idx = crit_vals.index(min(crit_vals))
        best_var = var_list[min_crit_idx]

        #add the best variable to the df
        X = pd.concat([X, dataframe[best_var]], axis=1)

        #store the variables and aic/bic for the best model at the current step
        models['model_vars'] += [list(X.columns)]
        models['scoring_crit'] += [min(crit_vals)]

        #dummify the added variable if necessary
        if best_var in list_to_dummify:
            X = dummify_columns(X, [best_var])

        #remove the added variable from the variable list and track progress
        var_list.remove(best_var)
        print('adding var: %s' % (best_var))

    return models
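An illustrative call, assuming a binary 'outcome' column and a categorical 'region' column that needs dummy coding:

models = forward_selection(df, target='outcome',
                           list_to_dummify=['region'], criteria='bic')
print(models['model_vars'][-1])   # predictor set chosen at the final step
print(models['scoring_crit'])     # BIC after each step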
Example #7
 def score(self, X, confounder_types, assignment='assignment', store_model_fit=False, intercept=True):
     df = X[[assignment]].copy()
     regression_confounders = []
     for confounder, var_type in confounder_types.items():
         if var_type == 'o' or var_type == 'u':
             c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
             if len(c_dummies.columns) == 1:
                 df[c_dummies.columns] = c_dummies[c_dummies.columns]
                 regression_confounders.extend(c_dummies.columns)
             else:
                 df[c_dummies.columns[1:]] = c_dummies[c_dummies.columns[1:]]
                 regression_confounders.extend(c_dummies.columns[1:])
         else:
             regression_confounders.append(confounder)
             df.loc[:,confounder] = X[confounder].copy()
     if intercept:
         df.loc[:,'intercept'] = 1.
         regression_confounders.append('intercept')
     logit = Logit(df[assignment], df[regression_confounders])
     result = logit.fit()
     if store_model_fit:
         self.model_fit = result
     X.loc[:,'propensity score'] = result.predict(df[regression_confounders])
     return X
Example #8
def test_perfect_prediction():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    iris_dir = os.path.join(cur_dir, '..', '..', 'genmod', 'tests', 'results')
    iris_dir = os.path.abspath(iris_dir)
    iris = np.genfromtxt(os.path.join(iris_dir, 'iris.csv'), delimiter=",",
                            skip_header=1)
    y = iris[:,-1]
    X = iris[:,:-1]
    X = X[y != 2]
    y = y[y != 2]
    X = sm.add_constant(X, prepend=True)
    mod = Logit(y,X)
    assert_raises(PerfectSeparationError, mod.fit)
    #turn off raise PerfectSeparationError
    mod.raise_on_perfect_prediction = False
    mod.fit(disp=False)  #should not raise
Example #9
def test_perfect_prediction():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    iris_dir = os.path.join(cur_dir, '..', '..', 'genmod', 'tests', 'results')
    iris_dir = os.path.abspath(iris_dir)
    iris = np.genfromtxt(os.path.join(iris_dir, 'iris.csv'), delimiter=",",
                            skip_header=1)
    y = iris[:,-1]
    X = iris[:,:-1]
    X = X[y != 2]
    y = y[y != 2]
    X = sm.add_constant(X, prepend=True)
    mod = Logit(y,X)
    assert_raises(PerfectSeparationError, mod.fit)
    #turn off raise PerfectSeparationError
    mod.raise_on_perfect_prediction = False
    mod.fit(disp=False)  #should not raise
Example #10
def logreg(X, y, train_test=True, roc=True):
    '''
    INPUT:
        - X: 2-D feature matrix
        - y: target vector
        - train_test: boolean
        - roc: boolean
    OUTPUT:
        - fitted: fitted LogitResults
    Runs statsmodels Logistic Regression and prints summary.  Uses
    train_test_split to split data if train_test = True.  Plots and shows ROC
    curve if roc = True.  Returns fitted Logistic Regression model.
    '''

    if train_test:
        X_train, X_test, y_train, y_test = train_test_split(X, y)
    else:
        X_train, X_test, y_train, y_test = X, X, y, y

    vifs, filtered = get_vifs(X_train)
    X_train, X_test = X_train[filtered], X_test[filtered]

    log_reg = Logit(y_train, add_constant(X_train, has_constant='add'))
    fitted = log_reg.fit(method='bfgs', maxiter=500)
    try:
        print(fitted.summary())
    except:
        return logreg(X, y)
    if roc:
        plot_roc(y_test, fitted.predict(add_constant(X_test, has_constant='add')))

    return fitted
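A hedged usage sketch (get_vifs and plot_roc are project helpers not shown in the listing):

# X: pandas DataFrame of features, y: 0/1 Series
fitted = logreg(X, y, train_test=True, roc=False)
print(fitted.params)   # coefficients, including the added constant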
Example #11
 def score(self,
           X,
           confounder_types,
           assignment='assignment',
           store_model_fit=False,
           intercept=True):
     df = X[[assignment]].copy()
     regression_confounders = []
     for confounder, var_type in confounder_types.items():
         if var_type == 'o' or var_type == 'u':
             c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
             if len(c_dummies.columns) == 1:
                 df[c_dummies.columns] = c_dummies[c_dummies.columns]
                 regression_confounders.extend(c_dummies.columns)
             else:
                 df[c_dummies.columns[1:]] = c_dummies[
                     c_dummies.columns[1:]]
                 regression_confounders.extend(c_dummies.columns[1:])
         else:
             regression_confounders.append(confounder)
             df.loc[:, confounder] = X[confounder].copy()
     if intercept:
         df.loc[:, 'intercept'] = 1.
         regression_confounders.append('intercept')
     logit = Logit(df[assignment], df[regression_confounders])
     result = logit.fit()
     if store_model_fit:
         self.model_fit = result
     X.loc[:,
           'propensity score'] = result.predict(df[regression_confounders])
     return X
Example #12
    def test_attributes(self):
        data = ds.df

        mask_drop = data['apply'] == "somewhat likely"
        data2 = data.loc[~mask_drop, :].copy()
        # we need to remove the category also from the Categorical Index
        data2['apply'] = data2['apply'].cat.remove_categories(
            "somewhat likely")

        # standard fit with pandas input
        modp = OrderedModel(data2['apply'],
                            data2[['pared', 'public', 'gpa']],
                            distr='logit')
        resp = modp.fit(method='bfgs', disp=False)

        exog = add_constant(data2[['pared', 'public', 'gpa']], prepend=False)
        mod_logit = Logit(data2['apply'].cat.codes, exog)
        res_logit = mod_logit.fit()

        attributes = "bse df_resid llf aic bic llnull".split()
        attributes += "llnull llr llr_pvalue prsquared".split()
        assert_allclose(resp.params[:3], res_logit.params[:3], rtol=1e-5)
        assert_allclose(resp.params[3], -res_logit.params[3], rtol=1e-5)
        for attr in attributes:
            assert_allclose(getattr(resp, attr),
                            getattr(res_logit, attr),
                            rtol=1e-4)

        resp = modp.fit(method='bfgs',
                        disp=False,
                        cov_type="hac",
                        cov_kwds={"maxlags": 2})
        res_logit = mod_logit.fit(method='bfgs',
                                  disp=False,
                                  cov_type="hac",
                                  cov_kwds={"maxlags": 2})
        for attr in attributes:
            assert_allclose(getattr(resp, attr),
                            getattr(res_logit, attr),
                            rtol=1e-4)

        resp = modp.fit(method='bfgs', disp=False, cov_type="hc1")
        res_logit = mod_logit.fit(method='bfgs', disp=False, cov_type="hc1")
        for attr in attributes:
            assert_allclose(getattr(resp, attr),
                            getattr(res_logit, attr),
                            rtol=1e-4)
Example #13
def Log_Calc(y, x):
    x1 = AC(x, False)
    lr = LR(y, x1)
    try:
        res_lr = lr.fit(disp=0)
    except:
        return -1
    return math.exp(res_lr.params[0])
Example #14
def log_reg(X_train, Y_train, X_val):
    from statsmodels.discrete.discrete_model import Logit
    from statsmodels.tools import add_constant
    X_train = add_constant(X_train)
    X_val = add_constant(X_val)
    logit = Logit(Y_train, X_train)
    fit = logit.fit(method='bfgs', maxiter=10000)
    logitprobs = fit.predict(X_val)
    return logitprobs
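A quick usage sketch with toy data; shapes and the random seed are illustrative:

import numpy as np
rng = np.random.default_rng(0)
X_train = rng.normal(size=(100, 3))
Y_train = rng.integers(0, 2, size=100)
X_val = rng.normal(size=(20, 3))
probs = log_reg(X_train, Y_train, X_val)   # P(y=1) for each validation row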
Example #15
def select_features(X, y):
    if len(list(set(list(y)))) == 2:
        model = Logit(y, X)
    else:
        model = OLS(y, X)
    res = model.fit(disp=False)
    features = multitest.multipletests(res.pvalues, method='holm')[0]
    X = X[:, features]
    return X
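The imports this snippet assumes (multipletests with method='holm' applies the Holm-Bonferroni correction to the per-coefficient p-values):

# assumed imports for the snippet above
from statsmodels.discrete.discrete_model import Logit
from statsmodels.regression.linear_model import OLS
from statsmodels.stats import multitest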
Example #16
def get_logit_coef(X, y, cols=None):
    if cols:
        X_fit = X[cols]
    else:
        X_fit = X
    X_fit = sm.add_constant(X_fit)
    logit = Logit(y, X_fit)
    fit = logit.fit()
    print(fit.summary())
Example #17
File: model.py Project: pombredanne/drain
class LogisticRegression(object):
    def __init__(self):
        pass

    def fit(self, X, y, **kwargs):
        self.model = Logit(y, X)
        self.result = self.model.fit()
    
    def predict_proba(self, X):
        return self.result.predict(X)
Example #18
    def _initialize(cls):
        y, x = cls.y, cls.x

        modp = Logit(y, x)
        cls.res2 = modp.fit(disp=0)

        mod = LogitPenalized(y, x, penal=cls.penalty)
        mod.pen_weight = 0
        cls.res1 = mod.fit(disp=0)

        cls.atol = 1e-4  # why not closer ?
Example #19
File: model.py Project: stegben/drain
class LogisticRegression(object):
    def __init__(self):
        pass

    def fit(self, X, y, **kwargs):
        from statsmodels.discrete.discrete_model import Logit
        self.model = Logit(y, X)
        self.result = self.model.fit()

    def predict_proba(self, X):
        return self.result.predict(X)
Example #20
    def _initialize(cls):
        y, x = cls.y, cls.x

        modp = Logit(y, x)
        cls.res2 = modp.fit(disp=0)

        mod = LogitPenalized(y, x, penal=cls.penalty)
        mod.pen_weight = 0
        cls.res1 = mod.fit(disp=0)

        cls.atol = 1e-4  # why not closer ?
Example #21
File: model.py Project: dean12/drain
class LogisticRegression(object):
    def __init__(self):
        pass

    def fit(self, X, y, **kwargs):
        from statsmodels.discrete.discrete_model import Logit
        self.model = Logit(y, X)
        self.result = self.model.fit()
    
    def predict_proba(self, X):
        return self.result.predict(X)
Example #22
def logregress(xi, xj, *args, **kwargs):
    x = np.vstack((xi, xj))[::3]
    y = np.vstack((np.zeros((xi.shape[0], 1)), np.ones((xj.shape[0], 1))))[::3]
    scaler = MinMaxScaler([-1, 1])
    scaler.fit(x)
    x = scaler.transform(x)
    #clf = LogisticRegression(random_state=0).fit(x, y[:, 0])
    model = Logit(y, x)
    res = model.fit()
    #print(res.prsquared)
    return res.prsquared
Example #23
    def setup_class(cls):
        df = data_bin
        mod = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
                  family=families.Binomial())
        res = mod.fit(method="newton", tol=1e-10)
        from statsmodels.discrete.discrete_model import Logit
        mod2 = Logit(df['constrict'], df[['const', 'log_rate', 'log_volumne']])
        res2 = mod2.fit(method="newton", tol=1e-10)

        cls.infl1 = res.get_influence()
        cls.infl0 = res2.get_influence()
Example #24
def create_Logit(X, y):
    '''
    creates a statsmodels logistic regression model with 'linked click ' as the target variable
    INPUT: pandas dataframe
    OUTPUT: statsmodels Logistic Regression model
    '''
    X = X.copy()
    X['constant'] = 1
    X.pop('email_id')
    logit = Logit(y, X)
    model = logit.fit(maxiter=400)
    return model
Example #25
    def _initialize(cls):
        y, x = cls.y, cls.x
        modp = Logit(y, x[:, :cls.k_nonzero])
        cls.res2 = modp.fit(disp=0)

        mod = LogitPenalized(y, x, penal=cls.penalty)
        mod.pen_weight *= .5
        mod.penal.tau = 0.05
        cls.res1 = mod.fit(method='bfgs', maxiter=100, disp=0)

        cls.exog_index = slice(None, cls.k_nonzero, None)

        cls.atol = 5e-3
Example #26
    def _initialize(cls):
        y, x = cls.y, cls.x
        modp = Logit(y, x[:, :cls.k_nonzero])
        cls.res2 = modp.fit(disp=0)

        mod = LogitPenalized(y, x, penal=cls.penalty)
        mod.pen_weight *= .5
        mod.penal.tau = 0.05
        cls.res1 = mod.fit(method='bfgs', maxiter=100, disp=0)

        cls.exog_index = slice(None, cls.k_nonzero, None)

        cls.atol = 5e-3
Example #27
    def score(
        self,
        X,
        confounder_types,
        assignment="assignment",
        store_model_fit=False,
        intercept=True,
        propensity_score_name="propensity score",
    ):
        """
        Fit a propensity score model using the data in X and the confounders listed in confounder_types. This adds
        the propensity scores to the dataframe, and returns the new dataframe.

        :param X: The data set, with (at least) an assignment, set of confounders, and an outcome
        :param assignment: A categorical variable (currently, 0 or 1) indicating test or control group, resp.
        :param outcome: The outcome of interest.  Should be real-valued or ordinal.
        :param confounder_types: A dictionary of variable_name: variable_type pairs of strings, where
        variable_type is in {'c', 'o', 'd'}, for 'continuous', 'ordinal', and 'discrete'.
        :param store_model_fit: boolean, Whether to store the model as an attribute of the class, as
        self.propensity_score_model
        :param intercept: Whether to include an intercept in the logistic regression model
        :return: A new dataframe with the propensity scores included
        """
        df = X[[assignment]].copy()
        regression_confounders = []
        for confounder, var_type in confounder_types.items():
            if var_type == "o" or var_type == "u":
                c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
                if len(c_dummies.columns) == 1:
                    df = pd.concat([df, c_dummies[c_dummies.columns]], axis=1)
                    regression_confounders.extend(c_dummies.columns)
                else:
                    df = pd.concat([df, c_dummies[c_dummies.columns[1:]]],
                                   axis=1)
                    regression_confounders.extend(c_dummies.columns[1:])
            else:
                regression_confounders.append(confounder)
                df.loc[:, confounder] = X[confounder].copy()
        if intercept:
            df.loc[:, "intercept"] = 1.0
            regression_confounders.append("intercept")
        logit = Logit(df[assignment], df[regression_confounders])
        model = logit.fit()
        if store_model_fit:
            self.propensity_score_model = model
        X.loc[:, propensity_score_name] = model.predict(
            df[regression_confounders])
        return X
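A hedged usage sketch; matcher stands for an instance of the (unshown) class this method belongs to, and the column names are illustrative ('c' continuous, 'o' ordinal, per the docstring):

scored = matcher.score(df,
                       confounder_types={'age': 'c', 'education': 'o'},
                       assignment='assignment')
print(scored['propensity score'].head())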
Example #28
    def regressor(y, X, model_type=model_type):
        if model_type == "linear":
            regressor = sm.OLS(y, X).fit()
        elif model_type == "logistic":
            # df = pd.DataFrame({'x':[X], 'y':[y]})
            # regressor = sm.logit('y~x', data=df)
            regressor = Logit(y, X)
            regressor = regressor.fit()

        else:
            print("\nWrong Model Type : " + model_type +
                  "\nLinear model type is seleted.")
            model_type = "linear"
            regressor = sm.OLS(y, X).fit()
        return regressor
Example #29
def test_attack_nocovar(n0, n1, err=.001):
    n = n0 + n1
    x = [rand.randint(0, 2) for i in range(0, n)]
    y = [1 for i in range(0, n)]
    for i in range(0, n0):
        y[i] = 0
    x = AC(x, False)
    lr = LR(y, x)
    res_lr = lr.fit(disp=0)
    OR = math.exp(float(res_lr.params[0]))
    ret = attack_no_covar(n0, n1, OR, err)
    print "The number of matches is: " + str(len(ret))
    print OR
    for r in ret:
        print r
Example #30
def _LRT(dependent_var_name: str, independent_var_names: List[str],
         study_df: pd.DataFrame) -> Tuple[dict, dict]:

    from statsmodels.discrete.discrete_model import Logit
    from scipy.linalg import LinAlgError
    from statsmodels.tools.sm_exceptions import PerfectSeparationError
    from tqdm import tqdm

    dic_pvalues = {}
    dic_errors = {}
    for independent_var_name in tqdm(independent_var_names,
                                     position=0,
                                     leave=True):
        print(independent_var_name)
        subset_df = study_df.loc[:, [dependent_var_name, independent_var_name]]\
                            .dropna(how="any")\
                            .copy()

        if subset_df.shape[0] == 0:
            dic_pvalues[independent_var_name] = np.nan
            dic_errors[independent_var_name] = "All NaN"
            continue

        if subset_df[independent_var_name].dtype in ["object", "bool"]:
            subset_df = pd.get_dummies(subset_df,
                                       columns=[independent_var_name],
                                       drop_first=False)\
                          .iloc[:, 0:-1]
        y = subset_df[dependent_var_name].cat.codes
        X = subset_df.drop(dependent_var_name, axis=1)\
                                .assign(intercept = 1)
        model = Logit(y, X)
        try:
            results = model.fit(disp=0)
            params = results.params.drop("intercept", axis=0)
            conf = np.exp(results.conf_int().drop("intercept", axis=0))
            conf['OR'] = np.exp(params)
            conf["pvalue"] = results.pvalues.drop("intercept", axis=0)
            conf = conf.rename({0: 'lb', 1: 'ub'}, axis=1)
            dic_or = conf.to_dict(orient="index")
            dic_pvalues[independent_var_name] = {
                "llr_pvalue": results.llr_pvalue,
                **dic_or
            }
        except (LinAlgError, PerfectSeparationError) as e:
            dic_pvalues[independent_var_name] = np.nan
            dic_errors[independent_var_name] = str(e)
    return dic_pvalues, dic_errors
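An illustrative call; the dependent variable must be a pandas Categorical, since the function reads .cat.codes, and the column names are assumed:

study_df['status'] = study_df['status'].astype('category')
pvals, errors = _LRT('status', ['age', 'sex', 'bmi'], study_df)
# pvals maps each predictor to its llr_pvalue plus per-term ORs and CIs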
Example #31
class LogReg:
    def __init__(self):
        self.coef_=None

    def fit(self,X,y):
        # use prepend=False so the constant is the last column and
        # params[:-1] below keeps only the feature coefficients
        X=add_constant(X,prepend=False)
        self.lr=Logit(y,X)
        self.l_fitted=self.lr.fit()
        self.coef_=self.l_fitted.params[:-1]

    def predict(self,X):
        if self.coef_ is None:
            print('you must first fit the model')
            return
        X=add_constant(X,prepend=False)
        return(self.lr.predict(self.l_fitted.params,X))
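A brief usage sketch with toy data:

import numpy as np
X = np.random.random((200, 3))
y = (X[:, 0] + 0.5 * np.random.randn(200) > 0.5).astype(int)
model = LogReg()
model.fit(X, y)
probs = model.predict(X)   # predicted probabilities from the fitted Logit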
Example #32
def get_logit_results(feature_set,
                      feature_names,
                      n,
                      pos_words,
                      neg_words,
                      diff_func=None):
    """
    Fit logistic regression to predict pos_words from
    neg_words according to the mean difference between
    their feature values (up to training month n).
    
    Parameters:
    -----------
    feature_set : [pandas.DataFrame]
    Rows = words, cols = dates.
    feature_names : [str]
    n : int
    pos_words : [str]
    neg_words : [str]
    diff_func : func(x,y : z)
    Compute difference z between vectors x and y.

    Returns:
    --------
    logit_results : statsmodels.discrete.discrete_model.LogitResults
    """
    M = len(feature_set)
    stats = pd.concat([s.iloc[:, 0:n] for s in feature_set], axis=1)
    if (diff_func is None):
        diff_func = lambda x, y: x - y
    X, Y = get_differenced_data(pos_words, neg_words, stats, diff_func)
    # mean of differences
    X_mean = np.hstack([
        np.mean(X.iloc[:, i * n:(i + 1) * n - 1],
                axis=1).values.reshape(-1, 1) for i in range(M)
    ])
    X = pd.DataFrame(MinMaxScaler().fit_transform(X_mean),
                     columns=feature_names)
    # remove stats with 0 variance
    X = X.loc[:, X.var() > 0.]
    X = add_constant(X)
    logit = Logit(Y, X)
    logit_results = logit.fit()
    return logit_results
Example #33
def attack_no_covar(n0, n1, OR, err=.001):
    n = n0 + n1
    y = [1 for i in range(0, n)]
    for i in range(0, n0):
        y[i] = 0
    num_match = 0
    ret = []
    ##iterate through all possibilities and test
    for i0 in range(0, n0 + 1):  ##number in controls with 2 minor alleles
        for i1 in range(
                0, (n0 - i0 + 1)):  ##number in controls with 1 minor alleles
            for j0 in range(0, n1 + 1):  ##number in cases with 2 minor alleles
                for j1 in range(
                        0,
                    (n1 - j0 + 1)):  ##number in cases with 1 minor alleles
                    i2 = n0 - i0 - i1
                    j2 = n1 - j0 - j1
                    x = [0 for i in range(0, n)]

                    x[:i0] = [2 for i in range(0, i0)]
                    cur = i0
                    x[cur:cur + i1] = [1 for i in range(0, i1)]
                    cur = cur + i1
                    cur = cur + i2
                    x[cur:cur + j0] = [2 for i in range(0, j0)]
                    cur = cur + j0
                    x[cur:cur + j1] = [1 for i in range(0, j1)]
                    res_lr = ""
                    x = AC(x, False)
                    lr = LR(y, x)
                    try:
                        res_lr = lr.fit(disp=0)
                    except:
                        #print x;
                        continue
                    try:
                        OR_cur = math.exp(float(res_lr.params[0]))
                    except:
                        continue
                    if round_sig(OR_cur, err) == round_sig(OR, err):
                        num_match = num_match + 1
                        ret.append([i0, i1, i2, j0, j1, j2])
    #print "match!\n";
    return ret
Example #34
def simple_model(motif_results_A, non_results_A, motif_results_B,
                 non_results_B):
    all_results = motif_results_A + non_results_A + motif_results_B + non_results_B
    is_diplo = np.array([check_is_diplo(result) for result in all_results],
                        dtype="int")
    total_gpc = (len(motif_results_A) + len(non_results_A))
    is_gpc = np.zeros(len(all_results), dtype="int")
    is_gpc[:total_gpc] = 1
    motif = np.zeros(len(all_results), dtype="int")
    motif[:len(motif_results_A)] = 1
    motif[total_gpc:total_gpc + len(motif_results_B)] = 1
    X = np.hstack((np.ones_like(is_gpc)[:, None], is_gpc[:, None],
                   is_diplo[:, None], (is_gpc * is_diplo)[:, None]))
    print(np.sum(X * motif[:, None], axis=0) / np.sum(X, axis=0))
    print(np.sum((1 - X) * motif[:, None], axis=0) / np.sum((1 - X), axis=0))
    y = motif
    model = Logit(y, X)
    result = model.fit()
    print(result.summary())
Example #35
def main():
    """
        Method to test the implementation
    """
    my_log = MyLogisticRegression()
    sk_log = LogisticRegression(C=1000)

    X = np.random.random((50, 4))
    y = np.random.randint(2, size=50)[:, None]

    my_log.fit(X, y)
    sk_log.fit(X, y)

    exog = add_constant(X)
    lr = Logit(y, exog)
    lrf = lr.fit()

    # print(sk_log.coef_, my_log.W, lrf.summary())
    assert np.allclose(sk_log.coef_, my_log.W.T, .1), 'incorrect coefs'
Example #36
def attack(y, cov, OR, err=.001, bnd=.5, numStep=10):
    n = n0 + n1
    num_match = 0
    ret = []
    COV = [y]
    COV.extend(cov)
    COV = np.asarray(COV).T
    iter = IterTable(COV)
    cur = 0
    iter.next()
    [yCur, covCur] = iter.get()
    while not iter.isDone():
        #print cur;
        cur = cur + 1
        [yCur, covCur] = iter.get()

        lr = LR(yCur, AC(covCur, False))
        try:
            res_lr = lr.fit(disp=0)
        except:
            iter.next()
            continue

        OR_cur = 1.0

        try:
            OR_cur = math.exp(float(res_lr.params[0]))
        except:
            iter.next()
            continue
        if abs(OR_cur - OR) < err:
            print "match"
            num_match = num_match + 1
            ret.append(iter.getTable())
        #if abs(OR-OR_cur)/OR_cur>bnd:
        #for i in range(0,numStep):
        #iter.next();
        iter.next()
    return ret
Example #37
def logit(df, y_var, X_vars, add_intercept=True):
    """
    This function replicates probit in STATA, for probit model.
    至少有一个固定效应变量,至多只能有两个。
    y变量应为0-1变量。

    Inputs.
    ---------
    df:pd.DataFrame, the data for the regression.
    y_var:str, the column name of the dependent variable
    X_vars:list of str, the list of explanatory variable names

    Outputs.
    ---------
    res:obj

    """
    new_df = df.copy()
    new_df = new_df.dropna()
    y = new_df[y_var]

    if add_intercept:
        new_df['intercept'] = 1.0
        X = new_df[['intercept'] + X_vars]
    else:
        X = new_df[X_vars]

    logit_mod = Logit(endog=y, exog=X, check_rank=True, missing="drop")
    res = logit_mod.fit(start_params=None,
                        method='newton',
                        maxiter=35,
                        full_output=1,
                        disp=1,
                        callback=None)

    return res
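An illustrative call (DataFrame and column names assumed):

res = logit(df, y_var='employed', X_vars=['age', 'tenure'])
print(res.summary())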