Example #1
    def get_z_LinearRegression(self,xo,yo):
        print('linear regression')
        dist_sigma = 1000
        xx= self.Knots[:,0]
        yy= self.Knots[:,1]
        dd = np.sqrt((xx-xo)**2 + (yy-yo)**2)
        print("dd", dd)
        exponent = -(dd**2)/(2*(dist_sigma**2))
        print("exponent", exponent)
        weights = np.exp(exponent)
        print("weights", weights)

        X = self.Knots[:,0:2]
        X = sm.add_constant(X)
        y = self.Knots[:,2]

        mod_wls = sm.WLS(y, X, weights=weights)
        res_wls = mod_wls.fit()
        print(res_wls.summary())
        p = np.zeros((2,2),dtype=X.dtype)
        p[0,0] = xo
        p[0,1] = yo
        p[1,0] = xo
        p[1,1] = yo
        # both rows are identical, so force the constant column explicitly
        p = sm.add_constant(p, has_constant='add')
        z = res_wls.predict(p)
        print("zshape", z)

        return z[0]
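The method above is a locally weighted plane fit: each knot gets a Gaussian kernel weight exp(-d^2 / (2*dist_sigma^2)) and a weighted least-squares plane is evaluated at the query point. A standalone sketch of the same idea, with made-up knots (not part of the original class):

import numpy as np
import statsmodels.api as sm

# Hypothetical scattered knots (x, y, z) and a query point (xo, yo).
knots = np.array([[0.0, 0.0, 1.0],
                  [1000.0, 0.0, 2.0],
                  [0.0, 1000.0, 3.0],
                  [1000.0, 1000.0, 4.0]])
xo, yo, dist_sigma = 500.0, 500.0, 1000.0

# Gaussian kernel weights: knots near the query point dominate the fit.
d = np.hypot(knots[:, 0] - xo, knots[:, 1] - yo)
weights = np.exp(-d**2 / (2 * dist_sigma**2))

# Weighted least squares of z on (const, x, y), evaluated at the query point.
X = sm.add_constant(knots[:, :2])
res = sm.WLS(knots[:, 2], X, weights=weights).fit()
print(res.predict([[1.0, xo, yo]])[0])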
def detailedMultipleRegression(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results # WARNING : coef not in right order !
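The warning exists because the loop keeps prepending each new predictor, so the fitted coefficients come back in reverse order of x, with the constant last. A sketch of an alternative construction (an assumption about usage: x is a list of equal-length 1-D predictor arrays) that keeps the parameters in the order const, x[0], x[1], ...:

import numpy as np
import statsmodels.api as sm

def multiple_regression_ordered(y, x):
    # Stack predictors as columns in their given order, then prepend the constant,
    # so results.params lines up as (const, x[0], x[1], ...).
    X = sm.add_constant(np.column_stack(x))
    return sm.OLS(y, X).fit()

# Tiny synthetic check: y = 1 + 2*a + 3*b
rng = np.random.default_rng(0)
a, b = rng.normal(size=100), rng.normal(size=100)
y = 1 + 2 * a + 3 * b + rng.normal(scale=0.01, size=100)
print(multiple_regression_ordered(y, [a, b]).params)  # approximately [1, 2, 3]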
    def test_plot_influence(self, close_figures):
        infl = self.res.get_influence()
        fig = influence_plot(self.res)
        assert_equal(isinstance(fig, plt.Figure), True)
        # test that we have the correct criterion for sizes #3103
        try:
            sizes = fig.axes[0].get_children()[0]._sizes
            ex = sm.add_constant(infl.cooks_distance[0])
            ssr = sm.OLS(sizes, ex).fit().ssr
            assert_array_less(ssr, 1e-12)
        except AttributeError:
            import warnings
            warnings.warn('test not compatible with matplotlib version')

        fig = influence_plot(self.res, criterion='DFFITS')
        assert_equal(isinstance(fig, plt.Figure), True)
        try:
            sizes = fig.axes[0].get_children()[0]._sizes
            ex = sm.add_constant(np.abs(infl.dffits[0]))
            ssr = sm.OLS(sizes, ex).fit().ssr
            assert_array_less(ssr, 1e-12)
        except AttributeError:
            pass

        assert_raises(ValueError, influence_plot, self.res, criterion='unknown')
Example #4
def error_rate_for_model(test_model, train_set, test_set, infer=False,
    infer_set=None):
    """Report error rate on test_doc sentiments, using
    supplied model and train_docs"""

    train_targets, train_regressors = \
        zip(*[( doc.sentiment, test_model.docvecs[doc.tags[0]] )
                for doc in train_set ])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        #if infer_subsample < 1.0:
        #    test_data = sample(test_data,
        #                       int(infer_subsample * len(test_data)))
        #test_regressors = [test_model.infer_vector(doc.words,
        #                       steps=infer_steps, alpha=infer_alpha)
        #                   for doc in test_data]
        test_data = [SentimentDocument(None, None, None, s)
                     for (v, s) in infer_set]
        test_regressors = [v for (v, s) in infer_set]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]]
                           for doc in test_set]
    test_regressors = sm.add_constant(test_regressors)

    # predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = ( sum(np.rint(test_predictions) ==
                 [doc.sentiment for doc in test_data] ))
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)
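The helper logistic_predictor_from_data is not shown in this snippet; a plausible minimal version (an assumption, not the original code) simply fits a statsmodels logit of the binary sentiment targets on the constant-augmented document vectors:

import statsmodels.api as sm

def logistic_predictor_from_data(train_targets, train_regressors):
    # Assumed implementation: train_regressors already includes the constant column.
    logit = sm.Logit(train_targets, train_regressors)
    return logit.fit(disp=0)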
def regression(json_data, bandwidth):
    
    latency = []
    rtt_by_size = []

    # RTT object 
    # rtt = {[avg_rtt1]: [rtt1, rtt2, rtt3, ..., rtcx],
    #         [avg_rtt2]: [...]}

    for i in range(0, len(json_data)):
        latency.append(json_data[i]["latency"])
        rtt_by_size.append(json_data[i]["size"] * json_data[i]["rtt"])

    y = np.array(bandwidth).astype(float)
    z = np.array(latency).astype(float)
    r = np.array(rtt_by_size).astype(float)

    data = np.array([rtt_by_size, y])

    ones = np.ones(len(data[0]))
    X = sm.add_constant(np.column_stack((data[0], ones)))
    for ele in data[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(z, X).fit()
    print(results.summary())
Example #6
def overfit_stocks():
  # Load one year's worth of pricing data for five different assets
  start = datetime.datetime(2013, 1, 1)
  end = datetime.datetime(2014, 1, 1)
  x1 = get_pricing('PEP', fields='price', start_date=start, end_date=end)
  x2 = get_pricing('MCD', fields='price', start_date=start, end_date=end)
  x3 = get_pricing('ATHN', fields='price', start_date=start, end_date=end)
  x4 = get_pricing('DOW', fields='price', start_date=start, end_date=end)
  y = get_pricing('PG', fields='price', start_date=start, end_date=end)
  #
  # Build a linear model using only x1 to explain y
  slr = regression.linear_model.OLS(y, sm.add_constant(x1)).fit()
  slr_prediction = slr.params[0] + slr.params[1]*x1
  #
  # Run multiple linear regression using x1, x2, x3, x4 to explain y
  mlr = regression.linear_model.OLS(y, sm.add_constant(np.column_stack((x1,x2,x3,x4)))).fit()
  mlr_prediction = mlr.params[0] + mlr.params[1]*x1 + mlr.params[2]*x2 + mlr.params[3]*x3 + mlr.params[4]*x4
  #
  # Compute adjusted R-squared for the two different models
  print('SLR R-squared: %.5f' %slr.rsquared_adj)
  print('SLR p-value: %.5f' %slr.f_pvalue)
  print('MLR R-squared: %.5f' %mlr.rsquared_adj)
  print('MLR p-value: %.5f'  %mlr.f_pvalue)
  #
  # Plot y along with the two different predictions
  y.plot()
  slr_prediction.plot()
  mlr_prediction.plot()
  plt.ylabel('Price')
  plt.xlabel('Date')
  plt.legend(['PG', 'SLR', 'MLR']);
def make_g_model(daily_results, daily_projections):
    daily_results_common = unify_dfs(daily_results)
    dfm = create_master(daily_results_common)
    dfm['NF'] = dfm['NF'].astype(float)
    dfm = eliminate_zeros(dfm)
    X = pd.get_dummies(dfm[['Salary', 'RG', 'NF', 'RW', 'POS', 'Depth']])
    X = pd.concat([X.drop('Depth', axis=1), pd.get_dummies(X['Depth'])], axis=1)
    if 'POS_' in X.columns:
        X.drop('POS_', axis=1, inplace=True)
    #if 3 in X.columns:
        #X.drop(3, axis=1, inplace=True)
    print(X.columns)
    X = sm.add_constant(X)
    info = dfm[['Player', 'Date', 'Time']]
    y = dfm['FD']
    model=sm.OLS(y, X).fit()
    today = daily_projections[date_string]
    X = pd.get_dummies(today[['Salary', 'RG', 'NF', 'RW', 'POS', 'Depth']])
    X = pd.concat([X.drop('Depth', axis=1), pd.get_dummies(X['Depth'])], axis=1)
    print(X.columns)
    if 'POS_' in X.columns:
        X.drop('POS_', axis=1, inplace=True)
    X = sm.add_constant(X)
    g_model = model.predict(X)
    return g_model  
Example #8
    def weak_instruments(self, n_sims=20):

        np.random.seed(1692)

        model = feedforward.FeedForwardModel(19, 1, dense_size=60, n_dense_layers=2)

        treatment_effects = []
        ols_betas, ols_ses = [], []
        old_corrs, new_corrs = [], []
        for _ in range(n_sims):
            df = self.treatment_gen.simulate_data(False)

            X = np.hstack((self.x, df['new_treat'].values[:, None]))
            Z = np.hstack((self.x, df['instrument'].values[:, None]))

            ols_beta, ols_se = self.fit_ols(df['treatment_effect'], X)
            ols_betas.append(ols_beta)
            ols_ses.append(ols_se)

            old_corr = df[['instrument', 'new_treat']].corr().values[0, 1]
            new_instrument, new_corr = model.fit_instruments(X, Z, df['treatment_effect'].values, batchsize=128)
            new_corrs.append(new_corr)
            old_corrs.append(old_corr)

            Z2 = Z.copy()
            Z2[:, -1] = new_instrument[:, 0]

            iv = IV2SLS(df['treatment_effect'].values.flatten(), add_constant(X), add_constant(Z2))

            model.reset_params()

        if new_corr:
            logger.info("Old corr: %.2f, New corr: %.2f", np.mean(old_corrs), np.mean(new_corrs))
        logger.info("Treatment effect (OLS): %.3f (%.4f)", np.mean(ols_betas), np.mean(ols_ses))
        logger.info("Treatment effect: %.3f (%.4f)", np.mean(treatment_effects), np.std(treatment_effects))
Example #9
def scatter(filename, x, y, line=True, xr=None, yr=None, x_title='', y_title='', title=None):
    if title is None:
        title = filename

    plt.figure(figsize=(24,18), dpi=600)
    plt.scatter(x, y)

    if xr is not None:
        plt.xlim(xr)
    if yr is not None:
        plt.ylim(yr)

    if line:
        est = sm.OLS(y, sm.add_constant(x)).fit()
        x_prime = np.linspace(min(x), max(x), 100)[:, np.newaxis]
        x_prime = sm.add_constant(x_prime)
        y_hat = est.predict(x_prime)
        line_plot1 = plt.plot(x_prime[:, 1], y_hat, 'r', alpha=0.9, label='r^2 = %s' % est.rsquared)
        #res = linregress(x,y)
        #line_plot2 = plt.plot([min(x), max(x)], [res[0]*min(x)+res[1], res[0]*max(x)+res[1]],
        #                      'g', alpha=0.9, label='r^2 = %s' % res[2])
        plt.legend(['r^2 = %s' % est.rsquared])

    plt.xlabel(x_title)
    plt.ylabel(y_title)
    plt.title(title)

    plt.savefig('%s.png' % filename, format='png')
    plt.savefig('%s.eps' % filename, format='eps')
    plt.close()
Example #10
File: stats_1.py Project: tjisana/IEOR4739
def reg_m(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results
    def predict(self, test_X):
        dataset = self.__dataset
        intercept = self.__intercept
        XX_inv  = self.__XX_inv
        beta    = self.__beta
        
        train_X = sm.add_constant(dataset[:, :-1]) if intercept else dataset[:, :-1]
        test_X = sm.add_constant(vec(test_X)) if intercept else vec(test_X)
        train_Y = dataset[:, -1:]
        train_pred = np.dot(train_X, beta)
        
        # Confidence interval
        sig = (np.linalg.norm(train_Y-train_pred)**2/(train_X.shape[0]-train_X.shape[1]+1))**0.5
        s = []
        for row in range(test_X.shape[0]):
            x = test_X[[row], :]
            s.append(sig*(1 + np.dot(np.dot(x, XX_inv), x.T))**0.5)
            
        s = np.reshape(np.asarray(s), (test_X.shape[0], 1))

        test_pred = np.dot(test_X, beta)
        hi_ci = test_pred + 2*s
        lo_ci = test_pred - 2*s

        return test_pred, hi_ci, lo_ci
def get_r_stat(sim_data):
    try:
        x = sm.add_constant(sim_data.rt_sampled.sort_values().reset_index().rt_sampled)
        y = sim_data.rt.sort_values().reset_index().rt
        fitted = sm.OLS(y,x).fit()
        log_x = sm.add_constant(sim_data.log_rt_sampled.sort_values().reset_index().log_rt_sampled)
        log_y = sim_data.log_rt.sort_values().reset_index().log_rt
        log_fitted = sm.OLS(log_y,log_x).fit()
        sub_out = pd.DataFrame([{'int_val': fitted.params.const,
                                'int_pval': fitted.pvalues.const,
                                'slope_val': fitted.params.rt_sampled,
                                'slope_pval':fitted.pvalues.rt_sampled,
                                'rsq': fitted.rsquared,
                                'rsq_adj': fitted.rsquared_adj,
                                'log_int_val': log_fitted.params.const,
                                'log_int_pval': log_fitted.pvalues.const,
                                'log_slope_val': log_fitted.params.log_rt_sampled,
                                'log_slope_pval': log_fitted.pvalues.log_rt_sampled,
                                'log_rsq': log_fitted.rsquared,
                                'log_rsq_adj': log_fitted.rsquared_adj}], index=[0])
    except Exception:
        sub_out = pd.DataFrame([{'int_val': np.nan,
                                'int_pval': np.nan,
                                'slope_val': np.nan,
                                'slope_pval': np.nan,
                                'rsq': np.nan,
                                'rsq_adj': np.nan,
                                'log_int_val': np.nan,
                                'log_int_pval': np.nan,
                                'log_slope_val': np.nan,
                                'log_slope_pval': np.nan,
                                'log_rsq': np.nan,
                                'log_rsq_adj': np.nan}], index=[0])
    return sub_out
Example #13
def GetCoef(start_train, end_train, StockReturns, CarhartDaily, SP500Returns, DataFolder):
    if os.path.isfile(r'%s\Coef_%s_%s.csv' % (DataFolder, start_train.date(), end_train.date())):
        Coef = pd.read_csv(r'%s\Coef_%s_%s.csv' % (DataFolder, start_train.date(), end_train.date()))
        return Coef
    else:
        Coef = pd.DataFrame()
        for ticker in StockReturns.ticker.unique():
            print("Getting regression coefficient for %s" % ticker)
            tmpReturn = StockReturns[(StockReturns.ticker == ticker)]
            if not tmpReturn.empty:
                tmpData = tmpReturn.merge(CarhartDaily, left_on = 'endDate', right_on = 'date')
                tmpData = tmpData.merge(SP500Returns, on = 'endDate')
                tmpData['SP500-RF'] = tmpData['SP500Return']*100 - tmpData['RF']
                y = tmpData['return']*100 - tmpData['RF']
                X1 = tmpData[['Mkt-RF', 'SMB', 'HML', 'UMD']]
                X2 = tmpData[['Mkt-RF']]
                X3 = tmpData[['SP500-RF']]
                X1 = sm.add_constant(X1)
                X2 = sm.add_constant(X2)
                X3 = sm.add_constant(X3)
                model1 = sm.OLS(y, X1).fit()
                model2 = sm.OLS(y, X2).fit()
                model3 = sm.OLS(y, X3).fit()
                tmpDF1 = pd.DataFrame(model1.params).T
                tmpDF1.rename( columns = {'const' : 'alphaFF'}, inplace = True)
                tmpDF2 = pd.DataFrame(model2.params).T
                tmpDF2.rename( columns = {'const' : 'alphaCAPM', 'Mkt-RF' : 'Mkt-RF_only'}, inplace = True)
                tmpDF3 = pd.DataFrame(model3.params).T
                tmpDF3.rename( columns = {'const' : 'alphaSP500'}, inplace = True )
                tmpDF = pd.concat((tmpDF1, tmpDF2, tmpDF3), axis = 1)
                tmpDF['ticker'] = ticker
                Coef = pd.concat([Coef, tmpDF])
        Coef.to_csv(r'%s\Coef_%s_%s.csv' % (DataFolder, start_train.date(), end_train.date()), index = False)
        print(r'Finished saving regression coefficients to: %s\Coef_%s_%s.csv' % (DataFolder, start_train.date(), end_train.date()))
        return Coef
Example #14
File: main.py Project: arrowrowe/ec310
    def __init__(self):
        self.researched = util.source.read('F-F_Research_Data_5_Factors_2x3')
        self.portfolios = util.source.read('25_Portfolios_5x5')
        self.simpleFactor = sm.add_constant(self.researched.Mkt_RF)
        self.threeFactor = sm.add_constant(self.researched[['Mkt_RF', 'SMB', 'HML']])
        self.fourFactor = sm.add_constant(self.researched[['Mkt_RF', 'SMB', 'RMW', 'CMA']])
        self.fiveFactor = sm.add_constant(self.researched[['Mkt_RF', 'SMB', 'HML', 'RMW', 'CMA']])
    def test_mvl_fuse_function(self):
        Y, D, P, T, G = generate_raw_samples()
        T = sm.add_constant(T, prepend=False)
        P = sm.add_constant(P, prepend=False)
        D = sm.add_constant(D, prepend=False)
        G = sm.add_constant(G, prepend=False)
        loo = LeaveOneOut(len(Y))
        er = []
        for train_idx, test_idx in loo:
            tm = taxi_view_model(train_idx, Y, T)
            pm = poi_view_model(train_idx, Y, P)
            gm = geo_view_model(train_idx, Y, G)
            dm = demo_view_model(train_idx, Y, D)
            models = [tm, pm, gm, dm]
            lm = mvl_fuse_function(models, train_idx, Y)
            
            
            tm_test = tm[0].predict(T[test_idx])
            pm_test = pm[0].predict(P[test_idx])
            gm_test = gm[0].predict(G[test_idx])
            dm_test = dm[0].predict(D[test_idx])
            
            newX_test = np.array([1, tm_test, pm_test, gm_test, dm_test])
            ybar = lm.predict(newX_test)
            y_error = ybar - Y[test_idx]
#            if np.abs(y_error / Y[test_idx]) > 0.8:
#                print test_idx, ybar, Y[test_idx], newX_test
            er.append(y_error)
        mre = np.mean(np.abs(er)) / np.mean(Y)
        print("MVL with linear fusion function MRE: {0}".format(mre))
        
        self.visualize_prediction_error(er, Y, "MVL linear combination")
Example #16
def models_pattern(data, matrix):
    y = np.array(data['survived'])
    ones = np.ones(len(matrix[0]))
    X = sm.add_constant(np.column_stack((matrix[0], ones)))
    for ele in matrix[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    logit_model = sm.Logit(y, X)
    logit_res = logit_model.fit(maxiter=2000)
    print(logit_res.summary())
    print(logit_res.wald_test('1*x1 + 1*x2 + 1*x3'))
    print()
    probit_model = sm.Probit(y, X)
    probit_res = probit_model.fit(maxiter=2000)
    print(probit_res.summary())
    print(probit_res.wald_test('1*x1 + 1*x2 + 1*x3'))
    print()
    linear_model = sm.OLS(y, X)
    linear_res = linear_model.fit()
    result = 0.
    for array in X:
        for i, item in enumerate(array):
            result += linear_res.params[i] * item
    result /= (len(X))
    print('Linear function value: {}'.format(result))
    print(linear_res.summary())
    print(linear_res.wald_test('1*x1 + 1*x2 + 1*x3'))
    print()
Example #17
def linear_model_plot(x_variable, y_variable):
    '''Function develops linear model for x and y variable inputs and plots regression line on top of scatter plot'''
    
    assert len(x_variable) > 1, 'length of x_variable should be larger than 1'
    assert len(y_variable) > 1, 'length of y_variable should be larger than 1'
    
    # assigning function variables to response and predictor variables
    y = y_variable # response variable
    X = x_variable # predictor variable
    X = sm.add_constant(X)  # Adds a constant term to the predictor (essential to obtain the constant in the formula)
    
    # Calculating the linear model for the two variables
    lm = sm.OLS(y, X).fit()
    
    # Developing the plot of the linear model
    # making a range of the x variable to pass to the y prediction
    x_pred = np.linspace(x_variable.min(), x_variable.max())
    
    # Adding a constant to this range of x values (essential to obtain the constant in the formula)
    x_pred2 = sm.add_constant(x_pred)

    # Passing the linear model predictor the range of x values to model over
    y_pred = lm.predict(x_pred2)

    # Plotting these predictions on the graph
    plt.plot(x_pred, y_pred, color='k', linewidth=2)

    # Obtaining linear regression 
    return plt.plot()
def inbound_forcast(target, exchange, geo, exchange_test, geo_test, submit, i):
    for col in col_list:
        # specify the column name holding the number of overnight guests
        target_col = col + suff

        target.index = range(0, 365)

        X = sm.add_constant(exchange, prepend=False)
        X_test = sm.add_constant(exchange_test, prepend=False)

        X.index = range(0,365)

        # replace zeros so the log transform below is defined
        target.loc[target[target_col] == 0, target_col] = 1

        y = target[target_col].apply(np.log)

        model = sm.OLS(y, X)
        results = model.fit()
        print(results.summary())
        pred = results.predict()
        Y = y - pred
        L = len(Y)

        fftY = fft.fft(Y)
        freqs = fft.fftfreq(L)
        power = np.abs(fftY)
        phase = [np.arctan2(float(c.imag), float(c.real)) for c in fftY]

        wave = newwave_i(L, results, pred, power, freqs, phase, X_test)
        submit[i] = wave
        i += 1

    return submit, i
Example #19
File: util.py Project: lishaoyi/quandl
def calcSScoreAgainstExisting(beta_mfac, beta_aux, df, n_aux):
  y = df.iloc[:, 0]
  X = df.iloc[:, 1:]
  resid_mfac = y - sm.add_constant(X).dot(beta_mfac)
  aux = calcAuxilaryArray(resid_mfac, n_aux)

  resid_aux = aux[1:] - sm.add_constant(aux[:-1]).dot(beta_aux)
  return calcSScore(beta_aux, resid_aux, aux[-1])
def get_recommend():

    # repeat process above to get data for recommendations
    data = flask.request.json
    headline = data["headline"]
    content = data["content"].encode('utf8')
    tags = data["tags"]
    day_published = data["day_pub"]
    channel = data["channel"]
    num_imgs = int(data["num_imgs"])
    index = [0]
    columns = [u'LDA_0_prob', u'LDA_1_prob', u'LDA_2_prob', u'LDA_3_prob',
       u'LDA_4_prob', u'LDA_5_prob', u'LDA_6_prob', u'LDA_7_prob',
       u'LDA_8_prob', u'LDA_9_prob', u'average_token_length_content',
       u'average_token_length_title', u'avg_negative_polarity',
       u'avg_positive_polarity', u'data_channel_is_bus',
       u'data_channel_is_entertainment', u'data_channel_is_lifestyle',
       u'data_channel_is_socmed', u'data_channel_is_tech',
       u'data_channel_is_world', u'global_grade_level',
       u'global_rate_negative_words', u'global_rate_positive_words',
       u'global_reading_ease', u'global_sentiment_abs_polarity',
       u'global_sentiment_polarity', u'global_subjectivity', u'is_weekend',
       u'max_abs_polarity', u'max_negative_polarity', u'max_positive_polarity',
       u'min_negative_polarity', u'min_positive_polarity', u'n_tokens_content',
       u'n_tokens_title', u'num_imgs', u'num_tags', u'num_videos',
       u'r_non_stop_unique_tokens', u'r_non_stop_words', u'r_unique_tokens',
       u'rate_negative_words', u'rate_positive_words',
       u'title_sentiment_abs_polarity', u'title_sentiment_polarity',
       u'title_subjectivity', u'weekday_is_friday',
       u'weekday_is_monday', u'weekday_is_saturday', u'weekday_is_sunday',
       u'weekday_is_thursday', u'weekday_is_tuesday', u'weekday_is_wednesday']
    data_df = pd.DataFrame(index=index, columns=columns)
    create_metadata_fields(data_df, num_imgs, tags, day_published, channel)
    create_NLP_features(data_df, headline, content)
    create_lda_features(data_df, content)
    results = {}
    create_metadata_fields(results, num_imgs, tags, day_published, channel)
    create_NLP_features(results, headline, content)
    create_lda_features(results, content)
    results['est_shares'] = round(pois_reg.predict(sm.add_constant(data_df))[0],-2)
    results['est_prob'] = round(RF_class.predict_proba(sm.add_constant(data_df))[0][1],2)

    # change day of week data to sunday
    data_df['weekday_is_monday'] = 0
    data_df['weekday_is_tuesday'] = 0
    data_df['weekday_is_wednesday'] = 0
    data_df['weekday_is_thursday'] = 0
    data_df['weekday_is_friday'] = 0
    data_df['weekday_is_saturday'] = 0
    data_df['weekday_is_sunday'] = 1
    data_df['is_weekend'] = 1

    # get results if change to sunday
    results['est_shares_sun'] = round(pois_reg.predict(sm.add_constant(data_df))[0],-2)
    results['est_prob_sun'] = round(RF_class.predict_proba(sm.add_constant(data_df))[0][1],2)

    # return results to javascript
    return flask.jsonify(results)
Example #21
def main():
    data = sm.datasets.star98.load()
    
    print(sm.datasets.star98.NOTE)
    
    # Load the data and add a constant to the exogenous (independent) variables
    data.exog = sm.add_constant(data.exog, prepend=False)
    # The dependent variable is N by 2 (Success: NABOVE, Failure: NBELOW):
    print(data.endog[:5, :], '\n')
    
    print(data.exog[:2, :], '\n')

    # Fit and summary
    glm_binom = sm.GLM(data.endog, data.exog, family=sm.families.Binomial())

    res = glm_binom.fit()

    print(res.summary(), '\n')
        
    # We extract information that will be used to draw some interesting plots
    nobs = res.nobs
    y = data.endog[:, 0] / data.endog.sum(1)
    yhat = res.mu
        
    #Plot yhat vs y:
    plt.figure()
    plt.scatter(yhat, y)            
    
    line_fit = sm.OLS(y, sm.add_constant(yhat, prepend=False)).fit().params

    fit = lambda x: line_fit[1] + line_fit[0] * x  # better way in scipy?
    plt.plot(np.linspace(0, 1, nobs), fit(np.linspace(0, 1, nobs)))
    
    plt.title('Model Fit Plot')
    plt.ylabel('Observed Values')
    plt.xlabel('Fitted Values')
    
    # Plot yhat vs. Pearson residuals:    
    plt.figure();
    plt.scatter(yhat, res.resid_pearson);
    plt.plot([0.0, 1.0], [0.0, 0.0], 'k-');
    plt.title('Residual Dependence Plot');
    plt.ylabel('Pearson Residuals');
    plt.xlabel('Fitted values');
    
    # Histogram of standardized deviance residuals
    plt.figure();
    resid = res.resid_deviance.copy()
    resid_std = (resid - resid.mean()) / resid.std()
    plt.hist(resid_std, bins=25);
    plt.title('Histogram of standardized deviance residuals');

    # QQ Plot of Deviance Residuals
    from statsmodels import graphics
    graphics.gofplots.qqplot(resid, line='r');    

    test = 1
Example #22
File: util.py Project: lishaoyi/quandl
def regrFromDataframe(df, y_attr, X_attrs, date, n_aux, n_mfac):
  df = getLookbackPeriod(df, date, n_mfac)

  model_mfac = sm.OLS(df[y_attr], sm.add_constant(df[X_attrs])).fit()
  beta_mfac  = model_mfac.params
  resid_mfac = model_mfac.resid

  aux = calcAuxilaryArray(resid_mfac, n_aux)
  aux_res = sm.OLS(aux[1:], sm.add_constant(aux[:-1])).fit()
    
  return calcSScore(aux_res.params, aux_res.resid, aux[-1]), beta_mfac, aux_res.params
Example #23
def main():
    # Process input data
    #json_data=open('C:/Users/rthomas/Documents/DemandPrediction/demand_prediction.json')
    json_data = open(sys.argv[1])
    x, y, last, total_hours = process_input(json_data)
    FUTURE_DAYS = 15  # will make prediction 15 days into the future

    # I looked at a few different regression families in sm.GLM but found very similar rms errors so I chose to use a simple linear regression
    trend_model = sm.OLS(y, sm.add_constant(range(len(x)), prepend=True)).fit()
    trend = trend_model.fittedvalues

    # y1 is y with the trend line (growth over time) removed
    y1 = y - trend
    
    # y2 is y1 with weekly trends removed
    days = [w.weekday() for w in x]
    days_mean = [np.mean([y1[i] for i in range(0,len(y1)) if days[i] == k]) for k in range(7)]
    y2 = [y1[i] - days_mean[days[i]] for i in range(len(y1))]
    
    # y3 is y2 with daily trends removed
    hours = [w.hour for w in x]
    hours_mean = [np.mean([y2[i] for i in range(0,len(y2)) if hours[i] == k]) for k in range(24)]
    y3 = [y2[i] - hours_mean[hours[i]] for i in range(len(y2))]

    trend_y = [days_mean[days[i]] + hours_mean[hours[i]] + trend[i] for i in range(len(trend))]

    # The predict function is not working the way I expected for future times
    # construct an ARMA model on the residuals once all trends have been removed
    #arma_model = sm.tsa.ARMA(y3)
    #arma_fit = arma_model.fit(order=(6,6))
    #arma_res = arma_fit.predict()
    #arma_y = [arma_res[i] + trend_y[i] for i in range(len(trend))]
    
    future_hours = FUTURE_DAYS*24 + total_hours
    future_trend = trend_model.predict(sm.add_constant(range(total_hours, future_hours), prepend=True))
    future_x = [last + datetime.timedelta(hours=k) for k in range(1,FUTURE_DAYS*24+1)]
    future_hours = [w.hour for w in future_x]
    future_days = [w.weekday() for w in future_x]
    future_hours_trend = [hours_mean[future_hours[i]] for i in range(len(future_x))]
    future_days_trend = [days_mean[future_days[i]] for i in range(len(future_x))]
    future_all_trends = [sum(tuple) for tuple in zip(future_trend, future_hours_trend, future_days_trend)] 
    
    
    

    #plot_series(x, [y, trend_y, arma_y], ['Time Series', 'All trends', 'All trends + ARMA'])

    #rms_arr = []
    #for predict in [trend_y, arma_y]:
    #    rms_arr.append(rms(y, predict))
            
    app = flask.Flask(__name__)
    app.run()
    def classify(self, mp, x_train, y_train, x_test):
        x_train_reg = sm.add_constant(x_train)
        x_test_reg = sm.add_constant(x_test)
        logit =  sm.Logit(y_train, x_train_reg)
        clf = logit.fit(disp=0)
        # print(clf.summary())
        log_to_info('Fitting a Logistic Regression to labeled training data...')
        log_to_info('Predicting test value')
        y_test = clf.predict(x_test_reg)
        log_to_info('Done!')

        return numpy.rint(y_test)
Example #25
def estender_serie_anos(serie, df, grau_polinomio=2):
    mask = np.isfinite(df.index) & np.isfinite(df[serie])

    polinomio = np.array([np.array(np.power(df.index[mask],i)) for i in range(1,grau_polinomio+1)])
    polinomio = sm.add_constant(polinomio.T)
       
    reg = sm.OLS(df[serie][mask], polinomio).fit()
    
    polifit = np.array([np.array(np.power(df.index,i)) for i in range(1,grau_polinomio+1)])
    polifit = sm.add_constant(polifit.T)
          
    return reg.predict(polifit)
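This fits a degree-grau_polinomio polynomial (OLS on powers of the index) to the non-missing part of a column and returns fitted values for every index entry, which is how the series gets extended over missing years. A small usage sketch with made-up data, assuming the function above is in scope:

import numpy as np
import pandas as pd

# Hypothetical annual series with the last two years missing.
df = pd.DataFrame({'pop': [10.0, 10.9, 12.1, 13.8, np.nan, np.nan]},
                  index=[2015, 2016, 2017, 2018, 2019, 2020])
df['pop_estendida'] = estender_serie_anos('pop', df, grau_polinomio=2)
print(df)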
Example #26
File: fit.py Project: brielin/Jester
    def __init__(self,args):
        self.bed = Bed(args.bfile) #
        self.N = self.bed.iid_count
        if args.covfile is not None:
            cov = pd.read_table(args.covfile,header=None)
            self.cov = sm.add_constant(ju._reorder(cov,self.bed.iid))
            self.ncov = self.cov.shape[1] # + constant
        else:
            self.cov = np.ones((self.N,1))
            self.ncov = 1 # Constant
        if args.phenofile is not None:
            Y = pd.read_table(args.phenofile,header=None,na_values='-9')
        else:
            try:
                Y = pd.read_table(args.bfile+'.pheno',header=None,na_values='-9')
            except IOError:
                print("Phenotype file not found.")
                exit(1)
        self.Y = ju._reorder(Y,self.bed.iid)
        af = ju.get_allele_frequency(self.bed,args) #
        snps = (af>args.maf)&(af<1-args.maf) #
        if (args.from_bp is not None) and (args.to_bp is not None):
            k = (self.bed.pos[:,2]>args.from_bp)&(self.bed.pos[:,2]<args.to_bp)
            snps = snps&k
        snps_to_use = self.bed.sid[snps]
        if args.extract is not None:
            keep = np.array([l.strip() for l in open(args.extract,'r')])
            snps_to_use = np.intersect1d(snps_to_use,keep)
        self.bed_index = np.sort(self.bed.sid_to_index(snps_to_use)) #
        pos = self.bed.pos[self.bed_index] #
        bim=pd.read_table(self.bed.filename+'.bim',header=None,
                          names=['chm','id','pos_mb','pos_bp','a1','a2'])
        self.af = af[self.bed_index] #
        self.M = len(self.bed_index) #
        self.windows = ju.get_windows(pos,self.M,args.window_size,args.window_type)
        self.pos = pos[:,2]
        self.chr = pos[:,0]
        self.id = self.bed.sid[self.bed_index]
        self.A1 = bim['a1'].loc[self.bed_index]
        self.A2 = bim['a2'].loc[self.bed_index]
        self.logistic = False
        self.chimin = stats.chi2.ppf(1-args.minp,2)

        # Fit null
        if (not args.linear) and (self.Y.min() >= 0 and self.Y.max() <= 1):
            self.null = sm.Logit(self.Y, self.cov, missing='drop').fit(disp=0)
            self.logistic = True
        else:
            self.null = sm.OLS(self.Y, self.cov, missing='drop').fit(disp=0)
        if self.ncov > 1:
            self.cov = sm.add_constant(self.null.fittedvalues)
        self.marg_res, self.joint_res = self.compute(args)
Example #27
def scatterColor(x0, y, w):
    """Creates scatter plot with points colored by variable.
    All input arrays must have matching lengths

    Arg:
        x0 (array):
            array of x values
        y (array):
            array of y values
        w (array):
            array of scalar values

    Returns:
        slope and intercept of best fit line

    """
    import matplotlib as mpl
    import matplotlib.cm as cm
    import statsmodels.api as sm
    from scipy.stats import linregress
    cmap = plt.cm.get_cmap('RdYlBu')
    norm = mpl.colors.Normalize(vmin=w.min(), vmax=w.max())
    m = cm.ScalarMappable(norm=norm, cmap=cmap)
    m.set_array(w)
    sc = plt.scatter(x0, y, label='', color=m.to_rgba(w))

    xa = sm.add_constant(x0)

    est = sm.RLM(y, xa).fit()
    r2 = sm.WLS(y, xa, weights=est.weights).fit().rsquared
    slope = est.params[1]

    x_prime = np.linspace(np.min(x0), np.max(x0), 100)[:, np.newaxis]
    x_prime = sm.add_constant(x_prime)
    y_hat = est.predict(x_prime)

    const = est.params[0]
    y2 = [i * slope + const for i in x0]

    lin = linregress(x0, y)
    x1 = np.arange(np.min(x0), np.max(x0), 0.1)
    y1 = [i * lin[0] + lin[1] for i in x1]
    y2 = [i * slope + const for i in x1]
    plt.plot(x1, y1, c='g',
             label='simple linear regression m = {:.2f} b = {:.0f}, r^2 = {:.2f}'.format(lin[0], lin[1], lin[2] ** 2))
    plt.plot(x1, y2, c='r', label='rlm regression m = {:.2f} b = {:.0f}, r2 = {:.2f}'.format(slope, const, r2))
    plt.legend()
    cbar = plt.colorbar(m)

    cbar.set_label('use cbar.set_label("label") to label this axis')

    return slope, const
def reg_m(y, x1,x2,x3):
    y = y.reshape(-1)
    x = np.array([x1.reshape(-1),x2.reshape(-1), x3.reshape(-1)])
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    # columns of X were stacked as (x3, x2, x1, const), so map the params back accordingly
    yy = results.params[2]*x1 + results.params[1]*x2 + results.params[0]*x3 + results.params[3]
    yy = yy/yy.max()
    print (results.params)
    print (results.summary())
    return results,yy
    def classify(self, mp, x_train, y_train, x_test):
        x_train = sm.add_constant(x_train)
        x_test = sm.add_constant(x_test)
        clf = LogisticRegressionCV(verbose=1, cv=5)
        log_to_info('Fitting a Logistic Regression to labeled training data...')
        clf = clf.fit(x_train, y_train)
        log_to_info('Training details')
        log_to_info('Classifier parameters: {}'.format(clf.get_params()))
        log_to_info('On training: {}'.format(clf.score(x_train, y_train) * 100.0))
        log_to_info('Predicting test value')
        y_test = clf.predict(x_test)
        log_to_info('Done!')
        return y_test
Example #30
    def setup_class(cls):
        data = sm.datasets.randhie.load(as_pandas=False)
        cls.endog = data.endog
        exog = sm.add_constant(data.exog[:,1:4], prepend=False)
        exog_infl = sm.add_constant(data.exog[:,0], prepend=False)
        cls.res1 = sm.ZeroInflatedGeneralizedPoisson(data.endog, exog,
            exog_infl=exog_infl, p=1).fit(method='newton', maxiter=500, disp=0)
        # for llnull test
        cls.res1._results._attach_nullmodel = True
        cls.init_keys = ['exog_infl', 'exposure', 'inflation', 'offset', 'p']
        cls.init_kwds = {'inflation': 'logit', 'p': 1}
        res2 = RandHIE.zero_inflated_generalized_poisson
        cls.res2 = res2
trt1 = data['weight'][data.group == 'trt1']
trt2 = data['weight'][data.group == 'trt2']

from scipy import stats
F, p = stats.f_oneway(ctrl, trt1, trt2)
F
p

import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data.group = le.fit_transform(data.group)

x = data.group
y = data.weight
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
model.summary()
sm.stats.anova_lm(model, typ=2)

#cravens example
df = pd.read_excel('cravens.xlsx')
from scipy.stats import skew
#sales
sales = df.Sales
#mean
sales.mean()
#median
sales.median()
#mode
sales.mode()[0]
Example #32
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Prepare Data to Plot
x = np.arange(1, 10, 0.1)
err = np.random.randn(len(x))
y = x**2 + err

# X = (1 | x)
X = np.matrix(x).T      # To Column matrix
X = sm.add_constant(X)  # Add constant
Y = np.matrix(y).T      # To Column matrix

def find_beta_hat(X, y): # X should be np.matrix
    return np.linalg.pinv(X) * y

def find_y_hat(X, y):
    beta_hat = find_beta_hat(X, y)
    return X * beta_hat

Y_hat = find_y_hat(X, Y)

with plt.xkcd():
    # Prepare Plot
    plt.figure(figsize=(10,6), dpi=300)
    plt.title(r"OLS...?", fontsize=16)
    plt.xlabel(r'x', fontsize=14)
    plt.ylabel(r'y', fontsize=14)
    
    # Plot with Legends
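    # The listing is cut off here; a plausible completion (assumed, not from the
    # original) would scatter the data, overlay the fitted values, and add a legend:
    plt.scatter(x, y, label='data')
    plt.plot(x, np.asarray(Y_hat).ravel(), 'r', label='OLS fit')
    plt.legend(fontsize=12)
    plt.show()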
data = data[data.proccessor_turbo != "Not found"]

data["proccessor"] = to_numeric(data["proccessor"])
data["proccessor_turbo"] = to_numeric(data["proccessor_turbo"])
#print(data.info())

x = data[["size", "proccessor", "proccessor_turbo", "ram", "hdd"]]
y = data["price"]

regr = linear_model.LinearRegression()
regr.fit(x, y)

print("Intercept: ", regr.intercept_)
print("Coeff: ", regr.coef_)
print("Score: ", regr.score(x, y))

new_size = 15.6
new_proccessor = 1.6
new_proccessor_turbo = 3.9
new_ram = 12
new_hdd = 1250

predicted = regr.predict(
    [[new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print("Predicted: ", predicted)

x = add_constant(x)
model = OLS(y, x).fit()
predicted = model.predict(
    [[1, new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print(model.summary())
Example #34
from sklearn.metrics import roc_curve, auc
from pylab import mpl

if __name__ == '__main__':
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # set the default font
    mpl.rcParams['axes.unicode_minus'] = False  # keep minus signs from rendering as boxes in saved figures

    data = pd.read_csv('WoeData.csv')
    Y = data['SeriousDlqin2yrs']
    X = data.drop([
        'SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome',
        'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
        'NumberOfDependents'
    ],
                  axis=1)
    X1 = sm.add_constant(X)

    logit = sm.Logit(Y, X1)
    result = logit.fit()
    print(result.params)
    print(result.summary())

    test = pd.read_csv('TestWoeData.csv')
    Y_test = test['SeriousDlqin2yrs']
    X_test = test.drop([
        'SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome',
        'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
        'NumberOfDependents'
    ],
                       axis=1)
    X3 = sm.add_constant(X_test)
Example #35
ax.set_ylabel('Average procurement cost savings [USD/MWh]')
ppt.savefig(run + '/41_WSpricestd_savings.png', bbox_inches='tight')
ppt.savefig(run + '/41_WSpricestd_savings.pdf', bbox_inches='tight')

print('Savings as a function of price std')
print(str(reg2.intercept_[0]) + ' + ' + str(reg2.coef_[0][0]) + '* std')
print('R2: ' + str(
    reg2.score(
        np.sqrt(df_results['var_p'].to_numpy()).reshape(len(df_results), 1),
        (df_results['difference']).to_numpy().reshape(len(df_results), 1))))

#

Y = (df_results['difference']).to_numpy().reshape(len(df_results), 1)
X = np.sqrt(df_results['var_p'])
X = sm.add_constant(X)
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())

df_results['std_p'] = np.sqrt(df_results['var_p'])
df_results_short = df_results.loc[df_results['std_p'] < 90.]
Y2 = (df_results_short['difference']).to_numpy().reshape(
    len(df_results_short), 1)
X2 = df_results_short['std_p'].to_numpy().reshape(len(df_results_short), 1)
X2 = sm.add_constant(X2)
model2 = sm.OLS(Y2, X2)
results2 = model2.fit()
print(results2.summary())

import pdb
Example #36

Build a predictive model and determine whether both predictors (independent variables) are significant for a student's height.
When father's height is held constant, by how many inches does the average student height increase for each one-inch increase in mother's height?
When mother's height is held constant, by how many inches does the average student height increase for each one-inch increase in father's height?
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

fs_df = pd.read_csv('Female_Stats.csv')
features = fs_df.iloc[:, 1:].values
labels = fs_df.iloc[:, 0].values

#This is done because the statsmodels library requires an explicit constant column.
#features = np.append(arr = np.ones((30, 1)), values = features, axis = 1)
features = sm.add_constant(features)
#adds a constant column to input data set.

features_opt = features[:, [0, 1, 2]]

regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()
"""So we conclude that Both Predictors (Independent Variables) Are Significant For A Students’ Height"""
regressor_OLS.pvalues
print("Effect of mom's height", regressor_OLS.params[1])

print("Effect of dad's height", regressor_OLS.params[2])
Example #37
def reconstruct():
    """
    run KFOLD method for regression 
    """
    #import packages
    import os
    import pandas as pd
    import statsmodels.api as sm
    from datetime import datetime
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/mlrReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 79
    y = 80

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        {
            # #apply 10 fold cross validation
            # kf = KFold(n_splits=10, random_state=29)

            # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
            # for train_index, test_index in kf.split(X):
            #     X_train, X_test = X_pca[train_index], X_pca[test_index]
            #     y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #     #train regression model
            #     lm = LinearRegression()
            #     lm.fit(X_train, y_train)

            #     #predictions
            #     predictions = lm.predict(X_test)
            #     # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #     #                       pd.DataFrame(np.array(y_test))], \
            #     #                      axis = 1)
            #     # pred_obs.columns = ['pred', 'obs']
            #     # combo = pd.concat([combo, pred_obs], axis = 0)

            #     #evaluation matrix - check p value
            #     if stats.pearsonr(y_test, predictions)[1] >= 0.05:
            #         print("insignificant correlation!")
            #         continue
            #     else:
            #         #print(stats.pearsonr(y_test, predictions))
            #         metric_corr.append(stats.pearsonr(y_test, predictions)[0])
            #         #print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            #         metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

            # # #number of years used to train/test model
            # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\
            #                       pred_surge['date'][0]).days/365)
            # longitude = surge['lon'][0]
            # latitude = surge['lat'][0]
            # num_pc = X_pca.shape[1] #number of principal components
            # corr = np.mean(metric_corr)
            # rmse = np.mean(metric_rmse)

            # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\
            #       np.mean(metric_corr), ' -  avg_rmse (m) = ', \
            #       np.mean(metric_rmse), '\n')
        }

        num_pc = X_pca.shape[1]  #number of principal components
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]

        #surge reconstruction
        pred_for_recon = pred[~pred.isna().any(axis=1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1)

        #standardize predictor data
        dat = pred_for_recon.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat(
            [pred_for_recon['date'], dat_standardized], axis=1)

        X_recon = pred_standardized.iloc[:, 1:]

        #apply PCA
        pca = PCA(num_pc)  #use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)

        #model preparation
        #first train model using observed surge and corresponding predictors
        X_pca = sm.add_constant(X_pca)
        est = sm.OLS(y['surge'], X_pca).fit()

        #predict with X_recon and get 95% prediction interval
        X_pca_recon = sm.add_constant(X_pca_recon)
        predictions = est.get_prediction(X_pca_recon).summary_frame(alpha=0.05)

        #drop confidence interval and mean_se columns
        predictions.drop(['mean_se', 'mean_ci_lower','mean_ci_upper'], \
                         axis = 1, inplace = True)

        #final dataframe
        final_dat = pd.concat([pred_standardized['date'], predictions], axis=1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconsturcted', 'pred_int_lower',\
                             'pred_int_upper', 'lon', 'lat']

        {
            # plot - optional
            # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
            # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date'])
            # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns = ['date'])
            # sns.set_context('notebook', font_scale = 2)
            # plt.figure()
            # plt.plot(final_dat['date'], final_dat['mean'], color = 'green')
            # plt.scatter(surge['date'], surge['surge'], color = 'blue')
            # prediction intervals
            # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red',  linestyle = "--", lw = 0.8)
            # confidence intervals
            # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black',  linestyle = "--", lw = 0.8)
        }

        #save df as csv - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)
Example #38
    'night_charge'] + churn['intl_charge']

churn['intl_plan'] = np.where(churn['intl_plan'] == 'yes', 1, 0)
churn['vmail_plan'] = np.where(churn['vmail_plan'] == 'yes', 1, 0)
dependent_variable = churn['churn']

# independent_variables = churn[['account_length', 'custserv_calls', 'total_charges']]
# independent_variables = churn[['account_length', 'area_code', 'intl_plan', 'vmail_plan' ,
#                               'vmail_message', 'day_mins', 'day_calls',
#                               'day_charge', 'intl_mins', 'intl_charge', 'custserv_calls' ,'total_charges']]
independent_variables = churn[[
    'account_length', 'area_code', 'intl_plan', 'vmail_plan', 'vmail_message',
    'day_mins', 'day_calls', 'day_charge', 'intl_mins', 'intl_charge',
    'custserv_calls'
]]
independent_variables_with_constant = sm.add_constant(independent_variables,
                                                      prepend=True)
logit_model = sm.Logit(dependent_variable,
                       independent_variables_with_constant).fit()

new_observatios = churn.loc[churn.index.isin(range(20)),
                            independent_variables.columns]
new_observatios_with_constant = sm.add_constant(new_observatios, prepend=True)
y_predicted = logit_model.predict(new_observatios_with_constant)
y_predicted_rounded = [round(score, 0) for score in y_predicted]
print(y_predicted_rounded)
logistic_predicted_value_list = []

total_count = 0
index = 0
total_number = len(y_predicted_rounded)
total_correct = 0
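The fragment stops after initializing the counters; a plausible continuation (an assumption, not the original code) that scores the rounded predictions against the observed churn flags for the same 20 rows:

actual_values = churn.loc[churn.index.isin(range(20)), 'churn']
for predicted, actual in zip(y_predicted_rounded, actual_values):
    if predicted == actual:
        total_correct += 1
    total_count += 1
print('Accuracy on the first 20 observations: {0:.2%}'.format(
    float(total_correct) / total_count))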
Example #39
                ax.set_xticks(0.5+np.arange(len(top_comb)), minor = False)
                ax.set_xticklabels(top_comb, ha='center', rotation = 20)
                ax.set_yticks(0.5+np.arange(len(top_comb)), minor = False)
                ax.set_yticklabels(top_comb[::-1], va='center', rotation = 30)

                #cax = fig.add_subplot(gs[6, :])
                cax = plt.axes([0.1, 0.1, 0.8, 0.05])
                cb = plt.colorbar(gigifig, cax=cax, orientation='horizontal', extend = 'both')
                cb.ax.tick_params(labelsize=18)
                cb.set_label('Cross correlation', fontsize=20)
                plt.subplots_adjust(left=0.1, bottom=0.17, right=0.98, top=0.92, wspace=0.05, hspace=0.20)

                fig.savefig(cart_out + 'Crosscorr_{}_{}_v2_{}driv_{}_{}.pdf'.format(reg, namti, nu, ensmod, katullo))


                Xgi = sm.add_constant(X)
                pvals = []
                params = []
                rsq = []
                for ko in Y.T:
                    est = sm.OLS(ko, Xgi)
                    est2 = est.fit()
                    est3 = est.fit_regularized(method='elastic_net', alpha=0.0) # This is a Ridge regression.
                    if np.any(est2.params - est3.params > 1.e-2):
                        raise ValueError('AAAAAAAAAAAAAA - 2')
                    elif np.any(est2.params - est3.params > 1.e-3):
                        print('AAAAA - 3')
                        print(est2.params)
                        print(est3.params)

                    pvals.append(est2.pvalues)
Example #40
df = DataFrame(Data,
               columns=[
                   'country', 'year', 'status', 'life_expectancy', 'alcohol',
                   'percentage_expenditure', 'measles', 'bmi', 'polio',
                   'total_expenditure', 'gdp', 'population', 'schooling'
               ])

ax = df.plot(x="schooling", y="life_expectancy", style="o")
ax.set_ylabel("life_expectancy")

# In[2]:

import statsmodels.api as sm  #load the library and assigns a nickname
X = df[["schooling"]].values  # the independent variable(s)
X = sm.add_constant(X)  # add the intercept term
y = df["life_expectancy"].values  # the dependent variable
ols = sm.OLS(y, X).fit()  # run the regression
ols.summary()

# In[3]:

X = df[['alcohol', 'measles', 'gdp',
        'schooling']].values  # the independent variable(s)
X = sm.add_constant(X)  # add the intercept term
y = df["life_expectancy"].values  # the dependent variable
ols = sm.OLS(y, X).fit()  # run the regression
ols.summary()

# In[ ]:
start_date = y.index[0]
x.tail()

y.tail()
end_date = x.index[-1]

y = y[start_date:end_date]
x = x[start_date:end_date]

y.head()
x.head()

len(y)
len(x)
import statsmodels.api as sm
X = sm.add_constant(x) ## let's add an intercept (beta_0) to our model

# Note the difference in argument order
model = sm.OLS(y, X).fit() ## sm.OLS(output, input)
predictions = model.predict(X)

# Print out the statistics
model.summary()

import statsmodels.formula.api as smf
d = { "x": pd.Series(x), "y": pd.Series(y)}
df = pd.DataFrame(d)
mod = smf.ols('y ~ x', data=df).fit()
print(mod.summary())

# Our linear model with a single independent variable on the left-hand side assumes the following form:
#
# $$y = \beta_0 + \beta_1 X_1 + \epsilon$$
#
# $\epsilon$ accounts for the deviations or errors that we will encounter when our data do not actually fit a straight line. When $\epsilon$ materializes, that is when we run the model of this type on actual data, the errors are called **residuals**.
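# As a concrete illustration, the residuals are simply the observed y minus the
# fitted values; a minimal sketch reusing the `mod` fit above (statsmodels also
# stores them as mod.resid):
residuals = df['y'] - mod.predict(df)
print(np.allclose(residuals, mod.resid))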

# #### Estimate a simple regression with statsmodels

# The upper part of the summary displays the dataset characteristics, namely the estimation method, the number of observations and parameters, and indicates that standard error estimates do not account for heteroskedasticity.
#
# The middle panel shows the coefficient values that closely reflect the artificial data generating process. We can confirm that the estimates displayed in the middle of the summary result can be obtained using the OLS formula derived previously:

# In[5]:

X = sm.add_constant(data['X'])
model = sm.OLS(data['Y'], X).fit()
print(model.summary())

# #### Verify calculation

# In[6]:

beta = np.linalg.inv(X.T.dot(X)).dot(X.T.dot(y))
pd.Series(beta, index=X.columns)

# #### Display model & residuals

# In[7]:

data['y-hat'] = model.predict()
Example #43
import os

import pandas as pd
import statsmodels.api as sm


os.chdir('/Users/tom/Dropbox/Economics/Econometrics/Homework/HW8')
df = pd.read_csv('clean.csv')
df = sm.add_constant(df, prepend=True)
# a.) ln(wage) on educ, exper, expersq, black, south, smsa, reg661
#   through reg668 and smsa66.
endog_a = df['lwage']
exog = ['const', 'educ', 'exper', 'expersq', 'black', 'south',
        'smsa', 'reg661', 'reg662', 'reg663', 'reg664', 'reg665',
        'reg666', 'reg667', 'reg668', 'smsa66']

exog_a = df[exog]

endog_a.head()
exog_a.head()

model_a = sm.OLS(endog_a, exog_a)
results_a = model_a.fit()

print(results_a.summary())

#b: educ on exog_a - educ + nearc4
endog_b = df['educ']
exog_b = exog_a.drop('educ', axis=1).join(df['nearc4'])

model_b = sm.OLS(endog_b, exog_b)
Example #44
        # backward step
        model = sm.Logit(y, sm.add_constant(pd.DataFrame(
            X[included]))).fit(disp=0)
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # null if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(
                    worst_feature, worst_pval))
        if not changed:
            break
    return included


if __name__ == '__main__':
    from sklearn.datasets import load_breast_cancer
    data = load_breast_cancer(return_X_y=False)
    X = pd.DataFrame(data.data, columns=data.feature_names)
    y = pd.Series(data.target, name='response')
    selected_vars = stepwise_selection(X, y)

    print('resulting features:')
    print(selected_vars)

    model = sm.Logit(y, sm.add_constant(pd.DataFrame(X[selected_vars]))).fit()
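Only the backward-elimination half of stepwise_selection appears in this snippet; a sketch of the forward half it presumably pairs with (an assumption about the missing code, using a hypothetical threshold_in p-value rule):

import pandas as pd
import statsmodels.api as sm

def forward_step(X, y, included, threshold_in=0.01, verbose=True):
    # Try each excluded feature and add the one with the smallest p-value, if significant.
    changed = False
    excluded = [col for col in X.columns if col not in included]
    new_pvals = pd.Series(index=excluded, dtype=float)
    for col in excluded:
        model = sm.Logit(y, sm.add_constant(pd.DataFrame(X[included + [col]]))).fit(disp=0)
        new_pvals[col] = model.pvalues[col]
    if len(new_pvals) and new_pvals.min() < threshold_in:
        best_feature = new_pvals.idxmin()
        included.append(best_feature)
        changed = True
        if verbose:
            print('Add  {:30} with p-value {:.6}'.format(best_feature, new_pvals.min()))
    return included, changed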
Example #45
regr = linear_model.LinearRegression()
regr.fit(X, Y)
print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)
#Stock_Index_Price = (Intercept) + (Interest_Rate coef)*X1 + (Unemployment_Rate coef)*X2
#Stock_Index_Price = (1798.4040) + (345.5401)*X1 + (-250.1466)*X2

# prediction with sklearn
New_Interest_Rate = 2.75
New_Unemployment_Rate = 5.3
print ('Predicted Stock Index Price: \n', regr.predict([[New_Interest_Rate ,New_Unemployment_Rate]]))
#Interest Rate = 2.75 (i.e., X1= 2.75)
#Unemployment Rate = 5.3 (i.e., X2= 5.3)
Stock_Index_Price = (1798.4040) + (345.5401)*(2.75) + (-250.1466)*(5.3) #1422.86
Stock_Index_Price


# with statsmodels
import statsmodels.api as sm  #2nd method
X = sm.add_constant(X) # adding a constant
 
model = sm.OLS(Y, X).fit()
predictions = model.predict(X) 
 
print_model = model.summary()
print(print_model)
#coefficients captured in this table match with the coefficients generated by sklearn.
#we got consistent results by applying both sklearn and statsmodels.
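#a quick numeric check of the claim above (illustrative sketch using the objects fitted here)
import numpy as np
sm_params = np.asarray(model.params)
print(np.allclose(regr.intercept_, sm_params[0]))
print(np.allclose(regr.coef_, sm_params[1:]))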
#ref
#https://datatofish.com/multiple-linear-regression-python/
#end here
#partition data into training and test sets
from sklearn.model_selection import train_test_split

X = df_dum.drop('avg_salary', axis=1)
y = df_dum.avg_salary.values

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

""" Model 1: Mulitple Regression """

# Statsmodel version   
# link=https://www.statsmodels.org/devel/generated/statsmodels.regression.linear_model.OLS.html
import statsmodels.api as sm

X_sm = sm.add_constant(X_train)

#fit the OLS model on the training set and inspect the summary
model = sm.OLS(y_train, X_sm)
model.fit().summary() # R-squared = 0.652 (e.g. model explains about 65% of the variance in salary estimate)

## SciKit version 
#link=https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
#link=https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
#link=https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

lm = LinearRegression()
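# minimal continuation (assumed; the similar snippet further below follows the same pattern):
# fit on the training split and score with 3-fold cross-validated MAE
import numpy as np
lm.fit(X_train, y_train)
print(np.mean(cross_val_score(lm, X_train, y_train,
                              scoring='neg_mean_absolute_error', cv=3)))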
Example #47
0
#Model1
X = media[['Visitors', 'weekend']]
X
y = media['Views_show']

from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X, y)
lm.score(X, y)
lm.coef_

#different library: Constant term has to be added here
import statsmodels.api as sm
#need to fit constant
X = sm.add_constant(X)
lm_1 = sm.OLS(y, X).fit()
print(lm_1.summary())
#significant variables: weekend, Visitors (P>|t| < .05)

#2nd model : keep changing the combination of variables
X = media[['Visitors', 'weekend', 'Character_A']]  #use a list (not a set) so the column order is deterministic
y = media['Views_show']
import statsmodels.api as sm
X = sm.add_constant(X)
lm_2 = sm.OLS(y, X).fit()
print(lm_2.summary())

#Keep changing the combination of variables like this; the model with the higher adjusted R-squared is the better candidate (see the quick check below).
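#quick check: compare adjusted R-squared across the candidate models fitted so far
print('model 1 adj. R2:', lm_1.rsquared_adj)
print('model 2 adj. R2:', lm_2.rsquared_adj)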
#see more from here
#(https://www.kaggle.com/ashydv/media-company-case-study-linear-regression)
Example #48
0
reg = LinearRegression()
reg.fit(x, y)

#reg.coef_ and reg.intercept_ give the estimated slope and intercept
print("The linear regression model is Y={:.5}X+ {:.5}".format(
    reg.coef_[0][0], reg.intercept_[0]))

#now creating predictions
predictions = reg.predict(x)
plt.figure(figsize=(12, 6))
plt.scatter(data["User"], data["ratings"], c='red')

plt.plot(data["User"], predictions, c='red', linewidth=2)
plt.xlabel("USERS")
plt.ylabel("RATINGS OUTOF 5")
plt.show()

#Now assess the goodness of fit using R-squared

x = data['User']
y = data['ratings']
x2 = sm.add_constant(x)

est = sm.OLS(y, x2)
est2 = est.fit()
print(est2.summary())

print("Enter user no.")
p = int(input())
rate = p * reg.coef_[0][0] + reg.intercept_[0]
print("User will rate", rate)
aX = np.asarray(df_fradulent['InscClaimAmtReimbursed'])
aY = np.asarray(df_fradulent['TotalClaimCost'])
regr = linear_model.LinearRegression()
regr.fit(aX.reshape(-1, 1), aY.reshape(-1, 1))
plt.scatter(aX, aY, marker='+')
plt.plot(aX, regr.predict(aX.reshape(-1, 1)))
plt.xlabel("InscClaimAmtReimbursed")
plt.ylabel("TotalClaimCost")
plt.show()

# In[209]:

aX = np.asarray(df_fradulent['InscClaimAmtReimbursed'])
aY = np.asarray(df_fradulent['TotalClaimCost'])
aX = sm.add_constant(aX)

model = sm.OLS(aY, aX)
results = model.fit()
print(results.summary())

# In[210]:

df = pd.DataFrame(results.resid)
df.describe()

# In[211]:

plt.style.use('seaborn')
plt.rc('font', size=14)
plt.rc('figure', titlesize=18)
Example #50
0
import pandas_datareader.data as pdr
import numpy as np

import pandas_datareader.data as web




start='1971/12/1'
end='2016/8/31'
workpop = web.DataReader('LFWA64TTJPM647S',"fred",start,end).dropna()
gdp = web.DataReader('MKTGDPJPA646NWDB',"fred",start,end).dropna()
gdp=gdp.resample('A',loffset='-1d').last().dropna()
fx = web.DataReader('DEXJPUS',"fred",start,end).dropna()
fx=fx.resample('A',loffset='-1d').last().dropna()
workpop=workpop['1972':].resample('A',loffset='-1d').last().dropna()
gdpjpy=gdp.MKTGDPJPA646NWDB*fx.DEXJPUS
gdpjpy=np.log(gdpjpy).dropna()
workpop=np.log(workpop).dropna()

import statsmodels.api as sm
# align the two annual series on their common dates and regress
# log GDP (in yen) on log working-age population
aligned = workpop.join(gdpjpy.rename('lgdpjpy'), how='inner')
x = sm.add_constant(aligned['LFWA64TTJPM647S'])
model = sm.OLS(aligned['lgdpjpy'], x)
results = model.fit()
print(results.summary())

f2xcoef = np.array([[ 0.1,  3.,  1.,    0.],
                    [ 0.,  0.,  1.5,   0.1],
                    [ 3.,  2.,  1.,    0.]])
x0 = np.dot(f0, f2xcoef)
x0 += 0.1*np.random.normal(size=x0.shape)
ytrue = np.dot(f0,[1., 1., 1.])
y0 = ytrue + 0.1*np.random.normal(size=ytrue.shape)

xred, fact, eva, eve  = pca(x0, keepdim=0)
print(eve)
print(fact[:5])
print(f0[:5])

import statsmodels.api as sm

res = sm.OLS(y0, sm.add_constant(x0, prepend=False)).fit()
print('OLS on original data')
print(res.params)
print(res.aic)
print(res.rsquared)

#print 'OLS on Factors'
#for k in range(x0.shape[1]):
#    xred, fact, eva, eve  = pca(x0, keepdim=k, normalize=1)
#    fact_wconst = sm.add_constant(fact)
#    res = sm.OLS(y0, fact_wconst).fit()
#    print 'k =', k
#    print res.params
#    print 'aic:  ', res.aic
#    print 'bic:  ', res.bic
#    print 'llf:  ', res.llf
raw_data = pd.read_csv('../data/logistic-regression/Admittance.csv')

print(raw_data.head())

data = raw_data.copy()

data['Admitted'] = data['Admitted'].map({'Yes': 1, 'No': 0})

print(data.head())

y = data['Admitted']
x1 = data['SAT']

#plot with logistic regression
x = sm.add_constant(x1)
reg_log = sm.Logit(y, x)
results_log = reg_log.fit()


def f(x, b0, b1):
    return np.array(np.exp(b0 + x * b1) / (1 + np.exp(b0 + x * b1)))


f_sorted = np.sort(f(x1, results_log.params.iloc[0], results_log.params.iloc[1]))
x_sorted = np.sort(np.array(x1))

plt.scatter(x1, y, color='C0')
plt.xlabel('SAT', fontsize=10)
plt.ylabel('Admitted', fontsize=10)
plt.plot(x_sorted, f_sorted, color='C0')
Example #53
0
File: universe_3.py  Project: uwdata/boba
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import statsmodels.api as sm

if __name__ == '__main__':
    # read data file
    df = pd.read_csv('data.csv')

    # remove outliers based on iqr
    iqr = np.subtract(*np.percentile(df.y, [75, 25]))
    median = np.median(df.y)
    df = df[abs(df.y - median) <= 3 * iqr]

    # fit a simple ordinary least squares model
    x = sm.add_constant(df.x)
    lm = sm.OLS(df.y, x).fit()

    # display results
    print('y = {:.2f} + {:.2f} * x'.format(lm.params.const, lm.params.x))
    print('AIC: {:.2f}'.format(lm.aic))
    print('Cohen\'s f2: {:.3f}'.format(lm.rsquared_adj / (1 - lm.rsquared_adj)))
Example #54
0
                    cnd_nan = np.isnan(S_D_cand)
                    S_D_cand[cnd_nan] = 0.5 # correct the NaN value of correlation

                    if ((bi-ai)*(bj-aj) < (wint*2+1)*(wint*2+1)): # not an integrate window
                        D_D_cand = 1.0+np.power(np.square(i-x_cand)+np.square(j-y_cand),0.5)/opts.hwid # spatial distance
                    else:
                        D_D_cand = D_D_all[cnd_cand] # integrate window
                    C_D = (1.0-S_D_cand)*D_D_cand+0.0000001 # combined distance
                    weight = (1.0/C_D)/np.sum(1.0/C_D)

                    for iband in range(nb): # compute V
                        fine_cand = np.hstack(((fine1[iband,aj:bj,ai:bi])[cnd_cand],(fine2[iband,aj:bj,ai:bi])[cnd_cand]))
                        coarse_cand = np.hstack(((coarse1[iband,aj:bj,ai:bi])[cnd_cand],(coarse2[iband,aj:bj,ai:bi])[cnd_cand]))
                        coarse_change = abs(np.nanmean((coarse1[iband,aj:bj,ai:bi])[cnd_cand])-np.nanmean((coarse2[iband,aj:bj,ai:bi])[cnd_cand]))
                        if (coarse_change >= opts.dn_max*0.02): # to ensure changes in coarse image large enough to obtain the conversion coefficient
                            regress_result = sm.OLS(fine_cand,sm.add_constant(coarse_cand)).fit()
                            sig = 1.0-stats.f.cdf(regress_result.fvalue,1,number_cand*2-2)
                            # use the estimated slope only when the regression is significant and the slope is plausible (positive and not too large); otherwise fall back to 1.0
                            if (sig <= 0.05) and (regress_result.params[1] > 0) and (regress_result.params[1] <= 5):
                                V_cand = regress_result.params[1]
                            else:
                                V_cand = 1.0
                        else:
                            V_cand = 1.0

                        # compute the temporal weight
                        difc_pair1 = np.abs(np.nanmean((coarse0[iband,aj:bj,ai:bi])[cnd_wind_valid])-np.nanmean((coarse1[iband,aj:bj,ai:bi])[cnd_wind_valid]))+0.01**5
                        difc_pair2 = np.abs(np.nanmean((coarse0[iband,aj:bj,ai:bi])[cnd_wind_valid])-np.nanmean((coarse2[iband,aj:bj,ai:bi])[cnd_wind_valid]))+0.01**5
                        T_weight1 = (1.0/difc_pair1)/(1.0/difc_pair1+1.0/difc_pair2)
                        T_weight2 = (1.0/difc_pair2)/(1.0/difc_pair1+1.0/difc_pair2)
Example #55
0
# random search for MLE starting parameters. Because Markov switching models
# are often characterized by many local maxima of the likelihood function,
# performing an initial optimization step can be helpful to find the best
# parameters.
#
# Below, we specify that 20 random perturbations from the starting
# parameter vector are examined and the best one used as the actual starting
# parameters. Because of the random nature of the search, we seed the random
# number generator beforehand to allow replication of the result.

mod_filardo = sm.tsa.MarkovAutoregression(
    dta_filardo.iloc[2:]['dlip'],
    k_regimes=2,
    order=4,
    switching_ar=False,
    exog_tvtp=sm.add_constant(dta_filardo.iloc[1:-1]['dmdlleading']))

np.random.seed(12345)
res_filardo = mod_filardo.fit(search_reps=20)

res_filardo.summary()

# Below we plot the smoothed probability of the economy operating in a
# low-production state, and again include the NBER recessions for
# comparison.

fig, ax = plt.subplots(figsize=(12, 3))

ax.plot(res_filardo.smoothed_marginal_probabilities[0])
ax.fill_between(usrec.index,
                0,
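                # assumed remaining arguments: shade the NBER recession
                # periods from the usrec series used for comparison
                1, where=usrec['USREC'].values,
                color='gray', alpha=0.2)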
Example #56
0
# DO NOT EDIT

# # Robust Linear Models

from __future__ import print_function
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# ## Estimation
#
# Load data:

data = sm.datasets.stackloss.load()
data.exog = sm.add_constant(data.exog)

# Huber's T norm with the (default) median absolute deviation scaling

huber_t = sm.RLM(data.endog, data.exog, M=sm.robust.norms.HuberT())
hub_results = huber_t.fit()
print(hub_results.params)
print(hub_results.bse)
print(
    hub_results.summary(
        yname='y',
        xname=['var_%d' % i for i in range(len(hub_results.params))]))

# Huber's T norm with 'H2' covariance matrix

hub_results2 = huber_t.fit(cov="H2")
Example #57
0
import statsmodels.api as _sm


def linear_fit(x, y, constant=True):
    """OLS fit of y on x; returns (params, fitted values, residuals)."""
    if constant:
        x = _sm.add_constant(x)  # prepend an intercept column
    fit = _sm.OLS(y, x).fit()
    return fit.params, fit.fittedvalues, fit.resid
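# illustrative usage on synthetic data (values chosen arbitrarily)
import numpy as np

params, fitted, resid = linear_fit(np.arange(20.0), 3.0 * np.arange(20.0) + 2.0)
print(params)  # ~[2.0, 3.0]: intercept first, then slope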
Example #58
0
# train test split
from sklearn.model_selection import train_test_split

X = df_dum.drop('avg_salary', axis=1)
y = df_dum.avg_salary.values

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# multiple linear regression
import statsmodels.api as sm

X_sm = sm.add_constant(X)
model = sm.OLS(y, X_sm)
model.fit().summary()

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score

lm = LinearRegression()
lm.fit(X_train, y_train)

np.mean(
    cross_val_score(lm,
                    X_train,
                    y_train,
                    scoring='neg_mean_absolute_error',
                    cv=3))
Example #59
0
import statsmodels.api as sm
from statsmodels import regression


def linreg(x, y):
    # fit OLS of y on x with an intercept
    x = sm.add_constant(x)
    model = regression.linear_model.OLS(y, x).fit()
    X = x[:, 1]  # the regressor without the constant column (unused in this snippet)
    # return the estimated intercept and slope
    return model.params[0], model.params[1]
Example #60
0
def make_a_feature_list(data, feat):
    X = data[feat]
    Y = sm.add_constant(X)  # add a constant
    return Y