Example #1
def stepwise_selection(data, target, SL_in=0.05, SL_out=0.05):
    initial_features = data.columns.tolist()
    best_features = []
    while (len(initial_features) > 0):
        remaining_features = list(set(initial_features) - set(best_features))
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = OLS(target,
                        sm.add_constant(data[best_features +
                                             [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if (min_p_value < SL_in):
            best_features.append(new_pval.idxmin())
            while (len(best_features) > 0):
                best_features_with_constant = sm.add_constant(
                    data[best_features])
                p_values = OLS(target,
                               best_features_with_constant).fit().pvalues[1:]
                max_p_value = p_values.max()
                if (max_p_value >= SL_out):
                    excluded_feature = p_values.idxmax()
                    best_features.remove(excluded_feature)
                else:
                    break
        else:
            break
    return best_features
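A minimal usage sketch for the helper above (hypothetical toy data; it assumes the same pandas/statsmodels imports the function relies on):

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.api import OLS

rng = np.random.default_rng(0)
data = pd.DataFrame(rng.normal(size=(100, 4)), columns=['a', 'b', 'c', 'd'])
target = 2 * data['a'] - 3 * data['c'] + rng.normal(scale=0.1, size=100)
print(stepwise_selection(data, target))  # expected to select 'a' and 'c'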
Example #2
    def remove_outliers(train, targetField, dropVal, studentResid, verbose=True):
        """
        Remove outliers from training data based on statsmodels OLS Fit studentized residuals and specified drop values across features

        :param pandas.DataFrame train: data for training
        :param str targetField: target from train/ test :py:class:`pandas.DataFrame`
        :param obj dropVal: value to drop rows across
        :param float studentResid: number to threshold absolute value of student residuals above
        :param bool verbose: flag to print out the OLS summary and the number of outliers removed
        """

        train = train.dropna()
        if dropVal is not None:
            train = train.loc[(train.T != dropVal).all()]

        design = train[[i for i in train if i != targetField]]
        target = train[targetField]

        design = StandardScaler().fit_transform(design)
        model = OLS(target, design)
        result = model.fit()
        mask = np.ones((train.shape[0])).astype(bool)
        if studentResid is not None:
            # threshold on the studentized-residual cutoff passed in, not a hardcoded 2
            mask = (result.outlier_test()['student_resid'].abs() < studentResid)

        if verbose:
            print(result.summary())
            print('Removed: ' + str(train.shape[0] - sum(mask)))

        return train.loc[mask]
Example #3
def calc_gwi(obs, obs_years, reg_type='mon', base_low=1850., base_high=1900., name=''):
    
    #Express the observations relative to the base period 
    obs = obs - np.mean(obs[np.logical_and(obs_years>=base_low,obs_years<(base_high+1))])

    #Load the best estimate forcings from Piers
    forc_file = './Data/Annualforcings_Mar2014_GHGrevised.txt'
    data = np.genfromtxt(forc_file,skip_header=4)
    years = data[:,0]
    tot_forc = data[:,13]
    ant_forc = data[:,14]
    
    #Integrate anthropogenic and natural forcing with standard FAIR parameters
    C, t_nat = fair_scm(other_rf=tot_forc-ant_forc)
    C, t_anthro = fair_scm(other_rf=ant_forc)
    #Express relative to the centre of the base period
    t_nat = t_nat - np.mean(t_nat[np.logical_and(years>=base_low,years<base_high+1)])
    t_anthro = t_anthro - np.mean(t_anthro[np.logical_and(years>=base_low,years<base_high+1)])
    # -----------------------------------------------
    
    
    # Prepare the temperatures run through FaIR, so they lie on same year-grid as observations, so they can be compared
    # -----------------------------------------------
    #Interpolate the annual forced responses to the grid of the observed data
    if reg_type !='mon':
        t_nat = np.interp(obs_years+0.5, years+0.5, t_nat)
        t_anthro = np.interp(obs_years+0.5, years+0.5, t_anthro)
    else:
        t_nat = np.interp(obs_years, years+0.5, t_nat)
        t_anthro = np.interp(obs_years, years+0.5, t_anthro)

    #Linearly project the final half year
    t_anthro[obs_years>(years[-1]+0.5)] = 12*(t_anthro[obs_years<=(years[-1]+0.5)][-1] - t_anthro[obs_years<=(years[-1]+0.5)][-2]) * (obs_years[obs_years>(years[-1]+0.5)] - obs_years[obs_years<=(years[-1]+0.5)][-1]) \
    +t_anthro[obs_years<=(years[-1]+0.5)][-1]
    t_nat[obs_years>(years[-1]+0.5)] = 12*(t_nat[obs_years<=(years[-1]+0.5)][-1] - t_nat[obs_years<=(years[-1]+0.5)][-2]) * (obs_years[obs_years>(years[-1]+0.5)] - obs_years[obs_years<=(years[-1]+0.5)][-1]) \
    +t_nat[obs_years<=(years[-1]+0.5)][-1]
    # -----------------------------------------------
    
    #Use the statsmodels OLS regression function to regress the observed temperatures on the natural and anthropogenic warming, with a constant
    y = np.copy(obs)
    x = DataFrame({'x1': (t_anthro), 'x2': (t_nat)})
    # add constant vector on to dataframe we will fit to temp observations
    x = statsmodels.tools.tools.add_constant(x)
    # complete OLS regression of anthropogenic and natural temperatures (found from FaIR integrated best estimate forcing) onto given observed temperature dataset.
    model = OLS(y, x)
    result = model.fit()
    # collect output scaling factors for anthro and natural temperature timeseries
    sf = result.params

    #Form scaled anthropogenic warming index
    awi = t_anthro * sf['x1']
    #Scaled natural warming index
    nwi = t_nat * sf['x2']
    #Scaled total externally forced warming index
    gwi = awi + nwi
    
    print(name, ' AWI scale factor: ', sf['x1'], '\n', name, ' NWI scale factor: ', sf['x2'])

    
    return awi, nwi
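Stripped of the forcing I/O, the attribution step above is a two-regressor OLS with a constant; a minimal sketch with stand-in arrays (illustrative only):

import numpy as np
import statsmodels.api as sm
from pandas import DataFrame
from statsmodels.api import OLS

t_anthro = np.linspace(0., 1., 100)             # stand-in for the FaIR anthropogenic response
t_nat = 0.1 * np.sin(np.linspace(0., 6., 100))  # stand-in for the natural response
obs = 0.9 * t_anthro + 1.1 * t_nat + np.random.default_rng(7).normal(0., 0.05, 100)
x = sm.add_constant(DataFrame({'x1': t_anthro, 'x2': t_nat}))
sf = OLS(obs, x).fit().params                   # sf['x1'] scales AWI, sf['x2'] scales NWI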
Example #4
    def _capm_mu(self, asset, markets, mu, sigma, X):
        """Calculate mean estimated by CAPM."""
        freq = tools.freq(X.index)
        X = X[[asset] + markets].dropna()
        res = OLS(X[asset] - 1 - self.rfr / freq, add_constant(X[markets] - 1 - self.rfr / freq)).fit()

        beta = res.params.drop(['const'])

        prev_mu = mu[asset]
        new_mu = self.rfr + (mu[markets] - self.rfr).dot(beta)

        alpha = res.params.const * freq
        alpha_std = freq * np.sqrt(res.cov_params().loc['const', 'const'])

        if self.verbose:
            print(f'Beta of {[x for x in beta.round(2)]} changed {asset} mean return from {prev_mu:.1%} to {new_mu:.1%} with alpha {alpha:.2%} ({alpha_std:.2%})')

        # be benevolent and add alpha if it is positive
        # k = 0.2 was fine tuned on DPST in order to get it out of the portfolio
        k = 0.2
        if alpha - k * alpha_std > 0 and asset in ('KRE', 'DPST'):
            if self.verbose:
                print(f'   Adding alpha of {alpha - k * alpha_std:.2%} for {asset}')
            new_mu += alpha - k * alpha_std
        return new_mu
Example #5
    def alpha_beta(self):
        rr = (self.X - 1).mean(1)

        m = OLS(self.r - 1, np.vstack([np.ones(len(self.r)), rr]).T)
        reg = m.fit()
        alpha, beta = reg.params.const * 252, reg.params.x1
        return alpha, beta
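The same alpha/beta regression can be reproduced outside the class; a hedged sketch with synthetic daily returns (the factor 252 annualizes a daily alpha):

import numpy as np
import pandas as pd
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(1)
market = rng.normal(0.0004, 0.01, 500)                    # daily market returns
asset = 0.0001 + 1.2 * market + rng.normal(0, 0.005, 500)
reg = OLS(asset, add_constant(pd.Series(market, name='x1'))).fit()
alpha, beta = reg.params['const'] * 252, reg.params['x1']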
Example #6
def backwardElimination(x, SL):
    # NOTE: relies on a target vector `y` defined in the enclosing scope
    numVars = len(x[0])
    temp = np.zeros(x.shape)
    for i in range(0, numVars):
        regressor_OLS = OLS(y, x).fit()
        print(regressor_OLS.summary())
        maxVar = max(regressor_OLS.pvalues).astype(float)
        adjR_before = regressor_OLS.rsquared_adj.astype(float)
        if maxVar > SL:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    temp[:, j] = x[:, j]  # remember the column before dropping it
                    x = np.delete(x, j, 1)
                    tmp_regressor = OLS(y, x).fit()
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)
                    if (adjR_before >= adjR_after):
                        # adjusted R^2 got worse: put the column back and stop
                        x_rollback = np.insert(x, j, temp[:, j], axis=1)
                        print(regressor_OLS.summary())
                        return x_rollback
                    else:
                        continue
        else:
            break
    return x
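A usage sketch for the eliminator above (synthetic data; note that `y` must exist in the enclosing scope, since the function reads it globally):

import numpy as np
from statsmodels.api import OLS

rng = np.random.default_rng(2)
X = np.column_stack([np.ones(50), rng.normal(size=(50, 5))])  # constant + 5 features
y = 1.0 + 3.0 * X[:, 1] + rng.normal(scale=0.1, size=50)
X_opt = backwardElimination(X, SL=0.05)  # should keep the constant and X[:, 1]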
Example #7
def prosperity_score_regression(cards,
                                metadata,
                                score_columns=score_column_names):
    """
    Perform a linear regression to determine the degree to which the
    Prosperity add-on treasure and victory cards contribute to a good
    score.
    """
    prosperity = set(cards['currency'].columns.get_level_values(1))
    # victory_cards = set(cards['victory'].columns.get_level_values(1))
    # cards = currency_cards.union(victory_cards)
    scores = np.mean(metadata.loc[:, tuple(score_columns)], axis=1)

    # Ignore missing cells
    refine_idx = np.isfinite(scores)
    scores = scores[refine_idx]

    set_counts = pd.concat([
        pd.DataFrame(cards.loc[refine_idx, pd.IndexSlice[:, :, c]].values,
                     columns=[c]) for c in prosperity
    ] + [
        pd.DataFrame(np.ones((scores.size, 1)), columns=['Average game score'])
    ],
                           axis=1).fillna(0)

    results = OLS(scores, set_counts).fit()
    print(results.summary())
Example #8
 def run_acc_compare(self, print_summary=False, data_df=None):
     #if regressiondict is None:
     #    regressiondict=self.modeldict['regressiondict']
     if data_df is None:
         self.set_flat_c_stats_df()
         data_df = self.flat_c_stats_df
     data_df.dropna(inplace=True, axis=0)
     y_df = data_df.loc[:, 'accuracy']
     X_df = data_df.drop(labels='accuracy', axis=1, inplace=False)
     #print('y_df',y_df)
     #print('X_df',X_df)
     X_dtypes_ = dict(X_df.dtypes)
     obj_vars = [
         var for var, dtype in X_dtypes_.items() if dtype == 'object'
     ]
     #float_idx=[i for i in range(X_df.shape[1]) if i not in obj_idx]
     #self.model=regressiondict['pipeline'](cat_idx=obj_idx,float_idx=float_idx)
     X_float_df = self.floatify_df(X_df, obj_vars)
     #X_float_df=add_constant(X_float_df)
     self.X_float_df = X_float_df
     self.y_df = y_df
     self.model = OLS(y_df, X_float_df)
     self.model_result = self.model.fit()
     if print_summary:
         print('OLS results for modeldict:')
         print(self.modeldict)
         print(self.model_result.summary())
Example #10
def linear_regression(data):
    """
    Apply a simple linear regression to each entry, i.e. compute the slope
    and the intercept of the least-squares trend line.

    Input:
        - data: dict parsed from the JSON file, mapping each key to a series
    Output:
        - dict mapping each key to [slope, intercept]
    Packages used:
        - numpy (ones, arange)
        - statsmodels.api (OLS)
    """

    #initialisation
    dict_linreg = {}

    #for each entry in the json file (data)
    #intercept value and coefficient calculation
    for k, v in data.items():
        mat_x = np.ones((len(v), 2))
        mat_x[:, 1] = np.arange(0, len(v))

        reg = OLS(v, mat_x)
        results = reg.fit()

        dict_linreg[k] = [results.params[1], results.params[0]]

    return dict_linreg
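A quick check with a toy series per key (hypothetical data):

import numpy as np
from statsmodels.api import OLS

data = {'word_a': [1, 2, 3, 4, 5], 'word_b': [10, 8, 6, 4, 2]}
print(linear_regression(data))
# roughly {'word_a': [1.0, 1.0], 'word_b': [-2.0, 10.0]} as [slope, intercept]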
Example #11
def get_cointLst(corrList, df_is):
    # called in main
    # Test for cointegration; the test has to be performed on both sides of the spread
    cointLst = []
    for pair in corrList:
        X1, X2 = df_is[pair[0]].values, df_is[pair[1]].values

        x1 = add_constant(X1)
        x2 = add_constant(X2)
        r1 = OLS(X2, x1).fit()
        r2 = OLS(X1, x2).fit()

        adf1 = adfuller(r1.resid)[1]
        if adf1 < 0.01:
            adf2 = adfuller(r2.resid)[1]
            if adf2 < 0.01 and adf1 < adf2:  # keep only pairs strongly cointegrated on both sides
                cointLst.append(["{0}_{1}".format(pair[0], pair[1])] + pair +
                                [adf1] + list(r1.params))
            elif adf2 < 0.01:
                cointLst.append(["{0}_{1}".format(pair[1], pair[0])] +
                                [pair[1], pair[0], pair[2], pair[3], adf2] +
                                list(r2.params))

    # print("There are {0} pairs strongly cointegrated.".format(len(cointLst)))
    return cointLst
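The two-sided screen above follows the Engle-Granger recipe: regress one leg on the other, then ADF-test the residuals. A standalone sketch on a synthetic cointegrated pair:

import numpy as np
from statsmodels.api import OLS, add_constant
from statsmodels.tsa.stattools import adfuller

rng = np.random.default_rng(3)
x1 = np.cumsum(rng.normal(size=1000))     # random walk
x2 = 0.8 * x1 + rng.normal(size=1000)     # cointegrated with x1
resid = OLS(x2, add_constant(x1)).fit().resid
print(adfuller(resid)[1])                 # small p-value suggests cointegration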
Example #12
def test_linearity(x, y, n_knots=5, verbose=True):
    """Test linearity between two variables.

    Run a linear regression of y on x, and take the residuals.
    Fit the residuals with a natural spline with `n_knots` knots.
    Conduct a joint F-test for all columns in the natural spline basis matrix.

    Example:
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> x = np.linspace(0., 1., 101)
    >>> y = 5 * x + 3 + rng.random(size=101) / 5
    >>> test_linearity(x, y, n_knots=5, verbose=False)
    0.194032
    """
    residuals = OLS(y, add_constant(x)).fit().resid
    basis_matrix = patsy.dmatrix(
        f"cr(x, df={n_knots - 1}, constraints='center') - 1", {'x': x},
        return_type='dataframe')
    results = OLS(residuals, basis_matrix).fit()
    nobs = results.nobs
    f_value = results.fvalue
    p_value = np.round(results.f_pvalue, 6)
    if verbose:
        print('Test for Linearity: '
              f'N = {nobs:.0f}; df={nobs - n_knots - 1:.0f}; '
              f'F = {f_value:.3f}; p = {p_value:.6f}.')
    return p_value
Example #13
def nuevo_regress():
    modelo = OLS(DATASET.puntaje_global, DATASET.puntaje_matematicas).fit()
    summary = modelo.summary()
    vals_residuales = modelo.resid
    print(summary)
    print(anderson(vals_residuales))
    grafica_qq(vals_residuales)
Example #14
 def testPow(n):
     raw_X = trainData.OverallQual.values.reshape(-1, 1)
     OLS_y = trainData.SalePrice
     X = raw_X**n
     features = sm.add_constant(X)
     ols_sm = OLS(OLS_y.values, features)
     model = ols_sm.fit()
     return model.rsquared
Example #15
 def fit(self, x, y):
     x = array(x).reshape(-1, 1)
     model = OLS(y, PolynomialFeatures(2).fit_transform(x)).fit()
     self.m = model.predict(
         PolynomialFeatures(2).fit_transform(AGES.reshape(-1, 1)))
     self.s = wls_prediction_std(
         model,
         PolynomialFeatures(2).fit_transform(AGES.reshape(-1, 1)))[0]
     return self
Example #16
 def _capm(self):
     rfr = self.rf_rate / self.freq()
     rr = self.ucrp_r - rfr
     if 'CASH' in self.B.columns:
         cash = self.B.CASH
     else:
         cash = 0
     m = OLS(self.r - 1 - (1 - cash) * rfr,
             np.vstack([np.ones(len(self.r)), rr - 1]).T)
     return m.fit()
Example #17
def backwardElimination(x, sl):
    # NOTE: relies on a target vector `y` defined in the enclosing scope
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        print(regressor_OLS.summary())
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
    return x
Example #18
 def stats_models(self, X_train, y_train, show_summary=False):
     '''
     Perform OLS from statsmodels.
     Return the fitted model results.
     '''
     X = sm.add_constant(X_train)
     model_stats = OLS(y_train, X)
     results_stats = model_stats.fit()
     if show_summary:
         print(results_stats.summary())
     return results_stats
Example #19
def find_apex(decel):
    res = []
    for t in decel.index[10::10]:
        left = decel[:t]['accelY']
        right = decel[t:]['accelY']
        left_mod = OLS(left, add_constant(range(len(left)))).fit()
        right_mod = OLS(right, add_constant(range(len(right)))).fit()
        ssrs = [t, left_mod.ssr, right_mod.ssr]
        res.append(ssrs)
    apex = min(res, key=lambda x: x[1] + x[2])[0]
    return apex
Example #20
def intermediate():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data),
                     columns=featurizer.columns,
                     index=data.index)

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. Use SGD Regression for '
            'nominal variables instead.')

    if data.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {
            'summary': {},
            'columns': [],
            'means': 0,
            'X^T * X': 0,
            'count': 0,
            'scale': 0,
        }

    else:
        # Compute linear-regression
        X.insert(loc=0, column='intercept', value=1.)
        lm = OLS(y, X)
        flm = lm.fit()
        logging.info(flm.summary())
        output = format_output(flm)

        result = {
            'summary': output,
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.values.dot(X.values),
            'count': len(X),
            'scale': flm.scale,
        }

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
Example #21
 def est_via_ols(self):
     """
     Estimate average treatment effects with Linear Regression.
     """
     regressor = np.zeros((self.data.n, 1 + self.data.X.shape[1]))
     regressor[:, 0] = self.data.Z
     regressor[:, 1:] = self.data.X
     ols_model = LinearRegression(self.data.Y, regressor)
     reg_results = ols_model.fit()
     ate = reg_results.params[0]
     se = np.sqrt(reg_results.HC0_se[0])
     return self._get_results(ate, se)
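Assuming LinearRegression above is a statsmodels-style OLS wrapper, the same ATE estimate can be sketched directly (synthetic data; unlike the method above, this sketch adds an explicit constant):

import numpy as np
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(9)
X = rng.normal(size=(500, 2))
Z = rng.integers(0, 2, 500)                     # treatment indicator
Y = 2.0 * Z + X @ np.array([1.0, -1.0]) + rng.normal(size=500)
res = OLS(Y, add_constant(np.column_stack([Z, X]))).fit()
ate, se = res.params[1], res.HC0_se[1]          # coefficient on Z with robust s.e.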
Example #22
    def LRFunc(self, measureType):
        '''
        Linear regression using OLS 
        cut-off leverage: 3k/n
        cut-off for influence: 1
        cut-off for DFFITS 2*sqrt(k/n)
        cut-off for DFBETAS 2/sqrt(n)  where k=1
        '''
        dft = self.dfA[(self.dfA.MEASURE_TYPE == measureType)
                       & (self.dfA.FILTER_FLAG != 'WHO')].copy()
        reg = linear_model.LinearRegression()
        print(dft.MEASURE_VAL, dft.AGE)
        regression = OLS(dft.MEASURE_VAL, dft.AGE).fit()
        infl = regression.get_influence()
        test = regression.outlier_test()

        k = 1
        N = len(dft)
        print(N)
        dft['OLS_BONFPVAL'] = test['bonf(p)']
        dft['OLS_STUDENTRES'] = test['student_resid']
        dft['OLS_INFLUENCE'] = infl.summary_frame().cooks_d
        dft['OLS_DFFITS'] = infl.summary_frame().dffits
        dft['OLS_DFB_AGE'] = infl.summary_frame().dfb_AGE
        dft['N'] = [N] * N

        coL, coI, coDf1, coDf2 = 3.0 * k / N, 1, 2 * (k / N)**0.5, 2 / (N**0.5)
        dft1 = dft[(abs(dft['OLS_INFLUENCE']) <= coI)
                   & (abs(dft['OLS_DFFITS']) <= coDf1) &
                   (abs(dft['OLS_DFB_AGE']) <= coDf2)]

        if len(dft1) <= 2:
            for idx, row in dft.iterrows():
                self.dfA.loc[idx, 'FILTER_FLAG'] = 'OLS_FEW_REMAIN'
            return

        reg.fit(dft1[['AGE']], dft1['SDS'])
        dft['pred1'] = reg.predict(dft[['AGE']])
        dft['diff1'] = dft['SDS'] - dft['pred1']
        stdVal = dft[dft.index.isin(dft1.index)].diff1.std()
        dft['STD_FOLD'] = dft.diff1 / stdVal

        self.stdVal[measureType] = stdVal
        self.coef[measureType] = reg.coef_[0]
        self.intercept[measureType] = reg.intercept_

        for idx, row in dft.iterrows():
            if abs(row.STD_FOLD) <= LRCutoffSD[measureType]:
                self.dfA.loc[idx, 'FILTER_FLAG'] = 'PLAUSIBLE'
            else:
                self.dfA.loc[idx, 'FILTER_FLAG'] = 'OLS_OUTLIER'

        return
Example #23
def half_life(spread):
    lag = spread.shift(1)
    lag.iloc[0] = lag.iloc[1]
    ret = spread - lag
    ret.iloc[0] = ret.iloc[1]
    lag2 = add_constant(lag)
    model = OLS(ret, lag2)
    res = model.fit()
    # half-life from the AR(1) coefficient; log is numpy's
    halflife = int(round(-log(2) / res.params.iloc[1], 0))

    if halflife <= 0:
        halflife = 1
    return halflife
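A sanity check on a simulated AR(1) spread (phi assumed 0.9, so the estimate should land near -log(2)/(0.9 - 1), i.e. about 7 bars; this assumes the numpy imports the function relies on):

import numpy as np
import pandas as pd

rng = np.random.default_rng(4)
s = np.zeros(2000)
for t in range(1, 2000):
    s[t] = 0.9 * s[t - 1] + rng.normal()   # mean-reverting spread
print(half_life(pd.Series(s)))             # roughly 7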
Example #24
 def fit(self, X, y, **kwargs):
     if self.fit_intercept:
         X = sm.add_constant(X)
     try:
         self.alpha = kwargs['alpha']
     except:
         raise Exception(
             'cannot find alpha! please set the penalty of Lasso')
     else:
         self.model = OLS(y, X)
     self.res = self.model.fit_regularized(alpha=self.alpha,
                                           L1_wt=1,
                                           **kwargs)
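The underlying statsmodels call is fit_regularized with L1_wt=1 (pure lasso); a standalone sketch:

import numpy as np
import statsmodels.api as sm
from statsmodels.api import OLS

rng = np.random.default_rng(6)
X = sm.add_constant(rng.normal(size=(100, 3)))
y = X @ np.array([1.0, 2.0, 0.0, 0.0]) + rng.normal(scale=0.1, size=100)
res = OLS(y, X).fit_regularized(alpha=0.1, L1_wt=1)  # L1 penalty only
print(res.params)                                    # trailing coefficients shrunk toward zero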
Example #25
 def run_regr(self):
     if self.pca_flag:
         self.train_x, self.test_x = self.pca(
             self.train_x, self.test_x, n_components=self.n_components)
     regr = OLS(self.train_y['Y_M_1'], add_constant(self.train_x)).fit()
     # print(regr.summary())
     try:
         y_pred = regr.predict(add_constant(self.test_x))
     except Exception as e:
         print(e)
         return None
     # print(f'R-square is {r2_score(self.test_y.Y_M_1, y_pred)}')
     # print(f'Mean - y_pred {np.mean(y_pred)}, Mean - y {np.mean(self.test_y.Y_M_1)}')
     return r2_score(self.test_y.Y_M_1, y_pred)
Example #26
def get_half_life_from_scratch(stockX, stockY, beta, df_is):
    # called in get_df_coint
    z_array = get_z(stockX, stockY, beta, df_is)

    z_lag = np.roll(z_array, 1)
    z_lag[0] = 0
    z_ret = z_array - z_lag

    # adds intercept terms to X for regression
    z_lag2 = add_constant(z_lag)
    model = OLS(z_ret, z_lag2)
    res = model.fit()

    return int(-np.log(2) / res.params[1])
Example #27
def ols_cluster_robust(formula, cluster, covs, coef):
    """Model clusters with cluster-robust OLS, same signature as
    :func:`~gee_cluster`"""
    cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
    res = OLS.from_formula(formula, data=cov_rep).fit(
        cov_type='cluster', cov_kwds=dict(groups=cov_rep['id']))
    return get_ptc(res, coef)
Example #28
def optimal_spreads_regression(cov_matrix, mid, market_rel_spread):
    regressors = 3*pd.DataFrame([np.diag(cov_matrix)], ['Variance'], mid.index).T
    regressors['Inverse decay'] = 1
    fit = OLS(market_rel_spread*mid, regressors).fit()
    risk_aversion = fit.params['Variance']
    intensity_decay = 2/fit.params['Inverse decay']
    return risk_aversion, intensity_decay, fit.rsquared
Example #29
    def calculate(self):
        '''
            vander is equivalent to sm.add_constant(np.column_stack((x**n,..x**2,x**1)))
            vander(x,n+1)
        '''
        if not len(self.xs) or \
            not len(self.ys):
            return

        if len(self.xs) != len(self.ys):
            return

#        xs = asarray(self.xs)
        ys = asarray(self.ys)
#        self._ols = OLS(ys, vander(xs, self.degree + 1))
#        self._result = self._ols.fit()
#            print len(xs), len(ys)
#        print self.degree
#        print vander(xs, self.degree + 1)
        X = self._get_X()
        if X is not None:
            try:
                self._ols = OLS(ys, X)
                self._result = self._ols.fit()
            except Exception as e:
                print(e)
Example #31
def capm(y: pd.Series, bases: pd.DataFrame, rf=0.0, fee=0.0):
    freq = _freq(y.index)
    rf = rf / freq
    fee = fee / freq
    R = y.pct_change() - rf
    R.name = y.name
    R_base = bases.pct_change().sub(rf, axis=0)

    # CAPM:
    # R = alpha + rf + beta * (Rm - rf)
    model = OLS(R, R_base.assign(Intercept=1), missing="drop").fit()

    alpha = model.params["Intercept"] * freq
    betas = model.params[bases.columns]

    # reconstruct artificial portfolio
    proxy = R_base @ betas + (1 - betas.sum()) * (rf + fee)
    cumproxy = (1 + proxy).cumprod()

    # residual portfolio
    r = y.pct_change() - cumproxy.pct_change()
    residual = (1 + r).cumprod()

    return {
        "alpha": alpha,
        "betas": betas,
        "cumproxy": cumproxy,
        "model": model,
        "residual": residual,
    }
Example #32
def _compute_vif(exog, exog_idx, weights=None, model_config=None):
    """
    Compute the variance inflation factor (VIF) for one exogenous variable,
    for OLS and, when weights are given, WLS.

    Parameters
    ----------
    exog : array of features [X_1, X_2, ..., X_n]
    exog_idx : index of the feature to compute the VIF for
    weights : optional WLS weights
    model_config : dict, defaults to {"hasconst": True, "cov_type": "HC3"}

    Returns
    -------
    vif : float
    """
    if model_config is None:
        model_config = {"hasconst": True,
                        "cov_type": "HC3"}
    k_vars = exog.shape[1]
    x_i = exog[:, exog_idx]
    mask = np.arange(k_vars) != exog_idx
    x_noti = exog[:, mask]
    if weights is None:
        r_squared_i = OLS(x_i,
                          x_noti,
                          hasconst=model_config["hasconst"]).fit().rsquared
    else:
        r_squared_i = WLS(x_i,
                          x_noti,
                          hasconst=model_config["hasconst"],
                          weights=weights).fit(
            cov_type=model_config["cov_type"]).rsquared
    vif = 1. / (1. - r_squared_i)
    return vif
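For the unweighted case this mirrors statsmodels' own helper, so the two can be cross-checked on a synthetic collinear design (the values should roughly agree):

import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(5)
x1 = rng.normal(size=200)
x2 = x1 + rng.normal(scale=0.1, size=200)        # nearly collinear with x1
exog = np.column_stack([np.ones(200), x1, x2])
print(variance_inflation_factor(exog, 1))        # large VIF for x1
print(_compute_vif(exog, 1))                     # unweighted path, same idea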
Example #33
def capm(y: pd.Series, bases: pd.DataFrame, rf=0., fee=0.):
    freq = _freq(y.index)
    rf = rf / freq
    fee = fee / freq
    R = y.pct_change() - rf
    R.name = y.name
    R_base = bases.pct_change().sub(rf, axis=0)

    # CAPM:
    # R = alpha + rf + beta * (Rm - rf)
    model = OLS.from_formula(f"Q('{y.name}') ~ {'+'.join(bases.columns)}",
                             R_base.join(R)).fit()

    alpha = model.params['Intercept'] * freq
    betas = model.params[bases.columns]

    # reconstruct artificial portfolio
    proxy = R_base @ betas + (1 - betas.sum()) * (rf + fee)
    cumproxy = (1 + proxy).cumprod()

    # residual portfolio
    r = y.pct_change() - cumproxy.pct_change()
    residual = (1 + r).cumprod()

    return {
        'alpha': alpha,
        'betas': betas,
        'cumproxy': cumproxy,
        'model': model,
        'residual': residual,
    }
Example #34
def fit(xyz, xlim=None, ylim=None, zlim=None, **kwargs):
    # all-True mask, used for any axis whose limit is omitted
    all_true = numpy.ones(xyz.shape[0], dtype=bool) \
               if None in [xlim, ylim, zlim] \
               else None
    xbool = numpy.abs(xyz[:,0]) < xlim if xlim is not None else all_true
    ybool = numpy.abs(xyz[:,1]) < ylim if ylim is not None else all_true
    zbool = numpy.abs(xyz[:,2]) < zlim if zlim is not None else all_true
    bools = numpy.logical_and(numpy.logical_and(xbool, ybool), zbool)
    XYZ = xyz[bools,:]
    XY = add_constant(XYZ[:,:2], prepend=False)
    Z  = XYZ[:,-1]
    model = OLS(Z, XY)
    result = model.fit()
    coeffs = result.params
    stderr = result.HC1_se

    return coeffs, stderr
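A quick plane-fit check on synthetic points near z = 2x + 3y + 1 (with prepend=False the constant comes last in coeffs):

import numpy
from statsmodels.api import OLS, add_constant

rng = numpy.random.default_rng(10)
xyz = rng.normal(size=(200, 3))
xyz[:, 2] = 2 * xyz[:, 0] + 3 * xyz[:, 1] + 1 + rng.normal(scale=0.01, size=200)
coeffs, stderr = fit(xyz)    # coeffs close to [2, 3, 1]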
Example #35
def linear(data, **kwargs):
    '''linear regression model fitted with ordinary least squares
    
    Parameters
    ----------
    data : array or dataframe
        first column is endogenous, second column is
        a column of ones, the rest are exogenous data

    ** Keyword Arguments **

    prior_type : str
        'uniform' or 'collinear adjusted dilution'
    
    Returns
    -------
    rslts : array
        1-d array containing nobs, the posterior, R-squared, the parameter
        coefficients, their p-values and standard errors, and the partial
        R-squared values
    '''
    
    prior_type = kwargs.get('prior_type', 'uniform')

    endog = data[:, [0]]
    exog = data[:, 1:]

    model = OLS(endog=endog, exog=exog, missing='drop')
    
    adj = (np.cov(np.hstack((model.wexog, endog)), rowvar=0)[:-1, -1]/ \
            np.var(endog)).reshape((-1, 1))
    
    fit = model.fit()
    
    par_rsquared = fit.params.reshape((-1,1))*adj    
    
    if prior_type == 'uniform':
        prior = 1.
    elif prior_type == 'collinear adjusted dilution':
        prior = collinear_adj_prior(exog)
    else:
        raise ValueError('prior {} not supported'.format(prior_type))
    
    posterior = math.exp(fit.llf)*prior
        
    return np.hstack((fit.nobs, posterior, fit.rsquared, fit.params, fit.pvalues, fit.bse, par_rsquared.flat))
Example #36
def mixed_model_cluster(formula, cluster, covs, coef):
    """Model clusters with a mixed-model, same signature as
    :func:`~gee_cluster`"""
    cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
    # TODO: remove this once newer version of statsmodels is out.
    # speeds convergence by using fixed estimates from OLS
    params = OLS.from_formula(formula, data=cov_rep).fit().params

    res = MixedLM.from_formula(formula, groups='id',
            data=cov_rep).fit(start_params=dict(fe=params), reml=False,
                    method='bfgs')

    return get_ptc(res, coef)
Example #37
class OLSRegressor(BaseRegressor):
    degree = Property(depends_on='_degree')
    _degree = Int
    constant = None
#    _result = None
#    @on_trait_change('xs,ys')
#    def _update_data(self):
#        self._ols = OLS(self.xs, vander(self.ys, self.degree + 1))
#        self._result = self._ols.fit()
#    def _xs_changed(self):
#            xs = asarray(self.xs)
#            ys = asarray(self.ys)
# #            print len(xs), len(ys)
#            self._ols = OLS(ys, vander(xs, self.degree + 1))
#            self._result = self._ols.fit()
    def __degree_changed(self):
        self.calculate()

    def calculate(self):
        '''
            vander is equivalent to sm.add_constant(np.column_stack((x**n,..x**2,x**1)))
            vander(x,n+1)
        '''
        if not len(self.xs) or \
            not len(self.ys):
            return

        if len(self.xs) != len(self.ys):
            return

#        xs = asarray(self.xs)
        ys = asarray(self.ys)
#        self._ols = OLS(ys, vander(xs, self.degree + 1))
#        self._result = self._ols.fit()
#            print len(xs), len(ys)
#        print self.degree
#        print vander(xs, self.degree + 1)
        X = self._get_X()
        if X is not None:
            try:
                self._ols = OLS(ys, X)
                self._result = self._ols.fit()
            except Exception as e:
                print(e)
Example #38
import statsmodels.api as sm

# quadratic-in-time model for predicted runtime, exported from a notebook
X = pd.DataFrame([timevncats.index.to_series(),
                  timevncats.index.to_series()**2],
                 index='x x**2'.split()).T

ols = OLS(timevncats, sm.add_constant(X))
ols = ols.fit()

nclients = Clientes.shape[0]
predtime = (ols.predict([1, nclients, nclients**2]) / 60 / 60)[0]
print('Full data set should take %i hours' % int(predtime))
Example #39
 def fit_ols(y, x, idx=-1):
     ols = OLS(y, add_constant(x))
     results = ols.fit()
     return results.params.values[idx], results.cov_params().values[idx, idx]
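The helper returns one coefficient and its variance from cov_params; e.g. the slope of a noisy line (pandas input so that .values works as written):

import numpy as np
import pandas as pd

x = pd.Series(np.arange(50.0), name='x')
y = 3.0 * x + 1.0 + np.random.default_rng(8).normal(size=50)
slope, slope_var = fit_ols(y, x)   # idx=-1 picks the slope term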
Example #40
"""

import numpy as np
from statsmodels.api import add_constant, OLS, WLS
import matplotlib.pyplot as plt


# (x, y) is the set of observations.  w contains precomputed weights; we'll
# also compute these weights in this script.
x, y, w = np.loadtxt('draper_smith_table9p1.txt', unpack=True)

X = add_constant(x, prepend=True)

# --- OLS ---------------------------------------------------------------
# Ordinary least squares fit.
ols_result = OLS(y, X).fit()

print(ols_result.summary())

# Make a plot of the OLS residuals vs y and vs x.
# The following recreates Fig. 9.1.
plt.figure(1)
plt.clf()
plt.subplot(2, 1, 1)
plt.plot(ols_result.fittedvalues, ols_result.resid, 'bo')
plt.title("OLS Residuals versus fitted values")
plt.xlabel('y')
plt.ylabel('e')
plt.grid()
plt.subplot(2, 1, 2)
plt.plot(x, ols_result.resid, 'bo')
Example #41
    def test_beta(self, b0_vals, param_nums, ftol=10 ** - 5, maxiter=30,
                  print_weights=1):
        """
        Returns the profile log likelihood for regression parameters
        'param_num' at 'b0_vals.'

        Parameters
        ----------
        b0_vals: list
            The value of parameters to be tested

        param_num: list
            Which parameters to be tested

        maxiter: int, optional
            How many iterations to use in the EM algorithm.  Default is 30

        ftol: float, optional
            The function tolerance for the EM optimization.
            Default is 10''**''-5

        print_weights: bool
            If true, returns the weights tate maximize the profile
            log likelihood. Default is False

        Returns
        -------

        test_results: tuple
            The log-likelihood and p-pvalue of the test.

        Notes
        ----

        The function will warn if the EM reaches the maxiter.  However, when
        optimizing over nuisance parameters, it is possible to reach a
        maximum number of inner iterations for a specific value for the
        nuisance parameters while the resultsof the function are still valid.
        This usually occurs when the optimization over the nuisance parameters
        selects paramater values that yield a log-likihood ratio close to
        infinity.

        Examples
        -------

        import statsmodels.api as sm
        import numpy as np

        # Test parameter is .05 in one regressor no intercept model
        data=sm.datasets.heart.load()
        y = np.log10(data.endog)
        x = data.exog
        cens = data.censors
        model = sm.emplike.emplikeAFT(y, x, cens)
        res=model.test_beta([0], [0])
        >>>res
        >>>(1.4657739632606308, 0.22601365256959183)

        #Test slope is 0 in  model with intercept

        data=sm.datasets.heart.load()
        y = np.log10(data.endog)
        x = data.exog
        cens = data.censors
        model = sm.emplike.emplikeAFT(y, sm.add_constant(x, prepend=1), cens)
        res=model.test_beta([0], [1])
        >>>res
        >>>(4.623487775078047, 0.031537049752572731)

        """
        censors = self.model.censors
        endog = self.model.endog
        exog = self.model.exog
        uncensored = (censors == 1).flatten()
        censored = (censors == 0).flatten()
        uncens_endog = endog[uncensored]
        uncens_exog = exog[uncensored, :]
        reg_model = OLS(uncens_endog, uncens_exog).fit()
        llr, pval, new_weights = reg_model.el_test(b0_vals, param_nums, return_weights=True)  # Needs to be changed
        km = self.model._make_km(endog, censors).flatten()  # when merged
        uncens_nobs = self.model.uncens_nobs
        F = np.asarray(new_weights).reshape(uncens_nobs)
        # Step 0 ^
        params = self.params()
        survidx = np.where(censors == 0)
        survidx = survidx[0] - np.arange(len(survidx[0]))
        numcensbelow = np.int_(np.cumsum(1 - censors))
        if len(param_nums) == len(params):
            llr = self._EM_test([], F=F, params=params,
                                param_nums=param_nums,
                                b0_vals=b0_vals, survidx=survidx,
                                uncens_nobs=uncens_nobs,
                                numcensbelow=numcensbelow, km=km,
                                uncensored=uncensored, censored=censored,
                                ftol=ftol, maxiter=25)
            return llr, chi2.sf(llr, self.model.nvar)
        else:
            x0 = np.delete(params, param_nums)
            try:
                res = optimize.fmin(self._EM_test, x0,
                                   (params, param_nums, b0_vals, F, survidx,
                                    uncens_nobs, numcensbelow, km, uncensored,
                                    censored, maxiter, ftol), full_output=1,
                                    disp = 0)

                llr = res[1]
                return llr, chi2.sf(llr, len(param_nums))
            except np.linalg.LinAlgError:
                return np.inf, 0
Example #42
  square = lambda row: row**2
  sum_of_squares = df['difference'].apply(square).sum()
  return sum_of_squares

x0 = [-20, .0008, 1.1]
estimator(x0)
optimize.minimize(estimator, x0, method='nelder-mead', options={'xtol': 1e-8, 'disp': True})

clf = linear_model.LinearRegression()
x = df[['AADT', 'L']].to_numpy()
y = df['Crashes']
clf.fit(x, y)
clf.coef_
clf.intercept_

model = OLS(y, add_constant(x))
model_fit = model.fit()
model_fit.summary()

def estimator(x, row_in='Crashes'):
  estimated = lambda row: exp(x[0] + x[1] * row['AADT'] + x[2] * row['L'])
  df['estimated'] = df.apply(estimated, axis=1)
  #probability = lambda row: (row['estimated']**row[row_in] * exp(-row['estimated'])) / factorial(row[row_in])
  probability = lambda row: poisson.pmf(row[row_in], row['estimated'])
  df['probability'] = df.apply(probability, axis=1)
  product = df['probability'].product()
  return -product

x0 = [1.6, .0000026, .032]
estimator(x0)
optimize.minimize(estimator, x0, method='nelder-mead', options={'xtol': 1e-8, 'disp': True})