Example #1
    def _fit_backward(self):
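        """Backward elimination: refit the OLS after dropping the regressor
        with the largest p-value until every non-intercept p-value is below
        self.sig_level_removal; the final results replace self._model."""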

        y_train = pd.Series(self._model.model.endog.copy(),
                            name=self.dependent_variable,
                            index=self._observations_idx)
        X_train = pd.DataFrame(self._model.model.exog,
                               columns=self._model.model.exog_names,
                               index=self._observations_idx)

        model = OLS(y_train, X_train, missing='drop')

        results = model.fit()

        max_pvalue = results.pvalues.drop('Intercept').max()

        while max_pvalue > self.sig_level_removal:
            x_to_drop = results.pvalues.drop('Intercept').idxmax()
            X_train = X_train.drop(x_to_drop, axis=1)
            model = OLS(y_train, X_train, missing='drop')
            results = model.fit()
            max_pvalue = results.pvalues.drop('Intercept').max()

        self._model = results

        return
Example #2
    def remove_outliers(train, targetField, dropVal, studentResid, verbose=True):
        """
        Remove outliers from the training data using the studentized residuals of a statsmodels OLS fit, after dropping rows that contain a specified value

        :param pandas.DataFrame train: data for training
        :param str targetField: name of the target column in train
        :param obj dropVal: drop any row containing this value
        :param float studentResid: threshold on the absolute studentized residual
        :param bool verbose: flag to print the OLS summary and the number of outliers removed
        """

        train = train.dropna()
        if dropVal is not None:
            train = train.loc[(train.T != dropVal).all()]

        design = train[[i for i in train if i != targetField]]
        target = train[targetField]

        design = StandardScaler().fit_transform(design)
        results = OLS(target, design).fit()
        mask = np.ones((train.shape[0])).astype(bool)
        if studentResid is not None:
            mask = results.outlier_test()['student_resid'].abs() < studentResid

        if verbose:
            print(results.summary())
            print('Removed: ' + str(train.shape[0] - sum(mask)))

        return train.loc[mask]
Example #3
def calc_gwi(obs, obs_years, reg_type='mon', base_low=1850., base_high=1900., name=''):
    
    #Express the observations relative to the base period 
    obs = obs - np.mean(obs[np.logical_and(obs_years>=base_low,obs_years<(base_high+1))])

    #Load the best estimate forcings from Piers
    forc_file = './Data/Annualforcings_Mar2014_GHGrevised.txt'
    data = np.genfromtxt(forc_file,skip_header=4)
    years = data[:,0]
    tot_forc = data[:,13]
    ant_forc = data[:,14]
    
    #Integrate anthropogenic and natural forcing with standard FAIR parameters
    C, t_nat = fair_scm(other_rf=tot_forc-ant_forc)
    C, t_anthro = fair_scm(other_rf=ant_forc)
    #Express relative to the centre of the base period
    t_nat = t_nat - np.mean(t_nat[np.logical_and(years>=base_low,years<base_high+1)])
    t_anthro = t_anthro - np.mean(t_anthro[np.logical_and(years>=base_low,years<base_high+1)])
    # -----------------------------------------------
    
    
    # Prepare the temperatures run through FaIR, so they lie on same year-grid as observations, so they can be compared
    # -----------------------------------------------
    #Interpolate the annual forced responses to the grid of the observed data
    if reg_type !='mon':
        t_nat = np.interp(obs_years+0.5, years+0.5, t_nat)
        t_anthro = np.interp(obs_years+0.5, years+0.5, t_anthro)
    else:
        t_nat = np.interp(obs_years, years+0.5, t_nat)
        t_anthro = np.interp(obs_years, years+0.5, t_anthro)

    #Linearly project the final half year
    late = obs_years > (years[-1] + 0.5)
    t_anthro[late] = 12 * (t_anthro[~late][-1] - t_anthro[~late][-2]) \
        * (obs_years[late] - obs_years[~late][-1]) + t_anthro[~late][-1]
    t_nat[late] = 12 * (t_nat[~late][-1] - t_nat[~late][-2]) \
        * (obs_years[late] - obs_years[~late][-1]) + t_nat[~late][-1]
    # -----------------------------------------------
    
    #Use the statsmodels OLS regression function to regress the observed data on the natural and anthropogenic warming responses plus a constant
    y = np.copy(obs)
    x = DataFrame({'x1': (t_anthro), 'x2': (t_nat)})
    # add constant vector on to dataframe we will fit to temp observations
    x = statsmodels.tools.tools.add_constant(x)
    # complete OLS regression of anthropogenic and natural temperatures (found from FaIR integrated best estimate forcing) onto given observed temperature dataset.
    model = OLS(y, x)
    result = model.fit()
    # collect output scaling factors for anthro and natural temperature timeseries
    sf = result.params

    #Form the scaled anthropogenic warming index
    awi = t_anthro * sf['x1']
    #Scaled natural warming index
    nwi = t_nat * sf['x2']
    #Scaled total externally forced warming index
    gwi = awi + nwi
    
    print(name, ' AWI scale factor: ', sf['x1'], '\n', name, ' NWI scale factor: ', sf['x2'])

    
    return awi, nwi
Example #4
    def alpha_beta(self):
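        """Regress strategy returns (price relatives minus one) on the
        benchmark assets' mean returns; the intercept annualized by 252
        is alpha, the slope is beta."""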
        rr = (self.X - 1).mean(1)

        m = OLS(self.r - 1, np.vstack([np.ones(len(self.r)), rr]).T)
        reg = m.fit()
        alpha, beta = reg.params.const * 252, reg.params.x1
        return alpha, beta
Example #6
def linear_regression(data):
    """
    Goal: apply a linear regression, i.e. compute the slope coefficient and
    the intercept of the regression line for each word.
    Input: the JSON file's content (data).
    Output: dict mapping each word to [coefficient, intercept].
    Packages: numpy (ones, arange), statsmodels.api (OLS).
    """

    #initialisation
    dict_linreg = {}

    #for each entry in the json file (data)
    #intercept value and coefficient calculation
    for k, v in data.items():
        mat_x = np.ones((len(v), 2))
        mat_x[:, 1] = np.arange(0, len(v))

        reg = OLS(v, mat_x)
        results = reg.fit()

        dict_linreg[k] = [results.params[1], results.params[0]]

    return dict_linreg
Example #7
 def testPow(n):
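     # R^2 of SalePrice regressed on OverallQual**n (with an intercept),
     # for comparing candidate power transforms.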
     raw_X = trainData.OverallQual.values.reshape(-1, 1)
     OLS_y = trainData.SalePrice
     X = raw_X**n
     features = sm.add_constant(X)
     ols_sm = OLS(OLS_y.values, features)
     model = ols_sm.fit()
     return model.rsquared
Example #8
 def _capm(self):
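     # CAPM-style regression: strategy excess returns (net of the risk-free
     # rate on the non-cash fraction) on the uniform CRP's excess returns.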
     rfr = self.rf_rate / self.freq()
     rr = self.ucrp_r - rfr
     if 'CASH' in self.B.columns:
         cash = self.B.CASH
     else:
         cash = 0
     m = OLS(self.r - 1 - (1 - cash) * rfr,
             np.vstack([np.ones(len(self.r)), rr - 1]).T)
     return m.fit()
Example #9
 def stats_models(self, X_train, y_train, show_summary=False):
     '''
     perform OLS with statsmodels
     and return the fitted results
     '''
     X = sm.add_constant(X_train)
     model_stats = OLS(y_train, X)
     results_stats = model_stats.fit()
     if show_summary:
         print(results_stats.summary())
     return results_stats
Example #10
def intermediate():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data),
                     columns=featurizer.columns,
                     index=data.index)

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. Use SGD Regression for '
            'nominal variables instead.')

    if data.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {
            'summary': {},
            'columns': [],
            'means': 0,
            'X^T * X': 0,
            'count': 0,
            'scale': 0,
        }

    else:
        # Compute linear-regression
        X.insert(loc=0, column='intercept', value=1.)
        lm = OLS(y, X)
        flm = lm.fit()
        logging.info(flm.summary())
        output = format_output(flm)

        result = {
            'summary': output,
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.values.dot(X.values),
            'count': len(X),
            'scale': flm.scale,
        }

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
Example #11
 def est_via_ols(self):
     """
     Estimate average treatment effects with Linear Regression.
     """
     regressor = np.zeros((self.data.n, 1 + self.data.X.shape[1]))
     regressor[:, 0] = self.data.Z
     regressor[:, 1:] = self.data.X
     ols_model = LinearRegression(self.data.Y, regressor)
     reg_results = ols_model.fit()
     ate = reg_results.params[0]
     se = np.sqrt(reg_results.HC0_se[0])
     return self._get_results(ate, se)
Example #12
def half_life(spread):
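    # Half-life of mean reversion: regress spread changes on the lagged
    # spread (an AR(1) fit); half-life = -log(2) / fitted slope.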
    lag = spread.shift(1)
    lag.iloc[0] = lag.iloc[1]
    ret = spread - lag
    ret.iloc[0] = ret.iloc[1]
    lag2 = add_constant(lag)
    model = OLS(ret, lag2)
    res = model.fit()
    halflife = int(round(-log(2) / res.params[1], 0))

    if halflife <= 0:
        halflife = 1
    return halflife
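A minimal usage sketch for half_life above (assuming numpy/pandas plus the function's own imports: math.log, statsmodels OLS and add_constant; the AR(1) parameters are illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
phi = 0.97                       # AR(1) coefficient; phi < 1 mean-reverts
x = np.zeros(1000)
for t in range(1, 1000):
    x[t] = phi * x[t - 1] + rng.normal()
# Fitted half-life should be close to -log(2)/log(phi), about 23 periods
print(half_life(pd.Series(x)))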
Example #13
def get_half_life_from_scratch(stockX, stockY, beta, df_is):
    # called in get_df_coint
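    # Half-life of mean reversion of the spread z between stockX and stockY,
    # from an AR(1) regression of z changes on lagged z (with intercept).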
    z_array = get_z(stockX, stockY, beta, df_is)

    z_lag = np.roll(z_array, 1)
    z_lag[0] = 0
    z_ret = z_array - z_lag

    # adds intercept terms to X for regression
    z_lag2 = add_constant(z_lag)
    model = OLS(z_ret, z_lag2)
    res = model.fit()

    return int(-np.log(2) / res.params[1])
Example #14
def alpha_analysis(y,
                   x,
                   parameters,
                   name_parameters,
                   latex_name_parameters,
                   name_fig,
                   CI=True):
    alphas = []
    pvalues = []
    rsquared_adj = []
    s = []
    for k in range(0, y.shape[0]):
        model = OLS(endog=y[k], exog=x)  # no intercept by default
        fitted = model.fit()
        alphas.append(*fitted.params)
        pvalues.append(*fitted.pvalues)
        rsquared_adj.append(fitted.rsquared_adj)
        s.append(fitted.HC0_se[0])  # robust (HC0) standard error

    df = DataFrame({
        name_parameters: parameters,
        'alpha': alphas,
        'p-value': pvalues,
        'R_{adj}': rsquared_adj
    })
    df = df[[name_parameters, 'alpha', 'R_{adj}', 'p-value']]
    # latex_table = df.to_latex(index=False)

    alphas = array(alphas)
    s = array(s)
    fig = figure()

    plot(parameters, alphas, 'blue', label=r'$\alpha$')
    if CI:
        CI_up = alphas + t.ppf(0.975, len(x) - 1) * s
        CI_low = alphas - t.ppf(0.975, len(x) - 1) * s
        plot(parameters, CI_low, color='red')
        plot(parameters, CI_up, color='red', label='95% CI')
    legend()
    xlabel(latex_name_parameters, fontsize=14)
    ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    ylabel(r'$\alpha$', fontsize=14)
    grid(True, linestyle='--')
    xlim(min(parameters), max(parameters))
    fig.savefig('pictures/cva/' + name_fig + '.png')

    return df
Example #15
def fit_efficiency_model(
    p_in, p_out, p_in_density, efficiency, use_monthly_dummies=False, use_time=False
):
    # local import to suppress warning in unit tests, see:
    # https://github.com/statsmodels/statsmodels/issues/7139
    from statsmodels.api import OLS
    from statsmodels.tools.tools import add_constant

    X = pd.DataFrame(
        {
            "p_in_density": p_in_density,
        }
    )

    if use_time:
        # not really time, just a sequentially increasing number
        X["time"] = range(len(X))

    if use_monthly_dummies:
        # we add a constant below, so we have to drop one month
        X = X.join(pd.get_dummies(p_in_density.time.dt.month, drop_first=True))

    # other possible parameters:
    #  - specific power
    #  - turbine age

    X = add_constant(X)
    Y = efficiency.values

    model = OLS(Y, X)
    fit_result = model.fit()

    efficiency_without_pin = (
        fit_result.params.const
        + fit_result.params.p_in_density * p_in_density.mean().values
        + fit_result.resid
    )

    if use_time:
        efficiency_without_pin += fit_result.params.time * X["time"]

    # note: this might be broken if lengths of p_in and p_out do not match up
    assert len(p_in) == len(efficiency_without_pin), "input lengths do not match"
    efficiency_without_pin = xr.ones_like(p_in) * efficiency_without_pin

    return fit_result, efficiency_without_pin
Example #16
def fit(xyz, xlim=None, ylim=None, zlim=None, **kwargs):
    # Fit a plane z ~ a*x + b*y + c to the points that fall inside the given
    # axis limits, with robust (HC1) standard errors on the coefficients.
    all_true = numpy.ones_like(xyz[:, 0], dtype=bool) \
               if None in [xlim, ylim, zlim] \
               else None
    xbool = numpy.abs(xyz[:, 0]) < xlim if xlim is not None else all_true
    ybool = numpy.abs(xyz[:, 1]) < ylim if ylim is not None else all_true
    zbool = numpy.abs(xyz[:, 2]) < zlim if zlim is not None else all_true
    bools = numpy.logical_and(numpy.logical_and(xbool, ybool), zbool)
    XYZ = xyz[bools, :]
    XY = add_constant(XYZ[:, :2], prepend=False)
    Z = XYZ[:, -1]
    model = OLS(Z, XY)
    result = model.fit()
    coeffs = result.params
    stderr = result.HC1_se

    return coeffs, stderr
Example #17
def linear(data, **kwargs):
    '''linear regression model fitted with ordinary least squares
    
    Parameters
    ----------
    data : array or dataframe
        first column is endogenous, second column is
        a column of ones, the rest are exogenous data

    ** Keyword Arguments **

    prior_type : str
        'uniform' or 'collinear adjusted dilution'
    
    Returns
    -------
    rslts : array
        1-d array of parameter coefficients
    '''

    prior_type = kwargs.get('prior_type', 'uniform')

    endog = data[:, [0]]
    exog = data[:, 1:]

    model = OLS(endog=endog, exog=exog, missing='drop')

    adj = (np.cov(np.hstack((model.wexog, endog)), rowvar=0)[:-1, -1]/ \
            np.var(endog)).reshape((-1, 1))

    fit = model.fit()

    par_rsquared = fit.params.reshape((-1, 1)) * adj

    if prior_type == 'uniform':
        prior = 1.
    elif prior_type == 'collinear adjusted dilution':
        prior = collinear_adj_prior(exog)
    else:
        raise ValueError('prior {} not supported'.format(prior_type))

    posterior = math.exp(fit.llf) * prior

    return np.hstack((fit.nobs, posterior, fit.rsquared, fit.params,
                      fit.pvalues, fit.bse, par_rsquared.flat))
Example #18
class OLSRegressor(BaseRegressor):
    degree = Property(depends_on='_degree')
    _degree = Int
    constant = None

    def __degree_changed(self):
        self.calculate()

    def calculate(self):
        '''
            vander is equivalent to sm.add_constant(np.column_stack((x**n,..x**2,x**1)))
            vander(x,n+1)
        '''
        if not len(self.xs) or not len(self.ys):
            return

        if len(self.xs) != len(self.ys):
            return

        ys = asarray(self.ys)
        X = self._get_X()
        if X is not None:
            try:
                self._ols = OLS(ys, X)
                self._result = self._ols.fit()
            except Exception as e:
                print(e)
Example #20
 def est_via_dml(self, outcome_model=OLS(), treatment_model=OLS()):
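     """Double-ML-style estimator: residualize the outcome Y and treatment Z
     on covariates with the supplied models, then regress the outcome
     residuals on [Z residual, G * Z residual]; beta(g) = b0 + b1 * g, with
     HC0-robust standard errors."""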
     Y = np.zeros(self.data.n)
     Xc = np.zeros((self.data.n, self.data.covariate_dims))
     Z = np.zeros(self.data.n)
     G = np.zeros(self.data.n)
     Labels = np.zeros(self.data.n)
     size_max = max(list(self.data.data_by_size.keys()))
     idx = 0
     for k, v in self.data.data_by_size.items():
         y, z, g, xc, labels = v
         Y[idx:idx + len(y)] = y
         Xc[idx:idx + len(y)] = xc
         Z[idx:idx + len(y)] = z
         G[idx:idx + len(y)] = g
         Labels[idx:idx + len(y)] = labels
         idx += len(y)
     outcome_reg = outcome_model.fit(Xc, Y)
     treatment_reg = treatment_model.fit(Xc, Z)
     y_res = Y - outcome_reg.insample_predict()
     z_res = Z - treatment_reg.insample_predict()
     data = ClusterData(y_res, z_res,
                        np.zeros((self.data.n, self.data.X.shape[1])),
                        Labels, self.data.cluster_feature,
                        self.data.n_moments, False)
     z_g_res = np.zeros((self.data.n, 2))
     y_res = np.zeros(self.data.n)
     idx = 0
     for k, v in data.data_by_size.items():
         y, z, g, xc, labels = v
         y_res[idx:idx + len(y)] = y
         z_g_res[idx:idx + len(y), 0] = z
         z_g_res[idx:idx + len(y), 1] = g * z
         idx += len(y)
     ols_model = LinearRegression(y_res, z_g_res)
     result = ols_model.fit()
     ret = {'beta(g)': np.zeros(size_max), 'se': np.zeros(size_max)}
     cov_HC0 = result.cov_HC0
     for g in range(size_max):
         ret['beta(g)'][g] = result.params[0] + result.params[1] * g
         test_arr = np.array([1, g])
         ret['se'][g] = np.sqrt(test_arr.dot(cov_HC0[:2, :2]).dot(test_arr))
     return ret
Example #21
 def est_via_ols(self):
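     """OLS estimate of size-dependent effects: regress pooled outcomes on
     [Z, G * Z, covariates]; beta(g) = b0 + b1 * g, with HC0-robust SEs."""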
     y = np.zeros(self.data.n)
     regressor = np.zeros((self.data.n, 2 + self.data.covariate_dims))
     size_max = max(list(self.data.data_by_size.keys()))
     idx = 0
     for k, v in self.data.data_by_size.items():
         Y, Z, G, Xc, labels = v
         y[idx:idx + len(Y)] = Y
         regressor[idx:idx + len(Y), 0] = Z
         regressor[idx:idx + len(Y), 1] = G * Z
         regressor[idx:idx + len(Y), 2:] = Xc
         idx += len(Y)
     ols_model = LinearRegression(y, regressor)
     result = ols_model.fit()
     ret = {'beta(g)': np.zeros(size_max), 'se': np.zeros(size_max)}
     cov_HC0 = result.cov_HC0
     for g in range(size_max):
         ret['beta(g)'][g] = result.params[0] + result.params[1] * g
         test_arr = np.array([1, g])
         ret['se'][g] = np.sqrt(test_arr.dot(cov_HC0[:2, :2]).dot(test_arr))
     return ret
Example #22
def alpha_analysis_hull(y, x, bS, bV, name_dataframe=''):
    alphas = []
    pvalues = []
    rsquared_adj = []
    CI_up = []
    CI_low = []
    duplicate_b_S = []
    duplicate_b_V = []
    s = []
    for k in range(0, y.shape[0]):
        for l in range(0, y.shape[1]):
            model = OLS(endog=y[k, l], exog=x)  # no intercept by default
            fitted = model.fit()
            alphas.append(*fitted.params)
            pvalues.append(*fitted.pvalues)
            rsquared_adj.append(fitted.rsquared_adj)
            s.append(fitted.HC0_se[0])
            duplicate_b_S.append(bS[k])
            duplicate_b_V.append(bV[l])
    s = array(s)
    CI_up = (alphas + t.ppf(0.975, len(x) - 1) * s)
    CI_low = (alphas - t.ppf(0.975, len(x) - 1) * s)
    df = DataFrame({
        'b_S': duplicate_b_S,
        'b_V': duplicate_b_V,
        'alpha': alphas,
        'Standard Error': s,
        'p-value': pvalues,
        'R_{adj}': rsquared_adj,
        'CI95_up': CI_up,
        'CI95_low': CI_low
    })
    df = df[[
        'b_S', 'b_V', 'alpha', 'Standard Error', 'R_{adj}', 'p-value',
        'CI95_low', 'CI95_up'
    ]]
    if name_dataframe != '':
        save_dataframe(name_dataframe, df)
    return df
Example #23
def linear_regression(data):
    dict_linreg = {}

    #intercept value and coefficient calculation
    for k, v in data.items():
        mat_x = np.ones((len(v), 2))
        mat_x[:, 1] = np.arange(0, len(v))

        reg = OLS(v, mat_x)
        results = reg.fit()

        dict_linreg[k] = [results.params[1], results.params[0]]

    return dict_linreg
Example #24
  sum_of_squares = df['difference'].apply(square).sum()
  return(sum_of_squares)

x0 = [-20, .0008, 1.1]
estimator(x0)
optimize.minimize(estimator, x0, method='nelder-mead', options={'xtol': 1e-8, 'disp': True})

clf = linear_model.LinearRegression()
x = df[['AADT', 'L']].values
y = df['Crashes']
clf.fit(x, y)
clf.coef_
clf.intercept_

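# Cross-check the sklearn fit with statsmodels OLS (explicit intercept via add_constant)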
model = OLS(y, add_constant(x))
model_fit = model.fit()
model_fit.summary()

def estimator(x, row_in='Crashes'):
  estimated = lambda row: exp(x[0] + x[1] * row['AADT'] + x[2] * row['L'])
  df['estimated'] = df.apply(estimated, axis=1)
  #probability = lambda row: (row['estimated']**row[row_in] * exp(-row['estimated'])) / factorial(row[row_in])
  probability = lambda row: poisson.pmf(row[row_in], row['estimated'])
  df['probability'] = df.apply(probability, axis=1)
  product = df['probability'].product()
  return(-product)

x0 = [1.6, .0000026, .032]
estimator(x0)
optimize.minimize(estimator, x0, method='nelder-mead', options={'xtol': 1e-8, 'disp': True})
Example #25
confirmed = np.array([  # reconstructed opening line; 'confirmed' is used below
    7711,
    9692,
    11791,
    14380,
    17205,
    20438,
    24324,
    28018,
    31161,
    34546,
    37198,
])
x = np.arange(len(confirmed))
x = add_constant(x)
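# Exponential growth fitted as a log-linear model over the first 14 days:
# log(confirmed) ~ a + b*t, so predicted counts are exp(a + b*t)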
model = OLS(np.log(confirmed[:14]), x[:14])
result = model.fit()
result.summary()
plt.plot(
    np.exp(result.predict(x[:14])),
    label="Exponential fit prediction",
)
plt.plot(confirmed[:14], ".", label="Actual cases, CN")
plt.legend()
plt.xlabel("days")
plt.ylabel("number of cases")
plt.show()
world_population = 7763252653
days = 0
infected = confirmed[14]
while infected < world_population:
    days += 1
Example #26
def _fit_regression(X, y):
    lm = OLS(y, X)
    flm = lm.fit()
    logging.info(flm.summary())
    metadata = {'summary': str(flm.summary()), 'summary2': str(flm.summary2())}
    return format_output(flm), metadata
Example #27
def m(x, nu=0, gamma=1):
    return (0.5 * math.pi) * np.sinh(gamma) / (np.cosh(gamma) - np.cos(x - nu))


###Create data
X = np.random.multivariate_normal(np.ones(k), sigma, size=[
    N,
])
U = np.random.standard_normal(size=[
    500,
])
V = np.random.standard_normal(size=[
    500,
])
D = m(np.dot(X, b)) + V
Y = np.dot(theta, D) + g(np.dot(X, b)) + U
OLS_model = OLS(Y, D)
result = OLS_model.fit()

###Naive double machine learning
naiveMl1 = RandomForestRegressor()  # X -> Y
naiveMl1.fit(X, Y)
Vhat1 = Y - naiveMl1.predict(X)

naiveMl2 = RandomForestRegressor()  # X -> D
naiveMl2.fit(X, D)
Vhat2 = D - naiveMl2.predict(X)

# Naive DML estimate of theta (no sample splitting / cross-fitting)
print(np.mean(np.dot(Vhat1, Vhat2)) / np.mean(np.dot(Vhat2, D)))
Example #28
 def _model(self, X, y):
     model = OLS(y, X)
     result = model.fit()
     print(result.summary())
     return result
Example #29
def linregress(df, X, y):
    dfX = _items(df, X)
    dfy = _value(df, y)
    model = OLS(dfy, dfX)
    result = model.fit()
    return result.summary()
Example #30
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.api import OLS


def fit_linear(df, columns):
    '''
    Parameters:
    df: DataFrame with the target assumed to be saleprice

    columns: list of columns used as predictors for saleprice

    ----------------------------
    Returns:
    Prints the summary and returns the fitted OLS model

    '''

    y = df.saleprice
    X = df[columns]
    X = sm.add_constant(X)
    lr = OLS(y, X)
    model = lr.fit()
    
    print(model.summary())
    
    return model
Example #31
    # here the columns of R are the different observations
    R = dailyret.iloc[t - lookback + 1:t + 1].T
    hasData = np.where(R.notna().all(axis=1))[0]
    R.dropna(inplace=True)  # avoid any stocks with missing returns
    avgR = R.mean(axis=1)
    R = R.values - avgR.values.reshape((R.shape[0], 1))  # subtract mean from returns
    covR = pd.DataFrame(R.T).cov()  # covariance matrix, with observations in rows
    # B holds the variances of factor returns, X the factor exposures matrix
    B, X = eig(covR)
    X = X[:, 0:numFactors]  # retain only numFactors
    model = OLS(R[:, -1], X)
    results = model.fit()
    b = results.params  # b are the factor returns for time period t-1 to t
    # Rexp is the expected return for next period assuming factor returns remain constant
    Rexp = avgR + np.dot(X, b)
    idxSort = Rexp.argsort()

    positionsTable[t, hasData[idxSort.values[np.arange(0, topN)]]] = -1
    positionsTable[t, hasData[idxSort.values[np.arange(-topN, 0)]]] = 1

capital = np.nansum(np.array(abs(positionsTable).shift()), axis=1)
positionsTable[capital == 0, ] = 0
capital[capital == 0] = 1
ret = np.nansum(
    np.array(pd.DataFrame(positionsTable).shift()) * np.array(dailyret),
    axis=1) / capital
Example #32
class CrossSectionalModelLinear(CrossSectionalModelBase):
    def __init__(self, jsonPath=None, paraDict={}):
        self.parameter = paraDict
        if jsonPath is not None:
            with open(jsonPath, 'r') as f:
                self.parameter = json.load(f)
        self.fit_intercept = self.parameter.get('fit_intercept', True)
        self.model = None

    def fit(self, X_train, y_train):
        if self.fit_intercept:
            X_train = sm.add_constant(X_train)
        self.model = OLS(y_train, X_train)
        self.res = self.model.fit()
        return self.res

    def predict(self, X):
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.res.predict(X)

    def get_para(self):
        if self.parameter != {}:
            return pd.DataFrame.from_dict(self.parameter,
                                          orient='index',
                                          columns=['ParaValue'])
        else:
            print('Hyper parameters are default')

    def get_model(self):
        try:
            return self.res
        except AttributeError:
            print('fit your model first!')
            return None

    def get_score(self, y_real, **kwargs):
        '''
        get score of the prediction based on the scoreMethod
        
        ----
            
            y: y_real
            kwargs:
                scoreMethod: str
                        'r2': r2_score
                        'mse': mean_squared_error
                        'mae': mean_absolute_error
                X: ndarray, input X to get y_pred
                y_pred: input y_pred directly
        '''
        if 'y_pred' in kwargs.keys():
            y_pred = kwargs['y_pred']
        elif 'X' in kwargs.keys():
            y_pred = self.res.predict(kwargs['X'])
        else:
            raise ValueError("provide either 'y_pred' or 'X'")

        def r2(y_real, y_pred):
            return r2_score(y_real, y_pred)

        def mse(y_real, y_pred):
            return mean_squared_error(y_real, y_pred)

        def mae(y_real, y_pred):
            return mean_absolute_error(y_real, y_pred)

        methodDict = {'r2': r2, 'mse': mse, 'mae': mae}
        scoreMethod = kwargs.get('scoreMethod', 'r2')
        scoreMethod = methodDict[scoreMethod]
        return scoreMethod(y_real, y_pred)

    def get_coef(self):
        '''
        get estimated coefficients for the linear regression problem
        '''
        return self.res.params

    def get_model_summary(self):
        '''
        get summary of the model
        
        return
        ----
        summary of model: coef, pvalue, t-statistics, R2, R2_adj...
        '''
        return self.res.summary()
Example #33
        if compo_counter == 3:
            y_train[line_counter][0] = data[first_key][second_key]
            line_counter += 1
            compo_counter = 0

        else:
            x_train[line_counter][compo_counter] = data[first_key][second_key]
            compo_counter += 1

## create and summarize the model
model = OLS(y_train, x_train)
results = model.fit()
#print(results.summary())

## predict the answer
pred = results.predict(x_train)

## calculate the per-line absolute error
line_counter = 0
while 1:
    data = json_data["data"]
    if len(data) <= line_counter: break

    loss[line_counter][0] = abs(y_train[line_counter][0] -
                                int(pred[line_counter]))
    line_counter += 1
Example #34
sectors = X.iloc[:, -10:]
X = (X.drop(sectors.columns, axis=1)
     .groupby(level='ticker')
     .transform(lambda x: (x - x.mean()) / x.std())
    .join(sectors)
    .fillna(0))


# ### 1-Day Returns



target = 'target_1d'
model = OLS(endog=y[target], exog=add_constant(X))
trained_model = model.fit()
print(trained_model.summary())


# ### 5-Day Returns



target = 'target_5d'
model = OLS(endog=y[target], exog=add_constant(X))
trained_model = model.fit()
print(trained_model.summary())


# #### Obtain the residuals
Example #35
 def fit_ols(y, x, idx=-1):
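     # Fit y on x with an intercept; return the selected coefficient
     # (default: the last) and its variance from cov_params().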
     ols = OLS(y, add_constant(x))
     results = ols.fit()
     return results.params.values[idx], results.cov_params().values[idx, idx]
Example #36
def linregress_loose(X, y, *args, **kwargs):
    X = list(zip(*(_series(x) for x in X)))
    y = _series(y)
    model = OLS(y, X)
    result = model.fit(*args, **kwargs)
    return result.summary()
Example #37
import statsmodels.api as sm

ols = sm.OLS(timevncats, sm.add_constant(X))
results = ols.fit()

nclients = Clientes.shape[0]

# Extrapolate to the full client count (the fit appears quadratic in n), in hours
predtime = (results.predict([1, nclients, nclients**2]) / 60 / 60)[0]

print('Full data set should take %i hours' % int(predtime))