Example #1
from math import sqrt
from sklearn.linear_model import LassoLars
from sklearn.metrics import mean_squared_error

def lasso_lars(X_tr, y_tr, X_v, y_v, X_te, y_te, **kwargs):
    '''
    Run a LassoLars model on train, validate, and test data,
    with the option to pass keyword arguments through to the model.
    '''
    # create lasso lars model
    lars = LassoLars(**kwargs)
    # fit the model to train data
    lars.fit(X_tr, y_tr)
    
    # predict on the train data
    lars_pred = lars.predict(X_tr)
    # calculate the rmse on the train data    
    lars_rmse = sqrt(mean_squared_error(y_tr, lars_pred))
    
    # predict the popularity on the validate data
    lars_pred_v = lars.predict(X_v)
    # calculate the rmse on the validate data
    lars_rmse_v = sqrt(mean_squared_error(y_v, lars_pred_v))
    
    # predict the popularity on the test data
    lars_pred_t = lars.predict(X_te)
    # calculate the rmse on the test data
    lars_rmse_t = sqrt(mean_squared_error(y_te, lars_pred_t))
    # print the train rmse
    print('RMSE for LASSO + LARS \n')
    print('On train data:\n', round(lars_rmse, 6), '\n') 
    return lars_rmse, lars_rmse_v, lars_rmse_t
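# Usage sketch (not part of the original example): exercises lasso_lars() on a
# synthetic regression problem; the data, split sizes, and alpha=0.1 below are
# illustrative assumptions.
import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(300, 5)
y = X @ np.array([1.0, 0.5, 0.0, 0.0, 2.0]) + 0.1 * np.random.rand(300)
X_tr, X_rest, y_tr, y_rest = train_test_split(X, y, test_size=0.4, random_state=42)
X_v, X_te, y_v, y_te = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)
rmse_tr, rmse_v, rmse_te = lasso_lars(X_tr, y_tr, X_v, y_v, X_te, y_te, alpha=0.1)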
Example #2
def _lassolars(*,
               train,
               test,
               x_predict=None,
               metrics,
               alpha=1.0,
               fit_intercept=True,
               verbose=False,
               normalize=True,
               precompute='auto',
               max_iter=500,
               eps=2.220446049250313e-16,
               copy_X=True,
               fit_path=True,
               positive=False,
               jitter=None,
               random_state=None):
    """For more info visit : 
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html#sklearn.linear_model.LassoLars
    """

    model = LassoLars(alpha=alpha,
                      fit_intercept=fit_intercept,
                      verbose=verbose,
                      normalize=normalize,
                      precompute=precompute,
                      max_iter=max_iter,
                      eps=eps,
                      copy_X=copy_X,
                      fit_path=fit_path,
                      positive=positive,
                      jitter=jitter,
                      random_state=random_state)
    model.fit(train[0], train[1])
    model_name = 'LassoLars'
    y_hat = model.predict(test[0])

    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    elif metrics == 'mae':
        accuracy = _mae(test[1], y_hat)
    else:
        raise ValueError("metrics must be 'mse', 'rmse', or 'mae'")

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
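# Usage sketch (not part of the original module): _rmse below is a stand-in for
# the private metric helper the snippet calls but does not show, and the call
# assumes a scikit-learn release in which LassoLars still accepts the
# normalize parameter used above.
import numpy as np
from sklearn.linear_model import LassoLars
from sklearn.metrics import mean_squared_error

def _rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

X = np.random.rand(200, 4)
y = X @ np.array([2.0, 0.0, 1.0, 0.0])
name, accuracy, _ = _lassolars(train=(X[:150], y[:150]),
                               test=(X[150:], y[150:]),
                               metrics='rmse')
print(name, accuracy)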
Example #3
class LassoLarsPrim(primitive):
    def __init__(self, random_state=0):
        super(LassoLarsPrim, self).__init__(name='LassoLars')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = "LassoLars is a lasso model implemented using the LARS algorithm, and unlike the implementation based on coordinate descent, this yields the exact solution, which is piecewise linear as a function of the norm of its coefficients."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = LassoLars(alpha=0.1)
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'], columns=[self.name+"Pred"])
        final_output = {0: output}
        return final_output
Example #4
def LassoRegression(X_train, X_test, y_train, y_test):
    regr = LassoLars(alpha=0.1)
    print(len(X_train.values.tolist()[0]))
    print(len(X_train.values.tolist()))
    regr.fit(X_train.values.tolist(), y_train.values.tolist())
    predictions = regr.predict(X_test)
    return predictions
def Lasso(x_train, y_train, x_test, y_test):
    estimator = LassoLars()
    estimator.fit(x_train, y_train)
    y_pred = estimator.predict(x_test)
    mse_score = mse(y_test, y_pred)
    print("mse_score: " + str(mse_score))
    r2_score = r2(y_test, y_pred)
    print("r2_score: " + str(r2_score))
def lasso_regression(args):
    start = time.time()
    with open(args.trainfile) as f:
        train = np.genfromtxt(f, delimiter=',')
    x = train[:, :-1]

    fe = FeatureEngineering(30, 2, 400)
    x = fe.fit_transform(x)
    x = np.column_stack((np.ones(x.shape[0], ), x))
    y = train[:, -1]

    kf = KFold(n_splits=10)
    kf.get_n_splits(x)
    ls = [0.003]
    min_error = float('inf')
    min_l = None

    for l in ls:
        error_sum = 0
        for train_index, test_index in kf.split(x):
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

            reg = LassoLars(alpha=l)
            reg.fit(x_train, y_train)
            error = (np.linalg.norm(reg.predict(x_test) - y_test)**
                     2) / (2 * x_test.shape[0])
            error_sum += error
        if error_sum < min_error:
            min_error = error_sum
            min_l = l

    reg = LassoLars(alpha=min_l)
    reg.fit(x, y)
    w = reg.coef_
    error = (np.linalg.norm(reg.predict(x) - y)**2) / (2 * x.shape[0])
    print('Lambda: ', min_l, '. Error: ', error)

    with open(args.testfile) as f:
        test = np.genfromtxt(f, delimiter=',')

    x = fe.transform(test)
    x = np.column_stack((np.ones(test.shape[0], ), x))
    predictions = x @ w
    np.savetxt(args.outputfile, predictions)
    print('Time: ', time.time() - start)
Example #7
def predict_LarsLasso(X, y, train, test, alpha=0.1):
    # Fit
    lars = LassoLars(alpha)
    lars.fit(X.iloc[train], y.iloc[train])

    # Predict
    prediction = lars.predict(X.iloc[test])
    return prediction
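# Usage sketch (not part of the original example): predict_LarsLasso expects
# pandas objects plus positional index arrays, e.g. the splits produced by
# KFold; the data below is an illustrative assumption.
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LassoLars

X = pd.DataFrame(np.random.rand(100, 4))
y = pd.Series(X.values @ np.array([1.0, 0.0, 2.0, 0.0]))
for train_idx, test_idx in KFold(n_splits=5).split(X):
    prediction = predict_LarsLasso(X, y, train_idx, test_idx, alpha=0.05)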
def LassoLarsTest(dataMat, labelMat):
    clf1 = LassoLars(alpha=1, max_iter=100)
    clf1.fit(dataMat[0:99], labelMat[0:99])
    labelTest1 = clf1.predict(dataMat[100:199])
    print('LassoLars ', ((labelTest1 - labelMat[100:199])**2).sum())
    clf2 = LassoLarsCV(max_n_alphas=10, max_iter=100)
    clf2.fit(dataMat[0:99], labelMat[0:99])
    labelTest2 = clf2.predict(dataMat[100:199])
    print('LassoLarsCV', ((labelTest2 - labelMat[100:199])**2).sum())
Example #9
def lasso_lars(X, y):
    # train the model
    lars = LassoLars(alpha=0.1).fit(X, y)

    lars_pred = lars.predict(X)

    lars_rmse = sqrt(mean_squared_error(y, lars_pred))
    return lars_rmse
Example #10
def ll_validate_test(X, y, X_vt, y_vt):
    # train the model
    lars = LassoLars(alpha=0.1).fit(X, y)

    #validate model
    lars_pred_v = lars.predict(X_vt)

    lars_rmse_v = sqrt(mean_squared_error(y_vt, lars_pred_v))
    return lars_rmse_v
Example #11
class in_lassoLars(regression):
    def trainAlgo(self):
        self.model = LassoLars(alpha=self.param['alpha'],
                               normalize=self.param['normalize'],
                               fit_intercept=self.param['fit_intercept'],
                               max_iter=self.param['max_iter'],
                               positive=self.param['positive'])
        self.model.fit(self.inputData['X'], self.outputData['Y'])

    def predictAlgo(self):
        self.result['Y'] = self.model.predict(self.inputData['X'])
Example #12
def lasso_lars_test(x_scaleddf, target, X_test, y_test):
    '''
    runs Lasso Lars algorithm
    '''
    # Make a model
    lars = LassoLars(alpha=1)
    # Fit a model
    lars.fit(x_scaleddf, target)
    # Make Predictions
    lars_pred = lars.predict(X_test)
    # calculate MAE
    lars_MAE = mean_absolute_error(y_test, lars_pred)
    return lars_MAE, lars, lars_pred
Example #13
    def fit_model_11(self,toWrite=False):
        model = LassoLars(alpha=1,max_iter=5000)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict(X_test)
            print("Model 11 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # pickle requires a binary-mode file handle
            with open('model11/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
def lasso_lars(x_scaleddf, target):
    '''
    runs Lasso Lars algorithm
    '''
    # Make a model
    lars = LassoLars(alpha=1)
    # Fit a model
    lars.fit(x_scaleddf, target)
    # Make Predictions
    lars_pred = lars.predict(x_scaleddf)
    # Compute root mean squared error
    lars_rmse = sqrt(mean_squared_error(target, lars_pred))
    return lars_rmse
Example #15
class _LassoLarsImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
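# Usage sketch (not part of the original snippet): Op is never defined above;
# in wrappers of this style it is typically the wrapped scikit-learn estimator,
# so binding it to LassoLars here is an assumption.
import numpy as np
from sklearn.linear_model import LassoLars as Op

X = np.random.rand(100, 3)
y = X @ np.array([1.0, 0.0, 0.5])
impl = _LassoLarsImpl(alpha=0.01)
print(impl.fit(X, y).predict(X[:5]))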
Example #16
class DkuLassoLarsRegressor(BaseEstimator):
    def __init__(self, max_var=0):
        self.max_var = max_var
        self.lars = None
        self.X_offset = None
        self.y_offset = None
        self.X_scale = None
        self.coef_ = None
        self.current_index = None
        self.intercept_ = None
        self.coef_path_ = None

    def fit(self, X, y):
        # note: for now we perform rescaling. While this requires some more computation on our part, it has better
        # numerical stability (could test with or without)
        self.lars = LassoLars(alpha=0.0).fit(X, y)
        # we recreate the rescaling
        _, _, self.X_offset, self.y_offset, self.X_scale = self.lars._preprocess_data(
            X, y, True, True, True)
        # we normalize the coef path here
        self.coef_path_ = [x / self.X_scale for x in self.lars.coef_path_.T]
        self.coef_ = self.lars.coef_
        self.intercept_ = self.lars.intercept_
        self.alphas = self.lars.alphas_
        if self.max_var > 0:
            self._perform_cut(self.max_var)
        return self

    def _perform_cut(self, n):
        n = min(n, self.lars.coef_path_.shape[1] - 1)
        self.current_index = n
        # note: not normalized, this is normal since the _set_intercept will normalize it
        coef = self.lars.coef_path_[:, n]
        self.lars.coef_ = coef
        # recompute the intercept and normalize coefficients using scikit private method
        self.lars._set_intercept(self.X_offset, self.y_offset, self.X_scale)
        self.coef_ = self.lars.coef_

    def post_process(self, user_meta):
        if self.current_index is not None:
            n = self.current_index
        else:
            n = self.max_var
        n = user_meta.get("lars_cut", n)
        if n > 0:
            self._perform_cut(n)

    def predict(self, X):
        return self.lars.predict(X)
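# Usage sketch (not part of the original class): fit on synthetic data, then
# rely on max_var to cut the LARS path to at most 3 active coefficients. The
# class leans on scikit-learn private helpers (_preprocess_data,
# _set_intercept), so this assumes an older release where those signatures
# still match.
import numpy as np
from sklearn.linear_model import LassoLars

X = np.random.rand(150, 8)
y = X @ np.array([3.0, 0.0, 1.5, 0.0, 0.0, 2.0, 0.0, 0.5])
reg = DkuLassoLarsRegressor(max_var=3).fit(X, y)
print(reg.predict(X[:5]))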
Example #17
def linear_regressor(x, target, causes):
    """ Regression and prediction using a lasso

    :param x: data
    :param target: target - effect
    :param causes: causes of the causal mechanism
    :return: regenerated data with the fitted model
    """

    if len(causes) == 0:
        x = np.random.normal(size=(target.shape[0], 1))

    lasso = LassoLars(alpha=1.)  # alpha=1.0 applies L1 regularization (alpha=0 would mean no regularization)
    lasso.fit(x, target)

    return lasso.predict(x)
Example #18
def KFoldValidationLasso(X, Y, lam, k=10):
    loss = 0
    for i in range(k):
        start = math.floor(X.shape[0] * i / k)
        end = math.floor(X.shape[0] * (i + 1) / k)

        x = np.r_[X[:start, :], X[end:, :]]
        y = np.r_[Y[:start], Y[end:]]
        vx = X[start:end, :]
        vy = Y[start:end]

        model = LassoLars(alpha=lam)
        model.fit(x, y)
        vyhat = model.predict(vx)

        loss += (np.linalg.norm(vy - vyhat, ord=2) /
                 np.linalg.norm(vy, ord=2))**2
    loss /= k
    return loss
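# Usage sketch (not part of the original example): sweep a few regularization
# strengths with the 10-fold helper above and keep the lowest-loss value; the
# candidate list and synthetic data are illustrative assumptions.
import math
import numpy as np
from sklearn.linear_model import LassoLars

X = np.random.rand(200, 6)
Y = X @ np.array([1.0, 0.0, 0.0, 2.0, 0.0, 0.5])
best_lam = min([0.001, 0.01, 0.1, 1.0], key=lambda lam: KFoldValidationLasso(X, Y, lam))
print('best lambda:', best_lam)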
Example #19
def test_simple_vs_refined_algorithm(theta, fit_path):
    # Test the consistency of the results between the 2 versions of
    # the algorithm.

    # Simple Algorithm (2 steps of Lasso Lars)
    lasso1 = LassoLars(alpha=alpha)
    lasso1.fit(X_train, y_train)
    X1 = X_train.copy()
    X1[:, lasso1.coef_ == 0] = 0

    lasso2 = LassoLars(alpha=alpha*theta)
    lasso2.fit(X1, y_train)
    pred_simple = lasso2.predict(X_test)

    # Refined Algorithm
    relasso = RelaxedLassoLars(alpha=alpha, theta=theta, fit_path=fit_path)
    relasso.fit(X_train, y_train)
    pred_refined = relasso.predict(X_test)

    assert_array_almost_equal(pred_simple, pred_refined)
    assert_array_almost_equal(lasso2.coef_, relasso.coef_)
    assert_almost_equal(lasso2.score(X_test, y_test),
                        relasso.score(X_test, y_test),
                        decimal=2)
X_split = np.array_split(X, 10)
y_split = np.array_split(y, 10)
from sklearn.linear_model import LassoLars
l = [0.0, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100]
nmse = np.zeros(len(l))
for i in range(len(l)):
    alpha = l[i]
    regressor = LassoLars(alpha)
    for j in range(10):
        # hold out split j and train on the remaining nine splits
        temp = list(range(10))
        temp.remove(j)
        Xi_test = X_split[j]
        Xi = np.concatenate([X_split[k] for k in temp])
        yi_test = y_split[j]
        yi = np.concatenate([y_split[k] for k in temp])
        regressor.fit(Xi, yi)
        yi_pred = regressor.predict(Xi_test)
        nmse[i] += compute_error(yi_test, yi_pred)
# average the cross-validation error over the 10 folds
nmse = nmse / 10
# refit on all the data with the best alpha and write the predictions out
alpha = l[np.argmin(nmse)]
regressor = LassoLars(alpha)
regressor.fit(X, y)
y_pred = regressor.predict(X_test)
with open(sys.argv[4], "a") as f:
    for i in range(len(y_pred)):
        f.write(str(y_pred[i]) + '\n')
def lassoLars(X, y, value):
    regressor = LassoLars(alpha=0.3, max_iter=600000)
    regressor.fit(X, y)
    y_pred = regressor.predict(value)
    return y_pred
Example #22
# LassoLars Regression
# The Least Angle Regression (LARS) can be used as an alternative method for calculating Least Absolute Shrinkage
# and Selection Operator (LASSO) fit.
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LassoLars

# load the diabetes dataset
dataset = datasets.load_diabetes()

# fit a LASSO using LARS model to the data
model = LassoLars(alpha=0.1)
model.fit(dataset.data, dataset.target)
print(model)

# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)

# summarize the fit of the model
mse = np.mean((predicted - expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))
Example #23
def task2(data):

    df = data

    dfreg = df.loc[:, ['Adj Close', 'Volume']]
    dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    # Fill missing values with an out-of-range sentinel
    dfreg.fillna(value=-99999, inplace=True)
    # We want to separate 1 percent of the data to forecast
    forecast_out = int(math.ceil(0.01 * len(dfreg)))
    # Separating the label here, we want to predict the AdjClose
    forecast_col = 'Adj Close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(['label'], axis=1))
    # Scale the X so that everyone can have the same distribution for linear regression
    X = preprocessing.scale(X)
    # Separate the late X values (to forecast) from the early X values (for model generation and evaluation)
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]
    # Separate label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]

    #Split data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    ##################
    ##################
    ##################

    # Linear regression
    clfreg = LinearRegression(n_jobs=-1)
    clfreg.fit(X_train, y_train)
    # Quadratic Regression 2
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
    clfpoly2.fit(X_train, y_train)

    # Quadratic Regression 3
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
    clfpoly3.fit(X_train, y_train)

    # KNN Regression
    clfknn = KNeighborsRegressor(n_neighbors=2)
    clfknn.fit(X_train, y_train)

    # Lasso Regression
    clflas = Lasso()
    clflas.fit(X_train, y_train)

    # Multitask Lasso Regression
    # clfmtl = MultiTaskLasso(alpha=1.)
    # clfmtl.fit(X_train, y_train).coef_

    # Bayesian Ridge Regression
    clfbyr = BayesianRidge()
    clfbyr.fit(X_train, y_train)

    # Lasso LARS Regression
    clflar = LassoLars(alpha=.1)
    clflar.fit(X_train, y_train)

    # Orthogonal Matching Pursuit Regression
    clfomp = OrthogonalMatchingPursuit(n_nonzero_coefs=2)
    clfomp.fit(X_train, y_train)

    # Automatic Relevance Determination Regression
    clfard = ARDRegression(compute_score=True)
    clfard.fit(X_train, y_train)

    # Logistic Regression
    # clflgr = linear_model.LogisticRegression(penalty='l1', solver='saga', tol=1e-6, max_iter=int(1e6), warm_start=True)
    # coefs_ = []
    # for c in cs:
    #   clflgr.set_params(C=c)
    #   clflgr.fit(X_train, y_train)
    #   coefs_.append(clflgr.coef_.ravel().copy())

    clfsgd = SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)
    clfsgd.fit(X_train, y_train)

    ##################
    ##################
    ##################

    #Create confidence scores
    confidencereg = clfreg.score(X_test, y_test)
    confidencepoly2 = clfpoly2.score(X_test, y_test)
    confidencepoly3 = clfpoly3.score(X_test, y_test)
    confidenceknn = clfknn.score(X_test, y_test)
    confidencelas = clflas.score(X_test, y_test)
    # confidencemtl = clfmtl.score(X_test, y_test)
    confidencebyr = clfbyr.score(X_test, y_test)
    confidencelar = clflar.score(X_test, y_test)
    confidenceomp = clfomp.score(X_test, y_test)
    confidenceard = clfard.score(X_test, y_test)
    confidencesgd = clfsgd.score(X_test, y_test)

    # results
    print('The linear regression confidence is:', confidencereg * 100)
    print('The quadratic regression 2 confidence is:', confidencepoly2 * 100)
    print('The quadratic regression 3 confidence is:', confidencepoly3 * 100)
    print('The knn regression confidence is:', confidenceknn * 100)
    print('The lasso regression confidence is:', confidencelas * 100)
    # print('The lasso regression confidence is:',confidencemtl*100)
    print('The Bayesian Ridge regression confidence is:', confidencebyr * 100)
    print('The Lasso LARS regression confidence is:', confidencelar * 100)
    print('The OMP regression confidence is:', confidenceomp * 100)
    print('The ARD regression confidence is:', confidenceard * 100)
    print('The SGD regression confidence is:', confidencesgd * 100)

    #Create new columns
    forecast_reg = clfreg.predict(X_lately)
    forecast_pol2 = clfpoly2.predict(X_lately)
    forecast_pol3 = clfpoly3.predict(X_lately)
    forecast_knn = clfknn.predict(X_lately)
    forecast_las = clflas.predict(X_lately)
    forecast_byr = clfbyr.predict(X_lately)
    forecast_lar = clflar.predict(X_lately)
    forecast_omp = clfomp.predict(X_lately)
    forecast_ard = clfard.predict(X_lately)
    forecast_sgd = clfsgd.predict(X_lately)

    #Process all new columns data
    dfreg['Forecast_reg'] = np.nan

    last_date = dfreg.iloc[-1].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in forecast_reg:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))]
        dfreg['Forecast_reg'].loc[next_date] = i

    # the remaining forecast columns all share the same date logic (the rows
    # were already appended by the Forecast_reg loop above), so fill them in
    # one loop; iloc[-26] is the original hard-coded offset back past the
    # appended forecast rows
    other_forecasts = {
        'Forecast_pol2': forecast_pol2,
        'Forecast_pol3': forecast_pol3,
        'Forecast_knn': forecast_knn,
        'Forecast_las': forecast_las,
        'Forecast_byr': forecast_byr,
        'Forecast_lar': forecast_lar,
        'Forecast_omp': forecast_omp,
        'Forecast_ard': forecast_ard,
        'Forecast_sgd': forecast_sgd,
    }
    for col, forecast in other_forecasts.items():
        dfreg[col] = np.nan
        last_date = dfreg.iloc[-26].name
        next_unix = last_date + datetime.timedelta(days=1)
        for i in forecast:
            next_date = next_unix
            next_unix += datetime.timedelta(days=1)
            dfreg[col].loc[next_date] = i

    dates = dfreg.index.format(formatter=lambda x: x.strftime('%Y-%m-%d'))
    forecast_cols = ['Forecast_reg', 'Forecast_pol2', 'Forecast_pol3',
                     'Forecast_knn', 'Forecast_las', 'Forecast_byr',
                     'Forecast_lar', 'Forecast_omp', 'Forecast_ard',
                     'Forecast_sgd']
    return (dates, dfreg['Adj Close'].to_list(),
            *(dfreg[col].to_list() for col in forecast_cols))
    rank_result['Lars_pca'] = sumsum / float(result_row)
    rs_score['Lars_pca'] = r2_score(y_test, y)
    LarsModel = Lars()
    LarsModel.fit(X_train_std, y_train)
    y = LarsModel.predict(X_test_std)
    [result_row] = y.shape
    sumsum = 0
    #print y
    for i in range(result_row):
        sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
    rank_result['Lars_std'] = sumsum / float(result_row)
    rs_score['Lars_std'] = r2_score(y_test, y)

    LassoLarsModel = LassoLars()
    LassoLarsModel.fit(X_train_pca, y_train)
    y = LassoLarsModel.predict(X_test_pca)
    [result_row] = y.shape
    sumsum = 0
    #print y
    for i in range(result_row):
        sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
    rank_result['LassoLars_pca'] = sumsum / float(result_row)
    rs_score['LassoLars_pca'] = r2_score(y_test, y)
    LassoLarsModel = LassoLars()
    LassoLarsModel.fit(X_train_std, y_train)
    y = LassoLarsModel.predict(X_test_std)
    [result_row] = y.shape
    sumsum = 0
    #print y
    for i in range(result_row):
        sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
    rank_result['LassoLars_std'] = sumsum / float(result_row)
    rs_score['LassoLars_std'] = r2_score(y_test, y)
Example #25
def all_models_info():
    '''takes in data,
    sets a baseline,
    computes SSE, MSE, and RMSE,
    and returns info for all 4 models'''
    # get data
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    # pull from add_to_train
    train = evaluate.add_to_train()
    X_train, y_train, X_validate, y_validate, X_test, y_test = evaluate.xtrain_xval_xtest()
    #OLS Model
    lm = LinearRegression(normalize=True)
    lm.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_lm'] = lm.predict(X_train)
    rmse_train_lm = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lm)**(1 / 2)
    y_validate['appraised_value_pred_lm'] = lm.predict(X_validate)
    rmse_validate_lm = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_lm)**(1 / 2)
    #LARS Model
    lars = LassoLars(alpha=1.0)
    lars.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_lars'] = lars.predict(X_train)
    rmse_train_lars = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lars)**(1 / 2)
    y_validate['appraised_value_pred_lars'] = lars.predict(X_validate)
    rmse_validate_lars = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_lars)**(1 / 2)
    #GLM
    glm = TweedieRegressor(power=1, alpha=0)
    glm.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_glm'] = glm.predict(X_train)
    rmse_train_glm = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_glm)**(1 / 2)
    y_validate['appraised_value_pred_glm'] = glm.predict(X_validate)
    rmse_validate_glm = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_glm)**(1 / 2)
    # PF
    pf = PolynomialFeatures(degree=2)
    X_train_degree2 = pf.fit_transform(X_train)
    X_validate_degree2 = pf.transform(X_validate)
    X_test_degree2 = pf.transform(X_test)
    # LM2
    lm2 = LinearRegression(normalize=True)
    lm2.fit(X_train_degree2, y_train.appraised_value)
    y_train['appraised_value_pred_lm2'] = lm2.predict(X_train_degree2)
    rmse_train_lm2 = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lm2)**(1 / 2)
    y_validate['appraised_value_pred_lm2'] = lm2.predict(X_validate_degree2)
    rmse_validate_lm2 = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_lm2)**(1 / 2)
    print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ",
          rmse_train_lm, "\nValidation/Out-of-Sample: ", rmse_validate_lm)
    print("--------------------------------------------------------------")
    print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train_lars,
          "\nValidation/Out-of-Sample: ", rmse_validate_lars)
    print("--------------------------------------------------------------")
    print(
        "RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ",
        rmse_train_glm, "\nValidation/Out-of-Sample: ", rmse_validate_glm)
    print("--------------------------------------------------------------")
    print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ",
          rmse_train_lm2, "\nValidation/Out-of-Sample: ", rmse_validate_lm2)
Example #26
lars_alpha, lars_err

##
max_iter = 50000

lasso_model = Lasso(alpha=alpha_list[0], max_iter=max_iter).fit(trainX, trainY)
elasticNet_model = ElasticNet(alpha=alpha_list[1],
                              max_iter=max_iter).fit(trainX, trainY)
ridge_model = Ridge(alpha=alpha_list[2], max_iter=max_iter).fit(trainX, trainY)
lars_model = LassoLars(alpha=alpha_list[3],
                       max_iter=max_iter).fit(trainX, trainY)

lasso_pred = np.expm1(lasso_model.predict(raw_test_df))
ridge_pred = np.expm1(ridge_model.predict(raw_test_df))
elasticNet_pred = np.expm1(elasticNet_model.predict(raw_test_df))
lars_pred = np.expm1(lars_model.predict(raw_test_df))
pred_list = np.array(
    [lasso_pred, ridge_pred, elasticNet_pred, lars_pred, xgb_pred])

# take a weighted average of the 5 models, weighting each by its inverse error
err_list.append(xgb_err)
err_list = np.array(err_list)

w_list = 1 / err_list
total_w = np.sum(w_list)
predictions = np.matmul(w_list / total_w, pred_list)

# xgb_w, lasso_w, elas_w, ridge_w, lars_w = 1/xgb_err, 1/lasso_err, 1/elas_err, 1/ridge_err, 1/lars_err
# total_w = xgb_w + lasso_w + elas_w + ridge_w + lars_w
# predictions = lasso_w/total_w*lasso_pred + ridge_w/total_w*ridge_pred + \
#   elas_w/total_w*elasticNet_pred + xgb_w/total_w*xgb_pred +
Example #27
    # print (MSELasso(y_test,pred.reshape((pred.size,1))))
    vals = [0.0000001, 0.0001, 1, 10]
    errors = np.empty(4)

    for j in range(4):

        lm = vals[j]
        k = 4
        err = np.empty(k)
        l = int(np.ma.size(x_train, axis=0) / k)
        x_cv, x_tr = np.split(x_train.copy(), [l], axis=0)
        y_cv, y_tr = np.split(y_train.copy(), [l], axis=0)
        model = LassoLars(alpha=lm)
        model.fit(x_tr, y_tr.ravel())
        pred = model.predict(x_cv)
        err[0] = MSELasso(y_cv, pred.reshape((pred.size, 1)))

        for i in range(k - 1):
            x_tr[i * l:(i + 1) * l], x_cv = x_cv, x_tr[i * l:(i + 1) *
                                                       l].copy()
            y_tr[i * l:(i + 1) * l], y_cv = y_cv, y_tr[i * l:(i + 1) *
                                                       l].copy()
            model = LassoLars(alpha=lm)
            model.fit(x_tr, y_tr.ravel())
            pred = model.predict(x_cv)
            err[i + 1] = MSELasso(y_cv, pred.reshape((pred.size, 1)))

        errors[j] = np.mean(err)

    x_tr = np.concatenate((x_train, np.square(x_train), np.power(x_train, 3)),
                          axis=1)
Example #28
# Linear Regression

linear_reg = LinearRegression()
linear_reg.fit(X_train, Y_train)
Y_pred = linear_reg.predict(X_test)
linear_r2 = r2_score(Y_expected, Y_pred)
linear_mse = mean_squared_error(Y_expected, Y_pred)
print("Linear Regression\n", "R2: ", linear_r2, "MSE:", linear_mse)
plot_prediction("Linear Regression", Y_pred, test['close'])

# Lasso Lars

lassolars_reg = LassoLars()
lassolars_reg.fit(X_train, Y_train)
Y_pred = lassolars_reg.predict(X_test)
lassolars_r2 = r2_score(Y_expected, Y_pred)
lassolars_mse = mean_squared_error(Y_expected, Y_pred)
print("Lasso Lars Regression\n", "R2: ", lassolars_r2, "MSE:", lassolars_mse)
plot_prediction("Lasso Lars Regression", Y_pred, test['close'])

# Theil Sen Regressor

theil_reg = TheilSenRegressor()
theil_reg.fit(X_train, Y_train)
Y_pred = theil_reg.predict(X_test)
theil_r2 = r2_score(Y_expected, Y_pred)
theil_mse = mean_squared_error(Y_expected, Y_pred)
print("Theil Sen Regression\n", "R2: ", theil_r2, "MSE:", theil_mse)
plot_prediction("Theil Sen Regression", Y_pred, test['close'])
Example #29
train = pd.read_csv("train/subtrain.csv", chunksize=100000, iterator=True)
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    chunk = chunk.join(
        pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace=True)
    Xcat = fh.transform(np.asarray(chunk.astype(str)))
    clf.fit(Xcat, y_train)

# Create a submission file
usecols = cols + ["id"]
X_test = pd.read_csv("test/mtest.csv", usecols=usecols)
X_test = X_test.join(
    pd.DataFrame([dayhour(x) for x in X_test.hour], columns=["wd", "hr"]))
X_test.drop(["hour"], axis=1, inplace=True)

X_enc_test = fh.transform(np.asarray(X_test.astype(str)))

y_act = pd.read_csv("test/mtest.csv", usecols=['click'])
y_pred = clf.predict(X_enc_test)

with open('logloss.txt', 'a') as f:
    f.write('\n' + str(log_loss(y_act, y_pred)))

with open("submission/submission_elnet.csv", "w") as f:
    f.write("id,click\n")
    for idx, xid in enumerate(X_test.id):
        f.write(str(xid) + "," + "{0:.10f}".format(y_pred[idx]) + "\n")
f.close()
Example #30
from sklearn import linear_model
lassy = linear_model.Lasso(alpha=0.001)
lassy.fit(X_train_scaled, y_train)
y_pred = lassy.predict(X_test_scaled)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 


# In[19]:


laslars = LassoLars(alpha=0.0001)
laslars.fit(X_train_scaled, y_train)
y_pred = laslars.predict(X_test_scaled)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 


# In[20]:


from sklearn.linear_model import ElasticNet
elastic = ElasticNet(random_state=0)
elastic.fit(X_train_scaled, y_train)
y_pred = elastic.predict(X_test_scaled)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 
Example #31
def dt_process(df2,option_slctd):

    df = df2.copy()         #work with a local copy
    opted_country = option_slctd  # 'Brazil'  # input("Select the country - ")
    print(opted_country)
    dt_one_country = df[df["location"] == opted_country][['date', 'new_cases']] #work the predictions only for the column 'new_cases' in the rest of code
    dt_one_country['new_cases'] = dt_one_country['new_cases'].fillna(0)
    dt_one_country['date'] = pd.to_datetime(dt_one_country['date'])
    dt_one_country['Days Since'] = dt_one_country['date'] - dt_one_country['date'].min()
    dt_one_country['Days Since'] = dt_one_country['Days Since'].dt.days     #use the days since the starting date of records of this country, use this as the known variable to make the prediction

    train_ml = dt_one_country.iloc[:int(dt_one_country.shape[0] * 0.95)]    #First 95% dates used for fitting the regressor
    valid_ml = dt_one_country.iloc[int(dt_one_country.shape[0] * 0.95):]    #last 5% dates to be predicted and compared to validation data of these dates

    fitinput_x = np.array(train_ml["Days Since"]).reshape(-1, 1)            #regressors expect 2-D arrays, hence the reshape; Days Since is the known x data
    fitinput_y = np.array(train_ml["new_cases"]).reshape(-1, 1)             #new_cases is the y data used to fit the regressor

    # linreg = LinearRegression(normalize=True)                              #use this Linear Regressor model 'lin_reg' to fit and predict
    Larspd = LassoLars(alpha=.1)
    Larspd.fit(fitinput_x, fitinput_y)                                     #fitting the regressor

    x_pred = np.array(valid_ml["Days Since"]).reshape(-1, 1)
    y_pred = Larspd.predict(x_pred)                                        #predicting using regressor for the 5% days

    model_scores = []
    model_scores.append(np.sqrt(mean_squared_error(valid_ml["new_cases"], y_pred)))
    # lin_reg.score(x_pred,valid_ml['new_cases'])
    # print(np.sqrt(mean_squared_error(valid_ml["new_cases"], y_pred)))

    # plt.figure(figsize=(11, 6))
    prediction_linreg = Larspd.predict(np.array(dt_one_country["Days Since"]).reshape(-1, 1))      #use this as predictor for all the days, to understand the fitting line
    linreg_output = []
    # print("i am predicting ")
    for i in range(prediction_linreg.shape[0]):
        linreg_output.append(prediction_linreg[i])
    # print("i am before figure ")
    fig_LarsReg = go.Figure()     #this figure handle is returned so the plot can be drawn outside of this function
    #shows the original recorded data for all the days
    fig_LarsReg.add_trace(go.Scatter(x=dt_one_country['date'], y=dt_one_country["new_cases"],
                                       mode='lines+markers', name="Train Data for new Cases"))
    #shows the predicted data for all the days
    fig_LarsReg.add_trace(go.Scatter(x=valid_ml['date'], y=y_pred,
                                       mode='lines', name="Lars Regression Best Fit Line",
                                       line=dict(color='red', dash='dot')))
    # fig_LarsReg.add_trace(go.Scatter(x=dt_one_country['date'], y=linreg_output,
    #                                    mode='lines', name="Linear Regression Best Fit Line",
    #                                    line=dict(color='black', dash='dot')))
    fig_LarsReg.add_vline(x=valid_ml['date'].iloc[0], line_dash="dash")  # ,#add vertical line on the date to know the SPLIT between training and test data
    fig_LarsReg.update_layout(title="new Cases Lars Regression Prediction " + str(opted_country),
                                xaxis_title="Date", yaxis_title="new Cases", legend=dict(x=0, y=1, traceorder="normal"))
    # fig_LarsReg.show()

    poly = PolynomialFeatures(degree=8)                 #Polynomial regressor initiate the model
    train_poly = poly.fit_transform(fitinput_x)         #PolynomialFeatures is fit and applied here to expand the inputs into polynomial terms

    fitin_valid = np.array(valid_ml["Days Since"]).reshape(-1, 1)
    valid_poly = poly.fit_transform(fitin_valid)
    y_train_to_compare = train_ml['new_cases']

    lin_reg = LinearRegression(normalize=True)
    lin_reg.fit(train_poly, y_train_to_compare)

    prediction_poly = lin_reg.predict(valid_poly)
    lin_reg.score(valid_poly, valid_ml['new_cases'].values)
    # print(np.sqrt(mean_squared_error(valid_ml["new_cases"], prediction_poly)))
    model_scores.append(np.sqrt(mean_squared_error(valid_ml["new_cases"], prediction_poly)))        #use this score to compare predictors and to know how close the predicted data is with the real known data
    additional_30days = np.linspace(1, 30, 30)      #predict additionally for 30days not in record, to know how the curve progresses
    pred_input_compiled_data = []
    pred_input_compiled_data = np.array(dt_one_country["Days Since"]).reshape(-1, 1)
    pred_input_compiled_data = np.append(pred_input_compiled_data, pred_input_compiled_data[-1] + additional_30days)

    # add_pred_dates = pd.DataFrame(columns=['date'])
    add_pred_dates = dt_one_country['date']

    for i in range(1, 31):
        add_pred_dates = add_pred_dates.append(add_pred_dates.iloc[-1:] + timedelta(days=1), ignore_index=True)  #increment the days count for the 30added days using datetime class

    # comp_data=poly.fit_transform(np.array(dt_one_country["Days Since"]).reshape(-1,1))
    comp_data = poly.fit_transform(pred_input_compiled_data.reshape(-1, 1))
    # plt.figure(figsize=(11, 6))
    predictions_poly = lin_reg.predict(comp_data)

    fig_PolyReg = go.Figure()       #returning this handle to show figure outside the function
    fig_PolyReg.add_trace(go.Scatter(x=dt_one_country['date'], y=dt_one_country["new_cases"],
                                     mode='lines+markers', name="Train Data for new Cases in " + str(opted_country)))
    # fig.add_trace(go.Scatter(x=dt_one_country['date'], y=predictions_poly,
    fig_PolyReg.add_trace(go.Scatter(x=add_pred_dates, y=predictions_poly,
                                     mode='lines', name="Polynomial Regression Best Fit",
                                     line=dict(color='red', dash='dot')))
    fig_PolyReg.add_vline(x=valid_ml['date'].iloc[0], line_dash="dash")  # ,#add vertical line on the date to know the SPLIT between training and test data
    fig_PolyReg.update_layout(title="new Cases Polynomial Regression Prediction",
                              xaxis_title="Date", yaxis_title="new Cases",
                              legend=dict(x=0, y=1, traceorder="normal"))
    # fig_PolyReg.show()

    # train_ml=dt_one_country.iloc[:int(dt_one_country.shape[0]*0.95)]
    # valid_ml=dt_one_country.iloc[int(dt_one_country.shape[0]*0.95):]

    model_train = dt_one_country.iloc[:int(dt_one_country.shape[0] * 0.95)]
    valid = dt_one_country.iloc[int(dt_one_country.shape[0] * 0.95):]
    y_pred = valid.copy()

    #there is no x,y data for fitting using Holts model --- just pass the known data, that is new_cases for the known days
    holt = Holt(np.asarray(model_train["new_cases"])).fit(smoothing_level=0.9, smoothing_trend=0.4, optimized=False)    #Holt model, smoothing parameters can be varied to observe behavior
    y_pred["Holt"] = holt.forecast(len(valid))      #how many data to predict
    # y_holt_pred["Holt"]=holt.forecast(len(valid)+30)
    # print(np.sqrt(mean_squared_error(y_pred["new_cases"], y_pred["Holt"])))
    model_scores.append(np.sqrt(mean_squared_error(y_pred["new_cases"], y_pred["Holt"])))

    fig_Holt = go.Figure()
    fig_Holt.add_trace(go.Scatter(x=model_train['date'], y=model_train["new_cases"],
                                  mode='lines+markers', name="Train Data for new Cases " + str(opted_country)))
    fig_Holt.add_trace(go.Scatter(x=valid['date'], y=valid["new_cases"],
                                  mode='lines+markers', name="Validation Data for new Cases " + str(opted_country)))
    fig_Holt.add_vline(x=valid['date'].iloc[0], line_dash="dash")  # ,#add vertical line on the date to know the SPLIT between training and test data
    fig_Holt.add_trace(go.Scatter(x=valid['date'], y=y_pred["Holt"],
                                  mode='lines+markers', name="Prediction of new Cases " + str(opted_country)))
    fig_Holt.update_layout(title="new Cases Holt's Linear Model Prediction",
                           xaxis_title="Date", yaxis_title="new Cases", legend=dict(x=0, y=1, traceorder="normal"))
    # fig_Holt.show()

    # the following is the log-linear predictor, not currently shown in any figure
    x_train = train_ml['Days Since']
    y_train_1 = train_ml['new_cases']
    y_train_1 = y_train_1.astype('float64')
    y_train_1 = y_train_1.apply(lambda x: np.log1p(x))      #first take logarithm of data and then use Linear predictor
    y_train_1.replace([np.inf, -np.inf], 0, inplace=True)
    x_test = valid_ml['Days Since']
    y_test = valid_ml['new_cases']
    # y_test = y_test.astype('float64')
    # y_test = y_test.apply(lambda x: np.log1p(x))
    # y_test.replace([np.inf, -np.inf], 0, inplace=True)

    regr = LinearRegression(normalize=True)
    regr.fit(np.array(x_train).reshape(-1, 1), np.array(y_train_1).reshape(-1, 1))

    ypred = regr.predict(np.array(x_test).reshape(-1, 1))
    # print(np.sqrt(mean_squared_error(y_test, np.expm1(ypred))))

    # # Plot results
    # fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    #
    # ax1.plot(valid_ml['date'], np.expm1(ypred))
    # ax1.plot(dt_one_country['date'], dt_one_country['new_cases'])
    # ax1.axvline(valid_ml['date'].iloc[0], linewidth=2, ls=':', color='grey', alpha=0.5)
    # ax1.legend(['Predicted cases', 'Actual cases', 'Train-test split'], loc='upper left')
    # ax1.set_xlabel("Day count ")
    # ax1.set_ylabel("new Cases")
    #
    # ax2.plot(valid_ml['date'], ypred)
    # ax2.plot(dt_one_country['date'], np.log1p(dt_one_country['new_cases']))
    # ax2.axvline(valid_ml['date'].iloc[0], linewidth=2, ls=':', color='grey', alpha=0.5)
    # ax2.legend(['Predicted cases', 'Actual cases', 'Train-test split'], loc='upper left')
    # ax2.set_xlabel("Day count ")
    # ax2.set_ylabel("Logarithm new Cases")
    #
    # plt.suptitle(("newCases predictions based on Log-Lineal Regression for " + opted_country))

    # The following is the lagged linear prediction; its performance does not match the quoted source, so this code likely has an issue whose cause has not yet been found
    train_days = int(dt_one_country.shape[0] * 0.95)
    test_days = dt_one_country['Days Since'].iloc[-1] - train_days
    lag_size = 30       #Lagged method as shown in the quoted website, keep lagged records (as columns) of 'lag_size' of new_cases
    lagpred_data_features = dt_one_country.copy()       #work with local copy, needed to do store inplace the predicted out and to compare with reference
    lagpred_data_features = calculate_lag(lagpred_data_features, range(1, lag_size), 'new_cases')       #update the new_cases_1,new_cases_2 etc columns

    filter_col_new_cases = [col for col in lagpred_data_features if col.startswith('new_cases')]        #use the additional lagging columns named as new_cases_1,new_cases_2, etc new_cases_29
    lagpred_data_features[filter_col_new_cases] = lagpred_data_features[filter_col_new_cases].apply(
        lambda x: np.log1p(x))              #Linear prediction with logarithm data
    lagpred_data_features.replace([np.inf, -np.inf], 0, inplace=True)
    lagpred_data_features.fillna(0, inplace=True)

    start_fcst = 1 + lagpred_data_features['Days Since'].iloc[train_days]  # prediction day 1
    end_fcst = lagpred_data_features['Days Since'].iloc[-1]  # last prediction day

    for d in list(range(start_fcst, end_fcst + 1)):             #do day by day fitting and prediction for each of the prediction days
        X_train, Y_train_1, X_test = split_data_one_day(lagpred_data_features, d)       #generate training and testing data for each day
        model_1, pred_1 = lin_reg_lag(X_train, Y_train_1, X_test)           #fit and predict for the day
        lagpred_data_features.new_cases.iloc[d] = pred_1                    #add the prediction data to the records

        # Recompute lags
        lagpred_data_features = calculate_lag(lagpred_data_features, range(1, lag_size), 'new_cases')   #update the new_cases_1,new_cases_2 etc columns

        lagpred_data_features.replace([np.inf, -np.inf], 0, inplace=True)
        lagpred_data_features.fillna(0, inplace=True)

        # print("Process for ", country_name, "finished in ", round(time.time() - ts, 2), " seconds")

    predicted_data = lagpred_data_features.new_cases
    real_data = dt_one_country.new_cases
    # dates_list_num = list(range(0,len(dates_list)))
    dates_list_num = dt_one_country['date']
    # Plot results
    model_scores.append(np.sqrt(mean_squared_error(real_data.iloc[train_days:], np.expm1(predicted_data.iloc[train_days:]))))
    fig_LagPred = go.Figure()
    fig_LagPred.add_trace(go.Scatter(x=dates_list_num, y=np.expm1(predicted_data),
                                     mode='lines+markers', name="Prediction new Cases " + str(opted_country)))
    fig_LagPred.add_trace(go.Scatter(x=dates_list_num, y=real_data,
                                     mode='lines+markers', name="Validation Data for new Cases " + str(opted_country)))
    fig_LagPred.add_vline(x=dates_list_num.iloc[start_fcst], line_dash="dash")  # ,
    # annotation=dict())#, annotation_position="top right")
    # fig_LagPred.add_trace(go.Scatter(x=valid['date'], y=y_pred["Holt"],
    #                               mode='lines+markers', name="Prediction of new Cases " + str(opted_country)))
    fig_LagPred.update_layout(title="new Cases Linear Lagged Model Prediction",
                              xaxis_title="Date", yaxis_title="new Cases", legend=dict(x=0, y=1, traceorder="normal"))

    # fig_LagPred.show()

    # fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,6))
    #
    # ax1.plot(dates_list_num, np.expm1(predicted_data))
    # ax1.plot(dates_list_num, real_data)
    # ax1.axvline(dates_list_num.iloc[start_fcst], linewidth=2, ls = ':', color='grey', alpha=0.5)
    # ax1.legend(['Predicted cases', 'Actual cases', 'Train-test split'], loc='upper left')
    # ax1.set_xlabel("Day count ")
    # ax1.set_ylabel("new Cases")
    #
    # ax2.plot(dates_list_num, predicted_data)
    # ax2.plot(dates_list_num, np.log1p(real_data))
    # ax2.axvline(dates_list_num.iloc[start_fcst], linewidth=2, ls = ':', color='grey', alpha=0.5)
    # ax2.legend(['Predicted cases', 'Actual cases', 'Train-test split'], loc='upper left')
    # ax2.set_xlabel("Day count ")
    # ax2.set_ylabel("Log new Cases")

    # plt.suptitle(("ConfirmedCases predictions based on Log-Lineal Regression for "+country_name))
    model_names = ["Lasso Lars Regression", "Polynomial Regression", "Holt's Linear Prediction", "Linear Regression Lagged Model"]      #use these scores to compare predictors
    model_summary = pd.DataFrame(zip(model_names, model_scores),
                                 columns=["Model Name", "Root Mean Squared Error"]).sort_values(
        ["Root Mean Squared Error"])
    print(model_summary)
    return fig_LarsReg, fig_PolyReg, fig_Holt, fig_LagPred
Example #33
def ProcessData(df,vect1,vect2,builder):
    descriptionmatrix = vect1.transform([str(x) for x in df['titledescription'].values])
    locationmatrix = vect2.transform([str(x) for x in df['locationfull'].values])
    # x = build_design_matrices([builder], df, return_type='dataframe', NA_action=NAAction(on_NA='drop', NA_types=[]))
    y = df['SalaryNormalized'].values
    #x_combo = np.hstack([np.asarray(x[0]),descriptionmatrix.toarray(),locationmatrix.toarray()])
    x_combo = np.hstack([descriptionmatrix.toarray(),locationmatrix.toarray()])
    return (np.asarray(y), sparse.coo_matrix(x_combo))

train = PreProcess(pd.read_csv('train.csv'))
(vect1,vect2,builder) = InitializeTransformers(train)
(y, x) = ProcessData(train, vect1, vect2,builder)

(y_test, x_test) = ProcessData(PreProcess(pd.read_csv('solution.csv')),vect1,vect2,builder)

lasso = Lasso()
lasso.fit(x,y)
y_pred = lasso.predict(x_test)

lassolars = LassoLars(alpha=2)
lassolars.fit(x.toarray(),y)
lars_pred = lassolars.predict(x_test)

print(np.sqrt(mean_squared_error(y_test, y_pred)))

print(r2_score(y_test, y_pred))

print(np.sqrt(mean_squared_error(y_test, lars_pred)))

print(r2_score(y_test, lars_pred))