Пример #1
0
def test_eval_measures():
    """Regression checks for the eval-measure helpers on a fixed 4x5 input.

    ``x`` holds the integers 0..19 arranged in four rows of five and ``y``
    is an all-ones array of the same shape.  Each measure is compared with
    hard-coded expected values for the default call and for ``axis=1``
    (plus the extra axis variants the suite has always covered).
    """
    x = np.arange(20).reshape(4, 5)
    y = np.ones((4, 5))

    def exact(measure, wanted, **axis_kw):
        # Exact comparison of measure(x, y, ...) with the stored value.
        assert_equal(measure(x, y, **axis_kw), wanted)

    def close(measure, wanted, **axis_kw):
        # Same as ``exact`` but tolerant of floating point rounding.
        assert_almost_equal(measure(x, y, **axis_kw), wanted)

    exact(iqr, 5 * np.ones(5))
    exact(iqr, 2 * np.ones(4), axis=1)
    exact(iqr, 9, axis=None)

    exact(mse, np.array([73.5, 87.5, 103.5, 121.5, 141.5]))
    exact(mse, np.array([3., 38., 123., 258.]), axis=1)

    close(rmse, np.array([8.5732141, 9.35414347, 10.17349497,
                          11.02270384, 11.89537725]))
    close(rmse, np.array([1.73205081, 6.164414, 11.09053651, 16.0623784]),
          axis=1)

    exact(maxabs, np.array([14., 15., 16., 17., 18.]))
    exact(maxabs, np.array([3., 8., 13., 18.]), axis=1)

    exact(meanabs, np.array([7., 7.5, 8.5, 9.5, 10.5]))
    exact(meanabs, np.array([1.4, 6., 11., 16.]), axis=1)
    exact(meanabs, np.array([7., 7.5, 8.5, 9.5, 10.5]), axis=0)

    exact(medianabs, np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    exact(medianabs, np.array([1., 6., 11., 16.]), axis=1)

    exact(bias, np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    exact(bias, np.array([1., 6., 11., 16.]), axis=1)

    exact(medianbias, np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    exact(medianbias, np.array([1., 6., 11., 16.]), axis=1)

    exact(vare, np.array([31.25, 31.25, 31.25, 31.25, 31.25]))
    exact(vare, np.array([2., 2., 2., 2.]), axis=1)
Пример #2
0
def test_eval_measures():
    """Table-driven regression test for the eval-measure functions.

    Every row of ``cases`` names the comparison routine, the measure under
    test, the keyword arguments forwarded to it and the expected result for
    ``x = arange(20).reshape(4, 5)`` against an all-ones ``y``.
    """
    x = np.arange(20).reshape(4, 5)
    y = np.ones((4, 5))

    default = {}
    by_row = {'axis': 1}

    half_steps = np.array([6.5, 7.5, 8.5, 9.5, 10.5])
    row_offsets = np.array([1., 6., 11., 16.])

    cases = [
        (assert_equal, iqr, default, 5 * np.ones(5)),
        (assert_equal, iqr, by_row, 2 * np.ones(4)),
        (assert_equal, iqr, {'axis': None}, 9),

        (assert_equal, mse, default,
         np.array([73.5, 87.5, 103.5, 121.5, 141.5])),
        (assert_equal, mse, by_row, np.array([3., 38., 123., 258.])),

        (assert_almost_equal, rmse, default,
         np.array([8.5732141, 9.35414347, 10.17349497,
                   11.02270384, 11.89537725])),
        (assert_almost_equal, rmse, by_row,
         np.array([1.73205081, 6.164414, 11.09053651, 16.0623784])),

        (assert_equal, maxabs, default,
         np.array([14., 15., 16., 17., 18.])),
        (assert_equal, maxabs, by_row, np.array([3., 8., 13., 18.])),

        (assert_equal, meanabs, default,
         np.array([7., 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, meanabs, by_row, np.array([1.4, 6., 11., 16.])),
        (assert_equal, meanabs, {'axis': 0},
         np.array([7., 7.5, 8.5, 9.5, 10.5])),

        (assert_equal, medianabs, default, half_steps),
        (assert_equal, medianabs, by_row, row_offsets),

        (assert_equal, bias, default, half_steps),
        (assert_equal, bias, by_row, row_offsets),

        (assert_equal, medianbias, default, half_steps),
        (assert_equal, medianbias, by_row, row_offsets),

        (assert_equal, vare, default,
         np.array([31.25, 31.25, 31.25, 31.25, 31.25])),
        (assert_equal, vare, by_row, np.array([2., 2., 2., 2.])),
    ]

    # Run the checks in the listed order so the first failure is reported.
    for compare, measure, kwargs, expected in cases:
        compare(measure(x, y, **kwargs), expected)
def test_eval_measures():
    """Regression test for the eval measures, including ``rmspe``.

    Inputs are ``x = arange(20).reshape(4, 5)`` and an all-ones ``y``.  The
    ``rmspe`` expectation is rebuilt by hand from the relative error, once
    with the zero-valued entry of ``x`` treated as NaN and once as 0.
    """
    x = np.arange(20).reshape(4, 5)
    y = np.ones((4, 5))

    cols_half = np.array([6.5, 7.5, 8.5, 9.5, 10.5])
    rows_step = np.array([1.0, 6.0, 11.0, 16.0])

    # --- interquartile range -------------------------------------------
    assert_equal(iqr(x, y), 5 * np.ones(5))
    assert_equal(iqr(x, y, axis=1), 2 * np.ones(4))
    assert_equal(iqr(x, y, axis=None), 9)

    # --- squared-error measures ----------------------------------------
    mse_cols = np.array([73.5, 87.5, 103.5, 121.5, 141.5])
    mse_rows = np.array([3.0, 38.0, 123.0, 258.0])
    assert_equal(mse(x, y), mse_cols)
    assert_equal(mse(x, y, axis=1), mse_rows)

    rmse_cols = np.array(
        [8.5732141, 9.35414347, 10.17349497, 11.02270384, 11.89537725]
    )
    rmse_rows = np.array([1.73205081, 6.164414, 11.09053651, 16.0623784])
    assert_almost_equal(rmse(x, y), rmse_cols)
    assert_almost_equal(rmse(x, y, axis=1), rmse_rows)

    # --- percentage error ----------------------------------------------
    # Relative error is undefined where x == 0; mark it NaN first, then 0.
    rel_err = x - y
    nonzero = x != 0
    rel_err[nonzero] /= x[nonzero]
    rel_err[x == 0] = np.nan
    expected = np.sqrt(np.nanmean(rel_err ** 2, axis=0) * 100)
    assert_almost_equal(rmspe(x, y), expected)
    rel_err[np.isnan(rel_err)] = 0.0
    expected = np.sqrt(np.nanmean(rel_err ** 2, axis=0) * 100)
    assert_almost_equal(rmspe(x, y, zeros=0), expected)

    # --- absolute-error measures ---------------------------------------
    assert_equal(maxabs(x, y), np.array([14.0, 15.0, 16.0, 17.0, 18.0]))
    assert_equal(maxabs(x, y, axis=1), np.array([3.0, 8.0, 13.0, 18.0]))

    meanabs_cols = np.array([7.0, 7.5, 8.5, 9.5, 10.5])
    assert_equal(meanabs(x, y), meanabs_cols)
    assert_equal(meanabs(x, y, axis=1), np.array([1.4, 6.0, 11.0, 16.0]))
    assert_equal(meanabs(x, y, axis=0), meanabs_cols)

    assert_equal(medianabs(x, y), cols_half)
    assert_equal(medianabs(x, y, axis=1), rows_step)

    # --- bias measures -------------------------------------------------
    assert_equal(bias(x, y), cols_half)
    assert_equal(bias(x, y, axis=1), rows_step)

    assert_equal(medianbias(x, y), cols_half)
    assert_equal(medianbias(x, y, axis=1), rows_step)

    # --- error variance ------------------------------------------------
    assert_equal(vare(x, y), np.array([31.25, 31.25, 31.25, 31.25, 31.25]))
    assert_equal(vare(x, y, axis=1), np.array([2.0, 2.0, 2.0, 2.0]))
Пример #4
0
def get_best_model(train, test, model_formula):
    """Pick the negative-binomial ``alpha`` by hold-out MAE, then refit.

    For each ``alpha`` in ``10**-8 .. 10**-4`` a GLM with a
    ``NegativeBinomial`` family is fitted on ``train`` and scored on
    ``test`` with the mean absolute error between the integer-truncated
    predictions and ``test.total_cases``.  The best ``alpha`` is then used
    to refit the model on ``train`` and ``test`` concatenated.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Data accepted by ``smf.glm``; ``test`` must expose ``total_cases``.
    model_formula : str
        Formula handed to ``smf.glm`` unchanged.

    Returns
    -------
    The fitted results object of the refit model.

    Raises
    ------
    ValueError
        If no candidate ``alpha`` produced a comparable (non-NaN) score.
    """
    grid = 10**np.arange(-8, -3, dtype=np.float64)

    # Start from +inf rather than an arbitrary ceiling so that a candidate is
    # always selected, even when every hold-out error is very large.
    best_alpha = None
    best_score = float('inf')

    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(formula=model_formula,
                        data=train,
                        family=sm.families.NegativeBinomial(alpha=alpha))

        results = model.fit()
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)

        if score < best_score:
            best_alpha = alpha
            best_score = score

    if best_alpha is None:
        # Only reachable when every score compared False (e.g. all NaN).
        raise ValueError('no alpha in the grid produced a valid score')

    print('best alpha = ', best_alpha)
    print('best score = ', best_score)

    # Step 3: refit on entire dataset
    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula,
                    data=full_dataset,
                    family=sm.families.NegativeBinomial(alpha=best_alpha))

    return model.fit()
Пример #5
0
    def negative_binomial_model_eval(self, x_train, x_test, y_train, y_test):
        """Return the negative-binomial ``alpha`` with the lowest test MAE.

        ``x_train`` / ``x_test`` are copied and the targets are attached as a
        ``total_cases`` column.  The GLM formula regresses the last column of
        the training copy on all preceding columns.  Each ``alpha`` in
        ``10**-8 .. 10**-4`` is fitted on the training copy and scored by the
        mean absolute error of the integer-truncated test predictions.

        :param x_train: training features (copied; assumed to be a DataFrame)
        :param x_test: test features (copied; assumed to be a DataFrame)
        :param y_train: training targets, stored as ``total_cases``
        :param y_test: test targets, stored as ``total_cases``
        :return: the best ``alpha``; an empty list if no candidate produced a
            comparable score (kept for backward compatibility)
        """
        train = x_train.copy()
        test = x_test.copy()
        train['total_cases'] = y_train
        test['total_cases'] = y_test

        # Step 1: specify the form of the model: "<last col> ~ c0 + c1 + ..."
        model_formula = (train.columns[-1] + ' ~ '
                         + " + ".join(train.columns[:-1]))

        grid = 10 ** np.arange(-8, -3, dtype=np.float64)

        best_alpha = []
        # +inf instead of a fixed ceiling: large errors must still be ranked.
        best_score = float('inf')

        # Step 2: Find the best hyper parameter, alpha
        for alpha in grid:
            model = smf.glm(formula=model_formula, data=train,
                            family=sm.families.NegativeBinomial(alpha=alpha))

            results = model.fit()
            predictions = results.predict(test).astype(int)
            score = eval_measures.meanabs(predictions, test.total_cases)

            if score < best_score:
                best_alpha = alpha
                best_score = score

        return best_alpha
Пример #6
0
def get_best_model(train, test):
    """Tune ``alpha`` for a fixed dengue formula and refit on all data.

    A negative-binomial GLM of ``total_cases`` on seven temperature and
    humidity columns is fitted on ``train`` for every ``alpha`` in
    ``10**-8 .. 10**-4``.  Candidates are ranked by the mean absolute error
    between integer-truncated predictions on ``test`` and
    ``test.total_cases``; the winner is refit on ``train`` + ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Must contain ``total_cases`` and every column named in the formula.

    Returns
    -------
    The fitted results object of the refit model.

    Raises
    ------
    ValueError
        If no candidate ``alpha`` produced a comparable (non-NaN) score.
    """
    # Step 1: specify the form of the model
    model_formula = ("total_cases ~ 1 + "
                     "reanalysis_specific_humidity_g_per_kg + "
                     "reanalysis_dew_point_temp_k + "
                     "reanalysis_min_air_temp_k + "
                     "station_min_temp_c + "
                     "station_max_temp_c + "
                     "station_avg_temp_c + "
                     "reanalysis_air_temp_k")

    grid = 10 ** np.arange(-8, -3, dtype=np.float64)

    # +inf / None instead of 1000 / []: a fixed ceiling would leave the
    # placeholder in place whenever every hold-out error exceeded it.
    best_alpha = None
    best_score = float('inf')

    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(formula=model_formula,
                        data=train,
                        family=sm.families.NegativeBinomial(alpha=alpha))

        results = model.fit()
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)

        if score < best_score:
            best_alpha = alpha
            best_score = score

    if best_alpha is None:
        # Only reachable when every score compared False (e.g. all NaN).
        raise ValueError('no alpha in the grid produced a valid score')

    print('Alpha = ', best_alpha)
    print('Score = ', best_score)

    # Step 3: refit on entire dataset
    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula,
                    data=full_dataset,
                    family=sm.families.NegativeBinomial(alpha=best_alpha))

    return model.fit()
Пример #7
0
def bestrandfrorest(train, test):
    """Grid-search a random forest over tree count and random seed.

    Both grids run over 2, 4, ..., 98.  Every combination is fitted on
    ``train`` and scored on ``test`` by the mean absolute error of the
    integer-truncated predictions.  A candidate replaces the incumbent when
    its error is below the incumbent's plus ``1e-6``.  The winning
    combination is fitted once more and returned with its error.

    Both frames must carry ``total_cases``, ``city`` and
    ``week_start_date``; the latter two are dropped from the features.

    Returns
    -------
    (model, mae) : the refitted ``RandomForestRegressor`` and its test MAE.
    """
    tree_counts = np.arange(2, 100, 2, dtype=np.float64)
    seeds = np.arange(2, 100, 2, dtype=np.float64)
    chosen_trees = []
    chosen_seed = []
    lowest_mae = 1000
    tolerance = 10 ** -6

    def _features_and_target(frame):
        # Work on a copy so the caller's frame is left untouched.
        features = frame.copy()
        target = features.total_cases
        for column in ('total_cases', 'city', 'week_start_date'):
            features.drop(column, axis=1, inplace=True)
        return features, target

    train_X, train_Y = _features_and_target(train)
    test_X, test_Y = _features_and_target(test)

    def _fit_and_score(trees, seed):
        forest = RandomForestRegressor(n_estimators=trees, random_state=seed)
        forest.fit(train_X, train_Y)
        predicted = forest.predict(test_X).astype(int)
        return forest, eval_measures.meanabs(predicted, test_Y)

    for n in tree_counts:
        for r in seeds:
            trees, seed = int(n), int(r)
            _, mae = _fit_and_score(trees, seed)
            if not (mae < lowest_mae + tolerance):
                continue
            lowest_mae = mae
            chosen_trees = trees
            chosen_seed = seed

        # Progress marker: one line per tree-count value.
        print(n)

    print(chosen_trees)
    print(chosen_seed)
    randForestModel, acc = _fit_and_score(chosen_trees, chosen_seed)
    return randForestModel, acc
Пример #8
0
def printErrors(test, pred, model):
    '''
    Print three error measures of a model's predictions.

    Inputs:
    test: observed values
    pred: predicted values
    model: name of the model, used as a label in the printed lines
    Outputs:
    Prints the mean absolute error, mean squared error and root mean
    squared error (each computed along axis 0, shown with 4 significant
    digits); nothing is returned.
    '''
    reports = (('MAE', meanabs), ('MSE', mse), ('RMSE', rmse))
    for label, measure in reports:
        heading = label + ' of ' + model
        value = measure(test, pred, axis=0)
        print(heading + ': ' + format(value, '.4'))
Пример #9
0
    def train_negative_binomial_model(self, x_train, y_train, test_size):
        '''Tune ``alpha`` on a hold-out split, then fit the final GLM.

        The data is split without shuffling; each ``alpha`` in
        ``10**-8 .. 10**-4`` is fitted on the first part and scored on the
        held-out part by the mean absolute error of the integer-truncated
        predictions.  The best ``alpha`` is used to refit on all rows.

        :param x_train: feature table; it is copied and a ``target`` column
            is appended (assumed to be a pandas DataFrame)
        :param y_train: target values matching ``x_train``
        :param test_size: forwarded to ``train_test_split``; share of the
            rows held out for evaluating each candidate ``alpha``
        :return: fitted negative binomial model (results of ``model.fit()``)
        :raises ValueError: if no ``alpha`` produced a comparable score
        '''
        x_train_train, x_train_test, y_train_train, y_train_test = train_test_split(
            x_train, y_train, shuffle=False, test_size=test_size)
        train = x_train_train.copy()
        test = x_train_test.copy()
        train['target'] = y_train_train
        test['target'] = y_train_test

        # Step 1: specify the form of the model: "target ~ c0 + c1 + ..."
        model_formula = (train.columns[-1] + ' ~ '
                         + " + ".join(train.columns[:-1]))

        grid = 10**np.arange(-8, -3, dtype=np.float64)
        best_alpha = None
        # +inf instead of a fixed ceiling: large errors must still be ranked.
        best_score = float('inf')
        # Step 2: Find the best hyper parameter, alpha
        for alpha in grid:
            model = smf.glm(formula=model_formula,
                            data=train,
                            family=sm.families.NegativeBinomial(alpha=alpha))
            results = model.fit()
            predictions = results.predict(test).astype(int)
            # The observed values live in the 'target' column added above.
            score = eval_measures.meanabs(predictions, test['target'])
            if score < best_score:
                best_alpha = alpha
                best_score = score

        if best_alpha is None:
            # Only reachable when every score compared False (e.g. all NaN).
            raise ValueError('no alpha in the grid produced a valid score')

        # Step 3: refit on the entire dataset.  The full table has the same
        # columns as the training split, so the formula is reused.
        data = x_train.copy()
        data['target'] = y_train
        model = smf.glm(formula=model_formula,
                        data=data,
                        family=sm.families.NegativeBinomial(alpha=best_alpha))
        fitted_model = model.fit()

        return fitted_model
Пример #10
0
def gradient_boosting(train_data, val_data):
    """Fit a gradient-boosting regressor for ``total_cases``.

    The regressor (800 trees, depth 5, learning rate 0.01, least-squares
    loss) is trained on every column of ``train_data`` except
    ``total_cases``.  Training and validation mean absolute errors are
    computed but not reported (the prints are disabled).

    Returns the fitted regressor.
    """
    target = 'total_cases'

    # NOTE(review): newer scikit-learn releases renamed loss 'ls' to
    # 'squared_error' -- confirm against the installed version.
    clf = ensemble.GradientBoostingRegressor(
        n_estimators=800,
        max_depth=5,
        min_samples_split=3,
        learning_rate=0.01,
        loss='ls',
    )

    train_label = train_data[target]
    train_feat = train_data.drop(target, axis=1)
    clf.fit(train_feat, train_label)
    train_mae = eval_measures.meanabs(clf.predict(train_feat), train_label)
    #print("Training MAE: %.4f" % train_mae)

    val_label = val_data[target]
    val_feat = val_data.drop(target, axis=1)
    val_mae = eval_measures.meanabs(clf.predict(val_feat), val_label)
    #print("Validation MAE: %.4f" % val_mae)

    return clf
 def evaluate(self):
     """
     Calculates the MAE between the predicted and test ratings.

     Walks ``self.test_set`` (user -> {movie_id: rating}) and pairs each
     test rating with the value found in the matrix returned by
     ``self.get_full_rating_matrix()``.  Users or movies with a missing or
     falsy prediction (``None``, ``0``, empty mapping) are skipped.

     Returns the result of ``meanabs(real_ratings, predicted_ratings)``.
     """
     predicted = self.get_full_rating_matrix()
     real_ratings = []
     predicted_ratings = []
     # ``items()`` instead of ``iteritems()``: the latter does not exist on
     # Python 3 dicts, while ``items()`` works on Python 2 and 3 alike.
     for user, movie_ratings in self.test_set.items():
         # The per-user lookup does not depend on the movie; do it once.
         predicted_user = predicted.get(user, None)
         if not predicted_user:
             continue
         for movie_id, rating in movie_ratings.items():
             predicted_rating = predicted_user.get(movie_id, None)
             if not predicted_rating:
                 continue
             predicted_ratings.append(predicted_rating)
             real_ratings.append(rating)
     return meanabs(real_ratings, predicted_ratings)
Пример #12
0
def eval_metrics(forecast, observed):
    '''Return forecast evaluation metrics.

    Parameters
    ----------
    forecast : pd.Series
        Forecasted values.
    observed : pd.Series
        Observed values.

    Return
    ------
    mae : float
        Mean Absolute Error metric.
    rmserr : float
        Root Mean Squared Error metric. Named rmserr to avoid
        conflicting with statsmodels rmse function.
    mape : float
        Mean of ``|forecast - observed| / observed``, multiplied by 100.
    '''
    mae = meanabs(forecast, observed)
    rmserr = rmse(forecast, observed)

    relative_error = (forecast - observed).abs() / observed
    mape = relative_error.mean() * 100

    return mae, rmserr, mape
Пример #13
0
def forecast_arima(df: pd.DataFrame, cols: list, with_graph: bool = True):
    """Fit ARIMA(0, 1, 0) per column and report hold-out forecast errors.

    For every column in ``cols`` the last ``steps`` (50) observations are
    held out, the model is fitted on the rest, and a ``steps``-ahead
    forecast is produced.  The forecast summary, confidence intervals, RMSE
    and MAE against the held-out values are printed.

    Parameters
    ----------
    df : pd.DataFrame
        Source data; each entry of ``cols`` must be a column of it.
    cols : list
        Column names to forecast.
    with_graph : bool
        When true, plot the forecast against the held-out observations.
    """
    lag = 0
    order = 1
    moving_avg_model = 0
    steps = 50

    for col in cols:
        series = df[col]
        # Keep the evaluation window tied to ``steps`` so the scored rows
        # are always exactly the rows excluded from fitting.
        observed = series.iloc[-steps:]

        model = ARIMA(series.iloc[:-steps],
                      order=(lag, order, moving_avg_model))
        model_fit = model.fit()

        model_for = model_fit.get_forecast(steps=steps, alpha=0.05)
        print('\t==== Summary of forecast ARIMA(%d, %d, %d) ====\n' %
              (lag, order, moving_avg_model))
        print(model_for.summary_frame(), model_for.conf_int(), sep='\n')
        print('RMSE: %f\nMAE: %f' %
              (rmse(observed, model_for.predicted_mean),
               meanabs(observed, model_for.predicted_mean)))
        print()

        if with_graph:
            plt.figure(figsize=(12, 5))
            plt.xlabel(col)
            plt.title('Forecast for %s using ARIMA(%d, %d, %d)' %
                      (col, lag, order, moving_avg_model))

            # The forecast is the estimate; the held-out data is the actual
            # series (the legend labels used to be the other way round).
            ax1 = model_for.predicted_mean.plot(color='blue',
                                                grid=True,
                                                label='Estimated')
            ax2 = observed.plot(color='red',
                                grid=True,
                                secondary_y=True,
                                label='Actual')

            # Both series live on different axes; merge their legends.
            h1, l1 = ax1.get_legend_handles_labels()
            h2, l2 = ax2.get_legend_handles_labels()

            plt.legend(h1 + h2, l1 + l2, loc=2)
            plt.show()
Пример #14
0
    def TrainModel(self, DATA, args=None):
        """Train a bag of ``RandomForest`` models and keep the best ones.

        ``self.NumberOfModels`` models are trained, each on a bootstrap
        resample (rows drawn with replacement) of ``DATA`` and scored by mean
        absolute error on the rows that were not drawn.  The
        ``self.NumberOfOutModels`` models with the lowest error are stored in
        ``self.Models`` and their errors in ``self.Accuracy``.

        :param DATA: DataFrame whose last column is the target
        :param args: optional options mapping passed to ``UpdateOptions``
        :return: mean error of the retained models

        The global NumPy random state is re-seeded with 1 on every call.
        """
        if args is None:
            # A fresh dict per call: a ``{}`` default would be shared between
            # calls and could leak options from one call into the next.
            args = {}
        np.random.seed(1)
        self, options = UpdateOptions(self, args)

        self.Models = {}
        self.Accuracy = []
        DATA_X = DATA.iloc[:, :-1]
        DATA_Y = DATA.iloc[:, -1]
        models = {}
        acc = []
        for i in range(0, self.NumberOfModels):
            # Bootstrap sample of row positions; unsampled rows form the
            # out-of-bag test set for this model.
            newData = np.random.randint(DATA.shape[0], size=DATA.shape[0])
            newTest = np.delete(np.arange(0, DATA.shape[0]),
                                pd.unique(newData))

            data = DATA.iloc[newData, :].reset_index(drop=True)

            models[i] = RandomForest()
            models[i].TrainModel(data)

            tst_X = DATA_X.iloc[newTest, :].reset_index(drop=True)
            tst_Y = DATA_Y.iloc[newTest].reset_index(drop=True)

            predictions = models[i].Predict(tst_X).astype(int)
            acc.append(eval_measures.meanabs(predictions, tst_Y))

        acc = np.asarray(acc)

        for i in range(0, self.NumberOfOutModels):
            index = acc.argmin()
            self.Accuracy.append(acc[index])
            # Overwrite the chosen entry with the current maximum so it is
            # not picked again while smaller errors remain.
            acc[index] = acc.max()
            self.Models[i] = models[index]

        self.Accuracy = np.asarray(self.Accuracy)

        return self.Accuracy.mean()
Пример #15
0
def MultipleRandFrorest(idf, K=50, nest=100, rnd=20):
    """Train ``K`` bootstrap random forests and return them ranked by error.

    Each forest is fitted on a bootstrap resample (rows drawn with
    replacement) of ``idf`` and scored by the mean absolute error of its
    integer-truncated predictions on the rows that were not drawn.  The
    mean of the errors is printed.

    Parameters
    ----------
    idf : pandas.DataFrame
        Data with a ``total_cases`` target column; it is not modified.
    K : int
        Number of forests to train.
    nest : int
        ``n_estimators`` for every forest.
    rnd : int
        ``random_state`` for every forest.

    Returns
    -------
    dict
        ``{rank: model}`` with rank 0 holding the lowest-error forest.
    """
    df = idf.copy()
    train_Y = df.total_cases
    df.drop('total_cases', axis=1, inplace=True)

    models = {}
    acc = []

    n_rows = df.shape[0]
    for i in range(0, K):
        # Bootstrap row positions; rows never drawn are the test set.
        newData = np.random.randint(n_rows, size=n_rows)
        newTest = np.delete(np.arange(0, n_rows), np.unique(newData))

        data_X = df.iloc[newData, :].reset_index(drop=True)
        data_Y = train_Y.iloc[newData].reset_index(drop=True)

        models[i] = RandomForestRegressor(n_estimators=nest, random_state=rnd)
        models[i].fit(data_X, data_Y)

        tst_X = df.iloc[newTest, :].reset_index(drop=True)
        tst_Y = train_Y.iloc[newTest].reset_index(drop=True)

        predictions = models[i].predict(tst_X).astype(int)
        acc.append(eval_measures.meanabs(predictions, tst_Y))

    acc = np.asarray(acc)

    # Rank by error with a stable sort (ties keep training order).  This
    # replaces repeatedly taking argmin and overwriting the winner with the
    # constant 100, which mis-ranked any model whose error was >= 100.
    ranking = np.argsort(acc, kind='stable')
    out_models = {rank: models[indx] for rank, indx in enumerate(ranking)}
    out_ACC = acc[ranking]

    print(str(out_ACC.mean()))

    return out_models
Пример #16
0
def main():
    """Train, cross-validate, plot and export dengue case predictions.

    Steps, as performed below:
    1. Read the training features/labels from ``data_path`` and build the
       per-city frames via ``preprocess_data``.
    2. Randomly split each city into a sub-train / sub-test part and obtain
       a model for each city from ``get_best_model``.
    3. Score the models on 12 contiguous folds after shifting predictions.
    4. Plot fitted vs. actual cases.
    5. Predict the test set, shift the predictions and write the
       submission CSV.

    NOTE(review): this function uses ``DataFrame.ix``/``Series.ix``, which
    were removed in pandas 1.0 -- confirm the pandas version, and whether
    label or positional access is intended, before porting.
    """

    ### parsing and Data pre-processing
    # load the provided data
    train_features_path = os.path.join(data_path, 'dengue_features_train.csv')
    train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv')
    train_features = pd.read_csv(train_features_path, index_col=[0, 1, 2])
    train_labels = pd.read_csv(train_labels_path, index_col=[0, 1, 2])
    # Separate data for San Juan
    sj_train_features = train_features.loc['sj']
    sj_train_labels = train_labels.loc['sj']
    # Separate data for Iquitos
    iq_train_features = train_features.loc['iq']
    iq_train_labels = train_labels.loc['iq']

    # Remove 'week_start_date' string.
    sj_train_features.drop('week_start_date', axis=1, inplace=True)
    iq_train_features.drop('week_start_date', axis=1, inplace=True)

    # Fill missing values by carrying the previous row forward.
    # Note: the *_train_features / *_train_labels frames built above are not
    # referenced again in this function; modelling uses preprocess_data().
    sj_train_features.fillna(method='ffill', inplace=True)
    iq_train_features.fillna(method='ffill', inplace=True)

    ### pre-processing data
    sj_train, iq_train = preprocess_data(train_features_path,
                                         labels_path=train_labels_path)
    #print(sj_train.describe())
    #print(iq_train.describe())
    # The string literal below is a disabled head/tail split kept for
    # reference; it has no effect at runtime.
    '''
	sj_train_subtrain = sj_train.head(800)
	sj_train_subtest = sj_train.tail(sj_train.shape[0] - 800)
	iq_train_subtrain = iq_train.head(400)
	iq_train_subtest = iq_train.tail(iq_train.shape[0] - 400)
	'''
    # Random split: 800 rows for fitting, the rest for validation.  The
    # sampled range stops at shape[0] - 2, so the last row is never chosen
    # for fitting and always lands in the validation part.
    choose = rand.sample(range(0, sj_train.shape[0] - 1), 800)
    val = [i for i in range(sj_train.shape[0]) if i not in choose]
    sj_train_subtrain = sj_train.ix[choose]
    sj_train_subtest = sj_train.ix[val]

    # Same split for Iquitos with 400 fitting rows.
    choose = rand.sample(range(0, iq_train.shape[0] - 1), 400)
    val = [i for i in range(iq_train.shape[0]) if i not in choose]
    iq_train_subtrain = iq_train.ix[choose]
    iq_train_subtest = iq_train.ix[val]

    # The third argument tags the city ('sj' / 'iq') for get_best_model.
    sj_best_model = get_best_model(sj_train_subtrain, sj_train_subtest, 'sj')
    iq_best_model = get_best_model(iq_train_subtrain, iq_train_subtest, 'iq')

    # Use K-fold to create cross validation data.  The models are not refit
    # per fold: only the test part of each split is scored (X_train unused).
    kf = KFold(n_splits=12)
    sj_score = []
    for train_index, test_index in kf.split(sj_train):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = sj_train.ix[train_index], sj_train.ix[test_index]
        predictions = sj_best_model.predict(X_test).astype(int)
        # Shift predictions 4 steps later, walking backwards so each source
        # value is read before it is overwritten; the first 4 stay as-is.
        for i in range(predictions.shape[0] - 1, 3, -1):
            predictions.ix[i] = predictions.ix[i - 4]
        sj_score.append(eval_measures.meanabs(predictions, X_test.total_cases))

    print("Mean of {} cross validation of sj_score is {} (+/- {})".format(
        kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score)))
    print(sj_score)
    iq_score = []
    for train_index, test_index in kf.split(iq_train):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = iq_train.ix[train_index], iq_train.ix[test_index]
        predictions = iq_best_model.predict(X_test).astype(int)
        #print(predictions)
        # Iquitos predictions are shifted by a single step.
        for i in range(predictions.shape[0] - 1, 0, -1):
            predictions.ix[i] = predictions.ix[i - 1]
        #print(predictions)
        iq_score.append(eval_measures.meanabs(predictions, X_test.total_cases))
    print(iq_score)
    print("Mean of {} cross validation of iq_score is {} (+/- {})".format(
        kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score)))

    figs, axes = plt.subplots(nrows=2, ncols=1)

    # plot sj
    sj_train['fitted'] = sj_best_model.fittedvalues
    sj_train.fitted.plot(ax=axes[0], label="Predictions")
    sj_train.total_cases.plot(ax=axes[0], label="Actual")

    # plot iq
    iq_train['fitted'] = iq_best_model.fittedvalues
    iq_train.fitted.plot(ax=axes[1], label="Predictions")
    iq_train.total_cases.plot(ax=axes[1], label="Actual")

    plt.suptitle("Dengue Predicted Cases vs. Actual Cases")
    plt.legend()
    plt.show()

    # Predict the held-out test features and apply the same per-city shifts
    # that were used during cross validation.
    test_features_path = os.path.join(data_path, 'dengue_features_test.csv')
    sj_test, iq_test = preprocess_data(test_features_path)
    sj_predictions = sj_best_model.predict(sj_test).astype(int)

    for i in range(sj_predictions.shape[0] - 1, 3, -1):
        sj_predictions.ix[i] = sj_predictions.ix[i - 4]

    iq_predictions = iq_best_model.predict(iq_test).astype(int)
    for i in range(iq_predictions.shape[0] - 1, 0, -1):
        iq_predictions.ix[i] = iq_predictions.ix[i - 1]

    # Fill the submission template (San Juan rows first, then Iquitos) and
    # write it out.
    sample_path = os.path.join(data_path, 'submission_format.csv')
    submission = pd.read_csv(sample_path, index_col=[0, 1, 2])
    submission.total_cases = np.concatenate([sj_predictions, iq_predictions])
    submission.to_csv("./data/benchmark_shift.csv")
Пример #17
0
    def CalculatePerformance(self, result, target):
        """Return the mean absolute error of ``result`` against ``target``.

        Yields ``None`` while ``self.NegativeBinomial`` is still an empty
        dict.
        """
        has_model = not (self.NegativeBinomial == {})
        if has_model:
            return eval_measures.meanabs(result, target)
        else:
            return None
            #     print('{} {}  {}'.format(
            #         valid_data[column].index.values[IDX], valid_data[column].iloc[IDX], forecast_data[column].iloc[IDX]))

            ax1.plot(valid_data[column].index,
                     forecast_data[column],
                     linewidth=0.5)

            # log the MSE and MAE
            logging.info(
                'Mean square error for {} forecast for data order {} for column {} : {}'
                .format(forecast_type[forecast], order_type[order], column,
                        mse(valid_data[column], forecast_data[column])))
            logging.info(
                'Absolute square error for {} forecast for data order {} for column {} : {}'
                .format(forecast_type[forecast], order_type[order], column,
                        meanabs(valid_data[column], forecast_data[column])))
            forecast_cnt = len(valid_data.index.values)
            correct_forecast_direction = 0
            for data_idx in range(forecast_cnt):
                if (valid_data[column][data_idx] >
                        0) == (forecast_data[column][data_idx] > 0):
                    correct_forecast_direction = correct_forecast_direction + 1
            logging.info(
                'Percent of valid data that is incerease from previous day {}'.
                format(valid_data[column].gt(0).sum() / forecast_cnt))
            logging.info(
                'Percent of forecast that matches valid direction {}'.format(
                    correct_forecast_direction / forecast_cnt))

            # write out the results to csv files for post processing
            output_file = open(
Пример #19
0
def evaluation(validation_data: pd.Series, forecast_ets: pd.Series, forecast_arima: pd.Series, forecast_xgboost: pd.Series, forecast_snaive: pd.Series, forecast_drift: pd.Series, forecast_average: pd.Series, verbose: bool):
    """Score six forecasts against ``validation_data`` and tabulate them.

    For each forecast the RMSE, MAE, seasonal MASE (season length 5) and
    sMAPE are computed.  The table header is always printed; one row per
    model is printed as well when ``verbose`` is true.

    Returns a DataFrame indexed by model name with the columns
    ``RMSE``, ``MAE``, ``MASE`` and ``sMAPE``.
    """
    print("\tPrediction model\t\t|\tRMSE\t\t\t\t|\tMAE\t\t\t\t\t|\tSeasonal MASE\t\t|\tSMAPE")
    print("\t-------------------------------------------------------------------------------------------------")

    def _metrics(forecast):
        # Same four measures, in column order, for one forecast.
        return [
            rmse([float(x) for x in validation_data.values], forecast),
            meanabs([float(x) for x in validation_data.values], forecast),
            eval.mase(validation_data.values, forecast, 5),
            eval.smape(validation_data.values, forecast),
        ]

    # (index label, printed title, padding after the title, forecast)
    contenders = [
        ('seasonal_naive', "\tSeasonal Naive", "\t\t\t|\t", forecast_snaive),
        ('drift', "\tDrift", "\t\t\t\t\t|\t", forecast_drift),
        ('average', "\tAverage", "\t\t\t\t\t|\t", forecast_average),
        ('xgboost', "\tXGBoost Regression", "\t\t|\t", forecast_xgboost),
        ('ets', "\tExponentialSmoothing", "\t|\t", forecast_ets),
        ('arima', "\tARIMA", "\t\t\t\t\t|\t", forecast_arima),
    ]

    rows = []
    names = []
    for name, title, padding, forecast in contenders:
        scores = _metrics(forecast)
        if verbose:
            print(title, end=padding)
            for value in scores[:-1]:
                print(value, end="\t|\t")
            print(scores[-1])
        rows.append(scores)
        names.append(name)

    eval_results = DataFrame(rows, index=names,
                             columns=['RMSE', 'MAE', 'MASE', 'sMAPE'])
    return eval_results
Пример #20
0
    def CalculatePerformance(self, result, target):
        """Mean absolute error of ``result`` vs. ``target``.

        Returns ``None`` when ``self.Models`` is an empty dict.
        """
        nothing_trained = self.Models == {}
        return None if nothing_trained else eval_measures.meanabs(result, target)
Пример #21
0
# Notebook-style walkthrough: Granger causality tests, forecast error
# measures on random data, and seasonal plots.  ``df3`` and ``df1`` are
# defined elsewhere (not in this snippet).
from statsmodels.tsa.stattools import grangercausalitytests

# Granger causality tests on the column pairs (a, d) and (b, d), with lags
# up to 5.
grangercausalitytests(df3[['a', 'd']], maxlag=5)
grangercausalitytests(df3[['b', 'd']], maxlag=5)

# Fixed seed so the random demo frame below is reproducible.
np.random.seed(42)

# 50 rows of random integers in [20, 30) standing in for observed values
# ('test') and model output ('predictions').
df = pd.DataFrame(np.random.randint(20, 30, (50, 2)),
                  columns=['test', 'predictions'])

df.head()

df.plot(figsize=(12, 8))

from statsmodels.tools.eval_measures import mse, rmse, meanabs

# Mean squared error, root mean squared error and mean absolute error
# between the two columns.
mse(df['test'], df['predictions'])
rmse(df['test'], df['predictions'])
meanabs(df['test'], df['predictions'])

df1.head()
df1.index

from statsmodels.graphics.tsaplots import month_plot, quarter_plot

# Seasonal plot of 'Pass_K' by month.
month_plot(df1['Pass_K'])

# Aggregate to quarterly sums before drawing the quarterly seasonal plot.
# NOTE(review): recent pandas versions deprecate the 'Q' alias in favour of
# 'QE' -- confirm against the installed version.
df1q = df1['Pass_K'].resample(rule='Q').sum()
quarter_plot(df1q)
Пример #22
0
def get_best_model_sj(train, test):
    """Tune the negative-binomial dispersion for SJ and refit on all rows.

    For each candidate ``alpha`` a negative-binomial GLM is fitted on
    ``train`` and scored on ``test`` by the mean absolute error of its
    integer-truncated predictions. The alpha with the lowest error is then
    used to refit the model on ``train`` and ``test`` concatenated.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``total_cases`` and every feature named in the
        formula below.

    Returns
    -------
    The fitted GLM results object for the refit on the combined data.

    Raises
    ------
    ValueError
        If no candidate alpha yields a comparable score (for example every
        score is NaN), so that no model could be selected.
    """
    # Step 1: specify the form of the model
    # CHANGE HERE ---- SJ FEATURES
    sj_features = (
        "reanalysis_specific_humidity_g_per_kg",
        "reanalysis_dew_point_temp_k",
        "station_avg_temp_c",
        "station_max_temp_c",
        "reanalysis_air_temp_k",
        "reanalysis_relative_humidity_percent",
        "reanalysis_relative_humidity_percent_2",
        "reanalysis_relative_humidity_percent_3",
        "reanalysis_precip_amt_kg_per_m2_2",
        "reanalysis_precip_amt_kg_per_m2_3",
        "reanalysis_specific_humidity_g_per_kg_2",
        "reanalysis_specific_humidity_g_per_kg_3",
        "reanalysis_dew_point_temp_k_2",
        "reanalysis_dew_point_temp_k_3",
        "reanalysis_dew_point_temp_k_4",
        "reanalysis_air_temp_k_2",
        "reanalysis_air_temp_k_4",
        "reanalysis_air_temp_k_5",
        "reanalysis_air_temp_k_6",
        "reanalysis_air_temp_k_7",
        "reanalysis_air_temp_k_8",
        "station_max_temp_c_3",
        "station_max_temp_c_4",
        "station_max_temp_c_5",
        "station_max_temp_c_6",
        "station_max_temp_c_2",
        "reanalysis_sat_precip_amt_mm_2",
        "precipitation_amt_mm_2",
        "precipitation_amt_mm_3",
    )
    model_formula = "total_cases ~ 1 + " + " + ".join(sj_features)

    # Candidate dispersion values: 1e-8, 1e-7, ..., 1e-4.
    grid = 10**np.arange(-8, -3, dtype=np.float64)

    # Start from "nothing chosen" / +inf so the first valid score always
    # wins, however large the errors are (no magic upper bound).
    best_alpha = None
    best_score = np.inf

    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(formula=model_formula,
                        data=train,
                        family=sm.families.NegativeBinomial(alpha=alpha))

        results = model.fit()
        # Case counts are integers, so predictions are truncated before scoring.
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)

        if score < best_score:
            best_alpha = alpha
            best_score = score

    if best_alpha is None:
        raise ValueError(
            "no alpha in the grid produced a comparable validation score")

    print('best alpha = ', best_alpha)
    print('best score = ', best_score)

    # Step 3: refit on entire dataset
    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula,
                    data=full_dataset,
                    family=sm.families.NegativeBinomial(alpha=best_alpha))

    return model.fit()
Пример #23
0
        '2': np.round(sdarbict1, 4)
    },
]
# Turn the model-selection rows in msdata into a table and print it under a
# heading, padded with blank lines.
mstable = pd.DataFrame(msdata)
print('')
print('== ARIMA Model Selection ==')
print('')
print(mstable)
print('')

##########################################

# 3.8. ARIMA Models Forecasting Accuracy

# 3.8.1. Multi-Steps Forecast
# For each forecast series (rwdf1, darf1, srwdf1, sdarf1) compute the mean
# absolute error (fa.meanabs) and root mean squared error (fa.rmse) against
# spyf.
# NOTE(review): `fa` is presumably an alias for
# statsmodels.tools.eval_measures, and the four prefixes presumably name the
# candidate models fitted earlier in the script - confirm against the
# imports and model definitions, which are outside this fragment.
rwdmae1 = fa.meanabs(rwdf1, spyf)
rwdrmse1 = fa.rmse(rwdf1, spyf)
darmae1 = fa.meanabs(darf1, spyf)
darrmse1 = fa.rmse(darf1, spyf)
srwdmae1 = fa.meanabs(srwdf1, spyf)
srwdrmse1 = fa.rmse(srwdf1, spyf)
sdarmae1 = fa.meanabs(sdarf1, spyf)
sdarrmse1 = fa.rmse(sdarf1, spyf)

fadata1 = [
    {
        '0': '',
        '1': 'MAE',
        '2': 'RMSE'
    },
    {
Пример #24
0
# One column per base learner. The order is significant: the final linear
# regression is fitted on the training frame and applied to the test frame,
# so both frames must receive these columns in the same order.
_STACK_COLS = ['negbi', 'gb', 'xgb', 'abr', 'etr', 'br']


def _add_stack_columns(frame):
    """Return *frame* extended with one zero-filled column per base learner."""
    # Columns are added one at a time so their order does not depend on how
    # the installed pandas orders multiple ``assign`` keyword arguments.
    # Floats are used because three of the learners write float predictions.
    for col in _STACK_COLS:
        frame = frame.assign(**{col: 0.0})
    return frame


def _lagged(values, lag):
    """Copy *values*, replacing element ``i`` by element ``i - lag`` for ``i >= lag``."""
    values = np.asarray(values)
    shifted = values.copy()
    if len(values) > lag:
        shifted[lag:] = values[:-lag]
    return shifted


def _stack_train(train, city, lag, kf, xgb_params, num_boost_rounds):
    """Fill the stacking columns of *train* with out-of-fold predictions.

    Returns ``(train, models)`` where ``models`` maps each stacking column
    name to the learner fitted on the LAST fold; those are the models later
    used to predict the test set.
    """
    train = _add_stack_columns(train)
    n_splits = kf.get_n_splits(train)
    drop_cols = ['total_cases'] + _STACK_COLS
    models = {}

    for loop, (train_index, val_index) in enumerate(kf.split(train), start=1):
        # KFold yields positions, hence positional indexing.
        X_train, X_val = train.iloc[train_index], train.iloc[val_index]
        features_train = X_train.drop(drop_cols, axis=1)
        features_val = X_val.drop(drop_cols, axis=1)
        target_train = X_train['total_cases']
        fold = {}

        ###(1)neg_binomial method; predictions are shifted forward by `lag`
        models['negbi'] = get_best_model(X_train, X_val, city)
        fold['negbi'] = _lagged(
            models['negbi'].predict(X_val).astype(int), lag)

        ###(2)gradient boosting method (helper receives total_cases as well)
        models['gb'] = gradient_boosting(
            X_train.drop(_STACK_COLS, axis=1),
            X_val.drop(_STACK_COLS, axis=1))
        fold['gb'] = models['gb'].predict(features_val).astype(int)

        ###(3)xgboost method
        dtrain = xgb.DMatrix(features_train, target_train)
        dval = xgb.DMatrix(features_val)
        models['xgb'] = xgb.train(dict(xgb_params, silent=0),
                                  dtrain,
                                  num_boost_round=num_boost_rounds)
        fold['xgb'] = models['xgb'].predict(dval).astype(int)

        ###(4)Adaboost, (5)Extra tree and (6)Bagging regressors
        models['abr'] = ABR(n_estimators=800,
                            learning_rate=0.08,
                            loss='linear',
                            random_state=0)
        models['etr'] = ETR(n_estimators=800,
                            max_depth=4,
                            random_state=0,
                            verbose=1)
        models['br'] = BR(n_estimators=800,
                          oob_score=False,
                          n_jobs=5,
                          random_state=0,
                          verbose=1)
        for name in ('abr', 'etr', 'br'):
            models[name].fit(features_train, target_train)
            fold[name] = models[name].predict(features_val)

        ###Store the fold's predictions in the matching stacking columns
        print(
            "Adding the result of the predictions to {} training data({}/{})".
            format(city, loop, n_splits))
        for name in _STACK_COLS:
            train.iloc[val_index,
                       train.columns.get_loc(name)] = np.asarray(fold[name])

    return train, models


def _stack_test(test, city, lag, models):
    """Add every base learner's prediction for *test* as stacking columns."""
    test = _add_stack_columns(test)
    features = test.drop(_STACK_COLS, axis=1)

    preds = {
        'negbi': _lagged(models['negbi'].predict(test).astype(int), lag),
        'gb': models['gb'].predict(features).astype(int),
        'xgb': models['xgb'].predict(xgb.DMatrix(features)).astype(int),
    }
    # Each regressor predicts with its OWN model on THIS city's features.
    for name in ('abr', 'etr', 'br'):
        preds[name] = models[name].predict(features).astype(int)

    print("Adding predictions as features to {} testing data...".format(city))
    for name in _STACK_COLS:
        test[name] = np.asarray(preds[name])
    return test


def _report_fold_mae(model, frame, kf, label):
    """Print mean and std of the per-fold MAE of *model* on *frame*."""
    # NOTE: the model was fitted on the whole frame, so this measures the
    # in-sample error per fold, not a true cross-validation error.
    scores = []
    for _, val_index in kf.split(frame):
        X_val = frame.iloc[val_index]
        predicted = np.array(
            model.predict(X_val.drop('total_cases', axis=1))).astype(int)
        scores.append(eval_measures.meanabs(predicted, X_val.total_cases))
    print("Mean of {} cross validation of {} is {} (+/- {})".format(
        kf.get_n_splits(frame), label, np.mean(scores), np.std(scores)))


def main():
    """Train a six-learner stacked model per city and write a submission.

    Steps, for each of the cities ``sj`` and ``iq``:

    1. Produce out-of-fold predictions from six base learners (negative
       binomial GLM, gradient boosting, xgboost, AdaBoost, extra trees,
       bagging) and store them as extra feature columns.
    2. Add the same columns to the test set, using the base learners fitted
       on the last fold.
    3. Fit a linear regression on features plus stacked columns, report
       its per-fold MAE, and predict the test set.

    The concatenated predictions are written to
    ``./data/stacking_6_less_feature.csv``.

    NOTE(review): positional indexing assumes the frames returned by
    ``preprocess_data`` do not rely on integer labels that differ from row
    positions - confirm against ``preprocess_data``.
    """
    ### parsing and Data pre-processing
    # load the provided data
    train_features_path = os.path.join(data_path, 'dengue_features_train.csv')
    train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv')

    ### pre-processing data
    sj_train, iq_train = preprocess_data(train_features_path,
                                         labels_path=train_labels_path)

    ###Define the xgb parameters
    xgb_params = {
        'eta': 0.05,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }
    num_boost_rounds = 1000
    ##Use K-fold to create cross validation data
    kf = KFold(n_splits=6)

    ##Stacking: sj predictions are lagged by 4 rows, iq predictions by 1 row
    sj_train, sj_models = _stack_train(sj_train, 'sj', 4, kf, xgb_params,
                                       num_boost_rounds)
    iq_train, iq_models = _stack_train(iq_train, 'iq', 1, kf, xgb_params,
                                       num_boost_rounds)

    ##Accessing testing data
    test_features_path = os.path.join(data_path, 'dengue_features_test.csv')
    sj_test, iq_test = preprocess_data(test_features_path)
    sj_test = _stack_test(sj_test, 'sj', 4, sj_models)
    iq_test = _stack_test(iq_test, 'iq', 1, iq_models)

    ##use new information to run a linear regression
    print("Building linear regression model...")
    # X = [features, stacked predictions], y = total_cases
    sj_lr = LR()
    sj_lr.fit(sj_train.drop('total_cases', axis=1), sj_train['total_cases'])
    iq_lr = LR()
    iq_lr.fit(iq_train.drop('total_cases', axis=1), iq_train['total_cases'])

    #Calculate the k-fold validation error
    _report_fold_mae(sj_lr, sj_train, kf, 'sj_score')
    _report_fold_mae(iq_lr, iq_train, kf, 'iq_score')

    ##Use the model sj_lr and iq_lr trained before to predict the testing data
    print("Predicting testing data...")
    sj_predictions = np.array(sj_lr.predict(sj_test)).astype(int)
    iq_predictions = np.array(iq_lr.predict(iq_test)).astype(int)

    print("Creating submit file...")
    ##Use submission_format as template to write the answer
    sample_path = os.path.join(data_path, 'submission_format.csv')
    submission = pd.read_csv(sample_path, index_col=[0, 1, 2])
    submission.total_cases = np.concatenate([sj_predictions, iq_predictions])
    submission.to_csv("./data/stacking_6_less_feature.csv")
    '''
Пример #25
0
# Clean the S% column: "--" (no value) becomes 0 and a defenceman's ("D")
# S% is doubled. Exactly one value is appended per row so that shootingp
# stays the same length as, and aligned with, regression_data.
# NOTE(review): i and shootingp are presumably initialised (0 and []) earlier
# in the script - confirm.
while i < regression_data['TOI/GP'].size:
    if regression_data['S%'][i] == "--":
        shootingp.append(0)
    else:
        shotp = float(regression_data['S%'][i])
        if regression_data['Pos'][i] == "D":
            shotp = shotp * 2
        shootingp.append(shotp)
    i = i + 1

shootingp_array = np.array(shootingp)
regression_data['S%'] = pd.Series(shootingp_array)

# Predictors for the salary model (S% is cleaned above but not used here).
variables = regression_data[['G', 'A', 'TOI/GP', 'PPP']].values

salary = regression_data['AAV'].values

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
var_train, var_test, sal_train, sal_test = train_test_split(variables,
                                                            salary,
                                                            test_size=0.2,
                                                            random_state=5)

# OLS without an added constant column, i.e. a regression through the origin.
lin_model = sm.OLS(sal_train, var_train)
result = lin_model.fit()
sal_pred = result.predict(var_test)

print(result.summary())

print("Mean Absolute Error: " + str(meanabs(sal_test, sal_pred, axis=0)))
Пример #26
0
def main():
    """Fit one extra-trees model per city on a random row sample and submit.

    For each city an ``ETR`` regressor is fitted on a random sample of the
    training rows (800 for sj, 400 for iq). Its mean absolute error is then
    printed per K-fold slice of the full training frame, first on raw and
    then on integer-truncated predictions. Finally the rounded test
    predictions are written to ``./data/ext_final_new.csv``.

    NOTE: the per-fold errors are measured with a model that may have seen
    the fold's rows during fitting, so they are not true hold-out errors.
    NOTE(review): positional indexing assumes the frames returned by
    ``preprocess_data`` do not rely on integer labels that differ from row
    positions - confirm against ``preprocess_data``.
    """
    ### parsing and Data pre-processing
    # load the provided data
    train_features_path = os.path.join(data_path, 'dengue_features_train.csv')
    train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv')

    ### pre-processing data
    sj_train, iq_train = preprocess_data(train_features_path,
                                         labels_path=train_labels_path)

    kf = KFold(n_splits=12)

    def fit_on_sample(frame, sample_size):
        """Fit an extra-trees regressor on ``sample_size`` random rows."""
        # NOTE(review): the range stops at shape[0] - 1, so the last row can
        # never be drawn - looks like an off-by-one; kept to preserve the
        # sampling sequence, confirm intent.
        rows = rand.sample(range(0, frame.shape[0] - 1), sample_size)
        subset = frame.iloc[rows]
        model = ETR(n_estimators=2000, max_depth=3, criterion='mae',
                    verbose=1)
        model.fit(subset.drop('total_cases', axis=1), subset['total_cases'])
        return model

    def fold_errors(model, frame, truncate):
        """Return the MAE of *model* on every K-fold validation slice."""
        errors = []
        for _, val_index in kf.split(frame):
            X_val = frame.iloc[val_index]
            predicted = model.predict(X_val.drop('total_cases', axis=1))
            if truncate:
                predicted = np.array(predicted).astype(int)
            errors.append(eval_measures.meanabs(predicted, X_val.total_cases))
        return errors

    sj_etr = fit_on_sample(sj_train, 800)
    sj_err_list = fold_errors(sj_etr, sj_train, truncate=False)
    print(sj_err_list)
    sj_best_model = sj_etr

    iq_etr = fit_on_sample(iq_train, 400)
    iq_err_list = fold_errors(iq_etr, iq_train, truncate=False)
    print(iq_err_list)
    iq_best_model = iq_etr

    ##Accessing testing data
    test_features_path = os.path.join(data_path, 'dengue_features_test.csv')
    sj_test, iq_test = preprocess_data(test_features_path)

    #Calculate the k-fold validation error
    sj_score = fold_errors(sj_best_model, sj_train, truncate=True)
    print("Mean of {} cross validation of sj_score is {} (+/- {})".format(
        kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score)))

    iq_score = fold_errors(iq_best_model, iq_train, truncate=True)
    print("Mean of {} cross validation of iq_score is {} (+/- {})".format(
        kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score)))

    ##Use the fitted per-city models to predict the testing data
    print("Predicting testing data...")
    sj_predictions = sj_best_model.predict(sj_test)
    iq_predictions = iq_best_model.predict(iq_test)
    sj_predictions = np.round(sj_predictions).astype(int)
    iq_predictions = np.round(iq_predictions).astype(int)

    print("Creating submit file...")
    ##Use submission_format as template to write the answer
    sample_path = os.path.join(data_path, 'submission_format.csv')
    submission = pd.read_csv(sample_path, index_col=[0, 1, 2])
    # NOTE(review): three hard-coded values are prepended per city -
    # presumably for leading test rows that preprocess_data drops; confirm.
    submission.total_cases = np.concatenate([[28], [25], [34], sj_predictions,
                                             [8], [6], [10], iq_predictions])
    submission.to_csv("./data/ext_final_new.csv")
    '''
Пример #27
0
def main():
    """Pick the best per-fold extra-trees model for each city and submit.

    For each city a fresh ``ETR`` regressor is fitted on every K-fold
    training split and scored by mean absolute error on the matching
    validation split; the model with the lowest error is kept. Its per-fold
    MAE on integer-truncated predictions is printed, and the truncated test
    predictions are written to ``./data/ext_new.csv``.

    NOTE(review): positional indexing assumes the frames returned by
    ``preprocess_data`` do not rely on integer labels that differ from row
    positions - confirm against ``preprocess_data``.
    """
    ### parsing and Data pre-processing
    # load the provided data
    train_features_path = os.path.join(data_path, 'dengue_features_train.csv')
    train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv')

    ### pre-processing data
    sj_train, iq_train = preprocess_data(train_features_path,
                                         labels_path=train_labels_path)

    kf = KFold(n_splits=6)

    def pick_best_fold_model(frame, make_model):
        """Fit one model per fold; return the one with the lowest fold MAE."""
        models, errors = [], []
        for train_index, val_index in kf.split(frame):
            # KFold yields positions, hence positional indexing.
            X_train, X_val = frame.iloc[train_index], frame.iloc[val_index]
            model = make_model()
            model.fit(X_train.drop('total_cases', axis=1),
                      X_train['total_cases'])
            predictions = model.predict(X_val.drop('total_cases', axis=1))
            errors.append(
                eval_measures.meanabs(predictions, X_val.total_cases))
            models.append(model)
        print(errors)
        # Index of the smallest error; the first index wins on ties.
        best = min(range(len(errors)), key=errors.__getitem__)
        print(best)
        return models[best]

    def fold_scores(model, frame):
        """Return the MAE of truncated predictions on every validation fold."""
        scores = []
        for _, val_index in kf.split(frame):
            X_val = frame.iloc[val_index]
            predicted = np.array(
                model.predict(X_val.drop('total_cases',
                                         axis=1))).astype(int)
            scores.append(eval_measures.meanabs(predicted, X_val.total_cases))
        return scores

    sj_best_model = pick_best_fold_model(
        sj_train,
        lambda: ETR(n_estimators=800, max_depth=4, random_state=0, verbose=1))
    iq_best_model = pick_best_fold_model(
        iq_train,
        lambda: ETR(n_estimators=400, max_depth=4, random_state=0))

    ##Accessing testing data
    test_features_path = os.path.join(data_path, 'dengue_features_test.csv')
    sj_test, iq_test = preprocess_data(test_features_path)

    #Calculate the k-fold validation error
    sj_score = fold_scores(sj_best_model, sj_train)
    print("Mean of {} cross validation of sj_score is {} (+/- {})".format(
        kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score)))

    iq_score = fold_scores(iq_best_model, iq_train)
    print("Mean of {} cross validation of iq_score is {} (+/- {})".format(
        kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score)))

    ##Use the selected per-city models to predict the testing data
    print("Predicting testing data...")
    sj_predictions = np.array(sj_best_model.predict(sj_test)).astype(int)
    iq_predictions = np.array(iq_best_model.predict(iq_test)).astype(int)

    print("Creating submit file...")
    ##Use submission_format as template to write the answer
    sample_path = os.path.join(data_path, 'submission_format.csv')
    submission = pd.read_csv(sample_path, index_col=[0, 1, 2])
    submission.total_cases = np.concatenate([sj_predictions, iq_predictions])
    submission.to_csv("./data/ext_new.csv")
    '''