def stg2_GBR(fobj, train_x, hold_x, sub_test, ntrees=100):
    """Fit the stage-2 gradient boosting model and write a submission CSV.

    Training rows whose last column exceeds the threshold ``e`` are dropped.
    The model is fitted on a fixed subset of columns, with column 27 used as
    the target.  Mean absolute error is reported on the training rows and on
    ``hold_x`` (whose last column is used as the target), then predictions
    for ``sub_test`` are written under ``param.CURRENT_FOLDER``.

    Args:
        fobj: writable file-like object that receives the log lines.
        train_x: 2-D array of training rows.
        hold_x: 2-D array of hold-out rows; last column is the target.
        sub_test: 2-D array of rows to predict for the submission file.
        ntrees: number of boosting stages (``n_estimators``).
    """
    e = 30   # rows with a last-column value above this are excluded
    r = 0.3  # learning rate

    # Feature columns; the training selection adds column 27 as the target.
    arr = [0, 1, 2, 21, 22, 23, 24, 25, 26]
    c_arr = arr + [27]

    hold_test_x = hold_x[:, arr]
    hold_test_y = hold_x[:, -1]
    sub_test = sub_test[:, arr]

    # Newline added so this entry no longer runs into the next log line.
    fobj.write('Trees: %r\n' % ntrees)

    print('Stage 2: Exp Threshold - %r' % e)
    fobj.write('Stage 2: Exp Threshold - %r\n' % e)
    c_train = train_x[(train_x[:, -1] <= e)]

    c_train = c_train[:, c_arr]
    c_train_y = c_train[:, -1]
    c_train_x = c_train[:, :-1]

    suffix = 'e' + str(e) + '_r' + str(r) + '_t_' + str(ntrees) + str(
        len(arr)) + '.csv'
    kaggle_file = 'Kaggle_GBR_Ankit_' + suffix
    df_kaggle_file = 'Kaggle_df_GBR_Ankit_' + suffix
    print('Stage 2: Rate - %r' % r)
    fobj.write('GBR exp: %r rate: %r\n' % (e, r))
    rf_model = gbr(n_estimators=ntrees,
                   loss='lad',
                   learning_rate=r,
                   max_depth=6)
    est = rf_model.fit(c_train_x, c_train_y)

    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    print('GBR Train Error: %r\n' % (error))
    fobj.write('GBR Train-2 Error: %r\n' % (error))

    hold_y_pred = est.predict(hold_test_x)
    error = mt.mean_absolute_error(hold_test_y, hold_y_pred)
    print('GBR 20 percent Hold Error: %r\n' % (error))
    fobj.write('GBR 20 percent Hold Error: %r\n' % (error))

    print('Test Size:%r' % len(sub_test))
    test_y_pred = est.predict(sub_test)
    # ``np.int`` was only an alias of the builtin and has been removed from
    # NumPy; the builtin ``int`` yields the same dtype.
    id_col = np.arange(1., len(test_y_pred) + 1, dtype=int)
    all_data = np.column_stack((id_col, test_y_pred))
    raw_path = os.path.join(param.CURRENT_FOLDER, kaggle_file)
    # NOTE(review): np.savetxt prefixes the header with '# ' by default, so
    # the first column is read back as '# Id' -- confirm whether
    # comments='' is wanted here.
    np.savetxt(raw_path, all_data, delimiter=',', header='Id,Expected')
    # Round-trip through pandas to produce the second CSV.
    df = pd.read_csv(raw_path)
    df.to_csv(os.path.join(param.CURRENT_FOLDER, df_kaggle_file),
              header=True,
              index=False)
Пример #2
0
def model_gradientboosting_regressor(X_train, X_test, y_train, y_test):
    """Train a default gradient boosting model, report its score and save it.

    The model is fitted on the training split, tagged with the module-level
    ``independentcols`` value, scored on the test split and dumped with
    joblib under ``model/``.  The file name embeds the module-level
    ``count`` value.
    """
    label = f'model_{count}_gradientboosting_regressor'

    regressor = gbr()
    regressor.fit(X_train, y_train)
    # Keep the column list with the estimator so it is saved alongside it.
    regressor.independentcols = independentcols

    test_score = regressor.score(X_test, y_test)
    print(f'{label} accuracy: {test_score}')

    target_path = f'model/{label}.joblib'
    joblib.dump(regressor, target_path)
Пример #3
0
    def regression(self, metric="root_mean_squared_error", folds=10, alphas=[], graph=False):
        """Cross-validate a fixed panel of regressors and display a comparison.

        Every model is evaluated with K-fold cross-validation on
        ``self.Xt_train`` / ``self.yt_train``.  The fold scores are negated,
        summarised (mean, standard deviation, coefficient of variation in
        percent), sorted by mean and shown with ``display``.  The model
        dictionary is also stored on ``self.models``.

        Args:
            metric: scorer passed to ``cross_val_score``.  The bare error
                names listed below are mapped to their ``neg_*`` scorer.
            folds: number of K-fold splits.
            alphas: alpha grid for the Ridge CV model; when left empty the
                estimator is built with its own defaults.
            graph: when true, also draw a box plot of the fold scores.

        Returns:
            None.
        """
        size = 1.3 * self.report_width // 10

        # scikit-learn registers these error metrics only as "neg_*" scorers
        # (which is what the sign flip below expects), so the bare names,
        # including this method's default, would be rejected as unknown.
        bare_error_metrics = ("mean_absolute_error",
                              "mean_squared_error",
                              "root_mean_squared_error",
                              "median_absolute_error")
        scoring = "neg_" + metric if metric in bare_error_metrics else metric

        # An empty alpha grid cannot be searched; fall back to the defaults.
        if isinstance(alphas, (list, tuple)) and not alphas:
            ridge_cv = rcvr()
        else:
            ridge_cv = rcvr(alphas=alphas)

        models = {
            "Linear regressor": lr(),
            "Lasso regressor": lassor(),
            "Lasso CV regressor": lassocvr(),
            "Ridge regressor": rr(alpha=0, normalize=True),
            "Ridge CV regressor": ridge_cv,
            "K nearest neighbors regressor K2u": knnr(n_neighbors=2, weights='uniform'),
            "K nearest neighbors regressor K2d": knnr(n_neighbors=2, weights='distance'),
            "K nearest neighbors regressor K5": knnr(n_neighbors=5),
            "K nearest neighbors regressor K10": knnr(n_neighbors=10),
            "SGD regressor": sgdr(max_iter=10000, warm_start=True),
            "Decision tree regressor": dtr(),
            "Decision tree regressor D3": dtr(max_depth=3),
            "Random forest regressor": rfr(),
            "Ada boost regressor": abr(),
            "Gradient boost regressor": gbr(),
            "Support vector regressor": svr(),
        }
        self.models = models

        print('\n')
        print(self.report_width * '*', '\n*')
        print('* REGRESSION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        kf = KFold(n_splits=folds)
        y_flat = self.yt_train.values.ravel()
        results = []
        names = []
        for model_name, model in models.items():
            # Flip the sign so that the reported error is positive.
            cv_scores = -1 * cross_val_score(model, self.Xt_train, y_flat, cv=kf, scoring=scoring)
            results.append(cv_scores)
            names.append(model_name)
        print(self.report_width * '*', '')

        report = pd.DataFrame({'Regressor': names, 'Score': results})
        report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
        report['Score (std)'] = report.Score.apply(lambda x: x.std())
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True)
        report.drop('Score', axis=1, inplace=True)
        display(report)
        print('\n')

        if graph:
            # ``results`` and ``names`` keep the evaluation order, so the
            # tick labels line up with the boxes.
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Regressor Comparison')
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0)
            plt.show()
        return None
def stg2_GBR(fobj, train_x, hold_x, sub_test, ntrees=100):
    """Train the stage-2 GBR, log its errors and write the submission files.

    Rows of ``train_x`` whose last column is above ``e`` are discarded; the
    remaining rows are reduced to the columns in ``c_arr`` (column 27 being
    the target).  Mean absolute error is logged for the training rows and
    for ``hold_x`` (target = its last column).  Predictions for ``sub_test``
    are saved as two CSV files under ``param.CURRENT_FOLDER``.

    Args:
        fobj: writable file-like object used as the log.
        train_x: 2-D array of training rows.
        hold_x: 2-D array of hold-out rows.
        sub_test: 2-D array of submission rows.
        ntrees: number of boosting stages passed as ``n_estimators``.
    """
    e = 30   # threshold applied to the last column of train_x
    r = 0.3  # learning rate

    arr = [0,1,2,21,22,23,24,25,26]
    c_arr = [0,1,2,21,22,23,24,25,26,27]

    hold_test_x = hold_x[:,arr]
    hold_test_y = hold_x[:,-1]
    sub_test = sub_test[:,arr]

    # Trailing newline added: the entry used to be glued to the next one.
    fobj.write('Trees: %r\n'%ntrees)

    print('Stage 2: Exp Threshold - %r' % e)
    fobj.write('Stage 2: Exp Threshold - %r\n'%e)
    c_train = train_x[(train_x[:,-1]<=e)]

    c_train = c_train[:,c_arr]
    c_train_y = c_train[:,-1]
    c_train_x = c_train[:,:-1]

    kaggle_file = 'Kaggle_GBR_Ankit_e'+str(e)+'_r'+str(r)+'_t_'+str(ntrees)+str(len(arr)) +'.csv'
    df_kaggle_file = 'Kaggle_df_GBR_Ankit_e'+str(e)+'_r'+str(r)+'_t_'+str(ntrees)+str(len(arr))+ '.csv'
    print('Stage 2: Rate - %r' % r)
    # Trailing newline added for the same reason as above.
    fobj.write('GBR exp: %r rate: %r\n'%(e,r))
    rf_model = gbr(n_estimators=ntrees, loss='lad', learning_rate=r, max_depth=6)
    est = rf_model.fit(c_train_x, c_train_y)

    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    print('GBR Train Error: %r\n' % (error))
    fobj.write('GBR Train-2 Error: %r\n' % (error))

    hold_y_pred = est.predict(hold_test_x)
    error = mt.mean_absolute_error(hold_test_y, hold_y_pred)
    print('GBR 20 percent Hold Error: %r\n' % (error))
    fobj.write('GBR 20 percent Hold Error: %r\n' % (error))

    print('Test Size:%r' % len(sub_test))
    test_y_pred = est.predict(sub_test)
    # The removed ``np.int`` alias is replaced by the builtin it pointed to.
    id_col = np.arange(1.,len(test_y_pred)+1, dtype=int)
    all_data = np.column_stack((id_col, test_y_pred))
    raw_csv = os.path.join(param.CURRENT_FOLDER, kaggle_file)
    # NOTE(review): the header is written with np.savetxt's default '# '
    # comment prefix, so pandas sees the column as '# Id' -- confirm.
    np.savetxt(raw_csv, all_data, delimiter=',', header='Id,Expected')
    df = pd.read_csv(raw_csv)
    df.to_csv(os.path.join(param.CURRENT_FOLDER, df_kaggle_file), header=True, index=False)
Пример #5
0
    def fit(previos_data: pd.DataFrame):
        """Fit a gradient boosting model on a table whose last column is the label.

        The frame is split 67/33 with a fixed seed.  The model is trained on
        the training part and a score on the held-out part is stored in
        ``self.f1_score``; ``self.is_fitted`` is set to True.

        Args:
            previos_data: frame with the features in every column but the
                last and the labels in the last column.

        Returns:
            ``self``.
        """
        # NOTE(review): the body uses ``self`` but the signature does not
        # declare it -- confirm whether this is meant to be a method (missing
        # ``self`` parameter) or a closure over an outer ``self``.
        labels = previos_data[previos_data.columns[-1]]
        tabel = previos_data[previos_data.columns[:-1]]

        X_train, X_test, y_train, y_test = train_test_split(tabel,
                                                            labels,
                                                            test_size=0.33,
                                                            random_state=42)

        self.predictor = gbr()

        # Train on the training labels; the original passed ``y_test``, whose
        # rows do not correspond to ``X_train``.
        self.predictor.fit(X_train, y_train)

        # NOTE(review): f1_score is a classification metric -- confirm that
        # ``gbr`` produces class labels here.
        self.f1_score = f1_score(y_test, self.predictor.predict(X_test))

        self.is_fitted = True

        return self
def runGBR(train_1_x, test_x, hold_out, sub_test, fobj):
    """Fit a gradient boosting regressor and predict three further data sets.

    Training rows whose last column exceeds 30 are dropped.  The last column
    of the remaining rows is the target and all other columns are features.
    The training mean absolute error is printed and written to ``fobj``.

    Args:
        train_1_x: 2-D array of training rows including the target column.
        test_x, hold_out, sub_test: feature matrices to predict.
        fobj: writable file-like object used as the log.

    Returns:
        Tuple ``(est, valid_y_pred, hold_y, sub_y)``: the fitted estimator
        and its predictions for ``test_x``, ``hold_out`` and ``sub_test``.
    """
    ntrees = 100
    rate = 0.3
    max_depth = 6
    exp = 30

    c_train_1_x = train_1_x[(train_1_x[:,-1]<=exp)]
    c_train_y = c_train_1_x[:,-1]
    c_train_x = c_train_1_x[:,:-1]

    rf_model = gbr(n_estimators=ntrees, loss='lad', learning_rate=rate, max_depth=max_depth)
    est = rf_model.fit(c_train_x, c_train_y)
    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    # Parenthesised so the line is valid on both Python 2 and Python 3.
    print('GBR Train Error: %r\n' % (error))
    fobj.write('GBR Train-1 Error: %r\n' % (error))
    valid_y_pred = est.predict(test_x)
    hold_y = est.predict(hold_out)
    sub_y = est.predict(sub_test)
    return est, valid_y_pred, hold_y, sub_y
    def __init__(self,
                 n_estimators=100,
                 max_depth=10,
                 learning_rate=0.01,
                 loss='deviance',
                 max_features=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 verbose=0,
                 fit_intercept=True,
                 normalize=False,
                 name='GBC'):
        """Check the hyper-parameters and create the wrapped boosting model.

        ``name``, ``fit_intercept`` and ``normalize`` are handed to the
        parent initializer.  The boosting settings are validated with
        assertions, stored on the instance (except ``max_features`` and
        ``verbose``, which are only forwarded) and passed to ``gbr`` to
        build ``self.model``.

        Raises:
            AssertionError: if ``loss`` is not 'deviance' or 'exponential',
                or a numeric setting is below its minimum.
        """
        # Register the method's name and paradigm with the base class.
        super().__init__(name, fit_intercept, normalize)

        allowed_losses = ('deviance', 'exponential')
        assert loss in allowed_losses
        assert n_estimators >= 1
        assert max_depth >= 2
        assert learning_rate > 0
        assert min_samples_split >= 1
        assert min_samples_leaf >= 1

        model_kwargs = dict(n_estimators=n_estimators,
                            max_depth=max_depth,
                            learning_rate=learning_rate,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf,
                            max_features=max_features,
                            verbose=verbose,
                            loss=loss)

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.loss = loss
        self.min_samples_leaf = min_samples_leaf
        self.output_directory = ''
        self.model = gbr(**model_kwargs)
    def __init__(self,
                 n_estimators=100,
                 max_depth=2,
                 learning_rate=0.01,
                 loss='ls',
                 max_features=None,
                 min_samples_split=2,
                 min_samples_leaf=3,
                 alpha=.09,
                 verbose=0,
                 name='GBR-Reg',
                 fit_intercept=True,
                 normalize=False):
        """Store the hyper-parameters and create the wrapped boosting model.

        ``name``, ``fit_intercept`` and ``normalize`` are handed to the
        parent initializer.  The boosting settings are kept on the instance
        (except ``max_features`` and ``verbose``, which are only forwarded)
        and passed to ``gbr`` to build ``self.model``.  ``alpha`` is
        accepted but neither stored nor forwarded.
        """
        # Register the method's name and paradigm with the base class.
        super().__init__(name, fit_intercept, normalize)

        model_kwargs = dict(n_estimators=n_estimators,
                            max_depth=max_depth,
                            learning_rate=learning_rate,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf,
                            max_features=max_features,
                            verbose=verbose,
                            loss=loss)

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.loss = loss
        self.min_samples_leaf = min_samples_leaf
        self.output_directory = ''
        self.model = gbr(**model_kwargs)
def runGBR(train_1_x, test_x, hold_out, sub_test, fobj):
    """Train a gradient boosting regressor and predict the remaining sets.

    Rows of ``train_1_x`` whose last column is above 30 are excluded.  The
    last column of the kept rows is the target; the other columns are the
    features.  The training mean absolute error is printed and logged.

    Args:
        train_1_x: 2-D array of training rows including the target column.
        test_x, hold_out, sub_test: feature matrices to predict.
        fobj: writable file-like object used as the log.

    Returns:
        Tuple ``(est, valid_y_pred, hold_y, sub_y)``: the fitted estimator
        followed by its predictions for ``test_x``, ``hold_out`` and
        ``sub_test``.
    """
    ntrees = 100
    rate = 0.3
    max_depth = 6
    exp = 30

    c_train_1_x = train_1_x[(train_1_x[:, -1] <= exp)]
    c_train_y = c_train_1_x[:, -1]
    c_train_x = c_train_1_x[:, :-1]

    rf_model = gbr(n_estimators=ntrees,
                   loss='lad',
                   learning_rate=rate,
                   max_depth=max_depth)
    est = rf_model.fit(c_train_x, c_train_y)
    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    # Parenthesised so the line is valid on both Python 2 and Python 3.
    print('GBR Train Error: %r\n' % (error))
    fobj.write('GBR Train-1 Error: %r\n' % (error))
    valid_y_pred = est.predict(test_x)
    hold_y = est.predict(hold_out)
    sub_y = est.predict(sub_test)
    return est, valid_y_pred, hold_y, sub_y
Пример #10
0
def driver(comp_mat, pps_mth='ORIGINAL', exp_thresh=[25], regression_mth=[], test_size=0.2, f_selection=False, n_features=3, reg_methods=[]):
    """Evaluate a gradient boosting regressor on ``comp_mat`` and log the errors.

    The last column of ``comp_mat`` is the target.  The matrix is split into
    a training part and an "unconstrained" test part.  Training rows whose
    target exceeds the threshold (fixed at 25) are dropped and the rest is
    split again into a training part and a "constrained" test part.  The
    model is fitted through ``rf_regressor`` on a K-fold split of the
    constrained training part; mean absolute errors are written to the
    module-level ``fobj`` and printed.

    Args:
        comp_mat: 2-D array; all columns but the last are features.
        test_size: forwarded to both ``train_test_split`` calls.
        f_selection, n_features: forwarded to ``rf_regressor``.
        pps_mth, exp_thresh, regression_mth, reg_methods: accepted but not
            used by this function.
    """
    global fobj
    fobj.write('Total Number of records: %r\n' % (len(comp_mat)))

    # Step 1: split off the unconstrained test records.
    train_x, u_test_x, train_y, u_test_y = cv.train_test_split(comp_mat, comp_mat[:,-1], random_state = 52, test_size=test_size)

    # Remove the target column from the test matrix.
    u_test_x = u_test_x[:,:-1]

    exp = 25
    fobj.write('\n\nExpected Threshold Limit: %r\n' % (exp))
    # Step 2: prune the training rows to the expected threshold.
    c_train_x = train_x[(train_x[:,-1]<=exp)]
    c_train_y = c_train_x[:,-1]
    c_train_x = c_train_x[:,:-1]

    # Step 3: split the constrained rows into train and test parts.
    ttrain_x, ttest_x, ttrain_y, ttest_y = cv.train_test_split(c_train_x, c_train_y, random_state = 32, test_size=test_size )

    fobj.write('Total Number of Constrained test records: %r\n' % len(ttest_y))
    fobj.write('Total Number of Unconstrained test records: %r\n' % len(u_test_y))

    fobj.write('Total Constrained Training Records: %r\n' % len(ttrain_y))

    # print calls are parenthesised so they parse on Python 2 and Python 3.
    print('Fitting Model Measurements')
    trees_array = [100]

    kf = cv.KFold(len(ttrain_x), n_folds=5)
    st_time = time.time()

    l_rate = [0.3]

    for rate in l_rate:
        fobj.write('\n\nLearning Rate: %r\n' % (rate))
        print('\n\nLearning Rate: %r\n' % (rate))
        for ntrees in trees_array:
            valid_acc = []
            test_acc = []
            train_acc = []
            est_arr = []
            unconst_acc = []
            fold = 0

            for train_idx, test_idx in kf:
                fobj.write('\nfold: %r\n'%(fold))
                print('\nfold: %r\n'%(fold))
                rf_model = gbr(n_estimators=ntrees, loss='lad', learning_rate=rate)
                vacc, tacc, est = rf_regressor(rf_model, ttrain_x[train_idx], ttrain_y[train_idx], ttrain_x[test_idx], ttrain_y[test_idx], f_selection=f_selection, n_features=n_features)
                valid_acc.append(vacc)
                train_acc.append(tacc)
                est_arr.append(est)
                fobj.write('Train Size:%r\n' % (len(train_idx)))
                print('Train Size:%r\n' % (len(train_idx)))
                fobj.write('Validation Size:%r\n' % (len(test_idx)))
                fobj.write('Validation Error: %r\n' % (vacc))

                print('Validation Error: %r\n' % (vacc))
                fobj.write('Train Error: %r\n' % (tacc))
                fobj.write('Constrained Test data size:%r\n' % (len(ttest_x)))
                test_acc.append(mt.mean_absolute_error(ttest_y, est.predict(ttest_x)))
                fobj.write('Constrained Test Error for fold: %r\n' % (test_acc[-1]))
                print('Constrained Test Error for fold: %r\n' % (test_acc[-1]))
                y_res = est.predict(u_test_x)

                unconst_acc.append(mt.mean_absolute_error(u_test_y, y_res))
                fobj.write('Complete test accuracy:%r\n' % (unconst_acc[-1]))
                print('Complete test accuracy:%r\n' % (unconst_acc[-1]))
                fold+=1
                # NOTE(review): the loop stops after the first fold, so the
                # averages below cover a single fold -- confirm this is
                # intended.
                break

            et_time = time.time()
            fobj.write('..Statistics..\n')
            fobj.write('Expected Threshold Limit: %r\n' % (exp))
            fobj.write('Trees:%r\n' % ntrees)
            fobj.write('Train Average: %r\n' % (np.mean(train_acc)))
            fobj.write('Validation Average: %r\n' % (np.mean(valid_acc)))
            fobj.write('Constrained Test Average: %r\n' % (np.mean(test_acc)))
            fobj.write('Unconstrained Test Avg: %r\n' % (np.mean(unconst_acc)))
            fobj.write('Total Time Taken: %r mins\n' % ((et_time-st_time)/60))

            # Print the same statistics to the console.
            print('..Statistics..\n')
            print('Expected Threshold Limit: %r\n' % (exp))
            print('Trees:%r\n' % ntrees)
            print('Train Average: %r\n' % (np.mean(train_acc)))
            print('Validation Average: %r\n' % (np.mean(valid_acc)))
            print('Constrained Test Average: %r\n' % (np.mean(test_acc)))
            print('Unconstrained Test Avg: %r\n' % (np.mean(unconst_acc)))
            print('Total Time Taken: %r mins\n' % ((et_time-st_time)/60))
Пример #11
0
def driver(comp_mat,
           pps_mth='ORIGINAL',
           exp_thresh=[25],
           regression_mth=[],
           test_size=0.2,
           f_selection=False,
           n_features=3,
           reg_methods=[]):
    """Evaluate a gradient boosting regressor on ``comp_mat`` and log the errors.

    The last column of ``comp_mat`` is the target.  The matrix is split into
    a training part and an "unconstrained" test part.  Training rows whose
    target exceeds the threshold (fixed at 25) are dropped and the rest is
    split again into a training part and a "constrained" test part.  The
    model is fitted through ``rf_regressor`` on a K-fold split of the
    constrained training part; mean absolute errors are written to the
    module-level ``fobj`` and printed.

    Args:
        comp_mat: 2-D array; all columns but the last are features.
        test_size: forwarded to both ``train_test_split`` calls.
        f_selection, n_features: forwarded to ``rf_regressor``.
        pps_mth, exp_thresh, regression_mth, reg_methods: accepted but not
            used by this function.
    """
    global fobj
    fobj.write('Total Number of records: %r\n' % (len(comp_mat)))

    # Step 1: split off the unconstrained test records.
    train_x, u_test_x, train_y, u_test_y = cv.train_test_split(
        comp_mat, comp_mat[:, -1], random_state=52, test_size=test_size)

    # Remove the target column from the test matrix.
    u_test_x = u_test_x[:, :-1]

    exp = 25
    fobj.write('\n\nExpected Threshold Limit: %r\n' % (exp))
    # Step 2: prune the training rows to the expected threshold.
    c_train_x = train_x[(train_x[:, -1] <= exp)]
    c_train_y = c_train_x[:, -1]
    c_train_x = c_train_x[:, :-1]

    # Step 3: split the constrained rows into train and test parts.
    ttrain_x, ttest_x, ttrain_y, ttest_y = cv.train_test_split(
        c_train_x, c_train_y, random_state=32, test_size=test_size)

    fobj.write('Total Number of Constrained test records: %r\n' % len(ttest_y))
    fobj.write('Total Number of Unconstrained test records: %r\n' %
               len(u_test_y))

    fobj.write('Total Constrained Training Records: %r\n' % len(ttrain_y))

    # print calls are parenthesised so they parse on Python 2 and Python 3.
    print('Fitting Model Measurements')
    trees_array = [100]

    kf = cv.KFold(len(ttrain_x), n_folds=5)
    st_time = time.time()

    l_rate = [0.3]

    for rate in l_rate:
        fobj.write('\n\nLearning Rate: %r\n' % (rate))
        print('\n\nLearning Rate: %r\n' % (rate))
        for ntrees in trees_array:
            valid_acc = []
            test_acc = []
            train_acc = []
            est_arr = []
            unconst_acc = []
            fold = 0

            for train_idx, test_idx in kf:
                fobj.write('\nfold: %r\n' % (fold))
                print('\nfold: %r\n' % (fold))
                rf_model = gbr(n_estimators=ntrees,
                               loss='lad',
                               learning_rate=rate)
                vacc, tacc, est = rf_regressor(rf_model,
                                               ttrain_x[train_idx],
                                               ttrain_y[train_idx],
                                               ttrain_x[test_idx],
                                               ttrain_y[test_idx],
                                               f_selection=f_selection,
                                               n_features=n_features)
                valid_acc.append(vacc)
                train_acc.append(tacc)
                est_arr.append(est)
                fobj.write('Train Size:%r\n' % (len(train_idx)))
                print('Train Size:%r\n' % (len(train_idx)))
                fobj.write('Validation Size:%r\n' % (len(test_idx)))
                fobj.write('Validation Error: %r\n' % (vacc))

                print('Validation Error: %r\n' % (vacc))
                fobj.write('Train Error: %r\n' % (tacc))
                fobj.write('Constrained Test data size:%r\n' % (len(ttest_x)))
                test_acc.append(
                    mt.mean_absolute_error(ttest_y, est.predict(ttest_x)))
                fobj.write('Constrained Test Error for fold: %r\n' %
                           (test_acc[-1]))
                print('Constrained Test Error for fold: %r\n' % (test_acc[-1]))
                y_res = est.predict(u_test_x)

                unconst_acc.append(mt.mean_absolute_error(u_test_y, y_res))
                fobj.write('Complete test accuracy:%r\n' % (unconst_acc[-1]))
                print('Complete test accuracy:%r\n' % (unconst_acc[-1]))
                fold += 1
                # NOTE(review): the loop stops after the first fold, so the
                # averages below cover a single fold -- confirm this is
                # intended.
                break

            et_time = time.time()
            fobj.write('..Statistics..\n')
            fobj.write('Expected Threshold Limit: %r\n' % (exp))
            fobj.write('Trees:%r\n' % ntrees)
            fobj.write('Train Average: %r\n' % (np.mean(train_acc)))
            fobj.write('Validation Average: %r\n' % (np.mean(valid_acc)))
            fobj.write('Constrained Test Average: %r\n' % (np.mean(test_acc)))
            fobj.write('Unconstrained Test Avg: %r\n' % (np.mean(unconst_acc)))
            fobj.write('Total Time Taken: %r mins\n' %
                       ((et_time - st_time) / 60))

            # Print the same statistics to the console.
            print('..Statistics..\n')
            print('Expected Threshold Limit: %r\n' % (exp))
            print('Trees:%r\n' % ntrees)
            print('Train Average: %r\n' % (np.mean(train_acc)))
            print('Validation Average: %r\n' % (np.mean(valid_acc)))
            print('Constrained Test Average: %r\n' % (np.mean(test_acc)))
            print('Unconstrained Test Avg: %r\n' % (np.mean(unconst_acc)))
            print('Total Time Taken: %r mins\n' % ((et_time - st_time) / 60))
Пример #12
0
def driver(comp_mat, pps_mth='ORIGINAL', exp_thresh=[25], regression_mth=[], test_size=0.2, f_selection=False, n_features=3, reg_methods=[]):
    """Cross-validate a gradient boosting regressor and write a solution file.

    The last column of ``comp_mat`` is the target.  The matrix is split into
    a training part and an "unconstrained" test part.  Training rows whose
    target exceeds the threshold (fixed at 25) are dropped and the rest is
    split again into a training part and a "constrained" test part.  The
    model is fitted through ``rf_regressor`` on each of 5 folds of the
    constrained training part; mean absolute errors are written to the
    module-level ``fobj`` and printed.  Finally the estimator from the last
    fold predicts the rows of ``ensemble_data\\norm_test_fmat.csv`` and the
    result is saved to two CSV files under a hard-coded directory.

    Args:
        comp_mat: 2-D array; all columns but the last are features.
        test_size: forwarded to both ``train_test_split`` calls.
        f_selection, n_features: forwarded to ``rf_regressor``.
        pps_mth, exp_thresh, regression_mth, reg_methods: accepted but not
            used by this function.
    """
    global fobj
    fobj.write('Total Number of records: %r\n' % (len(comp_mat)))

    # Step 1: split off the unconstrained test records.
    train_x, u_test_x, train_y, u_test_y = cv.train_test_split(comp_mat, comp_mat[:,-1], random_state = 52, test_size=test_size)

    # Remove the target column from the test matrix.
    u_test_x = u_test_x[:,:-1]

    exp = 25
    fobj.write('\n\nExpected Threshold Limit: %r\n' % (exp))
    # Step 2: prune the training rows to the expected threshold.
    c_train_x = train_x[(train_x[:,-1]<=exp)]
    c_train_y = c_train_x[:,-1]
    c_train_x = c_train_x[:,:-1]

    # Step 3: split the constrained rows into train and test parts.
    ttrain_x, ttest_x, ttrain_y, ttest_y = cv.train_test_split(c_train_x, c_train_y, random_state = 32, test_size=test_size )

    fobj.write('Total Number of Constrained test records: %r\n' % len(ttest_y))
    fobj.write('Total Number of Unconstrained test records: %r\n' % len(u_test_y))

    fobj.write('Total Constrained Training Records: %r\n' % len(ttrain_y))

    # print calls are parenthesised so they parse on Python 2 and Python 3.
    print('Fitting Model Measurements')
    trees_array = [1000]

    kf = cv.KFold(len(ttrain_x), n_folds=5)
    st_time = time.time()

    l_rate = [0.3]

    for rate in l_rate:
        fobj.write('\n\nLearning Rate: %r\n' % (rate))
        print('\n\nLearning Rate: %r\n' % (rate))
        for ntrees in trees_array:
            valid_acc = []
            test_acc = []
            train_acc = []
            est_arr = []
            unconst_acc = []
            fold = 0

            for train_idx, test_idx in kf:
                fobj.write('\nfold: %r\n'%(fold))
                print('\nfold: %r\n'%(fold))
                rf_model = gbr(n_estimators=ntrees, loss='lad', learning_rate=rate)
                vacc, tacc, est = rf_regressor(rf_model, ttrain_x[train_idx], ttrain_y[train_idx], ttrain_x[test_idx], ttrain_y[test_idx], f_selection=f_selection, n_features=n_features)
                valid_acc.append(vacc)
                train_acc.append(tacc)
                est_arr.append(est)
                fobj.write('Train Size:%r\n' % (len(train_idx)))
                print('Train Size:%r\n' % (len(train_idx)))
                fobj.write('Validation Size:%r\n' % (len(test_idx)))
                fobj.write('Validation Error: %r\n' % (vacc))

                print('Validation Error: %r\n' % (vacc))
                fobj.write('Train Error: %r\n' % (tacc))
                fobj.write('Constrained Test data size:%r\n' % (len(ttest_x)))
                test_acc.append(mt.mean_absolute_error(ttest_y, est.predict(ttest_x)))
                fobj.write('Constrained Test Error for fold: %r\n' % (test_acc[-1]))
                print('Constrained Test Error for fold: %r\n' % (test_acc[-1]))
                y_res = est.predict(u_test_x)

                unconst_acc.append(mt.mean_absolute_error(u_test_y, y_res))
                fobj.write('Complete test accuracy:%r\n' % (unconst_acc[-1]))
                print('Complete test accuracy:%r\n' % (unconst_acc[-1]))
                fold+=1

            et_time = time.time()
            fobj.write('..Statistics..\n')
            fobj.write('Expected Threshold Limit: %r\n' % (exp))
            fobj.write('Trees:%r\n' % ntrees)
            fobj.write('Train Average: %r\n' % (np.mean(train_acc)))
            fobj.write('Validation Average: %r\n' % (np.mean(valid_acc)))
            fobj.write('Constrained Test Average: %r\n' % (np.mean(test_acc)))
            fobj.write('Unconstrained Test Avg: %r\n' % (np.mean(unconst_acc)))
            fobj.write('Total Time Taken: %r mins\n' % ((et_time-st_time)/60))

            # Print the same statistics to the console.
            print('..Statistics..\n')
            print('Expected Threshold Limit: %r\n' % (exp))
            print('Trees:%r\n' % ntrees)
            print('Train Average: %r\n' % (np.mean(train_acc)))
            print('Validation Average: %r\n' % (np.mean(valid_acc)))
            print('Constrained Test Average: %r\n' % (np.mean(test_acc)))
            print('Unconstrained Test Avg: %r\n' % (np.mean(unconst_acc)))
            print('Total Time Taken: %r mins\n' % ((et_time-st_time)/60))

    print('Generating Solution Result:')
    test_x = np.loadtxt('ensemble_data\\norm_test_fmat.csv',delimiter=',')
    print('Test Size:%r' % len(test_x))
    # ``est`` is the estimator fitted on the last fold of the loop above.
    test_y_pred = est.predict(test_x)
    # ``np.int`` has been removed from NumPy; the builtin is equivalent.
    id_col = np.arange(1.,len(test_y_pred)+1, dtype=int)
    all_data = np.column_stack((id_col, test_y_pred))
    # Every backslash is escaped: the unescaped "\U" in the original literals
    # is a syntax error on Python 3.  The resulting paths are unchanged.
    out_dir = 'C:\\Users\\saura\\Desktop\\ML_Project\\ensemble_data\\'
    raw_csv = out_dir + 'gbr_mytest_solution.csv'
    # NOTE(review): np.savetxt prefixes the header with '# ' by default, so
    # pandas reads the first column as '# Id' -- confirm.
    np.savetxt(raw_csv, all_data, delimiter=',', header='Id,Expected')
    df = pd.read_csv(raw_csv)
    df.to_csv(out_dir + 'gbr_final_solution.csv', header=True, index=False)
Пример #13
0
 # Compare a panel of regressors on raw features, then prepare a
 # standardised copy of the data for a second pass.
 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
 regressors = [
     lr(),
     bay(),
     rr(alpha=.5, random_state=0),
     l(alpha=0.1, random_state=0),
     ll(),
     knn(),
     ard(),
     rfr(random_state=0, n_estimators=100),
     SVR(gamma='scale', kernel='rbf'),
     rcv(fit_intercept=False),
     en(random_state=0),
     dtr(random_state=0),
     ada(random_state=0),
     gbr(random_state=0)
 ]
 print('unscaled:', br)
 for reg in regressors:
     reg.fit(X_train, y_train)
     rmse, name = get_error(reg, X_test, y_test)
     # The label returned by get_error is replaced by the class name.
     name = reg.__class__.__name__
     print(name + '(rmse):', end=' ')
     print(rmse)
 print()
 print('scaled:', br)
 scaler = StandardScaler()
 X_train_std = scaler.fit_transform(X_train)
 # Apply the statistics learned on the training split; re-fitting on the
 # test split would scale the two sets inconsistently.
 X_test_std = scaler.transform(X_test)
 for reg in regressors:
     reg.fit(X_train_std, y_train)
# Scale features and targets (x_scaler / y_scaler are defined outside this
# excerpt), split 80/20 and wrap every part in a DataFrame.
X_s = x_scaler.fit_transform(X)
y_s = y_scaler.fit_transform(y)

X_train, X_test, y_train, y_test = [
    pd.DataFrame(part)
    for part in train_test_split(X_s, y_s, test_size=0.2)
]

#           Gradient Boosting Regressor start

# training the model

from sklearn.ensemble import GradientBoostingRegressor as gbr
gbr = gbr(loss='ls', learning_rate=0.1, n_estimators=800)


def _fit_and_predict(target_col):
    """Refit ``gbr`` on one target column and return test predictions as a column vector."""
    gbr.fit(X_train, y_train[target_col])
    return np.reshape(gbr.predict(X_test), (X_test.shape[0], 1))


# One fit/predict round per target column, in column order; ``gbr`` ends up
# fitted on the last column.
y_pred1, y_pred2, y_pred3, y_pred4, y_pred5 = [
    _fit_and_predict(col) for col in range(5)
]
Пример #15
0
def driver(comp_mat,
           pps_mth='ORIGINAL',
           exp_thresh=[25],
           regression_mth=[],
           test_size=0.2,
           f_selection=False,
           n_features=3,
           reg_methods=[],
           remove_mp=False):
    """Cross-validate a gradient boosting regressor and write a solution file.

    The last column of ``comp_mat`` is used as the label. The routine:

    1. splits ``comp_mat`` into a training part and an "unconstrained" test
       part (``random_state=52``);
    2. keeps only the training rows whose label is ``<= 25`` (the threshold
       is hard-coded in the body) and splits those again into a constrained
       train/test pair (``random_state=32``);
    3. runs 5-fold cross-validation on the constrained training rows for
       every learning rate / tree count combination, logging the errors of
       each fold and their averages to the global ``fobj`` and to stdout;
    4. predicts a test matrix loaded from disk with the estimator fitted
       last, and writes the predictions to two CSV files.

    Args:
        comp_mat: 2-D array; the last column holds the label.
        pps_mth: Not read anywhere in the body.
        exp_thresh: Not read anywhere in the body (the loop over it is
            commented out).
        regression_mth: Not read anywhere in the body.
        test_size: Fraction forwarded to both ``cv.train_test_split`` calls.
        f_selection: Forwarded to ``rf_regressor``.
        n_features: Forwarded to ``rf_regressor``.
        reg_methods: Not read anywhere in the body.
        remove_mp: Not read anywhere in the body.

    Returns:
        None. All results are written to ``fobj``, stdout and the CSV files.
    """
    global fobj
    fobj.write('Total Number of records: %r\n' % (len(comp_mat)))

    # Step 1: split train and test records. The full matrix (label column
    # included) is split so that train_x can still be filtered by label below.
    train_x, u_test_x, train_y, u_test_y = cv.train_test_split(
        comp_mat, comp_mat[:, -1], random_state=52, test_size=test_size)

    # Remove the label column from the unconstrained test matrix.
    u_test_x = u_test_x[:, :-1]

    #for exp in exp_thresh:
    # The threshold is fixed here; the exp_thresh argument is ignored.
    exp = 25
    fobj.write('\n\nExpected Threshold Limit: %r\n' % (exp))
    # Step 2: prune the training rows to the expected-value threshold, then
    # separate the label column from the features.
    c_train_x = train_x[(train_x[:, -1] <= exp)]
    c_train_y = c_train_x[:, -1]
    c_train_x = c_train_x[:, :-1]

    # Step 3: split the constrained rows into train and constrained test.
    ttrain_x, ttest_x, ttrain_y, ttest_y = cv.train_test_split(
        c_train_x, c_train_y, random_state=32, test_size=test_size)

    fobj.write('Total Number of Constrained test records: %r\n' % len(ttest_y))
    fobj.write('Total Number of Unconstrained test records: %r\n' %
               len(u_test_y))

    fobj.write('Total Constrained Training Records: %r\n' % len(ttrain_y))

    print 'Fitting Model Measurements'
    trees_array = [1000]

    # The same 5 folds are reused for every learning rate / tree count.
    kf = cv.KFold(len(ttrain_x), n_folds=5)
    # The timer starts once, so "Total Time Taken" below is cumulative across
    # all configurations, not per configuration.
    st_time = time.time()

    l_rate = [0.3]
    # NOTE(review): max_depth and min_samples_leaf are defined but never used
    # below; the estimator is built without them.
    max_depth = [4, 6]
    min_samples_leaf = [3, 5, 9, 17]

    for rate in l_rate:
        fobj.write('\n\nLearning Rate: %r\n' % (rate))
        print '\n\nLearning Rate: %r\n' % (rate)
        for ntrees in trees_array:
            # Per-configuration accumulators, one entry per fold. Despite the
            # "acc" names, test_acc and unconst_acc hold mean absolute errors.
            valid_acc = []
            test_acc = []
            train_acc = []
            est_arr = []
            unconst_acc = []
            fold = 0

            for train_idx, test_idx in kf:
                fobj.write('\nfold: %r\n' % (fold))
                print '\nfold: %r\n' % (fold)
                #rf_model = rfr(n_estimators=ntrees, n_jobs=-1) Random forest regressor
                #rf_model = etr(n_estimators=ntrees, n_jobs=3, bootstrap=False)
                rf_model = gbr(n_estimators=ntrees,
                               loss='lad',
                               learning_rate=rate)
                # rf_regressor is defined elsewhere; its three results are
                # logged below as validation error, train error and (presumably)
                # the fitted estimator -- confirm against its definition.
                vacc, tacc, est = rf_regressor(rf_model,
                                               ttrain_x[train_idx],
                                               ttrain_y[train_idx],
                                               ttrain_x[test_idx],
                                               ttrain_y[test_idx],
                                               f_selection=f_selection,
                                               n_features=n_features)
                valid_acc.append(vacc)
                train_acc.append(tacc)
                # est_arr is only appended to; it is never read afterwards.
                est_arr.append(est)
                fobj.write('Train Size:%r\n' % (len(train_idx)))
                print 'Train Size:%r\n' % (len(train_idx))
                fobj.write('Validation Size:%r\n' % (len(test_idx)))
                fobj.write('Validation Error: %r\n' % (vacc))

                print 'Validation Error: %r\n' % (vacc)
                fobj.write('Train Error: %r\n' % (tacc))
                fobj.write('Constrained Test data size:%r\n' % (len(ttest_x)))
                # Error of this fold's estimator on the constrained test rows.
                test_acc.append(
                    mt.mean_absolute_error(ttest_y, est.predict(ttest_x)))
                fobj.write('Constrained Test Error for fold: %r\n' %
                           (test_acc[-1]))
                print 'Constrained Test Error for fold: %r\n' % (test_acc[-1])
                # Error on the unconstrained test rows (no label threshold).
                y_res = est.predict(u_test_x)

                unconst_acc.append(mt.mean_absolute_error(u_test_y, y_res))
                fobj.write('Complete test accuracy:%r\n' % (unconst_acc[-1]))
                print 'Complete test accuracy:%r\n' % (unconst_acc[-1])
                fold += 1

            et_time = time.time()
            # Averages over the folds of this configuration, written to file.
            fobj.write('..Statistics..\n')
            fobj.write('Expected Threshold Limit: %r\n' % (exp))
            fobj.write('Trees:%r\n' % ntrees)
            fobj.write('Train Average: %r\n' % (np.mean(train_acc)))
            fobj.write('Validation Average: %r\n' % (np.mean(valid_acc)))
            fobj.write('Constrained Test Average: %r\n' % (np.mean(test_acc)))
            fobj.write('Unconstrained Test Avg: %r\n' % (np.mean(unconst_acc)))
            fobj.write('Total Time Taken: %r mins\n' %
                       ((et_time - st_time) / 60))

            # The same statistics, echoed to the console.
            print('..Statistics..\n')
            print('Expected Threshold Limit: %r\n' % (exp))
            print('Trees:%r\n' % ntrees)
            print('Train Average: %r\n' % (np.mean(train_acc)))
            print('Validation Average: %r\n' % (np.mean(valid_acc)))
            print('Constrained Test Average: %r\n' % (np.mean(test_acc)))
            print('Unconstrained Test Avg: %r\n' % (np.mean(unconst_acc)))
            print('Total Time Taken: %r mins\n' % ((et_time - st_time) / 60))

    print 'Generating Solution Result:'
    # NOTE(review): the input is read from a path relative to the working
    # directory, while the outputs below go to absolute paths.
    test_x = np.loadtxt('ensemble_data\\norm_test_fmat.csv', delimiter=',')
    print 'Test Size:%r' % len(test_x)
    # `est` is whichever estimator was fitted last in the loops above (the
    # final fold of the final configuration); it is not an ensemble of folds.
    test_y_pred = est.predict(test_x)
    # 1-based row ids for the solution file.
    id_col = np.arange(1., len(test_y_pred) + 1, dtype=np.int)
    all_data = np.column_stack((id_col, test_y_pred))
    # NOTE(review): np.savetxt adds its comment prefix to the header line by
    # default, so the column read back below may not be named plain 'Id' --
    # confirm the final file's header is what the consumer expects.
    np.savetxt(
        'C:\Users\saura\Desktop\ML_Project\ensemble_data\\gbr_mytest_solution.csv',
        all_data,
        delimiter=',',
        header='Id,Expected')
    # Round-trip through pandas to produce the final CSV without an index.
    df = pd.read_csv(
        'C:\Users\saura\Desktop\ML_Project\ensemble_data\\gbr_mytest_solution.csv'
    )
    df.to_csv(
        'C:\Users\saura\Desktop\ML_Project\\ensemble_data\\gbr_final_solution.csv',
        header=True,
        index=False)
print("CV score: {:<8.8f}".format(mean_squared_error(oof_rfr_263, target)))

# GradientBoostingRegressor: gradient-boosted decision trees.
# oof_gbr_263 collects the out-of-fold predictions for the training rows;
# predictions_gbr_263 accumulates the fold-averaged test prediction.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
oof_gbr_263 = np.zeros(train_shape)
predictions_gbr_263 = np.zeros(len(X_test_263))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x, tr_y = X_train_263[trn_idx], y_train[trn_idx]

    # A fresh estimator is built and fitted for every fold.
    gbr_263 = gbr(
        n_estimators=400,
        learning_rate=0.01,
        subsample=0.65,
        max_depth=7,
        min_samples_leaf=20,
        max_features=0.22,
        verbose=1,
    )
    gbr_263.fit(tr_x, tr_y)

    # Rows held out in this fold receive their out-of-fold prediction.
    val_x = X_train_263[val_idx]
    oof_gbr_263[val_idx] = gbr_263.predict(val_x)

    # Each fold contributes 1/n_splits of the final test prediction.
    predictions_gbr_263 += gbr_263.predict(X_test_263) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_gbr_263, target)))

# ExtraTreesRegressor: extremely randomized trees regression.
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_etr_263 = np.zeros(train_shape)
predictions_etr_263 = np.zeros(len(X_test_263))
Пример #17
0
    #Load Test X matrix CSV
    test_x = np.loadtxt('C:\Users\saura\Desktop\ML_Project\data\\norm_test_fmat.csv',delimiter=',')
    print 'Test Size:%r' % len(test_x)
    test_y_pred = est.predict(test_x)
    id_col = np.arange(1.,len(test_y_pred)+1, dtype=np.int)
    all_data = np.column_stack((id_col, test_y_pred))
    np.savetxt('C:\Users\saura\Desktop\ML_Project\data\\mytest_solution.csv', all_data, delimiter=',', header='Id,Expected')
    df = pd.read_csv('C:\Users\saura\Desktop\ML_Project\data\\mytest_solution.csv')
    df.to_csv('C:\Users\saura\Desktop\ML_Project\\final_solution.csv', header=True, index=False)

if __name__ == "__main__":
    # Load the feature matrix and the label vector from comma-separated files.
    narray = np.loadtxt(
        'C:\Users\saura\Desktop\ML_Project\data\\rho_gr_0_85_fmat.csv',
        delimiter=',')
    print 'read narray.. size:%r' % (len(narray))
    nlabel = np.loadtxt(
        'C:\Users\saura\Desktop\ML_Project\data\\rho_gr_0_85_label.csv',
        delimiter=',')
    print 'read label'

    # Hold out 20% of the rows with a fixed seed, then fit and evaluate a
    # 100-tree gradient boosting model through rf_regressor.
    train_x, test_x, train_y, test_y = cv.train_test_split(
        narray, nlabel, test_size=0.2, random_state=42)
    rf_model = gbr(loss='lad', n_estimators=100)
    rf_regressor(rf_model, train_x, train_y, test_x, test_y)

    #200 0.99157639  19.635566   19.24871943     FALSE


#gBR results
#n_estimators = 100
# read narray.. size:731556
# read label
# Train Size:585244
# Train Error 23.2042661418
# Train Size:146312
# valid Error:23.190855007768523
# Test Size:717625
# Split the combined frame back into its two parts, each re-indexed from 0.
# The first len(df1) rows form the training frame; the test frame starts one
# row after that, so the row at position len(df1) ends up in neither frame.
n_train_rows = len(df1)
train_df = df[:n_train_rows].reset_index(drop=True)
test_df = df[n_train_rows + 1:].reset_index(drop=True)

########### Weather baseline
def get_l2(y, y_hat):
    """Return the squared error of ``y_hat`` relative to the spread of ``y``.

    The sum of squared residuals ``(y - y_hat)**2`` is divided by the sum of
    squared deviations of ``y`` from its own mean, so predicting the mean of
    ``y`` everywhere yields 1 and a perfect prediction yields 0.
    """
    residual_ss = np.sum((y - y_hat)**2)
    total_ss = np.sum((y - np.mean(y))**2)
    return residual_ss / total_ss

def create_sklearn_compatible_x_y(df, columns):
    """Extract a feature matrix and a target vector from ``df``.

    Args:
        df: Frame holding the feature columns and a column named ``y``.
        columns: Labels of the columns to use as features, in output order.

    Returns:
        A ``(X, y)`` pair of arrays: the values of ``df[columns]`` and the
        values of ``df.y``.
    """
    features = df[columns].values
    target = df.y.values
    return features, target

max_estimators = 500
# 50 evenly spaced values from 20 to max_estimators, rounded to whole numbers
# (the array keeps its float dtype).
tree_num = np.linspace(20, max_estimators, 50).round()

# Only the four weather columns are used as features for this baseline.
columns = ['temp', 'atemp', 'hum', 'windspeed']
(X_train, y_train), (X_test, y_test) = (
    create_sklearn_compatible_x_y(frame, columns)
    for frame in (train_df, test_df))

# Fit once with the maximum number of estimators.
model = gbr(n_estimators=max_estimators, max_depth=2)
model.fit(X_train, y_train)

def plot_l2_vs_estimator_num(X_train, y_train, X_test, y_test):
    train_staged_y_hat = model.staged_predict(X_train)
    test_staged_y_hat = model.staged_predict(X_test)
    y_hat_iter = itertools.izip(train_staged_y_hat, test_staged_y_hat)

    get_l2 = lambda y, y_hat: np.sum((y - y_hat)**2) / np.sum((y - np.mean(y))**2)

    res = {}
    def regression(self, metric, folds=10, alphas=None, printt=True, graph=False):
        """Cross-validate a fixed catalogue of regressors and report the results.

        Every model is scored with ``cross_val_score`` on ``self.Xt_train`` /
        ``self.yt_train`` using a shuffled ``KFold``. The fold scores are
        negated, and the report takes their square root before computing the
        mean, standard deviation and coefficient of variation per model.

        Args:
            metric: Value forwarded as ``scoring`` to ``cross_val_score``.
                Because scores are negated and square-rooted, a negated
                squared-error scorer is presumably expected -- confirm
                against the callers.
            folds: Number of ``KFold`` splits.
            alphas: Candidate regularisation strengths for the
                "Ridge CV regressor" entry. When omitted or empty, that
                estimator is built with its own default grid instead of an
                empty one.
            printt: When true, print the report table.
            graph: When true, draw a box plot of the per-fold scores, append
                the figure to ``self.graphs_model`` and show it.

        Returns:
            None. The model catalogue is stored on ``self.models`` and the
            sorted report on ``self.report_performance``.
        """
        size = self.graph_width

        # An empty alpha grid gives the Ridge CV estimator nothing to choose
        # from, so fall back to the estimator's defaults in that case.
        # len() is used rather than truthiness so array inputs work too.
        has_alphas = alphas is not None and len(alphas) > 0

        # significant model setup differences should be list as different models
        models = {
            "Linear regressor": lr(),
            "Lasso regressor": lassor(),
            "Lasso CV regressor": lassocvr(),
            "Ridge regressor": rr(alpha=0, normalize=True),
            "Ridge CV regressor": rcvr(alphas=alphas) if has_alphas else rcvr(),
            "Elastic net regressor": enr(),
            "K nearest neighbors regressor K2u": knnr(n_neighbors=2, weights='uniform'),
            "K nearest neighbors regressor K2d": knnr(n_neighbors=2, weights='distance'),
            "K nearest neighbors regressor K5": knnr(n_neighbors=5),
            "K nearest neighbors regressor K10": knnr(n_neighbors=10),
            "SGD regressor": sgdr(max_iter=10000, warm_start=True),
            "Decision tree regressor": dtr(),
            "Decision tree regressor D3": dtr(max_depth=3),
            "Random forest regressor": rfr(),
            "Ada boost regressor": abr(),
            "Gradient boost regressor": gbr(),
            "Support vector regressor RBF": svr(),
            # The kernel is passed by keyword, matching the 'poly' entry below.
            "Support vector regressor Linear": svr(kernel='linear'),
            "Support vector regressor Poly": svr(kernel='poly'),
        }
        self.models = models

        kf = KFold(n_splits=folds, shuffle=True)
        results = []
        names = []
        et = []
        for model_name, model in models.items():
            start = time.time()
            # Negate so that larger values mean larger error.
            cv_scores = -1 * cross_val_score(model, self.Xt_train, self.yt_train, cv=kf, scoring=metric)
            results.append(cv_scores)
            names.append(model_name)
            et.append((time.time() - start))

        report = pd.DataFrame({'Model': names, 'Score': results, 'Elapsed Time': et})
        # Summarise the per-fold scores on the square-root scale.
        report['Score (avg)'] = report.Score.apply(lambda x: np.sqrt(x).mean())
        report['Score (std)'] = report.Score.apply(lambda x: np.sqrt(x).std())
        # Coefficient of variation, in percent.
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True)
        # The raw per-fold arrays are dropped from the stored report.
        report.drop('Score', axis=1, inplace=True)
        report.reset_index(inplace=True, drop=True)
        self.report_performance = report

        if printt:
            print('\n')
            print(self.report_width * '*', '\n*')
            print('* REGRESSION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
            print(self.report_width * '*', '')
            print(report)
            print('\n')

        if graph:
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Regressor Comparison')
            #ax = fig.add_subplot(111)
            # One box per model, in catalogue order (not the sorted report order).
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0, bottom=0.25)
            self.graphs_model.append(fig)
            plt.show()
        return None
Пример #20
0
traindata = traindata.fillna(0)

# Features: every column except 'W', without the row labelled 0.
# reset_index() is called without drop=True, so the previous row labels are
# kept as an additional column of the feature frame.
train_features = traindata.drop(['W'], axis=1)
train_xcol = train_features.drop([0], axis=0).reset_index()

# Target: the 'W' column without its last row label. Assuming a default
# 0..n-1 index, feature row i + 1 is paired with the 'W' value of row i.
wins = traindata['W']
target = wins.drop([len(wins) - 1], axis=0)

# Gradient boosting hyper-parameters.
loss, learning_rate, n_estimators = 'lad', 0.05, 500
min_samples_split, max_depth = 3, 10

gbm = gbr(
    loss=loss,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    max_depth=max_depth,
)
model = gbm.fit(train_xcol, target)

# Build the evaluation data the same way as the training data, after
# removing three columns that are not used as features.
testdata = jaysdata.drop(['Lg', 'L', 'Year'], axis=1).fillna(0)
test_features = testdata.drop(['W'], axis=1)
test_xcol = test_features.drop([0], axis=0).reset_index()
test_ycol = testdata['W'].drop([len(testdata) - 1])

new_r_square = model.score(test_xcol, test_ycol)

hidden_layer_sizes = (100, 10)
mlpnn = mlp(hidden_layer_sizes=hidden_layer_sizes)
Пример #21
0
        test_data_X = sc_X.transform(test_data_X)
        train_data_y = np.log(1+train_data_y)

    if (Env_var.get('Pca') == 1):
        pca = PCA(n_components = 300).fit(train_data_X)    
        train_data_X = pca.transform(train_data_X)
        test_data_X = pca.transform(test_data_X)
    
    ###############################--------Model Setup--------###############################
    ann_regressor = KerasRegressor(build_fn=ann_model, epochs=30, batch_size=10, verbose=1)
    
    xgb_regressor = xgb(learning_rate = 0.0825, min_child_weight = 1, max_depth = 7, subsample = 0.8, verbose = 10, random_state = 2017, n_jobs = -1, eval_metric = "rmse")
    
    rfr_regressor = rfr(max_features = 0.9, min_samples_leaf = 50)
    
    gbr_regressor = gbr(n_estimators = 200, verbose = 5, learning_rate = 0.08, max_depth = 7, max_features = 0.5, min_samples_leaf = 50, subsample = 0.8, random_state = 2017)
    
    etr_regressor = etr(n_estimators = 200, verbose = 10, max_depth = 7, min_samples_leaf = 100, max_features = 0.9, min_impurity_split = 100, random_state = 2017)
    
    lr_regressor = lr()
    
    svr_regressor = svr(verbose = 10)
    
    ensemble = Ensemble(n_folds = 5,stacker =  lr_regressor,base_models = [ann_regressor, xgb_regressor, rfr_regressor, gbr_regressor, etr_regressor])
    
    
    ###############################--------Grid Search--------###############################

  
    if (Env_var.get('GridSearch') == 1):