def stg2_XTR(fobj, train_x, hold_x, sub_test, ntrees=100):
    """Train the stage-2 XTR model and write its submission files.

    The model (``etr``, presumably scikit-learn's ExtraTreesRegressor --
    confirm against the file's imports) is fitted on the rows of
    ``train_x`` whose last column is at most 40, restricted to the feature
    columns in ``arr``; column 27 of ``train_x`` supplies the target.  The
    mean absolute error on the training rows and on ``hold_x`` is printed
    and logged, then predictions for ``sub_test`` are written to two CSV
    files under ``param.CURRENT_FOLDER``.

    Args:
        fobj: Writable file-like object used as the run log.
        train_x: 2-D array.  The last column is compared with the
            threshold and column 27 is the target (presumably the same
            column -- confirm the matrix has 28 columns).
        hold_x: 2-D hold-out array; its last column is the target.
        sub_test: 2-D array of submission rows, indexed with ``arr``.
        ntrees: Number of trees passed to the model as ``n_estimators``.

    Returns:
        None.  Results are reported through ``fobj``, stdout and the files.
    """
    e = 40  # rows whose last column exceeds this are left out of training
    njobs = 2
    r = 0.3  # never reaches the model; it only tags the log and file names

    # Feature columns fed to the model.  The training slice additionally
    # carries column 27, which is split off as the regression target.
    arr = [0, 1, 2, 21, 22, 23, 24, 25, 26]
    c_arr = arr + [27]

    hold_test_x = hold_x[:, arr]
    hold_test_y = hold_x[:, -1]
    sub_test = sub_test[:, arr]

    # Every log record ends with a newline so entries do not run together.
    fobj.write('Trees: %r\n' % ntrees)

    print('Stage 2: Exp Threshold - %r' % e)
    fobj.write('Stage 2: Exp Threshold - %r\n' % e)
    c_train = train_x[(train_x[:, -1] <= e)]
    c_train = c_train[:, c_arr]
    c_train_y = c_train[:, -1]
    c_train_x = c_train[:, :-1]

    # NOTE: the tree count and the feature count are concatenated without a
    # separator; kept as-is so existing file names stay stable.
    name_tail = 'e%s_r%s_t_%s%s.csv' % (e, r, ntrees, len(arr))
    kaggle_path = os.path.join(param.CURRENT_FOLDER,
                               'Kaggle_XTR_Ankit_' + name_tail)
    df_kaggle_path = os.path.join(param.CURRENT_FOLDER,
                                  'Kaggle_df_XTR_Ankit_' + name_tail)

    print('Stage 2: Rate - %r' % r)
    fobj.write('XTR exp: %r rate: %r\n' % (e, r))
    est = etr(n_estimators=ntrees, n_jobs=njobs).fit(c_train_x, c_train_y)

    error = mt.mean_absolute_error(c_train_y, est.predict(c_train_x))
    print('XTR Train Error: %r\n' % (error))
    fobj.write('XTR Train-2 Error: %r\n' % (error))

    error = mt.mean_absolute_error(hold_test_y, est.predict(hold_test_x))
    print('XTR 20 percent Hold Error: %r\n' % (error))
    fobj.write('XTR 20 percent Hold Error: %r\n' % (error))

    print('Test Size:%r' % len(sub_test))
    test_y_pred = est.predict(sub_test)
    # 1-based row ids.  The builtin integer default replaces ``np.int``,
    # which newer NumPy releases no longer provide.
    id_col = np.arange(1, len(test_y_pred) + 1)
    all_data = np.column_stack((id_col, test_y_pred))
    # comments='' stops savetxt from writing the header as '# Id,Expected',
    # which would otherwise name the first column '# Id' in both files.
    np.savetxt(kaggle_path,
               all_data,
               delimiter=',',
               header='Id,Expected',
               comments='')
    # Round-trip through pandas to re-emit the values in plain CSV form.
    df = pd.read_csv(kaggle_path)
    df.to_csv(df_kaggle_path, header=True, index=False)
def stg2_XTR(fobj, train_x, hold_x, sub_test, ntrees=100):
    """Fit the stage-2 XTR regressor, report its errors and save predictions.

    Training uses only the rows of ``train_x`` whose last column is <= 40
    and only the columns listed in ``arr``; column 27 is the target.  The
    model is built with ``etr`` (presumably scikit-learn's
    ExtraTreesRegressor -- confirm against the file's imports).  Mean
    absolute errors for the training rows and for ``hold_x`` go to stdout
    and to ``fobj``; predictions for ``sub_test`` are written as two CSV
    files inside ``param.CURRENT_FOLDER``.

    Args:
        fobj: Writable file-like object receiving the log lines.
        train_x: 2-D array; last column is thresholded, column 27 is the
            target (presumably one and the same column -- confirm).
        hold_x: 2-D hold-out array with the target in its last column.
        sub_test: 2-D array of rows to predict; sliced with ``arr``.
        ntrees: Value forwarded as ``n_estimators``.

    Returns:
        None.
    """
    e = 40  # training rows with a last-column value above this are dropped
    njobs = 2
    r = 0.3  # label only: appears in the log and in the file names

    # Columns used as model inputs; column 27 is appended for the training
    # slice and peeled off again as the target.
    arr = [0, 1, 2, 21, 22, 23, 24, 25, 26]
    c_arr = arr + [27]

    hold_test_x = hold_x[:, arr]
    hold_test_y = hold_x[:, -1]
    sub_test = sub_test[:, arr]

    # Terminate each log record with '\n' so records stay on separate lines.
    fobj.write('Trees: %r\n' % ntrees)

    print('Stage 2: Exp Threshold - %r' % e)
    fobj.write('Stage 2: Exp Threshold - %r\n' % e)
    c_train = train_x[(train_x[:, -1] <= e)]
    c_train = c_train[:, c_arr]
    c_train_y = c_train[:, -1]
    c_train_x = c_train[:, :-1]

    # NOTE: tree count and feature count are joined without a separator;
    # left unchanged so previously generated file names still match.
    tag = 'e' + str(e) + '_r' + str(r) + '_t_' + str(ntrees) + str(len(arr))
    kaggle_file = 'Kaggle_XTR_Ankit_' + tag + '.csv'
    df_kaggle_file = 'Kaggle_df_XTR_Ankit_' + tag + '.csv'

    print('Stage 2: Rate - %r' % r)
    fobj.write('XTR exp: %r rate: %r\n' % (e, r))
    rf_model = etr(n_estimators=ntrees, n_jobs=njobs)
    est = rf_model.fit(c_train_x, c_train_y)

    error = mt.mean_absolute_error(c_train_y, est.predict(c_train_x))
    print('XTR Train Error: %r\n' % (error))
    fobj.write('XTR Train-2 Error: %r\n' % (error))

    error = mt.mean_absolute_error(hold_test_y, est.predict(hold_test_x))
    print('XTR 20 percent Hold Error: %r\n' % (error))
    fobj.write('XTR 20 percent Hold Error: %r\n' % (error))

    print('Test Size:%r' % len(sub_test))
    test_y_pred = est.predict(sub_test)
    # 1-based ids with the default integer dtype (``np.int`` is gone from
    # newer NumPy releases).
    id_col = np.arange(1, len(test_y_pred) + 1)
    all_data = np.column_stack((id_col, test_y_pred))

    raw_path = os.path.join(param.CURRENT_FOLDER, kaggle_file)
    # comments='' keeps savetxt from emitting '# Id,Expected'; with the
    # default prefix the first column would be called '# Id'.
    np.savetxt(raw_path, all_data, delimiter=',', header='Id,Expected',
               comments='')
    # Re-write through pandas so the values come out in plain CSV form.
    df = pd.read_csv(raw_path)
    df.to_csv(os.path.join(param.CURRENT_FOLDER, df_kaggle_file),
              header=True, index=False)
def runXTR(train_1_x, test_x, hold_out, sub_test, fobj):
    """Train the stage-1 XTR model and predict on three data sets.

    Rows of ``train_1_x`` whose last column exceeds 40 are discarded.  For
    the remaining rows the last column is the target and all other columns
    are the features.  The training mean absolute error is printed and
    written to ``fobj``.

    Args:
        train_1_x: 2-D training array with the target in its last column.
        test_x: Feature rows for the validation predictions.
        hold_out: Feature rows for the hold-out predictions.
        sub_test: Feature rows for the submission predictions.
        fobj: Writable file-like object used as the run log.

    Returns:
        Tuple ``(est, valid_y_pred, hold_y, sub_y)``: the fitted model and
        its predictions for ``test_x``, ``hold_out`` and ``sub_test``.
    """
    threshold = 40
    kept_rows = train_1_x[train_1_x[:, -1] <= threshold]
    features, target = kept_rows[:, :-1], kept_rows[:, -1]

    # ``etr`` is defined elsewhere in the file (presumably scikit-learn's
    # ExtraTreesRegressor); n_jobs=-1 is forwarded unchanged.
    est = etr(n_estimators=100, n_jobs=-1).fit(features, target)

    train_error = mt.mean_absolute_error(target, est.predict(features))
    report = 'XTR Train-1 Error: %r\n' % (train_error)
    print(report)
    fobj.write(report)

    return (est,
            est.predict(test_x),
            est.predict(hold_out),
            est.predict(sub_test))
def runXTR(train_1_x, test_x, hold_out, sub_test, fobj):
    """Fit the stage-1 XTR regressor and return it with its predictions.

    Only rows of ``train_1_x`` whose last column is <= 40 are used for
    fitting; that last column is the target, the preceding columns are the
    inputs.  The mean absolute error on those training rows is echoed to
    stdout and appended to ``fobj``.

    Args:
        train_1_x: 2-D array, target stored in the last column.
        test_x: Rows to predict for validation.
        hold_out: Rows to predict for the hold-out set.
        sub_test: Rows to predict for the submission set.
        fobj: Writable file-like log.

    Returns:
        ``(est, valid_y_pred, hold_y, sub_y)`` -- the fitted estimator and
        the predictions for ``test_x``, ``hold_out`` and ``sub_test``.
    """
    n_estimators = 100
    max_expected = 40

    within_limit = train_1_x[:, -1] <= max_expected
    constrained = train_1_x[within_limit]
    inputs = constrained[:, :-1]
    labels = constrained[:, -1]

    # ``etr`` comes from elsewhere in the file (presumably scikit-learn's
    # ExtraTreesRegressor -- confirm against the imports).
    model = etr(n_estimators=n_estimators, n_jobs=-1)
    est = model.fit(inputs, labels)

    fitted = est.predict(inputs)
    line = 'XTR Train-1 Error: %r\n' % (mt.mean_absolute_error(labels, fitted))
    print(line)
    fobj.write(line)

    outputs = [est.predict(rows) for rows in (test_x, hold_out, sub_test)]
    return est, outputs[0], outputs[1], outputs[2]
# Example #5
# 0
def driver(comp_mat,
           pps_mth='ORIGINAL',
           exp_thresh=[40],
           regression_mth=[],
           test_size=0.2,
           f_selection=False,
           n_features=3,
           reg_methods=[]):
    """Evaluate the XTR model per threshold and write a solution file.

    ``comp_mat`` is split into a training part and an unconstrained test
    part.  For every threshold in ``exp_thresh`` the training rows whose
    label (last column) is within the threshold are split again into a
    constrained train/test pair, a model is fitted through
    ``rf_regressor`` for each point of the hyper-parameter grid, and the
    errors are logged to the module-level ``fobj`` and printed.  The last
    fitted estimator then predicts the normalised test matrix read from
    disk and the result is written as a CSV solution.

    Args:
        comp_mat: 2-D array with the label in its last column.
        pps_mth: Not used by this function.
        exp_thresh: Iterable of label thresholds (only read, never mutated).
        regression_mth: Not used by this function.
        test_size: Fraction handed to both ``cv.train_test_split`` calls.
        f_selection: Forwarded to ``rf_regressor``.
        n_features: Forwarded to ``rf_regressor``.
        reg_methods: Not used by this function.

    Returns:
        None.  Output goes to ``fobj``, stdout and the solution files.
    """
    global fobj
    fobj.write('Total Number of records: %r\n' % (len(comp_mat)))

    # Step 1: split off the unconstrained test records.
    train_x, u_test_x, train_y, u_test_y = cv.train_test_split(
        comp_mat, comp_mat[:, -1], random_state=52, test_size=test_size)

    # Both halves still carry the label as last column; strip it from the
    # held-out features.
    u_test_x = u_test_x[:, :-1]

    # Hyper-parameter grid (a single point at the moment).
    trees_array = [100]
    max_depth = [None]
    min_samples_leaf = [1]

    # Raw strings spell the same Windows paths without relying on invalid
    # escape sequences such as '\U' or '\s'.
    test_matrix_path = (
        r'C:\Users\saura\Desktop\ML_Project\data\norm_test_fmat.csv')
    raw_solution_path = (
        r'C:\Users\saura\Desktop\ML_Project\ensemble_data'
        r'\mytest_solution.csv')
    final_solution_path = (
        r'C:\Users\saura\Desktop\ML_Project\ensemble_data'
        r'\final_solution.csv')

    for exp in exp_thresh:
        fobj.write('\n\nExpected Threshold Limit: %r\n' % (exp))
        # Step 2: keep only the training rows whose label is within the
        # expected threshold, then separate label and features.
        c_train_x = train_x[(train_x[:, -1] <= exp)]
        c_train_y = c_train_x[:, -1]
        c_train_x = c_train_x[:, :-1]

        # Step 3: split the constrained rows into train / constrained test.
        ttrain_x, ttest_x, ttrain_y, ttest_y = cv.train_test_split(
            c_train_x, c_train_y, random_state=32, test_size=test_size)

        fobj.write('Total Number of Constrained test records: %r\n' %
                   len(ttest_y))
        fobj.write('Total Number of Unconstrained test records: %r\n' %
                   len(u_test_y))
        fobj.write('Total Constrained Training Records: %r\n' % len(ttrain_y))

        print('Fitting Model Measurements')

        kf = cv.KFold(len(ttrain_x), n_folds=5)
        st_time = time.time()

        for dpth in max_depth:
            fobj.write('\n\nmax _depth: %r\n' % (dpth))
            print('max _depth: %r' % (dpth))
            for msl in min_samples_leaf:
                fobj.write('\nmin_samples_leaf: %r\n' % (msl))
                print('min_samples_leaf: %r' % (msl))
                for ntrees in trees_array:
                    valid_acc = []
                    test_acc = []
                    train_acc = []
                    unconst_acc = []
                    fold = 0

                    for train_idx, test_idx in kf:
                        fobj.write('\nfold: %r\n' % (fold))
                        # The grid value is a leaf size and is logged as
                        # such, so it is passed as min_samples_leaf (it
                        # used to be passed as min_samples_split).
                        rf_model = etr(n_estimators=ntrees,
                                       max_depth=dpth,
                                       min_samples_leaf=msl)
                        vacc, tacc, est = rf_regressor(rf_model,
                                                       ttrain_x[train_idx],
                                                       ttrain_y[train_idx],
                                                       ttrain_x[test_idx],
                                                       ttrain_y[test_idx],
                                                       f_selection=f_selection,
                                                       n_features=n_features)
                        valid_acc.append(vacc)
                        train_acc.append(tacc)
                        fobj.write('Train Size:%r\n' % (len(train_idx)))
                        fobj.write('Validation Size:%r\n' % (len(test_idx)))
                        fobj.write('Validation Error: %r\n' % (vacc))
                        fobj.write('Train Error: %r\n' % (tacc))
                        fobj.write('Constrained Test data size:%r\n' %
                                   (len(ttest_x)))
                        test_acc.append(
                            mt.mean_absolute_error(ttest_y,
                                                   est.predict(ttest_x)))
                        fobj.write('Constrained Test Error for fold: %r\n' %
                                   (test_acc[-1]))
                        unconst_acc.append(
                            mt.mean_absolute_error(u_test_y,
                                                   est.predict(u_test_x)))
                        fobj.write('Complete test accuracy:%r\n' %
                                   (unconst_acc[-1]))
                        fold += 1
                        # NOTE(review): only the first fold is evaluated,
                        # so the "averages" below cover a single fold.
                        # Remove this break to run all five folds.
                        break

                    et_time = time.time()
                    stats = [
                        '..Statistics..\n',
                        'Expected Threshold Limit: %r\n' % (exp),
                        'Trees:%r\n' % ntrees,
                        'Train Average: %r\n' % (np.mean(train_acc)),
                        'Validation Average: %r\n' % (np.mean(valid_acc)),
                        'Constrained Test Average: %r\n' %
                        (np.mean(test_acc)),
                        'Unconstrained Test Avg: %r\n' %
                        (np.mean(unconst_acc)),
                        'Total Time Taken: %r mins\n' %
                        ((et_time - st_time) / 60),
                    ]
                    # Same report to the log file and then to the console.
                    for line in stats:
                        fobj.write(line)
                    for line in stats:
                        print(line)

        # ``est`` is the estimator returned by the last rf_regressor call.
        print('Generating Solution Result:')
        test_x = np.loadtxt(test_matrix_path, delimiter=',')
        print('Test Size:%r' % len(test_x))
        test_y_pred = est.predict(test_x)
        # 1-based ids with the default integer dtype (``np.int`` is gone
        # from newer NumPy releases).
        id_col = np.arange(1, len(test_y_pred) + 1)
        all_data = np.column_stack((id_col, test_y_pred))
        # comments='' keeps savetxt from writing the header as
        # '# Id,Expected', which would name the first column '# Id'.
        np.savetxt(raw_solution_path,
                   all_data,
                   delimiter=',',
                   header='Id,Expected',
                   comments='')
        # Re-write through pandas so the values come out in plain CSV form.
        df = pd.read_csv(raw_solution_path)
        df.to_csv(final_solution_path, header=True, index=False)
def driver(comp_mat, pps_mth='ORIGINAL', exp_thresh=[25], regression_mth=[], test_size=0.2, f_selection=False, n_features=3, reg_methods=[], remove_mp=False):
    """Cross-validate the XTR model over a small grid and write a solution.

    ``comp_mat`` is split into a training part and an unconstrained test
    part.  Training rows whose label (last column) is at most 40 are split
    again into a constrained train/test pair, and for every combination of
    ``max_depth`` and ``min_samples_leaf`` a 5-fold cross-validation is run
    through ``rf_regressor``.  Errors are logged to the module-level
    ``fobj`` and printed.  The last fitted estimator finally predicts the
    normalised test matrix read from disk and the result is written as a
    CSV solution.

    Args:
        comp_mat: 2-D array with the label in its last column.
        pps_mth: Not used by this function.
        exp_thresh: Not used; the threshold is fixed at 40 below.
        regression_mth: Not used by this function.
        test_size: Fraction handed to both ``cv.train_test_split`` calls.
        f_selection: Forwarded to ``rf_regressor``.
        n_features: Forwarded to ``rf_regressor``.
        reg_methods: Not used by this function.
        remove_mp: When true, drop the second-to-last column of
            ``comp_mat`` (the Marshall-Palmer result, per the original
            comment) before any splitting.

    Returns:
        None.  Output goes to ``fobj``, stdout and the solution files.
    """
    global fobj
    fobj.write('Total Number of records: %r\n' % (len(comp_mat)))

    # Remove the Marshall-Palmer result column: cut the last two columns
    # and re-attach the label that was saved beforehand.
    if remove_mp:
        print('Before Removing label size:  %r' % (np.shape(comp_mat),))
        y = comp_mat[:, -1]
        comp_mat = comp_mat[:, :-2]
        print('Before Appending label size:  %r' % (np.shape(comp_mat),))

        comp_mat = np.column_stack((comp_mat, y))
        print('After appending label %r' % (np.shape(comp_mat),))

    # Step 1: split off the unconstrained test records.
    train_x, u_test_x, train_y, u_test_y = cv.train_test_split(
        comp_mat, comp_mat[:, -1], random_state=52, test_size=test_size)

    # Strip the label column from the held-out features.
    u_test_x = u_test_x[:, :-1]

    # NOTE(review): the threshold is hard-coded and ``exp_thresh`` is
    # ignored; the loop over it was disabled in the original code.
    exp = 40
    fobj.write('\n\nExpected Threshold Limit: %r\n' % (exp))
    # Step 2: keep only the training rows whose label is within the
    # expected threshold, then separate label and features.
    c_train_x = train_x[(train_x[:, -1] <= exp)]
    c_train_y = c_train_x[:, -1]
    c_train_x = c_train_x[:, :-1]

    # Step 3: split the constrained rows into train / constrained test.
    ttrain_x, ttest_x, ttrain_y, ttest_y = cv.train_test_split(
        c_train_x, c_train_y, random_state=32, test_size=test_size)

    fobj.write('Total Number of Constrained test records: %r\n' % len(ttest_y))
    fobj.write('Total Number of Unconstrained test records: %r\n' % len(u_test_y))

    fobj.write('Total Constrained Training Records: %r\n' % len(ttrain_y))

    print('Fitting Model Measurements')
    trees_array = [100]

    kf = cv.KFold(len(ttrain_x), n_folds=5)
    st_time = time.time()

    l_rate = [0.3]
    max_depth = [4, 6]
    min_samples_leaf = [3, 5, 9, 17]

    # Raw strings spell the same Windows paths without relying on invalid
    # escape sequences such as '\U' or '\s'.
    test_matrix_path = r'ensemble_data\norm_test_fmat.csv'
    raw_solution_path = (
        r'C:\Users\saura\Desktop\ML_Project\ensemble_data'
        r'\gbr_mytest_solution.csv')
    final_solution_path = (
        r'C:\Users\saura\Desktop\ML_Project\ensemble_data'
        r'\gbr_final_solution.csv')

    for rate in l_rate:
        # NOTE(review): the rate is only logged; it is not a parameter of
        # the model built below.
        fobj.write('\n\nLearning Rate: %r\n' % (rate))
        print('\n\nLearning Rate: %r\n' % (rate))
        # ``dpth`` and ``msl`` used to be referenced without ever being
        # bound (NameError on the first fold); iterate the grids that were
        # defined for them.
        for dpth in max_depth:
            fobj.write('\n\nmax _depth: %r\n' % (dpth))
            print('max _depth: %r' % (dpth))
            for msl in min_samples_leaf:
                fobj.write('\nmin_samples_leaf: %r\n' % (msl))
                print('min_samples_leaf: %r' % (msl))
                for ntrees in trees_array:
                    valid_acc = []
                    test_acc = []
                    train_acc = []
                    unconst_acc = []
                    fold = 0

                    for train_idx, test_idx in kf:
                        fobj.write('\nfold: %r\n' % (fold))
                        print('\nfold: %r\n' % (fold))
                        # The grid holds leaf sizes, so the value goes to
                        # min_samples_leaf rather than min_samples_split.
                        rf_model = etr(n_estimators=ntrees,
                                       max_depth=dpth,
                                       min_samples_leaf=msl)
                        vacc, tacc, est = rf_regressor(
                            rf_model,
                            ttrain_x[train_idx], ttrain_y[train_idx],
                            ttrain_x[test_idx], ttrain_y[test_idx],
                            f_selection=f_selection, n_features=n_features)
                        valid_acc.append(vacc)
                        train_acc.append(tacc)
                        fobj.write('Train Size:%r\n' % (len(train_idx)))
                        print('Train Size:%r\n' % (len(train_idx)))
                        fobj.write('Validation Size:%r\n' % (len(test_idx)))
                        fobj.write('Validation Error: %r\n' % (vacc))

                        print('Validation Error: %r\n' % (vacc))
                        fobj.write('Train Error: %r\n' % (tacc))
                        fobj.write('Constrained Test data size:%r\n' % (len(ttest_x)))
                        test_acc.append(
                            mt.mean_absolute_error(ttest_y, est.predict(ttest_x)))
                        fobj.write('Constrained Test Error for fold: %r\n' % (test_acc[-1]))
                        print('Constrained Test Error for fold: %r\n' % (test_acc[-1]))
                        y_res = est.predict(u_test_x)

                        unconst_acc.append(mt.mean_absolute_error(u_test_y, y_res))
                        fobj.write('Complete test accuracy:%r\n' % (unconst_acc[-1]))
                        print('Complete test accuracy:%r\n' % (unconst_acc[-1]))
                        fold += 1

                    et_time = time.time()
                    stats = [
                        '..Statistics..\n',
                        'Expected Threshold Limit: %r\n' % (exp),
                        'Trees:%r\n' % ntrees,
                        'Train Average: %r\n' % (np.mean(train_acc)),
                        'Validation Average: %r\n' % (np.mean(valid_acc)),
                        'Constrained Test Average: %r\n' % (np.mean(test_acc)),
                        'Unconstrained Test Avg: %r\n' % (np.mean(unconst_acc)),
                        'Total Time Taken: %r mins\n' % ((et_time - st_time) / 60),
                    ]
                    # Same report to the log file and then to the console.
                    for line in stats:
                        fobj.write(line)
                    for line in stats:
                        print(line)

    # ``est`` is the estimator returned by the last rf_regressor call.
    print('Generating Solution Result:')
    test_x = np.loadtxt(test_matrix_path, delimiter=',')
    print('Test Size:%r' % len(test_x))
    test_y_pred = est.predict(test_x)
    # 1-based ids with the default integer dtype (``np.int`` is gone from
    # newer NumPy releases).
    id_col = np.arange(1, len(test_y_pred) + 1)
    all_data = np.column_stack((id_col, test_y_pred))
    # comments='' keeps savetxt from writing the header as '# Id,Expected',
    # which would name the first column '# Id'.
    np.savetxt(raw_solution_path, all_data, delimiter=',',
               header='Id,Expected', comments='')
    # Re-write through pandas so the values come out in plain CSV form.
    df = pd.read_csv(raw_solution_path)
    df.to_csv(final_solution_path, header=True, index=False)
# Report the score of the preceding model's out-of-fold predictions.
print("CV score: {:<8.8f}".format(mean_squared_error(oof_gbr_263, target)))

# ExtraTreesRegressor: extremely randomized trees regression.
# 5-fold, shuffled cross-validation with a fixed seed.
folds = KFold(n_splits=5, shuffle=True, random_state=13)
# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows.
oof_etr_263 = np.zeros(train_shape)
predictions_etr_263 = np.zeros(len(X_test_263))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_263[trn_idx]
    tr_y = y_train[trn_idx]
    # A fresh model is built and fitted for every fold.
    etr_263 = etr(n_estimators=1000,
                  max_depth=8,
                  min_samples_leaf=12,
                  min_weight_fraction_leaf=0.0,
                  max_features=0.4,
                  verbose=1,
                  n_jobs=-1)
    etr_263.fit(tr_x, tr_y)
    # Each training row is predicted only by the model that did not see it.
    oof_etr_263[val_idx] = etr_263.predict(X_train_263[val_idx])

    # Accumulate the mean of the per-fold test predictions.
    predictions_etr_263 += etr_263.predict(X_test_263) / folds.n_splits

# NOTE(review): the score is computed against `target` while the folds are
# split on `y_train` -- presumably the same values; confirm.
print("CV score: {:<8.8f}".format(mean_squared_error(oof_etr_263, target)))

# Stack the five models' out-of-fold predictions and transpose the result
# (presumably giving one row per sample and one column per model -- this
# assumes each oof_* array is 1-D; confirm).
train_stack2 = np.vstack(
    [oof_lgb_263, oof_xgb_263, oof_gbr_263, oof_rfr_263,
     oof_etr_263]).transpose()
# transpose() permutes the array's axes, i.e. it swaps the positions of
# the indices.
test_stack2 = np.vstack([
# Example #8
# 0
    if (Env_var.get('Pca') == 1):
        pca = PCA(n_components = 300).fit(train_data_X)    
        train_data_X = pca.transform(train_data_X)
        test_data_X = pca.transform(test_data_X)
    
    ###############################--------Model Setup--------###############################
    ann_regressor = KerasRegressor(build_fn=ann_model, epochs=30, batch_size=10, verbose=1)
    
    xgb_regressor = xgb(learning_rate = 0.0825, min_child_weight = 1, max_depth = 7, subsample = 0.8, verbose = 10, random_state = 2017, n_jobs = -1, eval_metric = "rmse")
    
    rfr_regressor = rfr(max_features = 0.9, min_samples_leaf = 50)
    
    gbr_regressor = gbr(n_estimators = 200, verbose = 5, learning_rate = 0.08, max_depth = 7, max_features = 0.5, min_samples_leaf = 50, subsample = 0.8, random_state = 2017)
    
    etr_regressor = etr(n_estimators = 200, verbose = 10, max_depth = 7, min_samples_leaf = 100, max_features = 0.9, min_impurity_split = 100, random_state = 2017)
    
    lr_regressor = lr()
    
    svr_regressor = svr(verbose = 10)
    
    ensemble = Ensemble(n_folds = 5,stacker =  lr_regressor,base_models = [ann_regressor, xgb_regressor, rfr_regressor, gbr_regressor, etr_regressor])
    
    
    ###############################--------Grid Search--------###############################

  
    if (Env_var.get('GridSearch') == 1):
        
        if (Env_var.get('Model') == 'ann'):
            dropout_rate = [0.0, 0.001, 0.01]