Example #1
    def build_models(self):

        self.remove_columns(
            [
                "institute_latitude",
                "institute_longitude",
                "institute_state",
                "institute_country",
                "var10",
                "var11",
                "var12",
                "var13",
                "var14",
                "var15",
                "instructor_past_performance",
                "instructor_association_industry_expert",
                "secondary_area",
                "var24",
            ]
        )

        model1 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, subsample=0.8)
        model2 = RandomForestRegressor(n_estimators=50)
        model3 = ExtraTreesRegressor(n_estimators=50)

        model1.fit(self.X, self.y)
        model2.fit(self.X, self.y)
        model3.fit(self.X, self.y)

        return [model1, model2, model3]
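
The method above simply fits the three ensembles on self.X / self.y and returns them. Below is a standalone sketch of the same idea on synthetic data, with a purely assumed averaging step to show one way the returned list might be used (not part of the original):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import (ExtraTreesRegressor, GradientBoostingRegressor,
                              RandomForestRegressor)

# synthetic stand-in for self.X / self.y
X, y = make_regression(n_samples=300, n_features=10, noise=5.0, random_state=0)

models = [
    GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, subsample=0.8),
    RandomForestRegressor(n_estimators=50),
    ExtraTreesRegressor(n_estimators=50),
]
for m in models:
    m.fit(X, y)

# one possible use of the returned list: average the three models' predictions
blended = np.mean([m.predict(X) for m in models], axis=0)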
Example #2
def do_etrees(filename):
    df, Y = create_merged_dataset(filename)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5, random_state=SEED)
    X = df.drop(['driver', 'trip'], axis=1)
    etree.fit(X, Y)
    probs = etree.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200], 'trip': df['trip'][:200], 'probs': probs})
Example #3
    def fit(self, X, y, weights = None, **kwargs):
        if weights is None: weights = np.ones(y.shape[0])
        data = np.hstack((y.reshape(y.shape[0],1),X))
        
        S = wcov(data, weights)
        corr = wcorr(data, weights)
        wsd = np.sqrt(S.diagonal())
        
        ExtraTrees = ExtraTreesRegressor(**kwargs)
        ExtraTrees.fit(X,y, sample_weight=weights)
        
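        # weighted R^2 of y regressed on X, computed from the weighted covariance matrix S (column 0 of data is y)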
        Rsquare = ( S[0,1:].dot(np.linalg.inv(S[1:,1:]).dot(S[1:,0])) )/S[0,0]
        
        # assign proportion of Rsquare to each covariate dep. on importance
        self.importances = ExtraTrees.feature_importances_ * Rsquare 
        model = self.constrained_optimization( corr )
        
        if self.fit_intercept:
            w = np.diagflat( weights/np.sum(weights),k=0)
            wmean = np.sum(w.dot(data), axis=0)
            self.intercept_ = wmean[0] - wsd[0]*np.sum(wmean[1:]*model.x/wsd[1:])

        self.coef_ = wsd[0]*model.x/wsd[1:] 
        
        return self
Example #4
def cal_important_features(batch=10, threshold=1e-4):
  X_samples, Y_samples, scaler = dat.data_prepare('ocpm', 'lifetime_ecpm', outlier=0.05)
  tot_goot_atrs = {}
  for a in ATRS[5:]: tot_goot_atrs[a] = {}
  for i in np.arange(1,batch+1):
    Ts = timeit.default_timer()
    model = ExtraTreesRegressor(n_jobs=6)
    model.fit(X_samples, Y_samples)
    print "%i features in total." % len(model.feature_importances_)
    print "[Labels] %i categories, %i interests, %i client_names, %i auto_tags" % (num.categories_len, num.interests_len, num.client_names_len, num.auto_tags_len)
    good_atrs = show_important_features(model.feature_importances_, threshold)
    for a in reversed(ATRS[5:]):
      for b in good_atrs[a]:
        if b in tot_goot_atrs[a]:
          tot_goot_atrs[a][b] += 1
        else:
          tot_goot_atrs[a][b] = 1
    print "Batch %i finished in %.1f secs." % (i, (timeit.default_timer() - Ts))
    print "------------------------------------------------"
  # show performances
  for atr in reversed(ATRS[5:]):
    print "-------[%s]-----------------------" % atr
    for j in np.arange(1,batch+1):
      good_keys = [k for k,v in tot_goot_atrs[atr].items() if (v >= j)]
      print "%i keys occur >= %i times." % (len(good_keys), j)
  return tot_goot_atrs
Example #5
def predict_with_one(X, out_file_name):
    n_samples, n_features = X.shape
    iter_num = 3
    div = ShuffleSplit(n_samples, n_iter=iter_num, test_size=0.2, random_state=0)
    model = ExtraTreesRegressor(n_estimators=5)
    score_matrix = np.zeros((n_features, n_features))
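    # score_matrix[i, j] accumulates, across splits, the MAE of predicting feature j from feature i alone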

    t = time()
    round_num = 0
    for train, test in div:
        round_num += 1
        train_samples = X[np.array(train)]
        test_samples = X[np.array(test)]
        for i in range(n_features):
            for j in range(n_features):
                X_train = train_samples[:, i:i+1]
                X_test = test_samples[:, i:i+1]
                y_train = train_samples[:, j]
                y_test = test_samples[:, j]
        # for i in range(len(fl)):
        #     for j in range(len(fl)):
        #         if fl[j][1]-fl[j][0] != 1:
        #             continue
        #         X_train = train_samples[:, fl[i][0]:fl[i][1]]
        #         X_test = test_samples[:, fl[i][0]:fl[i][1]]
        #         y_train = train_samples[:, fl[j][0]]
        #         y_test = test_samples[:, fl[j][0]]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                score_matrix[i, j] += mae
                print('Round', round_num, '|', i, j, mae, time()-t)
    np.savetxt(os.path.join(CODE_PATH, out_file_name),
               score_matrix/iter_num, fmt='%.3f', delimiter=',')
Example #6
File: t_dtree.py, Project: Catentropy/mylab
def mul_dtree(X, Y2):
    forest = ExtraTreesRegressor(n_estimators=5,
                             compute_importances=True,
                             random_state=0)
    forest.fit(X[:200], Y2[:200])
    forest.predict(X[200:])
    print Y2[200:]
Example #7
File: estimator.py, Project: DJRumble/S2DS
 def fit(self, X, y, **kwargs):
     for key, value in kwargs.iteritems():
         if key in self.INITPARAMS.keys():
             self.INITPARAMS[key] = value
     model = ExtraTreesRegressor(**self.INITPARAMS)
     model.fit(X, y)
     self.model = model
Example #8
 def classify(self):
     """Perform classification"""
     clf = ETRegressor(n_estimators=500, min_samples_split=5, min_samples_leaf=2)
     #pca = PCA(n_components = 400)
     #self._ClassifyDriver__traindata = pca.fit_transform(self._ClassifyDriver__traindata)
     #self._ClassifyDriver__testdata = pca.transform(self._ClassifyDriver__testdata)
     #print self._ClassifyDriver__traindata.shape
     clf.fit(self._ClassifyDriver__traindata, self._ClassifyDriver__trainlabels)
     self._ClassifyDriver__y = clf.predict(self._ClassifyDriver__testdata)
Example #9
def build_extra_tree_regressor(X_test, X_train_full, y_train_full):


    print "Building ExtraTrees regressor..."
    etr = ExtraTreesRegressor(n_estimators=500)
    etr.fit(X_train_full, y_train_full)
    etr_predict = etr.predict(X_test)

    return etr_predict
def reg_skl_etr(param, data):
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                              max_features=param['max_features'],
                              n_jobs=param['n_jobs'],
                              random_state=param['random_state'])
    etr.fit(X_tr, y_reg_tr)
    pred = etr.predict(X_cv)
    RMSEScore = getscoreRMSE(y_reg_cv, pred)
    return RMSEScore, pred
Example #11
def extra_trees_regressor(x, y, n_estimators, max_depth):
    kf = KFold(len(x), n_folds=3)
    scores = []
    for train_index, test_index in kf:
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
        clf.fit(X_train, y_train)
        scores.append(mean_squared_error(clf.predict(X_test), y_test) ** 0.5)
    return np.mean(scores)
Example #12
class MyExtraTreeReg(MyRegressor):
    def __init__(self, params=dict()):
        self._params = params
        self._extree = ExtraTreesRegressor(**(self._params))

    def update_params(self, updates):
        self._params.update(updates)
        self._extree = ExtraTreesRegressor(**(self._params))

    def fit(self, Xtrain, ytrain):
        self._extree.fit(Xtrain, ytrain)

    def predict(self, Xtest, option=None):
        return self._extree.predict(Xtest)

    def plt_feature_importance(self, fname_list, f_range = list()):
        importances = self._extree.feature_importances_

        std = np.std([tree.feature_importances_ for tree in self._extree.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        fname_array = np.array(fname_list)

        if not f_range:
            f_range = range(indices.shape[0])

        n_f = len(f_range)

        plt.figure()
        plt.title("Extra Tree Feature importances")
        plt.barh(range(n_f), importances[indices[f_range]],
               color="b", xerr=std[indices[f_range]], ecolor='k',align="center")
        plt.yticks(range(n_f), fname_array[indices[f_range]])
        plt.ylim([-1, n_f])
        plt.show()


    def list_feature_importance(self, fname_list, f_range = list(), return_list = False):
        importances = self._extree.feature_importances_
        indices = np.argsort(importances)[::-1]

        print 'Extra tree feature ranking:'

        if not f_range :
            f_range = range(indices.shape[0])

        n_f = len(f_range)

        for i in range(n_f):
            f = f_range[i]
            print '{0:d}. feature[{1:d}]  {2:s}  ({3:f})'.format(f + 1, indices[f], fname_list[indices[f]], importances[indices[f]])

        if return_list:
            return [indices[f_range[i]] for i in range(n_f)]
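
A brief usage sketch for MyExtraTreeReg above (the synthetic data, parameter values, and feature names are assumptions; the MyRegressor base class is not shown in the original):

from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=6, random_state=0)
reg = MyExtraTreeReg(params={'n_estimators': 100, 'random_state': 0})
reg.fit(X, y)
top_features = reg.list_feature_importance(['f%d' % i for i in range(6)], return_list=True)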
Example #13
def algorithm_ExtraTrees(X_train,Y_train,X_validation,Y_validation, seed=7):


    # Train the model
    scaler = StandardScaler().fit(X_train)
    rescaledX = scaler.transform(X_train)
    gbr = ExtraTreesRegressor(n_estimators=80)
    gbr.fit(X=rescaledX, y=Y_train)
    # Evaluate the model on the validation set
    rescaledX_validation = scaler.transform(X_validation)
    predictions = gbr.predict(rescaledX_validation)
    print(mean_squared_error(Y_validation, predictions))
def estimate():
    from loadData import loadSets
    from helper import splitDataset, separateTargetFromTrain
    from sklearn.ensemble import ExtraTreesRegressor
    import numpy as np
    import math

    best_rmsle = 2
    best_i = 0
    
    trainingSet, testingSet = loadSets()
    testingSet = None

    trainingData, testingData = splitDataset(trainingSet, 0.6)
    testingData, validationData = splitDataset(testingData, 0.5)
    trainingSet = None
    
    trainingTarget, trainingFeatures = separateTargetFromTrain(trainingData)
    testingTarget, testingFeatures = separateTargetFromTrain(testingData)
    validationTarget, validationFeatures = separateTargetFromTrain(validationData)

    testingTarget = testingTarget.values
    validationTarget = validationTarget.values
    
    trainingData = None
    testingData = None
    validationData = None    
    
    for i in range(2000, 3001, 1000):
        model = ExtraTreesRegressor(n_estimators = i, n_jobs = -1)
        model.fit(trainingFeatures, trainingTarget)
        
        predictions = model.predict(testingFeatures)
                
        cost = pow(np.log(predictions + 1) - np.log(testingTarget + 1), 2)
        rmsle = math.sqrt(np.mean(cost))
        print i, " estimators: ", rmsle
        
        if rmsle < best_rmsle:
            best_rmsle = rmsle
            best_i = i
            
    print "Best: ", best_i, " estimators with rmsle: ", best_rmsle
    
    model = ExtraTreesRegressor(n_estimators = best_i, n_jobs = -1)
    model.fit(trainingFeatures, trainingTarget)
    predictions = model.predict(validationFeatures)
            
    cost = pow(np.log(predictions + 1) - np.log(validationTarget + 1), 2)
    rmsle = math.sqrt(np.mean(cost))
    
    print "Final model cost: ", rmsle
Example #15
def dummie_columns_extra_trees(train, test):
    from sklearn.ensemble import ExtraTreesRegressor
    print "-- {} --".format("Extremely Randomized Trees Regression using all but remarks")
    predicting_columns = list(train._get_numeric_data().columns.values)
    predicting_columns.remove("LISTPRICE")
    predicting_columns.remove("SOLDPRICE")
    rf = ExtraTreesRegressor(
        n_estimators=300, n_jobs=-1)
    rf.fit(train[predicting_columns], train["SOLDPRICE"])
    score = rf.score(test[predicting_columns], test["SOLDPRICE"])
    predictions = rf.predict(test[predicting_columns])
    sample_predictions(test, predictions)
    print "Accuracy: {}\n".format(score)
    return score, predictions
def main():
    # X,Y = make_top_dataset(100000,30)
    X, Y = make_friedman1_random_attr(n_samples=100000, n_features=10)
    tX, tY = make_friedman1_random_attr(n_samples=100, n_features=10)

    start_time = time.time()

    ext = ETRs(max_features=None, n_estimators=100, min_samples_split=1, n_jobs=-1)
    # ext = RFR(max_features=None, n_estimators=100, min_samples_split=1, n_jobs=-1)
    ext.fit(X, Y)

    elapsed_time = time.time() - start_time
    print elapsed_time

    print score(ext, tX, tY)
Example #17
def simple_extremely_random_trees(data_train_x, data_test_x, data_train_y, data_test_y):
    from sklearn.ensemble import ExtraTreesRegressor
    print "-- {} --".format("Extremely Randomized Trees Regression using all but remarks")
    rf = ExtraTreesRegressor(
        n_estimators=300,
        n_jobs=-1
    )
    rf.fit(data_train_x, data_train_y)
    sample_predictions(rf.predict(data_test_x), data_test_y)
    score = rf.score(data_test_x, data_test_y)
    cross_validated_scores = cross_val_score(
        rf, data_test_x, data_test_y, cv=5)
    print "R^2 score: {}".format(score)
    print "R^2 across 5 folds: {}".format(cross_validated_scores)
    print "95%% Confidence Interval: %0.3f (+/- %0.3f)\n" % (cross_validated_scores.mean(), cross_validated_scores.std() * 1.96)
Example #18
def main():
    for ind in range(1, 15+1):
    #for ind in [3,4,5,7,9,11,12,13,14,15]: # no 1,2,6,8,10
        print "TrainingSet/ACT%d_competition_training.csv" % ind
        #read in  data, parse into training and target sets
        cols, train = read_data("../TrainingSet/ACT%d_competition_training.csv" % ind)
        target = np.array( [x[0] for x in train] )

        train = filter_cols(train, cols, "../selected/selected_%d.txt" % ind)
        #print("Train: ", len(train), " cols:", len(train[0]))
        train = np.array( train )

        #In this case we'll use a random forest, but this could be any classifier
        cfr = ExtraTreesRegressor(n_estimators=1000, max_features=(len(train[0])//3), n_jobs=8, random_state=1279)

        #Simple K-Fold cross validation. 10 folds.
        cv = cross_validation.KFold(len(train), k=10, indices=False, shuffle=True)

        #iterate through the training and test cross validation segments and
        #run the classifier on each one, aggregating the results into a list
        results = []
        for traincv, testcv in cv:
            ft = cfr.fit(train[traincv], target[traincv])
            score = ft.score(train[testcv], target[testcv])
            results.append(score)
            print "\tFold %d: %f" % (len(results), score)

        #print out the mean of the cross-validated results
        print "Results: " + str( np.array(results).mean() )
Example #19
def main():
    for ind in range(1, 15+1):
        print "TrainingSet/ACT%d_competition_training.csv" % ind
        #read in  data, parse into training and target sets
        cols, molecules1, train = read_data("../TrainingSet/ACT%d_competition_training.csv" % ind)
        target = np.array( [x[0] for x in train] )

        #load train
        train = filter_cols(train, cols, "../selected/cor9/selected_%d.txt" % ind)
        train = np.array(train)
        #print("Train: ", len(train), " cols:", len(train[0]))

        # seeds used: orig=1279, cor8=1278, cor9=1277
        cfr = ExtraTreesRegressor(n_estimators=2000, max_features=(len(train[0])//3), n_jobs=8, random_state=1277)
                                  #min_samples_leaf=2, min_samples_split=2, random_state=1279)
        rf = cfr.fit(train, target)

        #predict train
        pred = rf.predict(train)
        write_file("erStacking/cor9/er_stacking_%d.csv" % ind, molecules1, pred)

        #load test
        cols, molecules2, test = read_data("../TestSet/ACT%d_competition_test.csv" % ind)
        test = filter_cols(test, cols, "../selected/cor9/selected_%d.txt" % ind)
        test = np.array(test)

        #predict test
        pred = rf.predict(test)
        write_file("erStacking/test/cor9/er_submission_%d.csv" % ind, molecules2, pred)
Example #20
def run():
    cycles = load_and_munge_training_data('train.csv')
    inputs = ['holiday', 'workingday', 'temp', 'atemp',
              'humidity', 'windspeed', 'month', 'hour']

    x_train, x_test, y_train, y_test = train_test_split(cycles[inputs],
                                                        cycles['count'],
                                                        test_size=0.25)
    scaler_x = StandardScaler().fit(x_train)
    scaler_y = StandardScaler().fit(y_train)
    x_train  = scaler_x.transform(x_train)
    y_train  = scaler_y.transform(y_train)
    x_test   = scaler_x.transform(x_test)
    y_test   = scaler_y.transform(y_test)

    techniques = {}

    clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None)
    clf_sgd.fit(x_train, y_train)
    techniques['Linear - no penalty'] = evaluate(clf_sgd, x_train, y_train)

    clf_sgd1 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2')
    clf_sgd1.fit(x_train, y_train)
    techniques['Linear - squared sums of the coefficients penalisation'] = \
        evaluate(clf_sgd1, x_train, y_train)

    clf_svr = svm.SVR(kernel='linear')
    clf_svr.fit(x_train, y_train)
    techniques['SVR - linear'] = evaluate(clf_svr, x_train, y_train)

    clf_svr_poly = svm.SVR(kernel='poly')
    clf_svr_poly.fit(x_train, y_train)
    techniques['SVR - poly'] = evaluate(clf_svr_poly, x_train, y_train)

    clf_svr_rbf = svm.SVR(kernel='rbf')
    clf_svr_rbf.fit(x_train, y_train)
    techniques['SVR - RBF'] = evaluate(clf_svr_rbf, x_train, y_train)

    clf_et = ExtraTreesRegressor(n_estimators=10, compute_importances=True)
    clf_et.fit(x_train, y_train)
    techniques['Extra trees'] = evaluate(clf_et, x_train, y_train)

    clf_lr = LinearRegression()
    clf_lr.fit(x_train, y_train)
    techniques['Linear regression'] = evaluate(clf_lr, x_train, y_train)

    return sorted(techniques.iteritems(), key=operator.itemgetter(1))
Example #21
def predict_for(output, cycles, tests, raw_tests, inputs):
    x_train, x_test, y_train, y_test = train_test_split(cycles[inputs],
                                                        cycles[output],
                                                        test_size=0.25,
                                                        random_state=33)
    scaler_x  = StandardScaler().fit(x_train)
    scaler_t  = StandardScaler().fit(tests)
    x_train   = scaler_x.transform(x_train)
    x_test    = scaler_x.transform(x_test)
    tests     = scaler_t.transform(tests)

    clf_et = ExtraTreesRegressor(n_estimators=10,
                                 compute_importances=True, random_state=42)
    clf_et.fit(x_train, y_train)

    ps = clf_et.predict(tests)
    return {dt: int(round(p)) for dt, p in zip(raw_tests['datetime'], ps)}
def buildModelOheETR(train_data, eval_data, train_labels, seed):
    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)
    clf = ExtraTreesRegressor(n_estimators=500, max_depth=38, min_samples_leaf=2,min_samples_split=6,\
        max_features='auto', n_jobs=-1, random_state=seed, verbose=1)
    clf.fit(train_data, train_labels)
    preds = clf.predict(eval_data)
    preds = np.expm1(preds)

    # transform -ve preds to 0
    for i in range(preds.shape[0]):
        if preds[i] < 0:
            preds[i] = 0
            
    # convert back to log1p
    preds = np.log1p(preds)
            
    return (clf, preds)
Example #23
def get_forest(X_names=Xs, y_names=ys, num_trees=256, data=data):
    forest = ExtraTreesRegressor(
        n_estimators=num_trees, n_jobs=62, bootstrap=True)
    X = data.loc[:, [i for i in X_names]]
    y = data.loc[:, [i for i in y_names]]
    start = time()
    rfr = forest.fit(X, y)
    end = time()
    return(rfr, end-start)
    def fit(self, X, Y):
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        self.n_estimators = int(self.n_estimators)
        self.min_samples_leaf = int(self.min_samples_leaf)
        self.min_samples_split = int(self.min_samples_split)
        self.max_features = float(self.max_features)
        self.bootstrap = check_for_bool(self.bootstrap)
        self.n_jobs = int(self.n_jobs)
        self.verbose = int(self.verbose)

        if check_none(self.max_leaf_nodes):
            self.max_leaf_nodes = None
        else:
            self.max_leaf_nodes = int(self.max_leaf_nodes)

        if check_none(self.max_depth):
            self.max_depth = None
        else:
            self.max_depth = int(self.max_depth)

        self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

        num_features = X.shape[1]
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        estimator = ExtraTreesRegressor(
            n_estimators=self.n_estimators, criterion=self.criterion,
            max_depth=self.max_depth, min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
            max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            random_state=self.random_state)

        estimator.fit(X, Y)
        self.preprocessor = SelectFromModel(estimator=estimator,
                                            threshold='mean',
                                            prefit=True)

        return self
    def fit(self, X, Y):
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        num_features = X.shape[1]
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        preprocessor = ExtraTreesRegressor(
            n_estimators=self.n_estimators, criterion=self.criterion,
            max_depth=self.max_depth, min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
            max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
            random_state=self.random_state)
        preprocessor.fit(X, Y)
        self.preprocessor = SelectFromModel(preprocessor, prefit=True)

        return self
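
Both fit() methods above implement the same idea: rank features with an ExtraTreesRegressor and keep the important ones via SelectFromModel. A self-contained sketch of that technique (the synthetic data and parameter values are illustrative assumptions):

from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

X, y = make_regression(n_samples=500, n_features=30, n_informative=5, random_state=0)
est = ExtraTreesRegressor(n_estimators=100, random_state=0)
est.fit(X, y)

# keep only the features whose importance is above the mean importance
selector = SelectFromModel(est, threshold='mean', prefit=True)
X_reduced = selector.transform(X)
print(X_reduced.shape)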
Example #26
    def predict(class_id):
        print "predicting: ", class_id
        salaries_idx = np.where(salaries_enc == class_id)
        valid_idx = np.where(valid_salaries_enc == class_id)

        if len(salaries_idx[0]) == 0 or len(valid_idx[0]) == 0:
            return [], None

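        # a separate ExtraTrees model is fit on only the rows whose salary category matches class_id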
        classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                        verbose=0,
                                        n_jobs=4, # 2 jobs on submission / 4 on valid test
                                        oob_score=False,
                                        min_samples_split=min_samples_split,
                                        random_state=3465343)

        print features[salaries_idx[0], :].shape
        print salaries[salaries_idx].shape
        classifier.fit(features[salaries_idx[0], :], salaries[salaries_idx])
        predictions_part = classifier.predict(validation_features[valid_idx[0]])
        return predictions_part, valid_idx
Example #27
def get_result():
    ngram_range = (1, 2)
    max_df = 0.75
    max_features = 2000
    v = CountVectorizer(
        ngram_range=ngram_range,
        max_df=max_df,
        max_features=max_features)
    x = v.fit_transform(rats_tr.comments.fillna('')).todense()
    y = rats_tr.quality
    n_estimators = 40
    max_depth = 20
    clf = ExtraTreesRegressor(n_estimators=n_estimators,
                              max_depth=max_depth,
                              random_state=0)
    clf.fit(x, y)

    t_x = v.transform(rats_te.comments.fillna('')).todense()
    t_y = clf.predict(t_x)
    submit = pd.DataFrame(data={'id': rats_te.id, 'quality': t_y})
    submit.to_csv('ridge_submit.csv', index=False)
class ModelERT:

    def __init__(self, model_set_name, i_fold):
        self.model_set_name = model_set_name
        self.i_fold = i_fold

    def set_params(self, prms):
        self.prms = prms

    def set_data(self, labels_tr, labels_te, data_tr, data_te):
        self.labels_tr = labels_tr
        self.labels_te = labels_te
        self.data_tr = data_tr
        self.data_te = data_te

    def train(self):
        print "start ert"
        self.model = ExtraTreesRegressor(n_jobs=self.prms["n_jobs"],
                                         verbose=1,
                                         random_state=self.prms["random_state"],
                                         n_estimators=int(self.prms["n_estimators"]),
                                         max_features=self.prms["max_features"])
        self.model.fit(self.data_tr.values, self.labels_tr)

    def predict(self):
        return self.model.predict(self.data_te.values)

    def predict_train(self):
        return self.model.predict(self.data_tr.values)

    def dump_model(self):
        pass

    def dump_pred(self, pred, name):
        folder = config.get_model_folder(self.model_set_name, self.i_fold)
        Files.mkdir(folder)
        path = config.get_model_path(self.model_set_name, name, self.i_fold)
        joblib.dump(pred, path)
    def predict(class_id, param):
        print "predicting: ", class_id
        param += "\npredicting: %s\n" % (le_features[col_index].classes_[class_id],)
        salaries_idx = np.where(feature_category == class_id)
        valid_idx = np.where(validation_features_category == class_id)
        param += "Salaries len: %d, valid len: %d\n" % (len(salaries_idx[0]), len(valid_idx[0]))

        if len(salaries_idx[0]) == 0 or len(valid_idx[0]) == 0:
            return [], None, param

        classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                        verbose=0,
                                        n_jobs=4, # 2 jobs on submission / 4 on valid test
                                        oob_score=False,
                                        min_samples_split=min_samples_split,
                                        random_state=3465343)

        print features[salaries_idx[0], :].shape
        print salaries[salaries_idx].shape
        print validation_features[0].shape
        classifier.fit(features[salaries_idx[0], :], salaries[salaries_idx])
        predictions_part = classifier.predict(validation_features[valid_idx[0]])
        return predictions_part, valid_idx, param
Example #30
File: views.py, Project: vinay1a/busproject
def load_model():  # make it load once when the service starts; called only once
    # load the model
    f = open('bpinall.txt', 'r').readlines()
    num_rows = len(f)
    num_col = len(f[0].split(','))
    x = np.zeros((num_rows, num_col), dtype=float)
    y = np.zeros((num_rows), dtype=float)
    for i, line in enumerate(f):
        line = line.strip('\r\n').strip()
        if line.count(',') > 0:
            x[i] = [float(p) for p in line.split(',')]
    f2 = open('bpoutall.txt', 'r').readlines()
    for i, line in enumerate(f2):
        line = line.strip('\r\n')
        y[i] = float(line)
    clf = ExtraTreesRegressor(verbose=0)
    print(x)
    clf.fit(x[:-1], y[:-1])
    pq = clf.predict(x[-1:])  # keep 2D shape for the single held-out sample
    print(pq, y[-1])
    # global clfp
    pickle.dump(clf, open('modelb.pkl', 'wb'))
    return pq
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=0)

#Creating the model using randomforest
from sklearn.ensemble import RandomForestRegressor
reg_rfr = RandomForestRegressor(max_depth=19)
reg_rfr.fit(X_train, y_train)
y_pred1 = reg_rfr.predict(X_test)
S2 = reg_rfr.score(X_train, y_train)

from sklearn.ensemble import ExtraTreesRegressor
reg_etr = ExtraTreesRegressor(max_depth=20)
reg_etr.fit(X_train, y_train)
y_pred2 = reg_etr.predict(X_test)
S1 = reg_etr.score(X_train, y_train)

from sklearn.svm import SVR
reg_svr = SVR()
reg_svr.fit(X_train, y_train)
y_pred3 = reg_svr.predict(X_test)
S = reg_svr.score(X_train, y_train)

from sklearn.grid_search import GridSearchCV
parameters = [{'max_depth': np.arange(1, 21)}]
CV = GridSearchCV(estimator=reg_etr, param_grid=parameters, cv=10)
CV.fit(X_train, y_train)
CV_score = CV.score(X_train, y_train)
best_score = CV.best_score_
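
A possible follow-up, not part of the original snippet: after the grid search, the refit best estimator can be evaluated on the held-out split.

best_etr = CV.best_estimator_          # ExtraTreesRegressor refit with the best max_depth found
test_score = best_etr.score(X_test, y_test)
y_pred_best = best_etr.predict(X_test)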
Example #32
    'Total_Stops', 'Journey_day', 'Journey_month', 'Dep_hour', 'Dep_min',
    'Arrival_hour', 'Arrival_min', 'Duration_hours', 'Duration_mins'
]
x3 = x2.loc[:, a]
x3 = pd.concat([x3, y2], axis=1)

# Finds correlation between Independent and dependent attributes

plt.figure(figsize=(18, 18))
sns.heatmap(x3.corr(), annot=True, cmap="RdYlGn")

plt.show()

from sklearn.ensemble import ExtraTreesRegressor
selection = ExtraTreesRegressor(random_state=0)
selection.fit(x2, y2)
##############
print(selection.feature_importances_)
########
plt.figure(figsize=(12, 8))
feat_importances = pd.Series(selection.feature_importances_, index=x2.columns)
feat_importances.nlargest(20).plot(kind='barh')
feat_importances.nlargest(20).index

plt.show()

feature = [
    'Total_Stops', 'Journey_day', 'Journey_month', 'Dep_hour', 'Dep_min',
    'Arrival_hour', 'Arrival_min', 'Duration_hours', 'Duration_mins',
    'Airline_Air India', 'Airline_IndiGo', 'Airline_Jet Airways',
    'Airline_Jet Airways Business', 'Airline_Multiple carriers',
Example #33
final_dataset.drop(['Year'], axis=1, inplace=True)
final_dataset.drop(['Current_Year'], axis=1, inplace=True)
final_dataset = pd.get_dummies(final_dataset, drop_first=True)
#print(final_dataset.head(10))

corrmat = final_dataset.corr()
top_corr_fetures = corrmat.index
#plt.figure(figsize=(20,20))
g = snb.heatmap(final_dataset[top_corr_fetures].corr(), annot=True, cmap="RdYlGn")
#plt.show()

X = final_dataset.iloc[:,1:]
Y = final_dataset.iloc[:,0]

model = ExtraTreesRegressor()
model.fit(X,Y)

#print(model.feature_importances_)
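# candidate hyperparameter ranges, presumably assembled for a randomized search over a tree ensemble (the search call itself is not part of this snippet)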
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf}

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)
Example #34
                  index=False)

preds_RF_py = np.exp(clf_RF.predict(pte[feature_names])) - 1
RF_py_sub = pd.DataFrame({'Id': ID.Id, 'Sales': preds_RF_py})
RF_py_sub.to_csv("F:/Kaggle/Rossman/Blends/Stacking/RF_subs.csv", index=False)

# Extremely Randomized Trees #
reg_ET = ExtraTreesRegressor(n_estimators=1000,
                             max_features=0.75,
                             max_depth=8,
                             min_samples_split=12,
                             n_jobs=-1,
                             random_state=737,
                             verbose=2)

reg_ET = reg_ET.fit(x_train, y_train)

preds_h = reg_ET.predict(pth[feature_names])
ET_holdout = pd.DataFrame({
    'Date': pth.Date,
    'Dow': pth.DayOfWeek,
    'Actual': np.exp(pth.Sales) - 1,
    'Predicted': np.exp(preds_h) - 1
})
ET_holdout.to_csv("F:/Kaggle/Rossman/Blends/Stacking/ET_holdout.csv",
                  index=False)

preds_ET = np.exp(reg_ET.predict(pte[feature_names])) - 1
ET_sub = pd.DataFrame({'Id': ID.Id, 'Sales': preds_ET})
ET_sub.to_csv("F:/Kaggle/Rossman/Blends/Stacking/ET_subs.csv", index=False)
Example #35
ET = ExtraTreesRegressor(n_estimators=1200,
                         random_state=1,
                         n_jobs=-1,
                         min_samples_split=2,
                         min_samples_leaf=2,
                         max_depth=20,
                         max_features='sqrt',
                         bootstrap=0)

#rfe = RFE(estimator=ET,n_features_to_select=180,step=5).fit(train_x.values, train_y.icol(0).values)
#train_x = rfe.transform(train_x.values)
#test_x = rfe.transform(test_x.values)

#sfm = SelectFromModel(estimator=ET,threshold='median').fit(train_x.values, train_y.icol(1).values)
#train_x = sfm.transform(train_x.values)
#test_x = sfm.transform(test_x.values)

#ET.fit(train_x,train_y)
#pre = (ET.predict(test_x)).round()

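# fit one ExtraTrees model per target column, round its predictions, and score each column separately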
pre = DataFrame()
for i in range(7):
    ET.fit(train_x, list(train_y.icol(i).values))
    pre['col_' + str(i)] = (ET.predict(test_x)).round()
    tmp_score = calculate_score(pre.icol(i).values, test_y.icol(i).values)
    print str(i) + ': ', tmp_score

score = calculate_score(pre.values, test_y.values)
print score

#draw_feature_importance(train_x,ET)
Example #36
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=True)

print salaries.shape
#a=5/0
for n_trees in [40]:
    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl2_new_log" % (min_samples_split, n_trees)
    print name
    classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                    verbose=2,
                                    n_jobs=2, # 2 jobs on submission / 4 on valid test
                                    oob_score=False,
                                    min_samples_split=min_samples_split,
                                    random_state=3465343)

    classifier.fit(features, salaries)
    predictions = classifier.predict(validation_features)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        #dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(classifier, name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
#oob_predictions = classifier.oob_prediction_
#mae_oob = mean_absolute_error(salaries, oob_predictions)
#print "MAE OOB: ", mae_oob
        classifier1 = ExtraTreesRegressor(n_estimators=n_trees,
    def feature_importance(self, xg_boost=True, extra_trees=False):
        """
        function that displays feature importance using XG-Boost and Extra Trees
        Note: This function performs analysis using X and y
        * xg_boost=True, extra_trees=False: will perform feature importance using XG Boost only
        * xg_boost=False, extra_trees=True: will perform feature importance using Extra Trees only
        * xg_boost=True, extra_trees=True: will perform feature importance using both XG Boost and Extra Trees
        * xg_boost=False, extra_trees=False: Nothing will happen. Avoid this if you want to use feature selection.
        """
        output_folder = self.output_folder
        feature_names = self.feature_names

        X = self.X_df
        y = self.y_df

        if xg_boost:
            print('\n********** Method 4: Calculating the feature importance using XGBoost. **********\n')
            ''' feature importance using XGBoost '''
            feature_names = feature_names
            housing_dmatrix = xgb.DMatrix(X, y, feature_names=feature_names)
            # Create the parameter dictionary: params
            params = {"objective": "reg:squarederror", "max_depth": "4"}
            # Train the model: xg_reg
            xg_reg = xgb.train(dtrain=housing_dmatrix, params=params, num_boost_round=10)

            feature_imp = dict(
                sorted(xg_reg.get_score(importance_type='weight').items(), key=lambda kv: kv[1], reverse=True))
            print('\nFeatures - Importance\n')
            for key, value in feature_imp.items():
                print('%s: %.5f' % (key, value))
            print('\n')

            # Plot the feature importances
            xgb.plot_importance(xg_reg)

            if not os.path.exists(output_folder):
                os.makedirs(output_folder)
            fig = plt.gcf()
            fig.set_size_inches(15, 10.5)
            plt.title('XGBoost Feature Importance')
            fig.savefig(output_folder + 'xgb_fs', dpi=100)
            plt.close()
            print('saved plot in {}/{}'.format(output_folder, 'xgb_fs'))

        if extra_trees:
            print('\n********** Method 5: Calculating the feature importance using Extra Trees. **********\n')
            model = ExtraTreesRegressor(n_estimators=100, random_state=42)
            model.fit(X, y)
            feature_imp = {}
            for i in range(len(model.feature_importances_)):
                # print('%s: %.5f' % (columns[i], model.feature_importances_[i]))
                feature_imp[feature_names[i]] = model.feature_importances_[i]
            feature_imp = dict(sorted(feature_imp.items(), key=lambda kv: kv[1], reverse=True))
            print('\nFeatures - Importance\n')
            for key, value in feature_imp.items():
                print('%s: %.5f' % (key, value))
            print('\n')
            # print(model.feature_importances_)
            # use inbuilt class feature_importances of tree based classifiers
            # plot graph of feature importances for better visualization
            feat_importances = pd.Series(model.feature_importances_, index=X.columns)
            feat_importances.nlargest(20).plot(kind='barh')
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)
            fig = plt.gcf()
            fig.set_size_inches(15, 10.5)
            plt.title('Extra Trees Feature Importance')
            fig.savefig(output_folder + 'extratrees_fs.png', dpi=100)
            plt.close()
            print('saved plot in {}/{}'.format(output_folder, 'extratrees_fs.png'))
Example #38
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, y_train)
uni_y_predict = uni_knr.predict(X_test)
print("K-nearest neighbors (uniform average) performance:", uni_knr.score(X_test, y_test))

dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_y_predict = dis_knr.predict(X_test)
print("K-nearest neighbors (distance-weighted) performance:", dis_knr.score(X_test, y_test))

dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_y_predict = dtr.predict(X_test)
print("Single regression tree performance:", dtr.score(X_test, y_test))

rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr_y_predict = rfr.predict(X_test)
print("Random forest performance:", rfr.score(X_test, y_test))

etr = ExtraTreesRegressor()
etr.fit(X_train, y_train)
etr_y_predict = etr.predict(X_test)
print("Extremely randomized trees performance:", etr.score(X_test, y_test))

gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr_y_predict = gbr.predict(X_test)
print("Gradient boosting performance:", gbr.score(X_test, y_test))
Example #39

model = ExtraTreesRegressor(n_estimators=100, max_features=0.7, max_depth=10)

for i in folds_item_ids.keys():

    # Determine train and val folds
    fit_mask = X_train['item_id'].isin(folds_item_ids[i]['fit'])
    val_mask = X_train['item_id'].isin(folds_item_ids[i]['val'])
    X_fit = X_train[fit_mask].drop('item_id', axis='columns')
    y_fit = y_train[fit_mask]
    X_val = X_train[val_mask].drop('item_id', axis='columns')
    y_val = y_train[val_mask]

    # trick for ram saving
    model.fit(X_fit.astype(dtype='float32'), y_fit.astype(dtype='float32'))

    fit_predict = model.predict(X_fit)
    val_predict = model.predict(X_val)
    test_predict = model.predict(X_test)
    fit_scores.append(rmse(y_fit, fit_predict))
    val_scores.append(rmse(y_val, val_predict))
    sub['deal_probability'] *= test_predict

    # Save out-of-fold predictions
    name = 'folds/extra_tree_val_{}.csv'.format(i)
    pd.Series(val_predict).to_csv(name, index=False)
    # Save test predictions
    name = 'folds/extra_tree_test_{}.csv'.format(i)
    pd.Series(test_predict).to_csv(name, index=False)
Example #40
Bagging = BaggingRegressor()
Bagging.fit(combined_train, Y_train)
Bagging_predict_train = Bagging.predict(combined_train)
Bagging_predict_test = Bagging.predict(combined_test)
print("Root mean squared error for train: %.2f" %
      math.sqrt(mean_squared_error(Y_train, Bagging_predict_train)))
#Root mean squared error for train 369.99
print("Root mean squared error for test: %.2f" %
      math.sqrt(mean_squared_error(Y_test, Bagging_predict_test)))
#Root mean squared error for test: 875.30

#16th model, ExtraTrees regression
from sklearn.ensemble import ExtraTreesRegressor

ExtraTrees = ExtraTreesRegressor()
ExtraTrees.fit(combined_train, Y_train)
ExtraTrees_predict_train = ExtraTrees.predict(combined_train)
ExtraTrees_predict_test = ExtraTrees.predict(combined_test)
print("Root mean squared error for train: %.2f" %
      math.sqrt(mean_squared_error(Y_train, ExtraTrees_predict_train)))
#Root mean squared error for train 2.99
print("Root mean squared error for test: %.2f" %
      math.sqrt(mean_squared_error(Y_test, ExtraTrees_predict_test)))
#Root mean squared error for test: 885.29
'''
External Weather API call: 
    WeatherStartLoc_StartTime, WeatherEndLoc_StartTime, 

Other ideas to consider: 
    - driver
        age, years of driving experience, years of driving experience in current city, avg driving speed-highway/local, driver ratings, #cars for this driver 
Example #41
def main():

    ### parsing and Data pre-processing
    # load the provided data
    train_features_path = os.path.join(data_path, 'dengue_features_train.csv')
    train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv')

    ### pre-processing data
    sj_train, iq_train = preprocess_data(train_features_path,
                                         labels_path=train_labels_path)
    #print(sj_train.describe())
    #print(iq_train.describe())

    kf = KFold(n_splits=6)

    sj_model_list = []
    sj_err_list = []
    loop = 1
    for train_index, val_index in kf.split(
            sj_train
    ):  #The index will be split into [train_index] and [val_index]
        X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index]
        sj_etr = ETR(n_estimators=800, max_depth=4, random_state=0, verbose=1)
        sj_etr.fit(X_train.drop('total_cases', axis=1), X_train['total_cases'])
        predictions = sj_etr.predict(X_val.drop('total_cases', axis=1))
        sj_err_list.append(
            eval_measures.meanabs(predictions, X_val.total_cases))
        sj_model_list.append(sj_etr)
        loop += 1
    print(sj_err_list)
    argmax = sorted(range(len(sj_err_list)), key=lambda x: sj_err_list[x])[0]
    print(argmax)
    sj_best_model = sj_model_list[argmax]

    iq_model_list = []
    iq_err_list = []
    loop = 1
    for train_index, val_index in kf.split(iq_train):
        X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index]
        iq_etr = ETR(n_estimators=400, max_depth=4, random_state=0)
        iq_etr.fit(X_train.drop('total_cases', axis=1), X_train['total_cases'])
        predictions = iq_etr.predict(X_val.drop('total_cases', axis=1))
        iq_err_list.append(
            eval_measures.meanabs(predictions, X_val.total_cases))
        iq_model_list.append(iq_etr)

        loop += 1
    print(iq_err_list)
    argmax = sorted(range(len(iq_err_list)), key=lambda x: iq_err_list[x])[0]
    print(argmax)
    iq_best_model = iq_model_list[argmax]

    ##Accessing testing data
    test_features_path = os.path.join(data_path, 'dengue_features_test.csv')
    sj_test, iq_test = preprocess_data(test_features_path)

    #Calculate the k-fold validation error
    sj_score = []
    for train_index, val_index in kf.split(sj_train):
        X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index]
        train_predict = np.array(
            sj_best_model.predict(X_val.drop('total_cases',
                                             axis=1))).astype(int)
        sj_score.append(eval_measures.meanabs(train_predict,
                                              X_val.total_cases))
    print("Mean of {} cross validation of sj_score is {} (+/- {})".format(
        kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score)))

    iq_score = []
    for train_index, val_index in kf.split(iq_train):
        X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index]
        train_predict = np.array(
            iq_best_model.predict(X_val.drop('total_cases',
                                             axis=1))).astype(int)
        iq_score.append(eval_measures.meanabs(train_predict,
                                              X_val.total_cases))
    print("Mean of {} cross validation of iq_score is {} (+/- {})".format(
        kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score)))

    ##Use the model sj_lr and iq_lr trained before to predict the testing data
    print("Predicting testing data...")
    sj_predictions = sj_best_model.predict(sj_test)
    iq_predictions = iq_best_model.predict(iq_test)
    sj_predictions = np.array(sj_predictions).astype(int)
    iq_predictions = np.array(iq_predictions).astype(int)

    print("Creating submit file...")
    ##Use submission_format as template to write the answer
    sample_path = os.path.join(data_path, 'submission_format.csv')
    submission = pd.read_csv(sample_path, index_col=[0, 1, 2])
    submission.total_cases = np.concatenate([sj_predictions, iq_predictions])
    submission.to_csv("./data/ext_new.csv")
    '''
Example #42
print("--------------------------------------")
print('MAE is {}'.format(test_score_mae))
print('MSE is {}'.format(test_score_mse))
print('EVS is {}'.format(test_score_evs))
print('ME is {}'.format(test_score_me))
print('R2 score is {}'.format(test_score_r2))
print()
print("Best parameters set found on development set:")
print(gs.best_params_)
print()

# Re-train with best parameters
regr = ExtraTreesRegressor(**gs.best_params_)

t0 = time.time()
regr.fit(x_train, y_train.ravel())
regr_fit = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" %
      regr_fit)

t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict))

with open('output.log', 'w') as f:
    print("Training time: %.6f s" % regr_fit, file=f)
    print("Prediction time: %.6f s" % regr_predict, file=f)
    print(" ", file=f)
    print("The model performance for training set", file=f)
    print("--------------------------------------", file=f)
Example #43
    #
    #
    # pred = pd.DataFrame(pred.reshape(-1,2000).T)
    # real = pd.DataFrame(test_Y.reshape(-1,2000).T)
    # score = np.sum(np.sum(np.abs((np.round(pred)-real)/(np.round(pred)+real))))/(2000*7)
    # print score



    #### produce the final submission answers
    train_Y = pd.read_csv('train_label.csv', index_col=False, header=None).values
    print train_Y.shape

    train_X = pd.read_csv('train_feature.csv', index_col=False).values
    print train_X.shape

    test_X = pd.read_csv('test_feature.csv', index_col=False).values
    print test_X.shape

    model = ExtraTreesRegressor(n_estimators=1000, random_state=1, n_jobs=-1,
                                min_samples_split=3, min_samples_leaf=1, max_depth=100)


    model.fit(train_X, train_Y)
    pred = model.predict(test_X)

    pred = np.round(pred).reshape((-1,2000)).T
    answer = np.zeros([2000, 15])
    answer[:,0] = range(1,2001)
    answer[:,1:] = pred
    pd.DataFrame(answer, dtype=int).to_csv('ExtRandomTree_n1000_pred.csv', header=None, index=False)
Example #44
class ForestEmbeddingsCounterfactual:
    """
    Counterfactual estimation using forest embeddings.

    Given explanatory variables X, target variable y and treatment variable W, 
    this class implements an individual counterfactual estimation model. 
    We can break down the process in four steps:

    1 - model step) Fit and validate an ensemble of trees (ET, RF, etc) from X to y
    2 - embedding step) Build a supervised embedding using forest's trees leaves
    3 - kNN step) For each sample, find K nearest neighbors in this new space 
    4 - comparison step) Compare W and y for each of the neighborhoods to determine the counterfactuals for each sample

    Parameters
    ----------

    model : object, optional (default=None)

    Forest-based model which implements sklearn's API, particularly the .apply() method. 
    Must be already configured. Classification and regression models accepted.

    If None, model will be ExtraTreesRegressor(n_estimators=1000, min_samples_leaf=5, bootstrap=True, n_jobs=-1).

    n_neighbors : int, optional (default=200)

    Number of neighbors to be considered at the kNN step. There's a bias-variance tradeoff here: 
    set n_neighbors too low, estimates will be volatile and unreliable. 
    Set n_neighbors too high, and the estimate will be biased (neighbors won't be comparable). 

    min_sample_effect : int, optional (default=10)

    The minimum number of samples in a neighborhood for the counterfactual estimate to be valid, for a given W. 
    If there's less treated/untreated elements than min_sample_effect in a neighborhood, the counterfactual will be NaN.

    save_explanatory : bool, optional (default=False)

    Save explanatory variables for explaining predictions. May cause large memory overhead.

    random_state : int, optional (default=None)

    If int, random_state is the seed used by the random number generator;
    If RandomState instance, random_state is the random number generator;
    If None, the random number generator is the RandomState instance used
    by `np.random`.
    
    """

    # initializing
    def __init__(self,
                 model=None,
                 n_neighbors=200,
                 min_sample_effect=10,
                 save_explanatory=False,
                 random_state=None):

        # storing model
        if model is None:
            self.model = ExtraTreesRegressor(n_estimators=1000,
                                             min_samples_leaf=5,
                                             bootstrap=True,
                                             n_jobs=-1)
        else:
            self.model = model

        # storing variables
        self.n_neighbors = int(n_neighbors)
        self.min_sample_effect = int(min_sample_effect)
        self.save_explanatory = save_explanatory
        self.random_state = random_state

    # method for computing embedding
    def _get_forest_embed(self, X):
        """
        Wrapper for extracting embeddings from forests given selected mode.
        Model must be fitted.
        """

        # applying the model to get leaves
        this_embed = self.model.apply(X)

        # returning forest embedding
        return this_embed

    # fit model and neighbors
    def fit(self, X, W, y, verbose=0):
        """
        Fit a counterfactual estimation model given explanatory variables X, treatment variable W and target y
        This method fits a forest-based model, extracts a supervised embedding from its leaves, 
        and builds a nearest neighbor index on the embedding

        Parameters
        ----------
        
        X : array-like or sparse matrix of shape = [n_samples, n_features]
        
        Data with explanatory variables, with possible confounders of treatment assignment and effect.

        W : array-like, shape = [n_samples] 

        Treatment variable. The model will try to estimate a counterfactual outcome for each unique value in this variable.
        Should not exceed 10 values.

        y: array-like, shape = [n_samples]
    
        Target variable. 

        verbose : int, optional (default=0)

        Verbosity level.

        Returns
        -------

        self: object

        """

        # checking if W has too many unique values
        if len(np.unique(W)) > 10:
            raise ValueError(
                'More than 10 unique values for W. Too many unique values will make the process very expensive.'
            )

        # fitting the model
        self.model.fit(X, y)

        # getting forest embedding from model
        self.train_embed_ = self._get_forest_embed(X)

        # create neighbor index
        self.nn_index = NNDescent(self.train_embed_, metric='hamming')

        # creating a df with treatment assignments and outcomes
        self.train_outcome_df = pd.DataFrame({
            'neighbor': range(X.shape[0]),
            'y': y,
            'W': W
        })

        # saving explanatory variables
        if self.save_explanatory:
            self.X_train = X.assign(W=W, y=y)

        # return self
        return self

    # method for predicting counterfactuals
    def predict(self, X, verbose=0):
        """
        Predict counterfactual outcomes for X. 
        This method will search the nearest neighbor index built using .fit(), and estimate
        counterfactual outcomes using kNN

        Parameters
        ----------
        
        X : array-like or sparse matrix of shape = [n_samples, n_features]
        
        Data with explanatory variables, with possible confounders of treatment assignment and effect.

        verbose : int, optional (default=0)

        Verbosity level.

        Returns
        -------
        
        counterfactual_df : pd.DataFrame

        Counterfactual outcomes per sample.

        """

        # getting forest embedding from model
        X_embed_ = self._get_forest_embed(X)

        # getting nearest neighbors and distances from index
        neighs, dists = self.nn_index.query(X_embed_, k=self.n_neighbors + 1)

        # creating a df for neighbor ids
        neighs_df = (pd.DataFrame(neighs).reset_index().melt(
            id_vars='index').rename(columns={
                'index': 'id',
                'value': 'neighbor'
            }).reset_index(drop=True))

        # creating a df for the similarities
        similarities_df = (pd.DataFrame(1 - dists).reset_index().melt(
            id_vars='index').rename(columns={
                'index': 'id',
                'value': 'weight'
            }).reset_index(drop=True))

        # joining the datasets and adding weighted y variable
        nearest_neighs_df = (neighs_df.merge(similarities_df).drop(
            'variable', axis=1).merge(
                self.train_outcome_df, on='neighbor', how='left').assign(
                    y_weighted=lambda x: x.y * (x.weight)).sort_values('id'))

        # processing to get the effects
        counterfactual_df = nearest_neighs_df.assign(count=1).groupby(
            ['id', 'W']).sum()
        #counterfactual_df['y_hat'] = counterfactual_df['y']/counterfactual_df['count']
        counterfactual_df['y_hat'] = counterfactual_df[
            'y_weighted'] / counterfactual_df['weight']
        counterfactual_df.loc[
            counterfactual_df['count'] < self.min_sample_effect,
            'y_hat'] = np.nan
        counterfactual_df = counterfactual_df.pivot_table(values=['y_hat'],
                                                          columns='W',
                                                          index='id')

        # returning counterfactual df
        return counterfactual_df

    # running CV for model parameters
    def get_cross_val_scores(self, X, y, scoring=None, verbose=0):
        """
        Estimate model generalization power with 5-fold CV.

        Parameters
        ----------
        
        X : array-like or sparse matrix of shape = [n_samples, n_features]
        
        Data with explanatory variables, with possible confounders of treatment assignment and effect.

        y: array-like, shape = [n_samples]

        Target variable. 
        
        scoring : string, callable or None, optional, default: None
        
        Scoring method for sklearn's cross_val_score function:

        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)`` which should return only
        a single value.

        Similar to :func:`cross_validate`
        but only a single metric is permitted.

        If None, the estimator's default scorer (if available) is used.
        
        verbose : int, optional (default=0)

        Verbosity level for sklearn's function cross_val_score.

        Returns
        -------
        
        scores : array of float, shape=(len(list(cv)),)
        Array of scores of the estimator for each run of the cross validation.
        
        """

        # CV method
        kf = KFold(n_splits=5, shuffle=True, random_state=self.random_state)

        # generating validation predictions
        scores = cross_val_score(self.model,
                                 X,
                                 y,
                                 cv=kf,
                                 scoring=scoring,
                                 verbose=verbose)

        # calculating result
        return scores

    # generating manifold with UMAP
    def get_umap_embedding(self, X, verbose=0):
        """
        Compute a 2D manifold from the forest embedding for validation and criticism.

        Parameters
        ----------
        
        X : array-like or sparse matrix of shape = [n_samples, n_features]
        
        Data with explanatory variables, with possible confounders of treatment assignment and effect.

        verbose : int, optional (default=0)

        Verbosity level for UMAP.

        Returns
        -------
        
        reduced_embed : array of shape = [n_samples, 2]

        2D representation of forest embedding using UMAP. 

        """

        # getting forest embedding from model
        X_embed_ = self._get_forest_embed(X)

        # reducing embedding to 2 dimensions
        reduced_embed = (UMAP(metric='hamming',
                              verbose=verbose).fit_transform(X_embed_))

        # returning
        return reduced_embed

    # method for explaining predictions
    def explain(self, sample):
        """
        Explain predictions of counterfactual outcomes for one sample.
        This method shows diagnostics and comparables so you can trust
        and explain counterfactual predictions to others.

        Parameters
        ----------
        sample : array-like or sparse matrix of shape = [1, n_features]
            Sample that you want to get explanations for.

        Returns
        -------
        comparables_table : pd.DataFrame
            Table of comparable elements.

        """

        # getting forest embedding from model
        sample_embed = self._get_forest_embed(sample)

        # getting nearest neighbors and distances from index
        neighs, dists = self.nn_index.query(sample_embed,
                                            k=self.n_neighbors + 1)

        # querying comparables
        if self.save_explanatory:
            comparables_table = self.X_train.iloc[neighs[0]]
        else:
            raise ValueError(
                'Model did not store training samples to get explanations from. Setting save_explanatory=True will solve the issue'
            )

        # returning comparables table
        return comparables_table
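# --- Added usage sketch (not part of the original snippet) ---
# The class definition above is truncated, so its name and the exact .fit()
# signature are not visible here. The sketch below only assumes an already
# fitted instance called `cf_model` trained with a binary treatment column W
# in {0, 1}, plus the predict()/explain()/get_cross_val_scores()/
# get_umap_embedding() methods shown above.
#
# cf = cf_model.predict(test_X)                       # pivot table indexed by sample id
# cate = cf[('y_hat', 1)] - cf[('y_hat', 0)]          # per-sample effect: treated minus control
# scores = cf_model.get_cross_val_scores(train_X, train_y,
#                                        scoring='neg_mean_squared_error')
# embedding_2d = cf_model.get_umap_embedding(test_X)  # 2D manifold for model criticism
# comparables = cf_model.explain(test_X.iloc[[0]])    # nearest training samples for sample 0
#                                                     # (requires save_explanatory=True)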
예제 #45
0
train_df: pandas.DataFrame
if not use_full_df:
    train_df = pcs_data_loader.load_corn_rows_sample_shaped_pickle_gz()
else:
    train_df = pcs_data_loader.shape_pps_data(pcs_data_loader.load_corn_rows_pickle_gz())

# load training data and train et model
y = train_df['Dry_Yield']
X = train_df.drop(['Dry_Yield', 'Area'], axis=1)
scaler = StandardScaler()
scaler.fit(X)

print('fitting model')
model = ExtraTreesRegressor(n_jobs=n_jobs, n_estimators=n_estimators, verbose=99)
model.fit(scaler.transform(X), y)

model_path_ = f'{result_base_path}/et_model_{run_id}.pickle'
with open(model_path_, 'wb') as f:
    pickle.dump(model, f)
    print(f'model saved: {model_path_}')

scaler_path_ = f'{result_base_path}/et_scaler_{run_id}.pickle'
with open(scaler_path_, 'wb') as f:
    pickle.dump(scaler, f)
    print(f'scaler saved: {scaler_path_}')
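
# Added sketch (not in the original snippet): a quick round-trip check that the
# pickled artifacts can be restored and reused for inference; it reuses the
# first rows of the training frame X only because no new data is available here.
with open(model_path_, 'rb') as f:
    restored_model = pickle.load(f)
with open(scaler_path_, 'rb') as f:
    restored_scaler = pickle.load(f)
sanity_preds = restored_model.predict(restored_scaler.transform(X[:5]))
print('round-trip predictions on 5 rows:', sanity_preds)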

results = []
for idx, elb_data in enumerate(pcs_data_loader.load_cached_elbs(df.columns)):
    year_id, elb_X, elb_y, extra_cols = elb_data
    print(f'comparing elb year id: {year_id}, index: {idx}')
예제 #46
0
    def model_extra_tree(self):
        model = ExtraTreesRegressor(n_estimators=self.n_est)  # , max_depth=16, random_state=42)
        model.fit(self.train_x, self.train_y)
        self.y_pred = model.predict(self.test_x)
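        # Added sketch (not in the original snippet): one way to score the
        # hold-out predictions, assuming `from sklearn.metrics import r2_score`
        # is available in this module and self.test_y holds the true targets:
        # self.extra_tree_r2 = r2_score(self.test_y, self.y_pred)
        # print('ExtraTreesRegressor R^2 on the test split:', self.extra_tree_r2)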
예제 #47
0
       'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
       'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',
       'Destination_Kolkata', 'Destination_New Delhi']]
X.head()

y=data_train.iloc[:,1]
y.head()

#Find the correlation
plt.figure(figsize=(18,18))
sns.heatmap(train_data.corr(),annot=True, cmap='RdYlGn')
plt.show()

from sklearn.ensemble import ExtraTreesRegressor
selection=ExtraTreesRegressor()
selection.fit(X,y)

#plot the importance of feature
plt.figure(figsize=(12,18))
fea_importance=pd.Series(selection.feature_importances_, index=X.columns)
fea_importance.nlargest(20).plot(kind='barh')
plt.show()

#Fitting Random Forest model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

from sklearn.ensemble import RandomForestRegressor
reg_rf = RandomForestRegressor()
reg_rf.fit(X_train, y_train)
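
# Added sketch (not part of the original notebook cell): a quick evaluation of
# the random forest on the hold-out split; only the metric imports are new.
from sklearn.metrics import r2_score, mean_absolute_error

y_pred = reg_rf.predict(X_test)
print('R^2 :', r2_score(y_test, y_pred))
print('MAE :', mean_absolute_error(y_test, y_pred))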
예제 #48
0
File: model.py  Project: Rachneet/car_price
# top_corr_features = corr.index
# plt.figure(figsize=(20,20))
# g = sns.heatmap(mod_df[top_corr_features].corr(), annot=True, cmap='RdYlGn')
# plt.show()

X = mod_df.iloc[:, 1:]
y = mod_df.iloc[:, 0]

# print(X.head())
# print(y.head())

# feature importance
from sklearn.ensemble import ExtraTreesRegressor

model = ExtraTreesRegressor()
model.fit(X, y)
# print(model.feature_importances_)

# visualize feature importances
# feat_imp = pd.Series(model.feature_importances_, index=X.columns)
# feat_imp.nlargest(n=5).plot(kind='barh')
# plt.show()

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
# print(X_train.shape)

from sklearn.ensemble import RandomForestRegressor

rf_random = RandomForestRegressor()
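
# Added sketch (not in the original snippet): the name rf_random suggests a
# randomized hyperparameter search; a plausible continuation, with an assumed
# search space, would look like this.
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [100, 300, 500, 700],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
rf_search = RandomizedSearchCV(rf_random, param_distributions=param_dist,
                               n_iter=10, cv=5, scoring='neg_mean_squared_error',
                               n_jobs=-1, random_state=42, verbose=1)
rf_search.fit(X_train, y_train)
print(rf_search.best_params_)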
예제 #49
0
    rgrs_1 = RandomForestRegressor(n_estimators=500,
                                   max_features=10,
                                   max_depth=15,
                                   min_samples_leaf=4,
                                   n_jobs=-1)
    rgrs_1.fit(train_raw[l1], train_y[l1])
    pred_1 = rgrs_1.predict(train_raw[l2])
    pred_1_test = rgrs_1.predict(test_raw)

    print('generating et ...')
    rgrs_2 = ExtraTreesRegressor(n_estimators=500,
                                 max_features=15,
                                 max_depth=15,
                                 min_samples_leaf=4,
                                 n_jobs=-1)
    rgrs_2.fit(train_raw[l1], train_y[l1])
    pred_2 = rgrs_2.predict(train_raw[l2])
    pred_2_test = rgrs_2.predict(test_raw)

    # xgb on raw
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.01
    params["max_depth"] = 7
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.8
    params["min_child_weight"] = 5
    params["silent"] = 1
    plst = list(params.items())

    eval_rat = int(0.8 * len(l1))
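
    # Added sketch (not in the original snippet): one plausible way these xgboost
    # parameters get consumed, assuming `import xgboost as xgb` and that l1 is
    # split at eval_rat into a training part and an evaluation part.
    # dtrain = xgb.DMatrix(train_raw[l1[:eval_rat]], label=train_y[l1[:eval_rat]])
    # deval = xgb.DMatrix(train_raw[l1[eval_rat:]], label=train_y[l1[eval_rat:]])
    # bst = xgb.train(plst, dtrain, num_boost_round=2000,
    #                 evals=[(dtrain, 'train'), (deval, 'eval')],
    #                 early_stopping_rounds=50)
    # pred_3 = bst.predict(xgb.DMatrix(train_raw[l2]))
    # pred_3_test = bst.predict(xgb.DMatrix(test_raw))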
예제 #50
0
df = df.drop(['Expected', 'Id'], axis=1)

# print "Prepare folds for cross validation"
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    data, label, test_size=0.8, random_state=23435)

# conf = sklearn.metrics.confusion_matrix(df['missing_values'], df['sample_weights'])
# plt.imshow(conf, cmap='binary', interpolation='none')

print "RandomForestRegressor..."
clf = sklearn.ensemble.RandomForestRegressor(verbose=2, n_jobs=2)
clf.fit(x_train, y_train)
print mean_squared_error(clf.predict(x_test), y_test)
# with open('models/RandomForestRegressor.pkl', 'wb') as fid:
#     cpk.dump(clf, fid)

print "GradientBoostingRegressor..."
clf = sklearn.ensemble.GradientBoostingRegressor(verbose=2)
clf.fit(x_train, y_train)
print mean_squared_error(clf.predict(x_test), y_test)
# with open('models/GradientBoostingRegressor.pkl', 'wb') as fid:
#     cpk.dump(clf, fid)

print "ExtraTreesRegressor..."

clf = ExtraTreesRegressor(n_estimators=20, verbose=2, n_jobs=-1)
clf.fit(x_train, y_train)
print mean_squared_error(clf.predict(x_test), y_test)
# with open('models/ExtraTreesRegressor.pkl', 'wb') as fid:
#     cpk.dump(clf, fid)
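
# Added sketch (not in the original snippet): the fitted extra-trees model also
# exposes per-feature importances, which can be used to rank the input columns.
import numpy as np

importance_order = np.argsort(clf.feature_importances_)[::-1]
print("top 10 feature indices by importance:", importance_order[:10])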
예제 #51
0
class Model():
    def __init__(self, model_type, features=[]):
        self.model = None
        self.model_type = model_type
        self.features = features

    # initialize and fit xgboost model
    def xgb_model(self,
                  train_X,
                  train_y,
                  val_X=None,
                  val_y=None,
                  seed_val=seed,
                  num_rounds=2500):

        param = {}
        param['objective'] = 'binary:logistic'
        param['eval_metric'] = 'logloss'
        param['eta'] = 0.03
        param['max_depth'] = 6
        param['silent'] = 1
        param['subsample'] = 0.8
        param['colsample_bytree'] = 0.8
        param['min_child_weight'] = 8
        param['scale_pos_weight'] = 0.360

        # param['nthread'] = 4
        param['seed'] = seed_val
        num_rounds = num_rounds

        plst = list(param.items())

        # model = xgb.train(plst, xgtrain, num_rounds, verbose_eval=True)

        if val_X is not None and val_y is not None:
            xgtrain = xgb.DMatrix(train_X, label=train_y)
            xgval = xgb.DMatrix(val_X, label=val_y)

            watchlist = [(xgtrain, 'train'), (xgval, 'val')]
            model = xgb.train(plst,
                              xgtrain,
                              num_rounds,
                              watchlist,
                              early_stopping_rounds=20,
                              verbose_eval=True)
        else:
            _train_X, _val_X, _train_y, _val_y = sklearn.model_selection.train_test_split(
                train_X, train_y, test_size=0.1, random_state=seed)

            xgtrain = xgb.DMatrix(_train_X, label=_train_y)
            xgval = xgb.DMatrix(_val_X, label=_val_y)

            watchlist = [(xgtrain, 'train'), (xgval, 'val')]
            model = xgb.train(plst,
                              xgtrain,
                              num_rounds,
                              watchlist,
                              early_stopping_rounds=20,
                              verbose_eval=True)

        return model

    # initialize and fit lightgbm model
    def lgb_model(self,
                  train_X,
                  train_y,
                  val_X=None,
                  val_y=None,
                  seed_val=seed,
                  num_rounds=2500):

        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'binary_logloss',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': 0,
            'num_threads': 64,
            'scale_pos_weight': 0.360
        }

        # model = lgb.train(params, lgb_train, num_boost_round=num_rounds)

        if val_X is not None and val_y is not None:
            lgb_train = lgb.Dataset(train_X, train_y)
            lgb_val = lgb.Dataset(val_X, val_y, reference=lgb_train)

            model = lgb.train(params,
                              lgb_train,
                              num_boost_round=num_rounds,
                              valid_sets=lgb_val,
                              early_stopping_rounds=20)
        else:
            _train_X, _val_X, _train_y, _val_y = sklearn.model_selection.train_test_split(
                train_X, train_y, test_size=0.1, random_state=seed)

            lgb_train = lgb.Dataset(_train_X, _train_y)
            lgb_val = lgb.Dataset(_val_X, _val_y, reference=lgb_train)

            model = lgb.train(params,
                              lgb_train,
                              num_boost_round=num_rounds,
                              valid_sets=lgb_val,
                              early_stopping_rounds=20)

        return model

    # get predictions
    def predict(self, test_X, test_y=None):

        if self.features:
            test_X = test_X[self.features]

        if self.model:

            if self.model_type == 'xgboost':
                xgtest = xgb.DMatrix(test_X)
                preds = self.model.predict(xgtest)
                preds = preds.reshape(-1, 1)

            elif self.model_type == 'lgb':
                preds = self.model.predict(test_X)
                preds = preds.reshape(-1, 1)

            elif self.model_type == 'ExtraTreesRegressor':
                preds = self.model.predict(test_X)
                preds = preds.reshape(-1, 1)

            else:
                preds = self.model.predict_proba(test_X)[:, 1]
                preds = preds.reshape(-1, 1)

            if test_y is not None:
                print('log_loss: ', log_loss(test_y, preds))

            return preds
        else:
            raise RuntimeError(
                'No trained model was found... You have to first fit the model'
            )

    # fit model on full feature set or subset if provided
    def fit(self, train_X, train_y, val_X=None, val_y=None):

        if self.features:
            train_X = train_X[self.features]

        if self.model_type == 'xgboost':
            self.model = self.xgb_model(train_X, train_y, val_X, val_y)

        elif self.model_type == 'lgb':
            self.model = self.lgb_model(train_X, train_y, val_X, val_y)

        elif self.model_type == 'RandomForestClassifier':
            self.model = RandomForestClassifier(n_estimators=150,
                                                n_jobs=-1,
                                                class_weight={
                                                    1: 0.472001959,
                                                    0: 1.309028344
                                                })
            self.model.fit(train_X, train_y)

        elif self.model_type == 'LogisticRegression':
            self.model = LogisticRegression(C=0.1,
                                            solver='sag',
                                            class_weight={
                                                1: 0.472001959,
                                                0: 1.309028344
                                            })
            self.model.fit(train_X, train_y)

        elif self.model_type == 'svm':
            self.model = SVC(random_state=seed,
                             probability=True,
                             verbose=True,
                             class_weight={
                                 1: 0.472001959,
                                 0: 1.309028344
                             })
            self.model.fit(train_X, train_y)

        elif self.model_type == 'fastFM':
            self.model = sgd.FMClassification(n_iter=1000,
                                              init_stdev=0.1,
                                              rank=2,
                                              step_size=0.1)
            self.model.fit(train_X, train_y)
            #To be completed => http://arogozhnikov.github.io/2016/02/15/TestingLibFM.html

        elif self.model_type == 'KNeighborsClassifier':
            self.model = KNeighborsClassifier(n_neighbors=5,
                                              weights='uniform',
                                              algorithm='auto',
                                              leaf_size=30,
                                              p=2,
                                              metric='minkowski',
                                              metric_params=None,
                                              n_jobs=-1)
            self.model.fit(train_X, train_y)

        elif self.model_type == 'AdaBoostClassifier':
            self.model = AdaBoostClassifier(n_estimators=1000,
                                            random_state=seed)
            self.model.fit(train_X, train_y)

        elif self.model_type == 'ExtraTreesClassifier':
            self.model = ExtraTreesClassifier(n_estimators=200,
                                              max_depth=None,
                                              min_samples_split=2,
                                              n_jobs=-1,
                                              class_weight={
                                                  1: 0.472001959,
                                                  0: 1.309028344
                                              })
            self.model.fit(train_X, train_y)

        elif self.model_type == 'ExtraTreesRegressor':
            self.model = ExtraTreesRegressor(n_estimators=200,
                                             max_depth=None,
                                             min_samples_split=2,
                                             n_jobs=-1)
            self.model.fit(train_X, train_y)
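# --- Added usage sketch (not part of the original snippet) ---
# How this wrapper is typically driven; train_X, train_y, val_X, val_y and
# test_X are placeholders for data prepared elsewhere, so the calls are left
# commented out.
# clf = Model(model_type='lgb', features=['feat_1', 'feat_2'])
# clf.fit(train_X, train_y, val_X, val_y)
# test_preds = clf.predict(test_X)
#
# et = Model(model_type='ExtraTreesRegressor')
# et.fit(train_X, train_y)
# et_preds = et.predict(test_X)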
# train 485 reg,  test 162 reg , eval 162 reg

#ExtraTreesRegressor(n_estimators=100, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer,accuracy_score,r2_score

modelo= ExtraTreesRegressor(bootstrap=True, ccp_alpha=0.1, criterion='mse',
                    max_depth=None, max_features=None, max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.30,
                    min_impurity_split=None, min_samples_leaf=2,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=True)

modelo.fit(X, y)
y_test_predict = modelo.predict(X_val)
resultados = y_test_predict

# Compute the coefficient of determination (R^2):
r2_score(y_val, resultados)

res=pd.DataFrame()
res['predicho']=resultados
res['real']=y_val

res['errorAbs']=res['predicho']-res['real']
res['errorCuad']=(res['predicho']-res['real'])**(2)

# Visualize the result with a histogram of absolute errors  <<< Figure 7.31 >>>
plt.title('Histogram of absolute errors (extremely randomized trees)')
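# Added sketch (not in the original snippet): the plotting call itself is cut
# off above; a plausible completion of the error histogram would be:
# plt.hist(res['errorAbs'], bins=50)
# plt.xlabel('error')
# plt.show()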
        axis=1,
        errors='ignore',
        inplace=True)
X = df.drop(['Dry_Yield'], axis=1, errors='ignore')
y = df['Dry_Yield']
label_mask = numpy.isin(X.columns, label_cols)

enc = DummyEncoder(label_mask)
enc.fit(X)

scaler = StandardScaler()
scaler.fit(X.loc[:, ~label_mask].fillna(0))

model = ExtraTreesRegressor(verbose=99, min_samples_leaf=7, n_jobs=-1)
X_scaled = transform(X, enc, label_mask)
model.fit(X_scaled, y)

elb_results = []
for idx, (year_id, elb_df) in enumerate(load_cached_elbs()):
    elb_df.drop(['Year', 'YearId', 'ProcessedLayerUID', 'Area'],
                axis=1,
                errors='ignore',
                inplace=True)
    elb_df = elb_df[df.columns]
    elb_X = elb_df.drop(['Dry_Yield'], axis=1)
    elb_X_scaled = transform(elb_X, enc, label_mask)
    elb_y = elb_df['Dry_Yield']

    predictions = model.predict(elb_X_scaled)
    elb_score = ScoreReport(elb_y.values, predictions)
    elb_results.append(elb_score)
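
# Added sketch (not in the original snippet): a simple way to summarize the
# per-ELB comparison without relying on the custom ScoreReport class, assuming
# sklearn.metrics is importable; it reuses the last loop iteration's arrays.
# from sklearn.metrics import r2_score, mean_absolute_error
# print('last ELB R^2:', r2_score(elb_y, predictions))
# print('last ELB MAE:', mean_absolute_error(elb_y, predictions))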