def build_models(self):
    """Drop a fixed set of columns, then fit three tree-ensemble regressors.

    Calls ``self.remove_columns`` with the hard-coded column list and fits a
    gradient-boosting, a random-forest and an extra-trees regressor on
    ``self.X`` / ``self.y``.

    Returns:
        list: ``[gradient_boosting, random_forest, extra_trees]``, all fitted.
    """
    dropped_columns = [
        "institute_latitude",
        "institute_longitude",
        "institute_state",
        "institute_country",
        "var10",
        "var11",
        "var12",
        "var13",
        "var14",
        "var15",
        "instructor_past_performance",
        "instructor_association_industry_expert",
        "secondary_area",
        "var24",
    ]
    self.remove_columns(dropped_columns)

    # Every estimator is constructed before the first one is fitted.
    ensemble = [
        GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, subsample=0.8),
        RandomForestRegressor(n_estimators=50),
        ExtraTreesRegressor(n_estimators=50),
    ]
    for regressor in ensemble:
        regressor.fit(self.X, self.y)
    return ensemble
def do_etrees(filename):
    """Fit an extra-trees regressor and predict the first 200 rows.

    Args:
        filename: Passed to ``create_merged_dataset``, which returns the
            feature frame and the target.

    Returns:
        pandas.DataFrame with ``driver``, ``trip`` and ``probs`` columns for
        the first 200 rows of the merged dataset.
    """
    df, Y = create_merged_dataset(filename)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1,
                                min_samples_leaf=5, random_state=SEED)
    # ``axis`` is passed by keyword: the positional form of ``DataFrame.drop``
    # was deprecated and later removed from pandas.
    X = df.drop(['driver', 'trip'], axis=1)
    etree.fit(X, Y)
    # NOTE(review): these predictions are for rows the model was trained on.
    probs = etree.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200],
                         'trip': df['trip'][:200],
                         'probs': probs})
def fit(self, X, y, weights=None, **kwargs):
    """Fit coefficients driven by extra-trees feature importances.

    Args:
        X: 2-D array of covariates.
        y: 1-D array of targets (reshaped to a column internally).
        weights: Optional per-sample weights; defaults to all ones.
        **kwargs: Forwarded to ``ExtraTreesRegressor``.

    Returns:
        self, with ``importances`` and ``coef_`` set, plus ``intercept_``
        when ``self.fit_intercept`` is true.
    """
    if weights is None:
        weights = np.ones(y.shape[0])

    # Column 0 holds the response, the remaining columns the covariates.
    data = np.hstack((y.reshape(y.shape[0], 1), X))
    S = wcov(data, weights)
    corr = wcorr(data, weights)
    wsd = np.sqrt(S.diagonal())

    ExtraTrees = ExtraTreesRegressor(**kwargs)
    ExtraTrees.fit(X, y, sample_weight=weights)

    Rsquare = (S[0, 1:].dot(np.linalg.inv(S[1:, 1:]).dot(S[1:, 0]))) / S[0, 0]
    # assign proportion of Rsquare to each covariate dep. on importance
    self.importances = ExtraTrees.feature_importances_ * Rsquare
    model = self.constrained_optimization(corr)

    if self.fit_intercept:
        # Weighted column means as one vector-matrix product. The previous
        # form materialised an n-by-n diagonal matrix for the same result.
        wmean = (weights / np.sum(weights)).dot(data)
        self.intercept_ = wmean[0] - wsd[0] * np.sum(wmean[1:] * model.x / wsd[1:])
    self.coef_ = wsd[0] * model.x / wsd[1:]
    return self
def cal_important_features(batch=10, threshold=1e-4):
    """Tally how often each attribute value is reported as important.

    Trains ``batch`` extra-trees models on the same prepared samples, passes
    each model's importances and ``threshold`` to ``show_important_features``
    and counts, per attribute in ``ATRS[5:]``, the number of rounds in which
    each returned value appeared.

    Returns:
        dict: attribute -> {value: number of rounds it was reported}.
    """
    X_samples, Y_samples, scaler = dat.data_prepare('ocpm', 'lifetime_ecpm', outlier=0.05)
    tracked = ATRS[5:]
    tot_goot_atrs = {attribute: {} for attribute in tracked}

    for i in np.arange(1, batch + 1):
        Ts = timeit.default_timer()
        model = ExtraTreesRegressor(n_jobs=6)
        model.fit(X_samples, Y_samples)
        print("Totally %i features." % len(model.feature_importances_))
        print("[Labels] %i categories, %i interests, %i client_names, %i auto_tags" % (num.categories_len, num.interests_len, num.client_names_len, num.auto_tags_len))
        good_atrs = show_important_features(model.feature_importances_, threshold)
        for attribute in reversed(tracked):
            counts = tot_goot_atrs[attribute]
            for value in good_atrs[attribute]:
                counts[value] = counts.get(value, 0) + 1
        print("%i batch finished in %.1f secs." % (i, (timeit.default_timer() - Ts)))
        print("------------------------------------------------")

    # show performances
    for atr in reversed(tracked):
        print("-------[%s]-----------------------" % atr)
        for j in np.arange(1, batch + 1):
            n_keys = sum(1 for v in tot_goot_atrs[atr].values() if v >= j)
            print("%i keys occurs > %i times." % (n_keys, j))
    return tot_goot_atrs
def predict_with_one(X, out_file_name):
    """Score how well each single column predicts every column.

    For every ordered pair ``(i, j)`` an extra-trees model is trained on
    column ``i`` alone to predict column ``j``. The mean absolute error,
    averaged over three shuffle splits, is written as a CSV matrix to
    ``CODE_PATH/out_file_name``.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        out_file_name: File name of the CSV written under ``CODE_PATH``.
    """
    n_samples, n_features = X.shape
    iter_num = 3
    div = ShuffleSplit(n_samples, n_iter=iter_num, test_size=0.2, random_state=0)
    model = ExtraTreesRegressor(n_estimators=5)
    score_matrix = np.zeros((n_features, n_features))
    t = time()

    for round_num, (train, test) in enumerate(div, 1):
        train_samples = X[np.array(train)]
        test_samples = X[np.array(test)]
        for i in range(n_features):
            # The single predictor column is the same for every target j.
            X_train = train_samples[:, i:i+1]
            X_test = test_samples[:, i:i+1]
            for j in range(n_features):
                model.fit(X_train, train_samples[:, j])
                mae = mean_absolute_error(test_samples[:, j], model.predict(X_test))
                score_matrix[i, j] += mae
                print('Round', round_num, '|', i, j, mae, time()-t)

    np.savetxt(os.path.join(CODE_PATH, out_file_name), score_matrix/iter_num, fmt='%.3f', delimiter=',')
def mul_dtree(X, Y2):
    """Fit on the first 200 rows, predict the rest and print the actual targets.

    Args:
        X: Feature rows; the first 200 are used for training.
        Y2: Targets aligned with ``X``.

    Returns:
        The predictions for ``X[200:]``. They were previously computed and
        thrown away; callers that ignored the old ``None`` are unaffected.
    """
    # ``compute_importances`` is no longer accepted by scikit-learn, which now
    # always exposes ``feature_importances_``.
    forest = ExtraTreesRegressor(n_estimators=5, random_state=0)
    forest.fit(X[:200], Y2[:200])
    predictions = forest.predict(X[200:])
    print(Y2[200:])
    return predictions
def fit(self, X, y, **kwargs):
    """Fit an extra-trees regressor, overriding known init parameters.

    Args:
        X: Training features.
        y: Training targets.
        **kwargs: Overrides for entries already present in
            ``self.INITPARAMS``; unknown keys are ignored.

    The fitted estimator is stored in ``self.model``.
    """
    # ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only.
    for key, value in kwargs.items():
        if key in self.INITPARAMS:
            # NOTE(review): this writes into INITPARAMS itself, so an override
            # persists for later fits (and for other instances if the dict is
            # a class attribute) - confirm that is intended.
            self.INITPARAMS[key] = value
    model = ExtraTreesRegressor(**self.INITPARAMS)
    model.fit(X, y)
    self.model = model
def classify(self):
    """Fit an extra-trees regressor on the training data and predict the test data.

    Reads the training features/labels and the test features from the
    name-mangled ``_ClassifyDriver__*`` attributes and stores the predictions
    in ``self._ClassifyDriver__y``.
    """
    train_features = self._ClassifyDriver__traindata
    train_labels = self._ClassifyDriver__trainlabels
    regressor = ETRegressor(n_estimators=500, min_samples_split=5, min_samples_leaf=2)
    regressor.fit(train_features, train_labels)
    self._ClassifyDriver__y = regressor.predict(self._ClassifyDriver__testdata)
def build_extra_tree_regressor(X_test, X_train_full, y_train_full):
    """Train a 500-tree extra-trees regressor and return its test predictions.

    Args:
        X_test: Features to predict.
        X_train_full: Training features.
        y_train_full: Training targets.

    Returns:
        The predictions for ``X_test``.
    """
    print("Building ExtraTrees regressor...")
    regressor = ExtraTreesRegressor(n_estimators=500)
    regressor.fit(X_train_full, y_train_full)
    return regressor.predict(X_test)
def reg_skl_etr(param, data):
    """Train an extra-trees regressor and score it on the validation split.

    Args:
        param: Mapping with ``n_estimators``, ``max_features``, ``n_jobs`` and
            ``random_state``.
        data: Six-element sequence ``[X_tr, X_cv, y_class_tr, y_class_cv,
            y_reg_tr, y_reg_cv]``; the two classification targets are unused.

    Returns:
        tuple: ``(score from getscoreRMSE, validation predictions)``.
    """
    X_tr, X_cv, _y_class_tr, _y_class_cv, y_reg_tr, y_reg_cv = data
    settings = dict(
        n_estimators=param['n_estimators'],
        max_features=param['max_features'],
        n_jobs=param['n_jobs'],
        random_state=param['random_state'],
    )
    regressor = ExtraTreesRegressor(**settings)
    regressor.fit(X_tr, y_reg_tr)
    pred = regressor.predict(X_cv)
    return getscoreRMSE(y_reg_cv, pred), pred
def extra_trees_regressor(x, y, n_estimators, max_depth):
    """Return the mean 3-fold RMSE of an extra-trees regressor.

    Args:
        x: Feature array, indexable by the fold index arrays.
        y: Target array aligned with ``x``.
        n_estimators: Number of trees.
        max_depth: Maximum tree depth.

    Returns:
        float: Mean of the per-fold root mean squared errors.
    """
    folds = KFold(len(x), n_folds=3)
    fold_rmse = []
    for train_index, test_index in folds:
        regressor = ExtraTreesRegressor(n_estimators=n_estimators,
                                        max_depth=max_depth,
                                        random_state=0)
        regressor.fit(x[train_index], y[train_index])
        fold_mse = mean_squared_error(regressor.predict(x[test_index]), y[test_index])
        fold_rmse.append(fold_mse ** 0.5)
    return np.mean(fold_rmse)
class MyExtraTreeReg(MyRegressor):
    """Wrapper around ``ExtraTreesRegressor`` with feature-importance reports."""

    def __init__(self, params=None):
        """Create the regressor.

        Args:
            params: Optional keyword arguments for ``ExtraTreesRegressor``.
        """
        # Copy the mapping: ``update_params`` mutates ``self._params``, and
        # with the old shared ``dict()`` default that leaked settings from one
        # instance into the next (and into the caller's dict).
        self._params = dict(params) if params is not None else {}
        self._extree = ExtraTreesRegressor(**self._params)

    def update_params(self, updates):
        """Merge ``updates`` into the parameters and rebuild the estimator."""
        self._params.update(updates)
        self._extree = ExtraTreesRegressor(**self._params)

    def fit(self, Xtrain, ytrain):
        """Fit the underlying estimator."""
        self._extree.fit(Xtrain, ytrain)

    def predict(self, Xtest, option=None):
        """Return predictions for ``Xtest``; ``option`` is unused."""
        return self._extree.predict(Xtest)

    def plt_feature_importance(self, fname_list, f_range=None):
        """Show a horizontal bar chart of feature importances.

        Args:
            fname_list: Feature names, indexed like the training columns.
            f_range: Optional ranks (0 = most important) to plot; all
                features when empty or omitted.
        """
        importances = self._extree.feature_importances_
        # Spread of each feature's importance across the individual trees.
        std = np.std([tree.feature_importances_ for tree in self._extree.estimators_],
                     axis=0)
        indices = np.argsort(importances)[::-1]
        fname_array = np.array(fname_list)
        if not f_range:
            f_range = range(indices.shape[0])
        n_f = len(f_range)
        selected = indices[f_range]

        plt.figure()
        plt.title("Extra Tree Feature importances")
        plt.barh(range(n_f), importances[selected], color="b",
                 xerr=std[selected], ecolor='k', align="center")
        plt.yticks(range(n_f), fname_array[selected])
        plt.ylim([-1, n_f])
        plt.show()

    def list_feature_importance(self, fname_list, f_range=None, return_list=False):
        """Print the feature ranking and optionally return the ranked indices.

        Args:
            fname_list: Feature names, indexed like the training columns.
            f_range: Optional ranks (0 = most important) to list; all
                features when empty or omitted.
            return_list: When true, return the feature indices listed.

        Returns:
            list of feature indices when ``return_list`` is true, else None.
        """
        importances = self._extree.feature_importances_
        indices = np.argsort(importances)[::-1]
        print('Extra tree feature ranking:')
        if not f_range:
            f_range = range(indices.shape[0])
        for f in f_range:
            print('{0:d}. feature[{1:d}] {2:s} ({3:f})'.format(
                f + 1, indices[f], fname_list[indices[f]], importances[indices[f]]))
        if return_list:
            return [indices[f] for f in f_range]
def algorithm_ExtraTrees(X_train, Y_train, X_validation, Y_validation, seed=7):
    """Train a standardised extra-trees model and print its validation MSE.

    Args:
        X_train: Training features.
        Y_train: Training targets.
        X_validation: Validation features.
        Y_validation: Validation targets.
        seed: Random seed for the regressor. It was previously accepted but
            ignored, so repeated runs gave different scores.
    """
    # Train the model; the scaler is fitted on the training data only.
    scaler = StandardScaler().fit(X_train)
    rescaledX = scaler.transform(X_train)
    regressor = ExtraTreesRegressor(n_estimators=80, random_state=seed)
    regressor.fit(X=rescaledX, y=Y_train)

    # Evaluate the model on the validation data.
    rescaledX_validation = scaler.transform(X_validation)
    predictions = regressor.predict(rescaledX_validation)
    print(mean_squared_error(Y_validation, predictions))
def estimate():
    """Pick the best tree count by RMSLE and report the final validation cost.

    Splits the training set 60/20/20 into training, testing and validation
    parts, tries 2000 and 3000 estimators, keeps the count with the lowest
    RMSLE on the testing part, then refits with it and prints the RMSLE on
    the validation part.
    """
    from loadData import loadSets
    from helper import splitDataset, separateTargetFromTrain
    from sklearn.ensemble import ExtraTreesRegressor
    import numpy as np
    import math

    def _rmsle(predicted, actual):
        # Root mean squared logarithmic error.
        cost = pow(np.log(predicted + 1) - np.log(actual + 1), 2)
        return math.sqrt(np.mean(cost))

    # Start from infinity so the first candidate is always accepted. The old
    # sentinel of 2 left ``best_i`` at 0 when every score was >= 2, and the
    # final model was then built with ``n_estimators=0``.
    best_rmsle = float("inf")
    best_i = 0

    trainingSet, testingSet = loadSets()
    testingSet = None
    trainingData, testingData = splitDataset(trainingSet, 0.6)
    testingData, validationData = splitDataset(testingData, 0.5)
    trainingSet = None

    trainingTarget, trainingFeatures = separateTargetFromTrain(trainingData)
    testingTarget, testingFeatures = separateTargetFromTrain(testingData)
    validationTarget, validationFeatures = separateTargetFromTrain(validationData)
    testingTarget = testingTarget.values
    validationTarget = validationTarget.values

    # Drop references to the intermediate frames so they can be freed.
    trainingData = None
    testingData = None
    validationData = None

    for i in range(2000, 3001, 1000):
        model = ExtraTreesRegressor(n_estimators=i, n_jobs=-1)
        model.fit(trainingFeatures, trainingTarget)
        rmsle = _rmsle(model.predict(testingFeatures), testingTarget)
        print("%s  estimators:  %s" % (i, rmsle))
        if rmsle < best_rmsle:
            best_rmsle = rmsle
            best_i = i

    print("Best:  %s  estimators with rmsle:  %s" % (best_i, best_rmsle))

    model = ExtraTreesRegressor(n_estimators=best_i, n_jobs=-1)
    model.fit(trainingFeatures, trainingTarget)
    rmsle = _rmsle(model.predict(validationFeatures), validationTarget)
    print("Final model cost:  %s" % (rmsle,))
def dummie_columns_extra_trees(train, test):
    """Predict SOLDPRICE from every numeric column except the two price columns.

    Args:
        train: Training frame containing ``LISTPRICE`` and ``SOLDPRICE``.
        test: Test frame with the same columns.

    Returns:
        tuple: ``(score, predictions)`` where ``score`` is the estimator's
        ``score`` on the test frame.
    """
    from sklearn.ensemble import ExtraTreesRegressor

    print("-- {} --".format("Extremely Randomized Trees Regression using all but remarks"))

    predicting_columns = list(train._get_numeric_data().columns.values)
    # ``remove`` raises if a price column is missing, as before.
    for excluded in ("LISTPRICE", "SOLDPRICE"):
        predicting_columns.remove(excluded)

    regressor = ExtraTreesRegressor(n_estimators=300, n_jobs=-1)
    regressor.fit(train[predicting_columns], train["SOLDPRICE"])

    test_features = test[predicting_columns]
    score = regressor.score(test_features, test["SOLDPRICE"])
    predictions = regressor.predict(test_features)
    sample_predictions(test, predictions)
    print("Accuracy: {}\n".format(score))
    return score, predictions
def main():
    """Time an extra-trees fit on synthetic data and print the test score."""
    # X,Y = make_top_dataset(100000,30)
    X, Y = make_friedman1_random_attr(n_samples=100000, n_features=10)
    tX, tY = make_friedman1_random_attr(n_samples=100, n_features=10)

    start_time = time.time()
    # ``min_samples_split=1`` is rejected by current scikit-learn (integers
    # must be >= 2). A one-sample node can never be split, so 2 grows the
    # same trees.
    ext = ETRs(max_features=None, n_estimators=100, min_samples_split=2, n_jobs=-1)
    ext.fit(X, Y)
    elapsed_time = time.time() - start_time

    print(elapsed_time)
    print(score(ext, tX, tY))
def simple_extremely_random_trees(data_train_x, data_test_x, data_train_y, data_test_y):
    """Fit extra trees, then print the test score and a 5-fold summary.

    Args:
        data_train_x: Training features.
        data_test_x: Test features.
        data_train_y: Training targets.
        data_test_y: Test targets.
    """
    from sklearn.ensemble import ExtraTreesRegressor

    print("-- {} --".format("Extremely Randomized Trees Regression using all but remarks"))

    regressor = ExtraTreesRegressor(n_estimators=300, n_jobs=-1)
    regressor.fit(data_train_x, data_train_y)
    sample_predictions(regressor.predict(data_test_x), data_test_y)

    score = regressor.score(data_test_x, data_test_y)
    # NOTE(review): the cross-validation runs on the test split.
    cross_validated_scores = cross_val_score(regressor, data_test_x, data_test_y, cv=5)
    cv_mean = cross_validated_scores.mean()
    cv_spread = cross_validated_scores.std() * 1.96

    print("MSE Accuracy: {}".format(score))
    print("MSE Across 5 Folds: {}".format(cross_validated_scores))
    print("95%% Confidence Interval: %0.3f (+/- %0.3f)\n" % (cv_mean, cv_spread))
def main():
    """Run 10-fold cross-validation of extra trees on each of the 15 data sets."""
    for ind in range(1, 15+1):
        print("TrainingSet/ACT%d_competition_training.csv" % ind)

        # Read the data; the first column of every row is the target.
        cols, train = read_data("../TrainingSet/ACT%d_competition_training.csv" % ind)
        target = np.array([row[0] for row in train])
        train = np.array(filter_cols(train, cols, "../selected/selected_%d.txt" % ind))

        # Extra-trees regressor that considers a third of the columns per split.
        cfr = ExtraTreesRegressor(n_estimators=1000,
                                  max_features=(len(train[0])//3),
                                  n_jobs=8,
                                  random_state=1279)

        # Simple K-Fold cross validation. 10 folds.
        cv = cross_validation.KFold(len(train), k=10, indices=False, shuffle=True)

        # Fit on each training segment, score on the held-out segment.
        results = []
        for traincv, testcv in cv:
            score = cfr.fit(train[traincv], target[traincv]).score(train[testcv], target[testcv])
            results.append(score)
            print("\tFold %d: %f" % (len(results), score))

        # Mean of the cross-validated results.
        print("Results: " + str(np.array(results).mean()))
def main():
    """Fit extra trees per data set and write train and test predictions."""
    for ind in range(1, 15+1):
        selected_path = "../selected/cor9/selected_%d.txt" % ind
        print("TrainingSet/ACT%d_competition_training.csv" % ind)

        # Training data: the first column of every row is the target.
        cols, molecules1, train = read_data("../TrainingSet/ACT%d_competition_training.csv" % ind)
        target = np.array([row[0] for row in train])
        train = np.array(filter_cols(train, cols, selected_path))

        # seeds used: orig=1279, cor8=1278, cor9=1277
        cfr = ExtraTreesRegressor(n_estimators=2000,
                                  max_features=(len(train[0])//3),
                                  n_jobs=8,
                                  random_state=1277)
        rf = cfr.fit(train, target)

        # Predictions on the training rows.
        write_file("erStacking/cor9/er_stacking_%d.csv" % ind, molecules1, rf.predict(train))

        # Test data, filtered with the same column selection.
        cols, molecules2, test = read_data("../TestSet/ACT%d_competition_test.csv" % ind)
        test = np.array(filter_cols(test, cols, selected_path))
        write_file("erStacking/test/cor9/er_submission_%d.csv" % ind, molecules2, rf.predict(test))
def run():
    """Fit several regressors on the cycle-hire data and rank them.

    Returns:
        list of ``(technique name, evaluation result)`` tuples sorted by the
        evaluation result.
    """
    cycles = load_and_munge_training_data('train.csv')
    inputs = ['holiday', 'workingday', 'temp', 'atemp', 'humidity',
              'windspeed', 'month', 'hour']
    x_train, x_test, y_train, y_test = train_test_split(
        cycles[inputs], cycles['count'], test_size=0.25)

    scaler_x = StandardScaler().fit(x_train)
    scaler_y = StandardScaler().fit(y_train)
    x_train = scaler_x.transform(x_train)
    y_train = scaler_y.transform(y_train)
    x_test = scaler_x.transform(x_test)
    y_test = scaler_y.transform(y_test)

    # ``compute_importances`` was dropped from the extra-trees constructor:
    # scikit-learn no longer accepts it and always computes importances.
    candidates = [
        ('Linear - no penalty',
         linear_model.SGDRegressor(loss='squared_loss', penalty=None)),
        ('Linear - squared sums of the coefficients penalisation',
         linear_model.SGDRegressor(loss='squared_loss', penalty='l2')),
        ('SVR - linear', svm.SVR(kernel='linear')),
        ('SVR - poly', svm.SVR(kernel='poly')),
        ('SVR - RBF', svm.SVR(kernel='rbf')),
        ('Random forest', ExtraTreesRegressor(n_estimators=10)),
        ('Linear regression', LinearRegression()),
    ]

    techniques = {}
    for label, estimator in candidates:
        estimator.fit(x_train, y_train)
        # NOTE(review): every model is evaluated on the training split; the
        # scaled x_test / y_test are never used.
        techniques[label] = evaluate(estimator, x_train, y_train)

    # ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only.
    return sorted(techniques.items(), key=operator.itemgetter(1))
def predict_for(output, cycles, tests, raw_tests, inputs):
    """Train on ``cycles`` and predict ``output`` for every row of ``tests``.

    Args:
        output: Name of the target column in ``cycles``.
        cycles: Training frame containing ``inputs`` and ``output``.
        tests: Feature rows to predict, with the same columns as ``inputs``.
        raw_tests: Frame whose ``datetime`` column labels the rows of ``tests``.
        inputs: Feature column names.

    Returns:
        dict: datetime -> prediction rounded to an int.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        cycles[inputs], cycles[output], test_size=0.25, random_state=33)

    scaler_x = StandardScaler().fit(x_train)
    x_train = scaler_x.transform(x_train)
    x_test = scaler_x.transform(x_test)
    # The rows to predict must go through the scaler fitted on the training
    # data. Fitting a second scaler on ``tests`` itself put them on a
    # different scale from what the model was trained on.
    tests = scaler_x.transform(tests)

    # ``compute_importances`` is no longer accepted by scikit-learn.
    clf_et = ExtraTreesRegressor(n_estimators=10, random_state=42)
    clf_et.fit(x_train, y_train)
    ps = clf_et.predict(tests)
    return {dt: int(round(p)) for dt, p in zip(raw_tests['datetime'], ps)}
def buildModelOheETR(train_data, eval_data, train_labels, seed):
    """Fit extra trees on sparse one-hot data and predict the evaluation set.

    Args:
        train_data: Training features; converted to a CSR matrix.
        eval_data: Evaluation features; converted to a CSR matrix.
        train_labels: Training targets on the log1p scale.
        seed: Random seed for the regressor.

    Returns:
        tuple: ``(fitted regressor, predictions)``. The predictions are on
        the log1p scale, clipped so the back-transformed value is never
        negative.
    """
    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)

    clf = ExtraTreesRegressor(n_estimators=500, max_depth=38, min_samples_leaf=2,
                              min_samples_split=6, max_features='auto',
                              n_jobs=-1, random_state=seed, verbose=1)
    clf.fit(train_data, train_labels)

    preds = np.expm1(clf.predict(eval_data))
    # transform -ve preds to 0
    preds[preds < 0] = 0
    # convert back to log1p
    preds = np.log1p(preds)

    # Return the fitted estimator. The previous code returned the undefined
    # name ``model``.
    return (clf, preds)
def get_forest(X_names=Xs, y_names=ys, num_trees=256, data=data):
    """Fit a bootstrapped extra-trees forest and time the fit.

    Args:
        X_names: Feature column labels selected from ``data``.
        y_names: Target column labels selected from ``data``.
        num_trees: Number of trees in the forest.
        data: Frame holding both the feature and the target columns.

    Returns:
        tuple: ``(fitted forest, fit duration in seconds)``.
    """
    features = data.loc[:, list(X_names)]
    targets = data.loc[:, list(y_names)]
    forest = ExtraTreesRegressor(n_estimators=num_trees, n_jobs=62, bootstrap=True)

    started = time()
    fitted = forest.fit(features, targets)
    elapsed = time() - started
    return (fitted, elapsed)
def fit(self, X, Y):
    """Fit an extra-trees regressor and wrap it as a feature selector.

    The hyperparameters stored on ``self`` are coerced in place to the types
    the estimator takes. The fitted estimator is wrapped in a prefit
    ``SelectFromModel`` with a ``'mean'`` threshold and stored in
    ``self.preprocessor``.

    Returns:
        self
    """
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.feature_selection import SelectFromModel

    self.n_estimators = int(self.n_estimators)
    self.min_samples_leaf = int(self.min_samples_leaf)
    self.min_samples_split = int(self.min_samples_split)
    self.max_features = float(self.max_features)
    self.bootstrap = check_for_bool(self.bootstrap)
    self.n_jobs = int(self.n_jobs)
    self.verbose = int(self.verbose)
    self.max_leaf_nodes = (
        None if check_none(self.max_leaf_nodes) else int(self.max_leaf_nodes))
    self.max_depth = None if check_none(self.max_depth) else int(self.max_depth)
    self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

    n_columns = X.shape[1]
    # ``max_features`` scales a log-sized feature budget ...
    scaled_budget = int(self.max_features * (np.log(n_columns) + 1))
    # ... which is capped at half of the features and floored at one.
    feature_budget = max(1, min(int(n_columns / 2), scaled_budget))

    estimator = ExtraTreesRegressor(
        n_estimators=self.n_estimators,
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        bootstrap=self.bootstrap,
        max_features=feature_budget,
        max_leaf_nodes=self.max_leaf_nodes,
        oob_score=self.oob_score,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        random_state=self.random_state,
    )
    estimator.fit(X, Y)
    self.preprocessor = SelectFromModel(estimator=estimator,
                                        threshold='mean',
                                        prefit=True)
    return self
def fit(self, X, Y):
    """Fit an extra-trees regressor and wrap it as a feature selector.

    The fitted estimator is wrapped in a prefit ``SelectFromModel`` and
    stored in ``self.preprocessor``.

    Returns:
        self
    """
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.feature_selection import SelectFromModel

    n_columns = X.shape[1]
    # ``max_features`` scales a log-sized feature budget ...
    scaled_budget = int(float(self.max_features) * (np.log(n_columns) + 1))
    # ... which is capped at half of the features and floored at one.
    feature_budget = max(1, min(int(n_columns / 2), scaled_budget))

    settings = dict(
        n_estimators=self.n_estimators,
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        bootstrap=self.bootstrap,
        max_features=feature_budget,
        max_leaf_nodes=self.max_leaf_nodes,
        oob_score=self.oob_score,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        random_state=self.random_state,
    )
    forest = ExtraTreesRegressor(**settings)
    forest.fit(X, Y)
    self.preprocessor = SelectFromModel(forest, prefit=True)
    return self
def predict(class_id):
    """Train on the rows of one salary class and predict its validation rows.

    Args:
        class_id: Encoded class to select in ``salaries_enc`` and
            ``valid_salaries_enc``.

    Returns:
        tuple: ``(predictions, validation index tuple)``, or ``([], None)``
        when either the training or the validation selection is empty.
    """
    print("predicting:  %s" % (class_id,))
    salaries_idx = np.where(salaries_enc == class_id)
    valid_idx = np.where(valid_salaries_enc == class_id)
    train_rows = salaries_idx[0]
    valid_rows = valid_idx[0]
    if not len(train_rows) or not len(valid_rows):
        return [], None

    classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                     verbose=0,
                                     n_jobs=4,  # 2 jobs on submission / 4 on valid test
                                     oob_score=False,
                                     min_samples_split=min_samples_split,
                                     random_state=3465343)
    class_features = features[train_rows, :]
    class_salaries = salaries[salaries_idx]
    print(class_features.shape)
    print(class_salaries.shape)
    classifier.fit(class_features, class_salaries)
    predictions_part = classifier.predict(validation_features[valid_rows])
    return predictions_part, valid_idx
def get_result():
    """Train extra trees on comment n-grams and write the submission CSV.

    Vectorises ``rats_tr.comments`` with 1-2 gram counts, fits the regressor
    against ``rats_tr.quality``, predicts ``rats_te`` and writes ``id`` and
    ``quality`` to ``ridge_submit.csv``.
    """
    ngram_range = (1, 2)
    max_df = 0.75
    max_features = 2000
    v = CountVectorizer(ngram_range=ngram_range, max_df=max_df,
                        max_features=max_features)
    # ``toarray()`` gives a plain ndarray. ``todense()`` returns ``np.matrix``,
    # which current scikit-learn estimators refuse.
    x = v.fit_transform(rats_tr.comments.fillna('')).toarray()
    y = rats_tr.quality

    n_estimators = 40
    max_depth = 20
    clf = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth,
                              random_state=0)
    clf.fit(x, y)

    t_x = v.transform(rats_te.comments.fillna('')).toarray()
    t_y = clf.predict(t_x)
    submit = pd.DataFrame(data={'id': rats_te.id, 'quality': t_y})
    submit.to_csv('ridge_submit.csv', index=False)
class ModelERT:
    """Extra-trees model bound to one model set and one cross-validation fold."""

    def __init__(self, model_set_name, i_fold):
        """Remember which model set and fold this model belongs to."""
        self.model_set_name = model_set_name
        self.i_fold = i_fold

    def set_params(self, prms):
        """Store the parameter mapping read by ``train``."""
        self.prms = prms

    def set_data(self, labels_tr, labels_te, data_tr, data_te):
        """Store the train/test labels and the train/test feature data."""
        self.labels_tr, self.labels_te = labels_tr, labels_te
        self.data_tr, self.data_te = data_tr, data_te

    def train(self):
        """Build the regressor from ``self.prms`` and fit it on the training data."""
        print("start ert")
        prms = self.prms
        settings = dict(
            n_jobs=prms["n_jobs"],
            verbose=1,
            random_state=prms["random_state"],
            n_estimators=int(prms["n_estimators"]),
            max_features=prms["max_features"],
        )
        self.model = ExtraTreesRegressor(**settings)
        self.model.fit(self.data_tr.values, self.labels_tr)

    def predict(self):
        """Return predictions for the test data."""
        return self.model.predict(self.data_te.values)

    def predict_train(self):
        """Return predictions for the training data."""
        return self.model.predict(self.data_tr.values)

    def dump_model(self):
        """Do nothing; the model itself is not persisted."""
        return None

    def dump_pred(self, pred, name):
        """Write ``pred`` with joblib to the configured path for this fold."""
        Files.mkdir(config.get_model_folder(self.model_set_name, self.i_fold))
        target = config.get_model_path(self.model_set_name, name, self.i_fold)
        joblib.dump(pred, target)
def predict(class_id, param):
    """Train on one feature category and predict its validation rows.

    Args:
        class_id: Encoded category to select in ``feature_category`` and
            ``validation_features_category``.
        param: Log text; a description of this run is appended to it.

    Returns:
        tuple: ``(predictions, validation index tuple, param)``, or
        ``([], None, param)`` when either selection is empty.
    """
    print("predicting:  %s" % (class_id,))
    param += "\npredicting: %s\n" % (le_features[col_index].classes_[class_id],)
    salaries_idx = np.where(feature_category == class_id)
    valid_idx = np.where(validation_features_category == class_id)
    train_rows = salaries_idx[0]
    valid_rows = valid_idx[0]
    param += "Salaries len: %d, valid len: %d\n" % (len(train_rows), len(valid_rows))
    if not len(train_rows) or not len(valid_rows):
        return [], None, param

    classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                     verbose=0,
                                     n_jobs=4,  # 2 jobs on submission / 4 on valid test
                                     oob_score=False,
                                     min_samples_split=min_samples_split,
                                     random_state=3465343)
    class_features = features[train_rows, :]
    class_salaries = salaries[salaries_idx]
    print(class_features.shape)
    print(class_salaries.shape)
    print(validation_features[0].shape)
    classifier.fit(class_features, class_salaries)
    predictions_part = classifier.predict(validation_features[valid_rows])
    return predictions_part, valid_idx, param
def load_model():
    """Train an extra-trees model from the text data files and pickle it.

    Reads comma-separated feature rows from ``bpinall.txt`` and one target
    per line from ``bpoutall.txt``, fits on every row but the last, predicts
    the held-back last row and writes the fitted model to ``modelb.pkl``.
    Per the original note, this is meant to run once when the service starts.

    Returns:
        The prediction for the last row, as returned by ``predict``.
    """
    # Context managers close the files; the handles used to be leaked.
    with open('bpinall.txt', 'r') as handle:
        feature_lines = handle.readlines()
    num_rows = len(feature_lines)
    num_col = len(feature_lines[0].split(','))
    x = np.zeros((num_rows, num_col), dtype=float)
    y = np.zeros((num_rows), dtype=float)
    for i, line in enumerate(feature_lines):
        line = line.strip('\r\n').strip()
        # Lines without a comma are left as an all-zero row.
        if line.count(',') > 0:
            x[i] = [float(p) for p in line.split(',')]

    with open('bpoutall.txt', 'r') as handle:
        for i, line in enumerate(handle.readlines()):
            y[i] = float(line.strip('\r\n'))

    clf = ExtraTreesRegressor(verbose=0)
    print (x)
    clf.fit(x[:-1], y[:-1])
    # ``predict`` needs a 2-D array, so slice the last row instead of
    # indexing it; a 1-D sample is rejected by current scikit-learn.
    pq = clf.predict(x[-1:])
    print (pq, y[-1])

    with open('modelb.pkl', 'wb') as handle:
        pickle.dump(clf, handle)
    return pq
# ``sklearn.cross_validation`` and ``sklearn.grid_search`` were removed from
# scikit-learn; ``sklearn.model_selection`` provides the same two helpers.
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# Hold out 10% of the data for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Random forest. S2 is the score on the training split.
reg_rfr = RandomForestRegressor(max_depth=19)
reg_rfr.fit(X_train, y_train)
y_pred1 = reg_rfr.predict(X_test)
S2 = reg_rfr.score(X_train, y_train)

# Extra trees. S1 is the score on the training split.
reg_etr = ExtraTreesRegressor(max_depth=20)
reg_etr.fit(X_train, y_train)
y_pred2 = reg_etr.predict(X_test)
S1 = reg_etr.score(X_train, y_train)

# Support vector regression. S is the score on the training split.
reg_svr = SVR()
reg_svr.fit(X_train, y_train)
y_pred3 = reg_svr.predict(X_test)
S = reg_svr.score(X_train, y_train)

# 10-fold grid search over the extra-trees depth, 1 to 20.
parameters = [{'max_depth': np.arange(1, 21)}]
CV = GridSearchCV(estimator=reg_etr, param_grid=parameters, cv=10)
CV.fit(X_train, y_train)
CV_score = CV.score(X_train, y_train)
best_score = CV.best_score_
'Total_Stops', 'Journey_day', 'Journey_month', 'Dep_hour', 'Dep_min', 'Arrival_hour', 'Arrival_min', 'Duration_hours', 'Duration_mins' ] x3 = x2.loc[:, a] x3 = pd.concat([x3, y2], axis=1) # Finds correlation between Independent and dependent attributes plt.figure(figsize=(18, 18)) sns.heatmap(x3.corr(), annot=True, cmap="RdYlGn") plt.show() from sklearn.ensemble import ExtraTreesRegressor selection = ExtraTreesRegressor(random_state=0) selection.fit(x2, y2) ############## print(selection.feature_importances_) ######## plt.figure(figsize=(12, 8)) feat_importances = pd.Series(selection.feature_importances_, index=x2.columns) feat_importances.nlargest(20).plot(kind='barh') feat_importances.nlargest(20).index plt.show() feature = [ 'Total_Stops', 'Journey_day', 'Journey_month', 'Dep_hour', 'Dep_min', 'Arrival_hour', 'Arrival_min', 'Duration_hours', 'Duration_mins', 'Airline_Air India', 'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business', 'Airline_Multiple carriers',
# Drop the two year columns in place, then one-hot encode the remaining
# categorical columns (dropping the first level of each).
final_dataset.drop(['Year'], axis=1, inplace=True)
final_dataset.drop(['Current_Year'], axis=1, inplace=True)
final_dataset = pd.get_dummies(final_dataset, drop_first=True)
#print(final_dataset.head(10))

# Correlation heatmap over every column of the encoded frame.
corrmat = final_dataset.corr()
top_corr_fetures = corrmat.index
#plt.figure(figsize=(20,20))
g = snb.heatmap(final_dataset[top_corr_fetures].corr(), annot=True, cmap="RdYlGn")
#plt.show()

# The first column is the target; every other column is a feature.
X = final_dataset.iloc[:,1:]
Y = final_dataset.iloc[:,0]

# Extra-trees fit on the full data; its feature importances can be inspected.
model = ExtraTreesRegressor()
model.fit(X,Y)
#print(model.feature_importances_)

# Candidate hyperparameter values collected into ``random_grid``.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# NOTE(review): recent scikit-learn releases no longer accept 'auto' for
# max_features - confirm against the installed version.
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]
random_grid = {'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}

# Hold out 20% of the rows for testing.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)
index=False) preds_RF_py = np.exp(clf_RF.predict(pte[feature_names])) - 1 RF_py_sub = pd.DataFrame({'Id': ID.Id, 'Sales': preds_RF_py}) RF_py_sub.to_csv("F:/Kaggle/Rossman/Blends/Stacking/RF_subs.csv", index=False) # Extreemly Randomized Trees # reg_ET = ExtraTreesRegressor(n_estimators=1000, max_features=0.75, max_depth=8, min_samples_split=12, n_jobs=-1, random_state=737, verbose=2) reg_ET = reg_ET.fit(x_train, y_train) preds_h = reg_ET.predict(pth[feature_names]) ET_holdout = pd.DataFrame({ 'Date': pth.Date, 'Dow': pth.DayOfWeek, 'Actual': np.exp(pth.Sales) - 1, 'Predicted': np.exp(preds_h) - 1 }) ET_holdout.to_csv("F:/Kaggle/Rossman/Blends/Stacking/ET_holdout.csv", index=False) preds_ET = np.exp(reg_ET.predict(pte[feature_names])) - 1 ET_sub = pd.DataFrame({'Id': ID.Id, 'Sales': preds_ET}) ET_sub.to_csv("F:/Kaggle/Rossman/Blends/Stacking/ET_subs.csv", index=False)
# One extra-trees configuration, refitted for each of the seven target columns.
# ``bootstrap`` is a real bool: current scikit-learn rejects the integer 0.
ET = ExtraTreesRegressor(n_estimators=1200,
                         random_state=1,
                         n_jobs=-1,
                         min_samples_split=2,
                         min_samples_leaf=2,
                         max_depth=20,
                         max_features='sqrt',
                         bootstrap=False)

pre = DataFrame()
for i in range(7):
    # ``iloc[:, i]`` selects a column by position; ``icol`` was removed from
    # pandas.
    ET.fit(train_x, train_y.iloc[:, i].values)
    pre['col_' + str(i)] = (ET.predict(test_x)).round()
    tmp_score = calculate_score(pre.iloc[:, i].values, test_y.iloc[:, i].values)
    print("%s:  %s" % (i, tmp_score))

# Score over all target columns together.
score = calculate_score(pre.values, test_y.values)
print(score)
#draw_feature_importance(train_x,ET)
if not submission: valid_salaries = dio.get_salaries(type_v, log=True) print salaries.shape #a=5/0 for n_trees in [40]: name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl2_new_log" % (min_samples_split, n_trees) print name classifier = ExtraTreesRegressor(n_estimators=n_trees, verbose=2, n_jobs=2, # 2 jobs on submission / 4 on valid test oob_score=False, min_samples_split=min_samples_split, random_state=3465343) classifier.fit(features, salaries) predictions = classifier.predict(validation_features) if submission: dio.save_prediction(name, predictions, type_n=type_v) #dio.write_submission(name + ".csv", predictions=predictions) else: dio.compare_valid_pred(valid_salaries, predictions) metric = dio.error_metric mae = metric(valid_salaries, predictions) print "MAE validation: ", mae dio.save_model(classifier, name, mae) dio.save_prediction(name, predictions, type_n=type_v) #oob_predictions = classifier.oob_prediction_ #mae_oob = mean_absolute_error(salaries, oob_predictions) #print "MAE OOB: ", mae_oob classifier1 = ExtraTreesRegressor(n_estimators=n_trees,
def feature_importance(self, xg_boost=True, extra_trees=False):
    """
    function that displays feature importance using XG-Boost and Extra Trees

    Note: This function performs analysis using X and y

    * xg_boost=True, extra_trees=False: will perform feature importance using XG Boost only
    * xg_boost=False, extra_trees=True: will perform feature importance using Extra Trees only
    * xg_boost=True, extra_trees=True: will perform feature importance using both XG Boost and Extra Trees
    * xg_boost=False, extra_trees=False: Nothing will happen. Avoid this if you want to use feature selection.

    Each enabled method prints the features sorted by importance and saves a
    plot inside ``self.output_folder``, creating the folder if needed.
    """
    output_folder = self.output_folder
    feature_names = self.feature_names
    X = self.X_df
    y = self.y_df

    if xg_boost:
        print('\n********** Method 4: Calculating the feature importance using XGBoost. **********\n')
        # feature importance using XGBoost
        housing_dmatrix = xgb.DMatrix(X, y, feature_names=feature_names)

        # Create the parameter dictionary: params
        params = {"objective": "reg:squarederror", "max_depth": "4"}

        # Train the model: xg_reg
        xg_reg = xgb.train(dtrain=housing_dmatrix, params=params, num_boost_round=10)

        feature_imp = dict(
            sorted(xg_reg.get_score(importance_type='weight').items(),
                   key=lambda kv: kv[1], reverse=True))
        print('\nFeatures - Importance\n')
        for key, value in feature_imp.items():
            print('%s: %.5f' % (key, value))
        print('\n')

        # Plot the feature importances
        xgb.plot_importance(xg_reg)
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        fig = plt.gcf()
        fig.set_size_inches(15, 10.5)
        plt.title('XGBoost Feature Importance')
        # Join the path so the file lands inside the folder even when
        # ``output_folder`` has no trailing separator.
        fig.savefig(os.path.join(output_folder, 'xgb_fs'), dpi=100)
        plt.close()
        print('saved plot in {}/{}'.format(output_folder, 'xgb_fs'))

    if extra_trees:
        print('\n********** Method 5: Calculating the feature importance using Extra Trees. **********\n')
        model = ExtraTreesRegressor(n_estimators=100, random_state=42)
        model.fit(X, y)

        # Map each feature name to its importance, then order by importance.
        feature_imp = {}
        for i, importance in enumerate(model.feature_importances_):
            feature_imp[feature_names[i]] = importance
        feature_imp = dict(sorted(feature_imp.items(), key=lambda kv: kv[1], reverse=True))
        print('\nFeatures - Importance\n')
        for key, value in feature_imp.items():
            print('%s: %.5f' % (key, value))
        print('\n')

        # plot graph of feature importances for better visualization
        feat_importances = pd.Series(model.feature_importances_, index=X.columns)
        feat_importances.nlargest(20).plot(kind='barh')
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        fig = plt.gcf()
        fig.set_size_inches(15, 10.5)
        plt.title('Extra Trees Feature Importance')
        fig.savefig(os.path.join(output_folder, 'extratrees_fs.png'), dpi=100)
        plt.close()
        print('saved plot in {}/{}'.format(output_folder, 'extratrees_fs.png'))
# K-nearest neighbours, uniform weighting.
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, y_train)
uni_y_predict = uni_knr.predict(X_test)
print("K近邻(平均回归)性能评估:", uni_knr.score(X_test, y_test))

# K-nearest neighbours, distance weighting.
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_y_predict = dis_knr.predict(X_test)
print("K近邻(距离加权回归)性能评估:", dis_knr.score(X_test, y_test))

# Single regression tree.
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_y_predict = dtr.predict(X_test)
print("单一回归树性能评估:", dtr.score(X_test, y_test))

# Random forest. The predictions get their own name so ``rfr`` stays the
# fitted model; overwriting it was what made the score line unusable.
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr_y_predict = rfr.predict(X_test)
print("随机森林性能评估:", rfr.score(X_test, y_test))

# Extra trees.
etr = ExtraTreesRegressor()
etr.fit(X_train, y_train)
etr_y_predict = etr.predict(X_test)
print("极端随机森林性能评估:", etr.score(X_test, y_test))

# Gradient boosting.
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr_y_predict = gbr.predict(X_test)
print("梯度提升性能评估:", gbr.score(X_test, y_test))
# One estimator object, refitted from scratch for every fold.
model = ExtraTreesRegressor(n_estimators=100, max_features=0.7, max_depth=10)
for i in folds_item_ids.keys():
    # Determine train and val folds
    # Rows are assigned to a fold by item_id; item_id itself is not a feature.
    fit_mask = X_train['item_id'].isin(folds_item_ids[i]['fit'])
    val_mask = X_train['item_id'].isin(folds_item_ids[i]['val'])
    X_fit = X_train[fit_mask].drop('item_id', axis='columns')
    y_fit = y_train[fit_mask]
    X_val = X_train[val_mask].drop('item_id', axis='columns')
    y_val = y_train[val_mask]
    # trick for ram saving
    # (fitting on float32 copies of the features and the target)
    model.fit(X_fit.astype(dtype='float32'), y_fit.astype(dtype='float32'))
    fit_predict = model.predict(X_fit)
    val_predict = model.predict(X_val)
    test_predict = model.predict(X_test)
    fit_scores.append(rmse(y_fit, fit_predict))
    val_scores.append(rmse(y_val, val_predict))
    # NOTE(review): the fold predictions are multiplied into the submission
    # column; presumably a root is taken later to form a geometric mean -
    # confirm.
    sub['deal_probability'] *= test_predict
    # Save out-of-fold predictions
    name = 'folds/extra_tree_val_{}.csv'.format(i)
    pd.Series(val_predict).to_csv(name, index=False)
    # Save test predictions
    name = 'folds/extra_tree_test_{}.csv'.format(i)
    pd.Series(test_predict).to_csv(name, index=False)
Bagging = BaggingRegressor() Bagging.fit(combined_train, Y_train) Bagging_predict_train = Bagging.predict(combined_train) Bagging_predict_test = Bagging.predict(combined_test) print("Root mean squared error for train: %.2f" % math.sqrt(mean_squared_error(Y_train, Bagging_predict_train))) #Root mean squared error for train 369.99 print("Root mean squared error for test: %.2f" % math.sqrt(mean_squared_error(Y_test, Bagging_predict_test))) #Root mean squared error for test: 875.30 #16th model, ExtraTrees regression from sklearn.ensemble import ExtraTreesRegressor ExtraTrees = ExtraTreesRegressor() ExtraTrees.fit(combined_train, Y_train) ExtraTrees_predict_train = ExtraTrees.predict(combined_train) ExtraTrees_predict_test = ExtraTrees.predict(combined_test) print("Root mean squared error for train: %.2f" % math.sqrt(mean_squared_error(Y_train, ExtraTrees_predict_train))) #Root mean squared error for train 2.99 print("Root mean squared error for test: %.2f" % math.sqrt(mean_squared_error(Y_test, ExtraTrees_predict_test))) #Root mean squared error for test: 885.29 ''' External Weather API call: WeatherStartLoc_StartTime, WeatherEndLoc_StartTime, Other ideas to consider: - driver age, years of driving experience, years of driving experience in current city, avg driving speed-highway/local, driver ratings, #cars for this driver
def main(): ### parsing and Data pre-processing # load the provided data train_features_path = os.path.join(data_path, 'dengue_features_train.csv') train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv') ### pre-processing data sj_train, iq_train = preprocess_data(train_features_path, labels_path=train_labels_path) #print(sj_train.describe()) #print(iq_train.describe()) kf = KFold(n_splits=6) sj_model_list = [] sj_err_list = [] loop = 1 for train_index, val_index in kf.split( sj_train ): #The index will be split into [train_index] and [val_index] X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index] sj_etr = ETR(n_estimators=800, max_depth=4, random_state=0, verbose=1) sj_etr.fit(X_train.drop('total_cases', axis=1), X_train['total_cases']) predictions = sj_etr.predict(X_val.drop('total_cases', axis=1)) sj_err_list.append( eval_measures.meanabs(predictions, X_val.total_cases)) sj_model_list.append(sj_etr) loop += 1 print(sj_err_list) argmax = sorted(range(len(sj_err_list)), key=lambda x: sj_err_list[x])[0] print(argmax) sj_best_model = sj_model_list[argmax] iq_model_list = [] iq_err_list = [] loop = 1 for train_index, val_index in kf.split(iq_train): X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index] iq_etr = ETR(n_estimators=400, max_depth=4, random_state=0) iq_etr.fit(X_train.drop('total_cases', axis=1), X_train['total_cases']) predictions = iq_etr.predict(X_val.drop('total_cases', axis=1)) iq_err_list.append( eval_measures.meanabs(predictions, X_val.total_cases)) iq_model_list.append(iq_etr) loop += 1 print(iq_err_list) argmax = sorted(range(len(iq_err_list)), key=lambda x: iq_err_list[x])[0] print(argmax) iq_best_model = iq_model_list[argmax] ##Accessing testing data test_features_path = os.path.join(data_path, 'dengue_features_test.csv') sj_test, iq_test = preprocess_data(test_features_path) #Calculate the k-fold validation error sj_score = [] for train_index, val_index in kf.split(sj_train): X_train, X_val = 
sj_train.ix[train_index], sj_train.ix[val_index] train_predict = np.array( sj_best_model.predict(X_val.drop('total_cases', axis=1))).astype(int) sj_score.append(eval_measures.meanabs(train_predict, X_val.total_cases)) print("Mean of {} cross validation of sj_score is {} (+/- {})".format( kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score))) iq_score = [] for train_index, val_index in kf.split(iq_train): X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index] train_predict = np.array( iq_best_model.predict(X_val.drop('total_cases', axis=1))).astype(int) iq_score.append(eval_measures.meanabs(train_predict, X_val.total_cases)) print("Mean of {} cross validation of iq_score is {} (+/- {})".format( kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score))) ##Use the model sj_lr and iq_lr trained before to predict the testing data print("Predicting testing data...") sj_predictions = sj_best_model.predict(sj_test) iq_predictions = iq_best_model.predict(iq_test) sj_predictions = np.array(sj_predictions).astype(int) iq_predictions = np.array(iq_predictions).astype(int) print("Creating submit file...") ##Use submission_format as template to write the answer sample_path = os.path.join(data_path, 'submission_format.csv') submission = pd.read_csv(sample_path, index_col=[0, 1, 2]) submission.total_cases = np.concatenate([sj_predictions, iq_predictions]) submission.to_csv("./data/ext_new.csv") '''
# Report the test-set metrics computed earlier in the file.
print("--------------------------------------")
print('MAE is {}'.format(test_score_mae))
print('MSE is {}'.format(test_score_mse))
print('EVS is {}'.format(test_score_evs))
print('ME is {}'.format(test_score_me))
print('R2 score is {}'.format(test_score_r2))
print()
print("Best parameters set found on development set:")
print(gs.best_params_)
print()

# Re-train with best parameters
regr = ExtraTreesRegressor(**gs.best_params_)
# Wall-clock time of the fit.
t0 = time.time()
regr.fit(x_train, y_train.ravel())
regr_fit = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit)

# Wall-clock time of predicting the whole test set.
t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict))

# Write the timings to output.log; mode 'w' overwrites any previous log.
with open('output.log', 'w') as f:
    print("Training time: %.6f s" % regr_fit, file=f)
    print("Prediction time: %.6f s" % regr_predict, file=f)
    print(" ", file=f)
    print("The model performance for training set", file=f)
    print("--------------------------------------", file=f)
# # # pred = pd.DataFrame(pred.reshape(-1,2000).T) # real = pd.DataFrame(test_Y.reshape(-1,2000).T) # score = np.sum(np.sum(np.abs((np.round(pred)-real)/(np.round(pred)+real))))/(2000*7) # print score ####出答案 train_Y = pd.read_csv('train_label.csv', index_col=False, header=None).values print train_Y.shape train_X = pd.read_csv('train_feature.csv', index_col=False).values print train_X.shape test_X = pd.read_csv('test_feature.csv', index_col=False).values print test_X.shape model = ExtraTreesRegressor(n_estimators=1000, random_state=1, n_jobs=-1, min_samples_split=3, min_samples_leaf=1, max_depth=100) model.fit(train_X, train_Y) pred = model.predict(test_X) pred = np.round(pred).reshape((-1,2000)).T answer = np.zeros([2000, 15]) answer[:,0] = range(1,2001) answer[:,1:] = pred pd.DataFrame(answer, dtype=int).to_csv('ExtRandomTree_n1000_pred.csv', header=None, index=False)
class ForestEmbeddingsCounterfactual:
    """
    Counterfactual estimation using forest embeddings.

    Given explanatory variables X, target variable y and treatment variable W,
    this class implements an individual counterfactual estimation model.
    We can break down the process in four steps:

    1 - model step) Fit and validate an ensemble of trees (ET, RF, etc) from X to y
    2 - embedding step) Build a supervised embedding using forest's trees leaves
    3 - kNN step) For each sample, find K nearest neighbors in this new space
    4 - comparison step) Compare W and y for each of the neighborhoods to
        determine the counterfactuals for each sample

    Parameters
    ----------
    model : object, optional (default=None)
        Forest-based model which implements sklearn's API, particularly
        the .apply() method. Must be already configured. Classification
        and regression models accepted.
        If None, model will be ExtraTreesRegressor(n_estimators=1000,
        min_samples_leaf=5, bootstrap=True, n_jobs=-1).

    n_neighbors : int, optional (default=200)
        Number of neighbors to be considered at the kNN step.
        There's a bias-variance tradeoff here: set n_neighbors too low,
        estimates will be volatile and unreliable. Set n_neighbors too high,
        and the estimate will be biased (neighbors won't be comparable).

    min_sample_effect : int, optional (default=10)
        The minimum number of samples in a neighborhood for the counterfactual
        estimate to be valid, for a given W. If there's less treated/untreated
        elements than min_sample_effect in a neighborhood, the counterfactual
        will be NaN.

    save_explanatory : bool, optional (default=False)
        Save explanatory variables for explaining predictions.
        May cause large memory overhead.

    random_state : int, optional (default=None)
        Seed handed to the shuffled KFold splitter used by
        `get_cross_val_scores`. It is not forwarded to the forest model or to
        the nearest-neighbor index; configure those on the objects themselves.
    """

    # initializing
    def __init__(self,
                 model=None,
                 n_neighbors=200,
                 min_sample_effect=10,
                 save_explanatory=False,
                 random_state=None):

        # storing model; identity check, because estimators may overload ==
        if model is None:
            self.model = ExtraTreesRegressor(n_estimators=1000,
                                             min_samples_leaf=5,
                                             bootstrap=True,
                                             n_jobs=-1)
        else:
            self.model = model

        # storing variables
        self.n_neighbors = int(n_neighbors)
        self.min_sample_effect = int(min_sample_effect)
        self.save_explanatory = save_explanatory
        self.random_state = random_state

    # method for computing embedding
    def _get_forest_embed(self, X):
        """
        Wrapper for extracting embeddings from forests given selected mode.
        Model must be fitted.
        """

        # applying the model to get leaves
        this_embed = self.model.apply(X)

        # returning forest embedding
        return this_embed

    # fit model and neighbors
    def fit(self, X, W, y, verbose=0):
        """
        Fit a counterfactual estimation model given explanatory variables X,
        treatment variable W and target y.

        This method fits a forest-based model, extracts a supervised embedding
        from its leaves, and builds a nearest neighbor index on the embedding.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Data with explanatory variables, with possible confounders of
            treatment assignment and effect. Must be a pandas DataFrame when
            save_explanatory=True (``X.assign`` is called on it).

        W : array-like, shape = [n_samples]
            Treatment variable. The model will try to estimate a counterfactual
            outcome for each unique value in this variable.
            Should not exceed 10 values.

        y : array-like, shape = [n_samples]
            Target variable.

        verbose : int, optional (default=0)
            Verbosity level. Currently unused by this method.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If W has more than 10 unique values.
        """

        # checking if W has too many unique values
        if len(np.unique(W)) > 10:
            raise ValueError(
                'More than 10 unique values for W. Too many unique values will make the process very expensive.'
            )

        # fitting the model
        self.model.fit(X, y)

        # getting forest embedding from model
        self.train_embed_ = self._get_forest_embed(X)

        # create neighbor index
        self.nn_index = NNDescent(self.train_embed_, metric='hamming')

        # creating a df with treatment assignments and outcomes.
        # Neighbor ids returned by the index are row positions, so y and W are
        # stored positionally; passing pandas objects directly would align them
        # on their own index labels instead.
        self.train_outcome_df = pd.DataFrame({
            'neighbor': range(X.shape[0]),
            'y': np.asarray(y),
            'W': np.asarray(W)
        })

        # saving explanatory variables
        if self.save_explanatory:
            self.X_train = X.assign(W=W, y=y)

        # return self
        return self

    # method for predicting counterfactuals
    def predict(self, X, verbose=0):
        """
        Predict counterfactual outcomes for X.

        This method will search the nearest neighbor index built using .fit(),
        and estimate counterfactual outcomes using kNN.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Data with explanatory variables, with possible confounders of
            treatment assignment and effect.

        verbose : int, optional (default=0)
            Verbosity level. Currently unused by this method.

        Returns
        -------
        counterfactual_df : pd.DataFrame
            Counterfactual outcomes per sample: one row per sample id and one
            'y_hat' column per value of W.
        """

        # getting forest embedding from model
        X_embed_ = self._get_forest_embed(X)

        # getting nearest neighbors and distances from index
        neighs, dists = self.nn_index.query(X_embed_, k=self.n_neighbors + 1)

        # creating a df for neighbor ids (long format: one row per sample/rank)
        neighs_df = (pd.DataFrame(neighs).reset_index().melt(
            id_vars='index').rename(columns={
                'index': 'id',
                'value': 'neighbor'
            }).reset_index(drop=True))

        # creating a df for the similarities (1 - hamming distance)
        similarities_df = (pd.DataFrame(1 - dists).reset_index().melt(
            id_vars='index').rename(columns={
                'index': 'id',
                'value': 'weight'
            }).reset_index(drop=True))

        # joining the datasets and adding weighted y variable;
        # the first merge joins on the shared 'id' and 'variable' columns
        nearest_neighs_df = (neighs_df.merge(similarities_df).drop(
            'variable', axis=1).merge(
                self.train_outcome_df, on='neighbor', how='left').assign(
                    y_weighted=lambda x: x.y * (x.weight)).sort_values('id'))

        # processing to get the effects
        counterfactual_df = nearest_neighs_df.assign(count=1).groupby(
            ['id', 'W']).sum()

        # similarity-weighted mean outcome per (sample, treatment) group
        counterfactual_df['y_hat'] = counterfactual_df[
            'y_weighted'] / counterfactual_df['weight']

        # groups with too few neighbors are not trusted
        counterfactual_df.loc[
            counterfactual_df['count'] < self.min_sample_effect,
            'y_hat'] = np.nan
        counterfactual_df = counterfactual_df.pivot_table(values=['y_hat'],
                                                          columns='W',
                                                          index='id')

        # returning counterfactual df
        return counterfactual_df

    # running CV for model parameters
    def get_cross_val_scores(self, X, y, scoring=None, verbose=0):
        """
        Estimate model generalization power with 5-fold CV.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Data with explanatory variables, with possible confounders of
            treatment assignment and effect.

        y : array-like, shape = [n_samples]
            Target variable.

        scoring : string, callable or None, optional, default: None
            Scoring method for sklearn's cross_val_score function:
            A string (see model evaluation documentation) or a scorer callable
            object / function with signature ``scorer(estimator, X, y)`` which
            should return only a single value.
            Similar to :func:`cross_validate` but only a single metric is
            permitted. If None, the estimator's default scorer (if available)
            is used.

        verbose : int, optional (default=0)
            Verbosity level for sklearn's function cross_val_score.

        Returns
        -------
        scores : array of float, shape=(len(list(cv)),)
            Array of scores of the estimator for each run of the cross
            validation.
        """

        # CV method
        kf = KFold(n_splits=5, shuffle=True, random_state=self.random_state)

        # generating validation predictions
        scores = cross_val_score(self.model,
                                 X,
                                 y,
                                 cv=kf,
                                 scoring=scoring,
                                 verbose=verbose)

        # calculating result
        return scores

    # generating manifold with UMAP
    def get_umap_embedding(self, X, verbose=0):
        """
        Compute a 2D manifold from the forest embedding for validation and
        criticism.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Data with explanatory variables, with possible confounders of
            treatment assignment and effect.

        verbose : int, optional (default=0)
            Verbosity level for UMAP.

        Returns
        -------
        reduced_embed : array of shape = [n_samples, 2]
            2D representation of forest embedding using UMAP.
        """

        # getting forest embedding from model
        X_embed_ = self._get_forest_embed(X)

        # reducing embedding to 2 dimensions
        reduced_embed = (UMAP(metric='hamming',
                              verbose=verbose).fit_transform(X_embed_))

        # returning
        return reduced_embed

    # method for explaining predictions
    def explain(self, sample):
        """
        Explain predictions of counterfactual outcomes for one sample.

        This method shows diagnostics and comparables so you can trust and
        explain counterfactual predictions to others.

        Parameters
        ----------
        sample : array-like or sparse matrix of shape = [1, n_features]
            Sample that you want to get explanations for

        Returns
        -------
        comparables_table : pd.DataFrame
            Table of comparable elements.

        Raises
        ------
        ValueError
            If the model was created with save_explanatory=False.
        """

        # fail fast: without the stored training frame there is nothing to
        # return, so skip the embedding and neighbor search entirely
        if not self.save_explanatory:
            raise ValueError(
                'Model did not store training samples to get explanations from. Setting save_explanatory=True will solve the issue'
            )

        # getting forest embedding from model
        sample_embed = self._get_forest_embed(sample)

        # getting nearest neighbors and distances from index
        neighs, dists = self.nn_index.query(sample_embed,
                                            k=self.n_neighbors + 1)

        # querying comparables: training rows closest to the sample
        comparables_table = self.X_train.iloc[neighs[0]]

        # returning comparables table
        return comparables_table
# Training frame: the shaped sample by default, or the full shaped dataset when use_full_df is set.
train_df: pandas.DataFrame
if not use_full_df:
    train_df = pcs_data_loader.load_corn_rows_sample_shaped_pickle_gz()
else:
    train_df = pcs_data_loader.shape_pps_data(pcs_data_loader.load_corn_rows_pickle_gz())

# load training data and train et model
y = train_df['Dry_Yield']
# 'Area' is excluded from the features along with the target column.
X = train_df.drop(['Dry_Yield', 'Area'], axis=1)

scaler = StandardScaler()
scaler.fit(X)

print('fitting model')
model = ExtraTreesRegressor(n_jobs=n_jobs, n_estimators=n_estimators, verbose=99)
model.fit(scaler.transform(X), y)

# Persist the fitted model; the scaler is saved separately below because the
# model was trained on scaled features.
model_path_ = f'{result_base_path}/et_model_{run_id}.pickle'
with open(model_path_, 'wb') as f:
    pickle.dump(model, f)
print(f'model saved: {model_path_}')

scaler_path_ = f'{result_base_path}/et_scaler_{run_id}.pickle'
with open(scaler_path_, 'wb') as f:
    pickle.dump(scaler, f)
# NOTE(review): this message says "model saved" although it reports the scaler path.
print(f'model saved: {scaler_path_}')

results = []
# NOTE(review): `df` is not defined in the visible code (only `train_df` is) - confirm which frame's columns are intended.
for idx, elb_data in enumerate(pcs_data_loader.load_cached_elbs(df.columns)):
    year_id, elb_X, elb_y, extra_cols = elb_data
    print(f'comparing elb year id: {year_id}, index: {idx}')
def model_extra_tree(self):
    """Fit an extra-trees regressor on the training split and store its test predictions.

    Reads ``self.n_est``, ``self.train_x``, ``self.train_y`` and ``self.test_x``;
    writes the predictions for ``self.test_x`` to ``self.y_pred``. Returns None.
    """
    # Only the number of trees is configured; every other setting is the library default.
    regressor = ExtraTreesRegressor(n_estimators=self.n_est)
    regressor.fit(self.train_x, self.train_y)
    predictions = regressor.predict(self.test_x)
    self.y_pred = predictions
'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai', 'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata', 'Destination_New Delhi']]
X.head()

# Target: the column at position 1 of data_train.
y=data_train.iloc[:,1]
y.head()

#Find the correlation
# NOTE(review): the heatmap reads `train_data` while the target above comes from `data_train` - confirm both names exist.
plt.figure(figsize=(18,18))
sns.heatmap(train_data.corr(),annot=True, cmap='RdYlGn')
plt.show()

# Extra-trees model fitted on all rows, used only for its feature importances.
from sklearn.ensemble import ExtraTreesRegressor
selection=ExtraTreesRegressor()
selection.fit(X,y)

#plot the importance of feature
plt.figure(figsize=(12,18))
fea_importance=pd.Series(selection.feature_importances_, index=X.columns)
# Show the 20 features with the largest importances.
fea_importance.nlargest(20).plot(kind='barh')
plt.show()

#Fitting Random Forest model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
from sklearn.ensemble import RandomForestRegressor
reg_rf = RandomForestRegressor()
reg_rf.fit(X_train, y_train)
# top_corr_features = corr.index # plt.figure(figsize=(20,20)) # g = sns.heatmap(mod_df[top_corr_features].corr(), annot=True, cmap='RdYlGn') # plt.show() X = mod_df.iloc[:, 1:] y = mod_df.iloc[:, 0] # print(X.head()) # print(y.head()) # feature importance from sklearn.ensemble import ExtraTreesRegressor model = ExtraTreesRegressor() model.fit(X, y) # print(model.feature_importances_) # visualize feature importances # feat_imp = pd.Series(model.feature_importances_, index=X.columns) # feat_imp.nlargest(n=5).plot(kind='barh') # plt.show() from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8) # print(X_train.shape) from sklearn.ensemble import RandomForestRegressor rf_random = RandomForestRegressor()
# Random forest fitted on the rows selected by l1; it predicts the rows selected
# by l2 and the test set.
# NOTE(review): l1/l2 look like the two halves of a stacking split - confirm.
rgrs_1 = RandomForestRegressor(n_estimators=500, max_features=10, max_depth=15, min_samples_leaf=4, n_jobs=-1)
rgrs_1.fit(train_raw[l1], train_y[l1])
pred_1 = rgrs_1.predict(train_raw[l2])
pred_1_test = rgrs_1.predict(test_raw)

print 'generating et ...'
# Extra-trees counterpart, fitted and applied on the same row selections.
rgrs_2 = ExtraTreesRegressor(n_estimators=500, max_features=15, max_depth=15, min_samples_leaf=4, n_jobs=-1)
rgrs_2.fit(train_raw[l1], train_y[l1])
pred_2 = rgrs_2.predict(train_raw[l2])
pred_2_test = rgrs_2.predict(test_raw)

# xgb on raw
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.01
params["max_depth"] = 7
params["subsample"] = 0.8
params["colsample_bytree"] = 0.8
params["min_child_weight"] = 5
params["silent"] = 1
plst = list(params.items())

# 80% of the length of l1, truncated to an int.
eval_rat = int(0.8 * len(l1))
df = df.drop(['Expected', 'Id'], axis=1) # print "Prepare folds for cross validation" x_train, x_test, y_train, y_test = cross_validation.train_test_split( data, label, test_size=0.8, random_state=23435) # conf = sklearn.metrics.confusion_matrix(df['missing_values'], df['sample_weights']) # plt.imshow(conf, cmap='binary', interploation='None') print "RandomForestRegressor..." clf = sklearn.ensemble.RandomForestRegressor(verbose=2, n_jobs=2) clf.fit(x_train, y_train) print mean_squared_error(clf.predict(x_test), y_test) # with open('models/RandomForestRegressor.pkl', 'wb') as fid: # cpk.dump(clf, fid) print "GradientBoostingRegressor..." clf = sklearn.ensemble.GradientBoostingRegressor(verbose=2) clf.fit(x_train, y_train) print mean_squared_error(clf.predict(x_test), y_test) # with open('models/GradientBoostingRegressor.pkl', 'wb') as fid: # cpk.dump(clf, fid) print "ExtraTreesRegressor..." clf = ExtraTreesRegressor(n_estimators=20, verbose=2, n_jobs=-1) clf.fit(x_train, y_train) print mean_squared_error(clf.predict(x_test), y_test) # with open('models/ExtraTreesRegressor.pkl', 'wb') as fid: # cpk.dump(clf, fid)
class Model():
    """Thin wrapper that fits and queries one of several binary-outcome models.

    The concrete model is chosen by ``model_type`` ('xgboost', 'lgb',
    'RandomForestClassifier', 'LogisticRegression', 'svm', 'fastFM',
    'KNeighborsClassifier', 'AdaBoostClassifier', 'ExtraTreesClassifier' or
    'ExtraTreesRegressor'). When ``features`` is non-empty, inputs are
    restricted to those columns before fitting and predicting.
    """

    def __init__(self, model_type, features=None):
        self.model = None
        self.model_type = model_type
        # A fresh list per instance; a shared mutable default would leak
        # between instances.
        self.features = [] if features is None else features

    # initialize and fit xgboost model
    def xgb_model(self, train_X, train_y, val_X=None, val_y=None, seed_val=seed, num_rounds=2500):
        """Train an xgboost booster with early stopping and return it.

        If no validation set is given, 10% of the training data is held out
        (seeded by ``seed_val``) and used as the early-stopping set.
        """
        param = {}
        param['objective'] = 'binary:logistic'
        param['eval_metric'] = 'logloss'
        param['eta'] = 0.03
        param['max_depth'] = 6
        param['silent'] = 1
        param['subsample'] = 0.8
        param['colsample_bytree'] = 0.8
        param['min_child_weight'] = 8
        param['scale_pos_weight'] = 0.360
        # param['nthread'] = 4
        param['seed'] = seed_val
        plst = list(param.items())

        if val_X is None or val_y is None:
            # No usable validation set supplied: carve one out of the training data.
            train_X, val_X, train_y, val_y = sklearn.cross_validation.train_test_split(
                train_X, train_y, test_size=0.1, random_state=seed_val)

        xgtrain = xgb.DMatrix(train_X, label=train_y)
        xgval = xgb.DMatrix(val_X, label=val_y)
        watchlist = [(xgtrain, 'train'), (xgval, 'val')]
        return xgb.train(plst, xgtrain, num_rounds, watchlist,
                         early_stopping_rounds=20, verbose_eval=True)

    # initialize and fit lightgbm model
    def lgb_model(self, train_X, train_y, val_X=None, val_y=None, seed_val=seed, num_rounds=2500):
        """Train a lightgbm booster with early stopping and return it.

        If no validation set is given, 10% of the training data is held out
        (seeded by ``seed_val``) and used as the early-stopping set.
        """
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'binary_logloss',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': 0,
            'num_threads': 64,
            'scale_pos_weight': 0.360
        }

        if val_X is None or val_y is None:
            # No usable validation set supplied: carve one out of the training data.
            train_X, val_X, train_y, val_y = sklearn.cross_validation.train_test_split(
                train_X, train_y, test_size=0.1, random_state=seed_val)

        lgb_train = lgb.Dataset(train_X, train_y)
        lgb_val = lgb.Dataset(val_X, val_y, reference=lgb_train)
        return lgb.train(params, lgb_train, num_boost_round=num_rounds,
                         valid_sets=lgb_val, early_stopping_rounds=20)

    # get predictions
    def predict(self, test_X, test_y=None):
        """Return predictions for ``test_X`` as a column vector of shape (n, 1).

        For 'xgboost', 'lgb' and 'ExtraTreesRegressor' the model's ``predict``
        output is used; every other model type uses the positive-class column
        of ``predict_proba``. When ``test_y`` is given, the log loss is printed.

        Raises
        ------
        RuntimeError
            If no model has been fitted yet.
        """
        if self.model is None:
            # The previous `assert ('...')` tested a non-empty string, which is
            # always true, so an unfitted model silently returned None.
            raise RuntimeError(
                'No trained model was found... You have to first fit the model')

        if self.features:
            test_X = test_X[self.features]

        if self.model_type == 'xgboost':
            preds = self.model.predict(xgb.DMatrix(test_X))
        elif self.model_type in ('lgb', 'ExtraTreesRegressor'):
            preds = self.model.predict(test_X)
        else:
            preds = self.model.predict_proba(test_X)[:, 1]
        preds = preds.reshape(-1, 1)

        if test_y is not None:
            print('log_loss: ', log_loss(test_y, preds))
        return preds

    def _build_sklearn_style_model(self):
        """Return an unfitted estimator for ``self.model_type``, or None if the type is not one of these."""
        # Class weights shared by the classifiers that accept them.
        class_weight = {1: 0.472001959, 0: 1.309028344}
        if self.model_type == 'RandomForestClassifier':
            return RandomForestClassifier(n_estimators=150, n_jobs=-1,
                                          class_weight=class_weight)
        if self.model_type == 'LogisticRegression':
            return LogisticRegression(C=0.1, solver='sag',
                                      class_weight=class_weight)
        if self.model_type == 'svm':
            return SVC(random_state=seed, probability=True, verbose=True,
                       class_weight=class_weight)
        if self.model_type == 'fastFM':
            #To be completed => http://arogozhnikov.github.io/2016/02/15/TestingLibFM.html
            return sgd.FMClassification(n_iter=1000, init_stdev=0.1, rank=2,
                                        step_size=0.1)
        if self.model_type == 'KNeighborsClassifier':
            return KNeighborsClassifier(n_neighbors=5, weights='uniform',
                                        algorithm='auto', leaf_size=30, p=2,
                                        metric='minkowski', metric_params=None,
                                        n_jobs=-1)
        if self.model_type == 'AdaBoostClassifier':
            return AdaBoostClassifier(n_estimators=1000, random_state=seed)
        if self.model_type == 'ExtraTreesClassifier':
            return ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                        min_samples_split=2, n_jobs=-1,
                                        class_weight=class_weight)
        if self.model_type == 'ExtraTreesRegressor':
            return ExtraTreesRegressor(n_estimators=200, max_depth=None,
                                       min_samples_split=2, n_jobs=-1)
        return None

    # fit model on full feature set or subset if provided
    def fit(self, train_X, train_y, val_X=None, val_y=None):
        """Fit the model selected by ``self.model_type`` and store it in ``self.model``.

        ``val_X``/``val_y`` are only used by the 'xgboost' and 'lgb' types
        (for early stopping). An unrecognised ``model_type`` leaves
        ``self.model`` untouched.
        """
        if self.features:
            train_X = train_X[self.features]
            # The validation features must match the training columns.
            if val_X is not None:
                val_X = val_X[self.features]

        if self.model_type == 'xgboost':
            self.model = self.xgb_model(train_X, train_y, val_X, val_y)
        elif self.model_type == 'lgb':
            self.model = self.lgb_model(train_X, train_y, val_X, val_y)
        else:
            estimator = self._build_sklearn_style_model()
            if estimator is not None:
                self.model = estimator
                self.model.fit(train_X, train_y)
# train 485 reg, test 162 reg , eval 162 reg
# Reference: constructor signature with its default values.
#ExtraTreesRegressor(n_estimators=100, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer,accuracy_score,r2_score

# oob_score must be the boolean False: the string 'False' is truthy, so it
# switched out-of-bag scoring on (and newer scikit-learn releases reject
# non-boolean strings here).
# NOTE(review): criterion='mse' and min_impurity_split are only accepted by
# older scikit-learn releases - confirm the pinned version before upgrading.
modelo= ExtraTreesRegressor(bootstrap=True, ccp_alpha=0.1, criterion='mse', max_depth=None, max_features=None, max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.30, min_impurity_split=None, min_samples_leaf=2, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=True)
modelo.fit(X, y)
y_test_predict = modelo.predict(X_val)
resultados=y_test_predict

# Coefficient of determination. The value is not stored, so it is only visible
# when this expression is the displayed result of an interactive cell.
r2_score(y_val, resultados)

# Per-sample comparison table: predicted value, real value, signed error
# (predicted - real) and squared error.
# NOTE(review): if y_val is a pandas Series with a non-default index, the
# assignment to 'real' aligns on labels rather than position - confirm.
res=pd.DataFrame()
res['predicho']=resultados
res['real']=y_val
res['errorAbs']=res['predicho']-res['real']
res['errorCuad']=(res['predicho']-res['real'])**(2)

# Plot the result as a histogram of absolute errors <<< Figure 7.31 >>>
plt.title('Histograma de errores absolutos (árboles extremadamente aleatorios)')
axis=1, errors='ignore', inplace=True)

# Features are every remaining column except the target 'Dry_Yield'.
X = df.drop(['Dry_Yield'], axis=1, errors='ignore')
y = df['Dry_Yield']

# Boolean mask over the feature columns, True where the column is listed in label_cols.
label_mask = numpy.isin(X.columns, label_cols)
enc = DummyEncoder(label_mask)
enc.fit(X)

# The scaler is fitted on the non-label columns only, with missing values replaced by 0.
# NOTE(review): `scaler` is not passed to transform() below - presumably
# transform reads it from the enclosing scope; confirm.
scaler = StandardScaler()
scaler.fit(X.loc[:, ~label_mask].fillna(0))

model = ExtraTreesRegressor(verbose=99, min_samples_leaf=7, n_jobs=-1)
X_scaled = transform(X, enc, label_mask)
model.fit(X_scaled, y)

# Score the fitted model on each cached ELB frame.
elb_results = []
for idx, (year_id, elb_df) in enumerate(load_cached_elbs()):
    elb_df.drop(['Year', 'YearId', 'ProcessedLayerUID', 'Area'], axis=1, errors='ignore', inplace=True)
    # Reorder/restrict columns to match the training frame.
    elb_df = elb_df[df.columns]
    elb_X = elb_df.drop(['Dry_Yield'], axis=1)
    elb_X_scaled = transform(elb_X, enc, label_mask)
    elb_y = elb_df['Dry_Yield']
    predictions = model.predict(elb_X_scaled)
    elb_score = ScoreReport(elb_y.values, predictions)
    elb_results.append(elb_score)