def __init__(self, config_file=''):
    """Read the config file and set up the model, task and I/O paths.

    config_file: path handed to ``SafeConfigParser.read``.
    """
    # Configuration
    parser = SafeConfigParser()
    parser.read(config_file)
    self.parser = parser

    # Learning setup: features, targets and the regress/classify switch.
    self.classify = constants.DO_CLASSIFICATION
    self.vars_features = constants.fixed_vars
    self.vars_target = constants.ML_TARGETS
    self.var_target = constants.ML_TARGETS  # same value in both modes

    # Both estimators take identical hyper-parameters; only the class differs.
    estimator_cls = RandomForestClassifier if self.classify else RandomForestRegressor
    self.task = 'classification' if self.classify else 'regression'
    self.model = estimator_cls(n_estimators=2500,
                               n_jobs=constants.ncpu,
                               random_state=0)

    # Input file location
    self.path_inp = os.sep.join((constants.base_dir, constants.name_inp_fl))

    # Output directory (created on demand)
    self.path_out_dir = constants.out_dir
    utils.make_dir_if_missing(self.path_out_dir)

    # Pickle locations for the model and the features
    self.path_pickle_model = os.sep.join((self.path_out_dir, constants.model_pickle))
    self.path_pickle_features = os.sep.join((self.path_out_dir, 'pickled_features'))
def RandomForest(x_train, y_train, x_test, degree):
    """Fit a 1000-tree random forest regressor and predict ``x_test``.

    x_train, y_train: training inputs and targets passed to ``fit``.
    x_test: inputs passed to ``predict``.
    degree: used as the ``max_depth`` of every tree.

    Returns the predictions for ``x_test``.
    """
    params = {
        'n_estimators': 1000,
        'max_depth': degree,
        # 2 is the smallest value scikit-learn accepts for an integer
        # min_samples_split; a one-sample node can never be split, so this
        # builds the same trees the former value of 1 did.
        'min_samples_split': 2,
        'warm_start': True,
    }
    clf = RandomForestRegressor(**params)
    clf.fit(x_train, y_train)
    return clf.predict(x_test)
def RF_ST(trainFileName, testFilename):
    """Train one random forest per store and predict that store's test rows.

    Both files are loaded with ``ld.LoadData_DATA_ST`` and indexed by the
    store ids '1'..'5'. Returns a list of ``[field0, field1, prediction]``
    rows, where the prediction is floored at 0 and formatted with 4 decimals.
    """
    train_by_store = ld.LoadData_DATA_ST(trainFileName)
    test_by_store = ld.LoadData_DATA_ST(testFilename)

    res = []
    for store_id in ('1', '2', '3', '4', '5'):
        # Training rows: drop the first two fields, convert to float, then
        # skip two more values; the last value is the target.
        train_X, train_y = [], []
        for record in train_by_store[store_id]:
            numbers = [float(x) for x in record[2:]]
            train_X.append(numbers[2:-1])
            train_y.append(numbers[-1])

        # Test rows: remember the two leading fields to label the output.
        test_X, items = [], []
        for record in test_by_store[store_id]:
            items.append((record[0], record[1]))
            numbers = [float(x) for x in record[2:]]
            test_X.append(numbers[2:])

        forest = RandomForestRegressor(n_estimators=100, criterion='mse',
                                       max_depth=None, max_features='auto')
        forest.fit(train_X, train_y)
        pred_y = forest.predict(test_X)

        for label, value in zip(items, pred_y):
            res.append([label[0], label[1], '%.4f' % max(value, 0)])
    return res
def RF_ALL(trainFileName, testFileName):
    """Train a single random forest on all training rows and predict the test file.

    Returns a list of ``[item, 'all', prediction]`` rows; each prediction is
    floored at 0 and formatted with 4 decimals.
    """
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)

    forest = RandomForestRegressor(n_estimators=100, criterion='mse',
                                   max_depth=None, max_features='auto',
                                   bootstrap=True)
    forest.fit(train_X, train_y)
    pred_y = forest.predict(Eval_X)

    # One prediction per evaluation row, labelled with the matching item.
    return [[items[idx], 'all', '%.4f' % max(value, 0)]
            for idx, value in enumerate(pred_y)]
def __init__(self, sig_weight=1., pow_sig=1., pow_bg=1., gap=1.,
             n_estimators=10, criterion="mse", max_depth=None,
             min_samples_split=2, min_samples_leaf=1, max_features="auto",
             bootstrap=True, oob_score=False, n_jobs=1, random_state=None,
             verbose=0, min_density=None, compute_importances=None):
    """Initialise the base forest with its defaults and store the extra weights.

    Only ``sig_weight``, ``pow_sig``, ``pow_bg`` and ``gap`` are stored here.
    The forest hyper-parameters in the signature are not forwarded to the
    base class; they are expected to be applied through ``set_params``.
    """
    # NOTE(review): n_estimators, criterion, etc. are accepted but unused in
    # this constructor -- confirm every caller really goes through set_params.
    RandomForestRegressor.__init__(self)
    self.sig_weight, self.pow_sig = sig_weight, pow_sig
    self.pow_bg, self.gap = pow_bg, gap
def run(self):
    """Train one random forest per entry of ``indexesTrain.npy`` and pickle each.

    Replaces ``self.train`` with its ``.values`` array, builds the main
    features for every index, then fits a regressor per index and writes it
    to ``models/models<i>.mod``.
    """
    print("Reading device separations...")
    indexes = np.load("indexesTrain.npy")
    self.train = self.train.values

    print("Getting attributes...")
    trainFeatures = [self.getMainFeatures(self.train, indexes, i)
                     for i in range(len(indexes))]

    for i in range(len(indexes)):
        trainVect, targetVect = self.getAttributes(trainFeatures, indexes, i)
        classifier = RandomForestRegressor(n_estimators=500, verbose=2,
                                           n_jobs=4, random_state=1)
        classifier.fit(trainVect, targetVect)
        # Binary mode plus a context manager: pickles are byte streams and
        # the handle must be closed so the model is flushed to disk.
        with open("models/models" + str(i) + ".mod", "wb") as model_file:
            pickle.dump(classifier, model_file)
def RandomForest(x_train, y_train, x_test, y_test):
    """Try several tree depths and report the best random forest by RMSE.

    x_train, x_test: 1-D arrays; each is turned into a single-feature column
    before fitting/predicting.
    y_train, y_test: matching targets.

    Returns ``(best_rmse, [best_predictions, best_depth])``.
    """
    best_rmse = None
    best_result = None
    for depth in (1, 2, 3, 4, 7):
        params = {
            'n_estimators': 1000,
            'max_depth': depth,
            # 2 is the smallest integer scikit-learn accepts; it builds the
            # same trees as the former value of 1.
            'min_samples_split': 2,
            'warm_start': True,
        }
        clf = RandomForestRegressor(**params)
        clf.fit(x_train[:, np.newaxis], y_train)
        y_predict = clf.predict(x_test[:, np.newaxis])
        score = rmse(y_test, y_predict)
        # "<=" keeps the later depth on ties, as the old dict keyed by RMSE did.
        if best_rmse is None or score <= best_rmse:
            best_rmse = score
            best_result = [y_predict, depth]
    # Return the RMSE that belongs to the returned predictions; previously the
    # RMSE of the last depth tried was returned instead.
    return best_rmse, best_result
def perform_random_forest_regressor(train_set, train_target, test_set,
                                    predictors, estimators=10, depth=None,
                                    splits=2):
    """Fit a random forest on ``predictors`` and predict the test set.

    train_set, test_set: tables indexable by the ``predictors`` list.
    train_target: target values aligned with ``train_set``.
    estimators: number of trees.
    depth: maximum tree depth (``None`` for unlimited).
    splits: minimum number of samples needed to split a node.

    Returns the predictions for ``test_set[predictors]``.
    """
    # The tuning arguments used to be accepted but never reached the model.
    alg = RandomForestRegressor(n_estimators=estimators,
                                max_depth=depth,
                                min_samples_split=splits,
                                random_state=1)
    alg.fit(train_set[predictors], train_target)
    return alg.predict(test_set[predictors])
def __init__(self):
    """Initialise the parent model and attach a small random forest regressor."""
    super(ItemSetModel, self).__init__()
    # Alternatives tried earlier: DecisionTreeRegressor, Lasso(0.1),
    # SVR(kernel='rbf'), ElasticNetCV.
    forest_params = {'n_estimators': 10, 'max_depth': 7}
    self.clf = RandomForestRegressor(**forest_params)
def main(train_file='train.csv', test_file='test.csv', output_file='predict.csv'):
    """Train a random forest on the access data, report CV AUC, write predictions.

    train_file / test_file: CSV files read with pandas.
    output_file: destination passed to ``create_test_submission``.

    Returns 0.
    """
    print("Loading data...")
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)

    # NOTE(review): ROLE_DEPTNAME appears twice, so that column is fed to the
    # model twice. Kept as-is to leave the feature matrix unchanged.
    feature_columns = ["RESOURCE", "MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2",
                       "ROLE_DEPTNAME", "ROLE_FAMILY_DESC", "ROLE_FAMILY",
                       "ROLE_DEPTNAME", "ROLE_CODE"]
    y = np.array(train_data[["ACTION"]])
    X = np.array(train_data[feature_columns])
    X_test = np.array(test_data[feature_columns])

    SEED = 4
    clf = RandomForestRegressor(n_estimators=300, min_samples_split=15,
                                min_density=0.1,
                                compute_importances=True).fit(X, y)
    print(clf.feature_importances_)

    mean_auc = 0.0
    n = 10
    for i in range(n):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.10, random_state=i * SEED)

        # Feature selection / hyper-parameter search would go here.
        clf.fit(X_train, y_train)
        preds = clf.predict(X_cv)

        # AUC for this hold-out split.
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds, pos_label=1)
        roc_auc = metrics.auc(fpr, tpr)
        print("AUC (fold %d/%d): %f" % (i + 1, n, roc_auc))
        mean_auc += roc_auc

    print("Mean AUC: %f" % (mean_auc / n))

    # The model used here is the one fitted on the last split above.
    # ``predict_`` does not exist on the estimator; ``predict`` is the API.
    predictions = clf.predict(X_test)
    create_test_submission(output_file, predictions)
    return 0
def run(self):
    """Train a random forest on the input batch and pickle it to the output."""
    # Load the batch (two header rows) and turn it into features/targets.
    batch = pd.read_csv(self.input().path, header=[0, 1])
    X, y = preprocess2(batch, snr=10.)

    # Train the regressor. Other regressors tried earlier:
    # KNeighborsRegressor, LinearRegression, SVR(rbf), LinearSaO2Unmixing.
    reg = RandomForestRegressor(n_estimators=10, max_depth=9,
                                min_samples_leaf=10, n_jobs=-1)
    reg.fit(X, y.values)

    # Persist the fitted regressor.
    out_handle = self.output().open('w')
    pickle.dump(reg, out_handle)
    out_handle.close()
def make_models(self, missing_columns):
    """Build one random forest per missing column from the remaining columns.

    missing_columns: column names of ``self.full_table`` to be modelled. They
    are removed from the feature table and each becomes a target in turn.
    Rows are weighted by the ``WGTP`` column of the feature table.

    Returns a dict mapping each missing column name to its fitted regressor.
    """
    # Features are every column that is not itself a prediction target.
    available_table = self.full_table.copy()
    for column in missing_columns:
        del available_table[column]
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    available_features = available_table.values

    clfs = {}
    for column in missing_columns:
        labels = self.full_table[column].values  # 1-D target vector
        clf = RandomForestRegressor(n_estimators=100)
        # Third positional argument of fit() is the per-row sample weight.
        clf.fit(available_features, labels, available_table['WGTP'])
        clfs[column] = clf
    return clfs
def predict_per_cpu_full():
    """Fit a small random forest per ``cpuFull`` group and plot its predictions.

    For every group the ``C0`` column is the only feature. The MAPE on the
    held-out part is printed and a scatter plot of actual vs. predicted
    values is written to ``imgs/<group>.png``.
    """
    data, target = load_data()
    data, target, labels = normalize_data(data, target)
    # Work on an explicit copy so adding the target column is unambiguous.
    data = data[['C0', 'cpuFull']].copy()
    data['target'] = target

    split_by_types = dict()
    for name, group in data.groupby('cpuFull'):
        # Series.reshape is gone from pandas; reshape the underlying array.
        features = group['C0'].values.reshape(-1, 1)
        X_train, X_test, y_train, y_test = train_test_split(features,
                                                            group['target'])
        split_by_types[str(name)] = {
            'train': {'data': X_train, 'target': y_train},
            'test': {'data': X_test, 'target': y_test},
        }

    for cpu, data_set in split_by_types.items():
        fig = plt.figure()
        reg = RandomForestRegressor(n_estimators=5)
        reg.fit(data_set['train']['data'], data_set['train']['target'])

        test_data = data_set['test']['data']
        y_pred = reg.predict(test_data)
        print("%s %s" % (mape(data_set['test']['target'], y_pred), cpu))

        plt.scatter(test_data, data_set['test']['target'], s=3, color='g',
                    label='actual')
        plt.scatter(test_data, y_pred, s=3, color='r', label='predicted')
        plt.legend(loc='upper left')
        plt.ylabel('mul time')
        plt.title('Category: {}'.format(cpu))
        plt.savefig('imgs/{}.png'.format(cpu))
        # Release the figure once saved; one is created per group.
        plt.close(fig)
def train(data, val_ind, indices):
    """Fit one default random forest per selected target column.

    data: feature matrix shared by every model.
    val_ind: 2-D target array; column ``i`` is the target of model ``i``.
    indices: column indices to train.

    Returns a list with one slot per column of ``val_ind``; trained slots
    hold the fitted regressor, all others stay 0.
    """
    # Placeholder 0 marks columns for which no model was requested.
    models = [0] * val_ind.shape[1]
    for column in indices:
        # Other estimators tried earlier: Lasso, DecisionTreeRegressor,
        # LinearRegression, AdaBoostRegressor.
        forest = RandomForestRegressor()
        forest.fit(data, val_ind[:, column])
        models[column] = forest
    return models
y = household.as_matrix(columns = ['KWH']) y = np.reshape(y, (len(y))) del household['KWH'] #del household['ST'] #del household['DIVISION'] #del household['ELEP'] #if 'CDD' in household.columns: # del household['CDD'] # del household['HDD'] X = household.as_matrix() X = np.nan_to_num(X) X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25) clf = RandomForestRegressor(n_estimators = 10, n_jobs = 8) clf.fit(X_train, y_train) print(metrics.mean_squared_error(y_test, clf.predict(X_test))) print(metrics.r2_score(y_test, clf.predict(X_test))) predictions = clf.predict(X_test)[:50] ''' features = sorted(zip(household.columns, clf.feature_importances_), key = lambda x : x[1], reverse = True) print("Features", features) ''' pums = pd.read_csv("../joined_weather.csv") pums = pums.sample(1000) pums_puma_vector = pums.as_matrix(columns = ['PUMA']) left_matrix = pums[['PUMA', 'WGTP', 'SERIALNO']] del pums['PUMA']
import time

import pandas as pd
# Public import path; the private ``sklearn.ensemble.forest`` module is not
# available in newer scikit-learn releases.
from sklearn.ensemble import RandomForestRegressor

# Time how long it takes to fit a 50-tree forest on the concrete data set.
dset = pd.read_csv("./data/concrete_data.csv")
# NOTE(review): features are columns 0-6 and the target is column 8, so
# column 7 is unused -- confirm that skipping it is intended.
X = dset.iloc[:, 0:7]
y = dset.iloc[:, 8]

estimator = RandomForestRegressor(max_features=3, n_estimators=50,
                                  n_jobs=1, oob_score=True)

t0 = time.time()
estimator.fit(X, y)
print(time.time() - t0)
def predict(self, X):
    """Return the base forest's predictions with an extra axis at position 1."""
    base_output = RandomForestRegressor.predict(self, X)
    return base_output[:, numpy.newaxis]
s = s + '' + str(valoresDiff[i + j]) + ',' d.append(valoresDiff[i + j]) for j in range(0, 15): s = s + '' + str(valoresVol[i + j]) + ',' d.append(valoresVol[i + j]) #maxValores = max(valoresCopiar[longValores:i+j]) # Esta cambia porque no debemos tener los valores #maxValores = max(valoresCopiar[i+j:i+j+5]) maxValores = max(valoresCopiar[i + 14 + longValoresTest:i + longValoresTest + 14 + 5]) #s = s + str(150) #d.append(150) X_test.append(d) y_test.append(maxValores) # Entrenamos regr = RandomForestRegressor() regr.fit(X_train, y_train) y_pred = regr.predict(X_test) regr2 = SVR(kernel='rbf', C=1e3, gamma=0.1) regr2.fit(X_train, y_train) y_pred2 = regr2.predict(X_test) regr3 = linear_model.LinearRegression() regr3.fit(X_train, y_train) y_pred3 = regr3.predict(X_test) # Votacion # Si todos OK => Se invierte votacion = 0 max_pred_array = [max(y_pred), max(y_pred2), max(y_pred3)]
def doEval(dayNight, landuse, topo, traffic_static, traffic_dynamic, weather, time, output):
    """Cross-validate a random forest by location for one feature-group combination.

    dayNight: passed through to ``splitDataForXValidation`` and written as the
        first output field.
    landuse, topo, traffic_static, traffic_dynamic, weather, time: switches
        selecting which column groups are used as features.
    output: writable file-like object; one ``dayNight,group,rmse`` line is
        written per location.

    Returns None. Does nothing when every switch equals False.
    """
    # (tag, switch, columns) in the order used for the group name and for the
    # feature columns.
    feature_groups = [
        ("lu", landuse, ['leisure_area', 'landuse_area']),
        ("to", topo, ['buildings_number', 'buildings_area']),
        ("ts", traffic_static, ['lane_length', 'length']),
        ("td", traffic_dynamic, ['traffic_length_car', 'traffic_length_lgv',
                                 'traffic_length_hgv']),
        ("we", weather, ['winddirection', 'windspeed', 'temperature', 'rain',
                         'pressure']),
        ("ti", time, ['hour', 'day_of_week', 'month', 'bank_holiday',
                      'race_day']),
    ]

    # Explicit comparisons against True/False are kept on purpose: they
    # reproduce the original behaviour for non-boolean switch values.
    if all(flag == False for _, flag, _ in feature_groups):  # noqa: E712
        return

    groupName = ""
    columnsToUse = []
    for tag, flag, group_columns in feature_groups:
        enabled = flag == True  # noqa: E712
        groupName += tag + ("1" if enabled else "0")
        if enabled:
            columnsToUse.extend(group_columns)
    print("Group: " + groupName)

    data = {}
    columns = []
    loadData(dataFile, ['timestamp'], data, columns)

    # Hold out each location in turn.
    locationValues = findOutKForValidation("location", data)
    for location in locationValues:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, columnsToUse, "target", dayNight)
        print("\t" + str(len(trainX)) + "," + str(len(testX)))

        model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59,
                                      n_jobs=1, random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)

        rmse = rmseEval(testY, prediction)
        print("\t" + str(rmse))

        # Only the second element of the rmseEval result is recorded.
        output.write(str(dayNight) + ",")
        output.write(groupName + ",")
        output.write(str(rmse[1]) + "\n")
        output.flush()
btc_ld = np.log(btc_df) - np.log(btc_df.shift(1)) btc_ld = btc_ld.dropna() # split the dataframe into train and test train = btc_ld.loc[datetime.date(year=2014,month=1,day=1):datetime.date(year=2017,month=12,day=31)] test = btc_ld.loc[datetime.date(year=2018,month=1,day=1):datetime.date(year=2018,month=1,day=31)] # split into input and output? trainX = np.asarray(train.drop(columns='BTC')) trainY = np.asarray(train.BTC) testX = np.asarray(test.drop(columns='BTC')) testY = np.asarray(test.BTC) # Define the RF model RF_Model = RandomForestRegressor(n_estimators=100, max_features=1, oob_score=True) # Fit the model rf_fitted = RF_Model.fit(trainX,trainY) # predict the trained data trainY_predict = rf_fitted.predict(trainX) trainY_predict = trainY_predict.reshape(-1,1) #reshape (transpose) # Plot the predicted training data train_plot_df = pd.DataFrame(trainY_predict, columns='Predicted BTC') train_plot_df = train_plot_df.set_index(train.index) train_plot_df['BTC'] = train.BTC # test from sklearn.metrics import r2_score from sklearn.metrics import mean_squared_error from math import sqrt r2_train = r2_score(train.BTC,trainY_predict)
import numpy as np from sklearn.ensemble.forest import RandomForestRegressor import stockPlot as sp # Initiate the monthly trade object monthData = trade_model.monthlyModel(1, 2009, 6, 2013, 6, 2012, 6, 2013) # Download data from Yahoo finance monthData.monthlyDataDownload() # Pre-processing of training an testing data monthData.trainFeaturePre() # Read pre-processed data from hard drive # monthData.trainFeaturePreHd() # Number of training months trainSpan = len(monthData.xTrain[:,0,0]) - monthData.testSpan # Initiate a random forest regressor clf = RandomForestRegressor(n_estimators=10) # totalReturn = 1 predictedReturn = np.zeros(monthData.stockNum) monthlyReturn = np.zeros(monthData.testSpan) aggReturn = np.zeros(monthData.testSpan+1) aggReturn[0] = 1 # rolling training and testing for j in range(0, monthData.testSpan): for i in range(0, monthData.stockNum): clf.fit(monthData.xTrain[j:trainSpan+j, :, i], monthData.yTrain[j:trainSpan+j, 0, i]) predictedReturn[i] = clf.predict(monthData.xTest[j, :, i]) monthlyReturn[j] = monthData.por10Returns(j, predictedReturn) yearReturn = totalReturn * (monthlyReturn[j]+1) aggReturn[j+1] = aggReturn[j]*(1+monthlyReturn[j])
x_test = x_test.drop(['segment_id'], axis=1) # prepare models models = [] # models.append(('LR', LogisticRegression())) # models.append(('LDA', LinearDiscriminantAnalysis())) # models.append(('KNN', KNeighborsClassifier())) # models.append(('CART', DecisionTreeClassifier())) # models.append(('NB', GaussianNB())) svReg = SVR(C=20.299419990722537, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.06841395086207253, kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=True); randForReg = RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=100, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False) models.append(('LassoReg', Lasso(alpha=0.1))) models.append(('SVM', svReg)) models.append(('LinearReg', LinearRegression())) models.append(('randForest', randForReg)) mas = make_scorer(mean_absolute_error, greater_is_better=False); # evaluate each model in turn results = [] names = [] scoring = 'accuracy' for name, model in models: kfold = KFold(n_splits=10, random_state=7)
# for j in range(np.shape(res)[1]): # if res[i][j] == 100: # res[i][j] = 0 # else: # res[i][j] = -0.01 * res[i][j] # return res # def normalizeY(arr): # arr=arr/100 # return arr if __name__ == '__main__': train_x, test_x, train_y, test_y, x_data, y_data = load(train_data_path) rf_model = RandomForestRegressor() rf_model.fit(x_data, y_data) with open(filename, 'wb') as file: pickle.dump(rf_model, file) rf_train_score = rf_model.score(x_data, y_data) rf_test_score = rf_model.score(test_x, test_y) print("RF train score:",rf_train_score) print("RF test score:",rf_test_score) dt_model = DecisionTreeRegressor() dt_model.fit(x_data, y_data) with open(filename2, 'wb') as file: pickle.dump(dt_model, file) dt_train_score = dt_model.score(x_data, y_data) dt_test_score = dt_model.score(test_x, test_y) print("DT train score:",dt_train_score)
def create_model():
    """Return an unfitted 400-tree random forest regressor with a fixed seed."""
    settings = {
        'n_estimators': 400,
        'min_samples_leaf': 2,
        'n_jobs': -1,
        'random_state': 42,
    }
    return RandomForestRegressor(**settings)
build_auto(DummyRegressor(strategy="median"), "DummyAuto") build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5), "ExtraTreesAuto") build_auto(GradientBoostingRegressor(random_state=13, init=None), "GradientBoostingAuto") build_auto(LassoCV(random_state=13), "LassoAuto") build_auto( OptimalLGBMRegressor(objective="regression", n_estimators=17, num_iteration=11), "LGBMAuto") build_auto(LinearRegression(), "LinearRegressionAuto") build_auto( BaggingRegressor(LinearRegression(), random_state=13, max_features=0.75), "LinearRegressionEnsembleAuto") build_auto(RandomForestRegressor(random_state=13, min_samples_leaf=3), "RandomForestAuto", flat=True) build_auto(RidgeCV(), "RidgeAuto") build_auto(OptimalXGBRegressor(objective="reg:linear", ntree_limit=31), "XGBAuto") auto_na_X, auto_na_y = load_auto("AutoNA.csv") auto_na_X["cylinders"] = auto_na_X["cylinders"].fillna(-1).astype(int) auto_na_X["model_year"] = auto_na_X["model_year"].fillna(-1).astype(int) auto_na_X["origin"] = auto_na_X["origin"].fillna(-1).astype(int) def build_auto_na(regressor, name): mapper = DataFrameMapper(
'name': 'Linear Model', 'instance': SGDRegressor(penalty='elasticnet', alpha=0.01, l1_ratio=0.25, fit_intercept=True, tol=1e-4), 'complexity_label': 'non-zero coefficients', 'complexity_computer': lambda clf: np.count_nonzero(clf.coef_) }, { 'name': 'RandomForest', 'instance': RandomForestRegressor(n_estimators=100), 'complexity_label': 'estimators', 'complexity_computer': lambda clf: clf.n_estimators }, { 'name': 'SVR', 'instance': SVR(kernel='rbf'), 'complexity_label': 'support vectors', 'complexity_computer': lambda clf: len(clf.support_vectors_) }, ] } benchmark(configuration) # benchmark n_features influence on prediction speed percentile = 90
def train(training, k):
    """Fit a ``k``-tree random forest on a 2-D array whose last column is the target.

    Returns the fitted model.
    """
    features = training[:, :-1]
    target = training[:, -1]
    forest = RandomForestRegressor(n_estimators=k, n_jobs=-1)
    forest.fit(features, target)
    return forest
x_max = np.max(np.array(X), axis=0) outputs = ot.NumericalSample.ImportFromTextFile(base_dir + 'outputs.txt', '\t') y = np.array(outputs).reshape((1, len(outputs)))[0] x_min = np.min(X, axis=0) x_max = np.max(X, axis=0) n_train = 5000 X_train = np.array(X)[:n_train, :d] y_train = y[:n_train] X_test = np.array(X)[n_train:, :d] y_true = y[n_train:] reg = AdaBoostRegressor(RandomForestRegressor(), n_estimators=50) #, random_state=rng) fit_train = reg.fit(X_train, y_train) #plt.plot(y_true,fit_train.predict(X_test),'.') #plt.plot(y_true,y_true,color="red",lw=2) reg = AdaBoostRegressor(RandomForestRegressor(), n_estimators=20) #, random_state=rng) fit_all = reg.fit(X, y) s = 33 #plt.plot(y,fit_all.predict(X),'.') #plt.plot(y,y,color="red",lw=2)
return False return True #return column in ['BDSP', 'RMSP', 'HFL', 'BLD', 'AGEP', 'NP', 'YBL', 'HINCP', 'HDD', 'CDD'] household = household[[column for column in household.columns if select_column(column)]] X = household.as_matrix() print(household.columns) #X = household.as_matrix() with open("kwh_model_features.json", "w") as f: json.dump(list(household.columns), f, indent = True) print(y) X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25) clf = RandomForestRegressor(n_estimators = 50, n_jobs = 8) clf.fit(X_train, y_train) print(y_test[:100]) print(np.sqrt(metrics.mean_squared_error(y_test, clf.predict(X_test)))) print(metrics.r2_score(y_test, clf.predict(X_test))) features = sorted(zip(household.columns, clf.feature_importances_), key = lambda x : x[1], reverse = True) print("Features", features) with open("kwh_model.pkl", 'wb') as f: pickle.dump(clf, f)
'name': 'Linear Model', 'instance': SGDRegressor(penalty='elasticnet', alpha=0.01, l1_ratio=0.25, fit_intercept=True, tol=1e-4), 'complexity_label': 'non-zero coefficients', 'complexity_computer': lambda clf: np.count_nonzero(clf.coef_) }, { 'name': 'RandomForest', 'instance': RandomForestRegressor(), 'complexity_label': 'estimators', 'complexity_computer': lambda clf: clf.n_estimators }, { 'name': 'SVR', 'instance': SVR(kernel='rbf'), 'complexity_label': 'support vectors', 'complexity_computer': lambda clf: len(clf.support_vectors_) }, ] } benchmark(configuration) # benchmark n_features influence on prediction speed percentile = 90
import numpy as np import json household = pd.read_csv("../household_complete_one_hot.csv") if 'KWH' in household.columns: del household['KWH'] X_columns = [column for column in household.columns if column != "ELEP"] X = household.as_matrix(columns = X_columns) y = [label[0] for label in household.as_matrix(columns = ["ELEP"])] #print(y) X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25) clf = RandomForestRegressor(n_estimators = 100, n_jobs = 8) clf.fit(X_train, y_train) print(y_test[:100]) print(metrics.mean_squared_error(clf.predict(X_test), y_test)) print(metrics.r2_score(y_test, clf.predict(X_test))) features = sorted(zip(X_columns, clf.feature_importances_), key = lambda x : x[1], reverse = True) print("Features", features) #fill spaces in ELEP normalized_pums = pd.read_csv("../joined_weather.csv", delimiter = ',') print('pums shape', normalized_pums.shape) with open("../vectorized_puma_regions/puma_list.json") as f:
# "Huber", "Linear", "Passive Aggressive", "SGD", "Theil-Sen", "RANSAC", "K-Neighbors", "Radius Neighbors", "MLP", "Decision Tree", "Extra Tree", "SVR" ] classifiers = [ RandomForestRegressor(n_estimators=200, n_jobs=5, random_state=randomstate), ExtraTreesRegressor(n_estimators=200, n_jobs=5, random_state=randomstate), # GradientBoostingRegressor(random_state=randomstate), # learning_rate is a hyper-parameter in the range (0.0, 1.0] # HistGradientBoostingClassifier(random_state=randomstate), # learning_rate is a hyper-parameter in the range (0.0, 1.0] AdaBoostRegressor(n_estimators=200, random_state=randomstate), GaussianProcessRegressor(normalize_y=True), ARDRegression(), # HuberRegressor(), # epsilon: greater than 1.0, default 1.35 LinearRegression(n_jobs=5), PassiveAggressiveRegressor(random_state=randomstate), # C: 0.25, 0.5, 1, 5, 10 SGDRegressor(random_state=randomstate), TheilSenRegressor(n_jobs=5, random_state=randomstate), RANSACRegressor(random_state=randomstate), KNeighborsRegressor(weights='distance'), # n_neighbors: 3, 6, 9, 12, 15, 20 RadiusNeighborsRegressor(weights='distance'), # radius: 1, 2, 5, 10, 15 MLPRegressor(max_iter=10000000, random_state=randomstate),
class Estimators:
    """
    Estimators class. This class
    (i) fits charging duration and energy consumption models with the
        designated regressor type (only "RF", random forest, is implemented);
    (ii) predicts charging duration and energy consumption with the trained
        models.
    """

    def __init__(self, filePath, durationModelType="RF", energyEstimatorType="RF"):
        """Load the data at ``filePath`` and fit both estimators.

        filePath: path containing "xlsx" or "csv" (see ``loadData``).
        durationModelType / energyEstimatorType: regressor type names; only
            "RF" triggers a fit.
        """
        # Load and process data
        data = self.loadData(filePath)

        # Set regression type for charging duration and energy consumption
        self.durationModelType = durationModelType
        self.energyEstimatorType = energyEstimatorType

        # Attribute columns used as model inputs
        attrColumns = [
            'Start Time Seconds From Midnight', 'Vehicle Battery Capacity'
        ]
        self.X = data[attrColumns]

        # Targets: duration converted from seconds to minutes, and energy.
        self.durationData = pd.to_numeric(data["Charging Time Secs"],
                                          downcast='float') / 60.0
        self.energyData = pd.to_numeric(data["Total Charge"], downcast='float')

        # Fit estimators
        self.fitDurationEstimator(durationModelType)
        self.fitEnergyEstimator(energyEstimatorType)

        # TODO: validate the models (e.g. k-fold cross-validation).
        print("Done: fitting estimators")

    @staticmethod
    def _secondsFromMidnight(startTime):
        """Return seconds since midnight for a "%m/%d/%Y %H:%M" timestamp string."""
        moment = datetime.strptime(startTime, "%m/%d/%Y %H:%M")
        midnight = moment.replace(hour=0, minute=0, second=0, microsecond=0)
        return (moment - midnight).total_seconds()

    def loadData(self, filePath):
        """
        Load and process raw data.

        Reads an Excel file when "xlsx" occurs in ``filePath``, a CSV file
        when "csv" occurs in it, and raises ValueError otherwise. Adds the
        'Start Time Seconds From Midnight' column and drops rows whose
        'Vehicle Battery Capacity' is missing. Returns the filtered frame.
        """
        print("loading data...")
        timeStart = time.time()
        if "xlsx" in filePath:
            data = pd.read_excel(filePath)
        elif "csv" in filePath:
            data = pd.read_csv(filePath)
        else:
            raise ValueError("Wrong file path: " + filePath)
        print("Done: loading data. Time: " + str(time.time() - timeStart))

        print("processing data...")
        timeStart = time.time()
        # Convert the start-time strings to seconds from midnight.
        startTimeSec = [
            self._secondsFromMidnight(startTime)
            for startTime in data["Station Start Time"]
        ]
        print("Done: processing data. Time: " + str(time.time() - timeStart))

        data['Start Time Seconds From Midnight'] = startTimeSec

        # Remove rows with NaN in the columns used for training.
        nanIndices = data["Vehicle Battery Capacity"].isnull().values
        dataFiltered = data[np.invert(nanIndices)]
        # {COPY THE ABOVE TWO LINES AND CHANGE THE INDEX KEY IF YOU HAVE ANOTHER COLUMN TO FILTER OUT}

        return dataFiltered

    def fitDurationEstimator(self, modelType="RF"):
        """
        Fit the duration model with the specified regressor type
        (random forest by default). Other types are ignored.
        """
        print("fitting charging duration model...")
        if modelType == "RF":
            self.durationEstimator = RandomForestRegressor(random_state=0,
                                                           n_estimators=50,
                                                           max_depth=50)
            self.durationEstimator.fit(self.X, self.durationData)
        # {ADD OTHER IF STATEMENTS FOR OTHER REGRESSOR MODELS, E.G., EXTRA-TREE REGRESSOR}

    def fitEnergyEstimator(self, modelType="RF"):
        """
        Fit the energy consumption model with the specified regressor type
        (random forest by default). Other types are ignored.

        The charging duration is appended to the attribute columns, i.e. the
        duration is used as an input when predicting the energy.
        """
        print("fitting energy consumption model...")
        # Work on a copy so self.X keeps only the attribute columns and the
        # duration model can be refitted without seeing its own target.
        X = self.X.copy()
        X["chargingDuration"] = self.durationData

        if modelType == "RF":
            self.energyEstimator = RandomForestRegressor(random_state=0,
                                                         n_estimators=50,
                                                         max_depth=50)
            self.energyEstimator.fit(X, self.energyData)
        # {ADD OTHER IF STATEMENTS FOR OTHER REGRESSOR MODELS, E.G., EXTRA-TREE REGRESSOR}

    def estimateChargingDuration(self, Xq):
        """Return the duration model's prediction for ``Xq``."""
        return self.durationEstimator.predict(Xq)

    def estimateEnergyConsumptions(self, Xq):
        """Return the energy model's prediction for ``Xq``."""
        return self.energyEstimator.predict(Xq)

    def predict(self, df):
        """
        Return the predicted charging duration and energy consumption.

        Params
            df: dataframe with 'Station Start Time' ("%m/%d/%Y %H:%M" string,
                read at label 0) and 'Vehicle Battery Capacity' columns.

        Returns a ``(duration, energy)`` pair for the first row.
        """
        # Start time in seconds from midnight
        startTimeSec = self._secondsFromMidnight(df["Station Start Time"][0])

        # Build input dataframe with the same columns used in training
        Xq = pd.DataFrame()
        Xq['Start Time Seconds From Midnight'] = [startTimeSec]
        Xq['Vehicle Battery Capacity'] = df["Vehicle Battery Capacity"]

        # Estimate charging duration
        estDuration = self.estimateChargingDuration(Xq)

        # Estimate energy consumption. The column name must match the one
        # used in fitEnergyEstimator, and the prediction array is assigned
        # directly rather than wrapped in a list.
        Xq["chargingDuration"] = estDuration
        estEnergy = self.estimateEnergyConsumptions(Xq)

        return estDuration[0], estEnergy[0]
def ts_rf(n, fea, step, ntrees, njobs, path=None):
    """Random forest model for time series prediction.

    n: number of leading samples used for training; the rest is the test set.
    fea: number of lagged values used as features for each sample.
    step: how many steps ahead of the last lag the target lies.
    ntrees, njobs: ``n_estimators`` and ``n_jobs`` of the forest.
    path: space-separated input file with a header line; field 1 is the
        value and field 2 the date. Defaults to the original coffee series.

    Plots predicted vs. actual values and prints MAE, MSE and MAPE computed
    on the de-normalised test data.
    """
    from sklearn import metrics
    import matplotlib.pyplot as plt
    from scipy.linalg import hankel
    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    if path is None:
        path = '/Users/royyang/Desktop/time_series_forecasting/csv_files/coffee_ls.txt'

    # Read the series: skip the header, keep value and date of every line.
    result_tem = []
    date = []
    with open(path) as f:
        next(f)
        for line in f:
            item = line.replace('\n', '').split(' ')
            result_tem.append(float(item[1]))
            date.append(item[2])

    # Standardise the series.
    mean = np.mean(result_tem)
    sd = np.std(result_tem)
    result = (result_tem - mean) / sd

    # Lag matrix: row i holds result[i:i + fea], its target is
    # result[i + fea + step - 1]. The last row must therefore end ``step``
    # samples before the end of the series; taking it from
    # result[-1 - fea:-1] is only right for step == 1 and otherwise puts
    # later observations into the final rows.
    X = hankel(result[:-fea - step + 1], result[-fea - step:-step])
    y = result[fea + step - 1:]

    # Split data into training and testing.
    Xtrain, ytrain = X[:n], y[:n]
    Xtest, ytest = X[n:], y[n:]

    # Random forest
    rf = RandomForestRegressor(n_estimators=ntrees, n_jobs=njobs)
    rf_pred = rf.fit(Xtrain, ytrain).predict(Xtest)

    # Plot results on the original scale, labelled with the date suffixes.
    first_label = n + fea + step - 1
    LABELS = [x[-6:] for x in date[first_label:first_label + len(ytest)]]
    t = range(n, n + len(ytest))
    ypred = rf_pred * sd + mean
    ytest = ytest * sd + mean
    line1, = plt.plot(t, ypred, 'r*-')
    plt.xticks(t, LABELS)
    line2, = plt.plot(t, ytest, 'b*-')
    plt.legend([line1, line2], ["Predicted", "Actual"], loc=2)

    y_true = ytest
    y_pred = ypred
    metrics_result = {
        'rf_MAE': metrics.mean_absolute_error(y_true, y_pred),
        'rf_MSE': metrics.mean_squared_error(y_true, y_pred),
        'rf_MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    }
    print(metrics_result)
def train_random_forest(X, Y):
    """Fit a 20-tree random forest regressor on ``X``/``Y`` and return it."""
    forest = RandomForestRegressor(n_estimators=20)
    forest.fit(X, Y)
    return forest
x_test = X[-7:] y_test = Y[-7:] ### RigeLinearCV = linear_model.RidgeCV(cv=10) rcv = RigeLinearCV.fit(x_train, y_train) y_pre_rcv = rcv.predict(x_oob) ### params_rf = { 'n_estimators': 500, 'max_depth': 10, 'min_samples_split': 2, 'n_jobs': 4 } rf = RandomForestRegressor(**params_rf) rf.fit(x_train, y_train) y_pre_rf = rf.predict(x_oob) ### y_pre_diff = mean_normal_weekend_diff(Y[-21:-14], xday[-35:-14], xweekend[-35:-14], -14, -7) ### Y_test.append(y_test) #y_pre_diff = mean_normal_weekend_diff(Y,xday,xweekend,-21,-14) ### loss_rcv = Evaluation([y_pre_rcv], [y_oob]) loss_rf = Evaluation([y_pre_rf], [y_oob]) loss_diffmean = Evaluation([y_pre_diff], [y_oob])
def _2011x2011_(data_path):
    """Train and evaluate a random forest on ``<data_path>_X.csv`` / ``_y.csv``.

    ``<data_path>_X.csv`` holds one numeric feature row per sample and
    ``<data_path>_y.csv`` the matching ``(date, value)`` rows; both files have
    a header line that is skipped. The data is split 60/40 (fixed seed), a
    default ``RandomForestRegressor`` is fitted, and the test predictions are
    plotted against the true values and scored with the mean squared error.

    :param data_path: common path prefix of the two CSV files.
    :return: None. Progress and the MSE are written to stdout and a plot
        window is shown.
    """
    # sklearn.cross_validation was removed from scikit-learn; prefer the
    # current module and fall back for old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    ##### LOADING #####
    sys.stdout.write("Loading data... ")

    with open(data_path + '_X.csv') as data_file:
        reader = csv.reader(data_file)
        next(reader, None)  # skip header
        data = [[float(x) for x in row] for row in reader]

    with open(data_path + '_y.csv') as labels_file:
        reader = csv.reader(labels_file)
        next(reader, None)  # skip header
        val_ind = [row for row in reader]

    sys.stdout.write("done\n")

    ##### TRAINING #####
    data_train, data_test, val_ind_train, val_ind_test = train_test_split(
        data, val_ind, test_size=0.4, random_state=42)

    # Label rows are (date, value): the value is the regression target and
    # the test dates become the index of the result frame.
    val_train = [float(x[1]) for x in val_ind_train]
    date_test = [x[0] for x in val_ind_test]
    val_test = [float(x[1]) for x in val_ind_test]

    sys.stdout.write("Training regressor... ")
    reg = RandomForestRegressor()
    reg.fit(data_train, val_train)
    sys.stdout.write("done\n")

    ##### PREDICTION #####
    sys.stdout.write("Predicting... ")
    val_predicted = reg.predict(data_test)
    sys.stdout.write("done\n")

    ##### ERROR #####
    df = pd.DataFrame()
    df['date'] = pd.to_datetime(date_test)
    df['original'] = val_test
    df['predicted'] = val_predicted.tolist()
    df = df.set_index('date')

    df.info()
    df.plot()
    plt.show()

    print("MSE : " + str(mean_squared_error(val_test, val_predicted.tolist())))
matplotlib=True) else: # model.skl_model should be RandomForestClassifier features = [ feature.name for feature in self.dataset.domain.attributes ] explainer = shap.TreeExplainer(model.skl_model) shap_values = explainer.shap_values(X) for c in range(len(shap_values)): shap.force_plot(explainer.expected_value[c], shap_values[c][idx, :], X[idx, :], feature_names=features, matplotlib=True) class Task: """ Task class to perform computations in parallels """ def __init__(self): self.future = None self.watcher = None if __name__ == "__main__": # pragma: no cover data = Table('housing') rf = SKL_RF(n_estimators=10) rf.fit(data.X, data.Y) model_rf = RandomForestRegressor(rf) WidgetPreview(OWShapSingle).run(set_data=data, set_model=model_rf)
# Fragment: fits three regressors on the transformed "auto" data and stores
# each fitted model and its predictions. auto_mapper, auto_df, store_pkl and
# store_csv are defined outside this fragment.

# Apply the mapper to the raw frame and persist the fitted mapper
auto = auto_mapper.fit_transform(auto_df)

store_pkl(auto_mapper, "Auto.pkl")

# Columns 0..6 are the features, column 7 is the target
auto_X = auto[:, 0:7]
auto_y = auto[:, 7]

print(auto_X.dtype, auto_y.dtype)

def predict_auto(regressor):
    """Return the regressor's predictions on auto_X as a one-column "mpg" frame."""
    mpg = DataFrame(regressor.predict(auto_X), columns = ["mpg"])
    return mpg

# Decision tree
auto_tree = DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5)
auto_tree.fit(auto_X, auto_y)

store_pkl(auto_tree, "DecisionTreeAuto.pkl")
store_csv(predict_auto(auto_tree), "DecisionTreeAuto.csv")

# Random forest
auto_forest = RandomForestRegressor(random_state = 13, min_samples_leaf = 5)
auto_forest.fit(auto_X, auto_y)

store_pkl(auto_forest, "RandomForestAuto.pkl")
store_csv(predict_auto(auto_forest), "RandomForestAuto.csv")

# Ordinary least squares
auto_regression = LinearRegression()
auto_regression.fit(auto_X, auto_y)

store_pkl(auto_regression, "RegressionAuto.pkl")
store_csv(predict_auto(auto_regression), "RegressionAuto.csv")
def RandomForest(df, queryFile):
    """Run MLRegression with a 200-tree, seeded random forest on all cores."""
    forest_settings = {'random_state': 0, 'n_estimators': 200, 'n_jobs': -1}
    MLRegression(RandomForestRegressor(**forest_settings), df, queryFile)
def evalColumns(columns):
    """Evaluate a column subset for the model-selection classifier.

    For every location (leave-one-location-out), one random forest regressor
    per data group in ``topDatagroups`` is trained, a classifier is then
    fitted to pick -- per sample -- the regressor with the smallest absolute
    error, and the prediction of the picked regressor is scored.

    :param columns: sequence of truthy/falsy flags, indexed like
        ``all_columns``; only flagged columns are given to the classifier.
    :return: second element of ``rmseEval`` computed over the selected
        predictions of all locations.

    Relies on module-level names defined elsewhere in the file
    (``locations``, ``topDatagroups``, ``topTags``, ``data``,
    ``all_features``, ``all_columns`` and the split/eval helpers).
    """
    overallY = []
    overallPred = []

    for location in locations:
        location2s = [l for l in locations if l != location]

        print("Location: " + str(location) + ", location2: " + str(location2s))

        # Stage 1: per-tag regressors; collect their predictions on the
        # second training part and on the test part.
        trainPreds = defaultdict(list)
        testPreds = defaultdict(list)

        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                    location, location2, "location", data, features, "target")
                model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59,
                                              n_jobs=-1, random_state=42)
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
                train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
                testRmse = str(rmseEval(testY, testPrediction)[1])
                print("\t\ttrain1 rmse: " + train1Rmse)
                print("\t\ttrain2 rmse: " + train2Rmse)
                print("\t\ttest rmse: " + testRmse)
                trainPreds[tag].extend(train2Prediction)
                testPreds[tag].extend(testPrediction)

        # Targets of the second training part, in the same order as trainPreds
        t2Y = []
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            t2Y = t2Y + trainY2

        # Label each sample with the index of the tag whose regressor was closest
        labelt2Y = []
        for i in range(0, len(t2Y)):
            bestModel = 0
            bestAbs = abs(t2Y[i] - trainPreds[topTags[0]][i])
            for j in range(0, len(topTags)):
                modelAbs = abs(t2Y[i] - trainPreds[topTags[j]][i])
                if modelAbs < bestAbs:
                    bestAbs = modelAbs
                    bestModel = j
            labelt2Y.append(bestModel)

        print("#labelt2Y:" + str(len(labelt2Y)))

        # Stage 2 inputs: all features of the second training part / test part
        tX2 = []
        testX = []
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, tX, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            for row in trainX2:
                tX2.append(row)
            for row in tX:
                testX.append(row)

        # Append every tag's own predictions as extra columns. The original
        # loops iterated over topTags but indexed with a stale ``tag``
        # variable, so the last tag's predictions were appended repeatedly.
        for tag in topTags:
            for i, value in enumerate(trainPreds[tag]):
                tX2[i].append(value)

        reducedTrainX2 = []
        for d in tX2:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTrainX2.append(reducedD)

        model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
        model.fit(reducedTrainX2, labelt2Y)

        for tag in topTags:
            for i, value in enumerate(testPreds[tag]):
                testX[i].append(value)

        reducedTestX = []
        for d in testX:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTestX.append(reducedD)

        # Predicted label = index into topTags of the regressor to trust
        pred = model.predict(reducedTestX)

        finalPrediction = []
        for i in range(0, len(testY)):
            p = testPreds[topTags[pred[i]]][i]
            finalPrediction.append(p)
        rmse = str(rmseEval(testY, finalPrediction)[1])
        print("\tRMSE: " + str(rmse))

        overallY.extend(testY)
        overallPred.extend(finalPrediction)

    rmse = rmseEval(overallPred, overallY)[1]
    return rmse
def predict_proba(self, X):
    """Turn the regression output into a two-column probability-like array.

    Column 1 is the logistic sigmoid of the regressor's prediction divided
    by 1000; column 0 is its complement, so each row sums to 1.
    """
    raw_scores = RandomForestRegressor.predict(self, X)
    proba = numpy.zeros([len(X), 2])
    positive = special.expit(raw_scores / 1000.)
    proba[:, 1] = positive
    proba[:, 0] = 1. - proba[:, 1]
    return proba
# Fragment: for every location, writes the train/test split and the
# predictions of one random forest per data group to CSV files named
# "z_<location>_*.csv" under OUTPUT_DIRECTORY. The helpers used here
# (getTagAndFeatures, splitDataForXValidation, writeOutData,
# generateAllDataGroups, rmseEval) are defined outside this fragment.

# Close the file handle opened before this fragment
output.close()

all_tags, all_features = getTagAndFeatures(['T','W', 'A', 'R', 'L', 'B'])
print(str(all_features))

for location in locations:
    print("location: " + str(location))

    # save down trainX, trainY, testX, testY (using the full feature set)
    trainX, testX, trainY, testY, _, _ = splitDataForXValidation(location, "location", data, all_features, "target", timestampData)
    print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainX.csv", all_features, trainX)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testX.csv", all_features, testX)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainY.csv", ["target"], trainY)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testY.csv", ["target"], testY)

    # One model per data group; its predictions are stored under the group's tag
    for dataGroup in generateAllDataGroups():
        tag, features = getTagAndFeatures(dataGroup)
        trainX, testX, trainY, testY, _, _ = splitDataForXValidation(location, "location", data, features, "target", timestampData)
        model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42)
        model.fit(trainX, trainY)
        trainPrediction = model.predict(trainX)
        testPrediction = model.predict(testX)
        # rmseEval returns a sequence; element 1 is used as the RMSE value
        trainRmse = str(rmseEval(trainY, trainPrediction)[1])
        testRmse = str(rmseEval(testY, testPrediction)[1])
        print("\t" + tag + ": #train: " + str(len(trainY)) + ", #test:" + str(len(testY)) + ", trainRMSE: " + trainRmse + ", testRMSE: " + testRmse)
        writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainPred_" + tag + ".csv", ["trainPred_" + tag], trainPrediction)
        writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testPred_" + tag + ".csv", ["testPred_" + tag], testPrediction)
# Fragment: one-hot encodes the categorical columns, joins the store table
# onto train/test, fits a random forest on the log-sales target and writes
# the submission file. store, train, test and mychange are defined outside
# this fragment.

# Replace "Assortment" by one indicator column per category.
# axis is passed by keyword: newer pandas no longer accepts it positionally.
store = store.drop("Assortment", axis=1).join(
    pd.get_dummies(store["Assortment"]).rename(columns=lambda x: "Assortment" + "_" + str(x))
)

# Normalise the holiday codes, then one-hot encode them the same way
train["StateHoliday"] = [mychange(x) for x in train.StateHoliday]
test["StateHoliday"] = [mychange(x) for x in test.StateHoliday]

train = train.drop("StateHoliday", axis=1).join(
    pd.get_dummies(train["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)
test = test.drop("StateHoliday", axis=1).join(
    pd.get_dummies(test["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)

# Attach the per-store attributes
train = pd.merge(train, store, on="Store")
test = pd.merge(test, store, on="Store")

repeat = 1
print("Splitting data...")
for i in range(repeat):
    # Every test column except targets, raw dates and the row id is a feature
    features = [col for col in test.columns
                if col not in ["Customers", "Sales", "Date", "LogSale", "datetimes", "Id"]]
    rf = RandomForestRegressor(n_estimators=100)
    print("Starting training...")
    # Missing values are encoded as -1 for both fitting and prediction
    rf.fit(train[features].fillna(-1), train.LogSale)

    # The model predicts LogSale; exp(.) - 1 maps it back to sales
    test["mypred"] = rf.predict(test[features].fillna(-1))
    test["mypred"] = np.exp(test["mypred"]) - 1

    test["Sales"] = test.mypred
    test[["Id", "Sales"]].to_csv("rand_for_kag_v4-9.csv", index=False)
# In[10]:

# Replace missing values by 0 in both frames
Train = Train.fillna(0)
Test = Test.fillna(0)
# print Train.head()
# print Test.head()

# In[11]:

print('Train Random Forests!')

# Public import path; ``sklearn.ensemble.forest`` is a private module that is
# no longer available in current scikit-learn releases.
from sklearn.ensemble import RandomForestRegressor

RF = RandomForestRegressor(n_estimators = 500, random_state = 0)

# In[12]:

# Draw 400000 index labels (with replacement) for an optional subsample.
# .loc replaces the removed .ix indexer; the labels come from Train.index,
# so label-based selection is what .ix did here.
Rows = np.random.choice(Train.index.values, 400000)
Sampled_Train = Train.loc[Rows]
Sample_Train_Target = Train_Target.loc[Rows]
# The subsample fit is disabled; the forest is trained on the full data.
# RF.fit(Sampled_Train, Sample_Train_Target)
RF.fit(Train, Train_Target)

# In[ ]:

print('Predict!')
store_pkl(pipeline, name + ".pkl") mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"]) store_csv(mpg, name + ".csv") build_auto(AdaBoostRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 17), "AdaBoostAuto") build_auto(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 2), "DecisionTreeAuto", compact = True) build_auto(BaggingRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 3, max_features = 0.5), "DecisionTreeEnsembleAuto") build_auto(DummyRegressor(strategy = "median"), "DummyAuto") build_auto(ElasticNetCV(random_state = 13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(random_state = 13, min_samples_leaf = 5), "ExtraTreesAuto") build_auto(GradientBoostingRegressor(random_state = 13, init = None), "GradientBoostingAuto", compact = True) build_auto(LassoCV(random_state = 13), "LassoAuto") build_auto(OptimalLGBMRegressor(objective = "regression", n_estimators = 17, num_iteration = 11), "LGBMAuto", compact = True) build_auto(LinearRegression(), "LinearRegressionAuto") build_auto(BaggingRegressor(LinearRegression(), random_state = 13, max_features = 0.75), "LinearRegressionEnsembleAuto") build_auto(RandomForestRegressor(random_state = 13, min_samples_leaf = 3), "RandomForestAuto", compact = True) build_auto(RidgeCV(), "RidgeAuto") build_auto(OptimalXGBRegressor(objective = "reg:linear", ntree_limit = 31), "XGBAuto", compact = True) auto_na_X, auto_na_y = load_auto("AutoNA.csv") auto_na_X["cylinders"] = auto_na_X["cylinders"].fillna(-1).astype(int) auto_na_X["model_year"] = auto_na_X["model_year"].fillna(-1).astype(int) auto_na_X["origin"] = auto_na_X["origin"].fillna(-1).astype(int) def build_auto_na(regressor, name): mapper = DataFrameMapper( [([column], [ContinuousDomain(missing_values = None), Imputer()]) for column in ["acceleration", "displacement", "horsepower", "weight"]] + [([column], [CategoricalDomain(missing_values = 
-1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]] ) pipeline = PMMLPipeline([
# Fragment: repeats a random 80/20 hold-out evaluation `repeat` times and
# collects the train/test errors. train, cols, repeat, rmspe, train_results
# and test_results are defined outside this fragment.
for i in range(repeat):
    # Fresh random split; the parts are wrapped back into DataFrames with
    # the column names in `cols`
    newtrain, newtest = train_test_split(train, test_size = 0.2)
    newtrain = pd.DataFrame(newtrain, columns = cols)
    newtest = pd.DataFrame(newtest, columns = cols)

    # (An older, disabled variant rebuilt the test features from the raw
    # test table here.)

    # Everything except targets and raw date columns is a feature
    features = [col for col in newtrain.columns if col not in ['Customers', 'Sales', 'Date','LogSale','datetimes']]
    #
    rf = RandomForestRegressor(n_estimators=100)
    print('Starting training...')
    # Missing values are encoded as -1; the target is the log-sales column
    rf.fit(newtrain[features].fillna(-1),newtrain.LogSale)
    print('Predicting train values...')
    # exp(.) - 1 maps the LogSale prediction back to sales
    newtrain['mypred'] = rf.predict(newtrain[features].fillna(-1))
    newtrain['mypred'] = np.exp(newtrain['mypred'])-1
    # Rows with zero sales are left out of the error computation
    train_error = rmspe(newtrain[newtrain.Sales>0].Sales,newtrain[newtrain.Sales>0].mypred)
    print('train set error',train_error)
    newtest['mypred'] = rf.predict(newtest[features].fillna(-1))
    newtest['mypred'] = np.exp(newtest['mypred'])-1
    test_error = rmspe(newtest[newtest.Sales>0].Sales,newtest[newtest.Sales>0].mypred)
    print('test set error',test_error)
    train_results.append(train_error)
    test_results.append(test_error)
print('mean train error', np.mean(train_results))
# We aren't classifying or separating samples according to their metadata,
# so we can just use all of the samples
x = data.abun_df[otu].values
# note: y is built by iterating over the abundance index so that each x and y
# belong to the same sample
y = [float(data.meta_df.loc[smpl, 'age']) for smpl in data.abun_df.index]
# Public entry point; scipy.stats.stats is a deprecated private namespace.
r, p_val = scipy.stats.spearmanr(x, y)

# Look at scatter plot of OTU abundance vs. age to visualize the correlation
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('OTU #' + otu)
ax.set_ylabel('Age')
ax.text(0.01, 0.95, r'$\rho$ = {:.2f}'.format(r), transform=ax.transAxes)

#%% 3. Build a Random Forest Regressor
## 3.1 Build the regressor
rfreg = RandomForestRegressor(n_estimators=1000, oob_score=True)
# We aren't classifying samples here, so we can just use the whole OTU table
# to build our regression
X = data.abun_df.values
Y = [float(data.meta_df.loc[smpl, 'BMI']) for smpl in data.abun_df.index]
rfreg = rfreg.fit(X, Y)

## 3.1.1 Look at true vs. predicted values from out of bag estimations
fig, ax = plt.subplots()
ax.scatter(Y, rfreg.oob_prediction_)
ax.set_xlabel('True')
ax.set_ylabel('Predicted')
ax.set_title('RF regression on BMI')

## 3.2 Look at the important features in the regression by inspecting the
## forest's feature importances (a random forest has no coefficients)
feats = pd.DataFrame(index=data.abun_df.columns, columns=['importance'],
                     data=rfreg.feature_importances_)
# Fragment: opens the result and log files and, per location, trains a
# random forest on the TW feature set and then sets one up for the TWA
# feature set. The fragment ends inside the loop body. locations, data,
# tw_features, twa_features, timestampData, allObs, log, rmseEval and
# splitDataForXValidation are defined outside this fragment.

# NOTE(review): both files are opened without a context manager; they have
# to be closed explicitly later in the script.
output = open(OUTPUT_FILE, 'w')
output.write("step,rmse_tw,rmse_twa,rmse_combined,accuracy\n")

output_log = open(OUTPUT_LOG_FILE, 'w')

log(output_log, "Generating Rmse RF+TW and Rmse RF+TWA")

# Predictions accumulated over all locations
allPredictionTW = []
allPredictionTWA = []

for location in locations:
    # --- TW features ---
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, tw_features, "target", timestampData)
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    testPredictionTW = model.predict(testX)
    # rmseEval returns a sequence; element 1 is used as the RMSE value
    rmse = str(rmseEval(testY, testPredictionTW)[1])
    log(output_log, "\tTW rmse: " + rmse)
    # Observations are collected once, alongside the TW predictions
    for x in testY:
        allObs.append(x)
    for x in testPredictionTW:
        allPredictionTW.append(x)

    # --- TWA features ---
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, twa_features, "target", timestampData)
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
# Fragment: trains a random forest on the Rossmann sales data and predicts
# the competition test set. rossman, rossman_test and feature_engineering
# are defined outside this fragment.
features = [
    'Store', 'SchoolHoliday', 'Promo', 'cmp msr', 'IsPromotionMonth', 'Year',
    'Month', 'Day', 'DayOfTheWeek', 'WeekOfTheYear', 'StoreType',
    'CompetitionOpenSinceMonth', 'CompetitionDistance', 'PromoOpen'
]  # Features used for prediction

feature_engineering(rossman)
feature_engineering(rossman_test)

X = rossman[features]
y = rossman.Sales  # The value we are going to predict

train_features, test_features, train_predict, test_predict = train_test_split(
    X, y)

randomForest = RandomForestRegressor(n_estimators=35)
randomForest.verbose = True
# NOTE(review): the forest is fitted on ALL rows, so the predictions on
# test_features below are in-sample and not a hold-out estimate.
randomForest.fit(X, y)

# NOTE(review): newer scikit-learn names this scorer
# 'neg_mean_squared_error'; kept as-is to match the cross_validation module
# this script uses.
errorValue = cross_validation.cross_val_score(randomForest,
                                              rossman[features],
                                              y,
                                              scoring='mean_squared_error',
                                              cv=3)

predicted_value = randomForest.predict(test_features)
predicted_value = np.array(predicted_value)
test_predict = np.array(test_predict)

# Predict with exactly the columns (and column order) the model was fitted
# on; passing the whole engineered frame fails as soon as it carries any
# other column.
finalResult = randomForest.predict(rossman_test[features])
class MLCms:
    """
    Random-forest workflow: load a CSV, tune and train a model (or reload a
    pickled one), report feature importances and run forecasts.

    The task (classification vs. regression), paths and variable lists are
    all read from the project's ``constants`` module.

    NOTE(review): several methods read attributes that ``__init__`` does not
    set (``crop``, ``country``, ``crop_lname``, ``season``, ``time_peak_ndvi``,
    ``debug``, ``df_global``, ``forecast_yr``, ``dict_cc``); they have to be
    assigned elsewhere before those methods are called.
    """

    def __init__(self, config_file=''):
        """
        :param config_file: path handed to ``SafeConfigParser.read``
        """
        # Parse config file
        self.parser = SafeConfigParser()
        self.parser.read(config_file)

        # machine learning specific variables
        self.classify = constants.DO_CLASSIFICATION  # Regress or classify?
        self.vars_features = constants.fixed_vars
        self.vars_target = constants.ML_TARGETS

        # Both branches use the same target; they differ in task name and estimator
        if self.classify:
            self.var_target = constants.ML_TARGETS
            self.task = 'classification'
            self.model = RandomForestClassifier(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)
        else:
            self.var_target = constants.ML_TARGETS
            self.task = 'regression'
            self.model = RandomForestRegressor(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)  # SVR()

        # Get path to input
        self.path_inp = constants.base_dir + os.sep + constants.name_inp_fl

        # Output directory is <dir>_<classification>_<2014>
        self.path_out_dir = constants.out_dir
        utils.make_dir_if_missing(self.path_out_dir)

        # Model pickle
        self.path_pickle_model = self.path_out_dir + os.sep + constants.model_pickle
        self.path_pickle_features = self.path_out_dir + os.sep + 'pickled_features'

    def output_model_importance(self, gs, name_gs, num_cols):
        """
        Rank the features by importance, plot the top ones and write the
        ranking to ``<task>_importance_<crop>.csv`` in the output directory.

        :param gs: fitted search object; its ``best_estimator_`` must be a
            pipeline exposing ``named_steps``
        :param name_gs: name of the pipeline step whose
            ``feature_importances_`` are reported
        :param num_cols: number of top-ranked features written to the CSV
        :return: None
        """
        rows_list = []
        name_vars = []

        feature_importance = gs.best_estimator_.named_steps[name_gs].feature_importances_
        # Scale so that the most important feature scores 100
        importances = 100.0 * (feature_importance / feature_importance.max())

        # Spread of the importances across the individual trees of self.model
        std = np.std([tree.feature_importances_ for tree in self.model.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]  # most important first

        # Store feature ranking in a dataframe
        # NOTE(review): names are looked up in self.vars_features by the
        # importance index; this assumes that list is ordered like the
        # columns the step was fitted on -- confirm.
        for f in range(num_cols):
            dict_results = {'Variable': self.vars_features[indices[f]], 'Importance': importances[indices[f]]}
            name_vars.append(self.vars_features[indices[f]])
            rows_list.append(dict_results)

        df_results = pd.DataFrame(rows_list)
        num_cols = 10 if len(indices) > 10 else len(indices)  # Plot upto a maximum of 10 features
        plot.plot_model_importance(num_bars=num_cols, xvals=importances[indices][:num_cols],
                                   std=std[indices][:num_cols], fname=self.task + '_importance_' + self.crop,
                                   title='Importance of variable (' + self.country + ' ' + self.crop_lname + ')',
                                   xlabel=name_vars[:num_cols], out_path=self.path_out_dir)

        df_results.to_csv(self.path_out_dir + os.sep + self.task + '_importance_' + self.crop + '.csv')

    def get_data(self):
        """
        Read the input CSV and split it into training and testing parts.

        :return: ``(cols, splits)`` where ``cols`` are the retained column
            names and ``splits`` is the 4-item result of ``train_test_split``
            (data_train, data_test, target_train, target_test), 20% test
        """
        df = pd.read_csv(self.path_inp)
        # Keep the columns that are NOT listed in self.vars_features
        cols = [col for col in df.columns if col not in self.vars_features]
        # cols.extend(['DI', 'PI'])

        # Add information on PI and DI of soils
        # iterate over each row, get lat and lon
        # Find corresponding DI and PI
        # NOTE(review): the raster lookup that filled vals_di / vals_pi is
        # disabled, so lat_lons, vals_di and vals_pi are currently unused.
        lat_lons = zip(df['Long_round'], df['Lat_round'])
        vals_di = []
        vals_pi = []

        df = df[cols]

        # The first retained column is excluded from the feature matrix
        data = df.as_matrix(columns=cols[1:])
        target = df.as_matrix(columns=[self.var_target]).ravel()
        # Get training and testing splits
        splits = train_test_split(data, target, test_size=0.2)

        return cols, splits

    def train_ml_model(self):
        """
        Train the model with feature selection and a hyper-parameter grid
        search, or load it from disk when a pickled model already exists.

        Side effects: replaces ``self.model`` (and, when loading,
        ``self.vars_features``) and writes pickles, plots and a CSV to the
        output directory.

        :return: NOTE(review): returns ``df_cc``, which is never assigned in
            this method -- as written the final ``return`` raises NameError.
        """
        logger.info('#########################################################################')
        logger.info('train_ml_model')
        logger.info('#########################################################################')

        ######################################################
        # Load dataset
        ######################################################
        cols, splits = self.get_data()
        data_train, data_test, target_train, target_test = splits

        # (A disabled experiment with ExtraTreesRegressor / SVR used to live here.)

        if not os.path.isfile(self.path_pickle_model):
            # For details in scikit workflow: See http://stackoverflow.com/questions/
            # 35256876/ensuring-right-order-of-operations-in-random-forest-classification-in-scikit-lea
            # TODO Separate out a dataset so that even the grid search cv can be tested
            ############################
            # Select features from model
            ############################
            logger.info('Selecting important features from model')
            # NOTE(review): both branches build the same regressor, also for
            # the classification task.
            if self.classify:
                rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)
            else:
                rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)

            feat_selection = SelectFromModel(rf_feature_imp)

            # Feature selection followed by the estimator chosen in __init__
            pipeline = Pipeline([
                ('fs', feat_selection),
                ('clf', self.model),
            ])

            #################################
            # Grid search for best parameters
            #################################
            # C_range / gamma_range belong to the disabled SVR grid entries below
            C_range = np.logspace(-2, 10, 13)
            gamma_range = np.logspace(-9, 3, 13)

            logger.info('Tuning hyperparameters')
            param_grid = {
                'fs__threshold': ['mean', 'median'],
                'fs__estimator__max_features': ['auto', 'log2'],
                'clf__max_features': ['auto', 'log2'],
                'clf__n_estimators': [1000, 2000]
                #'clf__gamma': np.logspace(-9, 3, 13),
                #'clf__C': np.logspace(-2, 10, 13)
            }

            gs = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, n_jobs=constants.ncpu, error_score=np.nan)
            # Fit the data before getting the best parameter combination. Different data sets will have
            # different optimized parameter combinations, i.e. without data, there is no optimal parameter combination.
            gs.fit(data_train, target_train)
            logger.info(gs.best_params_)

            data_test = pd.DataFrame(data_test, columns=cols[1:])

            # Update features that should be used in model: the column NAMES
            # are pushed through the fitted selector to obtain the kept names
            selected_features = gs.best_estimator_.named_steps['fs'].transform([cols[1:]])
            cols = selected_features[0]
            data_test = data_test[cols]

            # Update model with the best parameters learnt in the previous step
            self.model = gs.best_estimator_.named_steps['clf']

            predict_val = self.model.predict(data_test)
            results = compute_stats.ols(predict_val.tolist(), target_test.tolist())
            print results.rsquared
            print cols
            # NOTE(review): interactive debugging left in -- plt.show() blocks
            # and pdb.set_trace() drops into the debugger on every training run.
            plt.scatter(target_test, predict_val)
            plt.show()
            pdb.set_trace()
            ###################################################################
            # Output and plot importance of model features, and learning curves
            ###################################################################
            self.output_model_importance(gs, 'clf', num_cols=len(cols[1:]))

            if constants.plot_model_importance:
                # NOTE(review): data, target and k_fold are not defined in
                # this method; this branch depends on names from elsewhere.
                train_sizes, train_scores, test_scores = learning_curve(self.model, data, target, cv=k_fold,
                                                                        n_jobs=constants.ncpu)
                plot.plot_learning_curve(train_scores, test_scores, train_sizes=train_sizes, fname='learning_curve',
                                         ylim=(0.0, 1.01), title='Learning curves', out_path=self.path_out_dir)

            # Save the model to disk
            logger.info('Saving model and features as pickle on disk')
            with open(self.path_pickle_model, 'wb') as f:
                cPickle.dump(self.model, f)
            with open(self.path_pickle_features, 'wb') as f:
                cPickle.dump(self.vars_features, f)
        else:
            # Read model from pickle on disk
            # NOTE(review): unpickling executes code from the file; only load
            # pickles this project wrote itself.
            with open(self.path_pickle_model, 'rb') as f:
                logger.info('Reading model from pickle on disk')
                self.model = cPickle.load(f)

            logger.info('Reading features from pickle on disk')
            self.vars_features = pd.read_pickle(self.path_pickle_features)

        return df_cc

    def do_forecasting(self, df_forecast, mon_names, available_target=False, name_target='yield'):
        """
        1. Does classification/regression based on already built model.
        2. Plots confusion matrix for classification tasks, scatter plot for regression
        3. Plots accuracy statistics for classification/regression
        :param df_forecast: DataFrame holding the columns in ``self.vars_features``
            (and ``name_target`` when ``available_target`` is True)
        :param mon_names: string used in plot titles and output file names
        :param available_target: Is target array available?
        :param name_target: Name of target array (defaults to yield)
        :return: ``(predicted, stats)``; ``stats`` holds RMSE/MAE/r^2/Bias for
            regression, Accuracy/Precision (in percent) for classification, and
            NaN placeholders when no target is available
        """
        data = df_forecast.as_matrix(columns=self.vars_features)  # convert dataframe column to matrix
        predicted = self.model.predict(data)

        if available_target:
            expected = df_forecast.as_matrix(columns=[name_target]).ravel()
            if not self.classify:  # REGRESSION
                # Compute stats
                results = compute_stats.ols(predicted.tolist(), expected.tolist())
                bias = compute_stats.bias(predicted, expected)
                rmse = compute_stats.rmse(predicted, expected)
                mae = compute_stats.mae(predicted, expected)

                # Plot!
                plot.plot_regression_scatter(expected, np.asarray(predicted),
                                             annotate=r'$r^{2}$ ' + '{:0.2f}'.format(results.rsquared) + '\n' +
                                             'peak NDVI date: ' + self.time_peak_ndvi.strftime('%b %d'),
                                             xlabel='Expected yield',
                                             ylabel='Predicted yield',
                                             title=mon_names + ' ' + str(int(df_forecast[self.season].unique()[0])),
                                             fname=self.task + '_' + '_'.join([mon_names]) + '_' + self.crop,
                                             out_path=self.path_out_dir)

                # global expected vs predicted
                if self.debug:
                    # any non-existing index will add row
                    self.df_global.loc[len(self.df_global)] = [np.nanmean(expected), np.nanmean(predicted), mon_names,
                                                               self.forecast_yr]

                return predicted, {'RMSE': rmse, 'MAE': mae, r'$r^{2}$': results.rsquared, 'Bias': bias}
            else:  # CLASSIFICATION
                # Convert from crop condition class (e.g. 4) to string (e.g. exceptional)
                expected, predicted = compute_stats.remove_nans(expected, predicted)
                # Transposed so that rows are predicted classes and columns true classes
                cm = confusion_matrix(expected, predicted, labels=self.dict_cc.keys()).T

                # Compute and plot class probabilities
                proba_cc = self.model.predict_proba(data)
                df_proba = pd.DataFrame(proba_cc, columns=self.dict_cc.values())
                plot.plot_class_probabilities(df_proba, fname='proba_' + '_'.join([mon_names]) + '_' + self.crop,
                                              out_path=self.path_out_dir)

                # Plot confusion matrix
                plot.plot_confusion_matrix(cm, normalized=False, fname='cm_' + '_'.join([mon_names]) + '_' + self.crop,
                                           xlabel='True class', ylabel='Predicted class', ticks=self.dict_cc.values(),
                                           out_path=self.path_out_dir)

                # Normalize and plot confusion matrix (each row scaled to sum to 1)
                cm_normalized = normalize(cm.astype(float), axis=1, norm='l1')
                plot.plot_confusion_matrix(cm_normalized, fname='norm_cm_' + '_'.join([mon_names]) + '_' + self.crop,
                                           xlabel='True class', ylabel='Predicted class', normalized=True,
                                           ticks=self.dict_cc.values(), out_path=self.path_out_dir)

                score_accuracy = accuracy_score(expected, predicted) * 100.0
                score_precision = precision_score(expected, predicted, average='weighted') * 100.0

                return predicted, {'Accuracy': score_accuracy, 'Precision': score_precision}
        else:
            # No target to compare against: return placeholder statistics
            return predicted, {'RMSE': np.nan, 'MAE': np.nan, r'$r^{2}$': np.nan, 'Bias': np.nan,
                               'Nash-Sutcliff': np.nan}
'random_state': 3 } adb = AdaBoostRegressor(**params_adb) adb.fit(x_train, y_train) y_pre_adb = adb.predict(x_test) adb_pre.append(y_pre_adb) params_rf = { 'n_estimators': 500, 'max_depth': 10, 'min_samples_split': 2, 'warm_start': True, 'n_jobs': 4, 'oob_score': True, 'max_features': 'log2' } rf = RandomForestRegressor(**params_rf) rf.fit(x_train, y_train) y_pre_rf = rf.predict(x_test) rf_pre.append(y_pre_rf) ### RigeLinearCV = linear_model.RidgeCV(cv=8, normalize=True, gcv_mode='auto', scoring='neg_mean_absolute_error') rcv = RigeLinearCV.fit(x_train, y_train) y_pre_rcv = rcv.predict(x_test) rcv_pre.append(y_pre_rcv) br = BayesianRidge(n_iter=300)
'Soft drinks (inc. fizzy and ready to drink fruit drinks)', 'Alcoholic drink, tobacco and narcotics', 'Alcoholic drinks', 'Spirits and liqueurs (brought home)', 'Wines, fortified wines (brought home)', 'Beer, lager, ciders and perry (brought home)', 'Alcopops (brought home)', 'Tobacco and narcotics1', 'Cigarettes', 'Cigars, other tobacco products and narcotics' ] predictors = feats_of_interest + food_feats1 + food_feats2 target = "admitted" #print len(predictors) # you can change rf to linear_model.LinearRegression() ... RandomForestRegressor() is another version rf = RandomForestRegressor() X = merge_df[predictors] Y = merge_df[target] Y = np.array(Y) ## split data X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) #print rf.fit(X_train, Y_train) #print np.mean((rf.predict(X_test) - Y_test) ** 2) # regressor for all persons as target #print regr.fit(X_train, Y_train) #print regr.coef_ #print np.mean((regr.predict(X_test) - Y_test) ** 2) # regressor for male as target #print merge_df.columns #perform random forest with top k features, print the predictor error