def trainRandomForest(data, columns, targetColumn, parameters):
    """Fit a random-forest regressor on every column except the target.

    Args:
        data: mapping from column name to an indexable sequence of values.
            Every feature column is read at indices
            ``0 .. len(data[targetColumn]) - 1``.
        columns: iterable of candidate column names; ``targetColumn`` is
            filtered out of it.
        targetColumn: name of the column used as the regression target.
        parameters: dict holding ``"estimators"`` plus either ``"depth"``
            (forwarded as ``max_depth``) or ``"leaf"`` (forwarded as
            ``min_samples_leaf``). ``"depth"`` wins when both are present.

    Returns:
        A ``RandomForestModel`` built from the fitted regressor and the list
        of feature column names in training order.

    Raises:
        ValueError: if ``parameters`` has neither ``"depth"`` nor ``"leaf"``.
        KeyError: if ``parameters`` has no ``"estimators"`` entry.
    """
    # Decide the tree-size constraint first so an unusable parameter set is
    # rejected with a clear error instead of an unbound-variable failure at
    # model.fit().
    if "depth" in parameters:
        size_limit = {"max_depth": parameters["depth"]}
    elif "leaf" in parameters:
        size_limit = {"min_samples_leaf": parameters["leaf"]}
    else:
        raise ValueError(
            "parameters must contain either 'depth' or 'leaf'")

    modelColumns = [column for column in columns if column != targetColumn]
    target = data[targetColumn]
    # Transpose the column-oriented input into one feature row per record.
    modelData = [
        [data[column][i] for column in modelColumns]
        for i in range(len(target))
    ]

    model = RandomForestRegressor(
        n_estimators=parameters["estimators"],
        n_jobs=-1,
        random_state=42,
        **size_limit)
    model.fit(modelData, target)
    return RandomForestModel(model, modelColumns)
def evalOne(parameters):
    """Cross-validate a random forest over ``locations`` for one setting.

    For every entry of the module-level ``locations`` the data is split with
    ``splitDataForXValidation``, a fresh forest is trained on the training
    part and its predictions for the test part are collected. All
    observations and predictions are then scored together.

    Args:
        parameters: dict holding ``"n_estimators"`` plus exactly one
            tree-size constraint, looked up in this order: ``"depth"``
            (``max_depth``), ``"leaf"`` (``min_samples_leaf``),
            ``"max_leaf"`` (``max_leaf_nodes``).

    Returns:
        Element ``[1]`` of ``rmseEval(observations, predictions)``
        (presumably the RMSE -- confirm against ``rmseEval``).

    Raises:
        ValueError: if none of the three constraint keys is present.
    """
    # Resolve the constraint once, before any expensive work, so a bad
    # parameter set fails with a clear message rather than an unbound
    # ``model`` inside the loop.
    if "depth" in parameters:
        size_limit = {"max_depth": parameters["depth"]}
    elif "leaf" in parameters:
        size_limit = {"min_samples_leaf": parameters["leaf"]}
    elif "max_leaf" in parameters:
        size_limit = {"max_leaf_nodes": parameters["max_leaf"]}
    else:
        raise ValueError(
            "parameters must contain one of 'depth', 'leaf' or 'max_leaf'")

    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        # A new estimator per fold keeps the folds independent.
        model = RandomForestRegressor(
            random_state=42,
            n_estimators=parameters["n_estimators"],
            n_jobs=-1,
            **size_limit)
        model.fit(trainX, trainY)
        all_obs.extend(testY)
        all_pred.extend(model.predict(testX))
    return rmseEval(all_obs, all_pred)[1]
def getModels():
    """Return the candidate regressors keyed by a short label.

    Returns:
        dict with the keys ``'dt'``, ``'rf1'``, ``'rf2'`` and ``'gbr'``, each
        mapped to a freshly constructed, unfitted estimator.
    """
    # An 'abr' entry (AdaBoostRegressor(n_estimators=128)) is intentionally
    # left disabled.
    return {
        'dt': DecisionTreeRegressor(max_depth=50),
        'rf1': RandomForestRegressor(),
        'rf2': RandomForestRegressor(n_estimators=128, max_depth=15),
        'gbr': GradientBoostingRegressor(n_estimators=128, max_depth=5,
                                         learning_rate=1.0),
    }
def test_boston_housing_no_fit_invalid(self):
    """predict() and score() must raise AssertionError before fit()."""
    (train_inputs, train_targets), (test_inputs, test_targets) = \
        TestUtil.get_boston_housing()

    forest = RandomForestRegressor(n_estimators=64, max_depth=5,
                                   random_state=1)
    forest.fit(train_inputs, train_targets)

    builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu",
                              p_dropout=0.2, verbose=0, batch_size=32,
                              learning_rate=0.001, num_epochs=2,
                              early_stopping_patience=128)
    explainer = CXPlain(forest, builder, ZeroMasking(), mean_squared_error)

    # The explainer itself is deliberately never fitted.
    with self.assertRaises(AssertionError):
        explainer.predict(test_inputs, test_targets)

    with self.assertRaises(AssertionError):
        explainer.score(test_inputs, test_targets)
def test_boston_housing_valid(self):
    """Fit an explainer end to end and check the shape of its output."""
    (train_inputs, train_targets), (test_inputs, test_targets) = \
        TestUtil.get_boston_housing()

    forest = RandomForestRegressor(n_estimators=64, max_depth=5,
                                   random_state=1)
    forest.fit(train_inputs, train_targets)

    builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu",
                              p_dropout=0.2, verbose=0, batch_size=32,
                              learning_rate=0.001, num_epochs=2,
                              early_stopping_patience=128)
    explainer = CXPlain(forest, builder, ZeroMasking(), mean_squared_error)
    explainer.fit(train_inputs, train_targets)

    # One output unit per (flattened) input feature.
    self.assertEqual(explainer.prediction_model.output_shape,
                     (None, np.prod(test_inputs.shape[1:])))

    # The scores are not inspected; the calls only have to succeed.
    explainer.score(test_inputs, test_targets)
    explainer.get_last_fit_score()

    attributions = explainer.predict(test_inputs)
    self.assertTrue(attributions.shape == test_inputs.shape)
def test_boston_housing_confidence_level_invalid(self):
    """predict() must raise ValueError for each invalid confidence level."""
    (train_inputs, train_targets), (test_inputs, _) = \
        TestUtil.get_boston_housing()

    forest = RandomForestRegressor(n_estimators=64, max_depth=5,
                                   random_state=1)
    forest.fit(train_inputs, train_targets)

    builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu",
                              p_dropout=0.2, verbose=0, batch_size=32,
                              learning_rate=0.001, num_epochs=3,
                              early_stopping_patience=128)
    explainer = CXPlain(forest, builder, ZeroMasking(), mean_squared_error,
                        num_models=2)
    explainer.fit(train_inputs, train_targets)

    for bad_level in (1.01, -0.5, -0.01):
        with self.assertRaises(ValueError):
            explainer.predict(test_inputs, confidence_level=bad_level)
def RF_ST(trainFileName, testFilename):
    """Train one random forest per store and predict that store's test rows.

    Both files are loaded with ``ld.LoadData_DATA_ST`` and indexed by the
    store keys ``'1'`` .. ``'5'``. For every row, the fields from the third
    one onward are converted to ``float`` and the first two converted values
    are skipped; in training rows the last value is the target.

    Returns:
        A list of ``[row[0], row[1], prediction]`` entries for all test rows
        of all stores, where ``prediction`` is clamped at zero and formatted
        with four decimals.
    """
    train_by_store = ld.LoadData_DATA_ST(trainFileName)
    test_by_store = ld.LoadData_DATA_ST(testFilename)

    results = []
    for store_id in ('1', '2', '3', '4', '5'):
        features = []
        targets = []
        for row in train_by_store[store_id]:
            values = [float(x) for x in row[2:]]
            features.append(values[2:-1])
            targets.append(values[-1])

        row_keys = []
        test_features = []
        for row in test_by_store[store_id]:
            row_keys.append((row[0], row[1]))
            values = [float(x) for x in row[2:]]
            test_features.append(values[2:])

        forest = RandomForestRegressor(n_estimators=100, criterion='mse',
                                       max_depth=None, max_features='auto')
        forest.fit(features, targets)

        # row_keys and test_features are built in lockstep, so predictions
        # pair up one-to-one with the keys.
        for (first, second), predicted in zip(row_keys,
                                              forest.predict(test_features)):
            results.append([first, second, '%.4f' % max(predicted, 0)])
    return results
def __init__(self, config_file=''):
    """Read the config file and set up the estimator and output paths.

    Args:
        config_file: path handed to ``SafeConfigParser.read``.

    Side effects:
        Calls ``utils.make_dir_if_missing`` on the output directory taken
        from ``constants.out_dir``.
    """
    # Parse config file
    self.parser = SafeConfigParser()
    self.parser.read(config_file)

    # Machine-learning specific settings.
    self.classify = constants.DO_CLASSIFICATION  # Regress or classify?
    self.vars_features = constants.fixed_vars
    self.vars_target = constants.ML_TARGETS

    # Both tasks use the same target and the same forest settings; only the
    # estimator class and the task label differ.
    self.var_target = constants.ML_TARGETS
    if self.classify:
        self.task = 'classification'
        forest_cls = RandomForestClassifier
    else:
        self.task = 'regression'
        forest_cls = RandomForestRegressor  # SVR() noted as an alternative
    self.model = forest_cls(n_estimators=2500, n_jobs=constants.ncpu,
                            random_state=0)

    # Path to the input file.
    self.path_inp = os.sep.join((constants.base_dir, constants.name_inp_fl))

    # Output directory is <dir>_<classification>_<2014>
    self.path_out_dir = constants.out_dir
    utils.make_dir_if_missing(self.path_out_dir)

    # Pickle locations for the model and the features.
    self.path_pickle_model = os.sep.join(
        (self.path_out_dir, constants.model_pickle))
    self.path_pickle_features = os.sep.join(
        (self.path_out_dir, 'pickled_features'))
def build_other_learners(train_x, train_y):
    """Build and fit the four baseline learners.

    The learners are wrapped in ``SimpleLearner`` under the names ``"rf"``,
    ``"gb"``, ``"linearSVR"`` and ``"svr"``. Learners named ``"linearSVR"``
    or ``"pa"`` are fitted on inputs transformed by their own ``scaler``;
    all others are fitted on the raw inputs. (No learner named ``"pa"`` is
    created here.)

    Args:
        train_x: training features, passed to ``fit`` / ``scaler.fit``.
        train_y: training targets.

    Returns:
        The list of fitted ``SimpleLearner`` objects, in creation order.
    """
    simple_learners = [
        SimpleLearner(
            "rf",
            RandomForestRegressor(n_jobs=-1, max_features=0.6,
                                  n_estimators=2, max_depth=8)),
        SimpleLearner(
            "gb",
            GradientBoostingRegressor(n_estimators=10, loss='huber',
                                      learning_rate=0.5, max_depth=4)),
        SimpleLearner(
            "linearSVR",
            LinearSVR(intercept_scaling=64, C=128, max_iter=1000, dual=False,
                      loss='squared_epsilon_insensitive')),
        SimpleLearner("svr", SVR(C=100, epsilon=0.001, gamma=0.00001)),
    ]

    for sl in simple_learners:
        if sl.name in ("linearSVR", "pa"):
            sl.scaler.fit(train_x)
            sl.fit(sl.scaler.transform(train_x), train_y)
        else:
            sl.fit(train_x, train_y)
        # Single-argument, parenthesised print behaves the same on
        # Python 2 and Python 3.
        print("%s: finish to build the model" % sl.name)
    return simple_learners
def test_boston_housing_load_save_valid(self):
    """Saving then loading an explainer must reproduce its predictions.

    Runs once with a single model and once with two models. Also checks
    that saving into an already used directory with ``overwrite=False``
    raises ``ValueError``.
    """
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5,
                                            random_state=1)
    explained_model.fit(x_train, y_train)
    model_builder = MLPModelBuilder(num_layers=2, num_units=32,
                                    activation="relu", p_dropout=0.2,
                                    verbose=0, batch_size=32,
                                    learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error

    num_models_settings = [1, 2]
    for num_models in num_models_settings:
        explainer = CXPlain(explained_model, model_builder,
                            masking_operation, loss, num_models=num_models)
        explainer.fit(x_train, y_train)
        median_1 = explainer.predict(x_test)

        tmp_dir_name = tempfile.mkdtemp()
        try:
            explainer.save(tmp_dir_name)

            with self.assertRaises(ValueError):
                explainer.save(tmp_dir_name, overwrite=False)

            explainer.save(tmp_dir_name, overwrite=True)
            explainer.load(tmp_dir_name)
            median_2 = explainer.predict(x_test)

            self.assertTrue(np.array_equal(median_1, median_2))
        finally:
            # Cleanup even when a step above fails, so failed runs do not
            # leave temporary directories behind.
            shutil.rmtree(tmp_dir_name)
def eval_one(step):
    """Score one feature subset by cross-validation, with caching.

    Args:
        step: hashable, indexable sequence of flags; ``step[i]`` decides
            whether ``all_features[i]`` is used.

    Returns:
        Element ``[1]`` of ``rmseEval(observations, predictions)`` collected
        over all entries of the module-level ``locations`` (presumably the
        RMSE -- confirm against ``rmseEval``). Results already present in
        ``cached_results`` are returned without recomputation.

    Side effects:
        Stores the result in ``cached_results`` and appends one line of the
        form ``<rmse>;<flag>,<flag>,...`` to ``CACHE_FILE``.
    """
    if step in cached_results:
        return cached_results[step]

    eval_features = [
        feature for i, feature in enumerate(all_features) if step[i]
    ]

    all_predictions = []
    all_observations = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, eval_features, "target")
        model = RandomForestRegressor(min_samples_leaf=2, random_state=42,
                                      n_estimators=650, n_jobs=-1)
        model.fit(trainX, trainY)
        all_observations.extend(testY)
        all_predictions.extend(model.predict(testX))

    rmse = rmseEval(all_observations, all_predictions)[1]
    cached_results[step] = rmse

    # Persist the result; the context manager closes the file even if the
    # write raises.
    step_str = ",".join([str(s) for s in step])
    with open(CACHE_FILE, "a") as cache_output:
        cache_output.write(str(rmse) + ";" + step_str + "\n")

    return rmse
def __init__(self, features=[], target=[], model='ols', tag='train'):
    """Store the data, fit a scaler and pick the regressor for ``model``.

    Args:
        features: feature matrix; stored as ``self.X`` and used to fit a
            ``StandardScaler``.
        target: target values; stored as ``self.y``.
        model: one of ``'ols'``, ``'huber'``, ``'tree'`` or ``'forest'``.
            Any other value leaves ``self.regr`` unset.
        tag: prefix for ``self.tag``, which becomes ``tag + '_' + model``.

    Side effects:
        Creates the directory ``fig/results`` via ``mkdir -p`` and prints
        the new instance.
    """
    # NOTE(review): the list defaults are shared between calls; they are
    # only stored here, never mutated in this method.
    self.tag = tag + '_' + model
    self.outdir = 'fig/results'
    self.model = model

    import os
    os.system('mkdir -p ' + self.outdir)

    # setup analysis
    self.X = features
    self.y = target

    # Scale
    self.scaler = StandardScaler(with_mean=True, with_std=True).fit(self.X)

    if model == 'ols':
        self.regr = skl_lm.LinearRegression()
    elif model == 'huber':
        self.regr = HuberRegressor(fit_intercept=True, alpha=0.0,
                                   max_iter=100, epsilon=1.35)
    elif model == 'tree':
        self.regr = DecisionTreeRegressor(max_depth=6)
    elif model == 'forest':
        self.regr = RandomForestRegressor(n_estimators=10, bootstrap=True,
                                          criterion='mae', max_depth=10,
                                          max_features='auto',
                                          min_samples_leaf=5,
                                          min_samples_split=10,
                                          random_state=0)

    # Parenthesised single-argument print works on Python 2 and Python 3.
    print(self)
def test_moving_average_smoothing_estimator():
    """Check the calibrated pipeline on simulated constant-hazard events."""
    np.random.seed(1)
    n_samples, n_features = 10000, 10

    # Simulate events under a constant hazard exp(X . beta) with iid
    # exponentially distributed exposure times. The draws below must stay in
    # this order to keep the seeded sequence unchanged.
    X = np.random.normal(size=(n_samples, n_features))
    beta = np.random.normal(size=(n_features, 1))
    hazard = np.exp(np.dot(X, beta))
    exposure = np.random.exponential(size=(n_samples, 1))
    rate = np.random.poisson(hazard * exposure) / exposure

    calibrator = GLM(sm.families.Gaussian(sm.families.links.log),
                     add_constant=False)
    classifier = ThresholdClassifier(
        HazardToRiskEstimator(
            MovingAverageSmoothingEstimator(RandomForestRegressor())))
    model = CalibratedEstimatorCV(calibrator, classifier)

    model.fit(X, rate, exposure=exposure)
    y_pred = model.predict(X, exposure)

    # Total predicted events must be within 10% of the observed count.
    n_events = np.sum(rate > 0)
    assert np.abs((np.sum(y_pred) - n_events) / n_events) < .1
    # Every recovered coefficient must be within 0.1 of the simulated one.
    assert np.max(np.abs(model.estimator_.coef_ - beta[:, 0])) < .1
def __init__(self):
    """Initialise the base class and attach a random-forest regressor."""
    super(ItemSetModel, self).__init__()
    # Disabled alternatives kept for reference: DecisionTreeRegressor(),
    # Lasso(0.1), SVR(kernel='rbf'), ElasticNetCV().
    forest_settings = dict(n_estimators=10, max_depth=7)
    self.clf = RandomForestRegressor(**forest_settings)
def baggedModel(X_train, y_train, X_test, y_test, X_holdout, y_holdout):
    """
    Fit an AdaBoost ensemble of random forests and evaluate it twice.

    INPUT: training features/targets, plus features/targets of a test set
           and of a holdout set
    OUTPUT: (test predictions, holdout predictions, test score,
            holdout score); each score is whatever the fitted model's
            ``score`` method returns for that set
    """
    forest_settings = {
        'max_depth': 20,
        'max_features': 'sqrt',
        'min_samples_leaf': 4,
        'min_samples_split': 5,
        'n_estimators': 100,
    }
    booster = AdaBoostRegressor(
        base_estimator=RandomForestRegressor(**forest_settings),
        n_estimators=10,
        random_state=123)

    # Train on the training set only.
    booster.fit(X_train, y_train)

    # Evaluate on the test set.
    test_predictions = booster.predict(X_test)
    test_score = booster.score(X_test, y_test)

    # Evaluate on the holdout set.
    holdout_predictions = booster.predict(X_holdout)
    holdout_score = booster.score(X_holdout, y_holdout)

    return test_predictions, holdout_predictions, test_score, holdout_score
def post(self):
    """Handle an uploaded audio file and render the prediction page.

    Steps:
        1. Save the first file of the ``"audio"`` upload field under
           ``__UPLOADS__`` with a random UUID name and the original
           extension.
        2. Extract attributes from it with ``getAttributes``.
        3. Train a random forest on the rows from ``mongoTolist(False)``
           (all columns but the last as inputs, the last as target) and
           predict a value for the uploaded sample.
        4. Append that prediction to the attributes and pass them, with the
           stored ``theta1`` / ``theta2``, to ``octave.classify``.
        5. Render ``output.html`` with the classification and prediction.
    """
    # upload audio file in server
    voice = self.request.files["audio"][0]
    extn = os.path.splitext(voice['filename'])[1]
    cname = str(uuid.uuid4()) + extn
    # The body is raw upload data: write it in binary mode, and let the
    # context manager close the handle even if the write fails.
    with open(__UPLOADS__ + cname, 'wb') as fh:
        fh.write(voice['body'])

    # get features from the audio file
    attr = getAttributes(cname)
    fdf = mongoTolist(False)
    train = fdf[:, :-1]
    target = fdf[:, -1]

    # RandomForest Regression
    rf = RandomForestRegressor(n_estimators=506, n_jobs=-1)
    rf.fit(train, target)
    updrs_val = rf.predict([attr])
    attr.append(updrs_val[0])

    # get the theta from database
    theta = list(db.theta.find({}))
    theta1 = theta[0]["theta1"]
    theta2 = theta[1]["theta2"]

    # check if the person has Parkinson's Disease
    isParkinson = octave.classify(theta1, theta2, np.array(attr))
    self.render("output.html", ipk=isParkinson, updrs=updrs_val[0])
def eval_one(min_samples_leaf, n_estimators):
    """Group-wise cross-validation of a random forest for one setting.

    Each entry of the module-level ``groups`` is held out once while the
    stations of all other groups are used for training. The score of every
    fold and the score over all folds together are logged.

    Returns:
        Element ``[1]`` of ``rmseEval`` over the pooled observations and
        predictions (presumably the RMSE -- confirm against ``rmseEval``).
    """
    log("min_samples_leaf: " + str(min_samples_leaf) + ", n_estimators: " + str(n_estimators))

    pooled_observations = []
    pooled_predictions = []

    for held_out, test_stations in enumerate(groups):
        train_stations = []
        for idx, members in enumerate(groups):
            if idx != held_out:
                train_stations.extend(members)

        train_station_set = {float(s) for s in train_stations}
        test_station_set = {float(s) for s in test_stations}

        trainX, testX, trainY, testY = splitDataForXValidation(
            train_station_set, test_station_set, "location", data,
            all_features, "target")

        forest = RandomForestRegressor(min_samples_leaf=min_samples_leaf,
                                       n_estimators=n_estimators,
                                       n_jobs=-1, random_state=42)
        forest.fit(trainX, trainY)
        fold_predictions = forest.predict(testX)

        fold_rmse = rmseEval(testY, fold_predictions)[1]
        log("\tALL rmse: " + str(fold_rmse))

        pooled_observations.extend(testY)
        pooled_predictions.extend(fold_predictions)

    overall_rmse = rmseEval(pooled_observations, pooled_predictions)[1]
    log("\tALL rmse:" + str(overall_rmse))
    return overall_rmse
def __init__(self, param_grid, n_evaluations=10, random_state=None, start_evaluations=3,
             n_attempts=10, regressor=None, maximize=True):
    """
    This general method relies on regression.
    The regressor tries to predict the best point based on the results
    already known for different parameters.

    :param OrderedDict param_grid: the grid with parameters to optimize on
    :param int n_evaluations: the number of evaluations to do
    :param random_state: random generator
    :type random_state: int or RandomState or None
    :param int start_evaluations: count of random points generated at the start
    :param int n_attempts: this number of points will be compared on each iteration.
        The regressor chooses the optimal one among them.
    :param regressor: regressor used to choose the next point with the potentially
        best score (the score is estimated by the regressor);
        if None, a RandomForest algorithm is used.
    :param bool maximize: forwarded unchanged to AbstractParameterGenerator.
    """
    AbstractParameterGenerator.__init__(self, param_grid=param_grid,
                                        n_evaluations=n_evaluations,
                                        random_state=random_state,
                                        maximize=maximize)
    self.start_evaluations = start_evaluations
    self.n_attempts = n_attempts
    # Fall back to a small random forest when the caller supplies nothing.
    self.regressor = (
        regressor if regressor is not None
        else RandomForestRegressor(max_depth=3, n_estimators=10,
                                   max_features=0.7))
def make_prediction(self, site_id, label):
    """Fit a random forest for ``label`` and store and export its forecast.

    The method prepares the per-site data via ``_create_X_Y_per_site``,
    runs ``self.xgbooster(label)``, then fits a ``RandomForestRegressor``
    through ``do_classify`` with a fixed parameter grid.

    Args:
        site_id: forwarded to ``_create_X_Y_per_site``.
        label: name of the predicted column; also used in the CSV file name.

    Side effects:
        Sets ``self.r2`` and ``self.features_weighted``, writes the
        predictions into ``self.forecast_predictors[label]``, prints the
        feature weights and writes ``self.forecast_predictors`` to a
        ``;``-separated CSV under ``data/store/``.
    """
    self._create_X_Y_per_site(site_id, label)
    self.xgbooster(label)

    clf_RDM = {
        'params': {
            'n_estimators': [300],
            'bootstrap': [True],
            'criterion': ['mse']
        },
        'clf': RandomForestRegressor()
    }

    # Only the fitted estimator and the r2 value of the result are used.
    clf_rdm, _, _, _, _, r2 = do_classify(
        clf_RDM['clf'], clf_RDM['params'], self.X_training, self.y)

    prediction = clf_rdm.predict(self.X_test)
    self.r2 = r2
    self.forecast_predictors[label] = pd.Series(
        prediction, index=self.forecast_predictors.index)

    self.features_weighted = get_features_importance(
        clf_rdm, self._get_features())
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print(self.features_weighted)

    self.forecast_predictors.to_csv(get_file_path(
        "data/store/" + self.name + "_predictions_" +
        self.datastore.period + "_" + label + ".csv", fileDir), sep=";")
def randomforestregressor(self, testlen, ntrain, ntrees, nodes):
    """Walk-forward random-forest prediction of ``closeratio``.

    The sorted unique values of ``hsmadata['date']`` are cut into windows of
    ``testlen`` dates. For each window ``i`` from ``ntrain`` onward, a forest
    is trained on the rows dated from the start of window ``i - ntrain`` up
    to (excluding) the date ``self.day`` positions before window ``i``, and
    used to predict the rows of window ``i``.

    Args:
        testlen: number of unique dates per window.
        ntrain: number of windows preceding the test window that delimit
            the training period.
        ntrees: forwarded as ``n_estimators``.
        nodes: forwarded as ``min_samples_leaf``; ``min_samples_split`` is
            set to ``nodes * 2``.

    Returns:
        DataFrame with the rows of all test windows plus a ``predratio``
        column; an empty DataFrame when there is no window to test.
    """
    hsmadata = self.hsmadata
    dates = pd.Series(hsmadata['date'].unique()).sort_values()
    dates.index = range(0, len(dates))
    ntest = len(dates) // testlen

    windows = []
    for i in range(ntrain, ntest):
        traindata = hsmadata[
            (hsmadata['date'] >= dates[(i - ntrain) * testlen]) &
            (hsmadata['date'] < dates[i * testlen - self.day])].copy()

        in_test = hsmadata['date'] >= dates[i * testlen]
        test_end = (i + 1) * testlen
        # When the number of dates is an exact multiple of testlen, the last
        # window has no following date to serve as its upper bound; it then
        # simply runs to the end of the data.
        if test_end < len(dates):
            in_test = in_test & (hsmadata['date'] < dates[test_end])
        testdata = hsmadata[in_test].copy()

        # The first two columns are not used as model inputs.
        traindata = traindata.iloc[:, 2:]
        traindatax = traindata.drop(['closeratio'], axis=1)
        traindatay = traindata['closeratio']
        testdatax = testdata[traindatax.columns]

        treemodel = RandomForestRegressor(
            n_estimators=ntrees,
            min_samples_split=nodes * 2,
            min_samples_leaf=nodes)
        treemodel.fit(traindatax, traindatay)
        testdata['predratio'] = treemodel.predict(testdatax)

        windows.append(testdata)

    # Concatenate once instead of re-copying the accumulated frame on every
    # iteration.
    if not windows:
        return pd.DataFrame()
    return pd.concat(windows, ignore_index=True)
def train_model(self, X_train, Y_train):
    """Fit a random forest on the given data and dump it with joblib.

    The fitted estimator is kept in ``self.estimator`` and written to
    ``simple_reg_model/<frame_len>_<predict_dist>.pkl``.
    """
    name_parts = (self.frame_len, self.predict_dist)
    print("training model %d_%d.pkl" % name_parts)
    model_name = "simple_reg_model/%d_%d.pkl" % name_parts

    self.estimator = RandomForestRegressor(
        n_jobs=-1,
        n_estimators=100,
        random_state=0,
    )
    self.estimator.fit(X_train, Y_train)
    print("finish training model")

    joblib.dump(self.estimator, model_name)
def calcRandomForest(channels_training, channels_testing, target_training, target_testing):
    """Fit a 500-tree forest and pair its predictions with the test targets.

    ``max_features`` is set to the length of the first training row.

    Returns:
        ``(fitted_model, [predictions, target_testing])``.
    """
    n_channels = len(channels_training[0])
    forest = RandomForestRegressor(
        n_estimators=500, max_features=n_channels,
    ).fit(channels_training, target_training)
    return forest, [forest.predict(channels_testing), target_testing]
def __init__(self):
    """Build the three-step pipeline stored in ``self.clf``."""
    from sklearn.pipeline import Pipeline

    # NOTE(review): the steps labelled 'vect' and 'tfidf' hold tree-based
    # regressors. Intermediate Pipeline steps are expected to provide
    # ``transform`` -- confirm this works with the scikit-learn version in
    # use.
    stages = [
        ('vect', DecisionTreeRegressor()),
        ('tfidf', RandomForestRegressor()),
        ('clf', BayesianRidge(compute_score=True)),
    ]
    self.clf = Pipeline(stages)
def randomForest(trainFeatures, trainResponses, testFeatures, maxFeatures = 'log2', nTree=100):
    """Train a random-forest regressor and predict the test responses.

    Args:
        trainFeatures: training inputs.
        trainResponses: training targets.
        testFeatures: inputs to predict.
        maxFeatures: forwarded as ``max_features``.
        nTree: forwarded as ``n_estimators``.

    Returns:
        The predictions of the fitted forest for ``testFeatures``.
    """
    forest = RandomForestRegressor(max_features=maxFeatures,
                                   n_estimators=nTree)
    forest.fit(trainFeatures, trainResponses)
    return forest.predict(testFeatures)
def evalTrainStationTestStation(trainStation, testStation, features):
    """Train on one station, apply the model to another and report the score.

    Training data comes from ``dataByStation[trainStation]`` and test data
    from ``dataByStation[testStation]``, both obtained through
    ``splitDataForXValidation``.

    Returns:
        Element ``[1]`` of ``rmseEval(observations, predictions)``
        (presumably the RMSE -- confirm against ``rmseEval``); the same
        value is printed.
    """
    trainX, _unused_x, trainY, _unused_y = splitDataForXValidation(
        {trainStation}, set(), "location",
        dataByStation[trainStation], features, "target")
    _unused_x, testX2, _unused_y, testY2 = splitDataForXValidation(
        set(), {testStation}, "location",
        dataByStation[testStation], features, "target")

    forest = RandomForestRegressor(max_depth=10, n_estimators=60,
                                   n_jobs=-1, random_state=42)
    forest.fit(trainX, trainY)

    rmse = rmseEval(testY2, forest.predict(testX2))[1]
    print("Training on station " + str(trainStation) + ", applying on station " + str(testStation) + ": rmse: " + str(rmse))
    return rmse
def __init__(self, param_grid, n_evaluations=10, random_state=None, start_evaluations=3,
             n_attempts=5, regressor=None):
    """
    Set up a regression-guided parameter generator.

    :param param_grid: forwarded to AbstractParameterGenerator
    :param n_evaluations: forwarded to AbstractParameterGenerator
    :param random_state: forwarded to AbstractParameterGenerator
    :param start_evaluations: stored as ``self.start_evaluations``
    :param n_attempts: stored as ``self.n_attempts``
    :param regressor: stored as ``self.regressor``; if None, a small
        RandomForestRegressor is created instead.
    """
    AbstractParameterGenerator.__init__(self, param_grid=param_grid,
                                        n_evaluations=n_evaluations,
                                        random_state=random_state)
    self.start_evaluations = start_evaluations
    self.n_attempts = n_attempts
    # Fall back to a small random forest when the caller supplies nothing.
    self.regressor = (
        regressor if regressor is not None
        else RandomForestRegressor(max_depth=3, n_estimators=10,
                                   max_features=0.7))
def trainModel(self, column):
    """Prepare the training inputs for ``column`` and fit a random forest.

    A regressor is used for the columns ``'Salary'``, ``'DOJ'`` and
    ``'DOL'``; every other column gets a classifier. The fitted estimator is
    stored in ``self.clf``.
    """
    self.prepareTrainingInputs(column)

    # LinearRegression() is left as a disabled alternative.
    if column in ('Salary', 'DOJ', 'DOL'):
        forest_cls, kind = RandomForestRegressor, 'Regressor'
    else:
        forest_cls, kind = RandomForestClassifier, 'Classifier'

    self.clf = forest_cls(n_estimators=100, n_jobs=2)
    print(kind)

    self.clf = self.clf.fit(self.X_train, self.y_train)
def fitDurationEstimator(self, modelType="RF"):
    """
    Fit duration model with specified regressor type (Random forest by default).

    For ``modelType == "RF"`` a new RandomForestRegressor is created and
    stored in ``self.durationEstimator``. For any other value the estimator
    already present in ``self.durationEstimator`` is fitted as is.
    The estimator is fitted on ``self.X`` and ``self.durationData``.
    """
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print("fitting charging duration model...")
    if modelType == "RF":
        self.durationEstimator = RandomForestRegressor(random_state=0,
                                                       n_estimators=50,
                                                       max_depth=50)
    self.durationEstimator.fit(self.X, self.durationData)
def RF_ALL(trainFileName, testFileName):
    """Train a single random forest on all data and predict every test item.

    Training data comes from ``ld.LoadData_DATA_LABEL_ITEM`` and the
    evaluation data from ``ld.LoadData_DATA_ITEM``.

    Returns:
        A list of ``[item, 'all', prediction]`` entries, where
        ``prediction`` is clamped at zero and formatted with four decimals.
    """
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)

    forest = RandomForestRegressor(n_estimators=100, criterion='mse',
                                   max_depth=None, max_features='auto',
                                   bootstrap=True)
    forest.fit(train_X, train_y)
    predictions = forest.predict(Eval_X)

    # One prediction per evaluation row; items are looked up by the same
    # position.
    return [
        [items[pos], 'all', '%.4f' % max(predictions[pos], 0)]
        for pos in range(len(Eval_X))
    ]
def modeltrain(X_train, y_train, X_test, y_test):
    """Fit a random forest and return integer predictions for both sets.

    Args:
        X_train: training inputs.
        y_train: training targets.
        X_test: test inputs.
        y_test: not used by this function.

    Returns:
        ``(y_train_predict, y_test_predict, rgr)``: the predictions for the
        training and test inputs, each cast with ``astype(int)``, and the
        fitted regressor.
    """
    # Import from the public package path; the private
    # ``sklearn.ensemble.forest`` module is not available in newer
    # scikit-learn releases.
    from sklearn.ensemble import RandomForestRegressor

    # Build the model.
    RF_Model = RandomForestRegressor(n_estimators=100, max_features=1)

    # Fit the model with X_train and y_train.
    rgr = RF_Model.fit(X_train, y_train)

    y_train_predict = (rgr.predict(X_train)).astype(int)
    y_test_predict = (rgr.predict(X_test)).astype(int)

    return y_train_predict, y_test_predict, rgr