def train_mimic(self, training_data, mimic_env, save_model_dir, log_file):
    self.model = DecisionTreeRegressor(max_leaf_nodes=self.max_leaf_nodes,
                                       criterion=self.criterion,
                                       splitter=self.mode)
    self.model.fit(training_data[0], training_data[1])
    # self.print_tree()
    # In a binary CART tree, n_leaves = (node_count + 1) / 2; use integer division.
    leaves_number = (self.model.tree_.node_count + 1) // 2
    print("Leaves number is {0}".format(leaves_number))

    # Group sample indices by predicted value (one group per distinct leaf value).
    predict_dictionary = {}
    predictions = self.model.predict(training_data[0])
    for predict_index in range(len(predictions)):
        predict_value = predictions[predict_index]
        if predict_value in predict_dictionary.keys():
            predict_dictionary[predict_value].append(predict_index)
        else:
            predict_dictionary.update({predict_value: [predict_index]})

    return_value_log = mimic_env.get_return(state=list(predict_dictionary.values()))
    return_value_log_struct = mimic_env.get_return(state=list(predict_dictionary.values()),
                                                   apply_structure_cost=True)
    return_value_var_reduction = mimic_env.get_return(state=list(predict_dictionary.values()),
                                                      apply_variance_reduction=True)
    mae, rmse = compute_regression_results(predictions=predictions, labels=training_data[1])
    # print("Training return:{0} with mae:{1} and rmse:{2}".format(return_value, mae, rmse), file=log_file)

    with open(save_model_dir, 'wb') as f:
        pickle.dump(obj=self.model, file=f)

    return return_value_log, return_value_log_struct, \
        return_value_var_reduction, mae, rmse, leaves_number
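# A minimal standalone check of the leaf-count identity used above, assuming
# scikit-learn >= 0.21 (which exposes get_n_leaves()); the toy data is
# illustrative only, not part of the original snippet.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

X_toy = np.random.rand(100, 3)
y_toy = np.random.rand(100)
toy_tree = DecisionTreeRegressor(max_leaf_nodes=8).fit(X_toy, y_toy)
# Every internal node of a CART tree has exactly two children, so
# node_count == 2 * n_leaves - 1.
assert (toy_tree.tree_.node_count + 1) // 2 == toy_tree.get_n_leaves()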
def fit(self, X, y, sample_weight=None):
    X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    sample_weight = normalize_weight(y, sample_weight, sig_weight=self.sig_weight)
    self.random_state = check_random_state(self.random_state)
    self.estimators = []
    score = numpy.zeros(len(X), dtype=float)
    y_signed = 2 * y - 1
    self.w_sig = []
    self.w_bck = []
    for _ in range(self.n_estimators):
        residual = y_signed  # numpy.exp(- y_signed * score)
        # residual[y > 0.5] /= numpy.mean(residual[y > 0.5])
        # residual[y < 0.5] /= -numpy.mean(residual[y < 0.5])
        trainX, testX, trainY, testY, trainW, testW, trainR, testR, trainS, testS = \
            train_test_split(X, y, sample_weight, residual, score,
                             train_size=self.train_part, test_size=self.test_size,
                             random_state=self.random_state)
        tree = DecisionTreeRegressor(criterion=self.criterion, splitter=self.splitter,
                                     max_depth=self.max_depth,
                                     min_samples_leaf=self.min_samples_leaf,
                                     max_features=self.max_features,
                                     random_state=self.random_state)
        # fitting
        tree.fit(trainX, trainR, sample_weight=trainW, check_input=False)
        # post-pruning
        self.update_terminal_regions(tree.tree_, testX, testY, testW, testS)
        # updating score
        # score += self.learning_rate * tree.predict(X)
        self.estimators.append(tree)
class DecisionTreeRegressorImpl():

    def __init__(self, criterion='mse', splitter='best', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features=None,
                 random_state=None, max_leaf_nodes=None,
                 min_impurity_decrease=0.0, min_impurity_split=None,
                 presort=False):
        self._hyperparams = {
            'criterion': criterion,
            'splitter': splitter,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'random_state': random_state,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'presort': presort}
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
def fit(self, X, y, sample_weight=None):
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    assert len(X) == len(y), 'Different lengths of X and y'
    X = pandas.DataFrame(X)
    y = numpy.array(column_or_1d(y), dtype=int)
    assert numpy.all(numpy.in1d(y, [0, 1])), 'Only two-class classification supported'
    self.check_params()

    self.estimators = []
    self.scores = []

    n_samples = len(X)
    n_inbag = int(self.subsample * len(X))
    self.loss = copy.copy(self.loss)
    self.loss.fit(X, y, sample_weight=sample_weight)

    # preparing for fitting in trees
    X = self.get_train_vars(X)
    self.n_features = X.shape[1]
    X, y = check_arrays(X, y)
    X = X.astype(DTYPE)
    y_pred = numpy.zeros(len(X), dtype=float)

    if self.init_estimator is not None:
        y_signed = 2 * y - 1
        self.init_estimator.fit(X, y_signed, sample_weight=sample_weight)
        y_pred += numpy.ravel(self.init_estimator.predict(X))

    for stage in range(self.n_estimators):
        # tree creation
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            max_features=self.max_features,
            random_state=self.random_state,
            max_leaf_nodes=self.max_leaf_nodes)

        # tree learning
        residual = self.loss.negative_gradient(y_pred)
        train_indices = self.random_state.choice(n_samples, size=n_inbag, replace=False)

        tree.fit(X[train_indices], residual[train_indices],
                 sample_weight=sample_weight[train_indices], check_input=False)

        # update tree leaves
        if self.update_tree:
            self.loss.update_tree(tree.tree_, X=X, y=y, y_pred=y_pred,
                                  sample_weight=sample_weight,
                                  update_mask=numpy.ones(len(X), dtype=bool),
                                  residual=residual)

        y_pred += self.learning_rate * tree.predict(X)
        self.estimators.append(tree)
        self.scores.append(self.loss(y_pred))
    return self
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is not None:
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
def getModels():
    models = {}
    models['dt'] = DecisionTreeRegressor(max_depth=50)
    models['rf1'] = RandomForestRegressor()
    models['rf2'] = RandomForestRegressor(n_estimators=128, max_depth=15)
    models['gbr'] = GradientBoostingRegressor(n_estimators=128, max_depth=5, learning_rate=1.0)
    # models['abr'] = AdaBoostRegressor(n_estimators=128)
    return models
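# A minimal sketch of consuming the models dictionary above with
# cross-validation; the toy X and y arrays and the scoring choice are
# assumptions, not part of the original snippet.
import numpy as np
from sklearn.model_selection import cross_val_score

X = np.random.rand(200, 4)
y = np.random.rand(200)
for name, model in getModels().items():
    scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    print(name, np.sqrt(-scores.mean()))  # report RMSE per model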
def fit_stage(self, i, X, y):
    """Fit another stage of ``n_classes_`` trees to the boosting model."""
    # induce regression tree on residuals
    tree = DecisionTreeRegressor(
        criterion='friedman_mse',
        splitter='best',
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=0.,
        max_features=None,
        max_leaf_nodes=None,
        random_state=self.random_state,
        presort=False)
    tree.fit(X, y, check_input=False, X_idx_sorted=None)

    # add tree to ensemble
    self.estimators[i, 0] = tree
    self.n_estimated = i + 1
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data,
                                                               all_features, "target")
        if "depth" in parameters:
            model = DecisionTreeRegressor(max_depth=parameters["depth"], random_state=42)
        elif "leaf" in parameters:
            model = DecisionTreeRegressor(min_samples_leaf=parameters["leaf"], random_state=42)
        elif "max_leaf" in parameters:
            model = DecisionTreeRegressor(max_leaf_nodes=parameters["max_leaf"], random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
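# A hypothetical parameter sweep driving evalOne above; the depth values are
# an assumption for illustration, not from the original experiment.
for depth in [2, 4, 8, 16, 32]:
    rmse = evalOne({"depth": depth})
    print("max_depth={0} -> RMSE={1}".format(depth, rmse))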
def sklearn_titanic_regression():
    # sklearn.tree.tree and sklearn.preprocessing.label were private modules
    # (removed in scikit-learn 0.24); import from the public packages instead.
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.preprocessing import LabelEncoder
    import numpy as np
    import pandas as pd

    total_df = pd.read_csv("titanic_clean.csv")
    total_df.drop(['cabin', 'boat', 'body', 'index'], axis=1, inplace=True)
    total_df.dropna(inplace=True)
    for col in total_df.columns.tolist():
        if str(total_df[col].dtype) == 'object':
            total_df[col] = LabelEncoder().fit_transform(total_df[col])

    total_num = total_df.shape[0]
    train_df = total_df.iloc[:int(total_num * 0.8)]
    test_df = total_df.iloc[int(total_num * 0.8):]

    clf = DecisionTreeRegressor()
    clf.fit(train_df.drop(['fare'], axis=1), train_df['fare'])
    pred = clf.predict(test_df.drop(['fare'], axis=1))
    truth = test_df['fare']
    mse = np.sum(np.square(pred - truth)) / test_df.shape[0]
    print(mse)
def addBoostIteration(self):
    rv = self.regressionValues()
    trees = []
    mask = numpy.array([True] * self.nF)
    for i in range(0, self.nF):
        # leave feature i out, then predict its regression value from the rest
        mask[:] = True
        mask[i] = False
        tree = DecisionTreeRegressor(max_depth=self.max_depth)
        tree.fit(self.data[:, mask], rv[:, i])
        # newpsis[:, i] = tree.predict(self.data[:, mask])
        trees.append(tree)
    self.trees.append(trees)
def set_params_dict(self, learner_params):
    if self.method == 'classification':
        self.learner = ensemble.AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(
                max_depth=learner_params['base_estimator__max_depth'],
                max_features=learner_params['base_estimator__max_features']),
            n_estimators=int(learner_params['n_estimators']),
            learning_rate=learner_params['learning_rate'])
    elif self.method == 'regression':
        self.learner = ensemble.AdaBoostRegressor(
            base_estimator=DecisionTreeRegressor(
                max_depth=learner_params['base_estimator__max_depth'],
                max_features=learner_params['base_estimator__max_features']),
            n_estimators=int(learner_params['n_estimators']),
            learning_rate=learner_params['learning_rate'])
def set_params_list(self, learner_params, i):
    m_rf_size = int(learner_params[0])
    m_learn_rate = learner_params[1]
    m_dep = int(learner_params[2])
    m_feat = learner_params[3]
    if self.method == 'classification':
        self.learner = ensemble.AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=m_dep, max_features=m_feat),
            n_estimators=int(m_rf_size),
            learning_rate=m_learn_rate)
    elif self.method == 'regression':
        self.learner = ensemble.AdaBoostRegressor(
            base_estimator=DecisionTreeRegressor(max_depth=m_dep, max_features=m_feat),
            n_estimators=int(m_rf_size),
            learning_rate=m_learn_rate)
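# Hypothetical usage of the two setters above: the same AdaBoost configuration
# expressed once as a dict and once as a positional list. The `learner` object
# is assumed; the list order follows the unpacking in set_params_list
# (n_estimators, learning_rate, max_depth, max_features).
learner.set_params_dict({'base_estimator__max_depth': 4,
                         'base_estimator__max_features': 0.5,
                         'n_estimators': 100,
                         'learning_rate': 0.1})
learner.set_params_list([100, 0.1, 4, 0.5], 0)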
def _hi_level_investigation(data):
    '''Perform high-level investigation.'''
    transformers = [
        transformer.OneHotTransformer(nucl=False),
        transformer.AminoAcidTransformer()
    ]
    estimators = [
        LinearRegression(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
        ExtraTreesRegressor(),
        GradientBoostingRegressor(),
        SVR(kernel='poly')
    ]
    cv = 10
    for trnsfrmr, estimator in itertools.product(transformers, estimators):
        encoded = trnsfrmr.transform(data)
        X, y = encoded[:, 2:], encoded[:, 1]
        X = StandardScaler().fit_transform(X)
        scores = cross_val_score(estimator, X, y,
                                 scoring='neg_mean_squared_error',
                                 cv=cv, verbose=False)
        scores = np.sqrt(-scores)
        print('\t'.join([
            trnsfrmr.__class__.__name__,
            estimator.__class__.__name__,
            str((scores.mean(), scores.std()))
        ]))
    print()
def fit(self, X, y, sample_weight=None):
    shuffler = Shuffler(X, random_state=self.random_state)
    X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
    y = column_or_1d(y, warn=True)
    n_samples = len(X)
    n_inbag = int(self.subsample * n_samples)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
    self.random_state = check_random_state(self.random_state)

    # skipping all checks
    assert self.update_on in ['all', 'same', 'other', 'random']
    y_pred = numpy.zeros(len(y), dtype=float)

    self.classifiers = []
    self.learning_rates = []
    self.loss_values = []
    self.loss = copy.copy(self.loss)
    self.loss.fit(X, y, sample_weight=sample_weight)
    iter_X = shuffler.generate(0.)

    prev_smearing = 1
    for iteration in range(self.n_estimators):
        if iteration % self.recount_step == 0:
            if prev_smearing > 0:
                iter_smearing = interpolate(self.smearing, iteration, self.n_estimators)
                prev_smearing = iter_smearing
                iter_X = shuffler.generate(iter_smearing)
                iter_X, = check_arrays(iter_X, dtype=DTYPE, sparse_format="dense",
                                       check_ccontiguous=True)
                y_pred = numpy.zeros(len(y))
                y_pred += sum(cl.predict(X) * rate
                              for rate, cl in zip(self.learning_rates, self.classifiers))

        self.loss_values.append(self.loss(y, y_pred, sample_weight=sample_weight))
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=interpolate(self.max_depth, iteration, self.n_estimators),
            min_samples_split=self.min_samples_split,
            min_samples_leaf=interpolate(self.min_samples_leaf, iteration,
                                         self.n_estimators, use_log=True),
            max_features=self.max_features,
            random_state=self.random_state)

        sample_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
        loss_weight = sample_weight if self.weights_in_loss else numpy.ones(len(sample_weight))
        tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(len(sample_weight))
        residual = self.loss.negative_gradient(y, y_pred, sample_weight=loss_weight)

        tree.fit(numpy.array(iter_X)[sample_mask, :], residual[sample_mask],
                 sample_weight=tree_weight[sample_mask], check_input=False)

        # update tree leaves
        if self.update_tree:
            if self.update_on == 'all':
                update_mask = numpy.ones(len(sample_mask), dtype=bool)
            elif self.update_on == 'same':
                update_mask = sample_mask
            elif self.update_on == 'other':
                update_mask = ~sample_mask
            else:  # random
                update_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
            self.loss.update_terminal_regions(tree.tree_, X=iter_X, y=y, residual=residual,
                                              pred=y_pred, sample_mask=update_mask,
                                              sample_weight=sample_weight)

        iter_learning_rate = interpolate(self.learning_rate, iteration,
                                         self.n_estimators, use_log=True)
        y_pred += iter_learning_rate * tree.predict(X)
        self.classifiers.append(tree)
        self.learning_rates.append(iter_learning_rate)

    return self
("mapper", mapper), ("selector", SelectUnique()), ("regressor", regressor) ]) pipeline.fit(auto_X, auto_y) pipeline.configure(**pmml_options) if isinstance(regressor, XGBRegressor): pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5) else: pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13)) store_pkl(pipeline, name) mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"]) store_csv(mpg, name) if "Auto" in datasets: build_auto(AdaBoostRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 17), "AdaBoostAuto") build_auto(ARDRegression(normalize = True), "BayesianARDAuto") build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto") build_auto(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 2), "DecisionTreeAuto", compact = False) build_auto(BaggingRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 3, max_features = 0.5), "DecisionTreeEnsembleAuto") build_auto(DummyRegressor(strategy = "median"), "DummyAuto") build_auto(ElasticNetCV(random_state = 13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(random_state = 13, min_samples_leaf = 5), "ExtraTreesAuto") build_auto(GradientBoostingRegressor(random_state = 13, init = None), "GradientBoostingAuto") build_auto(HuberRegressor(), "HuberAuto") build_auto(LarsCV(), "LarsAuto") build_auto(LassoCV(random_state = 13), "LassoAuto") build_auto(LassoLarsCV(), "LassoLarsAuto") build_auto(OptimalLGBMRegressor(objective = "regression", n_estimators = 17, num_iteration = 11), "LGBMAuto", num_iteration = 11) build_auto(LinearRegression(), "LinearRegressionAuto") build_auto(BaggingRegressor(LinearRegression(), random_state = 13, max_features = 0.75), "LinearRegressionEnsembleAuto")
class FinalEnsembleModel(HCDRDataScorer):

    INTERRIM_MODELS = {'app': 'app_model.pkl', 'bureau': 'bureau_model.pkl',
                       'ccb': 'ccb_model.pkl', 'pcb': 'pcb_model.pkl',
                       'pa': 'prev_app_model.pkl', 'ip': 'ins_pmt_model.pkl'}
    # INTERRIM_MODELS = {'bureau': 'bureau_model.pkl'}
    score_type = 'train'

    def __init__(self, path_to_data_store):
        self.path_to_data_store = path_to_data_store
        app_data = pd.read_csv(path_to_data_store + '/application_train.csv')
        print('Training Set Size', app_data.shape[0])
        y_train = app_data['TARGET']
        x_train = pd.DataFrame(self.createEnsembleData(app_data[['SK_ID_CURR']]), dtype='float64')
        print('Training Set Size After Merging', x_train.shape[0])
        x_train.to_csv('ensemble_data.csv')
        # self.__regressor__(x_train, y_train)
        self.__sv_regressor__(x_train, y_train)

    def __regressor__(self, X_train, Y_train):
        self.ensemble = DecisionTreeRegressor(random_state=56)
        self.ensemble.fit(X_train, Y_train)
        print('Ensemble Model Ready')

    def __sv_regressor__(self, data, target):
        # sklearn.svm.classes was a private module (removed in scikit-learn 0.24)
        from sklearn.svm import SVR
        svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
        svr_rbf.fit(data, target)
        self.ensemble = svr_rbf

    def __random_forest_regressor__(self, data, target):
        from sklearn.model_selection import RandomizedSearchCV
        from scipy.stats import randint
        param_distribs = {
            'n_estimators': randint(low=1, high=200),
            'max_features': randint(low=5, high=8),
        }
        forest_reg = RandomForestRegressor(random_state=42)
        rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                        n_jobs=4, n_iter=20, cv=10,
                                        scoring='neg_mean_squared_error', random_state=42)
        # sampling_data = data.drop(['SK_ID_CURR'], axis=1)
        rnd_search.fit(data, target)
        # cvres = rnd_search.cv_results_
        # for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        #     print(np.sqrt(-mean_score), params)
        self.ensemble = rnd_search.best_estimator_
        print('Ensemble Model Ready')

    def __curate__(self, data):
        data = data.mask(np.isinf(data))
        multi_col_na_replace_with_zero = MultiColumnFillNAWithNumericValue(data.columns, 0)
        data = multi_col_na_replace_with_zero.transform(data)
        return data

    def setScoreType(self, score_type):
        self.score_type = score_type

    def mean_absolute_percentage_error(self, y_true, y_pred):
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    def score(self):
        test_target = None
        test_data = pd.read_csv(self.path_to_data_store + '/application_test.csv')
        sk_id_curr = test_data[['SK_ID_CURR']]
        if self.score_type == 'actual':
            test_data = self.createEnsembleData(test_data[['SK_ID_CURR']])
        else:
            test_target = test_data['TARGET']
            test_data = self.createEnsembleData(test_data[['SK_ID_CURR']])
        score = self.ensemble.predict(test_data)
        if test_target is not None:
            print(mean_squared_error(test_target.values, score))
            output = self.format_output(sk_id_curr, score, test_target)
            output.to_csv('submission.csv', index=False)
        else:
            output = self.format_output(sk_id_curr, score)
            output.to_csv('submission.csv', index=False)

    def format_output(self, sk_id_curr, preds, target=None):
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler(feature_range=(0, 1))

        def ignoreNeg(x):
            if x < 0:
                return 0
            else:
                return x

        if target is not None:
            data = pd.DataFrame()
            data['SK_ID_CURR'] = sk_id_curr
            # reshape to a column vector; scaling a single-row list would
            # collapse every prediction to the same value
            data['PREDS'] = scaler.fit_transform(np.array(preds).reshape(-1, 1))
            data['TARGET'] = target
        else:
            data = pd.DataFrame()
            data['SK_ID_CURR'] = sk_id_curr
            data['TARGET'] = preds
            data['TARGET'] = scaler.fit_transform(data[['TARGET']])
        return data

    def createEnsembleData(self, app_data):
        dataset = None
        curr_sk_ids = app_data['SK_ID_CURR'].values
        app_data = None
        for typ, model_name in self.INTERRIM_MODELS.items():
            model = util.load('models/' + model_name)
            if typ == 'app':
                score_map = model.score(curr_sk_ids, self.score_type)
                dataset = pd.DataFrame.from_dict(score_map, orient='index').reset_index()
                dataset.columns = ['SK_ID_CURR', 'DOC', 'REALTY', 'APP']
                print('Application data ready .. ')
            elif typ == 'bureau':
                score_map = model.score(curr_sk_ids)
                dataset = pd.DataFrame.from_dict(score_map, orient='index').reset_index()
                dataset.columns = ['SK_ID_CURR', 'BUREAU']
                print('Bureau data ready .. ')
            elif typ == 'ccb':
                score_map = model.score(curr_sk_ids)
                dataset = pd.DataFrame.from_dict(score_map, orient='index').reset_index()
                dataset.columns = ['SK_ID_CURR', 'CCB']
                print('CCB data ready .. ')
            elif typ == 'pcb':
                score_map = model.score(curr_sk_ids)
                dataset = pd.DataFrame.from_dict(score_map, orient='index').reset_index()
                dataset.columns = ['SK_ID_CURR', 'PCB']
                print('PCB data ready .. ')
            elif typ == 'pa':
                score_map = model.score(curr_sk_ids)
                dataset = pd.DataFrame.from_dict(score_map, orient='index').reset_index()
                dataset.columns = ['SK_ID_CURR', 'PREV_APP']
                print('PA data ready .. ')
            elif typ == 'ip':
                score_map = model.score(curr_sk_ids)
                dataset = pd.DataFrame.from_dict(score_map, orient='index').reset_index()
                dataset.columns = ['SK_ID_CURR', 'INS_PMT']
                print('IP data ready .. ')
            if app_data is None:
                app_data = dataset
            else:
                app_data = pd.merge(app_data, dataset, how='right', on=['SK_ID_CURR'])
        return self.__curate__(app_data.drop(['SK_ID_CURR'], axis=1))
    pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                        precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")

if "Auto" in datasets:
    build_auto(
        AdaBoostRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
                          random_state=13, n_estimators=17), "AdaBoostAuto")
    build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
    build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
    build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2),
               "DecisionTreeAuto", compact=False)
    build_auto(
        BaggingRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
                         random_state=13, n_estimators=3, max_features=0.5),
        "DecisionTreeEnsembleAuto")
    build_auto(DummyRegressor(strategy="median"), "DummyAuto")
    build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
data['Sex'] = label_encoder.transform(data['Sex'])

enc = LabelEncoder()
label_encoder = enc.fit(data[pd.notnull(data['Floor'])]['Floor'].values)
transformed = label_encoder.transform(data[pd.notnull(data['Floor'])]['Floor'].values)
indexes = pd.notnull(data.Floor)
data.loc[indexes, 'Floor'] = transformed

enc = LabelEncoder()
label_encoder = enc.fit(data['Embarked'])
data['Embarked'] = label_encoder.transform(data['Embarked'])

## age prediction
# TODO: also try predicting from Title alone
regresor = DecisionTreeRegressor()
X_train_age = data[pd.notnull(data.Age)][['Title', 'SibSp', 'Parch']]
y_train_age = data[pd.notnull(data.Age)][['Age']]
regresor.fit(X_train_age, y_train_age)
# TODO: verify this age prediction; it seems to work OK
# data['AgePredicted'] = np.where(pd.isnull(data.Age), regresor.predict(data[['Title', 'SibSp', 'Parch']]), None)
data['Age'] = np.where(pd.isnull(data.Age),
                       regresor.predict(data[['Title', 'SibSp', 'Parch']]),
                       data['Age'])

## floor prediction
classifier = DecisionTreeClassifier(max_depth=3, min_samples_leaf=2)
# X_train_floor = data[pd.notnull(data.Floor)][['Embarked', 'Pclass']]
@author: TF
'''
import matplotlib.pyplot as plt
import numpy as np
from numpy import mat
# sklearn.tree.tree was a private module (removed in scikit-learn 0.24)
from sklearn.tree import DecisionTreeRegressor


def plotfigure(X, X_test, y, yp):
    plt.figure()
    plt.scatter(X, y, c='k', label='data')
    plt.plot(X_test, yp, c='r', label='max_depth = 3', linewidth=2)
    plt.xlabel('data')
    plt.ylabel('target')
    plt.title('Decision Tree Regression')
    plt.legend()
    plt.show()


x = np.linspace(-5, 5, 200)
siny = np.sin(x)
X = mat(x).T
y = siny + np.random.rand(1, len(siny)) * 1.5
y = y.tolist()[0]

clf = DecisionTreeRegressor(max_depth=3)
clf.fit(X, y)

X_test = np.arange(-5.0, 5.0, 0.05)[:, np.newaxis]
yp = clf.predict(X_test)
plotfigure(X, X_test, y, yp)
])

from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
# Use transform (not fit_transform) on the test set so it is encoded with the
# statistics learned from the training data.
housing_test_prepared = full_pipeline.transform(housing_test)

model_maps = dict()
model_maps["Linear_Regression"] = LinearRegression()
model_maps["Logistic_Regression"] = LogisticRegression(random_state=42, n_jobs=-1)
model_maps["DecisionTreeRegressor"] = DecisionTreeRegressor(random_state=42)
model_maps["RandomForestRegressor"] = RandomForestRegressor(random_state=42, n_jobs=-1)
model_maps["SupportVectorRegressor"] = SVR(kernel="linear")

results = pd.DataFrame(columns=["Hardware", "ExpID", "RMSETrainCF", "RMSETest",
                                "MAPETrainCF", "MAPETest", "p-value",
                                "TrainTime(s)", "TestTime(s)", "Experiment description"])
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)


def trainStep(algo, indx, name):
    print("starting " + str(name) + " training")
    results.loc[indx] = ["Corei3/8GB", indx + 1, 0, 0, 0, 0, 0, 0, 0, "Training " + str(name)]
    start_time = time.time()
    algo.fit(housing_prepared, housing_labels)
    results.loc[indx, "TrainTime(s)"] = time.time() - start_time
    print("ends " + str(name) + " training")
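# A hypothetical driver loop for trainStep above, timing a few of the
# regression models from model_maps; the selection of names is illustrative.
for indx, name in enumerate(["Linear_Regression", "DecisionTreeRegressor",
                             "RandomForestRegressor"]):
    trainStep(model_maps[name], indx, name)
print(results[["ExpID", "TrainTime(s)"]])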
'AffinityPropagation': AffinityPropagation(),
'AgglomerativeClustering': AgglomerativeClustering(),
'BaggingClassifier': BaggingClassifier(),
'BaggingRegressor': BaggingRegressor(),
'BayesianGaussianMixture': BayesianGaussianMixture(),
'BayesianRidge': BayesianRidge(),
'BernoulliNB': BernoulliNB(),
'BernoulliRBM': BernoulliRBM(),
'Binarizer': Binarizer(),
'Birch': Birch(),
'CCA': CCA(),
'CalibratedClassifierCV': CalibratedClassifierCV(),
'DBSCAN': DBSCAN(),
'DPGMM': DPGMM(),
'DecisionTreeClassifier': DecisionTreeClassifier(),
'DecisionTreeRegressor': DecisionTreeRegressor(),
'DictionaryLearning': DictionaryLearning(),
'ElasticNet': ElasticNet(),
'ElasticNetCV': ElasticNetCV(),
'EmpiricalCovariance': EmpiricalCovariance(),
'ExtraTreeClassifier': ExtraTreeClassifier(),
'ExtraTreeRegressor': ExtraTreeRegressor(),
'ExtraTreesClassifier': ExtraTreesClassifier(),
'ExtraTreesRegressor': ExtraTreesRegressor(),
'FactorAnalysis': FactorAnalysis(),
'FastICA': FastICA(),
'FeatureAgglomeration': FeatureAgglomeration(),
'FunctionTransformer': FunctionTransformer(),
'GMM': GMM(),
'GaussianMixture': GaussianMixture(),
'GaussianNB': GaussianNB(),
X, y = shuffle(boston.data, boston.target)
x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]

predictions = []
predictions2 = []
predictions3 = []
predictions4 = []
offset = int(0.7 * len(X))

for i in range(10):
    X, y = shuffle(boston.data, boston.target)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    regressor = GradientBoostingRegressor(max_depth=20, n_estimators=140)
    regressor2 = DecisionTreeRegressor(max_depth=6)
    regressor3 = LinearRegression()
    regressor4 = RandomForestRegressor()

    regressor.fit(X_train, y_train)
    regressor2.fit(X_train, y_train)
    regressor3.fit(X_train, y_train)
    regressor4.fit(X_train, y_train)

    # predict expects a 2-D array: one row per sample
    y_pred = regressor.predict([x])
    y_pred2 = regressor2.predict([x])
    y_pred3 = regressor3.predict([x])
    y_pred4 = regressor4.predict([x])

    predictions.append(y_pred)
    predictions2.append(y_pred2)
    predictions3.append(y_pred3)
    predictions4.append(y_pred4)

    print("\nPrediction = " + str(y_pred))
    # HistGradientBoostingClassifier(random_state=randomstate),
    # learning_rate is a hyper-parameter in the range (0.0, 1.0]
    AdaBoostRegressor(n_estimators=200, random_state=randomstate),
    GaussianProcessRegressor(normalize_y=True),
    ARDRegression(),
    # HuberRegressor(),  # epsilon: greater than 1.0, default 1.35
    LinearRegression(n_jobs=5),
    PassiveAggressiveRegressor(random_state=randomstate),  # C: 0.25, 0.5, 1, 5, 10
    SGDRegressor(random_state=randomstate),
    TheilSenRegressor(n_jobs=5, random_state=randomstate),
    RANSACRegressor(random_state=randomstate),
    KNeighborsRegressor(weights='distance'),  # n_neighbors: 3, 6, 9, 12, 15, 20
    RadiusNeighborsRegressor(weights='distance'),  # radius: 1, 2, 5, 10, 15
    MLPRegressor(max_iter=10000000, random_state=randomstate),
    DecisionTreeRegressor(random_state=randomstate),  # max_depth = 2, 3, 4, 6, 8
    ExtraTreeRegressor(random_state=randomstate),  # max_depth = 2, 3, 4, 6, 8
    SVR()  # C: 0.25, 0.5, 1, 5, 10
]

selectors = [
    reliefF.reliefF,
    fisher_score.fisher_score,
    # chi_square.chi_square,
    JMI.jmi,
    CIFE.cife,
    DISR.disr,
    MIM.mim,
    CMIM.cmim,
    ICAP.icap,
    MRMR.mrmr,
from sympy.core.numbers import RealNumber
from sympy.functions.elementary.piecewise import Piecewise
from sympy.core.symbol import Symbol
import numpy as np
import pandas
from nose.tools import assert_almost_equal
from sklearn.tree import DecisionTreeRegressor

# Create some data
m = 10000
X = np.random.normal(size=(m, 10))
thresh = np.random.normal(size=10)
X_transformed = X * (X > thresh)
beta = np.random.normal(size=10)
y = np.dot(X_transformed, beta) + np.random.normal(size=m)

# Train a decision tree regressor
model = DecisionTreeRegressor()
model.fit(X, y)
print(model.score(X, y))


# Inspect
def _sym_predict_decision_tree(model, names, current_node=0, output_idx=0, class_idx=0):
    left = model.tree_.children_left[current_node]
    right = model.tree_.children_right[current_node]
    if left == -1:
        assert right == -1
        left_expr = RealNumber(model.tree_.value[current_node, output_idx,
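# The function above is cut off mid-expression. Below is an independent,
# minimal sketch of the same idea (not the original author's code): walking
# the fitted tree_ arrays to build a nested sympy Piecewise. children_left,
# children_right, feature, threshold, and value are sklearn's public Tree
# fields; leaves have both child pointers set to -1.
def sym_predict_sketch(model, names, node=0):
    tree = model.tree_
    left, right = tree.children_left[node], tree.children_right[node]
    if left == -1:  # leaf: return its constant prediction
        return RealNumber(tree.value[node, 0, 0])
    test = Symbol(names[tree.feature[node]]) <= RealNumber(tree.threshold[node])
    return Piecewise((sym_predict_sketch(model, names, left), test),
                     (sym_predict_sketch(model, names, right), True))

# e.g. expr = sym_predict_sketch(model, ['x%d' % i for i in range(10)])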
class Model(object):
    """
    The machine learning component of the tester.

    This component stores four different models:
    1) A model to decide between different types of events (drags and touches).
    2) A model to decide on the starting position for drags.
    3) A model to decide on the ending position for drags.
    4) A model to decide on the position of the touch.

    The input data are all the different known UI elements on the screen from
    the training data and whether or not they are visible on the screen. To
    acquire this, we first get the stored XML model and record the resource-id
    and class. We concatenate them into an array and mark them as (1) for
    visible and (0) for not visible.
    """

    def __init__(self):
        self.symbols = {}
        self.action_data = None
        self.action_labels = None
        self.action_classifier = None
        self.drag_data = None
        self.drag_end_labels = None
        self.drag_end_classifier = None
        self.drag_start_labels = None
        self.drag_start_classifier = None
        self.touch_data = None
        self.touch_labels = None
        self.touch_classifier = None
        self.device_info = device.info

    def parse_events(self, queue):
        symbols = {"randomizer": 0}
        events = []
        all_data = []
        all_results = []
        drag_data = []
        drag_start_results = []
        drag_end_results = []
        touch_data = []
        touch_results = []
        while not queue.empty():
            event = queue.get()
            events.append(event)
            lst = event.state.start.as_list(symbols)
            lst[0] = random()
            all_data.append(lst)
            if event.action.is_drag():
                drag_data.append(lst)
                all_results.append(DRAG)
                start = event.changes.start()
                end = event.changes.end()
                drag_start_results.append(start.x * start.y)
                drag_end_results.append(end.x * end.y)
            if event.action.is_touch():
                touch_data.append(lst)
                all_results.append(TOUCH)
                start = event.changes.start()
                touch_results.append(start.x * start.y)
            if event.action.is_back():
                all_results.append(BACK)

        data = np.zeros((len(all_data), len(symbols)))
        for i, item in enumerate(all_data):
            data[i, :len(item)] = item[:]
        drags = np.zeros((len(drag_data), len(symbols)))
        for i, item in enumerate(drag_data):
            drags[i, :len(item)] = item[:]
        touches = np.zeros((len(touch_data), len(symbols)))
        for i, item in enumerate(touch_data):
            touches[i, :len(item)] = item[:]

        self.symbols = symbols
        self.action_data = data
        self.action_labels = np.array(all_results)
        self.drag_data = drags
        self.drag_start_labels = np.array(drag_start_results)
        self.drag_end_labels = np.array(drag_end_results)
        self.touch_data = touches
        self.touch_labels = np.array(touch_results)

    def train(self):
        self.action_classifier = DecisionTreeClassifier()
        self.action_classifier.fit(self.action_data, self.action_labels)
        self.drag_start_classifier = DecisionTreeRegressor()
        self.drag_start_classifier.fit(self.drag_data, self.drag_start_labels)
        self.drag_end_classifier = DecisionTreeRegressor()
        self.drag_end_classifier.fit(self.drag_data, self.drag_end_labels)
        self.touch_classifier = DecisionTreeRegressor()
        self.touch_classifier.fit(self.touch_data, self.touch_labels)

    def predict(self, state):
        input = state.as_list(self.symbols, False)
        input[0] = random()
        action = Action()
        # predict expects a 2-D array: one row per sample
        type = self.action_classifier.predict([input])[0]
        width = self.device_info["displayWidth"]
        if type == DRAG:
            # positions are stored as a single scalar (x * y); recover x and y
            # from the screen width
            start = self.drag_start_classifier.predict([input])[0]
            end = self.drag_end_classifier.predict([input])[0]
            start = Point(start % width, start / width)
            end = Point(end % width, end / width)
            action.init(ACTION_DRAG, start, end, 0.5)
        elif type == TOUCH:
            point = self.touch_classifier.predict([input])[0]
            point = Point(point % width, point / width)
            action.init(ACTION_TOUCH, point.x, point.y)
        elif type == BACK:
            action.init(ACTION_BACK)
        return action

    def save(self):
        pass
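# A hypothetical end-to-end use of Model, assuming an event queue produced by
# the tester's recorder and a current UI state; the names queue and state
# follow the method signatures above.
model = Model()
model.parse_events(queue)      # build training matrices from recorded events
model.train()                  # fit the classifier and the three regressors
action = model.predict(state)  # propose the next UI action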
'elastic_net': {
    'max_iter': [5, 10, 15],
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'l1_ratio': np.arange(0.0, 1.0, 0.1)
},
'extra_trees': {
    "n_estimators": [80],
    'max_depth': [30],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [0.01, 0.05, 0.10],
    'min_samples_leaf': [0.005, 0.05, 0.10],
},
'bagging': {
    "base_estimator": [DecisionTreeRegressor(max_depth=8)],
    "n_estimators": [200],
    "max_features": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
},
'sgd': {
    "alpha": [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.25, 0.50, 0.75, 1.0],
    "penalty": ["l1", "l2"],
    "loss": ['squared_loss', 'huber', 'epsilon_insensitive',
             'squared_epsilon_insensitive']
},
'linear_svr': {
    "C": [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100],
    "loss": ['epsilon_insensitive', 'squared_epsilon_insensitive']
store_pkl(auto_mapper, "Auto.pkl")
auto_X = auto[:, 0:9]
auto_y = auto[:, 9]
print(auto_X.dtype, auto_y.dtype)


def build_auto(regressor, name):
    regressor = regressor.fit(auto_X, auto_y)
    store_pkl(regressor, name + ".pkl")
    mpg = DataFrame(regressor.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")


build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=5), "DecisionTreeAuto")
build_auto(
    BaggingRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
                     random_state=13, n_estimators=3, max_features=0.5),
    "DecisionTreeEnsembleAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5), "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None), "GradientBoostingAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
])
auto = auto_mapper.fit_transform(auto_df)
store_pkl(auto_mapper, "Auto.pkl")
auto_X = auto[:, 0:7]
auto_y = auto[:, 7]
print(auto_X.dtype, auto_y.dtype)


def predict_auto(regressor):
    mpg = DataFrame(regressor.predict(auto_X), columns=["mpg"])
    return mpg


auto_tree = DecisionTreeRegressor(random_state=13, min_samples_leaf=5)
auto_tree.fit(auto_X, auto_y)
store_pkl(auto_tree, "DecisionTreeAuto.pkl")
store_csv(predict_auto(auto_tree), "DecisionTreeAuto.csv")

auto_forest = RandomForestRegressor(random_state=13, min_samples_leaf=5)
auto_forest.fit(auto_X, auto_y)
store_pkl(auto_forest, "RandomForestAuto.pkl")
store_csv(predict_auto(auto_forest), "RandomForestAuto.csv")

auto_regression = LinearRegression()
auto_regression.fit(auto_X, auto_y)
store_pkl(auto_regression, "RegressionAuto.pkl")
def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
               random_state, X_idx_sorted, X_csc=None, X_csr=None):
    """Fit another stage of ``n_classes_`` trees to the boosting model."""
    assert sample_mask.dtype == bool

    loss = self.loss_
    original_y = y

    for k in range(loss.K):
        if loss.is_multi_class:
            y = np.array(original_y == k, dtype=np.float64)

        residual = loss.negative_gradient(y, y_pred, k=k, sample_weight=sample_weight)

        # induce regression tree on residuals
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter='best',
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            random_state=random_state,
            presort=self.presort)

        if self.subsample < 1.0:
            # no inplace multiplication!
            sample_weight = sample_weight * sample_mask.astype(np.float64)

        if X_csc is not None:
            tree.fit(X_csc, residual, sample_weight=sample_weight,
                     check_input=False, X_idx_sorted=X_idx_sorted)
        else:
            tree.fit(X, residual, sample_weight=sample_weight,
                     check_input=False, X_idx_sorted=X_idx_sorted)

        # update tree leaves (the sparse and dense paths perform the same update)
        if X_csr is not None:
            loss.update_terminal_regions(tree.tree_, X_csr, y, residual, y_pred,
                                         sample_weight, sample_mask, function(i), k=k)
        else:
            loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
                                         sample_weight, sample_mask, function(i), k=k)

        # add tree to ensemble
        self.estimators_[i, k] = tree

    return y_pred
    'clf__max_depth': range(5, 200, 10),
    'clf__min_samples_split': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'clf__min_samples_leaf': [0.2, 0.3, 0.4, 0.5, 1],
    'clf__max_features': ['auto', 'sqrt', 'log2', None],
    'clf__max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60]
},
'random_forest': {
    'clf__n_estimators': range(5, 200, 10),
    'clf__max_depth': range(5, 200, 10),
    'clf__min_samples_split': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'clf__min_samples_leaf': [0.2, 0.3, 0.4, 0.5, 1],
    'clf__max_features': ['auto', 'sqrt', 'log2', None],
    'clf__max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60]
},
'ada_boost': {
    'clf__base_estimator': [DecisionTreeRegressor(max_depth=ii) for ii in range(10, 110, 10)],
    'clf__n_estimators': range(50, 200, 10),
    'clf__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'clf__loss': ['linear', 'square', 'exponential'],
},
'gradient_boost': {
    'clf__loss': ['ls', 'lad', 'huber', 'quantile'],
    'clf__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'clf__n_estimators': range(100, 350, 10),
    'clf__max_depth': range(5, 200, 10),
    'clf__min_samples_split': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'clf__min_samples_leaf': [0.2, 0.3, 0.4, 0.5, 1],
    'clf__max_features': ['auto', 'sqrt', 'log2', None],
    'clf__max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60]
},
'cat_boost': {
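# The 'clf__' prefixes above imply these grids target a Pipeline whose final
# step is named 'clf'. A minimal sketch of consuming one grid, assuming the
# surrounding dict is named grids and toy X, y data; RandomizedSearchCV is
# used because the full Cartesian grid would be very large. Parameter names
# (base_estimator, the 'ls'/'lad' losses) follow the older scikit-learn API
# used in this fragment.
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scale', StandardScaler()), ('clf', AdaBoostRegressor())])
search = RandomizedSearchCV(pipe, param_distributions=grids['ada_boost'],
                            n_iter=20, cv=5, random_state=42)
# search.fit(X, y); search.best_params_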