import json
import os

from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, PredefinedSplit


def main(data_dir, log_dir, source='xl-1542M-k40', n_train=500000, n_valid=10000, n_jobs=None, verbose=False):
    train_texts, train_labels = load_split(data_dir, source, 'train', n=n_train)
    valid_texts, valid_labels = load_split(data_dir, source, 'valid', n=n_valid)
    test_texts, test_labels = load_split(data_dir, source, 'test')

    vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=2**21)
    train_features = vect.fit_transform(train_texts)
    valid_features = vect.transform(valid_texts)
    test_features = vect.transform(test_texts)

    model = LogisticRegression(solver='liblinear')
    params = {'C': [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4, 8, 16, 32, 64]}
    # -1 marks training rows, 0 marks validation rows: a single train/valid split.
    split = PredefinedSplit([-1] * n_train + [0] * n_valid)
    search = GridSearchCV(model, params, cv=split, n_jobs=n_jobs, verbose=verbose, refit=False)
    search.fit(sparse.vstack([train_features, valid_features]), train_labels + valid_labels)

    # Refit on the training set only, with the best C found on the validation set.
    model = model.set_params(**search.best_params_)
    model.fit(train_features, train_labels)
    valid_accuracy = model.score(valid_features, valid_labels) * 100.
    test_accuracy = model.score(test_features, test_labels) * 100.

    data = {
        'source': source,
        'n_train': n_train,
        'valid_accuracy': valid_accuracy,
        'test_accuracy': test_accuracy,
    }
    print(data)
    with open(os.path.join(log_dir, f'{source}.json'), 'w') as f:
        json.dump(data, f)

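# A minimal, self-contained sketch of the PredefinedSplit pattern used in
# main() above, on synthetic data (load_split() and the dataset are
# project-specific): -1 marks rows that are always in training, 0 marks the
# single validation fold.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, PredefinedSplit

rng = np.random.RandomState(0)
X = rng.randn(120, 5)
y = rng.randint(0, 2, 120)
fold = [-1] * 100 + [0] * 20  # first 100 rows train, last 20 validate
search = GridSearchCV(LogisticRegression(solver='liblinear'),
                      {'C': [0.1, 1, 10]},
                      cv=PredefinedSplit(fold), refit=False)
search.fit(X, y)
print(search.best_params_)
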
def predefined_train_test_split(data, labels, folds, workflow, label_encoder):
    folds = np.asarray(folds)
    # PredefinedSplit needs integer fold indices, so encode the fold labels first.
    fold_encoder = LabelEncoder()
    split_encoded = fold_encoder.fit_transform(folds)
    num_classes = len(label_encoder.classes_)
    performance = {
        'classes': label_encoder.classes_.tolist(),
        'intervals': {key: np.sum(folds == key) for key in sorted(set(folds))}
    }
    split = PredefinedSplit(split_encoded)
    for fold_index, (train_inds, test_inds) in enumerate(split.split()):
        train_x, train_y = [data[ii] for ii in train_inds], [labels[ii] for ii in train_inds]
        test_x, test_y = [data[ii] for ii in test_inds], [labels[ii] for ii in test_inds]
        # Class priors on the train and test partitions of this fold.
        prior_train = [0] * num_classes
        for yy in train_y:
            prior_train[yy] += 1
        prior_test = [0] * num_classes
        for yy in test_y:
            prior_test[yy] += 1
        clf = deepcopy(workflow)
        clf.fit(train_x, train_y)
        param_dict = {kk: vv.__dict__ for kk, vv in clf.named_steps.items()}  # .iteritems() is Python 2 only
        test_pred = clf.predict(test_x)
        test_ind = folds[test_inds[0]]
        performance[test_ind] = {
            'accuracy': metrics.accuracy_score(test_y, test_pred),
            'precision_micro': metrics.precision_score(test_y, test_pred, average='micro'),
            # was average='micro', which duplicated precision_micro
            'precision_macro': metrics.precision_score(test_y, test_pred, average='macro'),
            'recall_micro': metrics.recall_score(test_y, test_pred, average='micro'),
            'recall_macro': metrics.recall_score(test_y, test_pred, average='macro'),
            'f1_score_micro': metrics.f1_score(test_y, test_pred, average='micro'),
            'f1_score_macro': metrics.f1_score(test_y, test_pred, average='macro'),
            'confusion_matrix': metrics.confusion_matrix(test_y, test_pred).tolist(),
            'prior_train': prior_train,
            'prior_test': prior_test,
            'model': serialise_dict(param_dict)
        }
    return serialise_dict(performance)

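# PredefinedSplit expects integer fold indices, which is why the function
# above runs the fold labels through a LabelEncoder first. A small sketch of
# that encoding step on made-up week labels:
from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import LabelEncoder

folds = ['week1', 'week1', 'week2', 'week2', 'week3']
encoded = LabelEncoder().fit_transform(folds)  # -> [0, 0, 1, 1, 2]
split = PredefinedSplit(encoded)
print(split.get_n_splits())  # 3: each week is held out once
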
def Hyper_parameter_tuning(X, y, model_estimator, configs):
    n_train = len(X)
    # First 80% of rows train (-1), the rest form the validation fold (0).
    # Using the remainder for the validation part guarantees the fold vector
    # always has exactly n_train entries.
    n_t = int(0.8 * n_train)
    validation_fold_ = [-1] * n_t + [0] * (n_train - n_t)
    validation_fold = np.random.permutation(validation_fold_)

    # Use GridSearchCV to tune the hyper-parameters.
    ps = PredefinedSplit(validation_fold)
    clf = GridSearchCV(model_estimator, configs,
                       return_train_score=True,
                       cv=ps, refit=True, n_jobs=-1,
                       scoring=make_scorer(mean_squared_error, greater_is_better=False))
    clf.fit(X, y)
    return clf

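# Hypothetical usage of Hyper_parameter_tuning() with a plain sklearn Ridge
# regressor; the estimator, grid, and data below are illustrative, not from
# the original code.
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X = rng.randn(200, 4)
y = X @ np.array([1.0, -2.0, 0.5, 0.0]) + 0.1 * rng.randn(200)
search = Hyper_parameter_tuning(X, y, Ridge(), {'alpha': [0.01, 0.1, 1.0, 10.0]})
print(search.best_params_, -search.best_score_)  # best alpha and its validation MSE
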
def k_fold_cv(X, y, feature_desc):
    # Fold 4 is used as a blind set and is not part of training,
    # so it is excluded from the fold_ids list.
    fold_ids = pd.read_csv("data/raw_data/CV_fold_ids_trval.csv")['FoldID'][0:132]
    ps = PredefinedSplit(fold_ids)
    fold_id = 0
    y = y[valence_classifier.label_type]
    for train_index, test_index in ps.split():
        # ps.split() yields positional indices, so index the Series with .iloc.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clf = tune_on_devset(X_train, y_train, X_test, y_test)
        joblib.dump(clf, "data/models/" + feature_desc + "_fold" + str(fold_id) + '.pkl')
        fold_id += 1
    return

def train(self):
    print("- Design the baseline")
    self.build()
    print("- Read train and dev data sets")
    x_traindev, y_traindev, dev_fold = MLcls.read_data(train_file=self.args.train_file,
                                                       dev_file=self.args.dev_file)
    print("- Train the baseline...")
    start = time.time()
    model = GridSearchCV(self.pipeline, self.parameters,
                         cv=PredefinedSplit(test_fold=dev_fold),
                         verbose=5, scoring='f1_weighted')
    model.fit(x_traindev, y_traindev)
    end = time.time()
    print("\t+ Done: %.4f(s)" % (end - start))
    self.best_model = model.best_estimator_
    MLcls.save(self.best_model, self.args.model_name)

def get_validation_splits(self, data):
    """
    Create cross-validation folds of validation data, either by date or randomly.

    :param data: Dataframe to split
    :return: Splitter yielding the validation folds.
    """
    if self.file_args['train_test_method'] == 'date':
        week_folds = self.create_week_cv_folds(data, self.file_args['num_cv_folds'])
        ps = PredefinedSplit(week_folds)
    elif self.file_args['train_test_method'] == 'random':
        # random_state only takes effect with shuffle=True; recent
        # scikit-learn versions raise an error if it is set with shuffle=False.
        ps = StratifiedKFold(n_splits=self.file_args['num_cv_folds'],
                             shuffle=True,
                             random_state=self.file_args['seed'])
    else:
        raise ValueError(f"Unknown train_test_method: {self.file_args['train_test_method']}")
    return ps

def rolling_model_RF(X_traindata=X_traindata, Y_traindata_demean=np.ravel(Y_traindata_demean),
                     X_traindata1=X_traindata1, Y_traindata1=np.ravel(Y_traindata1),
                     X_testdata=X_testdata, Y_testdata=np.ravel(Y_testdata),
                     mean_Ytrain=mean_Ytrain):
    # Mark the first split_num rows as training (-1) and the rest as the
    # single validation fold (0).
    split_num = 200 * 60
    num_valid_size = split_num
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # Specify parameters and distributions to sample from.
    param_dist = {"max_features": sp_randint(5, 100),
                  "max_depth": sp_randint(3, 10),
                  "min_samples_split": sp_randint(10, 1000),
                  "min_samples_leaf": sp_randint(10, 1000),
                  "n_estimators": sp_randint(3, 100),
                  "oob_score": [True, False]}
    clf_RF = RandomForestRegressor(random_state=100)

    # Run randomized search. Passing ps itself (rather than the one-shot
    # generator ps.split()) keeps the splitter reusable; the iid argument
    # was removed in scikit-learn 0.24.
    n_iter_search = 50
    estim = RandomizedSearchCV(clf_RF, param_distributions=param_dist,
                               n_iter=n_iter_search, scoring='r2', n_jobs=-1,
                               cv=ps, random_state=100)
    estim.fit(X_traindata, Y_traindata_demean)

    best_estimator = estim.best_estimator_
    best_VIP = best_estimator.feature_importances_
    train_predict = best_estimator.predict(X_traindata1) + mean_Ytrain
    IS_score = r2_score(Y_traindata1, train_predict)
    test_predict = best_estimator.predict(X_testdata) + mean_Ytrain
    OOS_score = 1 - np.sum((Y_testdata - test_predict) ** 2) / np.sum((Y_testdata - mean_Ytrain) ** 2)
    return IS_score, OOS_score, best_VIP

def grid_search(dataset_path, verbos, saving_path, min_gamma, max_gamma, num_gamma,
                min_c, max_c, num_c, kernel, train=None, dev=None, test=None):
    """
    Grid-search function for SVM; see help for an explanation of the parameters.
    If train, dev and test are given, dataset_path is ignored and they are used instead.
    """
    if train is None or dev is None or test is None:
        train, dev, test = ML_util.get_dataset(dataset_path)

    train_squeezed = ML_util.squeeze_clusters(train)
    train_features, train_labels = ML_util.split_features(train_squeezed)
    dev_squeezed = ML_util.squeeze_clusters(dev)
    train_dev = np.concatenate((train_squeezed, dev_squeezed))
    train_dev_features, train_dev_labels = ML_util.split_features(train_dev)
    test_inds = np.concatenate((-1 * np.ones(len(train_squeezed)), np.zeros(len(dev_squeezed))))
    ps = PredefinedSplit(test_inds)

    gammas = np.logspace(min_gamma, max_gamma, num_gamma)
    cs = np.logspace(min_c, max_c, num_c)
    print()
    parameters = {'C': cs, 'gamma': gammas}
    # Note: the search always uses an RBF kernel; the kernel argument only
    # affects the final classifier below.
    model = svm.SVC(kernel='rbf', class_weight='balanced')
    clf = GridSearchCV(model, parameters, cv=ps)
    print('Starting grid search...')
    start = time.time()
    clf.fit(train_dev_features, train_dev_labels)
    end = time.time()
    print('Grid search completed in %.2f seconds, best parameters are:' % (end - start))
    print(clf.best_params_)

    C = clf.best_params_['C']
    gamma = clf.best_params_['gamma']
    # Create another classifier, as the grid-search one was fit on both train and dev.
    classifier = svm.SVC(kernel=kernel, class_weight='balanced', C=C, gamma=gamma)
    classifier.fit(train_features, train_labels)

    if verbos:
        scores = clf.cv_results_['mean_test_score']
        cs = [round(v, 3) for v in cs]
        gammas = [round(v, 9) for v in gammas]
        create_heatmap(gammas, cs, 'Gamma', 'C', 'SVM Grid Search',
                       scores.reshape((len(cs), len(gammas))), path=saving_path)

    print()
    print('Starting evaluation on test set...')
    return evaluate_predictions(classifier, test, verbos)

def support_vector_machine(sampling=False, isNotebook=False):
    print("=" * 60)
    print("Running support vector machine...")
    DATA_FILE = utils.get_data_directory()

    # The sampling argument determines whether we use oversampling or not.
    if sampling:
        process_method = preprocess.oversample(DATA_FILE)
    else:
        process_method = preprocess.preprocess_data(DATA_FILE)
    X, y = process_method

    X_train, X_test, y_train, y_test = utils.split_data(X, y, 0.6)
    X_val, X_test, y_val, y_test = utils.split_data(X_test, y_test, 0.5)

    X_grid = np.concatenate((X_train, X_val))
    y_grid = np.concatenate((y_train, y_val))
    separation_boundary = [-1 for _ in y_train] + [0 for _ in y_val]
    ps = PredefinedSplit(separation_boundary)

    param_grid = {
        'C': [1.0, 10.0, 100.0, 1000.0],
        'gamma': [0.01, 0.10, 1.00, 10.00],
        'kernel': ['rbf', 'poly']
    }
    clf = GridSearchCV(SVC(random_state=0, probability=True), param_grid, cv=ps)
    model = clf.fit(X_grid, y_grid)

    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    test_acc = model.score(X_test, y_test)
    print(f'training score: {round(train_acc, 3)}')
    print(f'validation score: {round(val_acc, 3)}')
    print(f'testing score: {round(test_acc, 3)}')

    report_dict = classification_report(y_test, model.predict(X_test),
                                        output_dict=True, target_names=["No", "Yes"])
    weights = permutation_importance(model, X_test, y_test)
    top_weights = list(sorted(enumerate(weights.importances_mean),
                              key=lambda x: x[1], reverse=True))

    if isNotebook:
        return top_weights, model
    else:
        utils.display_metrics(report_dict)
        utils.log_results(top_weights)
        utils.generate_report("SVM", "SVM", model, X_test, y_test, report_dict)

def do_grid_search_ridge(X_train, y_train, X_val, y_val):
    # Now let's use sklearn to help us do hyperparameter tuning.
    # GridSearchCV.fit by default splits the data into training and
    # validation itself; we want to use our own splits, so we need to stack our
    # training and validation sets together, and supply an index
    # (validation_fold) to specify which entries are train and which are
    # validation.
    X_train_val = np.vstack((X_train, X_val))
    y_train_val = np.concatenate((y_train, y_val))
    val_fold = [-1] * len(X_train) + [0] * len(X_val)  # 0 corresponds to validation

    # Now we set up and do the grid search over l2reg. The np.concatenate
    # command illustrates my search for the best hyperparameter. In each line,
    # I'm zooming in to a particular hyperparameter range that showed promise
    # in the previous grid. This approach works reasonably well when
    # performance is convex as a function of the hyperparameter, which it seems
    # to be here.
    param_grid = [{
        "l2reg": np.unique(np.concatenate(
            (10.0 ** np.arange(-6, 1, 0.3), np.arange(0.01, 0.05, 0.005))
        ))
    }]
    ridge_regression_estimator = RidgeRegression()
    grid = GridSearchCV(
        ridge_regression_estimator,
        param_grid,
        return_train_score=True,
        cv=PredefinedSplit(test_fold=val_fold),
        refit=True,
        scoring=make_scorer(mean_squared_error, greater_is_better=False),
    )
    grid.fit(X_train_val, y_train_val)
    df = pd.DataFrame(grid.cv_results_)

    # Flip the sign of the score back: GridSearchCV maximizes, so it negates
    # the score when greater_is_better=False.
    df["mean_test_score"] = -df["mean_test_score"]
    df["mean_train_score"] = -df["mean_train_score"]
    cols_to_keep = ["param_l2reg", "mean_test_score", "mean_train_score"]
    df_toshow = df[cols_to_keep].fillna("-")
    df_toshow = df_toshow.sort_values(by=["param_l2reg"])
    return grid, df_toshow

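# A runnable sketch of the same pattern with sklearn's built-in Ridge standing
# in for the custom RidgeRegression class above (its hyperparameter is alpha
# rather than l2reg; the data and grid are illustrative):
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV, PredefinedSplit

rng = np.random.RandomState(0)
X_tr, X_va = rng.randn(80, 3), rng.randn(20, 3)
w = np.array([2.0, -1.0, 0.0])
y_tr, y_va = X_tr @ w + 0.1 * rng.randn(80), X_va @ w + 0.1 * rng.randn(20)

grid = GridSearchCV(
    Ridge(),
    {"alpha": 10.0 ** np.arange(-4, 2)},
    cv=PredefinedSplit([-1] * len(X_tr) + [0] * len(X_va)),
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
)
grid.fit(np.vstack((X_tr, X_va)), np.concatenate((y_tr, y_va)))
print(grid.best_params_, -grid.best_score_)  # negate back to a positive MSE
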
def main():
    """
    Runs an extensive grid search for n_reviews = 1, 2, 5, 10, 'all',
    hyperparameters C = 0.01, 0.1, 1, 10, 100, 1000 and
    max_features = 500, 1000, 5000, 10000.
    Outputs the best parameters for each n in n_reviews.
    """
    for n_reviews in (1, 2, 5, 10, 'all'):
        print("n_reviews: ", n_reviews)
        n_samples = 20_000
        dataset_path = f'data/datasets/dataset_{n_reviews}_train.pkl'
        with open(dataset_path, 'rb') as fd:
            data = pickle.load(fd)
        X_train = [' '.join(reviews) for gender, reviews in data][:n_samples]
        y_train = [gender for gender, reviews in data][:n_samples]

        validation_size = 0.25
        test_size = 0.25
        train_size = 1 - test_size - validation_size
        corpus_train, corpus_test, y_train, y_test = train_test_split(
            X_train, y_train, test_size=test_size)

        param_grid = {
            'max_features': [500, 1000, 5000, 10000],
            'C': [0.01, 0.1, 1, 10, 100, 1000],
        }
        # -1 marks training rows, 0 the validation fold; together they cover
        # the (1 - test_size) fraction of rows kept in corpus_train.
        ps = PredefinedSplit(test_fold=np.concatenate((
            -np.ones(int(train_size * len(X_train))),
            np.zeros(ceil(validation_size * len(X_train))))))
        gs = GridSearchCV(GenderEstimator(), param_grid, cv=ps, n_jobs=4, verbose=3)
        gs.fit(corpus_train, y_train)
        best_score = gs.best_score_
        best_params = gs.best_params_
        print("Best parameters: ", best_params)
        print("Best score: ", best_score)

def random_forest(cfg):
    # Load data
    train_df, valid_df, test_df = get_data(cfg)
    df = pd.concat([train_df, valid_df])

    # Remove columns and split data into (X, y)
    df = df.drop([
        'State_AL', 'State_NC', 'isNaN_rep_income', 'State_FL', 'State_LA',
        'isNaN_uti_card_50plus_pct', 'State_SC', 'State_GA', 'State_MS',
        'auto_open_36_month_num', 'card_open_36_month_num', 'ind_acc_XYZ'
    ], axis=1)
    X = df.drop("Default_ind", axis=1).values
    y = df["Default_ind"].values

    # The next two lines set up the train/validation split used by RandomizedSearchCV.
    split_index = [-1] * len(train_df) + [0] * len(valid_df)
    pds = PredefinedSplit(test_fold=split_index)

    # Create the classifier and the hyperparameter search space
    classifier = RandomForestClassifier(n_jobs=-1, verbose=1)
    param_grid = {
        "n_estimators": np.arange(50, 1000, 100),
        "max_depth": np.arange(1, 20),
        "criterion": ["gini", "entropy"],
        "min_samples_split": np.arange(2, 10),
        "max_features": [0.8, "sqrt", "log2"],
        "min_samples_leaf": np.arange(1, 5),
        "bootstrap": [True, False],
    }
    model = RandomizedSearchCV(
        estimator=classifier,
        param_distributions=param_grid,
        scoring="f1",
        n_iter=700,
        verbose=1,
        n_jobs=1,
        cv=pds,
    )
    model.fit(X, y)

    print(model.best_score_)
    print(model.best_estimator_.get_params())
    with open("rf.pkl", "wb") as f:
        pickle.dump(model.best_estimator_, f)

def prepare_data_gridCrossvalidation(path_samples):
    # ------ Fetch samples
    samples_train = fetch_samples(os.path.join(path_samples, 'train'))
    samples_test = fetch_samples(os.path.join(path_samples, 'test'))
    test_fold = [sample['fold'] for sample in samples_train]

    # ------ Create feature vector
    X_train, X_test, Y_train, Y_test, class_names, fvector_labels = create_fvector_train_test(
        samples_train, samples_test)
    folds = PredefinedSplit(test_fold)
    return X_train, X_test, Y_train, Y_test, class_names, fvector_labels, folds, samples_train, samples_test

def defined(self, test_record_names):
    """Run evaluation with previously specified detectors on previously
    specified records. Do not use cross-validation; instead, do a predefined
    split, taking the given records as test records and all other records as
    training records.

    Args:
        test_record_names (list of str): List of record names to use for
            testing. All other records known to this evaluator are used for
            training the detectors.
    """
    test_fold = [
        0 if record.record_name in test_record_names else -1
        for record in self.records
    ]
    self.cval = PredefinedSplit(test_fold)
    return self._eval_cross_validator()

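# A small sketch of the fold construction used in defined() above, with plain
# strings standing in for the record objects (the record names are made up):
from sklearn.model_selection import PredefinedSplit

records = ['100', '101', '102', '103']
test_record_names = {'101', '103'}
test_fold = [0 if name in test_record_names else -1 for name in records]
print(test_fold)  # [-1, 0, -1, 0]: one split, records 101 and 103 held out
split = PredefinedSplit(test_fold)
print(list(split.split()))  # [(array([0, 2]), array([1, 3]))]
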
def generate_cv_splitter(self, df, p_ids_as_testsets=[]):
    """DEPRECATED: Since BayesOpt requires at least 2 cv splits, this function
    returns a cv object whose splits use the provided profile IDs as test
    sets. The given df is expected to hold the original train and test sets,
    but not the validation set, as that is part of the early stopping
    criteria.
    """
    assert len(p_ids_as_testsets) > 1, 'provide at least two p_ids that ' \
                                       'shall act as testsets!'
    df.loc[:, 'cv_split'] = np.NaN
    for p_enum, p_id in enumerate(p_ids_as_testsets):
        df.loc[df[self.PROFILE_ID_COL] == p_id, ['cv_split']] = p_enum
    df.fillna({'cv_split': -1}, inplace=True)
    ps = PredefinedSplit(test_fold=df['cv_split'].values)
    return ps

def preprocessed_data(features_dataframe: pd.DataFrame) -> PreprocessedData:
    kwargs = {
        "X_train": features_dataframe[:4].to_numpy(),
        "y_train": np.array([[0, 0, 0, 1],
                             [1, 0, 0, 1],
                             [1, 0, 1, 0],
                             [1, 1, 0, 0]], dtype=np.float32),
        "X_test": features_dataframe[4:5].to_numpy(),
        "y_test": np.array([[0, 0, 0, 1]], dtype=np.float32),
        # Three training rows (-1) and one validation row (fold 0).
        "splits": PredefinedSplit([-1, -1, -1, 0]),
        "lb": MultiLabelBinarizer(),
    }
    return PreprocessedData(**kwargs)

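# PredefinedSplit([-1, -1, -1, 0]) yields exactly one train/test split, since
# rows marked -1 are never held out; a quick check:
from sklearn.model_selection import PredefinedSplit

split = PredefinedSplit([-1, -1, -1, 0])
print(split.get_n_splits())      # 1
for train_idx, test_idx in split.split():
    print(train_idx, test_idx)   # [0 1 2] [3]
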
def svm_exp():
    print("=" * 60)
    print("Running experiment on SVM...")
    TRAIN_SET = utils.get_data_directory()
    TEST_SET = utils.get_data_directory(fileName="/experiment-dataset.csv")

    X, y = preprocess.oversample(TRAIN_SET)
    X = np.delete(X, slice(4, 13), 1)
    X_train, X_val, y_train, y_val = utils.split_data(X, y, 0.8)
    X_test, y_test = preprocess.preprocess_experiment(TEST_SET)

    X_grid = np.concatenate((X_train, X_val))
    y_grid = np.concatenate((y_train, y_val))
    separation_boundary = [-1 for _ in y_train] + [0 for _ in y_val]
    ps = PredefinedSplit(separation_boundary)

    param_grid = {
        'C': [1.0, 10.0, 100.0, 1000.0],
        'gamma': [0.01, 0.10, 1.00, 10.00],
        'kernel': ['rbf', 'poly']
    }
    print(X_train.shape)
    clf = GridSearchCV(SVC(random_state=0), param_grid, cv=ps)
    model = clf.fit(X_grid, y_grid)

    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    test_acc = model.score(X_test, y_test)
    print(f'training score: {round(train_acc, 3)}')
    print(f'validation score: {round(val_acc, 3)}')
    print(f'testing score: {round(test_acc, 3)}')

    report_dict = classification_report(y_test, model.predict(X_test),
                                        output_dict=True, target_names=["No", "Yes"])
    utils.display_metrics(report_dict)
    imps = permutation_importance(model, X_test, y_test)
    top_feature_importances = list(
        sorted(enumerate(imps.importances_mean), key=lambda x: x[1], reverse=True))
    utils.log_results(top_feature_importances)
    utils.generate_report("Experiment SVM", "Experimental SVM", model,
                          X_test, y_test, report_dict)

def validate(self, cv_splits, num_runs):
    x = pd.concat([self.x_train, self.x_val], axis=0)
    y = pd.concat([self.y_train, self.y_val], axis=0)
    if cv_splits == 1:
        # Single split: everything except the last 12 rows trains, the last
        # 12 rows validate; repeat the same split num_runs times.
        splitter = PredefinedSplit([-1 for _ in range(len(x) - 12)] + [0 for _ in range(12)])
        split = list(splitter.split(X=x, y=y)) * num_runs
    else:
        splitter = TimeSeriesSplit(cv_splits, max_train_size=len(x) - 12)
        split = list(splitter.split(X=x, y=y)) * num_runs
    res = map(self._validate, split)
    res = np.mean(list(res), axis=0)
    # K.clear_session()
    return res[0][0], res[1][0]

def __init__(self, parameters, n_iter=50, n_initial=10, n_jobs=1, scoring=None,
             iid=True, verbose=0, pre_dispatch='2*n_jobs', random_state=None,
             error_score='raise', return_train_score=False,
             results_filename='hps_results.pkl'):
    fixed_params = {}
    search_spaces = {}
    for par, val in parameters.items():
        if val.__class__ in (Real, Integer, Categorical):
            search_spaces[par] = val
        else:
            fixed_params[par] = val
    self.estimator = SklEstimator(**fixed_params)
    self.results_filename = results_filename
    # A dummy dataset and split, just to satisfy scikit-learn's interface.
    self.X = np.arange(10)
    vfold = np.zeros((10,), int)  # np.int was removed in NumPy 1.24
    vfold[:5] = -1
    psplit = PredefinedSplit(vfold)
    super().__init__(self.estimator, search_spaces=search_spaces,
                     optimizer_kwargs=dict(n_initial_points=n_initial),
                     n_iter=n_iter, n_jobs=n_jobs, scoring=scoring,
                     fit_params=None, iid=iid, refit=False, cv=psplit,
                     verbose=verbose, pre_dispatch=pre_dispatch,
                     random_state=random_state, error_score=error_score,
                     return_train_score=return_train_score)

def get_lr_model_with_cv_on_clean(alphas, X_train_all, Y_train_all, ms, clean_task, T):
    # Spread the weights at the sample level and define the CV split.
    all_alphas = np.repeat(alphas * T, ms)
    indexes_cv = (-1) * np.ones(X_train_all.shape[0])
    clean_begins = np.sum(ms[:clean_task])
    curr_m = ms[clean_task]
    # Upweight the clean task by 5/4, presumably to compensate for the 1/5 of
    # its samples held out in each CV fold.
    all_alphas[clean_begins:(clean_begins + curr_m)] = \
        all_alphas[clean_begins:(clean_begins + curr_m)] * (5 / 4)
    # Split the clean task's samples into 5 folds; all other samples stay at
    # -1 and are therefore always in training.
    for l in range(5):
        indexes_cv[(clean_begins + l * int(curr_m / 5)):(clean_begins + (l + 1) * int(curr_m / 5))] = l
    ps = PredefinedSplit(indexes_cv)

    # Train on all data, with 5-fold CV on the clean data.
    lr = LogisticRegressionCV(fit_intercept=False, cv=ps)
    lr.fit(X_train_all, Y_train_all, sample_weight=all_alphas)
    best_w = lr.coef_[0]
    return best_w

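# A tiny numeric check of the fold layout built above, with made-up sizes:
# two tasks of 10 samples each, the second one clean, so its 10 samples are
# split into 5 folds of 2 while the first task's rows stay at -1.
import numpy as np

ms = np.array([10, 10])
clean_task = 1
indexes_cv = -1 * np.ones(ms.sum())
clean_begins, curr_m = np.sum(ms[:clean_task]), ms[clean_task]
for l in range(5):
    indexes_cv[clean_begins + l * (curr_m // 5):clean_begins + (l + 1) * (curr_m // 5)] = l
print(indexes_cv)  # [-1 ... -1  0 0 1 1 2 2 3 3 4 4]
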
def rolling_model_GBRTH(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    # Mark the first split_num rows as training (-1) and the second
    # split_num rows as the validation fold (0).
    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # Specify parameters and distributions to sample from.
    param_dist = {
        "max_features": sp_randint(5, 100),
        "max_depth": sp_randint(3, 12),
        "min_samples_split": sp_randint(100, 1000),
        "min_samples_leaf": sp_randint(100, 1000),
        "n_estimators": sp_randint(5, 100),
        "learning_rate": uniform(0.001, 0.1),
        "subsample": uniform(0.6, 0.4)
    }
    clf_GBRT = GradientBoostingRegressor(loss='huber', random_state=100)

    # Run randomized search; as above, pass ps itself rather than the one-shot
    # generator ps.split(), and drop the iid argument (removed in
    # scikit-learn 0.24).
    n_iter_search = 100
    estim = RandomizedSearchCV(clf_GBRT, param_distributions=param_dist,
                               n_iter=n_iter_search, scoring='r2',
                               cv=ps, random_state=100)
    estim.fit(X_traindata, Y_traindata)

    best_estimator = estim.best_estimator_
    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)
    return v_performance_score, test_performance_score

def __fun_param_set(self):
    """Set the parameters used to train the DNN, based on the input parameters."""
    # Set the number of neurons in the hidden layers;
    # layer 2 has half as many neurons as layer 1.
    neuron_num_1st_layer = [int(0.1 * self.input_dim), int(0.5 * self.input_dim)]
    neuron_num_2nd_layer = [int(x / 2) for x in neuron_num_1st_layer]
    self.neurons = list(zip(neuron_num_1st_layer, neuron_num_2nd_layer))
    self.neurons = [list(x) for x in self.neurons]
    self.optimizer = Adam()

    # Set the activation function for the hidden layers.
    self.activation_hidden = 'relu'

    # Set the activation/loss function for the output layer based on the output dimensionality.
    if self.output_dim > 1:
        self.activation_output = 'sigmoid'  # multi-class multi-label classification
        self.loss_fun = 'binary_crossentropy'
    else:
        self.activation_output = 'softmax'  # binary classification
        self.loss_fun = 'categorical_crossentropy'

    # Set the batch size.
    if self.batch_size_flag:
        self.batch_size = [16, 32]  # tune the batch size
    else:
        self.batch_size = [32]  # fixed batch size

    if self.dropout_flag:
        self.dropout_rate = [0.2, 0.4]
    else:
        self.dropout_rate = [0.4]

    # Split the training data into training and validation sets
    # (fast version of model training).
    if self.cv == 1:
        t_size = int(self.x_train.shape[0] * 0.8)
        self.train_val_split = [-1] * t_size + [0] * (self.x_train.shape[0] - t_size)
        seed(self.rand_seed)
        shuffle(self.train_val_split)
        self.ps = PredefinedSplit(self.train_val_split)
    else:
        self.ps = self.cv

def RecommendByDecisionTree(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Recommend using a decision tree.

    recommendNum: number of recommendations
    max_depth: maximum tree depth
    min_samples_split: minimum number of samples required to split an internal node
    min_samples_leaf: minimum number of samples required at a leaf node
    class_weight: class weights
    """
    # Split the training set 70/30 into a training part and a validation part;
    # use a custom validation set rather than cross-validation.
    test_fold = numpy.zeros(train_data.shape[0])
    test_fold[:ceil(train_data.shape[0] * 0.7)] = -1
    ps = PredefinedSplit(test_fold=test_fold)

    # Parameter grid to tune.
    grid_parameters = [
        {'min_samples_leaf': [2, 4, 8, 16, 32, 64],
         'max_depth': [2, 4, 6, 8],
         'class_weight': [None]}]
    # scores = ['precision', 'recall']  # scoring criteria

    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import GridSearchCV
    clf = DecisionTreeClassifier()
    clf = GridSearchCV(clf, param_grid=grid_parameters, cv=ps, n_jobs=-1)
    clf.fit(train_data, train_data_y)
    print(clf.best_params_)

    # dot_data = export_graphviz(clf, out_file=None)
    # graph = graphviz.Source(dot_data)
    # graph.render("DTree")

    pre = clf.predict_proba(test_data)
    pre_class = clf.classes_
    recommendList = DataProcessUtils.getListFromProbable(pre, pre_class, recommendNum)
    answer = [[x] for x in test_data_y]
    return [recommendList, answer]

def test_fogd_softmax_gridsearch():
    print("========== Tune parameters for FOGD for multiclass classification ==========")

    np.random.seed(random_seed())

    (x_train, y_train), (x_test, y_test) = demo.load_iris()
    print("Number of training samples = {}".format(x_train.shape[0]))
    print("Number of testing samples = {}".format(x_test.shape[0]))

    x = np.vstack((x_train, x_test))
    y = np.concatenate((y_train, y_test))

    params = {'gamma': [0.5, 1.0], 'learning_rate': [0.01, 0.5, 0.1]}

    ps = PredefinedSplit(test_fold=[-1] * x_train.shape[0] + [1] * x_test.shape[0])

    clf = FOGD(model_name="FOGD_hinge", D=100, lbd=0.0, gamma=0.5, loss='hinge',
               catch_exception=True, random_state=random_seed())

    gs = GridSearchCV(clf, params, cv=ps, n_jobs=-1, refit=False, verbose=True)
    gs.fit(x, y)

    print("Best error {} @ params {}".format(-gs.best_score_, gs.best_params_))

    best_clf = clone(clf).set_params(**gs.best_params_)
    best_clf.fit(x_train, y_train)
    print("Mistake rate = %.4f" % best_clf.mistake)

    # Offline prediction
    print("Offline prediction")
    y_train_pred = best_clf.predict(x_train)
    y_test_pred = best_clf.predict(x_test)
    train_err = 1 - metrics.accuracy_score(y_train, y_train_pred)
    test_err = 1 - metrics.accuracy_score(y_test, y_test_pred)
    print("Training error = %.4f" % train_err)
    print("Testing error = %.4f" % test_err)

def test_tfglm_regression_gridsearch():
    print("========== Tune parameters for TensorFlowGLM for regression ==========")

    np.random.seed(random_seed())

    (x_train, y_train), (x_test, y_test) = demo.load_housing()
    print("Number of training samples = {}".format(x_train.shape[0]))
    print("Number of testing samples = {}".format(x_test.shape[0]))

    x = np.vstack((x_train, x_test))
    y = np.concatenate((y_train, y_test))

    params = {'l1_penalty': [0.0, 0.0001], 'l2_penalty': [0.0001, 0.001, 0.01]}

    ps = PredefinedSplit(test_fold=[-1] * x_train.shape[0] + [1] * x_test.shape[0])

    clf = TensorFlowGLM(
        model_name="TensorFlowGLM_regression_gridsearch",
        task='regression',
        link='linear',              # link function
        loss='quadratic',           # loss function
        l2_penalty=0.0,             # ridge regularization
        l1_penalty=0.0,             # Lasso regularization
        l1_smooth=1E-5,             # smoothing for Lasso regularization
        l1_method='pseudo_huber',   # approximation method for L1-norm
        learning_rate=0.0001,
        catch_exception=True,
        random_state=random_seed())

    gs = GridSearchCV(clf, params, cv=ps, n_jobs=1, refit=False, verbose=True)
    gs.fit(x, y)

    print("Best MSE {} @ params {}".format(-gs.best_score_, gs.best_params_))

    best_clf = clone(clf).set_params(**gs.best_params_)
    best_clf.fit(x_train, y_train)

    train_err = -best_clf.score(x_train, y_train)
    test_err = -best_clf.score(x_test, y_test)
    print("Training MSE = %.4f" % train_err)
    print("Testing MSE = %.4f" % test_err)
    assert abs(test_err + gs.best_score_) < 1e-4

def test_rsrbm_gridsearch():
    print("========== Tuning parameters for the pipeline of "
          "ReplicatedSoftmaxRBM followed by k-nearest-neighbors (kNN) ==========")

    np.random.seed(random_seed())

    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import PredefinedSplit
    from sklearn.neighbors import KNeighborsClassifier

    (x_train, y_train), (x_test, y_test) = demo.load_20newsgroups()
    x = np.vstack([x_train, x_test])
    y = np.concatenate([y_train, y_test])

    estimators = [('rbm', ReplicatedSoftmaxRBM(num_hidden=15,
                                               num_visible=5000,
                                               batch_size=32,
                                               num_epochs=2,
                                               learning_rate=0.001,
                                               learning_rate_hidden=0.00001,
                                               momentum_method='sudden',
                                               weight_cost=2e-4,
                                               random_state=random_seed(),
                                               verbose=0)),
                  ('knn', KNeighborsClassifier(n_neighbors=4))]

    params = dict(rbm__num_hidden=[10, 15],
                  rbm__batch_size=[64, 100],
                  knn__n_neighbors=[1, 2])

    ps = PredefinedSplit(test_fold=[-1] * x_train.shape[0] + [1] * x_test.shape[0])

    clf = Pipeline(estimators)
    gs = GridSearchCV(clf, params, cv=ps, n_jobs=-1, refit=False, verbose=True)
    gs.fit(x, y)

    print("Best error {} @ params {}".format(1.0 - gs.best_score_, gs.best_params_))

def computePredefinedSplit(dataset, parameters):
    tenFold = TenFoldArffFile(dataset)
    X = None
    Y = None
    foldIdx = 0
    while tenFold.loadNextFold():
        xTrain, yTrain, xTest, yTest = getFoldData(tenFold)
        if X is None:
            X = np.concatenate([xTrain, xTest])
            Y = np.concatenate([yTrain, yTest])
            indexes = np.full(X.shape[0], -1)
        xTrain = xTrain.to_numpy()
        xTest = xTest.to_numpy()
        # Mark every row that appears in this fold's test set with the fold index.
        for item in xTest:
            index = np.where((X == item).all(axis=1))[0]
            indexes[index] = foldIdx
        foldIdx += 1
    return X, Y, PredefinedSplit(indexes)

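# With fold ids 0..9 assigned to every row (and no -1 entries left),
# PredefinedSplit reproduces the original 10-fold partition. A reduced 3-fold
# check with made-up fold ids:
import numpy as np
from sklearn.model_selection import PredefinedSplit

indexes = np.array([0, 1, 2, 0, 1, 2])
split = PredefinedSplit(indexes)
print(split.get_n_splits())  # 3
for train_idx, test_idx in split.split():
    print(test_idx)  # each fold's rows are held out exactly once
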
def get_n_fold_by_drugs(all_drugs, n_splits=5):
    unique_drugs = np.unique(all_drugs, axis=0)
    test_folds = np.ones(all_drugs.shape[0])
    # shuffle=True is required when a random_state is given
    # (recent scikit-learn versions raise an error otherwise).
    kf = KFold(n_splits, shuffle=True, random_state=15)
    j = 0
    for _, validation_drugs in kf.split(np.arange(unique_drugs.shape[0])):
        val_inds = []
        for drug_ind in validation_drugs:
            # Collect the rows whose drug exactly matches this unique drug.
            willbe_added = list(
                np.where((~(all_drugs == unique_drugs[drug_ind, :])).sum(axis=1) == 0)[0])
            val_inds += willbe_added
        test_folds[val_inds] = j
        j += 1
    return PredefinedSplit(test_folds)

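# A small sketch of the grouped split above: rows sharing a drug always land
# in the same fold. The drug feature vectors here are made up.
import numpy as np

all_drugs = np.array([[1, 0], [1, 0], [0, 1], [0, 1], [1, 1], [1, 1]])
ps = get_n_fold_by_drugs(all_drugs, n_splits=3)
for train_idx, test_idx in ps.split():
    print(test_idx)  # each pair of duplicate rows is held out together
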
def gridsearch_method(self, df):
    code = df.iloc[0]["product_code"]
    if code not in self.gs_code:
        self.gs_code.append(code)
        df.sort_index(inplace=True)
        from sklearn.model_selection import GridSearchCV
        from sklearn.model_selection import PredefinedSplit
        from sklearn.ensemble import GradientBoostingRegressor

        # Drop the last 18 rows, then use the final 18 of the remaining rows
        # as the validation fold.
        train_feature = df.head(df.shape[0] - 18).iloc[:][self.train_col]
        train_real = df.head(df.shape[0] - 18).iloc[:]["True_volume"]
        val_split = np.zeros(train_feature.shape[0])
        val_split[:(train_feature.shape[0] - 18)] = -1
        ps = PredefinedSplit(test_fold=val_split)

        GBR = GradientBoostingRegressor(random_state=0)
        self.clf[code] = GridSearchCV(GBR, self.param,
                                      scoring='neg_mean_absolute_error', cv=ps)
        self.clf[code].fit(train_feature, train_real)
        print(code, self.clf[code].best_params_)

def cv_score(self, leaf, lay, bootstrap):
    balance = {0: 1, 1: 1.4}
    self.forest = RandomForestClassifier(n_estimators=100,
                                         min_samples_leaf=leaf,
                                         max_depth=lay,
                                         class_weight=balance,
                                         bootstrap=bootstrap)
    cv_score = cross_validate(self.forest,
                              self.train_set_params,
                              self.train_set_labels,
                              cv=PredefinedSplit(self.stock_kfold_idxs),
                              return_train_score=True)
    train_score, test_score = cv_score["train_score"], cv_score["test_score"]
    print(f"The training score was {np.mean(train_score)} "
          f"and the validation score {np.mean(test_score)}")