def test_jdata_fscorer_class():
    monkey_patch.run()
    user_sku_pair, data, target, pred_proba, test_fold, expected_scores = (
        get_jdata_test_cases())
    pred_map = {}
    clf = MockEstimatorWithPredefinedPrediction(pred_map)
    ps = PredefinedSplit(test_fold)
    for train_index, test_index in ps.split():
        print("TRAIN:", train_index, "TEST:", test_index)
        clf.set(data[train_index, :], pred_proba[train_index])
        clf.set(data[test_index, :], pred_proba[test_index])
    scoring = {
        "custom_score_index": JDataScore(),
        "custom_score_index_with_user_sku_pair": JDataScore(user_sku_pair),
    }
    scores = cross_validate(clf, data, target, scoring=scoring, cv=ps,
                            return_estimator=True)
    for name in scoring.keys():
        assert_almost_equal(scores[f"test_{name}"], expected_scores)

def nested_cv(X, y, estimator, scorer, param_grid, num_trials=10,
              n_splits=3, n_high=5, random_state=42 * 31415):
    groups = group_samples_by_threshold(y, [1e3, 1e5])

    # Data storage for CV scores
    cv_scores = []
    nested_scores = np.full(num_trials, -np.inf)

    # Best regression model (return value)
    rg_best = None

    for i in tqdm(range(num_trials)):
        seed = i * random_state
        inner_cv = PredefinedSplit(split_keep_n_high_grouped(
            y, groups, folds=n_splits, n_high=n_high, random_state=seed))
        outer_cv = PredefinedSplit(split_keep_n_high_grouped(
            y, groups, folds=n_splits, n_high=n_high, random_state=seed))

        # Non-nested parameter search and scoring
        rg = GridSearchCV(estimator=estimator, param_grid=param_grid,
                          iid=False, cv=inner_cv, scoring=scorer,
                          return_train_score=True)
        rg.fit(X, y)

        # Nested CV with parameter optimization
        nested_score = cross_val_score(rg.best_estimator_, X=X, y=y,
                                       cv=outer_cv, scoring=scorer)
        nested_scores[i] = nested_score.mean()
        if nested_scores.max() == nested_scores[i]:
            rg_best = rg.best_estimator_
        cv_scores.append({
            'gs_scores': pd.DataFrame(rg.cv_results_)
                           .sort_values('mean_test_score')[['params', 'mean_test_score']],
            'ns_scores': nested_score,
        })
    return rg_best, cv_scores

def __init__(self, X, p=5, n_splits=2):
    self.X = X
    self.p = p
    self.n_splits = n_splits
    # Chunk each user's first n_splits * p rows into folds of size p;
    # everything after that is marked -1 (never used as a test sample).
    test_fold = self.X.groupby("user_id").cumcount().apply(
        lambda x: int(x / p) if x < (n_splits * p) else -1)
    self.s = PredefinedSplit(test_fold)

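# Usage sketch for the __init__ above, wrapped in a hypothetical class name
# (the snippet does not show the class it belongs to). Each user's first
# n_splits * p rows are dealt into folds of size p; the remainder
# (test_fold == -1) never appears in a test set.
import pandas as pd
from sklearn.model_selection import PredefinedSplit


class UserLeavePOutSplit:  # hypothetical wrapper for the __init__ above
    def __init__(self, X, p=5, n_splits=2):
        self.X, self.p, self.n_splits = X, p, n_splits
        test_fold = self.X.groupby("user_id").cumcount().apply(
            lambda x: int(x / p) if x < (n_splits * p) else -1)
        self.s = PredefinedSplit(test_fold)


df = pd.DataFrame({"user_id": [1] * 6 + [2] * 6, "x": range(12)})
for train_idx, test_idx in UserLeavePOutSplit(df, p=2, n_splits=2).s.split():
    print("TRAIN:", train_idx, "TEST:", test_idx)
# Folds 0 and 1 each test two rows per user; rows 4-5 and 10-11 always train.
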
def _load(collection, name, dirname=None):
    """Load dataset."""
    filename = _fetch_partition(collection, name, '', dirname=dirname)
    filename_tr = _fetch_partition(collection, name, '.tr', dirname=dirname)
    filename_val = _fetch_partition(collection, name, '.val', dirname=dirname)
    filename_t = _fetch_partition(collection, name, '.t', dirname=dirname)
    filename_r = _fetch_partition(collection, name, '.r', dirname=dirname)

    if (filename_tr is not None) and (filename_val is not None) and (filename_t is not None):
        _, _, X_tr, y_tr, X_val, y_val, X_test, y_test = load_svmlight_files(
            [filename, filename_tr, filename_val, filename_t])
        cv = PredefinedSplit([-1] * X_tr.shape[0] + [0] * X_val.shape[0])
        X = sp.sparse.vstack((X_tr, X_val))
        y = np.hstack((y_tr, y_val))
        X_remaining = y_remaining = None
    elif (filename_tr is not None) and (filename_val is not None):
        _, _, X_tr, y_tr, X_val, y_val = load_svmlight_files(
            [filename, filename_tr, filename_val])
        cv = PredefinedSplit([-1] * X_tr.shape[0] + [0] * X_val.shape[0])
        X = sp.sparse.vstack((X_tr, X_val))
        y = np.hstack((y_tr, y_val))
        X_test = y_test = X_remaining = y_remaining = None
    elif (filename_t is not None) and (filename_r is not None):
        X, y, X_test, y_test, X_remaining, y_remaining = load_svmlight_files(
            [filename, filename_t, filename_r])
        cv = None
    elif filename_t is not None:
        X, y, X_test, y_test = load_svmlight_files([filename, filename_t])
        X_remaining = y_remaining = cv = None
    else:
        X, y = load_svmlight_file(filename)
        X_test = y_test = X_remaining = y_remaining = cv = None
    return X, y, X_test, y_test, cv, X_remaining, y_remaining

def rolling_model_PLS(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    # Mark the last len(X_vdata) samples as the single validation fold
    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {'n_components': sp_randint(1, 100),
                  'max_iter': sp_randint(50, len(X_traindata)),
                  'tol': [0.0001, 0.00001, 0.000001, 0.0000001]}
    PLS_model = PLSRegression(scale=False)

    # run the randomized search
    n_iter_search = 50
    estim = RandomizedSearchCV(PLS_model, param_distributions=param_dist,
                               scoring='r2', cv=ps.split(), iid=False,
                               n_jobs=1, n_iter=n_iter_search)
    estim.fit(X_traindata, Y_traindata)
    best_estimator = estim.best_estimator_

    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)
    return v_performance_score, test_performance_score

def decode(X, y, cv_ids, model):
    """
    Parameters
    ----------
    X : np.array, n_stimuli x n_voxels
    y : np.array, n_stimuli
    cv_ids : np.array, n_stimuli

    Returns
    -------
    models, scores
    """
    scores = []
    models = []
    ps = PredefinedSplit(cv_ids)
    for train_index, test_index in ps.split():
        # split the data
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # fit the model on the training set
        model.fit(X_train, y_train)
        # calculate the accuracy for the held-out run
        score = model.score(X_test, y_test)
        # save stuff
        models.append(deepcopy(model))
        scores.append(score)
    return models, scores

def test_predefined_split():
    cv = PredefinedSplit(np.array(list(range(4)) * 5))
    cv2 = PredefinedSplit(np.array(list(range(5)) * 4))
    assert tokenize(cv) == tokenize(cv)
    assert tokenize(cv) != tokenize(cv2)

    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol
    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol

def main(argv):
    start_time = datetime.now()
    logger.info("START")
    args = argparser.parse_args()
    inFile = args.inFile
    testFile = args.testFile
    nameModel = args.nameModel
    conf_file = args.mod
    mod = __import__(conf_file, fromlist=['*'])
    model_conf = mod.gridSearch_Model_types[nameModel]
    conf = getattr(__import__(conf_file, fromlist=[model_conf]), model_conf)
    prefix_dict = conf['prefix_dict']
    out_dict = h.outfileName(fo=args.outFile, fi=inFile,
                             prefix_dict=prefix_dict, add_date=True)
    logger.info("RUNNING WITH MOD: %s, INFILE: %s" % (conf_file, inFile))
    logger.info("LOADING THE DATA SET")
    param_grid = PARAM_DICT[nameModel]
    # scoring = {'Accuracy': make_scorer(accuracy_score), 'RMS': make_scorer(mean_squared_error)}
    scoring = {'RMS': make_scorer(r2_score)}
    X, Y, len_train, numFeatures = readFile(inFile)
    cv = None
    if testFile:
        logger.info("USING TEST FILE %s AS TEST SET FOR THE CROSS VALIDATION" % testFile)
        X_test, Y_test, len_train_test, numFeatures_test = readFile(testFile)
        X = pd.concat([X, X_test], ignore_index=True)
        Y = pd.concat([Y, Y_test], ignore_index=True)
        # -1 keeps the training rows out of every test fold; 0 marks the
        # appended test file as the single validation fold
        cv_arr = [-1] * len_train
        cv_arr.extend([0] * len_train_test)
        cv = PredefinedSplit(test_fold=cv_arr)
        print("cv:", cv)
        print("number of folds:", cv.get_n_splits())
        for train_index, test_index in cv.split():
            print("TRAIN:", train_index, "TEST:", test_index)
        logger.info("SHAPE OF X:%s AND Y:%s AFTER APPEND", X.shape, Y.shape)
    logger.info("CREATION OF THE MODEL")
    t = TestClass(conf=conf, nm=nameModel, nf=numFeatures)
    if nameModel == 'NN':
        model = KerasClassifier(build_fn=t.createModelNN)
        X = X.to_numpy()
        Y = Y.to_numpy()
    else:
        model = t.selectModel()
    logger.info("START GRID SEARCH")
    grid_result = gridSearch(model, param_grid, cv, X, Y, scoring)
    logger.info("END OF GRID SEARCH")
    logger.info("PRINTING RESULTS")
    gridResults(grid_result, X, nameModel)
    SaveModel(nameModel, grid_result)
    logger.info("EXECUTED IN %f SEC" % (datetime.now() - start_time).total_seconds())
    logger.info("END")

def split_dataset(dataset):
    X = dataset.drop(y_col, axis=1)
    y = dataset[y_col]
    # Tile fold_pattern so it covers every row of the dataset
    test_fold = (fold_pattern * (
        (dataset.shape[0] - 1) // len(fold_pattern) + 1))[:dataset.shape[0]]
    splitter = PredefinedSplit(test_fold)
    for train_index, test_index in splitter.split():
        X_train, X_test = safe_indexing(X, train_index), safe_indexing(X, test_index)
        y_train, y_test = safe_indexing(y, train_index), safe_indexing(y, test_index)
    # only the last (typically only) split is returned
    return X_train, y_train, X_test, y_test

def split(self, data):
    """Perform a data split with a fixed size for the test set."""
    if data.is_row_split_validation():
        # Time series: split data by columns
        data_size = data.get_features().shape[1]
    else:
        data_size = data.get_features().shape[0]
    # The first data_size - test_size_ samples train; the rest form fold 0
    test_fold = [-1 for i in range(0, data_size - self.test_size_)]
    test_fold += [0 for i in range(data_size - self.test_size_, data_size)]
    splitter = PredefinedSplit(test_fold=test_fold)
    return splitter.split()

def rolling_model_ENetH(df_X, df_Y):
    split_num = 200 * 60
    X_traindata = df_X[:split_num * 2]
    Y_traindata = df_Y[:split_num * 2]
    X_vdata = df_X[split_num:split_num * 2]
    X_testdata = df_X[split_num * 2:split_num * 3]
    Y_testdata = df_Y[split_num * 2:split_num * 3]

    # Mark the last len(X_vdata) samples as the single validation fold
    num_valid_size = len(X_traindata) - len(X_vdata)
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {
        'alpha': uniform(0.00001, 0.1),
        'power_t': uniform(0.1, 0.9),
        'l1_ratio': uniform(0.1, 0.9),
        'eta0': uniform(0.00001, 0.1),
        'epsilon': uniform(0.01, 0.9),
        'max_iter': sp_randint(5, 10000),
        'tol': [0.01, 0.001, 0.0001, 0.00001],
        'fit_intercept': [True, False]
    }
    clf = SGDRegressor(shuffle=False, loss='huber', penalty='elasticnet',
                       random_state=100)

    # run randomized search
    n_iter_search = 100
    estim = RandomizedSearchCV(clf, param_distributions=param_dist,
                               n_iter=n_iter_search, scoring='r2',
                               cv=ps.split(), iid=False, random_state=100,
                               n_jobs=1)
    estim.fit(X_traindata, Y_traindata)
    best_estimator = estim.best_estimator_

    v_pred = best_estimator.predict(df_X[:split_num])
    v_performance_score = r2_score(df_Y[:split_num], v_pred)
    test_pre_y_array = best_estimator.predict(X_testdata)
    test_performance_score = r2_score(Y_testdata, test_pre_y_array)
    return v_performance_score, test_performance_score

def train_self(scoring='accuracy'):
    csv_dir = Path("Features/CSV")
    for i in range(10):
        train_test_dir = csv_dir / f"train_test{i}"
        results_dir = Path("Results") / f"self{i}"
        results_dir.mkdir(exist_ok=True)
        for dataset_file in train_test_dir.glob("*test*"):
            dataset = str(dataset_file.stem).split("_test")[0]
            suffixes = ["train", "train_train", "train_val"]
            df_dict = {
                key: pd.read_csv(train_test_dir / f"{dataset}_{key}.csv")
                for key in suffixes
            }

            # xgboost with eval set
            data = pd.concat([df_dict["train_train"], df_dict["train_val"]], axis=0)
            data.reset_index(inplace=True, drop=True)
            val_idx = np.concatenate(
                ((-1) * np.ones(df_dict["train_train"].shape[0]),
                 np.zeros(df_dict["train_val"].shape[0])))
            ps = PredefinedSplit(val_idx)
            X = data.drop(columns=["Label", "microRNA_name"])
            y = data.Label.ravel()
            train_index, val_index = next(ps.split())
            X_val = X.iloc[val_index]
            y_val = y[val_index]

            output_file = results_dir / f"{dataset}_xgbs_val_results.csv"
            print(output_file)
            if not output_file.exists():
                clf = XGBClassifier(silent=True)
                grid_obj = GridSearchCV(clf, XGBS_PARAMS, scoring=scoring,
                                        cv=ps, verbose=3)
                fit_params = {
                    "eval_set": [(X_val, y_val)],
                    "early_stopping_rounds": 50
                }
                grid_obj.fit(X, y, **fit_params)
                print('\n Best estimator:')
                print(grid_obj.best_estimator_)
                print(grid_obj.best_score_ * 2 - 1)
                results = pd.DataFrame(grid_obj.cv_results_)
                results.to_csv(output_file, index=False)

def cross_val_predict(X, y, estimator, param_grid=None, num_cvfolds=5,
                      num_tunefolds=3, logger=None, random_state=None):
    """
    Generates predictions for all instances in X using cross-validation.
    """
    # create folds by assigning each instance a random fold id
    np.random.seed(random_state)
    cv_folds = np.random.randint(num_cvfolds, size=X.shape[0])

    # store predictions
    p = y.copy().astype(float)

    # make predictions on each fold
    for i, (train_index, test_index) in enumerate(PredefinedSplit(cv_folds).split()):
        start = time.time()
        X_train, y_train = X[train_index], y[train_index]
        X_test = X[test_index]

        # tune the hyperparameters on this training fold
        if param_grid is not None:
            np.random.seed(random_state)
            tune_folds = np.random.randint(num_tunefolds, size=X_train.shape[0])
            model = GridSearchCV(clone(estimator),
                                 cv=PredefinedSplit(tune_folds),
                                 param_grid=param_grid)
            model = clone(model).fit(X_train, y_train)
        else:
            model = clone(estimator).fit(X_train, y_train)

        # make predictions on this test set
        y_score = model.predict_proba(X_test)[:, 1]
        np.put(p, test_index, y_score)

        if logger:
            logger.info('[CV] fold {}: {:.3f}s'.format(i, time.time() - start))

    assert len(p) == len(y)
    return p

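# Standalone sketch of the random-fold construction used above: assigning
# each sample a random integer in [0, num_cvfolds) and wrapping it in
# PredefinedSplit yields folds of roughly (but, unlike KFold, not exactly)
# equal size.
import numpy as np
from sklearn.model_selection import PredefinedSplit

rng_folds = np.random.RandomState(0).randint(5, size=20)
ps = PredefinedSplit(rng_folds)
print(ps.get_n_splits())  # 5, provided every fold id was drawn at least once
for _, test_idx in ps.split():
    print(len(test_idx))  # fold sizes vary around 20 / 5
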
def __fun_param_set(self):
    """Set the parameters used to train the DNN based on the input parameters."""
    # Set the number of neurons in the hidden layers;
    # layer 2 has half the neurons of layer 1
    neuron_num_1st_layer = [self.input_dim, int(1.5 * self.input_dim),
                            2 * self.input_dim]
    neuron_num_2nd_layer = [int(x / 2) for x in neuron_num_1st_layer]
    self.neurons = list(zip(neuron_num_1st_layer, neuron_num_2nd_layer))
    self.neurons = [list(x) for x in self.neurons]

    # By default we use Adam and do not tune the learning rate
    self.optimizer = Adam()

    # Activation function for the hidden layers
    self.activation_hidden = 'relu'

    # Activation/loss function for the output layer, based on output dimensionality
    if self.output_dim > 1:
        self.activation_output = 'sigmoid'  # multi-class multi-label classification
        self.loss_fun = 'binary_crossentropy'
    else:
        self.activation_output = 'softmax'  # binary classification
        self.loss_fun = 'categorical_crossentropy'

    # Batch size
    if self.batch_size_flag:
        self.batch_size = [16, 32]  # tune batch size
    else:
        self.batch_size = [32]  # fixed batch size

    if self.dropout_flag:
        self.dropout_rate = [0.2, 0.4]
    else:
        self.dropout_rate = [0.2]

    # Split training data into training and validation (fast version of model training)
    if self.cv == 1:
        t_size = int(self.x_train.shape[0] * 0.8)
        self.train_val_split = [-1] * t_size + [0] * (self.x_train.shape[0] - t_size)
        seed(self.rand_seed)
        shuffle(self.train_val_split)
        self.ps = PredefinedSplit(self.train_val_split)
    else:
        self.ps = self.cv

def rolling_model_PLS(X_traindata=X_traindata,
                      Y_traindata_demean=np.ravel(Y_traindata_demean),
                      X_traindata1=X_traindata1,
                      Y_traindata1=np.ravel(Y_traindata1),
                      X_testdata=X_testdata,
                      Y_testdata=np.ravel(Y_testdata),
                      mean_Ytrain=mean_Ytrain):
    # Mark the samples after the first split_num as the single validation fold
    split_num = 200 * 60
    num_valid_size = split_num
    test_fold = -1 * np.ones(len(X_traindata))
    test_fold[num_valid_size:] = 0
    ps = PredefinedSplit(test_fold)

    # specify parameters and distributions to sample from
    param_dist = {
        'n_components': sp_randint(1, 31),
        'max_iter': sp_randint(50, len(X_traindata)),
        'tol': [0.0001, 0.00001, 0.000001, 0.0000001]
    }
    PLS_model = PLSRegression(scale=False)

    # run the randomized search
    n_iter_search = 50
    estim = RandomizedSearchCV(PLS_model, param_distributions=param_dist,
                               scoring='r2', cv=ps.split(), iid=False,
                               n_jobs=-1, n_iter=n_iter_search)
    estim.fit(X_traindata, Y_traindata_demean)
    best_estimator = estim.best_estimator_

    # In-sample R^2 (predictions are re-centred with the training mean)
    train_predict = best_estimator.predict(X_traindata1) + mean_Ytrain
    IS_score = r2_score(Y_traindata1, train_predict)

    # Out-of-sample R^2 relative to the training-mean benchmark
    test_predict = best_estimator.predict(X_testdata) + mean_Ytrain
    test_predict = test_predict[:, 0]
    OOS_score = 1 - np.sum((Y_testdata - test_predict)**2) / np.sum(
        (Y_testdata - mean_Ytrain)**2)
    return IS_score, OOS_score

def train(self, data, clf='rf', param_search='single', tune_size=0.15,
          scoring='roc_auc', n_jobs=1, verbose=1):
    """Trains a classifier with the specified training data.
    data: tuple including training data.
    clf: string of {'rf', 'lr', 'xgb'}.
    Returns trained classifier."""
    x_train, y_train, _, features = data

    if param_search == 'single' or tune_size == 0:
        model, params = self.classifier(clf, param_search='single')
        model.set_params(**params)
    elif tune_size > 0:
        t1 = self.out('tuning...')
        model, params = self.classifier(clf, param_search=param_search)

        # hold out the last tune_size fraction of the training data as the
        # single validation fold of a PredefinedSplit
        train_len = x_train.shape[0]
        split_ndx = train_len - int(train_len * tune_size)
        sm_x_train, x_val = x_train[:split_ndx], x_train[split_ndx:]
        sm_train_fold = np.full(sm_x_train.shape[0], -1)
        val_fold = np.full(x_val.shape[0], 0)
        predefined_fold = np.append(sm_train_fold, val_fold)
        ps = PredefinedSplit(predefined_fold)
        cv = ps.split(x_train, y_train)

        m = GridSearchCV(model, params, scoring=scoring, cv=cv,
                         verbose=verbose, n_jobs=n_jobs)
        m.fit(x_train, y_train)
        model = m.best_estimator_
        self.time(t1)

    t1 = self.out('training...')
    if clf == 'lgb':
        cat_feat = ['app', 'device', 'os', 'channel', 'hour']
        cat_feat_ndx = [features.index(x) for x in cat_feat]
        train_len = x_train.shape[0]
        split_ndx = train_len - int(train_len * tune_size)
        sm_x_train, x_val = x_train[:split_ndx], x_train[split_ndx:]
        sm_y_train, y_val = y_train[:split_ndx], y_train[split_ndx:]
        eval_set = (x_val, y_val)
        model = model.fit(sm_x_train, sm_y_train, eval_set=eval_set,
                          early_stopping_rounds=50, eval_metric='auc',
                          categorical_feature=cat_feat_ndx)
    else:
        model = model.fit(x_train, y_train)
    self.time(t1)
    self.out(str(model))
    return model

def predefined_train_test_split(data, labels, folds, workflow, label_encoder):
    folds = np.asarray(folds)
    fold_encoder = LabelEncoder()
    split_encoded = fold_encoder.fit_transform(folds)
    num_classes = len(label_encoder.classes_)
    performance = {
        'classes': label_encoder.classes_.tolist(),
        'intervals': {key: np.sum(folds == key) for key in sorted(set(folds))}
    }
    split = PredefinedSplit(split_encoded)
    for fold_index, (train_inds, test_inds) in enumerate(split.split()):
        train_x = [data[ii] for ii in train_inds]
        train_y = [labels[ii] for ii in train_inds]
        test_x = [data[ii] for ii in test_inds]
        test_y = [labels[ii] for ii in test_inds]

        # class priors for the train and test partitions
        prior_train = [0] * num_classes
        for yy in train_y:
            prior_train[yy] += 1
        prior_test = [0] * num_classes
        for yy in test_y:
            prior_test[yy] += 1

        clf = deepcopy(workflow)
        clf.fit(train_x, train_y)
        param_dict = {kk: vv.__dict__ for kk, vv in clf.named_steps.items()}
        test_pred = clf.predict(test_x)
        test_ind = folds[test_inds[0]]
        performance[test_ind] = {
            'accuracy': metrics.accuracy_score(test_y, test_pred),
            'precision_micro': metrics.precision_score(test_y, test_pred, average='micro'),
            'precision_macro': metrics.precision_score(test_y, test_pred, average='macro'),
            'recall_micro': metrics.recall_score(test_y, test_pred, average='micro'),
            'recall_macro': metrics.recall_score(test_y, test_pred, average='macro'),
            'f1_score_micro': metrics.f1_score(test_y, test_pred, average='micro'),
            'f1_score_macro': metrics.f1_score(test_y, test_pred, average='macro'),
            'confusion_matrix': metrics.confusion_matrix(test_y, test_pred).tolist(),
            'prior_train': prior_train,
            'prior_test': prior_test,
            'model': serialise_dict(param_dict)
        }
    return serialise_dict(performance)

def __init__(self, train, dev, config, cross_val=True):
    self.baseline_config = config
    self.split = 5
    if not cross_val:
        # Train samples get -1 (never tested); dev samples form fold 0
        train_samples = [-1 for i in range(len(train[0]))]
        dev_samples = [0 for i in range(len(dev[0]))]
        self.split = PredefinedSplit(test_fold=np.concatenate((train_samples, dev_samples)))
    self.train_x = np.concatenate((train[0], dev[0]))
    self.train_y = np.concatenate((train[1], dev[1]))

    print('Finding optimal CNN model configuration with:')
    print(f"Number of classes: {self.baseline_config['data']['num_classes']}")
    print(f"Static: {self.baseline_config['CNN']['static']}")
    print(f"Dataset path: {self.baseline_config['data']['output']}\n\n")

    self.activation_function = self.baseline_config['CNN']['activation_function']
    self.filter_sizes = self.baseline_config['CNN']['filter_sizes']
    self.output_filters_per_size = self.baseline_config['CNN']['output_filters_per_size']
    self.dropout_rate = self.baseline_config['CNN']['dropout_rate']
    self.batch_size = None
    self.epochs = None

    self.batch_size, self.epochs = self.best_batch_size_and_epochs()
    individual_filter_size = self.best_individual_filter_size()
    self.filter_sizes = self.best_filter_size_combination(individual_filter_size)
    self.activation_function = self.best_activation_function()
    self.output_filters_per_size, self.dropout_rate = self.best_num_feature_maps_and_dropout()

    print('Optimal configuration:')
    self.print_configuration()

def train_model(dataset, classifier, params):
    if params is not None:
        # Train rows get -1 (never tested); validation rows form fold 0
        split = PredefinedSplit(
            test_fold=[-1 for i in range(dataset['train'][0].shape[0])] +
                      [0 for i in range(dataset['valid'][0].shape[0])])
        classifier = GridSearchCV(classifier, params, cv=split, refit=True)
        merged_input = sparse.vstack([dataset['train'][0], dataset['valid'][0]])
        merged_output = np.concatenate((dataset['train'][1], dataset['valid'][1]))
        classifier.fit(merged_input, merged_output)
    else:
        classifier.fit(dataset['train'][0], dataset['train'][1])

    prediction_train = f1_score(dataset['train'][1],
                                classifier.predict(dataset['train'][0]),
                                average=AVERAGE)
    prediction_valid = f1_score(dataset['valid'][1],
                                classifier.predict(dataset['valid'][0]),
                                average=AVERAGE)
    prediction_test = f1_score(dataset['test'][1],
                               classifier.predict(dataset['test'][0]),
                               average=AVERAGE)
    best_param = None if params is None else classifier.best_params_
    return prediction_train, prediction_valid, prediction_test, best_param

def find_best_params(self, validation_data_x, validation_data_y, n_jobs=1, params=[]):
    if not params:
        params = self.get_default_param_grid()
    merged_x = Data.merge_arrays(self.training_data_x, validation_data_x)
    merged_y = Data.merge_arrays(self.training_data_y, validation_data_y)
    # -1 keeps the training samples out of every test fold;
    # the validation samples form the single test fold (0)
    test_fold = []
    for i in range(0, len(self.training_data_y)):
        test_fold.append(-1)
    for i in range(0, len(validation_data_y)):
        test_fold.append(0)
    cv = PredefinedSplit(test_fold)
    gs = GridSearchCV(
        estimator=GaussianNB(),
        scoring='f1_micro',
        param_grid=params,
        n_jobs=n_jobs,
        cv=cv
    )
    gs.fit(merged_x, merged_y)
    best_params = gs.best_params_
    results = gs.cv_results_
    return best_params, results

def neighbors(train, test, target, cv: PredefinedSplit, k=5, n_trees=10):
    res_train = np.zeros((train.shape[0], 2))
    res_test = np.zeros((test.shape[0], 2))
    for i, (trn_idx, val_idx) in tqdm(enumerate(cv.split(train)),
                                      total=cv.get_n_splits()):
        target_trn = target.iloc[trn_idx]
        X_trn = train.iloc[trn_idx]
        X_val = train.iloc[val_idx]
        # split the training fold by class
        n = X_trn[target_trn == 0]
        p = X_trn[target_trn == 1]
        for j, X in enumerate([n, p]):
            u = build(X, n_trees)
            res_train[val_idx, j] = get_feat(X_val, u, k=k)
            res_test[:, j] += get_feat(test, u, k)
    # average the test features over folds
    res_test /= cv.get_n_splits()
    return res_train, res_test

def fit(self, X_train, y_train, X_val, y_val):
    if X_train.ndim != 2:
        raise ValueError('`X_train` is incompatible: expected ndim=2, found ndim=' + str(X_train.ndim))
    elif X_val.ndim != 2:
        raise ValueError('`X_val` is incompatible: expected ndim=2, found ndim=' + str(X_val.ndim))

    print('Dimension of training set is: {} and label is: {}'.format(X_train.shape, y_train.shape))
    print('Dimension of validation set is: {} and label is: {}'.format(X_val.shape, y_val.shape))

    X_all = np.concatenate((X_train, X_val), axis=0)
    y_all = np.concatenate((y_train, y_val), axis=0)

    # Create a list where train data indices are -1 and validation data indices are 0
    tr_index = np.full(X_train.shape[0], -1)
    val_index = np.full(X_val.shape[0], 0)
    split_index = np.concatenate((tr_index, val_index), axis=0).tolist()

    # Use the list to create a PredefinedSplit
    pds = PredefinedSplit(test_fold=split_index)
    clf = GridSearchCV(estimator=SVC(), param_grid=self.tuned_parameters,
                       cv=pds, scoring='accuracy')
    start = time.time()
    clf.fit(X_all, y_all)
    end = time.time()

    # Classify with the optimal parameter set
    Optimal_params = clf.best_params_
    print(Optimal_params)
    classifier = SVC(**Optimal_params)
    classifier.fit(X_train, y_train)
    dump(classifier, self.model_path)
    write_log(filepath=self.time_log, data=['time_log'], mode='w')
    write_log(filepath=self.time_log, data=[end - start], mode='a')

def train_predict(clf_key, classifier, param_grid, trainX, trainY, valX, valY,
                  testX, testY):
    all_tr_val_X = np.vstack((trainX, valX))
    all_tr_val_Y = np.hstack((trainY, valY))
    # training rows get -1 (never tested); validation rows keep 0 (fold 0)
    fold_meta = np.zeros(all_tr_val_X.shape[0])
    fold_meta[0:trainX.shape[0]] = -1
    cv = PredefinedSplit(test_fold=fold_meta)
    gcv = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=cv,
                       verbose=0, n_jobs=2, scoring='accuracy')
    gcv.fit(all_tr_val_X, all_tr_val_Y)
    predictions = gcv.predict(testX)

    cm = confusion_matrix(testY, predictions)
    classes_lst = ['Corn', 'Cotton', 'Soy', 'Spring Wheat', 'Winter Wheat',
                   'Barley']
    creport = classification_report(y_true=testY, y_pred=predictions,
                                    target_names=classes_lst, digits=4,
                                    output_dict=True)
    creport_df = pd.DataFrame(creport).transpose()
    acc = accuracy_score(testY, predictions)
    print(creport)
    kappa_score = cohen_kappa_score(testY, predictions)
    print('Classifier : {}'.format(clf_key))
    print('best params: {}'.format(gcv.best_params_))
    print('Accuracy is {}\n Kappa Score is {}\n confusion matrix is {}\n clf report is {}'
          .format(acc, kappa_score, cm, creport))

def find_best_params(self, validation_data_x, validation_data_y, alpha_vals, n_jobs=1):
    merged_x = Data.merge_arrays(self.training_data_x, validation_data_x)
    merged_y = Data.merge_arrays(self.training_data_y, validation_data_y)
    # -1 keeps the training samples out of every test fold;
    # the validation samples form the single test fold (0)
    test_fold = []
    for i in range(0, len(self.training_data_y)):
        test_fold.append(-1)
    for i in range(0, len(validation_data_y)):
        test_fold.append(0)
    cv = PredefinedSplit(test_fold)
    param = {"alpha": alpha_vals}
    gs = GridSearchCV(
        estimator=BernoulliNB(),
        scoring='f1_micro',
        param_grid=param,
        n_jobs=n_jobs,
        cv=cv
    )
    gs.fit(merged_x, merged_y)
    best_params = gs.best_params_
    results = gs.cv_results_
    return best_params, results

def gridSearch_cv(train_x, train_y, test_x, test_y, param_grid, folds, scoring,
                  refit, shuf, dev_ratio=0):
    cv_folds = KFold(n_splits=folds, shuffle=shuf)
    if dev_ratio > 0:
        # hold out a random dev_ratio fraction of the training data as fold 0
        pre_fold = np.ones(train_x.shape[0]) * -1
        inds = np.random.choice(pre_fold.size,
                                size=math.floor(dev_ratio * train_x.shape[0]))
        pre_fold[inds] = 0
        cv_folds = PredefinedSplit(test_fold=pre_fold)
    svm = SVC()
    clf = GridSearchCV(estimator=svm, param_grid=param_grid, cv=cv_folds,
                       scoring=scoring, refit=refit)
    clf.fit(train_x, train_y)

    logging.info("Best parameters in grid search:")
    logging.info("")
    logging.info(clf.best_params_)
    logging.info("Test scores:")
    score = clf.score(test_x, test_y)
    logging.info("test " + refit + " score: %0.3f ", score)
    pred_y = clf.predict(test_x)
    logging.info(classification_report(test_y, pred_y))

def tuning(X_train, Y_train, X_val, Y_val, classifier, params):
    '''
    Tunes hyperparameters by running the classifier trained using training
    data on the range of parameters given, and returns the parameters which
    give the best f1-score on the validation data
    '''
    # Combine training and validation into one set
    X = vstack([X_train, X_val])
    Y_train.extend(Y_val)
    Y = np.array(Y_train)

    # Mark the training-validation splits
    train_i = np.ones((X_train.shape[0],), dtype=int) * -1
    valid_i = np.zeros((X_val.shape[0],), dtype=int)
    split_fold = np.concatenate((train_i, valid_i))
    ps = PredefinedSplit(split_fold)

    param_search = GridSearchCV(
        classifier, params,
        scoring=metrics.make_scorer(metrics.f1_score, average='macro'),
        cv=ps, return_train_score=True)
    param_search.fit(X, Y)
    results = param_search.cv_results_
    best_params = param_search.best_params_

    # Plotting
    # test_scores = results.get('split0_test_score')
    # par_ranges = params.values()
    # plt.plot(par_ranges[0], test_scores, 'r-')
    # plt.show()
    return best_params, results

def random_search(model, model_type, df, task_type='regression', refit='r2',
                  verbose=False):
    years_present = len(df.index.get_level_values('year').unique())
    starting_year = STARTING_YEAR
    last_year = starting_year + years_present - 1
    train_year = starting_year + round(years_present * .7)
    validation_year = starting_year + round((train_year - starting_year) * .8)
    test_year = train_year + 1

    X = df[(starting_year <= df.index.get_level_values('year')) &
           (df.index.get_level_values('year') <= train_year)]
    # Years up to validation_year are training (-1); later years form fold 0
    mask = ((starting_year <= X.index.get_level_values('year')) &
            (X.index.get_level_values('year') <= validation_year))
    validation_fold = [-1 if m else 0 for m in mask]
    ps = PredefinedSplit(validation_fold)

    random_search = RandomizedSearchCV(
        model,
        param_distributions=get_param_grid(model_type=model_type, task_type=task_type),
        cv=ps, n_iter=10, n_jobs=-1, verbose=10, refit=True,
        scoring=refit, random_state=SEED)

    if verbose:
        print('Fitting with Random Search...')
    X_train = X.drop('crime_count', axis=1)
    y_train = X['crime_count']
    if task_type == 'classification':
        y_train = y_train.apply(lambda x: 1 if x > 0.5 else 0).round(0).astype(int)
    random_search.fit(X_train, y_train)
    if verbose:
        print('Done fitting...')
    return random_search.best_estimator_

def main(data_dir, log_dir, source='xl-1542M-k40', n_train=500000,
         n_valid=10000, n_jobs=None, verbose=False):
    train_texts, train_labels = load_split(data_dir, source, 'train', n=n_train)
    valid_texts, valid_labels = load_split(data_dir, source, 'valid', n=n_valid)
    test_texts, test_labels = load_split(data_dir, source, 'test')

    vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=2**21)
    train_features = vect.fit_transform(train_texts)
    valid_features = vect.transform(valid_texts)
    test_features = vect.transform(test_texts)

    model = LogisticRegression(solver='liblinear')
    params = {'C': [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4, 8, 16, 32, 64]}
    # training rows get -1 (never tested); validation rows form fold 0
    split = PredefinedSplit([-1] * n_train + [0] * n_valid)
    search = GridSearchCV(model, params, cv=split, n_jobs=n_jobs,
                          verbose=verbose, refit=False)
    search.fit(sparse.vstack([train_features, valid_features]),
               train_labels + valid_labels)
    model = model.set_params(**search.best_params_)
    model.fit(train_features, train_labels)

    valid_accuracy = model.score(valid_features, valid_labels) * 100.
    test_accuracy = model.score(test_features, test_labels) * 100.
    data = {
        'source': source,
        'n_train': n_train,
        'valid_accuracy': valid_accuracy,
        'test_accuracy': test_accuracy
    }
    print(data)
    json.dump(data, open(os.path.join(log_dir, f'{source}.json'), 'w'))

def _get_scores_and_estimators(experiment: Experiment) -> Tuple[List[float], List[Any]]:
    if experiment.test_set is not None:
        assert experiment.cross_validator is None, \
            "Cannot use a cross validator with train test split"
        dataset = pd.concat([experiment.dataset, experiment.test_set])
        # dataset rows get -1 (train only); the test set forms the single test fold
        split = np.array([-1] * len(experiment.dataset) + [1] * len(experiment.test_set))
        cross_validator = PredefinedSplit(split)
    else:
        dataset = experiment.dataset
        cross_validator = experiment.cross_validator

    X = dataset.drop(columns=[experiment.label_column])
    y = dataset[experiment.label_column]
    if experiment.group_column is None:
        if experiment.average_scores_on_instances:
            groups = Series(range(len(X)), index=X.index)
        else:
            groups = None
    else:
        groups = X[experiment.group_column]
        X = X.drop(columns=[experiment.group_column])

    cv = check_cv(cross_validator, y, classifier=is_classifier(experiment.predictor))
    train_test = cv.split(X, y, groups)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=None, verbose=False, pre_dispatch='2*n_jobs')
    scores_and_estimators = parallel(
        delayed(_fit_and_predict)(
            clone(experiment.predictor), X, y, train, test, groups,
            experiment.scorer)
        for train, test in train_test)
    scores_lists, estimators = zip(*scores_and_estimators)
    scores = [score for score_list in scores_lists for score in score_list]
    return scores, estimators

def go(data_dict, feats_to_use,
       params={"seed": 0, "silent": False, "n_jobs": -1},
       parameter_tuning=False):
    '''
    if with_gpu:
        xgb = XGBRegressor(seed=0, silent=False, tree_method='gpu_hist', n_gpus=-1)
    else:
        xgb = XGBRegressor(seed=0, silent=False, n_jobs=-1)
    '''
    X_train = data_dict['X_train'][feats_to_use].copy()
    y_train = data_dict['y_train'].copy()
    X_test = data_dict['X_test'][feats_to_use].copy()
    X_val = data_dict['X_val'][feats_to_use].copy()
    y_val = data_dict['y_val'].copy()

    if parameter_tuning:
        fit_params = {
            "early_stopping_rounds": 10,
            "eval_metric": "rmse",
            "eval_set": [(X_val, y_val)]}
        xgb = XGBRegressor()
        train_val_features = pd.concat([X_train, X_val])
        train_val_labels = pd.concat([y_train, y_val])
        test_fold = np.zeros(train_val_features.shape[0])  # initialize all indices to 0 (validation fold)
        test_fold[:X_train.shape[0]] = -1  # training-set indices get -1, so they are never used for validation
        ps = PredefinedSplit(test_fold=test_fold)
        grid = GridSearchCV(xgb, params, scoring=RMSE, cv=ps, verbose=32,
                            n_jobs=-1)
        start = time.time()
        grid.fit(train_val_features, train_val_labels, **fit_params)
        elapsed = time.time() - start
        print(elapsed)
        print('best params:', grid.best_params_)
        print('best score:', grid.best_score_)
        return grid.best_params_, grid.best_estimator_
    else:
        xgb = XGBRegressor(**params)
        print(xgb)
        print('start xgboost training')
        start = time.time()
        eval_set = [(X_val, y_val)]
        xgb.fit(X_train, y_train, eval_set=eval_set, eval_metric='rmse',
                early_stopping_rounds=30)
        elapsed = time.time() - start
        print(elapsed)
        # invert the log1p target transform and generate the submission
        data_dict['y_pred'] = np.exp(xgb.predict(X_test)) - 1
        data_dict['X_test']['item_cnt_month'] = data_dict['y_pred']
        test = pd.read_csv('test.csv')
        submission = pd.merge(test, data_dict['X_test'],
                              on=['shop_id', 'item_id'],
                              how='left')[['ID', 'item_cnt_month']]
        return submission, xgb

def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by KFold.
    folds = -1 * np.ones(10)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps_train = []
    ps_test = []
    ps = PredefinedSplit(folds)
    # n_splits is simply the number of unique folds
    assert_equal(len(np.unique(folds)), ps.get_n_splits())
    for train_ind, test_ind in ps.split():
        ps_train.append(train_ind)
        ps_test.append(test_ind)
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)

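# A standalone sketch of the PredefinedSplit semantics the test above relies
# on: each non-negative entry of test_fold names the fold in which that
# sample is tested, and -1 keeps a sample in training for every split.
import numpy as np
from sklearn.model_selection import PredefinedSplit

test_fold = np.array([0, 0, 1, 1, -1])  # sample 4 is never a test sample
ps = PredefinedSplit(test_fold)
assert ps.get_n_splits() == 2  # one split per unique non-negative fold id
for train_idx, test_idx in ps.split():
    print("TRAIN:", train_idx, "TEST:", test_idx)
# TRAIN: [2 3 4] TEST: [0 1]
# TRAIN: [0 1 4] TEST: [2 3]
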
def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape (n_samples,)
        The target variable for supervised learning problems.
    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset
        into train/test set.

    Yields
    ------
    train : ndarray
        The training set indices for that split.
    test : ndarray
        The testing set indices for that split.
    """
    X, y, groups = indexable(X, y, groups)
    n_samples = X.shape[0]
    if self.n_splits > n_samples:
        raise ValueError(
            ("Cannot have number of splits n_splits={0} greater"
             " than the number of samples: n_samples={1}."
             ).format(self.n_splits, n_samples))

    # generate the test fold by dealing samples into folds round-robin
    test_fold = np.arange(n_samples, dtype=int) % self.n_splits
    cv = PredefinedSplit(test_fold)
    return cv.split()

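# Equivalent construction of the round-robin test_fold used by split() above,
# shown standalone: sample k lands in test fold k % n_splits.
import numpy as np
from sklearn.model_selection import PredefinedSplit

n_samples, n_splits = 7, 3
test_fold = np.arange(n_samples) % n_splits  # [0 1 2 0 1 2 0]
for train_idx, test_idx in PredefinedSplit(test_fold).split():
    print("TEST:", test_idx)  # [0 3 6], then [1 4], then [2 5]
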