def KFold_method(self):
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(self.FeatureSet):
        X_train = []
        X_test = []
        y_train = []
        y_test = []
        for trainid in train_index.tolist():
            X_train.append(self.FeatureSet[trainid])
            y_train.append(self.Label[trainid])
        for testid in test_index.tolist():
            X_test.append(self.FeatureSet[testid])
            y_test.append(self.Label[testid])
        #clf = tree.DecisionTreeClassifier()
        #clf = clf.fit(X_train, y_train)
        #pre_labels = clf.predict(X_test)
        clf = AdaBoostClassifier(n_estimators=100)
        clf = clf.fit(X_train, y_train)
        pre_labels = clf.predict(X_test)
        # Model evaluation
        ACC = metrics.accuracy_score(y_test, pre_labels)
        MCC = metrics.matthews_corrcoef(y_test, pre_labels)
        SN = self.performance(y_test, pre_labels)
        print(ACC, SN)
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train) >= far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0

        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
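# `calculate_val_far` is referenced above but not defined in this snippet. A
# minimal sketch consistent with how it is called here (facenet-style: a pair
# is accepted when its embedding distance falls below the threshold):
import numpy as np

def calculate_val_far(threshold, dist, actual_issame):
    predict_issame = np.less(dist, threshold)
    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
    false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    n_same = np.sum(actual_issame)                   # number of genuine pairs
    n_diff = np.sum(np.logical_not(actual_issame))   # number of impostor pairs
    val = float(true_accept) / float(n_same)         # validation rate (true accept rate)
    far = float(false_accept) / float(n_diff)        # false accept rate
    return val, far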
def test_cross_val_predict_with_method():
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=0)
    classes = len(set(y))
    # In sklearn.model_selection, KFold's first argument is the number of
    # splits, not the number of samples (that was the old cross_validation API).
    kfold = KFold(n_splits=3)

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        predictions = cross_val_predict(est, X, y, method=method)
        assert_equal(len(predictions), len(y))

        expected_predictions = np.zeros([len(y), classes])
        func = getattr(est, method)

        # Naive loop (should be same as cross_val_predict):
        for train, test in kfold.split(X, y):
            est.fit(X[train], y[train])
            expected_predictions[test] = func(X[test])

        predictions = cross_val_predict(est, X, y, method=method, cv=kfold)
        assert_array_almost_equal(expected_predictions, predictions)
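# A standalone sketch (not part of the test suite above) of the behavior the
# test checks, assuming a recent scikit-learn; shuffling keeps all three iris
# classes in every training fold:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_predict

X, y = load_iris(return_X_y=True)
cv = KFold(n_splits=3, shuffle=True, random_state=0)
proba = cross_val_predict(LogisticRegression(max_iter=1000), X, y,
                          cv=cv, method='predict_proba')
print(proba.shape)  # (150, 3): one row of class probabilities per sample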
def cross_validate(self, values_labels, folds=10, processes=1):
    """
    Trains and tests the model against folds of labeled data.

    :Parameters:
        values_labels : [( `<feature_values>`, `<label>` )]
            an iterable of labeled data, where `<feature_values>` is an
            ordered collection of predictive values that correspond to
            the `Feature` s provided to the constructor
        folds : `int`
            the number of folds to split the data into
        processes : `int`
            When set to 1, cross-validation will run in the parent
            thread.  When set to 2 or greater, a
            :class:`multiprocessing.Pool` will be created.
    """
    folds_i = KFold(n_splits=folds, shuffle=True, random_state=0)
    if processes == 1:
        mapper = map
    else:
        pool = Pool(processes=processes or cpu_count())
        mapper = pool.map
    results = mapper(self._cross_score,
                     ((i, [values_labels[i] for i in train_i],
                       [values_labels[i] for i in test_i])
                      for i, (train_i, test_i) in enumerate(
                          folds_i.split(values_labels))))
    agg_score_labels = []
    for score_labels in results:
        agg_score_labels.extend(score_labels)

    self.info['statistics'].fit(agg_score_labels)

    return self.info['statistics']
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy(
                threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(
            thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])

    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy
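# `calculate_accuracy`, called above, is likewise not defined in this snippet.
# A minimal sketch matching its call signature (returns tpr, fpr, accuracy for
# one distance threshold):
import numpy as np

def calculate_accuracy(threshold, dist, actual_issame):
    predict_issame = np.less(dist, threshold)
    tp = np.sum(np.logical_and(predict_issame, actual_issame))
    fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame)))
    fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))
    tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn)
    fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn)
    acc = float(tp + tn) / dist.size
    return tpr, fpr, acc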
def CV_mean(X_slct, y, test_slct, model_name='RandomForest', model_obj=sk_ens.RandomForestRegressor,
            model_params=rf_params, eval_func=r2_score, nFolds=5, gen_rand_func=gen_rand):
    k_fold = KFold(n_splits=nFolds, shuffle=True, random_state=gen_rand_func())
    cv_scores = []
    model_li = []
    preds = []
    for train_index, test_index in k_fold.split(X_slct, y):
        X_train, X_test = X_slct[train_index, :], X_slct[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        if 'random_state' in model_params:
            model_params['random_state'] = gen_rand_func()
        elif 'seed' in model_params:
            model_params['seed'] = gen_rand_func()
        model = model_obj(**model_params)
        model.fit(X_train, y_train)
        scr = eval_func(y_test, model.predict(X_test))
        print('Score of ' + model_name + ':', scr)
        model_li.append(model)
        cv_scores.append(scr)
        pred = model.predict(test_slct)
        preds.append(pred)
    plt.plot(cv_scores)
    plt.show()
    winner_pred = preds[cv_scores.index(max(cv_scores))]
    print('CV_mean ' + model_name + ':', np.mean(cv_scores))
    return np.mean(cv_scores), winner_pred
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10,
                  distance_metric=0, subtract_mean=False):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
        else:
            mean = 0.0
        dist = distance(embeddings1 - mean, embeddings2 - mean, distance_metric)

        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train) >= far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0

        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
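# The `distance` helper called above (and in the next function) is not shown
# in this snippet. A sketch consistent with the facenet convention
# (0 = squared Euclidean distance, 1 = angular distance from cosine similarity):
import numpy as np

def distance(embeddings1, embeddings2, distance_metric=0):
    if distance_metric == 0:
        diff = np.subtract(embeddings1, embeddings2)
        dist = np.sum(np.square(diff), 1)
    elif distance_metric == 1:
        dot = np.sum(np.multiply(embeddings1, embeddings2), axis=1)
        norm = np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
        similarity = dot / norm
        dist = np.arccos(similarity) / np.pi
    else:
        raise ValueError('Undefined distance metric %d' % distance_metric)
    return dist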
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10,
                  distance_metric=0, subtract_mean=False):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))

    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
        else:
            mean = 0.0
        dist = distance(embeddings1 - mean, embeddings2 - mean, distance_metric)

        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy(
                threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(
            thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])

    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy
def predict_model_kfold(name, path, features_type, label_name, data):
    kfold = KFold(n_splits=10, shuffle=True)
    # RandomForest -I 1000 -K 0 -S 1 -num-slots 1
    model = BalancedRandomForestClassifier(n_estimators=1000, max_depth=5)
    index = 0
    size = data.shape[0]
    all_predictions = 0
    x = data.drop('hasBug', axis=1)
    y = data['hasBug']
    num_of_bugs = data.loc[data['hasBug'] == 1].shape[0]
    num_of_all_instances = data.shape[0]
    bug_percent = float(num_of_bugs) / float(num_of_all_instances)
    for train, test in kfold.split(data):
        index += 1
        prediction_train = model.fit(x.iloc[train], y.iloc[train]).predict(x.iloc[test])
        all_predictions += create_all_eval_results(False, y.iloc[test], prediction_train, name, "training",
                                                   features_type, num_of_bugs, num_of_all_instances,
                                                   bug_percent, None)
    all_predictions /= index
    start_list = [name, "training", features_type, "sklearn - python"]
    result_list = start_list + all_predictions.tolist()
    global results_all_projects
    results_all_projects.loc[len(results_all_projects)] = result_list
    model.fit(x, y)
    return model
def KFold_method(self):
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(self.FeatureSet):
        X_train = []
        X_test = []
        y_train = []
        y_test = []
        for trainid in train_index.tolist():
            X_train.append(self.FeatureSet[trainid])
            y_train.append(self.Label[trainid])
        for testid in test_index.tolist():
            X_test.append(self.FeatureSet[testid])
            y_test.append(self.Label[testid])
        tree = self.buildtree(X_train)
        #self.post_pruning(tree, 0.3)
        pre_labels = self.predict(X_test, tree)
        # Model evaluation
        ACC = metrics.accuracy_score(y_test, pre_labels)
        # MCC = metrics.matthews_corrcoef(y_test, pre_labels)
        SN = self.performance(y_test, pre_labels)
        # print(SP, SN)
        print(ACC, SN)
def hyperopt_obj(self, param, train_X, train_y):
    # 3-fold cross-validation error
    #ret = xgb.cv(param, dtrain, num_boost_round=param['num_round'])
    kf = KFold(n_splits=3)
    errors = []
    r2 = []
    int_params = ['max_depth', 'num_round']
    for item in int_params:
        param[item] = int(param[item])
    for train_ind, test_ind in kf.split(train_X):
        train_valid_x, train_valid_y = train_X[train_ind], train_y[train_ind]
        test_valid_x, test_valid_y = train_X[test_ind], train_y[test_ind]
        dtrain = xgb.DMatrix(train_valid_x, label=train_valid_y)
        dtest = xgb.DMatrix(test_valid_x)
        pred_model = xgb.train(param, dtrain, num_boost_round=int(param['num_round']))
        pred_test = pred_model.predict(dtest)
        errors.append(mean_squared_error(test_valid_y, pred_test))
        r2.append(r2_score(test_valid_y, pred_test))
    all_dtrain = xgb.DMatrix(train_X, label=train_y)
    print('training score:')
    pred_model = xgb.train(param, all_dtrain, num_boost_round=int(param['num_round']))
    all_dtest = xgb.DMatrix(train_X)
    pred_train = pred_model.predict(all_dtest)
    print(str(r2_score(train_y, pred_train)))
    print(np.mean(r2))
    print('\n')
    return {'loss': np.mean(errors), 'status': STATUS_OK}
def computing_cv_accuracy_imprecise(in_path=None, ell_optimal=0.1, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    mean_u65, mean_u80 = 0, 0
    lqa = LinearDiscriminant(init_matlab=True)
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    for idx_train, idx_test in kf.split(y):
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lqa.learn(X_cv_train, y_cv_train, ell=ell_optimal)
        sum_u65, sum_u80 = 0, 0
        n_test, _ = X_cv_test.shape
        for i, test in enumerate(X_cv_test):
            print("--TESTING-----", i, ell_optimal)
            evaluate, _ = lqa.evaluate(test)
            print(evaluate, "-----", y_cv_test[i])
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    mean_u65 = mean_u65 / cv_n_fold
    mean_u80 = mean_u80 / cv_n_fold
    print("--ell-->", ell_optimal, "--->", mean_u65, mean_u80)
def test_regression_with_custom_objective():
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    def objective_ls(y_true, y_pred):
        grad = (y_pred - y_true)
        hess = np.ones(len(y_true))
        return grad, hess

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor(objective=objective_ls).fit(
            X[train_index], y[train_index]
        )
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        assert mean_squared_error(preds, labels) < 25

    # Test that the custom objective function is actually used
    class XGBCustomObjectiveException(Exception):
        pass

    def dummy_objective(y_true, y_pred):
        raise XGBCustomObjectiveException()

    xgb_model = xgb.XGBRegressor(objective=dummy_objective)
    np.testing.assert_raises(XGBCustomObjectiveException, xgb_model.fit, X, y)
def validateseq2(X_all, y, features, clf, score, v=False, esr=50, sk=5):
    temp_user = target_order[(target_order.o_day_series < 336) &
                             (target_order.o_day_series >= 274)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 336
    print('before delete: {}'.format(X_all.shape))
    X = temp_user.merge(X_all, on=['user_id', 'CreateGroup'], how='left')
    print('after delete: {}'.format(X.shape))
    temp_user = target_order[(target_order.o_day_series < 306) &
                             (target_order.o_day_series >= 215)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 306
    print('before delete: {}'.format(X_all.shape))
    X2 = temp_user.merge(X_all, on=['user_id', 'CreateGroup'], how='left')
    print('after delete: {}'.format(X2.shape))
    kf = KFold(n_splits=sk)
    print(len(features))
    X['Prob_x'] = 0
    for train_index, test_index in kf.split(X2):
        # .iloc replaces the removed pandas .ix indexer
        X_train, X_test = X2.iloc[train_index, :], X2.iloc[test_index, :]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X2.iloc[train_index, :].buy, X2.iloc[test_index, :].buy
        clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
                eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        X['Prob_x'] = X['Prob_x'] + clf.predict_proba(X[features])[:, 1] / sk
    Performance = []
    features.append('Prob_x')
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X.iloc[train_index, :].buy, X.iloc[test_index, :].buy
        clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
                eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        pred = clf.predict_proba(X_test)[:, 1]
        Performance.append(roc_auc_score(y_test, pred))
    print("Mean Score: {}".format(np.mean(Performance)))
    return np.mean(Performance), clf
def select(self):
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # Implement model selection using CV
    NB_SPLITS = 3
    mean_scores = []
    # shuffle=True is required for random_state to take effect in recent sklearn
    split_method = KFold(n_splits=NB_SPLITS, shuffle=True, random_state=self.random_state)
    n_components = range(self.min_n_components, self.max_n_components + 1)

    try:
        for n_component in n_components:
            model = self.base_model(n_component)
            kfold_scores = []
            for _, test_idx in split_method.split(self.sequences):
                test_X, test_length = combine_sequences(test_idx, self.sequences)
                kfold_scores.append(model.score(test_X, test_length))
            mean_scores.append(np.mean(kfold_scores))
    except Exception as e:
        pass

    if len(mean_scores) > 0:
        states = n_components[np.argmax(mean_scores)]
    else:
        states = self.n_constant
    return self.base_model(states)
def test_multiclass_classification():
    from sklearn.datasets import load_iris
    from sklearn.model_selection import KFold

    def check_pred(preds, labels, output_margin):
        if output_margin:
            err = sum(1 for i in range(len(preds))
                      if preds[i].argmax() != labels[i]) / float(len(preds))
        else:
            err = sum(1 for i in range(len(preds))
                      if preds[i] != labels[i]) / float(len(preds))
        assert err < 0.4

    iris = load_iris()
    y = iris['target']
    X = iris['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        # test other params in XGBClassifier().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]

        check_pred(preds, labels, output_margin=False)
        check_pred(preds2, labels, output_margin=True)
        check_pred(preds3, labels, output_margin=True)
        check_pred(preds4, labels, output_margin=False)
def original_data():
    for target in TARGETS:
        for algo_str in ALGORITHMS:
            algorithm = importlib.import_module('src.multi_class.' + algo_str)
            encoded_data = input_preproc.readFromDataset(
                INPUT_DIR + ORIGINAL_DATA_FILE,
                INPUT_COLS['original'],
                target
            )
            # Split into predictors and target
            X = np.array(encoded_data[encoded_data.columns.difference([target])])
            y = np.array(encoded_data[target])
            kf = KFold(n_splits=CROSS_VALIDATION_K, shuffle=True)
            f1s = []
            for train_index, test_index in kf.split(X):
                X_train, y_train = X[train_index], y[train_index]
                X_test, y_test = X[test_index], y[test_index]

                scaler = preprocessing.StandardScaler()
                X_train = pd.DataFrame(scaler.fit_transform(X_train))  # , columns=X_train.columns)
                X_test = scaler.transform(X_test)

                precision, recall, f1_score, accuracy = algorithm.runClassifier(X_train, X_test, y_train, y_test)
                f1s.append(f1_score)
            final_f1 = sum(f1s) / len(f1s)
            print("\n================================")
            print("%s, %s, F1 Score: %.6f" % (target, algo_str, final_f1))
            print("================================\n")
def _iter_test_masks(self, X, y=None, groups=None):
    # yields mask array for test splits
    n_samples = X.shape[0]

    # if groups is not specified, the entire data set is treated as one group
    if groups is None:
        groups = np.zeros(n_samples, dtype=int)

    # constants
    indices = np.arange(n_samples)
    test_fold = np.empty(n_samples, dtype=bool)
    rng = check_random_state(self.random_state)
    group_indices = np.unique(groups)
    iters = np.empty(group_indices.shape[0], dtype=object)

    # generate iterators (keyword arguments; positional shuffle/random_state
    # are no longer accepted by KFold)
    cv = KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=rng)
    for i, g in enumerate(group_indices):
        group_member = indices[groups == g]
        iters[i] = cv.split(group_member)

    # generate training and test splits
    for fold in range(self.n_splits):
        test_fold[:] = False
        for i, g in enumerate(group_indices):
            group_train_i, group_test_i = next(iters[i])
            test_fold[indices[groups == g][group_test_i]] = True
        yield test_fold
def kFolds(dataSet, k=10):
    """
    This is the k-fold method
    :param dataSet: of type DataFrame
    :param k: number of subsets to choose
    """
    df_mx = dataSet.values  # .as_matrix() was removed from pandas
    X = df_mx[:, 1:16]
    Y = df_mx[:, 0:1]
    lm = svm.SVC(gamma=0.001, C=100.)  # Support Vector Machine
    kf = KFold(n_splits=k)  # Define the split - into k folds
    i = 0
    accuracies = numpy.zeros(kf.get_n_splits(X))
    for train_index, test_index in kf.split(X):
        print("{}. TRAIN: {} TEST: {}".format(i + 1, train_index, test_index))
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        # train using X_train
        model = lm.fit(X_train, Y_train)
        # evaluate against X_test
        predictions = lm.predict(X_test)
        # save accuracy
        accuracies[i] = model.score(X_test, Y_test)
        i = i + 1
    # find mean accuracy over all rounds
    print("Average accuracy of K-Folds (k={}): {}%".format(k, numpy.mean(accuracies) * 100))
def model_train(self, X_train, y_train, ignore_neutral=False):
    if ignore_neutral:
        X_train = X_train[y_train != 0]
        y_train = y_train[y_train != 0]
    self.ignore_neutral = ignore_neutral

    model = LinearSVC()
    classifier = model.fit(X_train, y_train)
    # pred = classifier.predict(X_train)
    # accu = np.mean(pred == y_train)
    # print('The accuracy of training data is {}'.format(accu))
    # print(confusion_matrix(y_train, pred))

    # k-fold cross-validation
    kfold = KFold(n_splits=5)
    for i, (train_index, test_index) in enumerate(kfold.split(X_train)):
        X_split_train = X_train[train_index]
        y_split_train = y_train[train_index]
        X_split_valid = X_train[test_index]
        y_split_valid = y_train[test_index]
        classifier = model.fit(X_split_train, y_split_train)
        pred = classifier.predict(X_split_valid)
        accu = np.mean(pred == y_split_valid)
        print('Fold {} : the accuracy of validation data is {}'.format(i + 1, accu))

    return classifier
def test_boston_housing_regression():
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])

        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]

        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 350
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 350
def Get_KFolds(data, y_label, num_folds, scale):
    # Creates num_folds folds from the train/test set, each with separate training and test sets
    folds = []
    kf = KFold(n_splits=num_folds)
    for train_index, test_index in kf.split(data):
        training = []
        test = []
        tempdf = Normalize_Scale(data, scale)
        train_x = tempdf.drop([y_label], axis=1).values
        train_y = tempdf[y_label].values

        # Creates a training set within the fold
        x = []
        y = []
        for index in train_index:
            x.append(train_x[index])
            y.append(train_y[index])
        training = [x, y]

        # Creates a test set within the fold
        x = []
        y = []
        for index in test_index:
            x.append(train_x[index])
            y.append(train_y[index])
        test = [x, y]

        folds.append([training, test])
    return folds
def learn_decision_tree(data_set, label):
    # Create depths
    depths = list(range(1, 14))
    # Initialize the best model
    best_model = [None, 0, float("-inf")]
    # Create 13-fold
    kf = KFold(n_splits=13)
    track = []
    for (train, test), cdepth in zip(kf.split(data_set), depths):
        # Get training set
        train_set = [data_set[i] for i in train]
        train_label = [label[i] for i in train]
        # Get validation set
        valid_set = [data_set[i] for i in test]
        valid_label = [label[i] for i in test]
        # Learn the decision tree from data
        clf = tree.DecisionTreeClassifier(max_depth=cdepth)
        clf = clf.fit(train_set, train_label)
        # Get accuracy from the model
        accuracy = clf.score(valid_set, valid_label)
        # Compare accuracies
        track.append([cdepth, accuracy])
        if accuracy > best_model[2]:
            # Update the best model
            best_model = [clf, cdepth, accuracy]
    # Plot the graph
    fig = plt.figure()
    x = [x[0] for x in track]
    y = [x[1] for x in track]
    plt.xlabel('Depth')
    plt.ylabel('Accuracy')
    plt.title('Decision Tree')
    plt.plot(x, y)
    plt.savefig('decision_tree.png')
    return best_model
def predict(X_all, X_new, features, clf, score, v=False, esr=50, sk=3, fn='submission'):
    first_day = datetime.datetime.strptime('2017-08-31 00:00:00', '%Y-%m-%d %H:%M:%S')
    temp_user = target_order[(target_order.o_day_series < 336) &
                             (target_order.o_day_series >= 274)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 336
    print('before delete: {}'.format(X_all.shape))
    X = temp_user.merge(X_all, on=['user_id', 'CreateGroup'], how='left')
    print('after delete: {}'.format(X.shape))
    temp_user = target_order[(target_order.o_day_series < 366) &
                             (target_order.o_day_series >= 366 - 75)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 366
    print('before delete: {}'.format(X_new.shape))
    X_new = temp_user.merge(X_new, on=['user_id', 'CreateGroup'], how='left')
    print('Train: {}'.format(X_new.shape))
    kf = KFold(n_splits=sk)
    print(len(features))
    Performance = []
    X_new['Prob'] = 0
    for train_index, test_index in kf.split(X):
        # .iloc replaces the removed pandas .ix indexer
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X.iloc[train_index, :].buy, X.iloc[test_index, :].buy
        clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
                eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        pred = clf.predict_proba(X_test)[:, 1]
        X_new['Prob'] = X_new['Prob'] + clf.predict_proba(X_new[features])[:, 1] / sk
        Performance.append(roc_auc_score(y_test, pred))
    print("Mean Score: {}".format(np.mean(Performance)))
    X_new['Days'] = np.random.randint(15, size=len(X_new))
    X_new['pred_date'] = X_new['Days'].apply(lambda x: (datetime.timedelta(days=x) + first_day).strftime("%Y-%m-%d"))
    X_new.sort_values(by=['Prob'], ascending=False, inplace=True)
    X_new[['user_id', 'Prob']].to_csv('prob_{}.csv'.format(fn), index=None)
    X_new[['user_id', 'pred_date']][:50000].to_csv('{}.csv'.format(fn), index=None)
    return np.mean(Performance), clf
def cross_validation(train_data, train_labels, k_range=np.arange(1, 16)):
    '''
    Perform 10-fold cross validation to find the best value for k

    Note: Previously this function took knn as an argument instead of
    train_data, train_labels. The intention was for students to take the
    training data from the knn object - this should be clearer from the new
    function signature.
    '''
    folds = 10
    kf = KFold(n_splits=folds)
    best_k = 1
    average_accuracy_for_best_k = 0
    for k in k_range:
        accuracy_sum = 0
        for train_index, test_index in kf.split(train_data):
            X_train, X_test = train_data[train_index], train_data[test_index]
            y_train, y_test = train_labels[train_index], train_labels[test_index]
            knn = KNearestNeighbor(X_train, y_train)
            validation_accuracy = classification_accuracy(knn, k, X_test, y_test)
            accuracy_sum += validation_accuracy
        average_accuracy = accuracy_sum / folds
        if average_accuracy > average_accuracy_for_best_k:
            average_accuracy_for_best_k = average_accuracy
            best_k = k
    return best_k, average_accuracy_for_best_k
def computing_cv_accuracy_LDA(in_path=None, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    for idx_train, idx_test in kf.split(y):
        print("---k-FOLD-new-executing--")
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lda.fit(X_cv_train, y_cv_train)
        n_test = len(idx_test)
        sum_u65, sum_u80 = 0, 0
        for i, test in enumerate(X_cv_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    print("--->", mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
def split_data(root_path, num_splits=4):
    mask_list = []
    for ext in ('*.mhd', '*.hdr', '*.nii'):
        mask_list.extend(sorted(glob(join(root_path, 'masks', ext))))

    assert len(mask_list) != 0, 'Unable to find any files in {}'.format(join(root_path, 'masks'))

    outdir = join(root_path, 'split_lists')
    try:
        mkdir(outdir)
    except OSError:
        pass

    kf = KFold(n_splits=num_splits)
    n = 0
    for train_index, test_index in kf.split(mask_list):
        # 'w' with newline='' replaces the Python 2 'wb' mode for csv files
        with open(join(outdir, 'train_split_' + str(n) + '.csv'), 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for i in train_index:
                writer.writerow([basename(mask_list[i])])
        with open(join(outdir, 'test_split_' + str(n) + '.csv'), 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for i in test_index:
                writer.writerow([basename(mask_list[i])])
        n += 1
def compute_matrices_for_gradient_totalcverr(self, train_x, train_y, train_z):
    if self.kernelX_use_median:
        sigmax = self.kernelX.get_sigma_median_heuristic(train_x)
        self.kernelX.set_width(float(sigmax))
    if self.kernelY_use_median:
        sigmay = self.kernelY.get_sigma_median_heuristic(train_y)
        self.kernelY.set_width(float(sigmay))
    kf = KFold(n_splits=self.K_folds)
    matrix_results = [[[None] for _ in range(self.K_folds)] for _ in range(8)]
    # xx=[[None]*10]*6 would give the same id to xx[0][0] and xx[1][0] etc., as
    # that command simply copies [None] many times; the above gives different ids.
    count = 0
    for train_index, test_index in kf.split(np.ones((self.num_samples, 1))):
        X_tr, X_tst = train_x[train_index], train_x[test_index]
        Y_tr, Y_tst = train_y[train_index], train_y[test_index]
        Z_tr, Z_tst = train_z[train_index], train_z[test_index]
        matrix_results[0][count] = self.kernelX.kernel(X_tst, X_tr)   # Kx_tst_tr
        matrix_results[1][count] = self.kernelX.kernel(X_tr, X_tr)    # Kx_tr_tr
        matrix_results[2][count] = self.kernelX.kernel(X_tst, X_tst)  # Kx_tst_tst
        matrix_results[3][count] = self.kernelY.kernel(Y_tst, Y_tr)   # Ky_tst_tr
        matrix_results[4][count] = self.kernelY.kernel(Y_tr, Y_tr)    # Ky_tr_tr
        matrix_results[5][count] = self.kernelY.kernel(Y_tst, Y_tst)  # Ky_tst_tst
        matrix_results[6][count] = cdist(Z_tst, Z_tr, 'sqeuclidean')  # D_tst_tr: squared distance matrix
        matrix_results[7][count] = cdist(Z_tr, Z_tr, 'sqeuclidean')   # D_tr_tr: squared distance matrix
        count = count + 1
    return matrix_results
class TargetEncoderNSplits(BaseTransformer):
    def __init__(self, n_splits, **kwargs):
        self.k_folds = KFold(n_splits=n_splits)
        self.target_means_map = {}

    def _target_means_names(self, columns):
        confidence_rate_names = ['target_mean_{}'.format(column) for column in columns]
        return confidence_rate_names

    def _is_null_names(self, columns):
        is_null_names = ['target_mean_is_nan_{}'.format(column) for column in columns]
        return is_null_names

    def fit(self, categorical_features, target, **kwargs):
        feature_columns, target_column = categorical_features.columns, target.columns[0]

        X_target_means = []
        self.k_folds.get_n_splits(target)
        for train_index, test_index in self.k_folds.split(target):
            X_train, y_train = categorical_features.iloc[train_index], target.iloc[train_index]
            X_test, y_test = categorical_features.iloc[test_index], target.iloc[test_index]

            train = pd.concat([X_train, y_train], axis=1)
            for column, target_mean_name in zip(feature_columns, self._target_means_names(feature_columns)):
                group_object = train.groupby(column)
                train_target_means = group_object[target_column].mean(). \
                    reset_index().rename(index=str, columns={target_column: target_mean_name})

                X_test = X_test.merge(train_target_means, on=column, how='left')
            X_target_means.append(X_test)
        X_target_means = pd.concat(X_target_means, axis=0).astype(np.float32)

        for column, target_mean_name in zip(feature_columns, self._target_means_names(feature_columns)):
            group_object = X_target_means.groupby(column)
            self.target_means_map[column] = group_object[target_mean_name].mean().reset_index()

        return self

    def transform(self, categorical_features, **kwargs):
        columns = categorical_features.columns

        for column, target_mean_name, is_null_name in zip(columns,
                                                          self._target_means_names(columns),
                                                          self._is_null_names(columns)):
            categorical_features = categorical_features.merge(self.target_means_map[column],
                                                              on=column,
                                                              how='left').astype(np.float32)
            categorical_features[is_null_name] = pd.isnull(categorical_features[target_mean_name]).astype(int)
            categorical_features[target_mean_name].fillna(0, inplace=True)

        return {'numerical_features': categorical_features[self._target_means_names(columns)],
                'categorical_features': categorical_features[self._is_null_names(columns)]}

    def load(self, filepath):
        self.target_means_map = joblib.load(filepath)
        return self

    def save(self, filepath):
        joblib.dump(self.target_means_map, filepath)
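# A hypothetical usage sketch for the transformer above; the column and target
# names ('city', 'y') are illustrative, not from the original source:
import numpy as np
import pandas as pd

categorical = pd.DataFrame({'city': [0, 0, 1, 1, 2, 2, 0, 1]})
target = pd.DataFrame({'y': [1, 0, 1, 1, 0, 0, 1, 0]})

encoder = TargetEncoderNSplits(n_splits=4)
encoder.fit(categorical, target)
encoded = encoder.transform(categorical)
print(encoded['numerical_features'].head())    # out-of-fold target means per category
print(encoded['categorical_features'].head())  # is-NaN indicator columns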
def regulCV(X, y, n_splits=10):
    kf = KFold(n_splits=n_splits)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        yield X_train, y_train, X_test, y_test
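# Example use of the generator above, assuming NumPy arrays and that the KFold
# import regulCV relies on is in scope; Ridge is just an illustrative estimator:
import numpy as np
from sklearn.linear_model import Ridge

X = np.random.randn(100, 5)
y = np.random.randn(100)
for X_train, y_train, X_test, y_test in regulCV(X, y, n_splits=5):
    model = Ridge().fit(X_train, y_train)
    print(model.score(X_test, y_test))  # R^2 on each held-out fold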
def _inner(o, **kwargs):
    if stratify:
        _, unique_counts = np.unique(o, return_counts=True)
        if np.min(unique_counts) >= 2 and np.min(unique_counts) >= n_splits:
            stratify_ = stratify
        elif np.min(unique_counts) < n_splits:
            stratify_ = False
            pv(f'stratify set to False as n_splits={n_splits} cannot be greater than the min number of members in each class ({np.min(unique_counts)}).',
               verbose)
        else:
            stratify_ = False
            pv('stratify set to False as the least populated class in o has only 1 member, which is too few.', verbose)
    else:
        stratify_ = False
    vs = 0 if train_only else 1. / n_splits if n_splits > 1 else int(valid_size * len(o)) if isinstance(valid_size, float) else valid_size
    if test_size:
        ts = int(test_size * len(o)) if isinstance(test_size, float) else test_size
        train_valid, test = train_test_split(range(len(o)), test_size=ts, stratify=o if stratify_ else None,
                                             shuffle=shuffle, random_state=random_state, **kwargs)
        test = toL(test)
        if shuffle:
            test = random_shuffle(test, random_state)
        if vs == 0:
            train, _ = RandomSplitter(0, seed=random_state)(o[train_valid])
            train = toL(train)
            if balance:
                train = train[balance_idx(o[train], random_state=random_state)]
            if shuffle:
                train = random_shuffle(train, random_state)
            train_ = L(L([train]) * n_splits) if n_splits > 1 else train
            valid_ = L(L([train]) * n_splits) if n_splits > 1 else train
            test_ = L(L([test]) * n_splits) if n_splits > 1 else test
            if n_splits > 1:
                return [split for split in itemify(train_, valid_, test_)]
            else:
                return train_, valid_, test_
        elif n_splits > 1:
            if stratify_:
                splits = StratifiedKFold(n_splits=n_splits, shuffle=shuffle,
                                         random_state=random_state).split(np.arange(len(train_valid)), o[train_valid])
            else:
                splits = KFold(n_splits=n_splits, shuffle=shuffle,
                               random_state=random_state).split(np.arange(len(train_valid)))
            train_, valid_ = L([]), L([])
            for train, valid in splits:
                train, valid = toL(train), toL(valid)
                if balance:
                    train = train[balance_idx(o[train], random_state=random_state)]
                if shuffle:
                    train = random_shuffle(train, random_state)
                    valid = random_shuffle(valid, random_state)
                train_.append(L(L(train_valid)[train]))
                valid_.append(L(L(train_valid)[valid]))
            test_ = L(L([test]) * n_splits)
            return [split for split in itemify(train_, valid_, test_)]
        else:
            train, valid = train_test_split(range(len(train_valid)), test_size=vs, random_state=random_state,
                                            stratify=o[train_valid] if stratify_ else None, shuffle=shuffle, **kwargs)
            train, valid = toL(train), toL(valid)
            if balance:
                train = train[balance_idx(o[train], random_state=random_state)]
            if shuffle:
                train = random_shuffle(train, random_state)
                valid = random_shuffle(valid, random_state)
            return (L(L(train_valid)[train]), L(L(train_valid)[valid]), test)
    else:
        if vs == 0:
            train, _ = RandomSplitter(0, seed=random_state)(o)
            train = toL(train)
            if balance:
                train = train[balance_idx(o[train], random_state=random_state)]
            if shuffle:
                train = random_shuffle(train, random_state)
            train_ = L(L([train]) * n_splits) if n_splits > 1 else train
            valid_ = L(L([train]) * n_splits) if n_splits > 1 else train
            if n_splits > 1:
                return [split for split in itemify(train_, valid_)]
            else:
                return (train_, valid_)
        elif n_splits > 1:
            if stratify_:
                splits = StratifiedKFold(n_splits=n_splits, shuffle=shuffle,
                                         random_state=random_state).split(np.arange(len(o)), o)
            else:
                splits = KFold(n_splits=n_splits, shuffle=shuffle,
                               random_state=random_state).split(np.arange(len(o)))
            train_, valid_ = L([]), L([])
            for train, valid in splits:
                train, valid = toL(train), toL(valid)
                if balance:
                    train = train[balance_idx(o[train], random_state=random_state)]
                if shuffle:
                    train = random_shuffle(train, random_state)
                    valid = random_shuffle(valid, random_state)
                if not isinstance(train, (list, L)):
                    train = train.tolist()
                if not isinstance(valid, (list, L)):
                    valid = valid.tolist()
                train_.append(L(train))
                valid_.append(L(L(valid)))
            return [split for split in itemify(train_, valid_)]
        else:
            train, valid = train_test_split(range(len(o)), test_size=vs, random_state=random_state,
                                            stratify=o if stratify_ else None, shuffle=shuffle, **kwargs)
            train, valid = toL(train), toL(valid)
            if balance:
                train = train[balance_idx(o[train], random_state=random_state)]
            return train, valid
def train_classifiers(train_vecs, train_labels, typ='bow'):
    X = np.array(train_vecs)
    y = np.array(train_labels)

    kf = KFold(5, shuffle=True, random_state=42)
    cv_rf_f1, cv_lrsgd_f1, cv_svcsgd_f1 = [], [], []
    cv_rf_ac, cv_lrsgd_ac, cv_svcsgd_ac = [], [], []
    y_pred_sgd, y_pred_sgh, y_pred_rf = [], [], []
    y_val_pooled = []  # fold labels, in the same order as the pooled predictions

    for train_ind, val_ind in kf.split(X, y):
        # Assign CV IDX
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind]
        y_val_pooled.append(y_val)

        # Scale Data
        scaler = StandardScaler()
        X_train_scale = scaler.fit_transform(X_train)
        X_val_scale = scaler.transform(X_val)

        # Logistic Regression
        # lr = LogisticRegression(
        #     max_iter=4000,
        #     class_weight='balanced',
        #     solver='newton-cg',
        #     fit_intercept=True
        # ).fit(X_train_scale, y_train)
        #
        # y_pred = lr.predict(X_val_scale)
        # cv_lr_f1.append(f1_score(y_val, y_pred, average='weighted'))

        # Logistic Regression SGD
        sgd = linear_model.SGDClassifier(
            max_iter=1000,
            tol=1e-3,
            loss='log',
            class_weight='balanced'
        ).fit(X_train_scale, y_train)
        y_pred_sgd.append(sgd.predict(X_val_scale))
        cv_lrsgd_f1.append(f1_score(y_val, y_pred_sgd[-1], average='macro'))
        cv_lrsgd_ac.append(accuracy_score(y_val, y_pred_sgd[-1]))

        # SGD Modified Huber
        sgd_huber = linear_model.SGDClassifier(
            max_iter=1000,
            tol=1e-3,
            alpha=20,
            loss='modified_huber',
            class_weight='balanced'
        ).fit(X_train_scale, y_train)
        y_pred_sgh.append(sgd_huber.predict(X_val_scale))
        cv_svcsgd_f1.append(f1_score(y_val, y_pred_sgh[-1], average='macro'))
        cv_svcsgd_ac.append(accuracy_score(y_val, y_pred_sgh[-1]))

        # Random Forest
        rf = RandomForestClassifier(
            class_weight='balanced'
        ).fit(X_train_scale, y_train)
        y_pred_rf.append(rf.predict(X_val_scale))
        cv_rf_f1.append(f1_score(y_val, y_pred_rf[-1], average='macro'))
        cv_rf_ac.append(accuracy_score(y_val, y_pred_rf[-1]))

    y_pred_sgd_final = [item for sublist in y_pred_sgd for item in sublist]
    y_pred_sgh_final = [item for sublist in y_pred_sgh for item in sublist]
    y_pred_rf_final = [item for sublist in y_pred_rf for item in sublist]
    # KFold shuffles, so the pooled predictions are not in y's original order;
    # rebuild the label vector in pooled fold order before computing metrics.
    y = np.concatenate(y_val_pooled)

    # print(f'Logistic Regression Val f1: {np.mean(cv_lr_f1):.3f} +- {np.std(cv_lr_f1):.3f}')
    print(f'SGD Val f1: {np.mean(cv_lrsgd_f1):.3f} +- {np.std(cv_lrsgd_f1):.3f}', typ)
    print(f'SVM Huber Val f1: {np.mean(cv_svcsgd_f1):.3f} +- {np.std(cv_svcsgd_f1):.3f}', typ)
    print(f'Random Forest Val f1: {np.mean(cv_rf_f1):.3f} +- {np.std(cv_rf_f1):.3f}', typ)
    print("\n")
    print(f'SGD Val acc: {np.mean(cv_lrsgd_ac):.3f} +- {np.std(cv_lrsgd_ac):.3f}', typ)
    print(f'SVM Huber Val acc: {np.mean(cv_svcsgd_ac):.3f} +- {np.std(cv_svcsgd_ac):.3f}', typ)
    print(f'Random Forest Val acc: {np.mean(cv_rf_ac):.3f} +- {np.std(cv_rf_ac):.3f}', typ)
    print("\n")
    print("Precision (micro) SGD: %f" % precision_score(y, y_pred_sgd_final, average='micro'), typ)
    print("Recall (micro) SGD: %f" % recall_score(y, y_pred_sgd_final, average='micro'), typ)
    print("F1 score (micro) SGD: %f" % f1_score(y, y_pred_sgd_final, average='micro'), typ, end='\n\n')
    print("Precision (macro) SGD: %f" % precision_score(y, y_pred_sgd_final, average='macro'), typ)
    print("Recall (macro) SGD: %f" % recall_score(y, y_pred_sgd_final, average='macro'), typ)
    print("F1 score (macro) SGD: %f" % f1_score(y, y_pred_sgd_final, average='macro'), typ, end='\n\n')
    print("Precision (weighted) SGD: %f" % precision_score(y, y_pred_sgd_final, average='weighted'), typ)
    print("Recall (weighted) SGD: %f" % recall_score(y, y_pred_sgd_final, average='weighted'), typ)
    print("F1 score (weighted) SGD: %f" % f1_score(y, y_pred_sgd_final, average='weighted'), typ)
    print("\n")
    print("Precision (micro) SVM Huber: %f" % precision_score(y, y_pred_sgh_final, average='micro'), typ)
    print("Recall (micro) SVM Huber: %f" % recall_score(y, y_pred_sgh_final, average='micro'), typ)
    print("F1 score (micro) SVM Huber: %f" % f1_score(y, y_pred_sgh_final, average='micro'), typ, end='\n\n')
    print("Precision (macro) SVM Huber: %f" % precision_score(y, y_pred_sgh_final, average='macro'), typ)
    print("Recall (macro) SVM Huber: %f" % recall_score(y, y_pred_sgh_final, average='macro'), typ)
    print("F1 score (macro) SVM Huber: %f" % f1_score(y, y_pred_sgh_final, average='macro'), typ, end='\n\n')
    print("Precision (weighted) SVM Huber: %f" % precision_score(y, y_pred_sgh_final, average='weighted'), typ)
    print("Recall (weighted) SVM Huber: %f" % recall_score(y, y_pred_sgh_final, average='weighted'), typ)
    print("F1 score (weighted) SVM Huber: %f" % f1_score(y, y_pred_sgh_final, average='weighted'), typ)
    print("\n")
    print("Precision (micro) RF: %f" % precision_score(y, y_pred_rf_final, average='micro'), typ)
    print("Recall (micro) RF: %f" % recall_score(y, y_pred_rf_final, average='micro'), typ)
    print("F1 score (micro) RF: %f" % f1_score(y, y_pred_rf_final, average='micro'), typ, end='\n\n')
    print("Precision (macro) RF: %f" % precision_score(y, y_pred_rf_final, average='macro'), typ)
    print("Recall (macro) RF: %f" % recall_score(y, y_pred_rf_final, average='macro'), typ)
    print("F1 score (macro) RF: %f" % f1_score(y, y_pred_rf_final, average='macro'), typ, end='\n\n')
    print("Precision (weighted) RF: %f" % precision_score(y, y_pred_rf_final, average='weighted'), typ)
    print("Recall (weighted) RF: %f" % recall_score(y, y_pred_rf_final, average='weighted'), typ)
    print("F1 score (weighted) RF: %f" % f1_score(y, y_pred_rf_final, average='weighted'), typ)

    return [sgd, sgd_huber, rf]
# Prepare data
# ===============================================================================================

# Load dataset
inputData = np.loadtxt(open('DATASET.csv'), delimiter=",", skiprows=1, dtype='float')

# Attributes except 'Class' column
X = inputData[:, :-1]
# Class labels
y = inputData[:, -1]

# Standardize data
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

kf = KFold(n_splits=2)
# Split into train and test sets (the variables keep the last fold)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# ===============================================================================================
# KNN
# ===============================================================================================

# Record the moment we start building the model
time_ini_knn = time()

# Apply 5-NN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
import os

from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge

from src.utils.CONSTANTS import CITY_ID_COL, NEIGHBORHOOD_ID_COL
from config import UNIFIED_FORMS_FILE, PROCESSED_DATA_DIR
from src.train.constants import TIME_COL, DATE_COL, PRED_COL, GT_COL

AGGREGATED_DIR = os.path.join(PROCESSED_DATA_DIR, 'aggregated')

N_splits = 3
kfold = KFold(n_splits=N_splits, random_state=1, shuffle=True)
MINIMUM_PER_REGION = 50

x_agg_mode = {'mode': 'range', 'n_days': 3, 'min_per_region': 50}
y_agg_mode = {'mode': 'range', 'n_days': 2, 'min_per_region': 15}

city_type = 'city'
neighborhood_type = 'neighbor'

lower_cut_date = '2020-03-21'
upper_cut_date = '2020-04-05'
x_train_date = '2020-03-26'
y_train_date = '2020-03-30'
x_test_date = '2020-03-30'
y_test_date = '2020-04-03'

y_col_name = 'confirmed_cases'
save_map = False
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.model_selection import train_test_split, KFold

# 1. Data
dataset = pd.read_csv('../data/csv/iris_sklearn.csv', header=0, index_col=0)

x = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=32)

kfold = KFold(n_splits=5, shuffle=True)

parameters = [
    {"C": [1, 10, 100, 1000], "kernel": ["linear"]},
    {"C": [1, 10, 100], "kernel": ["rbf"], "gamma": [0.001, 0.0001]},
    {"C": [1, 10, 100, 1000], "kernel": ["sigmoid"], "gamma": [0.001, 0.0001]}
]

# 2. Model construction
def compare_estimators(estimators: list, datasets, metrics: list, n_cv_folds=10, decimals=3, cellsize=22, verbose=True):
    if type(estimators) != list:
        raise Exception("First argument needs to be a list of tuples containing ('name', Estimator pairs)")
    if type(metrics) != list:
        raise Exception("Argument metrics needs to be a list of tuples containing ('name', scoring function pairs)")

    mean_results = {d[0]: [] for d in datasets}
    std_results = {d[0]: [] for d in datasets}

    # loop over datasets
    for d in tqdm(datasets):
        if verbose:
            print("comparing on dataset", d[0])
        mean_result = []
        std_result = []
        X, y = get_dataset(d[1])

        # loop over estimators
        for (est_name, est) in estimators:
            mresults = [[] for i in range(len(metrics))]

            # loop over folds
            kf = KFold(n_splits=n_cv_folds)
            for train_idx, test_idx in kf.split(X):
                start = time.time()
                est.fit(X[train_idx, :], y[train_idx])
                y_pred = est.predict(X[test_idx, :])
                end = time.time()

                # loop over metrics
                for i, (met_name, met) in enumerate(metrics):
                    if met_name == 'Time':
                        mresults[i].append(end - start)
                    elif met_name == 'Complexity':
                        if est_name != 'MLPClassifier (sklearn)':
                            mresults[i].append(get_complexity(est))
                    else:
                        try:
                            mresults[i].append(met(y[test_idx], y_pred))
                        except Exception:
                            mresults[i].append(met(to_numeric(y[test_idx]), to_numeric(y_pred)))

            for i in range(len(mresults)):
                mean_result.append(np.mean(mresults[i]))
                std_result.append(np.std(mresults[i]) / n_cv_folds)

        mean_results[d[0]] = mean_result
        std_results[d[0]] = std_result
    return mean_results, std_results
def dcv_rgr(X, y, model, param_grid, niter):
    """
    Double cross validation (regression)

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        X training+test data
    y : array-like, shape = [n_samples]
        y training+test data
    model : machine learning model (scikit-learn)
    param_grid : dict or list of dictionaries
        Dictionary with parameters names (string) as keys and lists of
        parameter settings to try as values, or a list of such
        dictionaries, in which case the grids spanned by each dictionary
        in the list are explored.
    niter : int
        number of DCV iterations

    Returns
    -------
    None
    """
    # parameters
    ns_in = 3  # n_splits for inner loop
    ns_ou = 3  # n_splits for outer loop

    scores = np.zeros((niter, 3))
    for iiter in range(niter):
        ypreds = np.array([])  # list of predicted y in outer loop
        ytests = np.array([])  # list of y_test in outer loop

        kf_ou = KFold(n_splits=ns_ou, shuffle=True)

        # [start] outer loop for test of the generalization error
        for train_index, test_index in kf_ou.split(X):
            X_train, X_test = X[train_index], X[test_index]  # inner loop CV
            y_train, y_test = y[train_index], y[test_index]  # outer loop

            # [start] inner loop CV for hyper parameter optimization
            kf_in = KFold(n_splits=ns_in, shuffle=True)
            gscv = GridSearchCV(model, param_grid, cv=kf_in)
            gscv.fit(X_train, y_train)
            # [end] inner loop CV for hyper parameter optimization

            # test of the generalization error
            ypred = gscv.predict(X_test)
            ypreds = np.append(ypreds, ypred)
            ytests = np.append(ytests, y_test)
        # [end] outer loop for test of the generalization error

        rmse = np.sqrt(mean_squared_error(ytests, ypreds))
        mae = mean_absolute_error(ytests, ypreds)
        r2 = r2_score(ytests, ypreds)
        # print('DCV:RMSE, MAE, R^2 = {:.3f}, {:.3f}, {:.3f}'\
        #       .format(rmse, mae, r2))
        scores[iiter, :] = np.array([rmse, mae, r2])

    means, stds = np.mean(scores, axis=0), np.std(scores, axis=0)
    print()
    print('Double Cross Validation')
    print('In {:} iterations, average +/- standard deviation'.format(niter))
    print('RMSE DCV: {:.3f} (+/-{:.3f})'.format(means[0], stds[0]))
    print('MAE DCV: {:.3f} (+/-{:.3f})'.format(means[1], stds[1]))
    print('R^2 DCV: {:.3f} (+/-{:.3f})'.format(means[2], stds[2]))
    print('DCV:RMSE, MAE, R^2 = {:.3f}, {:.3f}, {:.3f} (ave)'
          .format(means[0], means[1], means[2]))
    print('DCV:RMSE, MAE, R^2 = {:.3f}, {:.3f}, {:.3f} (std)'
          .format(stds[0], stds[1], stds[2]))
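# A hypothetical call of the routine above (the dataset and grid are
# illustrative), assuming the imports dcv_rgr relies on (np, KFold,
# GridSearchCV and the sklearn metrics) are in scope:
from sklearn.datasets import make_regression
from sklearn.svm import SVR

X, y = make_regression(n_samples=200, n_features=10, noise=0.5, random_state=0)
dcv_rgr(X, y, SVR(), {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}, niter=3)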
def evaluate(self, ind, **kwargs):
    # ind.phenotype will be a string, including function definitions etc.
    # When we exec it, it will create a value XXX_output_XXX, but we exec
    # inside an empty dict for safety.
    p, d = ind.phenotype, {}
    genome, output, invalid, max_depth, nodes = ind.tree.get_tree_info(
        params['BNF_GRAMMAR'].non_terminals.keys(), [], [])
    Logger.log("Depth: {0}\tGenome: {1}".format(max_depth, genome))

    # Exec the phenotype.
    X_test, y_test = self.X_test, self.y_test
    image_size = X_test[0].shape
    flat_ind, kernel_size = NetworkProcessor.process_network(ind, image_size)
    Logger.log("Individual: {}".format(flat_ind))
    Logger.log("New kernel size: {}".format(kernel_size))

    new_conv_layers = []
    for i, k in enumerate(self.conv_layers):
        new_conv_layers.append((k[0], kernel_size[i], k[2], k[3], k[4]))

    train_loss = stats('mse')
    test_loss = stats('accuracy')
    kf = KFold(n_splits=params['CROSS_VALIDATION_SPLIT'])
    net = ClassificationNet(self.fcn_layers, new_conv_layers)
    fitness, fold = 0, 1
    Logger.log("Training Start: ")

    # Cross validation
    s_time = np.empty((kf.get_n_splits()))
    validation_acc = np.empty((kf.get_n_splits()))
    test_acc = np.empty((kf.get_n_splits()))
    for train_index, val_index in kf.split(self.X_train):
        X_train, X_val = self.X_train[train_index], self.X_train[val_index]
        y_train, y_val = self.y_train[train_index], self.y_train[val_index]
        data_train = DataIterator(X_train, y_train, params['BATCH_SIZE'])

        early_ckpt, early_stop, early_crit, epsilon = 20, [], params['EARLY_STOP_FREQ'], params['EARLY_STOP_EPSILON']
        s_time[fold - 1] = time.time()

        # Train model
        net.model.reinitialize_params()
        for epoch in range(1, params['NUM_EPOCHS'] + 1):
            # mini-batch training
            for x, y in data_train:
                net.train(epoch, x, y, train_loss)

            # log training loss
            if epoch % params['TRAIN_FREQ'] == 0:
                Logger.log("Epoch {} Training loss (NLL): {:.6f}".format(
                    epoch, train_loss.getLoss('mse')))

            # log validation/test loss
            if epoch % params['VALIDATION_FREQ'] == 0:
                net.test(X_val, y_val, test_loss)
                Logger.log("Epoch {} Validation loss (NLL/Accuracy): {:.6f} {:.6f}".format(
                    epoch, test_loss.getLoss('mse'), test_loss.getLoss('accuracy')))
                net.test(X_test, y_test, test_loss)
                Logger.log("Epoch {} Test loss (NLL/Accuracy): {:.6f} {:.6f}".format(
                    epoch, test_loss.getLoss('mse'), test_loss.getLoss('accuracy')))

            # check for early stop
            if epoch == early_ckpt:
                accuracy = net.test(X_test, y_test, test_loss, print_confusion=True)
                early_stop.append(accuracy)
                if len(early_stop) > 3:
                    latest_acc = early_stop[-early_crit:]
                    latest_acc = np.subtract(latest_acc, latest_acc[1:] + [0])
                    if (abs(latest_acc[:-1]) < epsilon).all() == True:
                        Logger.log("Early stopping at epoch {} (latest {} ckpts): {}".format(
                            epoch, early_crit,
                            " ".join(["{:.4f}".format(x) for x in early_stop[-early_crit:]])))
                        break
                early_ckpt = min(early_ckpt + 300, early_ckpt * 2)

        # Validate model
        net.test(X_val, y_val, test_loss)
        validation_acc[fold - 1] = test_loss.getLoss('accuracy')
        Logger.log("Cross Validation [Fold {}/{}] Validation (NLL/Accuracy): {:.6f} {:.6f}".format(
            fold, kf.get_n_splits(), test_loss.getLoss('mse'), test_loss.getLoss('accuracy')))

        # Test model
        net.test(X_test, y_test, test_loss)
        test_acc[fold - 1] = test_loss.getLoss('accuracy')
        Logger.log("Cross Validation [Fold {}/{}] Test (NLL/Accuracy): {:.6f} {:.6f}".format(
            fold, kf.get_n_splits(), test_loss.getLoss('mse'), test_loss.getLoss('accuracy')))

        # Calculate time
        s_time[fold - 1] = time.time() - s_time[fold - 1]
        Logger.log("Cross Validation [Fold {}/{}] Training Time (m / m per epoch): {:.3f} {:.3f}".format(
            fold, kf.get_n_splits(), s_time[fold - 1] / 60, s_time[fold - 1] / 60 / epoch))

        fold = fold + 1

    fitness = validation_acc.mean()
    for i in range(0, kf.get_n_splits()):
        Logger.log("STAT -- Model[{}/{}] #{:.3f}m Validation / Generalization accuracy (%): {:.4f} {:.4f}".format(
            i, kf.get_n_splits(), s_time[i] / 60, validation_acc[i] * 100, test_acc[i] * 100))
    Logger.log("STAT -- Mean Validation / Generalization accuracy (%): {:.4f} {:.4f}".format(
        validation_acc.mean() * 100, test_acc.mean() * 100))

    # ind.net = net
    params['CURRENT_EVALUATION'] += 1

    return fitness
# pkl_file = open('data.pkl', 'rb')
# [names, base, series, labels] = pickle.load(pkl_file)
# pkl_file.close()

# number of samples
N_Samples = 3050
base_dim = 19
series_dims = [27 - rounds, 27 - rounds, 27 - rounds]

accuracy_list, auc_list = [], []
name_list = []
score_list = []
label_list = []

# the old cross_validation API was KFold(N_Samples, n_folds=5); with
# sklearn.model_selection, split on the data instead
kf = KFold(n_splits=5)
for train, test in kf.split(base):
    # base-type variables
    x_train_base = base[train]
    x_test_base = base[test]
    # sequence-type variables
    x_train_series = [series[i][train] for i in range(0, len(series_dims))]
    x_test_series = [series[i][test] for i in range(0, len(series_dims))]
    # labels
    y_train = labels[train]
sampler = RandomUnderSampler(random_state=42)
X, Y = sampler.fit_resample(X, Y)
print(np.sum(Y == 1), np.sum(Y == 0))

# Select the 5 best features
selector = SelectKBest(k=5)
selector.fit(X, Y)
mask = selector.get_support()

# Check which variables were selected
print(bank_df.drop('y', axis=1).columns)
print(mask)

# Cross-validation with KFold
# The first argument is the number of parts to split the dataset into
# The second argument tells it to shuffle the dataset
kf = KFold(n_splits=18, shuffle=True)
scores = []

# Build a model and check its accuracy while varying the train/test combination.
for train_id, test_id in kf.split(X):
    # Extract the training data
    x = X[train_id]
    y = Y[train_id]
    # Create a decision tree classifier instance for classification.
    clf = tree.DecisionTreeClassifier()
    # Build the decision tree model from the training data
    # (model construction uses the default parameters as-is).
    clf.fit(x, y)
    # Apply the model to the test data with predict to get its output
    pred_y = clf.predict(X[test_id])
    # Use accuracy_score to compute the model's accuracy from the number of correct and incorrect outputs
'''
Using apex for faster training

optimizer_list = []
for i in range(10):
    optimizer_list.append(AdamW(model.parameters(), lr=3e-5, correct_bias=False))
model = amp.initialize(model, opt_level="O2", verbosity=0)
'''

''' Save origin state dict of Model and Optimizer '''
torch.save(model.state_dict(), 'origin_sd.pth')
origin_sd = torch.load('origin_sd.pth')

# Training with K-fold
new_data = data.sample(frac=1).reset_index(drop=True)
kf = KFold(2)
BATCH_SIZE = 7
EPOCH = 5
LEARNING_RATE = 2e-5
last_predict = []
i = 0
for train_idx, test_idx in tqdm(kf.split(new_data)):
    train_data = new_data.iloc[train_idx]
    test_data = new_data.iloc[test_idx]
    print(model.load_state_dict(origin_sd))
    '''
    Get optimizer for each KFold
    optimizer = optimizer_list[i]
def validate(): """ run KFOLD method for regression """ #defining directories dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged" dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation" surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef" #cd to the lagged predictors directory os.chdir(dir_in) x = 475 y = 476 #empty dataframe for model validation df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \ 'num_95pcs','corrn', 'rmse']) #looping through for tg in range(x,y): os.chdir(dir_in) tg_name = os.listdir()[tg] print(tg, tg_name) ########################################## #check if this tg is already taken care of ########################################## os.chdir(dir_out) if os.path.isfile(tg_name): return "file already analyzed!" os.chdir(dir_in) #load predictor pred = pd.read_csv(tg_name) pred.drop('Unnamed: 0', axis = 1, inplace = True) #add squared and cubed wind terms (as in WPI model) pickTerms = lambda x: x.startswith('wnd') wndTerms = pred.columns[list(map(pickTerms, pred.columns))] wnd_sqr = pred[wndTerms]**2 wnd_cbd = pred[wndTerms]**3 pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1) #standardize predictor data dat = pred.iloc[:,1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1) #load surge data os.chdir(surge_path) surge = pd.read_csv(tg_name) surge.drop('Unnamed: 0', axis = 1, inplace = True) #remove duplicated surge rows surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True) surge.reset_index(inplace = True) surge.drop('index', axis = 1, inplace = True) #adjust surge time format to match that of pred time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d')) surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date']) time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1) #merge predictors and surge to find common time frame pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right') pred_surge.sort_values(by = 'date', inplace = True) #find rows that have nans and remove them row_nan = pred_surge[pred_surge.isna().any(axis =1)] pred_surge.drop(row_nan.index, axis = 0, inplace = True) pred_surge.reset_index(inplace = True) pred_surge.drop('index', axis = 1, inplace = True) #in case pred and surge don't overlap if pred_surge.shape[0] == 0: print('-'*80) print('Predictors and Surge don''t overlap') print('-'*80) continue pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \ pred_surge['date'])), \ columns = ['date']) #prepare data for training/testing X = pred_surge.iloc[:,1:-1] y = pd.DataFrame(pred_surge['surge']) y = y.reset_index() y.drop(['index'], axis = 1, inplace = True) #apply PCA pca = PCA(.95) pca.fit(X) X_pca = pca.transform(X) #apply 10 fold cross validation kf = KFold(n_splits=10, random_state=29) metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs']) for train_index, test_index in kf.split(X): X_train, X_test = X_pca[train_index], X_pca[test_index] y_train, y_test = y['surge'][train_index], y['surge'][test_index] #train regression model lm = LinearRegression() lm.fit(X_train, y_train) #predictions predictions = lm.predict(X_test) # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \ # pd.DataFrame(np.array(y_test))], \ # axis = 1) # pred_obs.columns = ['pred', 'obs'] # combo = 
            # evaluation metrics - keep the fold only if the correlation is significant
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

        # number of years used to train/test the model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -
                     pred_surge['date'][0]).days / 365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  # number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)

        print('num_year = ', num_years, ' num_pc = ', num_pc,
              'avg_corr = ', np.mean(metric_corr),
              ' - avg_rmse (m) = ', np.mean(metric_rmse), '\n')

        # original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years,
                               num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year',
                          'num_95pcs', 'corrn', 'rmse']
        df = pd.concat([df, new_df], axis=0)

        # save df as csv - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)

        # cd back to dir_in
        os.chdir(dir_in)
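# A compact sketch of the same evaluation idea using sklearn's Pipeline and
# cross_val_score on synthetic data, assuming the goal is the mean R^2 of a
# PCA(0.95) + LinearRegression model over 10 shuffled folds. The data here is
# random and only stands in for the standardized predictors and surge values.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline

rng = np.random.RandomState(29)
X_demo = rng.randn(200, 30)
y_demo = X_demo[:, :3].sum(axis=1) + 0.1 * rng.randn(200)

pipe = make_pipeline(PCA(0.95), LinearRegression())
scores = cross_val_score(pipe, X_demo, y_demo,
                         cv=KFold(n_splits=10, shuffle=True, random_state=29))
# fitting PCA inside the pipeline refits it per fold, avoiding test-data leakage
print(scores.mean())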
# ### Classifier 1 : Decision Trees with Pruning
# Steps in the process:
# * Create the learning curves using test data and cross-validation (unpruned and non-optimized)
# * Create validation curves on 2 hyperparameters to find the best hyperparameter values
# * Recreate the learning curve with the correct values of the hyperparameters
# * After tuning the classifier using the two found hyperparameters, use the classifier to predict the results and collect metrics

# In[4]:

scorer = make_scorer(accuracy_score)

# In[5]:

print("Decision Tree: Create Learning Curves")
dtree_classifier = DecisionTreeClassifier()
cv = KFold(n_splits=5, shuffle=True)
dt_lc1_train_sizes, dt_lc1_train_scores, dt_lc1_validation_scores = learning_curve(
    dtree_classifier, X_train, Y_train,
    train_sizes=np.linspace(0.05, 1.0, 20),
    cv=cv, scoring=scorer, n_jobs=4)
print("Decision Tree: Done with learning curve")

# In[6]:

plot_learning_curve(
    dt_lc1_train_sizes, dt_lc1_train_scores, dt_lc1_validation_scores,
    "Figure 1.1.1: Decision Tree Learning Curve (Unpruned) \n (Census Income Data)")
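# A minimal sketch of the validation-curve step described above, assuming
# max_depth is one of the two hyperparameters being tuned; the iris data is a
# stand-in for the census income set used in the snippet.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold, validation_curve
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = load_iris(return_X_y=True)
depths = np.arange(1, 11)
train_scores, val_scores = validation_curve(
    DecisionTreeClassifier(), X_demo, y_demo,
    param_name='max_depth', param_range=depths,
    cv=KFold(n_splits=5, shuffle=True, random_state=0))
# pick the depth with the best mean cross-validation score
best_depth = depths[val_scores.mean(axis=1).argmax()]
print(best_depth)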
def _splitter(self):
    # KFold rejects a random_state when shuffle is False, so only pass it when shuffling
    shuffle = self._shuffle.value
    return KFold(n_splits=self._folds.value,
                 shuffle=shuffle,
                 random_state=self._randomSeed.value if shuffle else None).split
cantidadDeParametros = data.shape[1] - 1  # number of features
# Labels
y = data[:, 0]
x = []
print(y)
# print(data[0][1::])
for i in range(0, cantidadDeDatos):
    x.append(data[i][1:])
x = np.array(x)

kf = KFold(n_splits=5, shuffle=True)
for i in range(1, 11):
    print("=================== Measurement with K = ", i)
    clf = KNeighborsClassifier(n_neighbors=i)
    accp = 0
    for train_index, test_index in kf.split(x):
        x_train = x[train_index, :]
        y_train = y[train_index]
        clf.fit(x_train, y_train)
        x_test = x[test_index, :]
        y_test = y[test_index]
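# The loop above scores one value of K at a time by hand; a sketch of the same
# search with cross_val_score, assuming accuracy is the metric of interest (the
# digits data is a placeholder for the snippet's `data` array).
from sklearn.datasets import load_digits
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X_demo, y_demo = load_digits(return_X_y=True)
cv = KFold(n_splits=5, shuffle=True, random_state=0)
for k in range(1, 11):
    acc = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                          X_demo, y_demo, cv=cv).mean()
    print(f"K = {k}: mean accuracy = {acc:.3f}")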
#X['modularity'] = dados['modularity']
#X['global_average_link_distance'] = dados['global_average_link_distance']
#X['eigenvector'] = dados['eigenvector']
X['coreness'] = dados['coreness']
X['transitivity'] = dados['transitivity']
#X['average_path_length'] = dados['average_path_length']
#X['eccentricity'] = dados['eccentricity']
#X['pagerank'] = dados['pagerank']
#X['grauMedio'] = dados['grauMedio']
X['links'] = dados['links']

Y = np.asarray(dados['flag'])
X = np.asarray(X)

kf = KFold(n_splits=10)  # splits the dataset into 10 parts: 9 for training, 1 for testing
a = 0
f = 0
p = 0
r = 0
i = 0
for train_index, test_index in kf.split(X):
    i += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                                 random_state=0).fit(X_train, y_train)
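# The counters a, f, p, r above appear to accumulate accuracy, F1, precision and
# recall per fold; a sketch of the same bookkeeping with cross_validate on toy
# data (the feature matrix here is synthetic, standing in for X and Y above).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_validate

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
res = cross_validate(RandomForestClassifier(n_estimators=100, max_depth=2,
                                            random_state=0),
                     X_demo, y_demo, cv=KFold(n_splits=10),
                     scoring=['accuracy', 'f1', 'precision', 'recall'])
for metric in ['accuracy', 'f1', 'precision', 'recall']:
    print(metric, res[f'test_{metric}'].mean())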
dat_overlap_vars.hist()
dat_separate_vars = dat_separate_vars.dropna()
dat_separate_vars = dat_separate_vars[(np.abs(stats.zscore(dat_separate_vars)) < float(std_dev)).all(axis=1)]
dat_separate_vars.hist()

# %% try some linear regression
X_sep = dat_separate_vars.drop('distance', axis=1)
y_sep = dat_separate_vars['distance']
model = LinearRegression()
scores_sep = []
kfold = KFold(n_splits=10, shuffle=True, random_state=123)
for i, (train, test) in enumerate(kfold.split(X_sep, y_sep)):
    model.fit(X_sep.iloc[train, :], y_sep.iloc[train])
    scores_sep.append(model.score(X_sep.iloc[test, :], y_sep.iloc[test]))
print(np.mean(scores_sep))

X_over = dat_overlap_vars.drop('distance', axis=1)
y_over = dat_overlap_vars['distance']
scores_over = []
for i, (train, test) in enumerate(kfold.split(X_over, y_over)):
    model.fit(X_over.iloc[train, :], y_over.iloc[train])
    scores_over.append(model.score(X_over.iloc[test, :], y_over.iloc[test]))
print(np.mean(scores_over))

# %% different methods of cross validation:
# note: LinearRegression(normalize=True) was removed in scikit-learn 1.2;
# standardize with a pipeline instead
model_norm = make_pipeline(StandardScaler(), LinearRegression())
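# The two manual loops above can be condensed with cross_val_score, which fits
# and scores on each split internally; a sketch on synthetic data (stand-ins
# for X_sep/y_sep), assuming R^2, LinearRegression's default score, is wanted.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

X_demo, y_demo = make_regression(n_samples=150, n_features=5, noise=10.0,
                                 random_state=123)
kfold = KFold(n_splits=10, shuffle=True, random_state=123)
scores = cross_val_score(LinearRegression(), X_demo, y_demo, cv=kfold)
print(np.mean(scores))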
def cross_val_score(X, y, model_creation_fun, save_dir, n_folds=4): kfold = KFold(n_splits=n_folds) fold_liks = np.empty(n_folds) for i, (cur_train_ind, cur_test_ind) in tqdm(enumerate(kfold.split(X, y))): cur_X = X[cur_train_ind] cur_y = y[cur_train_ind] gpf.reset_default_graph_and_session() model = model_creation_fun() model.fit(cur_X, cur_y) cur_save_dir = join(save_dir, f'fold_{i + 1}') os.makedirs(cur_save_dir, exist_ok=True) model.save_model(cur_save_dir) cur_test_x = X[cur_test_ind] cur_test_y = y[cur_test_ind] log_liks = model.calculate_log_likelihood(cur_test_x, cur_test_y) marg_pred = pd.DataFrame( model.predict_marginal_probabilities(cur_test_x)) marg_pred.to_csv(join(cur_save_dir, 'marginal_probs.csv')) pd.DataFrame(cur_test_y).to_csv(join(cur_save_dir, 'y_t.csv')) # I am also interested in the log loss. y_t_df = pd.DataFrame(cur_test_y) neg_log_loss_results = multi_class_eval(marg_pred, y_t_df, neg_log_loss_with_labels, 'log_lik') neg_log_loss_results.to_csv( join(cur_save_dir, 'marginal_species_log_lik.csv')) pd.Series(neg_log_loss_results.mean()).to_csv( join(cur_save_dir, 'neg_log_loss_mean.csv')) fold_liks[i] = np.mean(log_liks) np.savez(join(cur_save_dir, 'cv_results'), site_log_liks=log_liks, cur_train_X=cur_X, cur_train_y=cur_y, cur_test_X=cur_test_x, cur_test_y=cur_test_y, train_ind=cur_train_ind, test_ind=cur_test_ind) pd.Series({ 'mean_lik': np.mean(fold_liks) }).to_csv(join(save_dir, 'mean_lik.csv')) pd.Series(fold_liks, index=[f'fold_{i+1}' for i in range(n_folds) ]).to_csv(join(save_dir, 'fold_liks.csv')) return np.mean(fold_liks), np.std(fold_liks) / np.sqrt(len(fold_liks))
def dcv_clf(X, y, model, param_grid, niter):
    """
    Double cross validation (classification)

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        X training+test data
    y : array-like, shape = [n_samples]
        y training+test data
    model : estimator object
        This is assumed to implement the scikit-learn estimator interface.
    param_grid : dict or list of dictionaries
        Dictionary with parameter names (string) as keys and lists of
        parameter settings to try as values, or a list of such dictionaries,
        in which case the grids spanned by each dictionary in the list are
        explored.
    niter : int
        number of DCV iterations

    Returns
    -------
    None
    """
    # parameters
    ns_in = 3  # n_splits for inner loop
    ns_ou = 3  # n_splits for outer loop

    scores = np.zeros((niter, 5))
    for iiter in range(niter):
        ypreds = np.array([])  # predicted y collected over the outer loop
        ytests = np.array([])  # y_test collected over the outer loop
        kf_ou = KFold(n_splits=ns_ou, shuffle=True)

        # [start] outer loop for estimating the generalization error
        for train_index, test_index in kf_ou.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # [start] inner loop CV for hyperparameter optimization
            kf_in = KFold(n_splits=ns_in, shuffle=True)
            gscv = GridSearchCV(model, param_grid, cv=kf_in)
            gscv.fit(X_train, y_train)
            # [end] inner loop CV for hyperparameter optimization

            # test of the generalization error
            ypred = gscv.predict(X_test)
            ypreds = np.append(ypreds, ypred)
            ytests = np.append(ytests, y_test)
        # [end] outer loop for estimating the generalization error

        tn, fp, fn, tp = confusion_matrix(ytests, ypreds).ravel()
        acc = accuracy_score(ytests, ypreds)
        scores[iiter, :] = np.array([tp, fp, fn, tn, acc])

    means, stds = np.mean(scores, axis=0), np.std(scores, axis=0)
    print()
    print('Double Cross Validation')
    print('In {:} iterations, average +/- standard deviation'.format(niter))
    print('TP  DCV: {:.3f} (+/-{:.3f})'.format(means[0], stds[0]))
    print('FP  DCV: {:.3f} (+/-{:.3f})'.format(means[1], stds[1]))
    print('FN  DCV: {:.3f} (+/-{:.3f})'.format(means[2], stds[2]))
    print('TN  DCV: {:.3f} (+/-{:.3f})'.format(means[3], stds[3]))
    print('Acc. DCV: {:.3f} (+/-{:.3f})'.format(means[4], stds[4]))
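# The same nested scheme can also be written with cross_val_score wrapping a
# GridSearchCV estimator; a sketch on toy data, assuming an SVC and a small C
# grid (both illustrative, not taken from the function above).
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
inner = KFold(n_splits=3, shuffle=True, random_state=0)
outer = KFold(n_splits=3, shuffle=True, random_state=1)
gscv = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=inner)
# each outer fold refits the whole grid search on its training split
nested_scores = cross_val_score(gscv, X_demo, y_demo, cv=outer)
print(nested_scores.mean(), nested_scores.std())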
#     month_dummies = pd.get_dummies(train_test_features['month'], prefix='month', prefix_sep='_')
#     if 'phase' in select_list:
#         phase_dummies = pd.get_dummies(train_test_features['phase'], prefix='phase', prefix_sep='_')
#         n_X_train_test_mer = pd.concat([n_X_train_test_pd, chargemode_dummies, hour_dummies, week_dummies, month_dummies, phase_dummies], axis=1)
#         n_X_train_test_mer.drop(['charge_mode', 'hour', 'week', 'month', 'phase'], axis=1, inplace=True)
#     else:
#         n_X_train_test_mer = pd.concat([n_X_train_test_pd, chargemode_dummies, hour_dummies, week_dummies, month_dummies], axis=1)
#         n_X_train_test_mer.drop(['charge_mode', 'hour', 'week', 'month'], axis=1, inplace=True)
    n_testB = n_X_train_test_mer.tail(selected_testB_features.shape[0])
    n_X_train = n_X_train_test_mer.drop(n_testB.index.tolist())
    return n_X_train, n_y_train, n_testB, y_scaler

ram_num = 5
kfolds = KFold(n_splits=10, shuffle=True, random_state=ram_num)

def cv_rmse(model, train, y_train):
    rmse = np.sqrt(-cross_val_score(model, train, y_train,
                                    scoring="neg_mean_squared_error",
                                    cv=kfolds))
    return rmse

def ridge_selector(k, X, y):
    model = make_pipeline(RidgeCV(alphas=[k], cv=kfolds)).fit(X, y)
    rmse = cv_rmse(model, X, y).mean()
    return rmse

def lasso_selector(k, X, y):
    # max_iter must be an int (1e7 is a float)
    model = make_pipeline(LassoCV(max_iter=int(1e7), alphas=[k], cv=kfolds)).fit(X, y)
    rmse = cv_rmse(model, X, y).mean()
    return rmse
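# A quick usage sketch for cv_rmse above, reusing the kfolds splitter it closes
# over; the synthetic data and the Ridge alpha of 10 are arbitrary
# illustrations, not tuned values from the original code.
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X_demo, y_demo = make_regression(n_samples=300, n_features=20, noise=5.0,
                                 random_state=5)
print(cv_rmse(Ridge(alpha=10), X_demo, y_demo).mean())  # mean RMSE over 10 folds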
if __name__ == '__main__':
    train_samples = pd.read_csv('train.csv')['fname'].values
    # f_tr, f_val = train_test_split(train_samples, test_size=0.1)
    import os
    from torch.utils.data import DataLoader
    from sklearn.model_selection import train_test_split, KFold

    with ignore(OSError):  # suppress the error if the directory already exists
        os.mkdir('checkpoints/naive')
    save_paths = [f'naive/resnet50_r{i:02d}' for i in range(10)]  # zero-padded, not space-padded
    round_id = 0
    for ix_tr, ix_val in KFold(n_splits=10).split(train_samples):
        f_tr, f_val = train_samples[ix_tr], train_samples[ix_val]
        with timer('load data'):
            train_loader = DataLoader(DSet(f_tr), batch_size=128, shuffle=True, **kwargs)
            val_loader = DataLoader(DSet(f_val), batch_size=128, **kwargs)
        train(build_resnet50(), train_loader, val_loader, 300, save_paths[round_id])
        round_id += 1

    with timer('load test data'):
        sub = pd.read_csv('sample_submission.csv')
        test_loader = DataLoader(DSet(sub['fname'].values, 'test'),
num_folds = 10
scoring = "neg_mean_squared_error"
seed = 51

# Spot-check algorithms
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))

results = []
names = []
for name, model in models:
    # shuffle=True is required when passing random_state to KFold
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# In[16]:

#fig = plt.figure()
#fig.suptitle('Algorithm Comparison')
#ax = fig.add_subplot(111)
#plt.boxplot(results)
#ax.set_xticklabels(names)
    if k == 1:
        all_X = X
    else:
        all_X = np.hstack((all_X, X))
    print('...............................................................................')

# output the spectrum profile
np.savetxt(featurename + 'Feature1.txt', all_X)

# prediction based on spectrum profile
print('###############################################################################')
print('The prediction based on ' + featurename + ', beginning')
tic = time.perf_counter()  # time.clock() was removed in Python 3.8
clf = XGBClassifier(learning_rate=0.05, n_estimators=20, max_depth=4, objective='binary:logistic')
folds = KFold(n_splits=10, shuffle=True, random_state=1)  # shuffle/random_state are keyword-only
getshapvalue(all_X, y, clf)
auc_score, accuracy, sensitivity, specificity, MCC = getCrossValidation(all_X, y, clf, folds)
print('results for feature:' + featurename)
print('****AUC score:%.3f, accuracy:%.3f, sensitivity:%.3f, specificity:%.3f, MCC:%.3f****' % (
    auc_score, accuracy, sensitivity, specificity, MCC))
toc = time.perf_counter()
print('The prediction time: %.3f minutes' % ((toc - tic) / 60.0))
print('###############################################################################\n')

# output result
results = DataFrame({'Feature': [featurename],
                     'AUC': [auc_score],
                     'ACC': [accuracy],
style.use("ggplot")
from sklearn import svm
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, f1_score, matthews_corrcoef

# loading breast cancer data from the UCI ML Repo
url = "https://goo.gl/AP7kzV"
raw_data = urllib.request.urlopen(url)
dataset = np.genfromtxt(raw_data, delimiter=",")

# columns 0-9 (sample id + 9 features) are used as predictors; column 10 is the class
X = dataset[:, 0:10]
y = dataset[:, 10]

# split data into 5 chunks for K-fold cross validation
k_fold = KFold(n_splits=5)

# the chunk below prints which indices are used for testing and training
#for train_indices, test_indices in k_fold.split(X):
#    print('Train: %s | test: %s' % (train_indices, test_indices))

svc = svm.SVC(C=1, kernel='linear')

# replace all nan values with 0
X[np.isnan(X)] = 0

# loop to do 5-fold cross validation, storing scores in an array
# scores = [svc.fit(X[train], y[train]).score(X[test], y[test])
#           for train, test in k_fold.split(X)]
# print(scores)
i = 1
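# The imports above pull in f1_score and matthews_corrcoef; a sketch of scoring
# the same kind of linear SVC with them across folds via cross_val_score and
# make_scorer, using sklearn's bundled breast cancer data so it runs offline.
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import KFold, cross_val_score

X_demo, y_demo = load_breast_cancer(return_X_y=True)
svc = svm.SVC(C=1, kernel='linear')
cv = KFold(n_splits=5)
print(cross_val_score(svc, X_demo, y_demo, cv=cv, scoring='f1').mean())
print(cross_val_score(svc, X_demo, y_demo, cv=cv,
                      scoring=make_scorer(matthews_corrcoef)).mean())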
X_all = np.concatenate((X_train, X_dev))
y_all = np.concatenate((y_train, y_dev))
#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

seed = 7
np.random.seed(seed)
idx = np.random.permutation(len(X_all))
X_all = X_all[idx]
y_all = y_all[idx]

from sklearn.model_selection import KFold
#test_fold = [-1]*len(X_train)+[1]*len(X_dev)
kf = KFold(n_splits=10)

# create model
# define the grid search parameters
conv_layers = [3]
conv_units = [256]
lr = [5e-4]
drop = [0.3]

i = 0
for t, v in kf.split(X_all):
    i += 1
    X_train, y_train, X_dev, y_dev = X_all[t], y_all[t], X_all[v], y_all[v]
    #X_train_em, X_dev_em = X_all_em[t], X_all_em[v]
    model = create_model(conv_layers=4, conv_units=256, lr=5e-4, drop=0.3)
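# The lists above sketch a small hyperparameter grid, but the loop hardcodes one
# setting; a sketch of walking the full grid inside each fold with
# itertools.product (create_model is the snippet's own builder, assumed to
# accept these keyword arguments as in the call above).
from itertools import product

for conv_l, conv_u, lr_val, drop_val in product(conv_layers, conv_units, lr, drop):
    # build a fresh model for every hyperparameter combination
    model = create_model(conv_layers=conv_l, conv_units=conv_u,
                         lr=lr_val, drop=drop_val)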
print("Size of x:",len(x)," Size of y:",len(radical)," Positive : ",radicalOne) X = [] for t in x: t = re.sub(r'[^\w\s]',' ',t) t = ' '.join([word for word in t.split() if word != " "]) t = t.lower() t = ' '.join([word for word in t.split() if word not in cachedStopWords]) X.append(t) with timer("making Tokeniser"): print("Type of X:",type(X)) Features = X Radical = radical kf = KFold(n_splits=10) iteration = 0 gRadicalAccu = 0 gPrecision = [0,0] gRecall = [0,0] gFScore = [0,0] vocabSize = len(allEnglishWords) tokenizer = Tokenizer(num_words= vocabSize) tokenised = tokenizer.fit_on_texts(allEnglishWords) gPositivePredRadical = 0 with timer("Cross Validation"):
def __init__(self, n_splits=2, shuffle=False): self.n_splits = n_splits if self.n_splits > 1: self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle)
return None, loss, hold_dict, out_prob global_step = tf.Variable(0.0, trainable=False, name="global_step") train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize( loss, global_step) tf.summary.scalar("loss", loss) return train_op, loss, hold_dict, out_prob if __name__ == "__main__": train_x, train_y, test_x, test_id = get_data() train_y = train_y[:, None] train_x_std, test_x_std, _ = scale_data(train_x, test_x) k_fold = KFold(n_splits=5, shuffle=True, random_state=seed) tf.set_random_seed(seed) with tf.Graph().as_default(): with tf.name_scope("train"): with tf.variable_scope("dnn", reuse=None): train_op, train_loss, train_holder, train_res = dnn_network( keep_prob=keep_prob, is_training=True) with tf.name_scope("test"): with tf.variable_scope("dnn", reuse=True): _, test_loss, test_holder, test_res = dnn_network( keep_prob=keep_prob, is_training=False) saver = tf.train.Saver() summary = tf.summary.merge_all()
if img.shape[0] == 1: img = img[0][0] if img.shape[0] == 3: img = np.moveaxis(img, 0, 2) X_images_.append(img) shape = (img.shape[1], img.shape[0]) mask = enc2mask(data.loc[data.id == data.id[i], "encoding"].values[0], shape) Masks_.append(mask) image_dims_.append(img.shape) print(img.shape) del img, mask, shape gc.collect() indexes = [i for i in range(15)] kf = KFold(n_splits=5, shuffle=True, random_state=2021) sum_masks = [] k = 0 for fold, (train_index, val_index_) in enumerate(kf.split(indexes)): print('Train fold ', fold, 'val indexes = ', val_index_) with open(f"../{model_name}/{model_name}.log", 'a+') as logger: logger.write(f'fold {fold} val index {val_index_}\n') masks = train_fold(val_index_, X_images_, Masks_, image_dims_, fold, train=train, predict=predict) if len(sum_masks) == 0: sum_masks = masks else: for i in range(len(sum_masks)): if predict: sum_masks[i] = sum_masks[i] + masks[i] k += 1 del masks
def rbf_svc_fs98(finC_x, finC_y, finT_x, finT_y):
    dataC_x_train, dataC_x_test, dataC_y_train, dataC_y_test = train_test_split(
        finC_x, finC_y, test_size=0.1)
    dataT_x_train, dataT_x_test, dataT_y_train, dataT_y_test = train_test_split(
        finT_x, finT_y, test_size=0.1)

    # n_features_to_select and step are keyword-only in recent scikit-learn
    estimator_c1 = LinearSVC()
    selector_c1 = RFE(estimator_c1, n_features_to_select=10000, step=0.1)
    new_x_c1 = selector_c1.fit_transform(dataC_x_train, np.ravel(dataC_y_train))
    estimator_t1 = LinearSVC()
    selector_t1 = RFE(estimator_t1, n_features_to_select=10000, step=0.1)
    new_x_t1 = selector_t1.fit_transform(dataT_x_train, np.ravel(dataT_y_train))
    new_x = pd.concat([pd.DataFrame(new_x_c1), pd.DataFrame(new_x_t1)], axis=1)

    best_acc = []
    K = 10
    kf = KFold(n_splits=K)
    for num in features_num:
        print('selected num of features: ', num)
        estimator_c = LinearSVC()
        selector_c = RFE(estimator_c, n_features_to_select=num, step=0.1)
        new_x_c = selector_c.fit_transform(new_x_c1, np.ravel(dataC_y_train))
        estimator_t = LinearSVC()
        selector_t = RFE(estimator_t, n_features_to_select=num, step=0.1)
        new_x_t = selector_t.fit_transform(new_x_t1, np.ravel(dataT_y_train))
        estimator_ = LinearSVC()
        selector_ = RFE(estimator_, n_features_to_select=num, step=0.1)
        new_x_ = selector_.fit_transform(new_x, np.ravel(dataT_y_train))
        new_x = pd.concat([
            pd.DataFrame(new_x_),
            pd.concat([pd.DataFrame(new_x_c), pd.DataFrame(new_x_t)], axis=1)
        ], axis=1)

        accur_total = 0
        accur_list = []
        for train_index, test_index in kf.split(new_x):
            data_x_train, data_x_test = new_x.values[train_index], new_x.values[test_index]
            data_y_train, data_y_test = finC_y.values[train_index], finC_y.values[test_index]
            data_y_train = np.ravel(data_y_train)
            data_y_test = np.ravel(data_y_test)
            accur = np.zeros(num_costs)
            for i in range(num_costs):
                # note: values from cost_range are passed as the RBF gamma here
                model = SVC(gamma=cost_range[i], kernel='rbf')
                model.fit(data_x_train, data_y_train)
                pred = model.predict(data_x_test)
                accur[i] = accuracy_score(data_y_test, pred)
            # keep the best accuracy over the gamma grid for this fold
            accur_total += np.max(accur)
            accur_list.append(np.max(accur))
        cv_accur = accur_total / K
        cv_sd = np.std(accur_list)
        print('Accuracy = ', cv_accur, 'std = ', cv_sd)
        best_acc.append(cv_accur)
    return best_acc
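# Hand-rolling RFE plus a CV loop, as above, can often be replaced by RFECV,
# which picks the feature count by cross-validation in one fit; a sketch on
# synthetic data (the grid sizes here are illustrative only).
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=200, n_features=50,
                                     n_informative=10, random_state=0)
selector = RFECV(LinearSVC(), step=0.1, cv=KFold(n_splits=10))
selector.fit(X_demo, y_demo)
print(selector.n_features_)  # number of features chosen by cross-validation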