def test_repeated_kfold_determinstic_split():
    """RepeatedKFold with a fixed random_state yields the same splits on every call."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=258173307)
    # Expected (train, test) pairs, in the order split() yields them.
    expected = [
        ([2, 4], [0, 1, 3]),
        ([0, 1, 3], [2, 4]),
        ([0, 1], [2, 3, 4]),
        ([2, 3, 4], [0, 1]),
    ]
    # split should produce same and deterministic splits on each call
    for _ in range(3):
        splits = rkf.split(X)
        for exp_train, exp_test in expected:
            train, test = next(splits)
            assert_array_equal(train, exp_train)
            assert_array_equal(test, exp_test)
        # 2 splits x 2 repeats == 4 folds: the generator is exhausted.
        assert_raises(StopIteration, next, splits)
kf = RepeatedKFold(n_splits=num_fold, n_repeats=1, random_state=666) # Define a loop for plotting figures. max_samples_batch = 200 batch_size = 1 # Shuffle the dataset. combined = list(zip(dataset, strings)) random.seed(666) random.shuffle(combined) dataset[:], strings[:] = zip(*combined) pool = multiprocessing.Pool(os.cpu_count()) args = [] # print(os.cpu_count()) # It counts for logical processors instead of physical cores. for train_idx, test_idx in kf.split(dataset): tmp_args = { 'train_idx': train_idx, 'test_idx': test_idx, 'dataset': dataset, 'strings': strings, 'max_samples_batch': max_samples_batch, 'batch_size': batch_size, } args.append(tmp_args) results = pool.map(cv_edit_active_learn, args) # print(len(results)) # print(len(results[0])) phrase_acc = [results[i][0] for i in range(num_fold)] out_acc = [results[i][1] for i in range(num_fold)] # print(len(phrase_acc))
# Tail of a batching generator: emit the full batch, then reset the buffers.
yield filenames, images
filenames = []
images = np.zeros(self.batch_shape)
idx = 0
# Emit any final partial batch.
# NOTE(review): idx is set to 0 immediately above, so this condition can
# never be true here — structure may have been lost; confirm against the
# original generator.
if idx > 0:
    yield filenames, images
# k-fold cross-validation
from sklearn.model_selection import RepeatedKFold
splitter = RepeatedKFold(n_splits=3, n_repeats=1, random_state=0)
partitions = []
# One {train, validation} dict of sample Ids per fold.
for train_idx, test_idx in splitter.split(train_labels.index.values):
    partition = {}
    partition["train"] = train_labels.Id.values[train_idx]
    partition["validation"] = train_labels.Id.values[test_idx]
    partitions.append(partition)
# Define the CNN parameters
class ModelParameter:
    # NOTE(review): the constructor signature is truncated in this view.
    def __init__(self, basepath, num_classes=28, image_rows=512, image_cols=512, batch_size=200, n_channels=1,
# Standardize the features.
x = preprocessing.scale(x)
# Hold out 20% of the samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True)

tt = time.time()
forest = RandomForestClassifier(criterion='entropy')
# Train with repeated k-fold cross-validation (5 folds, 10 repeats).
kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)
kf_score = []
for fit_idx, val_idx in kf.split(x_train):
    forest.fit(x_train[fit_idx], y_train[fit_idx])
    kf_score.append(forest.score(x_train[val_idx], y_train[val_idx]))
print('time: {:.5f}s'.format(time.time() - tt))

tt = time.time()
# Score on the held-out test set (uses the forest from the last CV fit).
accuracy_score = forest.score(x_test, y_test)
print('time: {:.5f}s'.format(time.time() - tt))
print('验证集accuracy_score: {:.4f}'.format(np.mean(kf_score)))
print("测试集accuracy_score: {:.4f}".format(accuracy_score))
#test #X = np.array(sorted(set(subdf.index.date))) #rkf = RepeatedKFold(n_splits=10, n_repeats=1, random_state=12883823) #for train_index,test_index in rkf.split(X): # tmp = subdf[pd.to_datetime(subdf.index.date).isin(X[test_index])] # print(tmp['Weekday'].value_counts().sort_index()) all_date = np.array(sorted(set(dfPm.index.date))) rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=12883823) a = 0 dict_of_2019=dict() dict_of_max2019=dict() RF1_feature_table = pd.DataFrame(columns = dfPm.keys()[2:]) RF2_feature_table = pd.DataFrame(columns = dfPm.keys()[2:]) for train_index, test_index in rkf.split(all_date): train_datetime_index = pd.to_datetime(dfPm.index.date).isin(all_date[train_index]) test_datetime_index = pd.to_datetime(dfPm.index.date).isin(all_date[test_index]) X_train, X_test = X[train_datetime_index], X[test_datetime_index] y_train, y_test = y[train_datetime_index], y[test_datetime_index] # feature extraction model_RF = RandomForestRegressor(n_estimators=100, max_depth=7,random_state=137) model_RF = model_RF.fit(X_train, y_train) new_train = pd.DataFrame(X_train) new_train['obs_o3'] = y_train new_train['pred_o3'] = model_RF.predict(X_train) new_train['diff_o3'] = abs(new_train['pred_o3']-new_train['obs_o3']) new_train=new_train[new_train['diff_o3']>5] X_train2 = np.array(new_train.drop(['obs_o3','pred_o3','diff_o3'],axis=1))
# Out-of-fold xgb predictions for the validation indices of the current fold.
oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), ntree_limit=clf.best_ntree_limit)
# Average the test-set predictions over all folds.
predictions_xgb += clf.predict(
    xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, y)))
# Stack the lgb and xgb results (second-level model inputs).
train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
oof_stack1 = np.zeros(train_stack.shape[0])
predictions1 = np.zeros(test_stack.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, y)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], y.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], y.iloc[val_idx].values
    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)
    oof_stack1[val_idx] = clf_3.predict(val_data)
    # 5 splits x 2 repeats = 10 folds, hence the /10 averaging.
    predictions1 += clf_3.predict(test_stack) / 10
print("CV score: {:<8.8f}".format(mean_squared_error(y.values, oof_stack1)))
sub_df = pd.DataFrame()
# First column of the submission: sample ids from the test CSV (skip header row).
sub_df[0] = pd.read_csv('./jinnan_round1_testB_20190121.csv', header=None)[0][1:]
def regression_cross_validate_perseverance(data):
    """Cross-validated logistic-regression analysis of choice perseverance.

    For every session in ``data['DM'][0]`` with at least 12 block
    transitions, builds trial-history predictors (previous reward,
    previous choice, fraction of the way through the current block and
    its interactions) and compares cross-validated logistic-regression
    fits with and without the time-in-block regressors.

    Parameters
    ----------
    data : dict
        Must contain key 'DM' whose first element is an iterable of
        per-session design matrices; column 1 = choice, column 2 =
        reward, column 4 = block identity (as used below).

    Side effects
    ------------
    Prints a paired t-test comparing variance explained with and
    without the time-in-block regressors.

    Bug fixes vs. the previous version:
    * ``last_choice`` was built from ``choices[r - 1]`` (a stale index
      left over from the reward loop); it now uses the choice index ``c``.
    * ``fraction_choice`` was computed from ``last_reward``; it now uses
      ``last_choice``, as its name and the predictor label imply.
    * ``ind_block[11]`` is indexed, so the session guard requires
      ``len(ind_block) >= 12`` (was ``>= 11``, an IndexError risk).
    * single-column design matrices are sliced 2-D so sklearn accepts them.
    """
    dm = data['DM'][0]
    ccs = []      # fit quality, model without time-in-block interactions
    ccs_ch = []   # fit quality, choice-interaction model
    ccs_rew = []  # fit quality, remaining-columns model
    for s, sess in enumerate(dm):
        DM = dm[s]
        choices = DM[:, 1]
        reward = DM[:, 2]
        block = DM[:, 4]
        block_df = np.diff(block)
        ind_block = np.where(block_df != 0)[0]
        if len(ind_block) >= 12:  # index 11 is used below (bug fix: was >= 11)
            # Trials since the last block transition.  Note the first
            # comparison (block[-1] vs block[0]) wraps around, as before.
            # (Loop variable renamed: it previously shadowed the session
            # index ``s``.)
            trials_since_block = []
            t = 0
            for st in range(len(block)):
                if block[st - 1] != block[st]:
                    t = 0
                else:
                    t += 1
                trials_since_block.append(t)
            #block_totals_ind = (np.where(np.asarray(ind_block) == 1)[0]-1)[1:]
            block_totals_ind = ind_block
            block_totals = np.diff(block_totals_ind) - 1
            trials_since_block = trials_since_block[:ind_block[11]]

            # Fraction of the way through the current block; this loop
            # replaces an 12-branch elif chain with identical conditions.
            # NOTE(review): the first branch divides by an *index*
            # (block_totals_ind[0]) rather than a block length — kept as
            # in the original; confirm intent.
            fraction_list = []
            for t, trial in enumerate(trials_since_block):
                if t <= block_totals_ind[0]:
                    fraction_list.append(trial / block_totals_ind[0])
                else:
                    for k in range(10):
                        if block_totals_ind[k] < t <= block_totals_ind[k + 1]:
                            fraction_list.append(trial / block_totals[k])
                            break
                    else:
                        if block_totals_ind[10] < t <= len(trials_since_block):
                            fraction_list.append(trial / trials_since_block[-1])

            choices = choices[:ind_block[11]]
            reward = reward[:ind_block[11]]

            # Previous-trial reward (first trial has no predecessor).
            last_reward = []
            for r, rew in enumerate(reward):
                if r > 0:
                    if reward[r - 1] == 1:
                        last_reward.append(1)
                    elif reward[r - 1] == 0:
                        last_reward.append(0)

            # Previous-trial choice (bug fix: index with c, not r).
            last_choice = []
            for c, ch in enumerate(choices):
                if c > 0:
                    if choices[c - 1] == 1:
                        last_choice.append(1)
                    elif choices[c - 1] == 0:
                        last_choice.append(0)

            fraction_list = np.asarray(fraction_list)[1:]
            last_reward = np.asarray(last_reward)
            last_choice = np.asarray(last_choice)
            fraction_reward = last_reward * fraction_list
            fraction_choice = last_choice * fraction_list  # bug fix: was last_reward
            trials = len(fraction_choice)
            predictors_all = OrderedDict([
                ('Last Reward', last_reward),
                ('Last Choice', last_choice),
                ('Block Fraction', fraction_list),
                ('Block Fraction x Choice', fraction_choice),
                ('Block Fraction x Reward', fraction_reward)
            ])
            X = np.vstack(list(predictors_all.values())).T[:trials, :].astype(float)
            y = choices[1:]
            kf = RepeatedKFold(n_splits=5, n_repeats=2,
                               random_state=99)  # initialise repeated K-Fold
            ccx = []
            ccx_ch = []
            ccx_rew = []  # containers for cross-validated fits
            for train_ix, test_ix in kf.split(y):
                y_train = y[train_ix]
                y_test = y[test_ix]
                # Column slices kept 2-D so sklearn's fit accepts them
                # (bug fix: X[:, 0][ix] is 1-D and is rejected by sklearn).
                x_train_no_choice_int = X[train_ix, 0:1]
                x_test_no_choice_int = X[test_ix, 0:1]
                x_train_choice_int = X[train_ix, 1:2]
                x_test_choice_int = X[test_ix, 1:2]
                # NOTE(review): column 2 is 'Block Fraction', not a reward
                # interaction — kept as in the original; confirm intent.
                x_train_rew_int = X[train_ix, 2:3]
                x_test_rew_int = X[test_ix, 2:3]
                # Fit regularised logistic regressions.  Ideally a nested
                # K-fold would select the regularisation hyper-parameter.
                linR = lm.LogisticRegression(fit_intercept=True)
                ft = linR.fit(x_train_no_choice_int, y_train)
                ccx.append(
                    np.corrcoef(ft.predict(x_test_no_choice_int), y_test)[0, 1])
                linR = lm.LogisticRegression(fit_intercept=True)
                ft = linR.fit(x_train_choice_int, y_train)
                ccx_ch.append(
                    np.corrcoef(ft.predict(x_test_choice_int), y_test)[0, 1])
                linR = lm.LogisticRegression(fit_intercept=True)
                ft = linR.fit(x_train_rew_int, y_train)
                ccx_rew.append(
                    np.corrcoef(ft.predict(x_test_rew_int), y_test)[0, 1])
            ccs.append(np.nanmean(ccx))
            ccs_ch.append(np.nanmean(ccx_ch))
            ccs_rew.append(np.nanmean(ccx_rew))
    # Squared correlations ~ variance explained; paired t-test across sessions.
    c1 = np.array(ccs)**2
    c2 = np.array(ccs_ch)**2
    ixs = np.logical_and.reduce([np.isfinite(c1), np.isfinite(c2)])
    t, p = stt.ttest_rel(c1[ixs], c2[ixs])
    print(
        'Variance explained \nwithout time in block: {:.5f}\nwith time in block: {:.5f}'
        .format(np.nanmean(c1), np.nanmean(c2)))
    print('t:{:.3f}\np:{:.3e}'.format(t, p))
def yacht():
    """Load the yacht hydrodynamics data set with a repeated-CV splitter.

    Reads ``{datasets_folder}/yacht.txt`` (whitespace-separated, no
    header); all columns but the last are features, the last is the target.

    Returns:
        tuple: ``(X, y, splits)`` where ``splits`` is the fold iterator
        of a 10-fold cross-validation repeated 4 times.
    """
    # Bug fix: the separator is a regex — use a raw string so '\s' is not
    # an invalid escape sequence (a warning on modern Python).
    df = pd.read_table(f'{datasets_folder}/yacht.txt', sep=r'\s+', header=None)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    # NOTE(review): no random_state is set, so folds differ between runs.
    cv = RepeatedKFold(n_splits=10, n_repeats=4)
    return X, y, cv.split(X)
from sklearn.model_selection import RepeatedKFold
from sklearn import svm, metrics
from scipy import stats
import pandas as pd
import numpy
import training

# Regenerate the HOG feature CSVs, then train/evaluate an SVM on them.
training.createdata()
dataset = pd.read_csv('dir/hog.csv')
# Drop outlier rows: keep rows whose every feature has |z-score| < 5.04.
dataset = dataset[(numpy.abs(stats.zscore(dataset)) < 5.04).all(axis=1)]
random_state = 12883823
rkf = RepeatedKFold(n_splits=5, n_repeats=30, random_state=random_state)
# NOTE(review): only the first of the 150 folds is consumed here.
result = next(rkf.split(dataset), None)
data_train = dataset.iloc[result[0]]
data_test = dataset.iloc[result[1]]
# NOTE(review): iloc[:, [0, 3780]] selects exactly two columns — if the
# whole HOG vector was intended this should be a 0:3780 slice; confirm.
data = data_train.iloc[:, [0, 3780]]
target = data_train.iloc[:, [3781]]
classifier = svm.SVC(C=1, gamma=0.1)
classifier.fit(data, target)
dataset_teste = pd.read_csv('dir/test_hog.csv')
predicted = classifier.predict(dataset_teste.iloc[:, [0, 3780]])
print(metrics.classification_report(dataset_teste.iloc[:, [3781]], predicted))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(dataset_teste.iloc[:, [3781]], predicted))
# NOTE(review): this call is truncated in the visible source.
print(
    classifier.score(dataset_teste.iloc[:, [0, 3780]],
def analyze_dataset(d_seq, sample_diam, flag):
    # Exploratory Python-2 analysis over features X / response y loaded below.
    # NOTE(review): y and X are used in the mask lines *before* the np.load
    # assignments — as ordered here this raises UnboundLocalError; the load
    # lines were presumably meant to come first.  Confirm against history.
    # remove all cases with no tumor cells in the sampled tile
    mask = y == -1
    y = y[~mask]
    X = X[~mask, :]
    X = np.load(STORE_DIR + "data_x.npy")
    y = np.load(STORE_DIR + "data_y.npy")
    print X.shape, y.shape
    # set aside holdout set here
    # feature_names = ["".join(["f", str(x)]) for x in range(X.shape[1])]
    # feature_names.append('y')
    # feature_names
    # tmp = pd.DataFrame(np.hstack((X, y.reshape(-1,1))))
    # tmp.columns = feature_names
    # tmp.to_csv('local_data.csv', index=False)
    # add logit transformed response variable
    dtr = learning.VectorTransform(y)
    yt = dtr.zero_one_scale().apply('logit')
    plt.hist(yt)
    plt.hist(y)
    plt.hist(np.sqrt(y))
    plt.scatter(X[:, 12], y)
    # for i in range(30):
    # # p = int(i / 5)
    # r = i % 5
    # plt.hist(X[:,i])
    # # plt.title(phens[p] + '_' + str(diams[r]))
    # plt.show()
    # from sklearn.feature_selection import mutual_info_regression
    # y_noise = y + np.random.normal(scale=0.01, size=(len(y)))
    # for i in range(6):
    # print "Phenotype ", i
    # mi = mutual_info_regression(X[:,i].reshape(-1, 1), y.reshape(-1, 1))
    # print "MI: ", mi
    # # display.scatter_hist(X[:,i], y)
    # # plt.scatter(X[:,i], y_noise, s=0.3)
    # print "Corr: ", helper.metrics.corr(X[:, i], y)
    # # plt.show()
    # X_train, X_test, y_train, y_test = train_test_split(X, discrete_response,
    # test_size=0.4)
    # from sklearn.linear_model import LassoCV
    # # from sklearn.neural_network import MLPClassifier
    # # from sklearn.ensemble import AdaBoostClassifier
    # # from sklearn.tree import DecisionTreeClassifier
    # # rf = MLPClassifier(solver='lbfgs', alpha=1e-5,
    # # hidden_layer_sizes=(300, 2))
    # from sklearn.ensemble import ExtraTreesClassifier
    # from sklearn.ensemble import RandomForestRegressor
    # from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor
    # rf = ExtraTreesClassifier(n_estimators=500, class_weight='balanced', oob_score=True, bootstrap=True)
    # rf = RandomForestRegressor(n_estimators=500, oob_score=True, bootstrap=True)
    # rf = LassoCV(cv=10, normalize=False)
    #### fit machine learning models ####
    from sklearn.linear_model import LassoCV
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import RidgeCV
    from sklearn.linear_model import Ridge
    from sklearn.linear_model import ElasticNetCV
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVR
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import MaxAbsScaler
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RepeatedKFold, GroupKFold
    ################################################################
    X_ = X
    y_ = y
    # Polynomial expansion of (X+1) and its reciprocal, then sqrt-compressed.
    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(2)
    X_p = np.hstack(((X + 1), 1 / (X + 1)))
    X_p = poly.fit_transform(X_p)
    X_p = np.sqrt(X_p)
    X_p.shape
    X_ = X_p
    plt.scatter(X_[:, 4], y_)
    rep_scores = []
    # NOTE(review): the RidgeCV assignment is immediately overwritten — only
    # the random forest is actually evaluated.
    estimator = RidgeCV()
    estimator = RandomForestRegressor(n_estimators=50)
    cv = RepeatedKFold(n_splits=10, n_repeats=1)
    out_sample = {'pred': [], 'target': []}
    for train, test in cv.split(X_, y_):
        X_train = X_[train]
        X_test = X_[test]
        y_train = y_[train]
        y_test = y_[test]
        # X_train = np.sqrt(X_train + 1)
        # X_test = np.sqrt(X_test + 1)
        # Scale on the training fold only; apply the same transform to test.
        scale = StandardScaler()
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)
        # X_train = pca.fit_transform(X_train)
        # X_test = pca.transform(X_test)
        preds = estimator.fit(X_train, y_train).predict(X_test)
        # preds = dtr.undo(preds)
        # y_test = dtr.undo(y_test)
        rep_scores.append(metrics.rmse(preds, y_test))
        # rep_scores.append(estimator.score(X_test, y_test))
        out_sample['pred'].extend(preds)
        out_sample['target'].extend(y_test)
    print np.mean(rep_scores), np.std(rep_scores)
    plt.scatter(preds, y_test)
    # dict elements from list to array
    for key, value in out_sample.iteritems():
        out_sample[key] = np.array(value)
    metrics.rmse(out_sample['pred'], out_sample['target'])
    fig = plt.scatter(out_sample['pred'], out_sample['target'])
    np.sqrt(np.mean(out_sample['target']**2))
    y_test
    plt.hist(yt)
    ################################################################
    # # rf = AdaBoostClassifier(n_estimators=500)
    # # rf = LogisticRegression(penalty='l2')
    # estimator = LogisticRegression(max_iter=1000)
    # estimator = RandomForestClassifier(n_estimators=500, class_weight='balanced_subsample', oob_score=True)
    estimator = RandomForestRegressor(n_estimators=300, oob_score=True, bootstrap=True)
    learner = learning.TestLearner(task='regress')
    learner.test(estimator, X, y, folds=5, n_classes=2, rf_oob=True)
    print "proportion of 0:", sum(y == 0) / len(y)
    print "proportion of 1:", sum(y == 1) / len(y)
    estimator.fit(X, y)
    print estimator.feature_importances_
    # tmp = estimator.feature_importances_[::-1].reshape(6,5)
    def adjust_missing_feature_importances(importances, flag, n_outer, n_inner):
        # Reshape importances to (phenotypes, rings); for flag 'a' pad the
        # missing inner-ring entries with zeros first.
        rings = n_outer + n_inner
        if flag == 'n':
            return importances.reshape(6, rings)
        if flag == 'a':
            tmp = np.insert(importances, n_outer, n_inner * [0])
            tmp = np.insert(tmp, 0, n_inner * [0])
            return tmp.reshape(7, rings)
    # tmp = estimator.feature_importances_.reshape(6,5)
    from visualize_disc_importance import plot_discs, infer_sign_array
    signs = infer_sign_array(X, y)
    # Rings wider than the sampled tile are "outer"; the rest are "inner".
    n_outer = np.sum(np.array(d_seq) > sample_diam)
    n_inner = len(d_seq) - n_outer
# One evaluation pass per class-imbalance ratio.
ratios = [5.0, 3.3, 2.5]
for ratio in ratios:
    # Per-ratio output files (opened in the subject's directory).
    P_opt_file = open(
        subject + "/P-Pot-Average-Value-" + str(ratio) + ".csv", "w")
    Statistical_file = open(
        subject + "/STATISTICAL-" + str(ratio) + ".csv", "w")
    # Per-metric result accumulators across folds.
    P_opt_betweenness_list = []
    P_opt_pagerank_list = []
    P_opt_degree_list = []
    P_opt_effort_list = []
    P_opt_effortcore_list = []
    kf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=0)
    for train_index, test_index in kf.split(all_data):
        try:
            data_train = all_data[train_index]
            data_test = all_data[test_index]
            label_train = all_label[train_index]
            label_test = all_label[test_index]
            test_class_name = []
            for each_index in test_index:
                test_class_name.append(class_name_list[each_index])
            # Rebalance only when the positive labels are a minority.
            if (label_sum(label_train) > (len(label_train) / 2)):
                print "The training data does not need balance."
                # NOTE(review): this call is truncated in the visible source.
                predprob_auc, predprob, precision, recall, fmeasure, auc = classifier_output(
                    data_train, label_train,
oof_cb = np.array(pd.read_csv('cab_train.csv')['price']) # 读取price,对验证集进行评估 Train_data = pd.read_csv('train_tree.csv', sep=' ') TestA_data = pd.read_csv('text_tree.csv', sep=' ') Y_data = Train_data['price'] train_stack = np.vstack([oof_lgb, oof_cb]).transpose() test_stack = np.vstack([predictions_lgb, predictions_cb]).transpose() folds_stack = RepeatedKFold(n_splits=10, n_repeats=2, random_state=2018) tree_stack = np.zeros(train_stack.shape[0]) predictions = np.zeros(test_stack.shape[0]) # 二层贝叶斯回归stack for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, Y_data)): print("fold {}".format(fold_)) trn_data, trn_y = train_stack[trn_idx], Y_data[trn_idx] val_data, val_y = train_stack[val_idx], Y_data[val_idx] Bayes = linear_model.BayesianRidge() Bayes.fit(trn_data, trn_y) tree_stack[val_idx] = Bayes.predict(val_data) predictions += Bayes.predict(test_stack) / 20 tree_predictions = np.expm1(predictions) tree_stack = np.expm1(tree_stack) tree_point = mean_absolute_error(tree_stack, np.expm1(Y_data)) print("树模型:二层贝叶斯: {:<8.8f}".format(tree_point)) # 导入神经网络模型预测训练集数据,进行三层融合
def main(argv=None):
    """Train and leave-one-out cross-validate an SVM on a libSVM-format file.

    Command-line flags (read from sys.argv):
      -i <file>   input file in sparse libSVM format (required)
      -o/-m <f>   model file name (parsed but unused here)
      -g <float>  RBF/poly gamma           (default 0.1)
      -c <float>  SVM C parameter          (default 1.0)
      -d <int>    polynomial degree        (default 1, parsed but unused)
      -t <0|1|2>  kernel: linear/poly/rbf  (default linear)

    Prints mean recall and precision of the positive class over
    leave-one-out cross-validation.
    """
    # NOTE(review): print_syntax() presumably exits; otherwise execution
    # continues with inputFile=None and open() fails below — confirm.
    if (len(sys.argv) < 2):
        print_syntax()
    inputFile = None
    modelFile = None
    gamma = 0.1
    c = 1.0
    degree = 1
    kernel = "linear"
    param_index = 1
    # Hand-rolled flag parsing; each flag consumes the following token.
    while param_index < len(sys.argv):
        if (sys.argv[param_index] == "-i"):
            param_index = param_index + 1
            inputFile = sys.argv[param_index]
        elif (sys.argv[param_index] == "-o"):
            param_index = param_index + 1
            modelFile = sys.argv[param_index]
        elif (sys.argv[param_index] == "-m"):
            param_index = param_index + 1
            modelFile = sys.argv[param_index]
        elif (sys.argv[param_index] == "-g"):
            param_index = param_index + 1
            gamma = float(sys.argv[param_index])
        elif (sys.argv[param_index] == "-c"):
            param_index = param_index + 1
            c = float(sys.argv[param_index])
        elif (sys.argv[param_index] == "-d"):
            param_index = param_index + 1
            degree = int(sys.argv[param_index])
        elif (sys.argv[param_index] == "-t"):
            param_index = param_index + 1
            if sys.argv[param_index] == "0":
                kernel = "linear"
            elif sys.argv[param_index] == "1":
                kernel = "poly"
            elif sys.argv[param_index] == "2":
                kernel = "rbf"
            else:
                print_syntax()
        else:
            print("Unknown parameter: ", sys.argv[param_index])
            print_syntax()
        param_index = param_index + 1

    # Read the sparse file; the last "index:value" pair of each line holds
    # the largest feature index, which sets the dense matrix width.
    lines = None
    with open(inputFile) as f:
        lines = [line.rstrip() for line in f]
    split_lines = [None] * len(lines)
    max_index = 0
    valid_lines = 0
    for i in range(0, len(lines)):
        fields = lines[i].split(" ")
        split_lines[i] = fields
        if (len(fields) > 1):
            current_maxindex = int(
                fields[len(fields) - 1][:fields[len(fields) - 1].find(":")])
            if (current_maxindex > max_index):
                max_index = current_maxindex
            valid_lines = valid_lines + 1

    # Densify: one row per valid line, label taken from the first field.
    matrix = []
    label = []
    for fields in split_lines:
        if (len(fields) > 1):
            data = [0.0 for x in range(max_index)]
            label.append(int(fields[0]))
            for i in range(1, len(fields)):
                data[int(fields[i][:fields[i].find(":")]) - 1] = float(
                    fields[i][(fields[i].find(":")) + 1:])
            matrix.append(data)

    model = svm.SVC(kernel=kernel, gamma=gamma, C=c)
    # n_splits == number of samples -> leave-one-out cross-validation.
    random_state = 12883824
    rkf = RepeatedKFold(n_splits=len(matrix), n_repeats=1,
                        random_state=random_state)
    pred = [0 for x in range(len(matrix))]
    for train_index, test_index in rkf.split(matrix):
        X_train = [[0 for x in range(max_index)]
                   for y in range(len(matrix) - 1)]
        X_test = [[0 for x in range(max_index)] for y in range(1)]
        label_train = [0 for x in range(len(matrix) - 1)]
        label_test = [0 for x in range(1)]
        y = 0
        for i in train_index:
            for x in range(max_index):
                X_train[y][x] = matrix[i][x]
            label_train[y] = label[i]
            y = y + 1
        y = 0
        for i in test_index:
            for x in range(max_index):
                X_test[y][x] = matrix[i][x]
            label_test[y] = label[i]
            y = y + 1
        model.fit(X_train, label_train)
        res = model.predict(X_test)
        pred[test_index[0]] = res[0]

    # Recall / precision of the positive class over the LOO predictions.
    relevant = 0
    for i in range(len(label)):
        if (label[i] == 1):
            relevant = relevant + 1
    relevant_and_retrieved = 0
    retrieved = 0
    for i in range(len(pred)):
        if (pred[i] == 1):
            retrieved = retrieved + 1
        if ((pred[i] == 1) and (label[i] == 1)):
            relevant_and_retrieved = relevant_and_retrieved + 1
    # Bug fix: guard against division by zero when the data set contains
    # no positive samples (mirrors the existing guard for precision).
    recall = 0
    if (relevant > 0):
        recall = relevant_and_retrieved / relevant
    precision = 0
    if (retrieved > 0):
        precision = relevant_and_retrieved / retrieved
    print("Mean-Recall " + repr(recall))
    print("Mean-Precision " + repr(precision))
resultados = [] # Lista de resultados onde será calculada a acuracia X_treino, X_val, y_treino, y_val = train_test_split(X, y, test_size=0.6) # função train_test_split é usada para dividir nossos dados de maneira padroni- # zada, assim como foi passado 60% para teste e 40% para treino KFold = RepeatedKFold(n_splits=2, n_repeats=10, random_state=10) # função que faz a divisão para encontrar a divisão para treino e validação # (ou teste) para poder encontrar acuracia do modelo, além de repetir essas # divições # laço dedicado para nos dizer aleatoriamente quais linhas devemos usar do # treino e da validação for linhas_treino, linhas_val in KFold.split(X): print("Treino:", linhas_treino[0]) print("Valid:", linhas_val.shape[0]) print() X_treino = X.iloc[linhas_treino] X_val = X.iloc[linhas_val] y_treino = y.iloc[linhas_treino] y_val = y.iloc[linhas_val] print(X_treino.head()) print() Floresta.fit(X_treino, y_treino) # funçao.fit é a função usada para treinar o modelo para chegar em uma previsão
# Set random state here random_state = 0 # Train test split, save 20% of data point to the test set X_train, X_test, y_train, y_test, X_before_train, X_before_test = train_test_split(X, y, X_before_scaling, test_size=0.2, random_state = random_state) # The alpha grid used for plotting path alphas_grid = np.logspace(0, -3, 20) # Cross-validation scheme rkf = RepeatedKFold(n_splits = 10, n_repeats = 10 , random_state =random_state) # Explicitly take out the train/test set X_cv_train, y_cv_train, X_cv_test, y_cv_test = [],[],[],[] for train_index, test_index in rkf.split(X_train): X_cv_train.append(X_train[train_index]) y_cv_train.append(y_train[train_index]) X_cv_test.append(X_train[test_index]) y_cv_test.append(y_train[test_index]) # %% [markdown] # ### Step 5 - Train ML models # %% [markdown] # #### LASSO Regression<a name="lasso"></a> # # %% #%% LASSO regression '''
def cv_baggingDT(self, pu_data, splits=3, repeats=100, bags=100, filename=''):
    """
    Train bagged decision tree base classifiers and do repeated k-fold CV.

    Synthesizability scores (0 = not synthesizable, 1 = already
    synthesized) are generated for an unlabeled sample by averaging
    the scores from the ensemble of decision tree classifiers that
    have not been trained on that sample.

    Args:
        pu_data (json): A file of numeric features describing materials.
            There MUST be a column called "PU_label" where a 1 value
            indicates a synthesized (positive) compound and a 0 value
            indicates an unlabeled compound.
        splits (int): Number of splits in k-fold CV.
        repeats (int): Number of repeated k-fold CV.
        bags (int): Number of bags in bootstrap aggregation.
        filename (string): Save model training results to file with
            filename ending in .json or .pkl.

    Returns:
        pu_stats (dict): Metrics and outputs of PU learning model training.
    """
    print('Start PU Learning.')
    # Preprocess data and set attributes
    df = pd.read_json(pu_data)
    df_P, df_U, X_P, X_U = self._process_pu_data(df)
    self.df_P = df_P
    self.df_U = df_U
    # Split data into training and test splits for k-fold CV.
    # NOTE(review): zipping two independent split() generators relies on
    # both producing the same number of folds; with a fixed random_state
    # the pairing is deterministic but P and U folds are otherwise
    # unrelated.
    kfold = RepeatedKFold(n_splits=splits, n_repeats=repeats, random_state=42)
    # Scores for PU learning (tpr = True Positive Rate)
    scores = []
    tprs = []
    # Predicted synthesis probability of CVed P and U sets
    prob_P = np.ones(shape=(X_P.shape[0], splits * repeats))
    prob_U = -np.ones(shape=(X_U.shape[0], splits * repeats))
    # Feature importance
    feat_rank = np.zeros(shape=(X_P.shape[1], splits * repeats))
    idsp = 0  # index of repeated k splits
    # Loop over P and U training/test samples
    for (ptrain, ptest), (utrain, utest) in zip(kfold.split(X_P),
                                                kfold.split(X_U)):
        # Number of P and U training samples
        N_ptrain = X_P[ptrain].shape[0]
        N_utrain = X_U[utrain].shape[0]
        d = X_P.shape[1]
        # Bootstrap sample size equals the positive training-set size.
        K = N_ptrain
        train_label = np.zeros(shape=(N_ptrain + K, ))
        train_label[:N_ptrain] = 1.0  # Synthesized (positive)
        # Out of bag samples
        n_oob = np.zeros(shape=(N_utrain, ))
        f_oob = np.zeros(shape=(N_utrain, 2))
        # Sums of probabilities of test sets
        f_ptest = np.zeros(shape=(X_P[ptest].shape[0], 2))
        f_utest = np.zeros(shape=(X_U[utest].shape[0], 2))
        # Bootstrap resampling for each bag
        for i in range(bags):
            bootstrap_sample = np.random.choice(np.arange(N_utrain),
                                                replace=True,
                                                size=K)
            # Positive samples and bootstrapped unlabeled samples
            data_bootstrap = np.concatenate(
                (X_P[ptrain], X_U[bootstrap_sample, :]), axis=0)
            # Train decision tree classifier
            model = DecisionTreeClassifier(max_depth=None,
                                           max_features=None,
                                           criterion='gini',
                                           class_weight='balanced')
            model.fit(data_bootstrap, train_label)
            # Index for the oob samples
            idx_oob = sorted(
                set(range(N_utrain)) - set(np.unique(bootstrap_sample)))
            # Transductive learning on oob samples
            f_oob[idx_oob] += model.predict_proba(X_U[utrain][idx_oob])
            n_oob[idx_oob] += 1
            f_ptest += model.predict_proba(X_P[ptest])
            f_utest += model.predict_proba(X_U[utest])
            # NOTE(review): overwritten on every bag — only the last bag's
            # feature importances survive per fold; confirm intent.
            feat_rank[:, idsp] = model.feature_importances_
        # Predicted synthesis probabilities of unlabeled samples
        predict_utrain = f_oob[:, 1] / n_oob
        # Predicted probabilities for P and U test sets
        predict_ptest = f_ptest[:, 1] / bags
        predict_utest = f_utest[:, 1] / bags
        # Find predicted positives
        true_pos = predict_ptest[np.where(predict_ptest > 0.5)].shape[0]
        u_pos = predict_utest[np.where(predict_utest > 0.5)].shape[0]
        N_ptest = X_P[ptest].shape[0]
        N_utest = X_U[utest].shape[0]
        # Predicted positive ratio in test set (epsilon avoids divide-by-0)
        p_pred_pos = (true_pos + u_pos) / (N_ptest + N_utest) + 0.0001
        # Compute PU recall (TPR) and score metrics
        recall = true_pos / N_ptest
        score = recall**2 / p_pred_pos
        scores.append(score)
        tprs.append(recall)
        # Predicted probabilities
        prob_P[ptest, idsp] = predict_ptest
        prob_U[utrain, idsp] = predict_utrain
        prob_U[utest, idsp] = predict_utest
        idsp += 1
        # Progress update
        # NOTE(review): idsp was already incremented, so (idsp + 1) % splits
        # triggers one fold early and `idsp // splits + 1` over-counts the
        # repeat number; `tprs[-splits - 1:-1]` also drops the newest TPR
        # while including one from the previous repeat. Looks off-by-one;
        # confirm before relying on the printed progress numbers.
        if (idsp + 1) % splits == 0:
            tpr_tmp = np.asarray(tprs[-splits - 1:-1])
            print("Performed Repeated " + str(splits) + "-fold: " +
                  str(idsp // splits + 1) + " out of " + str(repeats))
            print("True Positive Rate: %0.2f (+/- %0.2f)" %
                  (tpr_tmp.mean(), tpr_tmp.std() * 2))
    # Predicted labels from k-fold CV; last column counts positive votes.
    label_U = np.zeros(shape=(X_U.shape[0], splits * repeats + 1), dtype=int)
    label_U[:, :splits * repeats][np.where(prob_U > 0.5)] = 1
    label_U[:, splits * repeats] = np.sum(label_U[:, :splits * repeats + 1],
                                          axis=1)
    tprs = np.asarray(tprs)
    scores = np.asarray(scores)
    # Metrics for each model in the k-folds, averaged per repeat.
    label_U_rp = np.zeros(shape=(X_U.shape[0], repeats), dtype=int)
    prob_U_rp = np.zeros(shape=(X_U.shape[0], repeats))
    feat_rank_rp = np.zeros(shape=(X_U.shape[1], repeats))
    tpr_rp = np.zeros(shape=(repeats, ))
    scores_rp = np.zeros(shape=(repeats, ))
    labels = np.zeros(shape=(X_U.shape[0], ))
    for i in range(repeats):
        prob_U_rp[:, i] = prob_U[:, i * splits:(i + 1) * splits].mean(axis=1)
        feat_rank_rp[:, i] = feat_rank[:,
                                       i * splits:(i + 1) * splits].mean(axis=1)
        tpr_rp[i] = tprs[i * splits:(i + 1) * splits].mean()
        scores_rp[i] = scores[i * splits:(i + 1) * splits].mean()
    label_U_rp[np.where(prob_U_rp > 0.5)] = 1
    prob = prob_U_rp.mean(axis=1)
    labels[np.where(prob > 0.5)] = 1
    # Get confidence interval of TPR for each kfold
    tpr_low, tpr_up = self.bootstrapCI(tpr_rp)
    scores_low, scores_up = self.bootstrapCI(scores_rp)
    # PU learning metrics
    metrics = np.asarray([
        tpr_rp.mean(), tpr_low, tpr_up,
        scores_rp.mean(), scores_low, scores_up
    ])
    # NOTE(review): this prints the mean TPR under the label "Accuracy".
    print("Accuracy: %0.2f" % (tpr_rp.mean()))
    print("95%% confidence interval: [%0.2f, %0.2f]" % (tpr_low, tpr_up))
    # Metrics and results from training / testing
    pu_stats = {
        'prob': prob,
        'labels': labels,
        'metrics': metrics,
        'prob_rp': prob_U_rp,
        'label_rp': label_U_rp,
        'tpr_rp': tpr_rp,
        'scores_rp': scores_rp,
        'feat_rank_rp': feat_rank_rp
    }
    # Save results
    if filename:
        if filename.endswith(".json"):
            dumpfn(pu_stats, filename)
        if filename.endswith(".pkl"):
            with open(filename, 'wb') as file:
                pickle.dump(pu_stats, file,
                            protocol=pickle.HIGHEST_PROTOCOL)
    self.pu_stats = pu_stats
    return pu_stats
def pls_train(groups, varname='valence', arrayname='norm', scale=True,
              ncomps=2, cv_folds=None, cv_repeats=None, skip_cv=False,
              xmin=-np.inf, xmax=np.inf, _larch=None, **kws):
    """use a list of data groups to train a Partial Least Squares model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      scale       bool to scale data [True]
      cv_folds    None or number of Cross-Validation folds (see Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (see Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]
      ncomps      number of independent components (see Note 5) [2]

    Returns
    -------
      group with trained PLSRegression, to be used with pls_predict

    Raises
    ------
      ValueError if any group lacks the attribute named by `varname`.

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  all groups must have an attribute (scalar value) for `varname`
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation:  if cv_folds is None, sqrt(len(groups)) will
         be used (rounded to integer).  if cv_repeats is None,
         sqrt(len(groups))-1 will be used (rounded).
     5.  The optimal number of components may be best found from PCA. If
         set to None, a search will be done for ncomps that gives the
         lowest RMSE_CV.
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            # bug fix: was `raise Value(...)` — an undefined name that
            # raised NameError instead of the intended exception.
            raise ValueError("group '%s' does not have attribute '%s'" %
                             (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)
    nvals = len(groups)

    kws['scale'] = scale
    kws['n_components'] = ncomps
    model = PLSRegression(**kws)

    rmse_cv = None
    if not skip_cv:
        # Default fold/repeat counts scale with sqrt of the sample count.
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)
        resid = []
        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])[:, 0]
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    # final fit without cross-validation
    model = PLSRegression(**kws)
    model.fit(spectra, ydat)
    ypred = model.predict(spectra)[:, 0]
    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 coefs=model.x_weights_, loadings=model.x_loadings_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats, rmse_cv=rmse_cv,
                 rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, scale=scale, groupnames=groupnames,
                 keywords=kws)
def train():
    """Train the module-level `net` on data from '5.csv' with repeated
    10-fold cross-validation, checkpointing the best model by F1 score.

    NOTE(review): relies on module globals `net`, `BATCH_SIZE`,
    `InsuranceDataSet`, `DataLoader`, `Variable` -- confirm they are
    defined before this is called.
    """
    # net = Net()
    print(net)
    # 10-fold cross-validation, repeated 10 times (seeded from wall clock,
    # so splits differ between runs)
    kf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=int(time.time()))
    # data: last column is the label, the rest are features
    data = np.genfromtxt('5.csv', delimiter=',')
    X = data[:, :-1]
    Y = data[:, -1]
    optimizer = torch.optim.Adam(net.parameters(), lr=0.00001)
    loss_func = nn.CrossEntropyLoss()
    validate_loss_final = 0.0
    for train_index, test_index in kf.split(X):
        X_train = X[train_index]
        X_validate = X[test_index]
        Y_train = Y[train_index]
        Y_validate = Y[test_index]
        train_dataset = InsuranceDataSet(X_train, Y_train)
        train_loader = DataLoader(train_dataset, shuffle=True,
                                  batch_size=BATCH_SIZE)
        for epoch in range(30):
            # training pass over this fold's training split
            net.train()
            for i, train_data in enumerate(train_loader, 0):
                features, label = train_data
                features = Variable(features)
                label = Variable(label)
                prediction = net(features)
                # print('output size is {}'.format(prediction.size()))
                loss = loss_func(prediction, label)
                # optimize and back-propagate
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # validation on the held-out fold
            net.eval()
            validate_features = Variable(
                torch.from_numpy(X_validate).type(torch.FloatTensor))
            validate_predictions = net(validate_features).detach().numpy()
            # class decision: predict 1 when logit[1] > logit[0]
            result = validate_predictions[:, 0] - validate_predictions[:, 1]
            result_bool = result < 0
            result_bool = result_bool.astype('int')
            validate_loss = f1_score(Y_validate, result_bool)
            print('f1 is {}'.format(validate_loss))
            # save a checkpoint whenever F1 improves (epoch 0 always saves)
            if epoch == 0 or validate_loss > validate_loss_final:
                torch.save(net.state_dict(), 'Net-round-{}.pth'.format(epoch))
                validate_loss_final = validate_loss
    print('Finish training...')
    print('best is {}'.format(validate_loss_final))
    torch.save(net.state_dict(), 'Net.pth')
# Demo: RepeatedKFold index generation and a simple train/test split.
import numpy as np
# deduplicated: KFold and RepeatedKFold were previously imported twice
from sklearn.model_selection import (KFold, RepeatedKFold, ShuffleSplit,
                                     cross_val_score, train_test_split)

X = ["a", "b", "c", "d", "d", "e", "h", "er", "erer", "342"]

# 4-fold CV repeated 3 times -> 12 (train, test) index pairs
kf = RepeatedKFold(n_splits=4, n_repeats=3)
for train, test in kf.split(X):
    print("%s %s" % (train, test))

y = np.random.random(10)
# bug fix: `stratify=y` removed -- stratifying on a continuous random
# target raises ValueError (every "class" would have a single member)
train_test_split(X, y, test_size=0.3, shuffle=True)
epsilon=.1, coef0=1) # ############################################################################# # Look at the results lw = 2 svrs = [svr_rbf, svr_lin, svr_poly] kernel_label = ['RBF', 'Linear', 'Polynomial'] model_color = ['m', 'c', 'g'] # 3-fold train data kf = RepeatedKFold(n_splits=5) x_np_array, y_np_array = X.to_numpy(), y.to_numpy() index = np.arange(0, SAMPLE_LENGTH) for train_index, test_index in kf.split(X): train_x, train_y = x_np_array[train_index], y_np_array[train_index] test_x, test_y = x_np_array[test_index], y_np_array[test_index] clf = svr_lin.fit(train_x, train_y) mae_in_train = mean_absolute_error(svr_lin.predict(train_x), train_y) mae_in_test = mean_absolute_error(svr_lin.predict(test_x), test_y) r2_score_in_train = r2_score(svr_lin.predict(train_x), train_y) # r2_score_in_test = r2_score(svr_lin.predict(test_x),test_y) print(mae_in_train, mae_in_test) plt.plot(index, y, label="Real surface roughness") plt.scatter(train_index, svr_lin.predict(train_x), facecolor="none", edgecolor="k", label="SF in train dataset")
def fit(self, X, y, labels=None, dist=None, importance_weights=None,
        cv_indices=None, dist_savename=None):
    """Fit a (weighted) kernel ridge regression model, choosing kernel
    width (gamma), regularization (alpha) and importance-weight exponent
    (lambda) -- plus optionally 'v' -- by grid search over CV folds.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
    y : array, shape (n_samples,) or (n_samples, n_outputs)
    labels : must be None (raises RuntimeError otherwise)
    dist : optional precomputed pairwise distance matrix for X
    importance_weights : optional per-sample weights; if None, the
        'lambda' grid collapses to [0] and uniform weights are used
    cv_indices : optional subset of sample indices used to form CV
        folds; samples outside it are appended to every training fold
    dist_savename : optional path; if given, the distance matrix is
        saved there with np.save
    """
    t = time.time()
    # work with 2-D targets throughout
    if y.ndim < 2:
        y = y.reshape(-1, 1)
    if self.n_components is not None:
        # optionally compress the targets with PCA before fitting
        if self.verbose > 0:
            elapsed = time.time() - t
            print('PCA [%dmin %dsec]' % (int(elapsed / 60),
                                         int(elapsed % 60)))
            sys.stdout.flush()
        self.pca = PCA(n_components=self.n_components, svd_solver='arpack')
        y_ = self.pca.fit_transform(y)
        if self.verbose > 0:
            print('Lost %.1f%% information ' % (self.pca.noise_variance_) +
                  '[%dmin %dsec]' % (int(elapsed / 60), int(elapsed % 60)))
            elapsed = time.time() - t
    else:
        y_ = y
    if labels is not None:
        raise RuntimeError('Not implemented.')
    if cv_indices is None:
        cv_indices = np.arange(X.shape[0])
    # build the CV fold iterator according to the configured strategy
    if self.cv_type is None:
        kfold = RepeatedKFold(n_splits=self.cv_nfolds,
                              n_repeats=self.cv_shuffles)
        cv_folds = kfold.split(X[cv_indices])
        n_cv_folds = kfold.get_n_splits()
    elif self.cv_type == 'iter':
        # caller supplies pre-computed (train, test) pairs
        cv_folds = self.cv_groups
        n_cv_folds = len(self.cv_groups)
    elif self.cv_type == 'group':
        groups = self.cv_groups
        if self.cv_nfolds is None:
            self.cv_nfolds = len(np.unique(groups))
        kfold = GroupKFold(n_splits=self.cv_nfolds)
        cv_folds = kfold.split(X[cv_indices], y[cv_indices], groups)
        n_cv_folds = kfold.get_n_splits()
    else:
        raise Exception('Cross-validation type not supported')
    # samples excluded from CV are appended to every training fold
    add_train_inds = np.setdiff1d(np.arange(X.shape[0]), cv_indices)
    cv_folds = list(cv_folds)
    cv_folds = [(np.concatenate((train_fold, add_train_inds)), test_fold)
                for train_fold, test_fold in cv_folds]
    if self.verbose > 0:
        elapsed = time.time() - t
        print('Computing distance matrix [%dmin %dsec]' %
              (int(elapsed / 60), int(elapsed % 60)))
        sys.stdout.flush()
    if dist is None:
        dist = euclidean_distances(X, None, squared=self.squared_dist)
    if dist_savename is not None:
        if self.verbose > 0:
            print('Saving distance matrix to file:', dist_savename)
        np.save(dist_savename, dist)
    if importance_weights is None:
        # no weights supplied: disable the lambda dimension of the grid
        self.krr_param_grid['lambda'] = [0]
        importance_weights = np.ones((X.shape[0], ))
    # sqrt so that products of two weights give the full weight
    importance_weights = importance_weights**(0.5)
    errors = []
    if 'v' in self.krr_param_grid:
        # grid search including the extra 'v' parameter (per-output
        # additive regularization via get_alpha_add)
        for fold_i, (train_i, test_i) in enumerate(cv_folds):
            fold_errors = np.empty(
                (len(self.krr_param_grid['v']),
                 len(self.krr_param_grid['gamma']), 1,
                 len(self.krr_param_grid['alpha']), y_.shape[1]))
            if self.verbose > 0:
                elapsed = time.time() - t
                print('CV %d of %d [%dmin %dsec]' %
                      (fold_i + 1, n_cv_folds, int(
                          elapsed / 60), int(elapsed % 60)))
                sys.stdout.flush()
            for v_i, v in enumerate(self.krr_param_grid['v']):
                for gamma_i, gamma in enumerate(
                        self.krr_param_grid['gamma']):
                    for lamb_i, lamb in enumerate(
                            self.krr_param_grid['lambda']):
                        iw = importance_weights**lamb
                        iw = iw[:, None]
                        K_train = self.kernel.apply_to_dist(dist[np.ix_(
                            train_i, train_i)], gamma=gamma)
                        K_train *= np.outer(iw[train_i], iw[train_i])
                        K_test = self.kernel.apply_to_dist(dist[np.ix_(
                            test_i, train_i)], gamma=gamma)
                        if self.verbose > 0:
                            sys.stdout.write('.')
                            sys.stdout.flush()
                        for alpha_i, alpha in enumerate(
                                self.krr_param_grid['alpha']):
                            if self.verbose > 0:
                                sys.stdout.write(',')
                                sys.stdout.flush()
                            for y_i in np.arange(y_.shape[1]):
                                K_train_ = K_train.copy()
                                alpha_add = get_alpha_add(
                                    self.n_basis, self.n_grid,
                                    self.delta, v)
                                # add ridge term on the diagonal only
                                K_train_.flat[::K_train_.shape[0] +
                                              1] += alpha * alpha_add[y_i]
                                try:
                                    L_ = cholesky(K_train_, lower=True)
                                    x = solve_triangular(
                                        L_, y_[train_i, y_i], lower=True)
                                    dual_coef_ = solve_triangular(L_.T, x)
                                    pred_mean = np.dot(K_test, dual_coef_)
                                    if self.mae:
                                        e = np.mean(
                                            np.abs(pred_mean -
                                                   y_[test_i, y_i]), 0)
                                    else:
                                        e = np.mean(
                                            (pred_mean -
                                             y_[test_i, y_i])**2, 0)
                                except np.linalg.LinAlgError:
                                    # factorization failed: rule out combo
                                    e = np.inf
                                fold_errors[v_i, gamma_i, 0, alpha_i,
                                            y_i] = e
            if self.verbose > 0:
                sys.stdout.write('\n')
                sys.stdout.flush()
            errors.append(fold_errors)
        errors = np.array(errors)
        errors = np.mean(errors, 0)  # average over folds
    else:
        # standard grid search over gamma x lambda x alpha
        for fold_i, (train_i, test_i) in enumerate(cv_folds):
            fold_errors = np.empty(
                (len(self.krr_param_grid['gamma']),
                 len(self.krr_param_grid['lambda']),
                 len(self.krr_param_grid['alpha']), y_.shape[1]))
            if self.verbose > 0:
                elapsed = time.time() - t
                print('CV %d of %d [%dmin %dsec]' %
                      (fold_i + 1, n_cv_folds, int(
                          elapsed / 60), int(elapsed % 60)))
                sys.stdout.flush()
            for gamma_i, gamma in enumerate(self.krr_param_grid['gamma']):
                if self.verbose > 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
                for lamb_i, lamb in enumerate(
                        self.krr_param_grid['lambda']):
                    iw = importance_weights**lamb
                    iw = iw[:, None]
                    K_train = self.kernel.apply_to_dist(dist[np.ix_(
                        train_i, train_i)], gamma=gamma)
                    K_train *= np.outer(iw[train_i], iw[train_i])
                    K_test = self.kernel.apply_to_dist(dist[np.ix_(
                        test_i, train_i)], gamma=gamma)
                    for alpha_i, alpha in enumerate(
                            self.krr_param_grid['alpha']):
                        if self.verbose > 0:
                            sys.stdout.write(',')
                            sys.stdout.flush()
                        K_train_ = K_train.copy()
                        # add ridge term on the diagonal only
                        K_train_.flat[::K_train_.shape[0] + 1] += alpha
                        try:
                            L_ = cholesky(K_train_, lower=True)
                            x = solve_triangular(L_,
                                                 iw[train_i] * y_[train_i],
                                                 lower=True)
                            dual_coef_ = iw[train_i] * solve_triangular(
                                L_.T, x)
                            pred_mean = np.dot(K_test, dual_coef_)
                            if self.mae:
                                e = np.mean(
                                    np.abs(pred_mean - y_[test_i]) *
                                    importance_weights[test_i, None]**2, 0)
                            else:
                                e = np.mean(
                                    ((pred_mean - y_[test_i])**2) *
                                    importance_weights[test_i, None]**2, 0)
                        except np.linalg.LinAlgError:
                            # factorization failed: rule out this combo
                            e = np.inf
                        fold_errors[gamma_i, lamb_i, alpha_i] = e
            if self.verbose > 0:
                sys.stdout.write('\n')
                sys.stdout.flush()
            errors.append(fold_errors)
        errors = np.array(errors)
        errors = np.mean(errors, 0)  # average over folds
    # per-output stores for the selected parameters and dual coefficients
    self.dual_coefs_ = np.empty((y_.shape[1], X.shape[0]))
    self.alphas_ = np.empty(y_.shape[1])
    self.lambdas_ = np.empty(y_.shape[1])
    self.gammas_ = np.empty(y_.shape[1])
    if self.verbose > 0:
        elapsed = time.time() - t
        print('Refit [%dmin %dsec]' % (int(elapsed / 60),
                                       int(elapsed % 60)))
        sys.stdout.flush()
    print_count = 0
    if not self.single_combo:
        # pick the best (gamma, lambda, alpha) separately for each output
        for i in range(y_.shape[1]):
            min_params = np.argsort(errors[:, :, :, i], axis=None)
            # lin_alg_errors = 0
            # NOTE(review): unpacking three indices from errors.shape[:2]
            # (only two dims) looks wrong -- probably errors.shape[:3]
            # was intended; confirm this branch is actually exercised.
            gamma_i, lamb_i, alpha_i = np.unravel_index(
                min_params[0], errors.shape[:2])
            gamma = self.krr_param_grid['gamma'][gamma_i]
            lamb = self.krr_param_grid['lambda'][lamb_i]
            alpha = self.krr_param_grid['alpha'][alpha_i]
            self.alphas_[i] = alpha
            self.gammas_[i] = gamma
            self.lambdas_[i] = lamb
            # warn when a winning parameter sits on the grid boundary
            if (gamma_i in (0, len(self.krr_param_grid['gamma']) - 1) or
                    lamb_i in (0, len(self.krr_param_grid['lambda']) - 1) or
                    alpha_i in (0, len(self.krr_param_grid['alpha']) - 1)):
                if print_count <= 200:
                    fmtstr = '%d: gamma=%g\talpha=%g\tlambda=%g\terror=%g\tmean=%g'
                    print(fmtstr % (i, gamma, alpha, lamb,
                                    errors[gamma_i, lamb_i, alpha_i, i],
                                    errors[gamma_i, lamb_i, alpha_i, i] /
                                    np.mean(np.abs(y_[:, i]))))
                    print_count += 1
    else:
        # one shared combo for all outputs: average errors over outputs
        errors = np.mean(errors, -1)  # average over outputs
        if self.verbose > 1:
            print('CV errors:')
            print(errors)
            print('Alpha params:')
            print(self.krr_param_grid['alpha'])
            print('Gamma params:')
            print(self.krr_param_grid['gamma'])
            print('Lambda params:')
            print(self.krr_param_grid['lambda'])
        if self.verbose > 0:
            print('Min error: ', np.min(errors))
        # print np.log(errors)
        # plt.imshow(np.log(errors))
        # plt.xticks(range(10), map('{:.1e}'.format, list(self.krr_param_grid['alpha'])))
        # plt.yticks(range(10), map('{:.1e}'.format, list(self.krr_param_grid['gamma'])))
        # plt.xlabel('alpha')
        # plt.ylabel('gamma')
        # plt.colorbar()
        # plt.show()
        min_params = np.argsort(errors, axis=None)
        if 'v' in self.krr_param_grid:
            v_i, gamma_i, lamb_i, alpha_i = np.unravel_index(
                min_params[0], errors.shape)
        else:
            gamma_i, lamb_i, alpha_i = np.unravel_index(
                min_params[0], errors.shape)
        if 'v' in self.krr_param_grid:
            v = self.krr_param_grid['v'][v_i]
            print('v=', v)
        gamma = self.krr_param_grid['gamma'][gamma_i]
        alpha = self.krr_param_grid['alpha'][alpha_i]
        lamb = self.krr_param_grid['lambda'][lamb_i]
        # report any winning parameter that landed on a grid boundary
        if 'v' in self.krr_param_grid:
            if v == self.krr_param_grid['v'][0]:
                print('v at lower edge.')
            if v == self.krr_param_grid['v'][-1]:
                print('v at upper edge.')
        if len(self.krr_param_grid['gamma']) > 1:
            if gamma == self.krr_param_grid['gamma'][0]:
                print('Gamma at lower edge.')
            if gamma == self.krr_param_grid['gamma'][-1]:
                print('Gamma at upper edge.')
        if len(self.krr_param_grid['alpha']) > 1:
            if alpha == self.krr_param_grid['alpha'][0]:
                print('Alpha at lower edge.')
            if alpha == self.krr_param_grid['alpha'][-1]:
                print('Alpha at upper edge.')
        if len(self.krr_param_grid['lambda']) > 1:
            if lamb == self.krr_param_grid['lambda'][0]:
                print('Lambda at lower edge.')
            if lamb == self.krr_param_grid['lambda'][-1]:
                print('Lambda at upper edge.')
        self.alphas_[:] = alpha
        self.gammas_[:] = gamma
        self.lambdas_[:] = lamb
        if 'v' in self.krr_param_grid:
            alpha_add = get_alpha_add(self.n_basis, self.n_grid,
                                      self.delta, v)
            self.alphas_ *= alpha_add
    # refit once per unique parameter combination on the full data
    combos = list(zip(self.alphas_, self.gammas_, self.lambdas_))
    n_unique_combos = len(set(combos))
    self.L_fit_ = [None] * n_unique_combos
    for i, (alpha, gamma, lamb) in enumerate(set(combos)):
        if self.verbose > 0:
            elapsed = time.time() - t
            print('Parameter combinations ' + '%d of %d [%dmin %dsec]' %
                  (i + 1, n_unique_combos, int(elapsed / 60),
                   int(elapsed % 60)))
            sys.stdout.flush()
        # outputs that share this (alpha, gamma, lambda) combination
        y_list = [
            i for i in range(y_.shape[1]) if self.alphas_[i] == alpha
            and self.gammas_[i] == gamma and self.lambdas_[i] == lamb
        ]
        iw = importance_weights**lamb
        iw = iw[:, None]
        K = self.kernel.apply_to_dist(dist, gamma=gamma)
        K *= np.outer(iw, iw)
        # np.exp(K, K)
        # retry the Cholesky factorization with growing alpha until it
        # succeeds (kernel matrix may be numerically non-PSD)
        while True:
            K.flat[::K.shape[0] + 1] += alpha - (alpha / 10)
            try:
                if self.verbose > 0:
                    print('trying cholesky decomposition, alpha', alpha)
                L_ = cholesky(K, lower=True)
                self.L_fit_[i] = L_
                x = solve_triangular(L_, iw * y_[:, y_list], lower=True)
                # x = solve_triangular(L_, y_[:, y_list], lower=True)
                dual_coef_ = solve_triangular(L_.T, x)
                self.dual_coefs_[y_list] = iw.T * dual_coef_.T.copy()
                break
            except np.linalg.LinAlgError:
                if self.verbose > 0:
                    print('LinalgError, increasing alpha')
                alpha *= 10
                self.alphas_[0] = alpha
    if self.copy_X:
        self.X_fit_ = X.copy()
        self.y_fit_ = y.copy()
    else:
        self.X_fit_ = X
        self.y_fit_ = y
    self.errors = errors
    if self.verbose > 0:
        elapsed = time.time() - t
        print('Done [%dmin %dsec]' % (int(elapsed / 60),
                                      int(elapsed % 60)))
        sys.stdout.flush()
y = train[target_col].values id_train = train[id_col].values X_test = test.drop(cols_to_drop, axis=1, errors='ignore') id_test = test[id_col].values feature_names = list(X.columns) n_features = X.shape[1] dprint(f'n_features: {n_features}') p_test = [] dfs_train = [] dfs_test = [] for fold_i_oof, (train_index_oof, valid_index_oof) in enumerate(kf1.split(X, y)): x_train_oof = X.iloc[train_index_oof] x_valid_oof = X.iloc[valid_index_oof] y_train_oof = y[train_index_oof] y_valid_oof = y[valid_index_oof] id_train_oof = id_train[valid_index_oof] for fold_i, (train_index, valid_index) in enumerate( kf2.split(x_train_oof, y_train_oof)): params = lgb_params.copy() x_train = x_train_oof.iloc[train_index] x_valid = x_train_oof.iloc[valid_index]
markers = [x for x in df.columns if 'aal' in x] X = df[markers].values y = 1 - (df['Final diagnosis (behav)'] == 'VS').values.astype(np.int) results = dict() results['Iteration'] = [] results['Weight Val'] = [] results['Classifier'] = [] results['AUC'] = [] results['Precision'] = [] results['Recall'] = [] sss = RepeatedKFold(n_splits=5, n_repeats=50, random_state=42) for t_iter, (train, test) in enumerate(sss.split(X, y)): for val in weight_val: classifiers['SVC_fs10'] = Pipeline([ ('scaler', RobustScaler()), ('select', SelectKBest(f_classif, 10)), ('clf', SVC(kernel="linear", C=1, probability=True, class_weight={0: 1, 1: val})) ]) classifiers['XRF'] = Pipeline([ ('scaler', RobustScaler()), ('clf', ExtraTreesClassifier( max_depth=5, n_estimators=2000, max_features='auto', class_weight={0: 1, 1: val})) ]) classifiers['Dummy'] = Pipeline([ ('clf', DummyClassifier(
def predictConditions(query):
    """Build condition features for 2019, predict 2020 conditions per
    patient with a Lasso-regularized linear model (Keras Dense layer
    with L1 kernel regularizer), trained under repeated K-fold CV.

    Writes 'constant_full.csv' (per-fold biases) and 'coef_full.csv'
    (per-fold coefficient lists) as side effects.

    NOTE(review): depends on module-level `getFeatures`,
    `feature_weighted_mse`, `train_test_split`, `keras`, `pd`, `np`,
    `json` -- confirm imports at module scope.
    """
    print('Collecting all conditions:\n')
    conditions = getFeatures.get_conditions(
        query=query, startDate='2019-01-01', endDate='2020-12-31')
    print(conditions)
    patients = getFeatures.get_live_patients(
        query=query, startDate='2019-12-31', endDate='2019-12-31')
    print('\nNumber of patients', len(patients))
    age_groups = getFeatures.make_age_groups()
    print('\nAge groups\n', age_groups)
    print('\nCompute features: ')
    # features: conditions observed during 2019
    x_df = getFeatures.get_feature_vec(
        query, conditions=conditions, startDate='2019-01-01',
        endDate='2019-12-31', age_groups=age_groups)
    print('\nx_df.shape ', x_df.shape)
    print('\nCompute labels: ')
    # labels: conditions observed during 2020
    y_df = getFeatures.get_feature_vec(
        query, conditions=conditions, startDate='2020-01-01',
        endDate='2020-12-31', age_groups=age_groups)
    print('\ny_df.shape ', y_df.shape)
    # split patients (not rows) so features and labels stay aligned
    train, test = train_test_split(patients, test_size=0.25, random_state=42)
    x_train_df = x_df.loc[train]
    y_train_df = y_df.loc[train]
    x_test_df = x_df.loc[test]
    y_test_df = y_df.loc[test]
    print('\n\nTrain set:', len(train), 'Test set: ', len(test))
    print(
        '\n\nSorted x_train means:\n\n',
        x_train_df.mean().sort_values(ascending=False),
        '\n\nSorted y_train means:\n\n',
        y_train_df.mean().sort_values(ascending=False)
    )
    # drop rare conditions (fewer than filter_below cases)
    filter_below = 20
    print('\nFiltereing conditions with less than {} cases:'.format(filter_below))
    # NOTE(review): the second term masks x_test_df.columns with
    # x_train_df.sum() -- looks like x_test_df.sum() was intended;
    # confirm before relying on this filter.
    x_drop_list = (
        set(x_train_df.columns[x_train_df.sum() < filter_below]) |
        set(x_test_df.columns[x_train_df.sum() < filter_below])
    )
    x_train_df = x_train_df.drop(x_drop_list, axis=1)
    x_test_df = x_test_df.drop(x_drop_list, axis=1)
    y_drop_list = (
        set(y_train_df.columns[y_train_df.sum() < filter_below]) |
        set(y_test_df.columns[y_test_df.sum() < filter_below])
    )
    y_train_df = y_train_df.drop(y_drop_list, axis=1)
    y_test_df = y_test_df.drop(y_drop_list, axis=1)
    print(
        '\n\nSorted x_train means:\n\n',
        x_train_df.mean().sort_values(ascending=False),
        '\n\nSorted y_train means:\n\n\n\n',
        y_train_df.mean().sort_values(ascending=False)
    )
    print(
        '\n\nSorted x_test means:\n\n',
        x_test_df.mean().sort_values(ascending=False),
        '\n\nSorted y_test means:\n\n\n\n',
        y_test_df.mean().sort_values(ascending=False)
    )
    # per-label loss weights: inverse variance, normalized so the
    # weighted variances sum to 1
    y_weights = 1 / (y_train_df.var() + 1e-3)
    y_weights = y_weights/(y_train_df.var()*y_weights).sum()
    print(
        '\n',
        pd.DataFrame(
            [y_train_df.var(), y_weights, y_weights*y_train_df.var()],
            index=['y_train var', 'y_weights', 'var*weight']
        ).transpose()
    )
    wmse = feature_weighted_mse.make_feature_weighted_mse(y_weights)
    print(
        '\nBasic benchmark - y means\n',
        'Train loss',
        wmse(
            y_true=y_train_df.values,
            y_pred=y_train_df.values.mean(axis=0)
        ).numpy().mean(),
    )
    from sklearn.model_selection import RepeatedKFold
    # CV / training hyperparameters
    n_splits = 4
    n_repeats = 2
    alpha=0.00001
    learning_rate=0.001
    patience=30
    print('\nTrain linear model using Lasso alpha {} {}-fold CV repeated {} times.\n'.format(
        alpha,
        n_splits,
        n_repeats,
    ))
    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)
    models=[]
    history=[]
    performance=[]
    i=0
    for train_index, validate_index in rkf.split(x_train_df):
        i += 1
        print('\n\nFold {} out of {}\n\n'.format(i, n_splits*n_repeats))
        x_train, x_validate = x_train_df.iloc[train_index], x_train_df.iloc[validate_index]
        y_train, y_validate = y_train_df.iloc[train_index], y_train_df.iloc[validate_index]
        # one Dense layer = linear model; L1 kernel penalty = Lasso
        inputs = keras.layers.Input(shape=x_train_df.shape[1])
        outputs = keras.layers.Dense(
            units=y_train_df.shape[1],
            kernel_regularizer=keras.regularizers.l1(l=alpha),
        )(inputs)
        models.append(keras.Model(inputs=inputs, outputs=outputs))
        models[-1].compile(loss=wmse,
                           optimizer=keras.optimizers.Adam(learning_rate=learning_rate))
        history.append(models[-1].fit(
            x=x_train, y=y_train,
            batch_size=128,
            epochs=1000,
            validation_data=(x_validate, y_validate),
            callbacks=[
                keras.callbacks.EarlyStopping(patience=patience,
                                              restore_best_weights=True),
            ]
        ))
        print('\nEvaluate on test set:\n')
        performance.append(models[-1].evaluate(x=x_test_df, y=y_test_df))
        print(performance[-1],'\n')
    print('Test loss mean', np.mean(performance), 'std',
          np.std(performance, ddof=1))
    # biases per fold -> constant_full.csv
    constant_full = pd.DataFrame(
        np.array([model.layers[1].get_weights()[1] for model in models]).transpose(),
        index=y_train_df.columns,
        columns=['Fold {}'.format(i) for i in range(1, 1+n_splits*n_repeats)],
    )
    constant_full.to_csv('constant_full.csv')
    # coefficients per fold, serialized as JSON lists -> coef_full.csv
    coef_mat = np.array([model.layers[1].get_weights()[0]
                         for model in models]).transpose((1, 2, 0))
    coef_full = pd.DataFrame(
        [[json.dumps(coef_mat[i,j].tolist()) for j in range(coef_mat.shape[1])]
         for i in range(coef_mat.shape[0])],
        columns=y_train_df.columns,
        index=x_train_df.columns
    ).transpose()
    coef_full.to_csv('coef_full.csv')
xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits plot_importance(clf) plt.show() print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, target))) # 将lgb和xgb的结果进行stacking train_stack = np.vstack([oof_lgb, oof_xgb]).transpose() test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose() folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590) oof_stack = np.zeros(train_stack.shape[0]) predictions = np.zeros(test_stack.shape[0]) for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)): print("fold {}".format(fold_)) trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values clf_3 = BayesianRidge() clf_3.fit(trn_data, trn_y) oof_stack[val_idx] = clf_3.predict(val_data) predictions += clf_3.predict(test_stack) / 10 print(mean_squared_error(target.values, oof_stack)) sub_df = pd.read_csv('./jinnan_round1_submit_20181227.csv', header=None) sub_df[1] = predictions sub_df[1] = sub_df[1].apply(lambda x: round(x, 3))
def boston():
    """Load the Boston housing data plus a repeated 10-fold CV iterator.

    Returns
    -------
    (X, y, splits): feature matrix, target vector, and a generator of
    (train_idx, test_idx) pairs (10 splits x 4 repeats = 40 folds).
    """
    # load once instead of calling load_boston() twice (avoids a
    # redundant dataset load)
    data = load_boston()
    X = data['data']
    y = data['target']
    cv = RepeatedKFold(n_splits=10, n_repeats=4)
    return X, y, cv.split(X)
# # t = ttest_ind( # # X[c].fillna(X[c].mean()), # # X_test[c].fillna(X_test[c].mean())) # t = ks_2samp( # X[c].dropna(), # X_test[c].dropna()) # # print(c, t) # if t[1] < 0.001: # print(c, t) # cols_to_drop.append(c) # print(f'Dropping after statistical tests: {cols_to_drop}') # X = X.drop(cols_to_drop, axis=1, errors='ignore') # X_test = X_test.drop(cols_to_drop, axis=1, errors='ignore') p_test = [] for fold_i, (train_index, valid_index) in enumerate(kf.split(X, y)): x_train = X.iloc[train_index].copy() x_valid = X.iloc[valid_index].copy() y_train = y[train_index] y_valid = y[valid_index] x_test = X_test.copy() # Frequency encoding for c in cat_features: # for c in ['hospital_id']: if c in x_train.columns: encoding = x_train.groupby(c).size() encoding = encoding/len(x_train) x_train[f'{c}_fe'] = x_train[c].map(encoding)
def energy():
    """Load the energy-efficiency data plus a repeated 10-fold CV iterator.

    Returns (X, y, splits): features (all but the last two columns),
    target (second-to-last column), and (train, test) index pairs.
    """
    frame = pd.read_csv(f'{datasets_folder}/energy_efficiency.csv')
    features = frame.iloc[:, :-2]
    target = frame.iloc[:, -2]
    splitter = RepeatedKFold(n_splits=10, n_repeats=4)
    return features, target, splitter.split(features)
plot = plt.scatter(y_test, y_pred) # In[30]: from sklearn.metrics import roc_auc_score print(confusion_matrix(y_test, y_pred)) #print(roc_auc_score(y-test,y_pred)) # In[31]: from sklearn.model_selection import RepeatedKFold random_state = 12883823 rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state) for train, test in rkf.split(x): print("%s %s" % (train, test)) # In[32]: from sklearn.model_selection import LeaveOneOut loo = LeaveOneOut() for train, test in loo.split(x): print("%s %s" % (train, test)) # In[33]: get_ipython().run_line_magic('matplotlib', 'inline') svclassifier = SVC(kernel='rbf', C=1) svclassifier.fit(x_train, y_train) y_pred = svclassifier.predict(x_test)
def power():
    """Load the power-plant data plus a repeated 10-fold CV iterator.

    Returns (X, y, splits): features (all but the last column), target
    (last column), and (train, test) index pairs.
    """
    frame = pd.read_csv(f'{datasets_folder}/power.csv')
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1]
    splitter = RepeatedKFold(n_splits=10, n_repeats=4)
    return features, target, splitter.split(features)
dep_f = [] mse_f = [] rmse_f = [] mae_f = [] mdae_f = [] evs_f = [] r2_f = [] for i in dep: c = 0 mse_t = [] rmse_t = [] mae_t = [] mdae_t = [] evs_t = [] r2_t = [] for tr_i, ts_i in rkf.split(data): print(i, c) train, test = data.iloc[tr_i], data.iloc[ts_i] train_x = train.drop(columns=['Rainfall']) train_y = train['Rainfall'] test_x = test.drop(columns=['Rainfall']) test_y = test['Rainfall'] model = RandomForestRegressor(n_estimators=100, max_depth=i) model.fit(train_x, train_y) ts_p = model.predict(test_x) mse_t.append(mse(test_y, ts_p)) mae_t.append(mae(test_y, ts_p)) mdae_t.append(mdae(test_y, ts_p)) evs_t.append(evs(test_y, ts_p)) r2_t.append(r2(test_y, ts_p)) c += 1
def wine():
    """Load the wine-quality data plus a repeated 10-fold CV iterator.

    Returns (X, y, splits): features (all but the last column), target
    (last column), and (train, test) index pairs.
    """
    frame = pd.read_csv(f'{datasets_folder}/wine.csv', sep=';')
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1]
    splitter = RepeatedKFold(n_splits=10, n_repeats=4)
    return features, target, splitter.split(features)
if len(daily_attention) == age and len(daily_share) == age: attention_data.append(daily_attention) share_data.append(daily_share) vid_array.append(vid) # convert to ndarray attention_data = np.array(attention_data) share_data = np.array(share_data) vid_array = np.array(vid_array) # == == == == == == == == Part 4: Forecast future attention == == == == == == == == # # 10-repeated 10-fold cross validation rkf = RepeatedKFold(n_splits=10, n_repeats=10) fold_idx = 0 for train_cv_idx, test_idx in rkf.split(vid_array): fold_idx += 1 print('>>> Forecast on fold: {0}'.format(fold_idx)) # == == == == == == == == Part 5: Split cv subset to select best alpha value == == == == == == == == # train_idx, cv_idx = train_test_split(train_cv_idx, test_size=0.1) # grid search best alpha value over -4 to 4 in log space alpha_array = [10 ** t for t in range(-4, 5)] cv_mse = [] for alpha in alpha_array: # == == == == == == == == Part 6: Training with Ridge Regression == == == == == == == == # cv_predict = forecast_future_attention(train_idx, cv_idx, alpha) # == == == == == == == == Part 7: Evaluate cv mean squared error == == == == == == == == # cv_norm = np.sum(attention_data[cv_idx, :age], axis=1)
def lasso_train(groups, varname='valence', arrayname='norm', alpha=None,
                use_lars=True, fit_intercept=True, normalize=True,
                cv_folds=None, cv_repeats=None, skip_cv=False,
                xmin=-np.inf, xmax=np.inf, _larch=None, **kws):
    """use a list of data groups to train a Lasso/LassoLars model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      alpha       alpha parameter for LassoLars (See Note 5) [None]
      use_lars    bool to use LassoLars instead of Lasso [True]
      cv_folds    None or number of Cross-Validation folds (See Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (See Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]

    Returns
    -------
      group with trained LassoLars model, to be used with lasso_predict

    Notes
    -----
     1. The group members for the components must match each other
        in data content and array names.
     2. all groups must have an attribute (scalar value) for `varname`
     3. arrayname can be one of `norm` or `dmude`
     4. Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be
        used (rounded to integer). if cv_repeats is None,
        sqrt(len(groups))-1 will be used (rounded).
     5. alpha is the regularization parameter. if alpha is None it will
        be set using LassoLarsCV
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            # bug fix: was `raise Value(...)` -- `Value` is not a defined name
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)
    nvals = len(groups)

    kws.update(dict(fit_intercept=fit_intercept, normalize=normalize))
    creator = LassoLars if use_lars else Lasso
    model = None

    rmse_cv = None
    if not skip_cv:
        # default CV sizes scale with sqrt of the number of groups (Note 4)
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)
        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)

        if alpha is None:
            # choose alpha by cross-validated LARS (Note 5)
            lcvmod = LassoLarsCV(cv=cv, max_n_alphas=1e7,
                                 max_iter=1e7, eps=1.e-12, **kws)
            lcvmod.fit(spectra, ydat)
            alpha = lcvmod.alpha_

        model = creator(alpha=alpha, **kws)
        resid = []
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    if alpha is None:
        # NOTE(review): only reached when skip_cv=True and alpha is None;
        # plain Lasso/LassoLars do not expose `alpha_` after fit -- confirm
        # this path is intended (a *CV estimator may be needed here).
        cvmod = creator(**kws)
        cvmod.fit(spectra, ydat)
        alpha = cvmod.alpha_

    if model is None:
        model = creator(alpha=alpha, **kws)

    # final fit without cross-validation
    out = model.fit(spectra, ydat)
    ypred = model.predict(spectra)
    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 alpha=alpha, active=model.active_, coefs=model.coef_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats, rmse_cv=rmse_cv,
                 rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, fit_intercept=fit_intercept,
                 normalize=normalize, groupnames=groupnames, keywords=kws)