def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    """Compute fold-averaged ROC points and per-fold accuracy for embedding pairs.

    Each pair is scored by the squared Euclidean distance between its two
    embeddings. For every fold of an unshuffled K-fold split, the candidate
    threshold with the highest accuracy on the training pairs is selected
    and scored on the held-out pairs; TPR and FPR are also recorded on the
    held-out pairs for every candidate threshold.

    Args:
        thresholds: Sequence of candidate distance thresholds.
        embeddings1: 2-D array with the first embedding of every pair.
        embeddings2: 2-D array with the same shape as ``embeddings1``.
        actual_issame: Per-pair ground truth; must support indexing with an
            index array. Forwarded to ``calculate_accuracy``.
        nrof_folds: Number of folds.

    Returns:
        ``(tpr, fpr, accuracy)`` where ``tpr`` and ``fpr`` hold one value per
        threshold (mean over folds) and ``accuracy`` one value per fold.
    """
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    pair_count = min(len(actual_issame), embeddings1.shape[0])
    threshold_count = len(thresholds)
    splitter = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, threshold_count))
    fprs = np.zeros((nrof_folds, threshold_count))
    accuracy = np.zeros((nrof_folds))

    # Squared Euclidean distance of every pair, computed once for all folds.
    dist = np.sum(np.square(np.subtract(embeddings1, embeddings2)), 1)

    pair_indices = np.arange(pair_count)
    for fold, (train_idx, test_idx) in enumerate(splitter.split(pair_indices)):
        # Pick the threshold that performs best on the training pairs.
        train_acc = np.zeros((threshold_count))
        for t, candidate in enumerate(thresholds):
            _, _, train_acc[t] = calculate_accuracy(
                candidate, dist[train_idx], actual_issame[train_idx])
        best = np.argmax(train_acc)

        # ROC point of every candidate threshold on the held-out pairs.
        for t, candidate in enumerate(thresholds):
            tprs[fold, t], fprs[fold, t], _ = calculate_accuracy(
                candidate, dist[test_idx], actual_issame[test_idx])

        # Held-out accuracy of the threshold selected on the training pairs.
        _, _, accuracy[fold] = calculate_accuracy(
            thresholds[best], dist[test_idx], actual_issame[test_idx])

    return np.mean(tprs, 0), np.mean(fprs, 0), accuracy
def crossValidation(self):
    """Run 10-fold cross-validation of an IsolationForest and print the results.

    For each fold a fresh ``IsolationForest`` is fitted on the training rows
    of ``self.features``; confusion matrices over the labels ``[-1, 1]`` are
    printed for the training and the test rows. The test matrices are summed
    and the resulting overall accuracy is printed at the end.
    """
    from sklearn.ensemble import IsolationForest

    confusion_total = np.zeros([2, 2])
    folds = KFold(n_splits=10).split(self.features)
    for round_no, (train_index, test_index) in enumerate(folds, start=1):
        clf = IsolationForest(random_state=666, contamination=0.07)

        X_train = self.features[train_index]
        X_test = self.features[test_index]
        y_train = self.labels[train_index]
        y_test = self.labels[test_index]
        X_train = X_train[:, :]
        X_test = X_test[:, :]
        y_train = np.array(y_train)
        y_test = np.array(y_test)

        clf.fit(X_train, y_train)
        pred_train = clf.predict(X_train)
        train_cm = confusion_matrix(list(y_train), pred_train, labels=[-1, 1])
        print("第", round_no, "轮训练集里的表现\n", train_cm)

        pred = clf.predict(X_test)
        # Each test label is stored as a row; keep only its first element.
        y_test = [row[0] for row in y_test]
        print(y_test)
        test_cm = confusion_matrix(y_test, pred, labels=[-1, 1])
        confusion_total = confusion_total + np.array(test_cm)
        print("第", round_no, "轮测试集里的表现\n", test_cm)
        print("####################################")

    correct = confusion_total[0][0] + confusion_total[1][1]
    print("混淆矩阵的和是\n", confusion_total, "准确率是",
          correct / (sum(sum(confusion_total))))
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    """Estimate the validation rate at a target false-accept rate (FAR).

    Pairs are scored by squared Euclidean distance. In each fold of an
    unshuffled K-fold split, the FAR of every candidate threshold is
    measured on the training pairs; the threshold for ``far_target`` is
    interpolated from those measurements and then evaluated on the held-out
    pairs with ``calculate_val_far``.

    Args:
        thresholds: Sequence of candidate distance thresholds.
        embeddings1: 2-D array with the first embedding of every pair.
        embeddings2: 2-D array with the same shape as ``embeddings1``.
        actual_issame: Per-pair ground truth; must support indexing with an
            index array.
        far_target: FAR at which the validation rate is reported.
        nrof_folds: Number of folds.

    Returns:
        ``(val_mean, val_std, far_mean)`` computed over the folds.
    """
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    pair_count = min(len(actual_issame), embeddings1.shape[0])
    threshold_count = len(thresholds)
    splitter = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    dist = np.sum(np.square(np.subtract(embeddings1, embeddings2)), 1)

    pair_indices = np.arange(pair_count)
    for fold, (train_idx, test_idx) in enumerate(splitter.split(pair_indices)):
        # FAR of every candidate threshold on the training pairs.
        far_train = np.zeros(threshold_count)
        for t, candidate in enumerate(thresholds):
            _, far_train[t] = calculate_val_far(
                candidate, dist[train_idx], actual_issame[train_idx])

        if np.max(far_train) >= far_target:
            to_threshold = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = to_threshold(far_target)
        else:
            # The target FAR is never reached on the training pairs.
            threshold = 0.0

        val[fold], far[fold] = calculate_val_far(
            threshold, dist[test_idx], actual_issame[test_idx])

    return np.mean(val), np.std(val), np.mean(far)
def crossValidation(self):
    """Train and evaluate an LSTM classifier on the first fold of a 10-fold split.

    Features are binarised (1 where a value is > 0, otherwise 0), the
    classifier is trained in mini-batches of 500 rows for 5000 rounds, and
    the confusion matrix of the test rows is printed.

    NOTE(review): the nesting below was reconstructed from a layout without
    indentation. It assumes the train/test evaluation runs once per training
    round and that ``break`` ends the fold loop after the first fold --
    confirm against the original file.
    """
    kf = KFold(n_splits=10)
    inputData = kf.split(self.features)
    # `count` is only used in the progress message and is never incremented.
    count = 1
    # `totalCM` is never updated or read in this method.
    totalCM = np.zeros([2,2])
    for train_index, test_index in inputData:
        X_train, X_test = self.features[train_index], self.features[test_index]
        #X_train, X_test = np.abs(X_train), np.abs(X_test)
        # print(X_train)
        y_train, y_test = self.labels[train_index], self.labels[test_index]
        # Width of one feature row, passed to the classifier constructor.
        num_feature = len(X_train[0])
        clf = deepLearning.LSTMClassifier(2, num_feature, learning_rate=1e-3, layer_num=2, hidden_size=50, timestep_size=60)
        clf.initGraph()
        clf.initOneHotEncoder4Y(y_train)
        def scala(x):
            # Binarise every value: 1 if strictly positive, else 0.
            res = []
            for i in range(len(x)):
                temp = []
                for n in x[i]:
                    v = 1 if n > 0 else 0
                    temp.append(v)
                res.append(temp)
            return np.array(res)
        # X_train, X_test = aaa(X_train, len(y_train)), aaa(X_test, len(y_test))
        X_train,X_test = X_train[:,:], X_test[:,:]
        X_train, X_test = scala(X_train), scala(X_test)
        y_train, y_test = np.array(y_train), np.array(y_test)
        for i in range(5000):
            print("这是第", count, "折,第", i, "轮训练。", len(y_train) )
            # Mini-batch size for one pass over the training rows.
            step = 500
            for j in range(0, len(y_train), step):
                batch_ys = clf.oneHotEncode(y_train[j: j+ step,:])
                batch_xs = np.array(X_train[j: j+ step,:]).astype(np.float32)
                clf.fit(batch_xs, batch_ys)
                #clf.fit(batch_xs + random.uniform(0,0.01), batch_ys)
            # Evaluate on the full training set.
            print("训练集")
            batch_ys = clf.oneHotEncode(y_train)
            batch_xs = np.array(X_train).astype(np.float32)
            clf.test(batch_xs, batch_ys)
            # Evaluate on the held-out set.
            print("测试机")
            batch_ys = clf.oneHotEncode(y_test)
            batch_xs = np.array(X_test).astype(np.float32)
            y_pre = clf.test(batch_xs, batch_ys)
            # Predicted class is 1 when the second score exceeds the first.
            y_pre = list(map(lambda x: 1 if x[0]<x[1] else 0, y_pre))
            # Each label is stored as a row; keep only its first element.
            y_real = list(map(lambda x:x[0], y_test))
            cm = confusion_matrix(y_real, y_pre, labels=[0, 1])
            #print(y_pre)
            #print(y_real)
            print(cm)
            print("*************************")
        # Only the first fold is processed.
        break
def trainPatient (X, y):
    """Set up a 5-fold split over one patient's training data.

    Called by ``__main__()``. ``y`` is 0 for interictal and 1 for preictal
    samples. As written, the function only creates the lazy split generator;
    no classifier is trained and nothing is returned.

    Usage example for the split:
        [svc.fit(X[train], y[train]) for train, test in k_fold.split(X)]
    """
    # Generator of (train_indices, test_indices) pairs, K = 5.
    dataSplitIndices = KFold(n_splits=5).split(X)
def k_fold(self, X, y):
    """Print the train/test indices of a shuffled 3-fold split of toy data.

    ``X`` and ``y`` are accepted for interface compatibility but are not
    used: the split is demonstrated on a fixed six-element sample.

    Args:
        X: Unused.
        y: Unused.
    """
    # `shuffle` and `random_state` are keyword-only in current scikit-learn,
    # so the former positional call KFold(3, True, 1) raises a TypeError.
    kf = KFold(n_splits=3, shuffle=True, random_state=1)
    data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
    for train_indices, test_indices in kf.split(data):
        print('Train: %s | test: %s' % (train_indices, test_indices))
def get_image_paths_and_labels_headcap(images_path, usage, nfold, ifold):
    """Collect image paths and labels for one fold of an unshuffled K-fold split.

    Every sub-directory of ``images_path`` (in sorted order) contributes its
    sorted ``*.png`` files and the labels read from the first ``*.txt`` file
    found in it. The label of a line is its last non-whitespace character,
    converted to ``int``.

    Args:
        images_path: Directory containing one sub-directory per group.
        usage: ``'Training'`` selects the training indices of the fold,
            ``'Test'`` the test indices; any other value selects nothing.
        nfold: Number of folds.
        ifold: Index of the fold to return.

    Returns:
        ``(image_paths, labels, usage, nrof_classes)`` where ``nrof_classes``
        is the number of distinct labels among the selected samples.

    Raises:
        IndexError: If a sub-directory has no ``*.txt`` file or ``ifold`` is
            out of range.
        ValueError: If a label line does not end in a digit.
    """
    image_paths = []
    labels = []

    for fold in sorted(os.listdir(images_path)):
        folder = os.path.join(images_path, fold)
        if not os.path.isdir(folder):
            continue
        image_paths += sorted(glob.glob(os.path.join(folder, '*.png')))
        label_txt = glob.glob(os.path.join(folder, '*.txt'))[0]
        with open(label_txt, 'r') as f:
            for line in f:
                # Strip the line ending first so the label is read correctly
                # even when the last line has no trailing newline or a line
                # carries trailing whitespace.
                line = line.rstrip()
                if not line:
                    continue
                labels.append(int(line[-1]))

    kf = KFold(n_splits=nfold, shuffle=False)
    idx_train, idx_test = list(kf.split(labels))[ifold]

    if usage == 'Training':
        selected = idx_train
    elif usage == 'Test':
        selected = idx_test
    else:
        selected = []

    image_paths_final = [image_paths[idx] for idx in selected]
    labels_final = [labels[idx] for idx in selected]
    nrof_classes = len(set(labels_final))
    return image_paths_final, labels_final, usage, nrof_classes
def run_kfold_on_model(df: DataFrame, exclude_cols: Iterable[str], target_col: str, model, lossfun):
    """Evaluate ``model`` with shuffled 10-fold cross-validation.

    Args:
        df: Source data frame, converted to ``(X, Y)`` by ``make_dataset``.
        exclude_cols: Columns forwarded to ``make_dataset`` for exclusion.
        target_col: Name of the target column.
        model: Estimator exposing ``fit(X, Y)`` and ``predict(X)``; it is
            refitted on every fold.
        lossfun: Callable ``lossfun(y_true, y_pred)`` returning the loss.

    Returns:
        List with one loss value per fold.
    """
    X, Y = make_dataset(df, exclude_cols, target_col)
    # `n_splits` is the keyword of the split()-based KFold API; the previous
    # `n_folds` keyword is rejected by it.
    kf = KFold(n_splits=10, shuffle=True)
    losses = []
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        model.fit(X_train, Y_train)
        # Predict from the held-out features (previously the targets were
        # passed to predict by mistake).
        pred = model.predict(X_test)
        losses.append(lossfun(Y_test, pred))
    return losses
def crossValidation(self):
    """Run 10-fold cross-validation of a stacking classifier and print results.

    Features are binarised (1 where a value is > 0, otherwise 0). For each
    fold a new ``StackingClassifier`` with five base models and an MLP meta
    model is fitted; confusion matrices for the training and test rows are
    printed. Test matrices are summed and the overall accuracy is printed
    after the last fold.
    """
    def binarize(rows):
        # 1 for strictly positive values, 0 otherwise.
        return np.array([[1 if value > 0 else 0 for value in row]
                         for row in rows])

    def shared_inputs(X):
        # Every base model receives the same feature matrix.
        return {"DT": X, 'KNN': X, 'mlp': X, "gbdt": X, 'LR': X}

    confusion_total = np.zeros([2, 2])
    folds = KFold(n_splits=10).split(self.features)
    for round_no, (train_index, test_index) in enumerate(folds, start=1):
        clf = StackingClassifier()
        clf.setBaseModels({
            "DT": DecisionTreeClassifier(),
            'KNN': KNeighborsClassifier(n_neighbors=20),
            "LR": LogisticRegression(max_iter=1000, solver='lbfgs', C=100),
            'mlp': MLPClassifier(hidden_layer_sizes=(100,)),
            "gbdt": GradientBoostingClassifier(n_estimators=20),
        })
        clf.setMetaModel(MLPClassifier(hidden_layer_sizes=(100,)))

        X_train = self.features[train_index]
        X_test = self.features[test_index]
        y_train = self.labels[train_index]
        y_test = self.labels[test_index]
        X_train = X_train[:, :]
        X_test = X_test[:, :]
        X_train = binarize(X_train)
        X_test = binarize(X_test)
        y_train = np.array(y_train)
        y_test = np.array(y_test)

        train_inputs = shared_inputs(X_train)
        clf.fit(train_inputs, y_train)
        pred_train = clf.predict(train_inputs)
        train_cm = confusion_matrix(list(y_train), pred_train)
        print("第", round_no, "轮训练集里的表现\n", train_cm)

        pred = clf.predict(shared_inputs(X_test))
        test_cm = confusion_matrix(list(y_test), pred, labels=[0, 1])
        confusion_total = confusion_total + np.array(test_cm)
        print("第", round_no, "轮测试集里的表现\n", test_cm)
        print("####################################")

    correct = confusion_total[0][0] + confusion_total[1][1]
    print("混淆矩阵的和是\n", confusion_total, "准确率是",
          correct / (sum(sum(confusion_total))))
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10, distance_metric=0, subtract_mean=False):
    """Compute fold-averaged ROC points and per-fold accuracy for embedding pairs.

    Pair distances come from ``distance(..., distance_metric)``. When
    ``subtract_mean`` is set, the mean of the training embeddings of the
    current fold is subtracted from all embeddings before the distances are
    computed. In every fold the threshold with the highest training
    accuracy is selected and scored on the held-out pairs; TPR and FPR are
    recorded on the held-out pairs for every candidate threshold.

    Args:
        thresholds: Sequence of candidate distance thresholds.
        embeddings1: 2-D array with the first embedding of every pair.
        embeddings2: 2-D array with the same shape as ``embeddings1``.
        actual_issame: Per-pair ground truth; must support indexing with an
            index array.
        nrof_folds: Number of folds.
        distance_metric: Selector forwarded to ``distance``.
        subtract_mean: Whether to centre embeddings on the training mean.

    Returns:
        ``(tpr, fpr, accuracy)`` where ``tpr`` and ``fpr`` hold one value per
        threshold (mean over folds) and ``accuracy`` one value per fold.
    """
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    pair_count = min(len(actual_issame), embeddings1.shape[0])
    threshold_count = len(thresholds)
    splitter = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, threshold_count))
    fprs = np.zeros((nrof_folds, threshold_count))
    accuracy = np.zeros((nrof_folds))

    pair_indices = np.arange(pair_count)
    for fold, (train_idx, test_idx) in enumerate(splitter.split(pair_indices)):
        mean = 0.0
        if subtract_mean:
            # Centre on the mean of this fold's training embeddings only.
            train_embeddings = np.concatenate(
                [embeddings1[train_idx], embeddings2[train_idx]])
            mean = np.mean(train_embeddings, axis=0)
        dist = distance(embeddings1 - mean, embeddings2 - mean, distance_metric)

        # Pick the threshold that performs best on the training pairs.
        train_acc = np.zeros((threshold_count))
        for t, candidate in enumerate(thresholds):
            _, _, train_acc[t] = calculate_accuracy(
                candidate, dist[train_idx], actual_issame[train_idx])
        best = np.argmax(train_acc)

        # ROC point of every candidate threshold on the held-out pairs.
        for t, candidate in enumerate(thresholds):
            tprs[fold, t], fprs[fold, t], _ = calculate_accuracy(
                candidate, dist[test_idx], actual_issame[test_idx])

        _, _, accuracy[fold] = calculate_accuracy(
            thresholds[best], dist[test_idx], actual_issame[test_idx])

    return np.mean(tprs, 0), np.mean(fprs, 0), accuracy
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10, distance_metric=0, subtract_mean=False):
    """Estimate the validation rate at a target false-accept rate (FAR).

    Pair distances come from ``distance(..., distance_metric)``, optionally
    after subtracting the mean of the current fold's training embeddings.
    In each fold the FAR of every candidate threshold is measured on the
    training pairs; the threshold for ``far_target`` is interpolated from
    those measurements and evaluated on the held-out pairs.

    Args:
        thresholds: Sequence of candidate distance thresholds.
        embeddings1: 2-D array with the first embedding of every pair.
        embeddings2: 2-D array with the same shape as ``embeddings1``.
        actual_issame: Per-pair ground truth; must support indexing with an
            index array.
        far_target: FAR at which the validation rate is reported.
        nrof_folds: Number of folds.
        distance_metric: Selector forwarded to ``distance``.
        subtract_mean: Whether to centre embeddings on the training mean.

    Returns:
        ``(val_mean, val_std, far_mean)`` computed over the folds.
    """
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    pair_count = min(len(actual_issame), embeddings1.shape[0])
    threshold_count = len(thresholds)
    splitter = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    pair_indices = np.arange(pair_count)
    for fold, (train_idx, test_idx) in enumerate(splitter.split(pair_indices)):
        mean = 0.0
        if subtract_mean:
            # Centre on the mean of this fold's training embeddings only.
            train_embeddings = np.concatenate(
                [embeddings1[train_idx], embeddings2[train_idx]])
            mean = np.mean(train_embeddings, axis=0)
        dist = distance(embeddings1 - mean, embeddings2 - mean, distance_metric)

        # FAR of every candidate threshold on the training pairs.
        far_train = np.zeros(threshold_count)
        for t, candidate in enumerate(thresholds):
            _, far_train[t] = calculate_val_far(
                candidate, dist[train_idx], actual_issame[train_idx])

        if np.max(far_train) >= far_target:
            to_threshold = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = to_threshold(far_target)
        else:
            # The target FAR is never reached on the training pairs.
            threshold = 0.0

        val[fold], far[fold] = calculate_val_far(
            threshold, dist[test_idx], actual_issame[test_idx])

    return np.mean(val), np.std(val), np.mean(far)
def crossValidation(self):
    """Run 10-fold cross-validation of a hard-voting ensemble on LDA features.

    For each fold an LDA projection is fitted on the training rows and
    applied to both splits, then a ``VotingClassifier`` (decision tree, MLP
    and KNN) is trained on the projected features. Confusion matrices for
    the training and test rows and the per-sample ``truth_prediction``
    strings are printed. Test matrices (labels ``'m'``, ``'f'``) are summed
    and the overall accuracy is printed after the last fold.
    """
    confusion_total = np.zeros([2, 2])
    folds = KFold(n_splits=10).split(self.features)
    for count, (train_index, test_index) in enumerate(folds, start=1):
        clfList = [['desisionTree', DecisionTreeClassifier()],
                   ['mlp', MLPClassifier()],
                   ['KNN', KNeighborsClassifier(n_neighbors=10)]]
        clf = VotingClassifier(clfList, voting='hard')

        X_train, X_test = self.features[train_index], self.features[test_index]
        y_train, y_test = self.labels[train_index], self.labels[test_index]

        # LDA yields at most min(n_features, n_classes - 1) components.
        # Older scikit-learn clipped a larger request silently, newer
        # releases raise ValueError, so clip the requested 50 explicitly.
        n_classes = len(np.unique(y_train))
        n_components = min(50, np.shape(X_train)[1], n_classes - 1)
        # Fit once (the projection used to be fitted twice on the same data).
        featureProcessor = LinearDiscriminantAnalysis(
            n_components=n_components).fit(X_train, y_train)
        X_train = featureProcessor.transform(X_train)
        X_test = featureProcessor.transform(X_test)

        clf.fit(X_train, y_train)
        pred_train = clf.predict(X_train)
        y_train = list(y_train)
        cm = confusion_matrix(y_train, pred_train)
        print("第", count, "轮训练集里的表现\n", cm)

        pred = clf.predict(X_test)
        y_test = list(y_test)
        cm = confusion_matrix(y_test, pred, labels=['m', 'f'])
        confusion_total = confusion_total + np.array(cm)
        print("第", count, "轮测试集里的表现\n", cm)
        # One "<truth>_<prediction>" string per test sample.
        res = [truth + '_' + guess for truth, guess in zip(y_test, pred)]
        print(res)
        print("####################################")

    correct = confusion_total[0][0] + confusion_total[1][1]
    print("混淆矩阵的和是\n", confusion_total, "准确率是",
          correct / confusion_total.sum())
def keras_eval(X_train_cv, y, X_test, test_ids, folds=5, nbags=5, nepochs=55):
    """Cross-validate bagged Keras models and write an averaged submission.

    For every fold, ``nbags`` models from ``nn_model()`` are trained on the
    training rows. Each model predicts the fold's held-out rows and the full
    test set. MAE is printed per bag, per fold and overall, and the test
    predictions averaged over all folds and bags are written to a gzipped
    CSV whose name contains the overall score and a timestamp.

    Args:
        X_train_cv: Training features; must support ``.iloc`` and ``.shape``.
        y: Training targets, indexable by an index array.
        X_test: Test features.
        test_ids: Identifiers written to the ``id`` column of the submission.
        folds: Number of K-fold splits.
        nbags: Number of models trained per fold.
        nepochs: Training epochs per model.
    """
    np.random.seed(123)
    start_time = timer(None)

    # Axis 0: sample, axis 1: fold, axis 2: bag.
    cv_pred = np.zeros((X_train_cv.shape[0], folds, nbags))
    test_pred = np.zeros((X_test.shape[0], folds, nbags))
    cv_score = np.zeros((folds, nbags))

    kf = KFold(n_splits=folds, shuffle=True, random_state=0)
    for i, (train_index, cv_index) in enumerate(kf.split(X_train_cv)):
        X_train, X_cv = X_train_cv.iloc[train_index, :], X_train_cv.iloc[cv_index, :]
        y_train, y_cv = y[train_index], y[cv_index]

        for j in range(nbags):
            model = nn_model()
            model.fit_generator(
                generator=batch_generator(X_train, y_train, 128, True),
                nb_epoch=nepochs,
                samples_per_epoch=X_train.shape[0],
                verbose=0)
            cv_pred[cv_index, i, j] = model.predict_generator(
                generator=batch_generatorp(X_cv, 800, False),
                val_samples=X_cv.shape[0])[:, 0]
            test_pred[:, i, j] = model.predict_generator(
                generator=batch_generatorp(X_test, 800, False),
                val_samples=X_test.shape[0])[:, 0]
            # Score this fold/bag only; the unrestricted slice
            # cv_pred[cv_index] is 3-D and does not match y_cv.
            cv_score[i, j] = mean_absolute_error(y_cv, cv_pred[cv_index, i, j])
            print('Fold {}, Bag {} - MAE: {}'.format(i, j, cv_score[i, j]))
        print(' Fold {} - MAE: {}\n'.format(i, cv_score.mean(1)[i]))

    # Each training row is held out in exactly one fold, so its fold axis
    # has a single non-zero entry: average over bags, then *sum* over folds.
    # A mean over folds would shrink every prediction by 1/folds.
    score = mean_absolute_error(y, cv_pred.mean(2).sum(1))
    print('Total - MAE: {}'.format(score))
    timer(start_time)

    print("#\n Writing results")
    # Test rows are predicted in every fold, so a plain mean is correct here.
    result = pd.DataFrame({'id': test_ids, 'loss': test_pred.mean(2).mean(1)})
    result = result.set_index("id")
    now = datetime.now()
    sub_file = 'submission_{}fold-{}bag-average-keras-{}-{}.csv.gz'.format(
        folds, nbags, score, now.strftime("%Y-%m-%d-%H-%M"))
    print("\n Writing submission: {}".format(sub_file))
    result.to_csv(sub_file, index=True, index_label='id', compression='gzip')
def Stacking(model,train_x,train_y, test,n_splits=5, random_state=None, shuffle=False):
    """Produce out-of-fold and averaged test predictions for stacking.

    The model is retrained on the training part of every fold; its
    predictions for the held-out part fill the out-of-fold vector, and its
    predictions for ``test`` are averaged over all folds. Only predictions
    are returned, no targets.

    Args:
        model: Object exposing ``train(x, y)`` and ``predict(x)``.
        train_x: Training features (pandas object, selected by position).
        train_y: Training targets (pandas object, selected by position).
        test: Test features; must expose ``.shape``.
        n_splits: Number of folds.
        random_state: Seed for shuffling; only applied when ``shuffle`` is
            true, because KFold rejects a seed without shuffling.
        shuffle: Whether to shuffle before splitting.

    Returns:
        ``(oof, test_mean)``: column vectors of shape ``(n_train, 1)`` and
        ``(n_test, 1)``.
    """
    df_kf_valid = np.zeros((train_x.shape[0],))
    df_kf_test = np.zeros((test.shape[0], n_splits))
    kf = KFold(n_splits=n_splits,
               random_state=random_state if shuffle else None,
               shuffle=shuffle)
    for i, (train_index, valid_index) in enumerate(kf.split(train_x)):
        # KFold yields positions, so select by position rather than by label
        # (reindex would return NaN rows for a non-default index).
        kf_train_x = train_x.iloc[train_index]
        kf_train_y = train_y.iloc[train_index]
        kf_valid_x = train_x.iloc[valid_index]
        model.train(kf_train_x, kf_train_y)
        # Out-of-fold predictions belong to the validation rows (they were
        # previously written to the training rows).
        df_kf_valid[valid_index] = np.ravel(model.predict(kf_valid_x))
        df_kf_test[:, i] = np.ravel(model.predict(test))
    # Average the per-fold test predictions: samples x 1.
    df_test = np.mean(df_kf_test, axis=1)
    return df_kf_valid.reshape(-1, 1), df_test.reshape(-1, 1)
"param_3", "price+", "item_seq_number+", ] from sklearn.model_selection import KFold kf = KFold(n_splits=10, random_state=42, shuffle=True) numIter = 0 rmse_sume = 0. numLimit = 1 # 5 for train_index, valid_index in kf.split(y): numIter +=1 if numIter>=numLimit+1: pass else: print("Modeling Stage ...") X_train, X_valid = X.tocsr()[train_index], X.tocsr()[valid_index] y_train, y_valid = y.iloc[train_index], y.iloc[valid_index] gc.collect() lgbm_params = { "tree_method": "feature", "num_threads": 11, # 3
recall_p = 0 Loop_n = 1 #循环次数 fold_n = 10 #n-折交叉验证:折数 for i in range(0, Loop_n): train = shuffle(data_train) x_columns = [x for x in train.columns if x not in [label, cardcol]] X = train[x_columns] y = train[label] X = np.array(X) y = np.array(y) kf = KFold(n_splits=fold_n) kf.get_n_splits(X) #给出K折的折数,输出为2 for train_index, test_index in kf.split(X): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] rf_model = RandomForestClassifier(oob_score=True, random_state=10) time1 = time.time() rf_model.fit(X_train, y_train) time2 = time.time() print "rf_model used time: %f sec" % (time2 - time1) #时间 second pred_test = rf_model.predict(X_test) temp_m = confusion_matrix(y_test, pred_test) precision_p = precision_p + float(temp_m[1][1]) / float( (temp_m[0][1] + temp_m[1][1])) recall_p = recall_p + float(temp_m[1][1]) / float(
train['new2'] = train['V2'] + train['V3'] + train['V4'] test['new2'] = test['V2'] + test['V3'] + test['V4'] train_x = train.drop(['target'], axis=1) Y = train['target'] import lightgbm as lgb from sklearn.model_selection import KFold kf = KFold(n_splits=5, shuffle=True, random_state=2) p = [] test_err = 0 res = np.zeros((test.shape[0], 5)) for k, (train_index, test_index) in enumerate(kf.split(train_x)): x, test_x = train_x.loc[train_index], train_x.loc[test_index] y, test_y = Y[train_index], Y[test_index] lgb_model = lgb.LGBMRegressor( boosting_type='gbdt', max_depth=-1, learning_rate=0.01, n_estimators=5000, objective='regression', ) lgb_model.fit( x, y, eval_set=[(x, y), (test_x, test_y)], eval_names=['Train', 'Test'],
from sklearn.metrics import mean_squared_error from sklearn import feature_selection from sklearn.model_selection import train_test_split from sklearn import preprocessing from sklearn.model_selection import KFold # Viz lib import matplotlib if os.environ.get('DISPLAY', '') == '': print('no display found. Using non-interactive Agg backend') matplotlib.use('agg') import seaborn as sns import matplotlib.pyplot as plt kf = KFold(n_splits=5, shuffle=True) if useKfold: for idx, (train_index, test_index) in enumerate(kf.split(X_tr)): print('+++++ CV at fold number ', idx) X_train, X_test = X_tr[train_index], X_tr[test_index] y_train, y_test = y_tr[train_index], y_tr[test_index] X_tr_lgb = lgb.Dataset(X_train, label=y_train, feature_name=feature_names, categorical_feature=cat_cols) X_va_lgb = lgb.Dataset(X_test, label=y_test, feature_name=feature_names, categorical_feature=cat_cols, reference=X_tr_lgb) model = lgb.train(parameters, X_tr_lgb, valid_sets=[X_tr_lgb, X_va_lgb],
def get_image_paths_and_labels_hand(images_path, labelfile, nfold, ifold):
    """Split labelled hand images into the train/test parts of one shuffled fold.

    The label of an image is looked up in the spreadsheet ``labelfile`` by
    frame number. The frame number is parsed from the image file name: the
    part before the first ``'_'``, with its first five characters removed.

    Args:
        images_path: Directory containing the ``*.png`` images.
        labelfile: Excel file with ``PersonID`` and ``Frame`` columns.
        nfold: Number of folds (now honoured on legacy scikit-learn too,
            where 10 used to be hard-coded).
        ifold: Index of the fold to return.

    Returns:
        ``(train_paths, train_labels, nrof_classes, test_paths, test_labels,
        nrof_classes_test)``.

    Raises:
        ValueError: If a frame number is missing from the spreadsheet or a
            file name does not contain a parsable frame number.
        IndexError: If ``ifold`` is out of range.
    """
    import re

    datal = pandas.read_excel(labelfile)
    labels_all_list = datal['PersonID'].values.tolist()
    labels_frm_list = datal['Frame'].values.tolist()

    # Frame -> person lookup; the first occurrence of a frame wins, which
    # matches the behaviour of the former list.index() search.
    person_by_frame = {}
    for frame, person in zip(labels_frm_list, labels_all_list):
        person_by_frame.setdefault(frame, person)

    image_paths = sorted(glob.glob(os.path.join(images_path, '*.png')))
    labels = []
    for imgfile in image_paths:
        # basename() also handles Windows path separators.
        stem = os.path.basename(imgfile).split('_')[0]
        framenum = int(stem[5:])
        if framenum not in person_by_frame:
            raise ValueError('%r is not in list' % (framenum,))
        labels.append(person_by_frame[framenum])

    # Compare the version numerically; comparing the raw strings is
    # lexicographic and misorders versions such as '0.9' and '0.20'.
    version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    legacy_api = (version is not None
                  and (int(version.group(1)), int(version.group(2))) < (0, 20))
    if legacy_api:
        # Before sklearn 0.20: the KFold object itself is iterable.
        splits = list(KFold(n=len(labels), n_folds=nfold, shuffle=True))
    else:
        # From sklearn 0.20 on: folds come from KFold.split().
        splits = list(KFold(n_splits=nfold, shuffle=True).split(labels))
    idx_train, idx_test = splits[ifold]

    image_paths_final = [image_paths[idx] for idx in idx_train]
    labels_final = [labels[idx] for idx in idx_train]
    image_paths_final_test = [image_paths[idx] for idx in idx_test]
    labels_final_test = [labels[idx] for idx in idx_test]

    nrof_classes = len(set(labels_final))
    nrof_classes_test = len(set(labels_final_test))
    return (image_paths_final, labels_final, nrof_classes,
            image_paths_final_test, labels_final_test, nrof_classes_test)
num_leaves=85, max_depth=15, learning_rate=0.003, n_estimators=3677, subsample_for_bin=400000, objective="binary", min_split_gain=0.0, min_child_weight=0.01, min_child_samples=50, subsample=0.8, subsample_freq=1, colsample_bytree=0.7, reg_alpha=5.0, reg_lambda=0.0, silent=True) kf = KFold(n_splits=5) for n_fold, (train_index, test_index) in enumerate(kf.split(train_X)): print n_fold X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index] y_train, y_test = train_y[train_index], train_y[test_index] model_1.fit(X_train, y_train) #prediction = model_1.predict_proba(X_test) #train_score.append(prediction[:,1]) oof_train[test_index] = model_1.predict_proba(X_test)[:, 1] oof_test_skf[n_fold, :] = model_1.predict_proba(test_X)[:, 1] oof_test[:] = oof_test_skf.mean(axis=0) te['buy'] = oof_test tr['buy'] = oof_train
import numpy as np

# Toy data set modeled off of the Iris data set. NumPy arrays are used so
# that the index arrays yielded by KFold.split() can select rows directly;
# plain Python lists cannot be indexed with an index array.
data = np.array([[3.0, 1.2, 4.5],
                 [2.8, 2.0, 5.6],
                 [1.4, 2.2, 5.2],
                 [2.5, 1.5, 6.3],
                 [3.1, 1.7, 5.7]])
# Toy target set modeled off of the Iris target set
target = np.array([0, 0, 1, 2, 1])
# The number of splits I want. KFold cannot create more folds than there are
# samples, so the requested 10 is capped at the size of the data set.
n = min(10, len(data))
# Get the KFolder, telling it the number of ways you want it to be split and
# that you want the data to be selected randomly. The split()-based API takes
# `n_splits`; the sample count is no longer passed to the constructor.
kf = KFold(n_splits=n, shuffle=True)
# Store what your classifier returns in a list. There are alternate ways of
# doing this step.
predictions = []
# The print statements are only here to show what each iteration yields.
for train_index, test_index in kf.split(data):
    print(train_index)  # indices of the training rows
    print(test_index)  # indices of the held-out rows
    print(data[train_index])  # proof that it selects those rows
    print(data[test_index])  # a different selection on every iteration
    # Collect all of your predictions. This is optional, depending on what
    # else you are doing. Both target and data get indexed -- this is
    # important! `classifier` is expected to be defined by the reader.
    predictions.append(classifier(data[train_index],
                                  data[test_index],
                                  target[train_index],
                                  target[test_index]))
#Model Score print("The coefficient of determination for the Random Forest model is: %.4f" % iris_rf.score(irisX, irisY)) # # K- Fold Cross Validation # In[6]: from sklearn.model_selection import KFold from sklearn.metrics import confusion_matrix x = irisX y = irisY kf = KFold(n_splits=5, random_state=None, shuffle=True) kf.get_n_splits(x) for train_i, test_i in kf.split(x): print("TRAIN:", train_i, "TEST:", test_i) X_train, X_test = x[train_i], x[test_i] y_train, y_test = y[train_i], y[test_i] # # 2. KFold Score # We use cross validation so as to better predict the test error and gauge the accuracy of our model by using such a prediction. it is used over a validation set so as to not decrease the size of our training data too much as that raises error. # In[7]: #K- Fold Score from sklearn.model_selection import cross_val_score from sklearn.model_selection import cross_val_predict from sklearn import metrics iris_dtree.fit(X_train, y_train)
def crossValidation(self):
    """Run 10-fold cross-validation of an MLP classifier on LDA features.

    The feature rows of each fold are copied into a dense 2-D array, an LDA
    projection is fitted on the training rows and applied to both splits,
    and an ``MLPClassifier`` is trained on the projected features.
    Confusion matrices for the training and test rows are printed; the test
    matrices (labels ``0``, ``1``) are summed and the overall accuracy is
    printed after the last fold.
    """
    def to_dense(X, L):
        # Copy the first L rows element by element into a regular 2-D array.
        return np.array([list(X[i]) for i in range(L)])

    confusion_total = np.zeros([2, 2])
    folds = KFold(n_splits=10).split(self.features)
    for count, (train_index, test_index) in enumerate(folds, start=1):
        clf = MLPClassifier(hidden_layer_sizes=(500, ))

        X_train, X_test = self.features[train_index], self.features[test_index]
        y_train, y_test = self.labels[train_index], self.labels[test_index]
        print(len(X_train), 'asd', len(y_train))

        X_train = to_dense(X_train, len(y_train))
        X_test = to_dense(X_test, len(y_test))
        X_train, X_test = X_train[:, :], X_test[:, :]
        y_train, y_test = np.array(y_train), np.array(y_test)

        # LDA yields at most min(n_features, n_classes - 1) components.
        # Older scikit-learn clipped a larger request silently, newer
        # releases raise ValueError, so clip the requested 200 explicitly.
        n_classes = len(np.unique(y_train))
        n_components = min(200, X_train.shape[1], n_classes - 1)
        featureProcessor = LinearDiscriminantAnalysis(
            n_components=n_components).fit(X_train, y_train)
        X_train = featureProcessor.transform(X_train)
        X_test = featureProcessor.transform(X_test)

        clf.fit(X_train, y_train)
        pred_train = clf.predict(X_train)
        y_train = list(y_train)
        cm = confusion_matrix(y_train, pred_train)
        print("第", count, "轮训练集里的表现\n", cm)

        pred = clf.predict(X_test)
        y_test = list(y_test)
        cm = confusion_matrix(y_test, pred, labels=[0, 1])
        confusion_total = confusion_total + np.array(cm)
        print("第", count, "轮测试集里的表现\n", cm)
        print("####################################")

    correct = confusion_total[0][0] + confusion_total[1][1]
    print("混淆矩阵的和是\n", confusion_total, "准确率是",
          correct / confusion_total.sum())
random_state=12345) f, axes = plt.subplots(1, 2, figsize=(10, 5)) axes[0].scatter(X[y == 0, 0], X[y == 0, 1], color='blue', s=2, label='y=0') axes[0].scatter(X[y != 0, 0], X[y != 0, 1], color='red', s=2, label='y=1') axes[0].set_xlabel('X[:,0]') axes[0].set_ylabel('X[:,1]') axes[0].legend(loc='lower left', fontsize='small') k_fold = KFold(y, n_folds=FOLDS, shuffle=True, random_state=12345) predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=12345) y_real = [] y_proba = [] for i, (train_index, test_index) in enumerate(k_fold.split(X)): Xtrain, Xtest = X[train_index], X[test_index] ytrain, ytest = y[train_index], y[test_index] predictor.fit(Xtrain, ytrain) pred_proba = predictor.predict_proba(Xtest) precision, recall, _ = precision_recall_curve(ytest, pred_proba[:, 1]) lab = 'Fold %d AUC=%.4f' % (i + 1, auc(recall, precision)) axes[1].step(recall, precision, label=lab) y_real.append(ytest) y_proba.append(pred_proba[:, 1]) y_real = numpy.concatenate(y_real) y_proba = numpy.concatenate(y_proba) precision, recall, _ = precision_recall_curve(y_real, y_proba) lab = 'Overall AUC=%.4f' % (auc(recall, precision)) axes[1].step(recall, precision, label=lab, lw=2, color='black')
def crossValidation(self):
    """Train and evaluate an LSTM classifier on the first fold of a 10-fold split.

    Feature rows are copied into a dense array and each row is divided by
    its own sum (plus a small constant). The classifier is trained in
    mini-batches of 200 rows for 500 rounds and evaluated on the training
    and test rows.

    NOTE(review): the nesting below was reconstructed from a layout without
    indentation. It assumes the train/test evaluation runs once per training
    round and that ``break`` ends the fold loop after the first fold --
    confirm against the original file.
    """
    kf = KFold(n_splits=10)
    inputData = kf.split(self.features)
    # `count` is only used in the progress message and is never incremented.
    count = 1
    # `totalCM` is never updated or read in this method.
    totalCM = np.zeros([2, 2])
    for train_index, test_index in inputData:
        X_train, X_test = self.features[train_index], self.features[test_index]
        #X_train, X_test = np.abs(X_train), np.abs(X_test)
        # print(X_train)
        y_train, y_test = self.labels[train_index], self.labels[test_index]
        # Width of one feature row, passed to the classifier constructor.
        num_feature = len(X_train[0])
        clf = deepLearning.LSTMClassifier(2, num_feature, learning_rate=1e-3, layer_num=2, hidden_size=50, timestep_size=350)
        clf.initGraph()
        clf.initOneHotEncoder4Y(y_train)
        def aaa(X, L):
            # Copy the first L rows element by element into a 2-D array.
            res = []
            for i in range(L):
                line = X[i]
                tres = []
                for n in line:
                    tres.append(n)
                res.append(tres)
            return np.array(res)
        def scala(data):
            # Divide every row by its sum; the constant avoids division by 0.
            res = []
            for i in range(len(data)):
                res.append(data[i] / (0.0000001 + sum(data[i])))
            return np.array(res)
        X_train, X_test = aaa(X_train, len(y_train)), aaa(X_test, len(y_test))
        # These scalers are imported but only referenced in the
        # commented-out lines below.
        from sklearn.preprocessing import RobustScaler, StandardScaler
        #norm_x = StandardScaler().fit(X_train)
        #X_train, X_test = norm_x.transform(X_train), norm_x.transform(X_test)
        X_train, X_test = X_train[:, :], X_test[:, :]
        X_train, X_test = scala(X_train), scala(X_test)
        y_train, y_test = np.array(y_train), np.array(y_test)
        for i in range(500):
            print("这是第", count, "折,第", i, "轮训练。", len(y_train))
            # One pass over the training rows in mini-batches of 200.
            for j in range(0, len(y_train), 200):
                batch_ys = clf.oneHotEncode(y_train[j:j + 200, :])
                batch_xs = np.array(X_train[j:j + 200, :]).astype(np.float32)
                clf.fit(batch_xs, batch_ys)
            # Evaluate on the full training set.
            print("训练集")
            batch_ys = clf.oneHotEncode(y_train)
            batch_xs = np.array(X_train).astype(np.float32)
            clf.test(batch_xs, batch_ys)
            # Evaluate on the held-out set.
            print("测试机")
            batch_ys = clf.oneHotEncode(y_test)
            batch_xs = np.array(X_test).astype(np.float32)
            clf.test(batch_xs, batch_ys)
            print("*************************")
        # Only the first fold is processed.
        break
# # Fradient Boosting import lightgbm as lgb from sklearn.linear_model import Ridge from sklearn.cross_validation import KFold NFOLDS = 5 SEED = 42 if os.path.exists("../tmp/oof_index.dat"): with open("../tmp/oof_index.dat", "rb") as f: kfolds = dill.load(f) else: dftrain_tmp = pd.read_csv("../input/train.csv") fold = KFold(n_splits=5, shuffle=True, random_state=1234) kfolds = list(fold.split(dftrain_tmp)) with open("../tmp/oof_index.dat", "wb") as f: dill.dump(kfolds, f) del dftrain_tmp; gc.collect() print("Creating Ridge Features...") class SklearnWrapper(object): def __init__(self, clf, seed=0, params=None, seed_bool = True): if(seed_bool == True): params['random_state'] = seed self.clf = clf(**params) def train(self, x_train, y_train): self.clf.fit(x_train, y_train) def predict(self, x):
# `scipy.interp` was only an alias of `numpy.interp` and is no longer
# available in recent SciPy releases; import the same function from NumPy so
# the name `interp` keeps working.
from numpy import interp
from sklearn.multiclass import OneVsRestClassifier

plt.style.use('ggplot')

# Synthetic binary classification problem with 30% randomly flipped labels.
X, y = make_classification(n_samples=500, random_state=100, flip_y=0.3)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
clf = LinearDiscriminantAnalysis()
pipe = Pipeline([('scaler', StandardScaler()), ('clf', clf)])

tprs = []
aucs = []
# Common FPR grid on which every fold's ROC curve is resampled so that the
# curves can be averaged point by point.
base_fpr = np.linspace(0, 1, 101)
colors = ['darksalmon', 'gold', 'royalblue', 'mediumseagreen', 'violet']

for i, (train, test) in enumerate(kf.split(X, y)):
    model = pipe.fit(X[train], y[train])
    y_score = model.predict_proba(X[test])
    fpr, tpr, _ = roc_curve(y[test], y_score[:, 1])
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    #plt.plot(fpr, tpr, lw=1, alpha=0.6, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc), c = colors[i])
    tpr = interp(base_fpr, fpr, tpr)
    # Force the resampled curve to start at the origin.
    tpr[0] = 0.0
    tprs.append(tpr)

tprs = np.array(tprs)
mean_tprs = tprs.mean(axis=0)
std = tprs.std(axis=0)
mean_auc = auc(base_fpr, mean_tprs)
def crossValidation(self):
    """Run 10-fold cross-validation of a hard-voting ensemble and print results.

    Every feature row is divided by its own median (plus a small constant).
    For each fold a ``VotingClassifier`` over six base estimators is fitted;
    confusion matrices for the training and test rows are printed. The test
    matrices (labels ``0``, ``1``) are summed and the overall accuracy is
    printed after the last fold.
    """
    def scala(x):
        # Scale each row by its median; the constant avoids division by 0.
        return np.array([row / (0.0000001 + np.median(row)) for row in x])

    confusion_total = np.zeros([2, 2])
    folds = KFold(n_splits=10).split(self.features)
    for count, (train_index, test_index) in enumerate(folds, start=1):
        # A RandomForestClassifier used to be built here and was immediately
        # replaced by the ensemble below; that dead construction is removed.
        clfList = [
            ["AD", AdaBoostClassifier(
                base_estimator=DecisionTreeClassifier(max_depth=10),
                n_estimators=20)],
            ["gbdt", GradientBoostingClassifier(n_estimators=20)],
            ["LR", LogisticRegression(max_iter=1000, solver='lbfgs', C=100)],
            ['desisionTree', DecisionTreeClassifier()],
            ['mlp', MLPClassifier(hidden_layer_sizes=(100, ))],
            ['KNN', KNeighborsClassifier(n_neighbors=20)],
        ]
        clf = VotingClassifier(clfList, voting='hard')

        X_train, X_test = self.features[train_index], self.features[test_index]
        y_train, y_test = self.labels[train_index], self.labels[test_index]

        X_train, X_test = X_train[:, :], X_test[:, :]
        X_train, X_test = scala(X_train), scala(X_test)
        y_train, y_test = np.array(y_train), np.array(y_test)

        clf.fit(X_train, y_train)
        pred_train = clf.predict(X_train)
        y_train = list(y_train)
        cm = confusion_matrix(y_train, pred_train)
        print("第", count, "轮训练集里的表现\n", cm)

        pred = clf.predict(X_test)
        y_test = list(y_test)
        cm = confusion_matrix(y_test, pred, labels=[0, 1])
        confusion_total = confusion_total + np.array(cm)
        print("第", count, "轮测试集里的表现\n", cm)
        print("####################################")

    correct = confusion_total[0][0] + confusion_total[1][1]
    print("混淆矩阵的和是\n", confusion_total, "准确率是",
          correct / confusion_total.sum())