def stacking_proba(clf,X_train,y,X_test,nfolds=5,random_seed=2017,return_score=False,
                   shuffle=True,metric='acc',clf_name='UnKnown'):
    """Produce out-of-fold and test-set class probabilities for stacking.

    The classifier is fitted once per stratified fold; the probabilities it
    predicts for the held-out rows are written into the matching rows of the
    training output. It is then refitted on the full training set to predict
    ``X_test``.

    Args:
        clf: estimator exposing ``fit`` and ``predict_proba``.
        X_train: 2-D array indexable as ``X_train[idx, :]``.
        y: 1-D array of class labels aligned with ``X_train``.
        X_test: array passed to ``clf.predict_proba`` after the final fit.
        nfolds: number of stratified folds.
        random_seed: seed for the splitter; only applied when ``shuffle`` is true.
        return_score: when true, also return the mean per-fold accuracy.
        shuffle: whether the splitter shuffles samples before splitting.
        metric: unused by the computation (kept for interface compatibility).
        clf_name: unused by the computation (kept for interface compatibility).

    Returns:
        ``(train_proba, test_proba)`` or ``(train_proba, test_proba, score)``.
        For exactly two classes only the second probability column is
        returned (1-D arrays); otherwise the full probability matrices.
    """
    # Recent scikit-learn raises ValueError if random_state is set while
    # shuffle is False, so only forward the seed when shuffling.
    folds = StratifiedKFold(n_splits=nfolds, shuffle=shuffle,
                            random_state=random_seed if shuffle else None)

    n_classes = np.unique(y).shape[0]

    # Out-of-fold probabilities for the training set.
    train_stacking_proba = np.zeros((X_train.shape[0], n_classes))
    score = 0
    for train_index, validate_index in folds.split(X_train, y):
        X_train_fold = X_train[train_index, :]
        y_train_fold = y[train_index]
        X_validate_fold = X_train[validate_index, :]
        y_validate_fold = y[validate_index]

        clf.fit(X_train_fold, y_train_fold)
        fold_preds = clf.predict_proba(X_validate_fold)
        train_stacking_proba[validate_index, :] = fold_preds

        # Validation accuracy: translate the argmax column back to a label so
        # the comparison is correct for any label encoding, not only 0..K-1.
        classes = getattr(clf, 'classes_', None)
        if classes is None:
            classes = np.unique(y_train_fold)
        fold_labels = np.asarray(classes)[np.argmax(fold_preds, axis=1)]
        fold_score = float(np.mean(fold_labels == np.asarray(y_validate_fold)))
        score += fold_score
    score /= nfolds

    # Probabilities for the test set from a model fitted on all training data.
    clf.fit(X_train, y)
    test_stacking_proba = clf.predict_proba(X_test)

    if n_classes == 2:
        # Binary classification: only return the positive-class probability.
        train_stacking_proba = train_stacking_proba[:, 1]
        test_stacking_proba = test_stacking_proba[:, 1]

    if return_score:
        return train_stacking_proba, test_stacking_proba, score
    return train_stacking_proba, test_stacking_proba
def _get_fold_generator(target_values):
    """Return a generator of (train, test) index splits over ``target_values``.

    Uses a shuffled ``StratifiedKFold`` (stratifying on the targets
    themselves) when ``params.stratified_cv`` is set, otherwise a shuffled
    ``KFold``. Both use ``params.n_cv_splits`` folds and ``cfg.RANDOM_SEED``.
    """
    if not params.stratified_cv:
        plain_cv = KFold(n_splits=params.n_cv_splits,
                         shuffle=True,
                         random_state=cfg.RANDOM_SEED)
        return plain_cv.split(target_values)

    stratified_cv = StratifiedKFold(n_splits=params.n_cv_splits,
                                    shuffle=True,
                                    random_state=cfg.RANDOM_SEED)
    stratified_cv.get_n_splits(target_values)
    # The targets double as the "features" argument; only their length matters.
    return stratified_cv.split(target_values, target_values)
def stratified_cross_validate(self, k):
    """Yield ``(X_train, y_train, X_test, y_test)`` tuples for cross-validation.

    The training and testing sets stored on the instance are merged and
    shuffled in place with ``np.random.shuffle``. The generator first yields
    the folds of a 2-split ``StratifiedKFold`` and then ``k`` contiguous
    (non-stratified) folds of ``len(data) // k`` rows each.

    Args:
        k: number of contiguous folds produced by the second loop.

    Yields:
        Tuples ``(train_inputs, train_labels, test_inputs, test_labels)``.
    """
    attributes = np.append(self.training_attributes, self.testing_attributes, axis=0)
    labels = np.append(self.training_labels, self.testing_labels, axis=0)
    # Glue each label onto the end of its attribute row so that a single
    # shuffle keeps rows and labels aligned.
    all_data = np.array([np.append(row, label) for row, label in zip(attributes, labels)])

    np.random.shuffle(all_data)
    X = all_data[:, :-1]
    y = all_data[:, -1]
    print(X.shape, y.shape)

    # NOTE(review): n_splits is hard-coded to 2 and ignores ``k``; the manual
    # k-fold loop below looks like an earlier implementation left in place.
    # Confirm which of the two is intended before removing either.
    skf = StratifiedKFold(n_splits=2)
    print(skf.get_n_splits(X, y))
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        yield (X_train, y_train, X_test, y_test)

    for i in range(k):
        # Floor division: a float fold size (true division on Python 3) is
        # rejected both as a slice bound and by np.delete's index array.
        split = len(all_data) // k
        start, stop = i * split, (i + 1) * split
        test_data = all_data[start:stop, :]
        train_data = np.delete(all_data, np.arange(start, stop), axis=0)
        train_input, train_output = train_data[:, :-1], train_data[:, -1]
        test_input, test_output = test_data[:, :-1], test_data[:, -1]
        yield (train_input, train_output, test_input, test_output)
def train_val(X_trainval, y_trainval, temp_dir, current_ration, test_count, hyper_count):
    """Run the inner stratified cross-validation loop and collect its metrics.

    For every fold the training part is re-balanced with a mix of over- and
    under-sampling, the validation part is under-sampled, both are written as
    TSV files into ``temp_dir`` and handed to ``bert`` for training/evaluation.

    Args:
        X_trainval: features; indexed with ``.iloc`` (presumably a pandas
            DataFrame -- confirm against caller).
        y_trainval: labels; indexed with ``.iloc`` and summarised with
            ``value_counts()`` (presumably a two-class pandas Series).
        temp_dir: directory receiving the per-fold TSV files.
        current_ration: fraction of the class-count difference closed by
            over-sampling; the remainder ``1 - current_ration`` is closed by
            under-sampling.
        test_count: forwarded unchanged to ``bert``.
        hyper_count: forwarded unchanged to ``bert``.

    Returns:
        dict with the per-fold loss/metric lists, the mean validation loss and
        the per-fold class distributions (train, validation, under-sampled
        validation).
    """
    # construct the validation set by stratified cross-validation
    skf_train_val = StratifiedKFold(n_splits=VAL_FOLD, random_state=RANDOM_STATE, shuffle=True)
    skf_train_val.get_n_splits(X_trainval, y_trainval)
    fold_count = 1
    # per-fold accumulators
    all_train_loss = []
    all_val_loss = []
    all_val_acc = []
    all_val_f1 = []
    all_val_precision = []
    all_val_recall = []
    all_num_train = []
    all_num_val = []
    all_num_under_val = []
    for train_index, val_index in skf_train_val.split(X_trainval, y_trainval):
        X_train = X_trainval.iloc[train_index]
        y_train = y_trainval.iloc[train_index]
        X_val = X_trainval.iloc[val_index]
        y_val = y_trainval.iloc[val_index]
        # calculating the required OS & US samples
        print("-" * 70)
        print("START SAMPLING TRAIN SET [{}] ".format(fold_count))
        start_time = time.time()
        # Unpacking into two names requires exactly two distinct labels.
        # NOTE(review): if y_train is a pandas Series, value_counts() orders by
        # frequency, so these are (majority, minority) counts rather than
        # counts of label 0 / label 1 -- confirm.
        num_class0, num_class1 = y_train.value_counts()
        diff = num_class0 - num_class1
        num_os_instance = int(diff * current_ration)
        num_us_instance = int(diff * (1 - current_ration))
        # performing OS & US by Resampling
        sample_train = pd.concat([y_train, X_train], axis=1)
        sample_train_over = oversampling(sample_train, num_os_instance)
        sample_train_over_under = undersampling(sample_train_over, num_us_instance)
        end_time = time.time()
        print("FINISH SAMPLING TRAIN SET [{}]: {}".format(
            fold_count, (end_time - start_time)))
        # undersampling val set by resampling
        num_class0, num_class1 = y_val.value_counts()
        all_num_val.append([num_class0, num_class1])
        sample_val = pd.concat([y_val, X_val], axis=1)
        num_us_instance = num_class0 - num_class1
        under_sample_val = undersampling(sample_val, num_us_instance)
        # saving undersampled val data
        # (the path does not include the fold index, so each fold overwrites it)
        under_val_path = temp_dir + '/sample_val_under.tsv'
        under_sample_val.to_csv(under_val_path, sep='\t', encoding="utf-8", index=False, header=False)
        num_class0, num_class1 = under_sample_val['y'].value_counts()
        all_num_under_val.append([num_class0, num_class1])
        # saving sampled train data (also overwritten on every fold)
        train_path = temp_dir + '/sample_train_aug.tsv'
        sample_train_over_under.to_csv(train_path, sep='\t', encoding="utf-8", index=False, header=False)
        num_class0, num_class1 = sample_train_over_under['y'].value_counts()
        all_num_train.append([num_class0, num_class1])
        # drop the fold's frames before the model is trained
        del sample_train, X_train, y_train
        del sample_val, X_val, y_val
        del sample_train_over, sample_train_over_under
        # processing to model classifier (BERT)
        history = bert(train_path, under_val_path, INPUT_EPOCH, EVAL_STEPS, test_count, hyper_count, fold_count, predict=False)
        # collecting the train loss of this fold
        all_train_loss.append(history['train_loss'])
        # collecting val loss, accuracy, f1, precision and recall of this fold
        all_val_loss.append(history['val_loss'])
        all_val_acc.append(history['val_acc'])
        all_val_f1.append(history['val_f1'])
        all_val_precision.append(history['val_precision'])
        all_val_recall.append(history['val_recall'])
        # incrementing the fold index, and repeat above process
        fold_count = fold_count + 1
    # return back the ICV log
    results = {
        'final_all_train_loss': all_train_loss,
        'final_all_val_loss': all_val_loss,
        'final_all_val_acc': all_val_acc,
        'final_all_val_f1': all_val_f1,
        'final_all_val_precision': all_val_precision,
        'final_all_val_recall': all_val_recall,
        'final_avg_val_loss': np.mean(all_val_loss),
        'train_distribution': all_num_train,
        'val_distribution': all_num_val,
        'under val val_distribution': all_num_under_val
    }
    return results
def cross_validation(self):
    '''
    Run a 6-fold stratified cross-validation and draw the ROC curves.

    For every fold the model is re-initialised, fitted on the training part
    and evaluated on the held-out part. Per-fold ROC curves, their
    interpolated mean and the curve produced by ``plot_roc`` on the data read
    from ``self.fname`` are drawn on one figure. The figure is saved as a
    time-stamped .eps file under ``../data/`` and then displayed.
    '''
    self._load_data()
    mean_tpr = 0.0
    mean_fpr = numpy.linspace(0, 1, 100)
    colors = ['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange']
    lw = 2
    plt.figure(figsize=(10, 10))
    cvscores = []
    kfold = StratifiedKFold(n_splits=6, shuffle=True, random_state=seed)
    folds = zip(kfold.split(self.X_train, self.y_train), colors)
    for i, ((train, test), color) in enumerate(folds):
        self._init_model(verbose=False)
        # Fit the model
        self.model.fit(self.X_train[train], self.y_train[train],
                       nb_epoch=self.nb_epoch, batch_size=self.batch_size,
                       verbose=self.verbose)
        # Evaluate the model; scores[1] is reported as a percentage
        scores = self.model.evaluate(
            self.X_train[test], self.y_train[test], verbose=self.verbose)
        print("%s: %.2f%%" % (self.model.metrics_names[1], scores[1] * 100))
        cvscores.append(scores[1] * 100)
        # Compute the fold's ROC curve and accumulate it on the common FPR
        # grid so the curves can be averaged afterwards
        probas_ = self.model.predict(self.X_train[test])
        fpr, tpr, _ = roc_curve(self.y_train[test], probas_[:, 0])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color,
                 label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    cv_results = "%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores))
    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')

    mean_tpr /= kfold.get_n_splits(self.X_train, self.y_train)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)

    df = pd.read_csv(self.fname)
    y_true = df.pop('target')
    plot_roc(df, y_true)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Training cross-validation ROC\nAccuracy:' + cv_results)
    plt.legend(loc="lower right")

    # Save BEFORE showing: once a blocking plt.show() returns the figure may
    # already be gone, in which case savefig would write an empty canvas.
    plt.savefig('../data/cnn_cv_'+ datetime.datetime.now().strftime('%Y%m%d-%H.%M.%S') +'.eps',
                format='eps', dpi=600)
    plt.show()
    plt.close()
    print('Saving ROC plot in .eps in data folder...')
def kfold_plot(self, train, ytrain, model):
    """Cross-validate ``model`` over 5 stratified folds and plot the ROC curves.

    ``model`` is called as ``model(X_train, X_test, y_train)`` and must return
    scores for the held-out rows. Each fold's ROC curve and the interpolated
    mean curve are plotted and shown.

    Returns:
        ``(scores, mean_score, mean_time)``: the list of per-fold ROC AUC
        values, their mean, and the mean wall-clock seconds per model call.
    """
    splitter = StratifiedKFold(n_splits=5)
    palette = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue'])
    grid_fpr = np.linspace(0, 1, 100)
    tpr_total = 0.0
    line_width = 2
    fold_scores = []
    durations = []

    fold_iter = zip(splitter.split(train, ytrain), palette)
    for fold_no, ((fit_idx, held_idx), colour) in enumerate(fold_iter):
        X_fit, X_held = train.iloc[fit_idx], train.iloc[held_idx]
        y_fit, y_held = ytrain.iloc[fit_idx], ytrain.iloc[held_idx]

        # Time only the model call itself.
        started = time.time()
        predictions = model(X_fit, X_held, y_fit)
        finished = time.time()
        durations.append(round(finished - started, 3))

        fold_scores.append(roc_auc_score(y_held.astype(float), predictions))

        fpr, tpr, _ = roc_curve(y_held, predictions)
        tpr_total += interp(grid_fpr, fpr, tpr)
        tpr_total[0] = 0.0
        fold_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=line_width, color=colour,
                 label='ROC fold %d (area = %0.2f)' % (fold_no, fold_auc))

    # Chance diagonal.
    plt.plot([0, 1], [0, 1], linestyle='--', lw=line_width, color='k', label='Luck')

    # Average the accumulated curves and pin the right end to 1.
    tpr_total /= splitter.get_n_splits(train, ytrain)
    tpr_total[-1] = 1.0
    average_auc = auc(grid_fpr, tpr_total)
    plt.plot(grid_fpr, tpr_total, color='g', linestyle='--',
             label='Mean ROC (area = %0.2f)' % average_auc, lw=line_width)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc='lower right')
    plt.show()

    print('mean scores: ', np.mean(fold_scores))
    print('mean model process time: ', np.mean(durations), 's')
    return fold_scores, np.mean(fold_scores), np.mean(durations)
def load_data(fold=1, n_workers=N_WORKERS, spec_dir="specs_train_v1", train_verified=True, train_unverified=True,
              normalize=False, fix_lengths=True, max_len=None, min_len=None, validate_verified=True,
              train_file="train.csv", load_test=True, train_on_all=False):
    """
    Load the train / validation / test data pools for one cross-validation fold.

    Files are permuted with a fixed seed, then split separately for verified
    and unverified examples with a 4-fold ``StratifiedKFold``; the ``fold``-th
    split (1-based) of each is combined into the train and validation indices.

    Args:
        fold: 1-based fold number. If it matches none of the 4 folds, the last
            fold is used (the search loop simply runs to completion).
        n_workers: forwarded to the data pools.
        spec_dir: spectrogram directory; "train" is replaced by "test" in it
            to locate the test data.
        train_verified: keep verified examples in the training set.
        train_unverified: keep unverified examples in the training set.
        normalize: subtract the mean / divide by the std computed on the
            training spectrograms (over axes 0 and 2).
        fix_lengths: repeat and clip cached spectrograms to ``max_len`` frames.
        max_len: target length; defaults to the longest training spectrogram.
        min_len: minimum length accepted for test spectrograms; defaults to
            ``N_FRAMES``.
        validate_verified: keep only verified examples for validation (using
            the flags read from "train.csv").
        train_file: annotation file used for training files and labels.
        load_test: also load (and process) the test pool.
        train_on_all: add the validation indices to the training indices.

    Returns:
        dict with keys ``'train'``, ``'valid'`` and ``'test'`` (``None`` when
        ``load_test`` is false).
    """
    if not min_len:
        min_len = N_FRAMES

    # get annotations
    files, labels, verified = get_files_and_labels(os.path.join(DATA_ROOT, train_file), spec_dir)
    _, _, verified_for_val = get_files_and_labels(os.path.join(DATA_ROOT, "train.csv"), spec_dir)

    # stratified split
    np.random.seed(4711)
    r_idx = np.random.permutation(len(files))
    files, labels, verified = files[r_idx], labels[r_idx], verified[r_idx]
    verified_for_val = verified_for_val[r_idx]

    verified_indices = np.nonzero(verified)[0]
    unverified_indices = np.nonzero(~verified)[0]

    from sklearn.model_selection import StratifiedKFold

    def _select_fold(subset_files, subset_labels):
        # Return the (train, test) indices of the requested fold.
        # No random_state: without shuffle the split is already deterministic
        # and recent scikit-learn raises ValueError for that combination.
        splitter = StratifiedKFold(n_splits=4)
        for i_fold, (train_part, test_part) in enumerate(splitter.split(subset_files, subset_labels)):
            if i_fold + 1 == fold:
                break
        return train_part, test_part

    train_index_ver, test_index_ver = _select_fold(files[verified], labels[verified])
    train_index_unver, test_index_unver = _select_fold(files[~verified], labels[~verified])

    # map the subset-relative fold indices back to positions in the full arrays
    train_index = np.concatenate((verified_indices[train_index_ver], unverified_indices[train_index_unver]))
    test_index = np.concatenate((verified_indices[test_index_ver], unverified_indices[test_index_unver]))

    if train_on_all:
        train_index = np.concatenate((train_index, test_index))

    # split into train and validation data
    tr_files, tr_labels, tr_verified = files[train_index], labels[train_index], verified[train_index]
    va_files, va_labels, va_verified = files[test_index], labels[test_index], verified_for_val[test_index]

    # select training examples by verification status
    # (builtin bool: the np.bool alias was removed from NumPy)
    train_idx = np.zeros_like(tr_verified, dtype=bool)
    if train_verified:
        train_idx = train_idx | tr_verified
    if train_unverified:
        train_idx = train_idx | ~tr_verified
    tr_files = tr_files[train_idx]
    tr_labels = tr_labels[train_idx]

    # keep only verified examples for validation
    if validate_verified:
        va_files = va_files[va_verified]
        va_labels = va_labels[va_verified]

    # create data pools
    pool = AugmentedAudioFileClassificationDataPool
    train_pool = pool(tr_files, tr_labels, None, n_workers=n_workers, shuffle=True, use_cache=True)
    valid_pool = pool(va_files, va_labels, None, n_workers=n_workers, shuffle=False, use_cache=True)
    if load_test:
        test_pool = load_data_test(spec_dir=spec_dir.replace("train", "test"))["test"]
    else:
        test_pool = None

    # fix spectrogram lengths
    print("Fixing spectrogram lengths ...")
    if max_len is None:
        max_len = np.max([s.shape[-1] for s in train_pool.cache.values()])

    def fix_pool(pool, test_mode):
        for k in pool.cache.keys():
            # copy spectrogram
            spec = pool.cache[k].copy()
            tmp = spec.copy()
            # repeat the spectrogram along the last axis until it is long enough
            while spec.shape[-1] < max_len:
                # in test mode anything of at least min_len frames is left as is
                if test_mode and spec.shape[-1] >= min_len:
                    break
                spec = np.concatenate((spec, tmp), axis=-1)
            # clip spectrogram if too long
            pool.cache[k] = spec[:, :, 0:max_len]
        return pool

    if fix_lengths:
        train_pool = fix_pool(train_pool, test_mode=False)
        valid_pool = fix_pool(valid_pool, test_mode=False)
        if load_test:
            test_pool = fix_pool(test_pool, test_mode=True)

    # normalize data
    if normalize:
        print("Normalizing data ...")
        # np.concatenate needs a real sequence, not a dict view (Python 3)
        specs = list(train_pool.cache.values())
        specs = np.concatenate(specs, axis=2).astype(np.float32)
        sub = specs.mean(axis=(0, 2), keepdims=True)[0]
        div = specs.std(axis=(0, 2), keepdims=True)[0]

        for key in train_pool.cache.keys():
            train_pool.cache[key] -= sub
            train_pool.cache[key] /= div
        for key in valid_pool.cache.keys():
            valid_pool.cache[key] -= sub
            valid_pool.cache[key] /= div
        if load_test:
            for key in test_pool.cache.keys():
                test_pool.cache[key] -= sub
                test_pool.cache[key] /= div

    print("Train %d" % train_pool.shape[0])
    print("Valid %d" % valid_pool.shape[0])
    if load_test:
        print("Test %d" % test_pool.shape[0])

    return {'train': train_pool, 'valid': valid_pool, 'test': test_pool}
def k_fold_cross_validation(k, hiddenLayers, numEpochs):
    """Evaluate a sigmoid network with shuffled stratified k-fold cross-validation.

    The merged data set is loaded, its last column is taken as the target, and
    for every fold 15% of the training part is split off (stratified) as a
    validation set used during training.

    Args:
        k: number of folds.
        hiddenLayers: forwarded to ``nn.NN_Sigmoid``.
        numEpochs: number of training epochs per fold.

    Returns:
        ``(losses, accuracies, f1s)``: three lists with one entry per fold.
    """
    # load dataset: every column but the last is a feature, the last is the target
    frame = ds.import_data_df([ds._FILE_PATHS['merged']])
    features = frame.iloc[:, :-1].values
    targets = frame.iloc[:, -1].values
    # wrap every target in its own row -> column-shaped target array
    targets = np.array([[value] for value in targets])

    # Stratified k-fold
    splitter = StratifiedKFold(n_splits=k, shuffle=True)
    splitter.get_n_splits(features, targets)

    # store results
    fold_losses, fold_accuracies, fold_f1s = [], [], []

    for fit_rows, test_rows in splitter.split(features, targets):
        testX = features[test_rows]
        testY = targets[test_rows]

        # separate the training part into training and validation sets
        trainX, valX, trainY, valY = train_test_split(
            features[fit_rows], targets[fit_rows],
            test_size=0.15, random_state=42, stratify=targets[fit_rows])

        n_features = trainX.shape[1]
        n_labels = trainY.shape[1]

        network = nn.NN_Sigmoid(hiddenLayers, n_features, n_labels,
                                learning_rate=0.05, cross_entropy_weight=4,
                                optimizer="Adam")
        network.train(numEpochs, trainX, trainY, valX=valX, valY=valY,
                      val_epochs=25, val_patience=5)

        # test, then close the tf session
        _, loss, acc, f1 = network.predict(testX, testY)
        network.close_session()

        fold_losses.append(loss)
        fold_accuracies.append(acc)
        fold_f1s.append(f1)

    return fold_losses, fold_accuracies, fold_f1s
def _performCV(X, y, sel_SAVs, n_estimators=1000, max_features='auto', n_splits=10, ROC_fig='ROC.png',
               feature_names=None, CVseed=666, stratification=None, **kwargs):
    """Cross-validate a balanced random forest and summarise its performance.

    Args:
        X: feature matrix, indexable by integer index arrays.
        y: label array aligned with ``X``; ``sum(y)/len(y)`` is reported as
            the fraction of positives.
        sel_SAVs: record array with a ``'SAV_coords'`` field; its first one or
            two whitespace-separated tokens identify the protein / residue.
        n_estimators: number of trees.
        max_features: forwarded to ``RandomForestClassifier``; the legacy
            value ``'auto'`` is translated to ``'sqrt'``.
        n_splits: number of folds.
        ROC_fig: file name for the mean ROC figure, or ``None`` to skip it.
        feature_names: names used when logging feature importances.
        CVseed: random seed of the fold splitter.
        stratification: ``None``, ``'protein'`` or ``'residue'``. When set,
            the folds are rebuilt so that all entries sharing an identifier
            land in the same test fold.
        **kwargs: forwarded to ``calcPathogenicityProbs``.

    Returns:
        dict summarising the cross-validation (mean/std of every metric, mean
        ROC curve, optimal cutoff, feature importances, pathogenicity
        probabilities, the training data set and the folds).
    """
    assert stratification in [None, 'protein', 'residue']

    # 'auto' was removed from RandomForestClassifier in recent scikit-learn;
    # for classifiers it was a synonym of 'sqrt', which every version accepts.
    if isinstance(max_features, str) and max_features == 'auto':
        max_features = 'sqrt'

    # set classifier
    classifier = RandomForestClassifier(
        n_estimators=n_estimators, max_features=max_features,
        oob_score=True, n_jobs=-1, class_weight='balanced')

    # define folds
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=CVseed)
    CV_folds = [[train, test] for train, test in cv.split(X, y)]

    # protein-stratification: a same protein should not be found in
    # both training and test sets
    if stratification is not None:
        if stratification == 'protein':
            # e.g. 'P01112'
            accs = np.array([s.split()[0] for s in sel_SAVs['SAV_coords']])
        else:
            # e.g. 'P01112 99'
            accs = np.array([' '.join(s.split()[:2])
                             for s in sel_SAVs['SAV_coords']])
        # for each fold, count occurrences of each protein/residue
        occurrences = {}
        for k, (train, test) in enumerate(CV_folds):
            counts = Counter(accs[test])
            for acc, count in counts.items():
                occurrences.setdefault(acc, np.zeros(n_splits, dtype=int))
                occurrences[acc][k] = count
        # for each acc. number, find fold with largest occurrences
        best_fold = {a: np.argmax(c) for a, c in occurrences.items()}
        new_folds = np.array([best_fold[a] for a in accs])
        # update folds
        for k in range(n_splits):
            CV_folds[k][0] = np.where(new_folds != k)[0]
            CV_folds[k][1] = np.where(new_folds == k)[0]

    # cross-validation loop
    CV_info = {k: [] for k in [
        'AUROC', 'AUPRC', 'OOB score', 'optimal cutoff', 'MCC',
        'precision (0)', 'recall (0)', 'F1 score (0)',
        'precision (1)', 'recall (1)', 'F1 score (1)',
        'precision', 'recall', 'F1 score',
        'feat. importances', 'predictions_0', 'predictions_1']}
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 20)
    i = 0
    for train, test in CV_folds:
        # create training and test datasets
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        # train Random Forest classifier
        classifier.fit(X_train, y_train)
        # calculate probabilities over decision trees
        y_pred = classifier.predict_proba(X_test)[:, 1]
        # compute ROC, AUROC, optimal cutoff (argmax of Youden's index), etc.
        sm = calcScoreMetrics(y_test, y_pred)
        for stat in ['AUROC', 'AUPRC', 'optimal cutoff']:
            CV_info[stat].append(sm[stat])
        # compute Matthews corr. coeff., precision/recall, etc. on classes
        y_pred_binary = np.where(y_pred > sm['optimal cutoff'], 1, 0)
        cm = calcClassMetrics(y_test, y_pred_binary)
        for stat, value in cm.items():
            CV_info[stat].append(value)
        # other info
        mean_tpr += np.interp(mean_fpr, sm['ROC']['FPR'], sm['ROC']['TPR'])
        CV_info['OOB score'].append(classifier.oob_score_)
        CV_info['feat. importances'].append(
            np.array(classifier.feature_importances_))
        CV_info['predictions_0'].extend(y_pred[y_test == 0])
        CV_info['predictions_1'].extend(y_pred[y_test == 1])
        # print log
        i += 1
        LOGGER.info('CV iteration #{:2d}: '.format(i) +
                    'AUROC = {:.3f} '.format(sm['AUROC']) +
                    'AUPRC = {:.3f} '.format(sm['AUPRC']) +
                    'OOB score = {:.3f}'.format(classifier.oob_score_))

    # compute average ROC curves
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[0] = 0.0
    mean_tpr[-1] = 1.0

    # compute average ROC, optimal cutoff and other stats
    # (mean and std across folds; raw predictions are not averaged)
    stats = {}
    for s in CV_info.keys():
        if s in ['predictions_0', 'predictions_1']:
            continue
        stats[s] = (np.mean(CV_info[s], axis=0), np.std(CV_info[s], axis=0))

    LOGGER.info('-'*60)
    LOGGER.info('Cross-validation summary:')
    LOGGER.info(f'training dataset size: {len(y):<d}')
    LOGGER.info(f'fraction of positives: {sum(y)/len(y):.3f}')
    for s in ['AUROC', 'AUPRC', 'OOB score', 'optimal cutoff']:
        if s == 'optimal cutoff':
            fields = ('optimal cutoff*:', stats[s][0], stats[s][1])
        else:
            fields = (f'mean {s}:', stats[s][0], stats[s][1])
        LOGGER.info('{:24} {:.3f} +/- {:.3f}'.format(*fields))
    LOGGER.info("(* argmax of Youden's index)")
    n_feats = len(stats['feat. importances'][0])
    if feature_names is None:
        feature_names = [f'feature {i}' for i in range(n_feats)]
    LOGGER.info('feature importances:')
    for i, feat_name in enumerate(feature_names):
        LOGGER.info('{:>23s}: {:.3f}'.format(
            feat_name, stats['feat. importances'][0][i]))
    LOGGER.info('-'*60)

    path_prob = calcPathogenicityProbs(CV_info, **kwargs)

    CV_summary = {
        'dataset size': len(y),
        'dataset bias': sum(y)/len(y),
        'mean ROC': list(zip(mean_fpr, mean_tpr)),
        'optimal cutoff': stats['optimal cutoff'],
        'feat. importances': stats['feat. importances'],
        'path. probability': path_prob,
        'training dataset': sel_SAVs,
        'folds': CV_folds
    }
    for s in ['AUROC', 'AUPRC', 'OOB score', 'MCC',
              'precision (0)', 'recall (0)', 'F1 score (0)',
              'precision (1)', 'recall (1)', 'F1 score (1)',
              'precision', 'recall', 'F1 score']:
        CV_summary['mean ' + s] = stats[s]

    # plot average ROC
    if ROC_fig is not None:
        print_ROC_figure(ROC_fig, mean_fpr, mean_tpr, stats['AUROC'])

    return CV_summary
else: Y_raw.append(0) X_raw.append(float(row[6])) print len(X_raw) print len(Y_raw) X = np.array(X_raw) X = np.reshape(X,(-2,1)) Y = np.array(Y_raw) print len(X) print len(Y) # print X skf = StratifiedKFold(n_splits=10,random_state=40) skf.get_n_splits(X,Y) # X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 42) index = 0 precision_score_list_LR = list() recall_score_list_LR = list() precision_score_list_SVC_poly = list() recall_score_list_SVC_poly = list() precision_score_list_RF = list() recall_score_list_RF = list() for train_index, test_index in skf.split(X,Y): print "########################" X_train, X_test = X[train_index], X[test_index] # X_train, X_test = X.iloc[train_index], X.iloc[test_index]
3, labels=['normal', 'prediabetes', 'diabetes']) df['DiabetesPedigreeFunction'] = df['DiabetesPedigreeFunction'].map({ "normal": 0, "prediabetes": 1, "diabetes": 2 }) y = df.Outcome x = df.drop('Outcome', axis=1) accuracy = [] skf = StratifiedKFold(n_splits=10, random_state=None) skf.get_n_splits(x, y) # x is the feature set and y is the target for train_index, test_index in skf.split(x, y): #print ("Train:", train_index, "validation:", test_index) X1_train, X1_test = x.iloc[train_index], x.iloc[test_index] y1_train, y1_test = y.iloc[train_index], y.iloc[test_index] ##standard scalar st_x = StandardScaler() X1_train = st_x.fit_transform(X1_train) X1_test = st_x.transform(X1_test) ##PCA pca = PCA() X1_train = pca.fit_transform(X1_train) X1_test = pca.transform(X1_test) explained_variance = pca.explained_variance_ratio_
def train_stage(df_path, lgb_path, xgb_path, cb_path):
    """Fit LightGBM, XGBoost and CatBoost with 5-fold stratified CV.

    Reads the training CSV at ``df_path``, fits the three learners on every
    fold through ``fit_lgb`` / ``fit_xgb`` / ``fit_cb`` (each given its own
    output path), accumulates their out-of-fold outputs and prints the
    resulting ROC AUC values, individually and for two averaged blends.

    Returns:
        Always ``0``.
    """
    print('Load Train Data.')
    df = pd.read_csv(df_path)
    print('\nShape of Train Data: {}'.format(df.shape))

    y_df = np.array(df['target'])
    df_ids = np.array(df.index)
    df.drop(['ID_code', 'target'], axis=1, inplace=True)

    # one out-of-fold buffer per learner
    n_rows = df.shape[0]
    lgb_cv_result = np.zeros(n_rows)
    xgb_cv_result = np.zeros(n_rows)
    cb_cv_result = np.zeros(n_rows)

    # (banner, fitting function, output path, name tag, out-of-fold buffer)
    learners = (
        ('LigthGBM', fit_lgb, lgb_path, 'lgb', lgb_cv_result),
        ('XGBoost', fit_xgb, xgb_path, 'xgb', xgb_cv_result),
        ('CatBoost', fit_cb, cb_path, 'cb', cb_cv_result),
    )

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    skf.get_n_splits(df_ids, y_df)

    print('\nModel Fitting...')
    for fold_no, (fit_rows, val_rows) in enumerate(skf.split(df_ids, y_df)):
        print('\nFold {}'.format(fold_no + 1))
        X_fit, y_fit = df.values[fit_rows], y_df[fit_rows]
        X_val, y_val = df.values[val_rows], y_df[val_rows]

        for banner, fit_fn, out_path, tag, buffer in learners:
            print(banner)
            buffer[val_rows] += fit_fn(X_fit, y_fit, X_val, y_val,
                                       fold_no, out_path, name=tag)

        # release the fold copies before the next iteration
        del X_fit, X_val, y_fit, y_val
        gc.collect()

    auc_lgb = round(roc_auc_score(y_df, lgb_cv_result), 4)
    auc_xgb = round(roc_auc_score(y_df, xgb_cv_result), 4)
    auc_cb = round(roc_auc_score(y_df, cb_cv_result), 4)
    blend_all = (lgb_cv_result + xgb_cv_result + cb_cv_result) / 3
    auc_mean = round(roc_auc_score(y_df, blend_all), 4)
    blend_lgb_cb = (lgb_cv_result + cb_cv_result) / 2
    auc_mean_lgb_cb = round(roc_auc_score(y_df, blend_lgb_cb), 4)

    print('\nLightGBM VAL AUC: {}'.format(auc_lgb))
    print('XGBoost VAL AUC: {}'.format(auc_xgb))
    print('Catboost VAL AUC: {}'.format(auc_cb))
    print('Mean Catboost+LightGBM VAL AUC: {}'.format(auc_mean_lgb_cb))
    print('Mean XGBoost+Catboost+LightGBM, VAL AUC: {}\n'.format(auc_mean))
    return 0
# Drop the 'length' column from both the training (df) and test (tdf) frames.
del df['length']
del tdf['length']
#del df['doc']
#del tdf['doc']
#Getting the Test and Train data
#Doing a 10 fold split using StratifiedKFold
# NOTE(review): `skf` is created outside this fragment; the "10 fold" claim
# above cannot be confirmed from here.
y_col = 'person'
# Pull the target column out as arrays, then remove it from the feature frames.
target = df[y_col].values
test_target = tdf[y_col].values
del df[y_col]
del tdf[y_col]
train_data = df.values
test_data = tdf.values
skf.get_n_splits(df, target)
#Decision Tree Classfier
decisiontree_classifier = tree.DecisionTreeClassifier()
# Per-fold macro-averaged metrics.
precision_list = []
recall_list = []
fscore_list = []
for train_index, test_index in skf.split(train_data, target):
    # Fit on the fold's training rows, predict its held-out rows.
    decisiontree_classifier.fit(train_data[train_index], target[train_index])
    y_pred = decisiontree_classifier.predict(train_data[test_index])
    precision, recall, fscore, support = precision_recall_fscore_support(
        target[test_index], y_pred, average='macro')
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fscore)
def _performCV(X, y, n_estimators=1000, max_features='auto', n_splits=10, ROC_fig='ROC.png',
               feature_names=None, **kwargs):
    """Cross-validate a balanced random forest and summarise its performance.

    Args:
        X: feature matrix, indexable by integer index arrays.
        y: label array aligned with ``X``; ``sum(y)/len(y)`` is reported as
            the fraction of positives.
        n_estimators: number of trees.
        max_features: forwarded to ``RandomForestClassifier``; the legacy
            value ``'auto'`` is translated to ``'sqrt'``.
        n_splits: number of stratified folds (shuffled, fixed seed 666).
        ROC_fig: file name for the mean ROC figure, or ``None`` to skip it.
        feature_names: names used when logging feature importances.
        **kwargs: forwarded to ``calcPathogenicityProbs``.

    Returns:
        dict with the data set size and bias, mean AUROC / AUPRC / OOB score,
        the mean ROC curve, the (mean, std) optimal cutoff, the mean feature
        importances and the pathogenicity probabilities.
    """
    # 'auto' was removed from RandomForestClassifier in recent scikit-learn;
    # for classifiers it was a synonym of 'sqrt', which every version accepts.
    if isinstance(max_features, str) and max_features == 'auto':
        max_features = 'sqrt'

    # set classifier
    classifier = RandomForestClassifier(n_estimators=n_estimators,
                                        max_features=max_features,
                                        oob_score=True,
                                        class_weight='balanced',
                                        n_jobs=-1)

    # set cross-validation procedure
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=666)

    # cross-validation loop
    CV_info = {
        'AUROC': [],
        'AUPRC': [],
        'feat_importance': [],
        'OOB_score': [],
        'Youden_cutoff': [],
        'predictions_0': [],
        'predictions_1': []
    }
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    for i, (train, test) in enumerate(cv.split(X, y), start=1):
        # create training and test datasets
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        # train Random Forest classifier
        classifier.fit(X_train, y_train)
        # calculate probabilities over decision trees
        y_pred = classifier.predict_proba(X_test)
        # compute ROC, AUROC, optimal cutoff (argmax of Youden's index), etc...
        d = calcMetrics(y_test, y_pred[:, 1])
        auroc = d['AUROC']
        auprc = d['AUPRC']
        J_opt = d['optimal cutoff']
        # store other info and metrics for each iteration
        mean_tpr += np.interp(mean_fpr, d['FPR'], d['TPR'])
        CV_info['AUROC'].append(auroc)
        CV_info['AUPRC'].append(auprc)
        CV_info['feat_importance'].append(classifier.feature_importances_)
        CV_info['OOB_score'].append(classifier.oob_score_)
        CV_info['Youden_cutoff'].append(J_opt)
        # class-1 probabilities, grouped by the true class of the test rows
        CV_info['predictions_0'].extend(y_pred[np.where(y_test == 0), 1][0])
        CV_info['predictions_1'].extend(y_pred[np.where(y_test == 1), 1][0])
        # print log
        LOGGER.info(f'CV iteration #{i:2d}: AUROC = {auroc:.3f} ' + \
                    f'AUPRC = {auprc:.3f} OOB score = {classifier.oob_score_:.3f}')

    # compute average ROC, optimal cutoff and other stats
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[0] = 0.0
    mean_tpr[-1] = 1.0
    mean_auroc = auc(mean_fpr, mean_tpr)
    mean_auprc = np.mean(CV_info['AUPRC'])
    mean_oob = np.mean(CV_info['OOB_score'])
    avg_J_opt = np.mean(CV_info['Youden_cutoff'])
    std_J_opt = np.std(CV_info['Youden_cutoff'])
    avg_feat_imp = np.mean(np.array(CV_info['feat_importance']), axis=0)

    LOGGER.info('-' * 60)
    LOGGER.info('Cross-validation summary:')
    LOGGER.info(f'training dataset size: {len(y):<d}')
    LOGGER.info(f'fraction of positives: {sum(y)/len(y):.3f}')
    LOGGER.info(f'mean AUROC: {mean_auroc:.3f}')
    LOGGER.info(f'mean AUPRC: {mean_auprc:.3f}')
    LOGGER.info(f'mean OOB score: {mean_oob:.3f}')
    LOGGER.info(
        f'optimal cutoff*: {avg_J_opt:.3f} +/- {std_J_opt:.3f}')
    LOGGER.info("(* argmax of Youden's index)")
    LOGGER.info('feature importances:')
    if feature_names is None:
        feature_names = [f'feature {i}' for i in range(len(avg_feat_imp))]
    for feat_name, importance in zip(feature_names, avg_feat_imp):
        LOGGER.info(f'{feat_name:>23s}: {importance:.3f}')
    LOGGER.info('-' * 60)

    path_prob = calcPathogenicityProbs(CV_info, **kwargs)

    CV_summary = {
        'dataset size': len(y),
        'dataset bias': sum(y) / len(y),
        'mean AUROC': mean_auroc,
        'mean AUPRC': mean_auprc,
        'mean OOB score': mean_oob,
        'mean ROC': list(zip(mean_fpr, mean_tpr)),
        'optimal cutoff': (avg_J_opt, std_J_opt),
        'feat. importance': avg_feat_imp,
        'path. probability': path_prob
    }

    # plot average ROC
    if ROC_fig is not None:
        print_ROC_figure(ROC_fig, mean_fpr, mean_tpr, mean_auroc)

    return CV_summary
def main():
    """Command-line entry point: cross-validate or train-and-predict a classifier.

    Options: ``--train-file``, ``--test-file``, ``--model-type`` (RC, KNN,
    GNB, LR, LDA, SVM, MLP; any other value selects the voting ensemble),
    ``--train-or-test`` and ``--result-file``.

    With ``--train-or-test train`` a shuffled 10-fold stratified
    cross-validation is run and the per-fold and average accuracies are
    printed. With any other value the model is fitted on the whole training
    set, the test file is predicted and the predictions are written with
    ``writeResultCsv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-file', type=str, default="data/train.csv", help="")
    parser.add_argument('--test-file', type=str, default="data/test.csv", help="")
    parser.add_argument('--model-type', type=str, default="SVM", help="")
    parser.add_argument('--train-or-test', type=str, default="train", help="")
    parser.add_argument('--result-file', type=str, default="result/result.csv", help="")
    args = parser.parse_args()
    train_file=args.train_file
    test_file=args.test_file
    model_type=args.model_type
    train_or_test=args.train_or_test
    result_file=args.result_file

    print("Loading training data...")
    X, Y = read_train(train_file)
    # Scale the training features (see the review note on the test features below).
    X = preprocessing.scale(X)
    #pca = PCA(n_components=2048)
    #pca.fit(X)
    #print(pca.explained_variance_ratio_)
    #X = pca.transform(X)
    print("Finish loading training data!")

    # model_type = {RC: RidgeClassifer, KNN:KNeighbors, GNB:Gaussian Naive-Bayes, LR:Logistic Regression, LDA:Linear Discriminant Analysis, SVM:Support Vector Machine, MLP:Multi-layer Perceptron, EL:Ensemble Learning}
    # The trailing numbers on the lines below look like the author's tuning
    # notes (hyper-parameter value : score) -- not verified.
    if model_type == 'RC':
        ALPHA=10 #50:67.32, 40:66.02, 30:64.71, 25:64.38, 20:64.43 10:66.35, 1:72.49
        print("alpha: {}".format(ALPHA))
        model = RidgeClassifier(alpha=ALPHA, normalize=True) #0.10,0,11:0.986923, 0.12,0.13,0.14:0.987179, 0.15:0.987051
    elif model_type == 'KNN':
        model = KNeighborsClassifier(n_neighbors=12, n_jobs=4) #0.975000,
    elif model_type == 'GNB':
        model = GaussianNB() #0.925897,
    elif model_type == 'LR':
        C=0.0005
        print("C: {}".format(C))
        model = LogisticRegression(C=C) #1.0:0.9923077
    elif model_type == 'LDA':
        model = LinearDiscriminantAnalysis() #QuadraticDiscriminantAnalysis() #0.978077
    elif model_type == 'SVM':
        #model = SVC(C=3.0, kernel='rbf', gamma='auto') #0.985890,0.987180,0.987436
        C=1e-4
        print("C: {}".format(C))
        model = LinearSVC(C=C) #0.001:0.988590,0.00075:0.988718
    elif model_type == 'MLP':
        model = MLPClassifier(random_state=1, max_iter=500, tol=1e-4, hidden_layer_sizes=(256,256), activation='relu', solver='adam', alpha=1e-4, batch_size=256, learning_rate_init=0.0005, learning_rate='adaptive') #0.986667,0.987051
    else: # EL:Ensemble Learning
        # Any unrecognised model type falls through to the hard-voting ensemble.
        model = VotingClassifier(
            estimators=[("LR", LogisticRegression(C=0.001)),
                        ("RC", RidgeClassifier(alpha=10, normalize=True)),
                        ("SVM", LinearSVC(C=1e-4))],
            voting="hard", n_jobs=-1)

    # The string literal below is disabled code (a single hold-out split) kept
    # by the author; it is never executed.
    ''' X1, Y1 = shuffle(X, Y, random_state=1) N=780 X_train, X_dev = X1[N:], X1[:N] Y_train, Y_dev = Y1[N:], Y1[:N] model.fit(X_train, Y_train) acc = model.score(X_dev,Y_dev) print("Accuracy: {}".format(acc)) #'''

    if train_or_test == 'train':
        k=10 #kFold
        skf=StratifiedKFold(n_splits=k, random_state=1, shuffle=True)
        #skf=KFold(n_splits=k)
        skf.get_n_splits(X,Y)
        print(skf)
        sum_acc = 0.0
        fold = 1
        for train_index, dev_index in skf.split(X,Y):
            #print("Train Index:", train_index, ",dev Index:", dev_index)
            X_train, X_dev = X[train_index], X[dev_index]
            Y_train, Y_dev = Y[train_index], Y[dev_index]
            # The same model object is refitted on every fold.
            model.fit(X_train, Y_train)
            acc = model.score(X_dev,Y_dev)
            sum_acc += acc
            print("Fold {} accuracy: {}".format(fold, acc))
            fold += 1
        average_acc = sum_acc/k
        print("Average accuracy: {}".format(average_acc))
    else:
        print("Training...")
        model.fit(X, Y)
        print("Finish training. Now testing...")
        ids, X_test = read_test(test_file)
        # NOTE(review): the test features are scaled by a separate call, i.e.
        # independently of the training features -- confirm this is intended.
        X_test = preprocessing.scale(X_test)
        #X_test = pca.transform(X_test)
        Y_pred = model.predict(X_test)
        writeResultCsv(ids,Y_pred,result_file)
        print("Finish testing!")
def gbdt_cv_modeling():
    """Cross-validate a gradient-boosting classifier and plot its ROC curves.

    Reads the training and test CSV files, shuffles the training rows with a
    fixed seed, fills missing values with 10, runs ``lgb_feature_selection``
    and then fits a ``GradientBoostingClassifier`` on each of five stratified
    folds.  For every fold the ROC curve on the held-out part is plotted and
    the positive-class probability of the test set is recorded; the mean ROC
    curve over the folds is plotted at the end and its AUC printed.

    :return: None
    """
    # Data input
    data_b_train = pd.read_csv('../data/B_train_final.csv', index_col='no')
    data_test = pd.read_csv('../data/B_test_final.csv', index_col='no')
    data_train = data_b_train

    # Shuffle the training rows with a fixed random seed.
    s = 0
    np.random.seed(s)
    sampler = np.random.permutation(len(data_train.values))
    data_train_randomized = data_train.take(sampler)
    # NOTE(review): this list includes every column of data_train, while
    # x_temp below drops the last one -- confirm lgb_feature_selection
    # expects that.
    feature_name = list(data_train.columns.values)

    # Fill missing values.
    data_train_filled = data_train_randomized.fillna(value=10)

    # Build the feature matrix (all but the last column) and the target
    # (last column).  ``.values`` replaces ``as_matrix()``, which was removed
    # from pandas.
    x_temp = data_train_filled.iloc[:, :-1].values
    y = data_train_filled.iloc[:, -1].values

    # Feature selection
    X, dropped_feature_name, len_feature_choose = lgb_feature_selection(
        feature_name, x_temp, y, '0.1*mean')

    # Prepare the validation set B_test the same way.
    data_test_filled = data_test.fillna(value=10)
    data_test_filled_after_feature_selection = data_test_feature_drop(
        data_test_filled, dropped_feature_name)

    # Stratified cross-validation splitter.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Choose a classification model.
    parameter_n_estimators = 400
    classifier = GradientBoostingClassifier(n_estimators=parameter_n_estimators)

    # Model fit, predict and ROC.
    colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
    lw = 2
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 500)
    probability_set_of_b_test = []
    folds = zip(cv.split(X, y), colors)
    for i_of_roc, ((train_indice, test_indice), color) in enumerate(folds):
        a_model = classifier.fit(X[train_indice], y[train_indice])
        probas_ = a_model.predict_proba(X[test_indice])
        # Predict B_test with this fold's model.
        prob_of_b_test = a_model.predict_proba(
            data_test_filled_after_feature_selection)
        probability_set_of_b_test.append(prob_of_b_test[:, 1])
        fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])
        # Accumulate the fold's TPR on the common FPR grid.
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color,
                 label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    # Single-argument call form prints the same on Python 2 and Python 3.
    print('mean_auc=' + str(mean_auc))
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
             label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC_rd_' + str(s) + '_gbdt_' + str(len_feature_choose) + '_features')
    plt.legend(loc="lower right")
    plt.show()

    # Average the per-fold B_test probabilities; works for any fold count.
    avg_prob = np.mean(probability_set_of_b_test, axis=0)
    # NOTE(review): nothing in the visible code writes avg_prob to this path.
    result_file_name = '../result/B_test_gbdt_predict_cv_fillna_10_rd_' + str(s) + '_N_' + str(parameter_n_estimators) + '_features_' + \
                       str(len_feature_choose) + '.csv'
#with open ('index_label_tuples.json') as fh: # index_label_tuples = json.load (fh)' train_size = int(1* len(index_label_tuples)) y = np.array([index_label_tuples[i][1] for i in range(len(index_label_tuples))]) x = np.array([index_label_tuples[i][0] for i in range(train_size)]) print('before padding', x.shape) print('Pad sequences (samples x time)') x = sequence.pad_sequences(x, maxlen=maxlen) print('x shape:', x.shape) print('Build model...') #k-flod cross validation on model kfold = StratifiedKFold(n_splits=2, shuffle=True) kfold.get_n_splits(x,y) cvscores= [] print(kfold) D1 = pd.DataFrame([]) D2 = pd.DataFrame([]) for train, test in kfold.split(x,y): y_test = y[test]; model = Sequential() model.add(Embedding(max_features, 128, dropout=0.2)) model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) model.add(Dense(3, activation='softmax')) # try using different optimizers and different optimizer configs #D2 = D2.append(pd.DataFrame(y[test])) model.compile(loss='categorical_crossentropy', optimizer='rmsprop',#ROOTmns
def split_files_cv(self, fasta, top_peaks_pwm, num_splits, name, bed=False):
    """Write per-fold training/testing ``.bed`` and ``.fa`` files.

    The sequences in ``fasta`` are loaded into a data frame.  When ``name``
    contains ``'foreground'`` they are joined with the coordinates in ``bed``
    and the number of rows is stored in ``self.n_samples``.  When ``name``
    contains ``'cg'`` the coordinates are parsed from the FASTA ids, only
    sequences whose rounded GC content equals ``round(self.gc_total)`` are
    kept and the set is truncated to ``self.n_samples`` rows.  The rows from
    position ``top_peaks_pwm - 1`` onward are then split into ``num_splits``
    shuffled folds, and for each fold ``training_<name><i>`` and
    ``testing_<name><i>`` files are written in the current directory.

    Raises:
        ValueError: if a background (``'cg'``) set is requested before the
            foreground set defined ``self.n_samples``, or if a FASTA id does
            not have the expected ``<n>::<chrom>:<start>-<end>(<strand>)``
            form.
    """

    def correction(line):
        """Turn a FASTA id into a ('chr<chrom>', start, end) tuple."""
        try:
            regular = re.match(
                r'([0-9]+)::(.+):([0-9]+)-([0-9]+)\([+,-]\)', line)
        except TypeError:
            # Non-string ids (e.g. NaN) cannot match either.
            regular = None
        if regular is None:
            # Fail here with the offending id instead of returning None and
            # crashing later when the tuples are unzipped.
            raise ValueError(
                'this line dont match the regular expresion for bed cg%: '
                '{}'.format(line))
        return 'chr{}'.format(
            regular.group(2)), regular.group(3), regular.group(4)

    def create_files(filetype, n_CV, dataframe):
        """Write ``dataframe`` as a .bed and a .fa file for fold ``n_CV``."""
        prefix = filetype + name + str(n_CV)
        with open(prefix + '.bed', 'w') as finalbed, \
                open(prefix + '.fa', 'w') as finalfa:
            for index, row in dataframe.iterrows():
                finalbed.write('{}\t{}\t{}\t{}\n'.format(
                    row.chr, row.start, row.end, row.id))
                finalfa.write('>{}\n{}\n'.format(row.id, row.sequence))

    # Splitting on '>' puts header lines in the 'id' column and sequence
    # lines in the 'sequence' column; the two are re-aligned by position.
    with open(fasta, 'r') as totalfastafile:
        df = pandas.read_csv(totalfastafile, sep='>',
                             names=['sequence', 'id'])
    dfid = df['id'].dropna().reset_index(drop=True)
    dfsequence = df['sequence'].dropna().reset_index(drop=True)
    dfasta = pandas.merge(dfid, dfsequence,
                          left_index=True, right_index=True)

    if 'foreground' in name:
        with open(bed, 'r') as totalbedfile:
            dfbed = pandas.read_csv(totalbedfile, sep='\t',
                                    names=['chr', 'start', 'end', 'id'])
        df = pandas.merge(dfbed, dfasta, on="id")
        del dfid, dfsequence, dfasta, dfbed
        self.n_samples, _ = df.shape
    elif 'cg' in name:
        if getattr(self, 'n_samples', None) is None:
            raise ValueError(
                'total number of peaks in foreground is not defined, '
                'run foreground before background')
        df = dfasta
        # Create the bed columns from the ids of the random sequences.
        df['chr'], df['start'], df['end'] = zip(*df['id'].map(correction))
        true_mean_gc = round(self.gc_total)
        print(true_mean_gc, 'rounded total mean')
        # Keep only sequences with the same rounded GC content as the peaks.
        df = df.loc[df['sequence'].apply(lambda x: round(GC(x)))
                    == true_mean_gc].reset_index(drop=True)
        print(df.shape, 'hola')
        print(df.head())
        # Keep only as many sequences as there are foreground peaks.
        df = df.loc[:self.n_samples - 1]
        print(df.shape, 'qtal')
        print(df.head())

    # Every row gets the same class label; a scalar assignment cannot fail on
    # a length mismatch when fewer than n_samples sequences survive.
    df = df.assign(**{'class': 1})
    # Delete the peaks used to create the PWM.
    # NOTE(review): this keeps the row at position top_peaks_pwm - 1 --
    # confirm that is intended.
    df = df.loc[int(top_peaks_pwm - 1):].reset_index(drop=True)
    print(df.shape)

    skf = StratifiedKFold(n_splits=num_splits, shuffle=True)
    folds = skf.split(df, df['class'])
    for count, (train_index, test_index) in enumerate(folds):
        # The splitter yields positional indices.
        create_files('training_', count, df.iloc[train_index])
        create_files('testing_', count, df.iloc[test_index])
def authentication(data, data_flip, labels, thread_cnt, data_filename):
    """Run the one-SVM-per-class authentication experiment and print metrics.

    The data set is split into two stratified folds.  For each fold the
    training part is augmented with the matching rows of ``data_flip``,
    z-scored with the training statistics, and one binary SVM per class is
    trained in its own thread via ``authentication_train``.  Each SVM is then
    evaluated on the test part via ``authentication_test``.  The scores
    pooled over folds and classes are used to print TP/FP/FN/TN, accuracy,
    FAR, FRR, AUC and EER.  Label 0 is treated as the positive class.

    Args:
        data: 2-D array of samples (rows).
        data_flip: array with the same row order as ``data``, used as extra
            training data.
        labels: class label of each row of ``data``.
        thread_cnt: passed to ``split_list`` to group the classes.
        data_filename: printed as a header of the results.
    """
    print("Authentication")

    # Stratified 2-fold split.  No random_state: with shuffle=False the split
    # is deterministic and current scikit-learn rejects the combination.
    cv = StratifiedKFold(n_splits=2, shuffle=False)

    # Perform k-fold cross validation; results are collected per class and
    # concatenated once at the end.
    prob_parts = [np.array([])]
    pred_parts = [np.array([])]
    true_parts = [np.array([])]
    for k, (train_index, test_index) in enumerate(cv.split(data, labels)):
        print(" Fold - " + str(k))

        # Get training and testing sets
        train = np.vstack([data[train_index, :], data_flip[train_index, :]])
        train_labels = np.append(labels[train_index], labels[train_index])
        test = data[test_index, :]
        test_labels = labels[test_index]

        # Normalize to z-scores using training statistics only.
        mu = np.mean(train, axis=0)
        std = np.std(train, axis=0)
        # Constant features would otherwise divide by zero and produce NaNs.
        std = np.where(std == 0, 1.0, std)
        train = (train - mu) / std
        test = (test - mu) / std

        # Get training classes
        classes = np.unique(train_labels)
        classes_split = list(split_list(classes.tolist(), thread_cnt))

        # TRAINING: one thread trains the binary SVM of one class.
        threads = []
        que = Queue()
        for li in classes_split:
            for c in li:
                threads.append(Thread(target=authentication_train,
                                      args=(c, train, train_labels, que)))
                threads[-1].start()
        for t in threads:
            t.join()

        # Collect training results keyed by class for O(1) lookup.
        svm_by_class = {}
        while not que.empty():
            (c_idx, svm) = que.get()
            svm_by_class[c_idx] = svm

        # TESTING
        threads = []
        que = Queue()
        for li in classes_split:
            for c in li:
                threads.append(Thread(target=authentication_test,
                                      args=(c, svm_by_class[c], test,
                                            test_labels, que)))
                threads[-1].start()
        for t in threads:
            t.join()

        # Collect testing results: (scores, true labels, class).
        while not que.empty():
            result = que.get()
            c_prob = result[0]
            c_true = result[1]
            # Scores below 0.5 are predicted as label 1, the rest as label 0.
            c_pred = np.zeros(c_prob.shape[0])
            c_pred[c_prob < 0.5] = 1
            prob_parts.append(c_prob)
            true_parts.append(c_true)
            pred_parts.append(c_pred)
        print()

    y_prob = np.concatenate(prob_parts)
    y_true = np.concatenate(true_parts)
    y_pred = np.concatenate(pred_parts)

    # OVERALL RESULTS
    # With labels=[0, 1] the flattened matrix is ordered (true0/pred0,
    # true0/pred1, true1/pred0, true1/pred1); label 0 is the positive class.
    TP, FN, FP, TN = metrics.confusion_matrix(
        y_true, y_pred, labels=[0, 1]).ravel()
    ACC = (TP + TN) / (TP + TN + FP + FN)
    FAR = FP / (FP + TN)
    FRR = FN / (FN + TP)
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_prob, pos_label=0)
    # EER is the point of the ROC curve where FPR equals 1 - TPR.
    EER = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
    EER_thresh = interp1d(fpr, thresholds)(EER)
    # roc_auc_score treats label 1 as positive, so flip the scores.
    y_prob = np.ones(y_prob.shape) - y_prob
    AUC = metrics.roc_auc_score(y_true, y_prob)

    # Print results
    print(data_filename)
    print("--------------------------------------------------------------------------------------")
    print("Authentication Results:")
    print("TP: " + str(TP) + "\n" +
          "FP: " + str(FP) + "\n" +
          "FN: " + str(FN) + "\n" +
          "TN: " + str(TN) + "\n" +
          "ACC: " + str(ACC) + "\n" +
          "FAR: " + str(FAR) + "\n" +
          "FRR: " + str(FRR) + "\n" +
          "AUC: " + str(AUC) + "\n" +
          "EER: " + str(EER) + "\n" +
          "EER_thresh " + str(EER_thresh))
    print()
}} all_data , y_train = encode_dataset(train=train,test=test,meta=meta,target_model='lightgbm') print("*****************************") print(all_data.head()) train_obs = len(y_train) train = all_data[:train_obs] test = all_data[train_obs:] train_ids = train.index test_ids = test.index skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) skf.get_n_splits(train_ids, y_train) lgb_test_result = np.zeros(test_ids.shape[0]) #lgb_train_result = np.zeros(train_ids.shape[0]) #xgb_test_result = np.zeros(test_ids.shape[0]) #xgb_train_result = np.zeros(train_ids.shape[0]) counter = 0 #Transform data using small groups to reduce memory usage m = 100000 print('\nLightGBM\n') for train_index, test_index in skf.split(train_ids, y_train): print('Fold {}\n'.format(counter + 1)) print("**************************") print("train_index:",train_index) print("**************************")
for p in range(samples): data[cont, :, :, :] = getPatch(img, patch_dim, rows[p], cols[p]) labels[cont] = i - 1 cont += 1 data /= 255 crossval_splits = 5 accuracy = numpy.zeros(crossval_splits) sensitivity = numpy.zeros(crossval_splits) specificity = numpy.zeros(crossval_splits) cont = 0 skf = StratifiedKFold(n_splits=crossval_splits, shuffle=True, random_state=123) skf.get_n_splits(data, labels) for train_index, test_index in skf.split(data, labels): train_data, test_data = data[train_index], data[test_index] train_labels, test_labels = labels[train_index], labels[test_index] train_labels = keras.utils.to_categorical(train_labels, num_classes=no_classes) #Convolutional Neural Network # create model kernel1 = 3 kernel2 = 5 no_filters1 = 20 no_filters2 = 40 model = keras.models.Sequential() #First Convolutional Layer
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Toy data set: four samples, two in each of the classes 1 and 2.
X = list("abcd")
y = [1] * 2 + [2] * 2

# Ask a two-fold stratified splitter how many splits it will produce.
skf = StratifiedKFold(n_splits=2)
splits = skf.get_n_splits(X, y)
print(splits)
def validation(self, X, Y, cat_features, method=1, verbose=False, n_folds=5, short=True):
    """Score the model with stratified k-fold cross-validation.

    For every fold ``self.meta_predict`` is called with the training part and
    the validation features, and ``self.evaluate`` with the validation
    targets.  The mean of the fold scores is stored in
    ``self.validation_score`` and returned.

    Args:
        X: pandas.DataFrame of features.
        Y: pandas.Series of targets, aligned with ``X`` by position.
        cat_features: forwarded to ``self.meta_predict``.
        method: only used in the progress messages.
        verbose: print progress messages when true.
        n_folds: number of folds; values below 2 are raised to 2.
        short: forwarded to ``self.meta_predict``.

    Returns:
        The average of the per-fold scores.
    """
    if verbose:
        print("{} [{}.validation] start validation method {}".format(
            ctime(), self.name, method))
    validation_score = 0
    if n_folds < 2:
        n_folds = 2
    from sklearn.model_selection import StratifiedKFold
    splitclass = StratifiedKFold(n_splits=n_folds)
    for train_index, test_index in splitclass.split(X, Y):
        # split() yields positional indices, so select with .iloc; .loc would
        # pick wrong rows (or raise) when X/Y do not carry a 0..n-1 index.
        train_X = X.iloc[train_index].reset_index(drop=True)
        train_Y = Y.iloc[train_index].reset_index(drop=True)
        validation_X = X.iloc[test_index].reset_index(drop=True)
        validation_Y = Y.iloc[test_index].reset_index(drop=True)
        assert train_X.shape[0] == train_Y.shape[0]
        assert validation_X.shape[0] == validation_Y.shape[0]
        self.meta_predict(train_X, train_Y, validation_X, cat_features,
                          short=short)
        score = self.evaluate(validation_Y)
        if verbose:
            print("{} [{}.validation] single score = {} ".format(
                ctime(), self.name, score))
        validation_score += score
    # the total validation score is an average of the single validation scores
    validation_score /= splitclass.get_n_splits(X)
    self.validation_score = validation_score
    if verbose:
        print("{} [{}.validation] validation score = {} ".format(
            ctime(), self.name, validation_score))
    if verbose:
        print("{} [{}.validation] finished validation method {}".format(
            ctime(), self.name, method))
    return validation_score
callbacks=[early_stopping]) Y_score = bilstm_model.predict(X[test]) histories.append(history) # Compute ROC curve and area the curve fpr, tpr, thresholds = roc_curve(encoded_Y[test], Y_score) mean_tpr += interp(mean_fpr, fpr, tpr) mean_tpr[0] = 0.0 roc_auc = auc(fpr, tpr) pyp.plot(fpr, tpr, lw=lw, color=color, label='ROC fold %d (area = %0.2f)' % (i, roc_auc)) pyp.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck') mean_tpr /= kfold.get_n_splits(X, encoded_Y) mean_tpr[-1] = 1.0 mean_auc = auc(mean_fpr, mean_tpr) print 'ROC AUC: %.2f' % mean_auc #pyp.plot(mean_fpr, mean_tpr, color='g', linestyle='--',label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw) #pyp.xlim([0, 1.0]) #pyp.ylim([0, 1.0]) #pyp.xlabel('False Positive Rate') #pyp.ylabel('True Positive Rate') #pyp.title('Receiver operating characteristic example') #pyp.legend(loc="lower right") #pyp.show() #over all events momentum_input['signal'] = np.zeros((len(full_event['signal']), n_cand_per_jet,
def xgb_lgb_cv_modeling():
    """Cross-validate a LightGBM classifier, then predict the unlabeled set.

    Missing values of the training and prediction sets are filled with the
    column medians of both sets combined, features are selected with
    ``xgb_feature_selection`` and an ``LGBMClassifier`` is evaluated with
    stratified 5-fold cross-validation (ROC curve per fold, mean ROC/AUC and
    mean F1 at threshold 0.5).  The ROC figure is saved, the classifier is
    re-fitted on all training rows, and both the positive-class
    probabilities and the thresholded labels for the prediction set are
    written with ``write_predict_results_to_csv``.

    :return: None
    """
    # Data input
    data_train = pd.read_csv('../data/train.csv', index_col='ID')
    data_predict = pd.read_csv('../data/pred.csv', index_col='ID')

    # Training-set feature engineering (data-set specific).
    data_train_without_label = data_train.drop('Label', axis=1)
    feature_name = list(data_train_without_label.columns.values)
    data_predict_user_id = list(data_predict.index.values)

    # Fill missing values with medians computed over train + prediction rows.
    frames = [data_train_without_label, data_predict]
    data_all = pd.concat(frames)
    data_train_filled = data_train_without_label.fillna(value=data_all.median())

    # Construct the feature matrix and target.  ``.values`` replaces
    # ``as_matrix()``, which was removed from pandas.
    x_temp = data_train_filled.values
    # NOTE(review): assumes 'Label' is the last column of train.csv.
    y = data_train.iloc[:, -1].values

    # Feature selection
    X, dropped_feature_name, len_feature_choose = xgb_feature_selection(
        feature_name, x_temp, y, '0.1*mean')

    # Online test data set: same filling and feature drop as the training set.
    data_predict_filled = data_predict.fillna(value=data_all.median())
    data_predict_filled_after_feature_selection = data_test_feature_drop(
        data_predict_filled, dropped_feature_name)

    # Stratified cross-validation splitter.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    n_folds = cv.get_n_splits(X, y)

    # Choose a classification model.
    parameter_n_estimators = 100
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators,
                                learning_rate=0.1)

    # Hyperparameter tuning notes from the original author:
    # (1) num_leaves: LightGBM grows trees leaf-wise, so tree complexity is
    #     tuned with num_leaves rather than max_depth; roughly
    #     num_leaves = 2 ^ max_depth.
    # (2) Imbalanced class distribution: set is_unbalance='true'.
    # (3) Bagging: bagging_fraction and bagging_freq (must be set together),
    #     and feature_fraction.
    # (4) min_data_in_leaf, min_sum_hessian_in_leaf.

    # Model fit, predict and ROC.
    colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
    lw = 2
    f1_total = 0.0
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 500)
    th = 0.5
    folds = zip(cv.split(X, y), colors)
    for i_of_roc, ((train_indice, test_indice), color) in enumerate(folds):
        a_model = classifier.fit(X[train_indice], y[train_indice])
        probas_ = a_model.predict_proba(X[test_indice])
        fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])
        # Accumulate the fold's TPR on the common FPR grid.
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color,
                 label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))
        # Threshold into a new array; the probabilities are left untouched.
        lt = (probas_[:, 1] > th).astype('int32')
        f1_total += f1_score(y[test_indice], lt)

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
    mean_tpr /= n_folds
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    mean_f1 = f1_total / n_folds
    print('mean_auc=' + str(mean_auc))
    print('mean_f1=' + str(mean_f1))
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
             label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    # The axis label now shows the mean F1, not the sum over folds.
    plt.xlabel('False Positive Rate mean_f1:' + str(mean_f1))
    plt.ylabel('True Positive Rate')
    plt.title('ROC_gbdt_' + str(len_feature_choose) + '_features_f1_' + str(mean_f1))
    plt.legend(loc="lower right")
    plt.savefig('../result/pred_ROC_XL' + '_N_' + str(parameter_n_estimators) + '_features_' +
                str(len_feature_choose) + '_proba_to_label_using_th_' + str(th) + '.png')

    # Re-fit on all training rows and predict the unlabeled set.
    a_model = classifier.fit(X, y)
    proba_predict = a_model.predict_proba(
        data_predict_filled_after_feature_selection)
    positive_proba = proba_predict[:, 1]

    # Probability result
    result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '_proba.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id,
                                 positive_proba.tolist())

    # Label result: probabilities above the threshold become 1, others 0.
    lt = (positive_proba > th).astype('int32')
    result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + \
                       '_proba_to_label_using_th_' + str(th) + '.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id,
                                 lt.tolist())
np.sum(np.abs(prediction_result2 - data_labels))) # Cross validate - kNN - All data knn_all = KNeighborsClassifier(n_neighbors=knn_k) knn_scores_all = cross_val_score(knn_all, sat_data, data_labels, cv=crossval_kfold) # Add to output dict print('Accuracy, mean of ' + str(crossval_split_k) + '-fold split= ', np.mean(knn_scores_all)) knn_mean_acc[dataset_use] = np.mean(knn_scores_all) # Get split for cofusion matrix calculation skf = StratifiedKFold(n_splits=crossval_split_k) skf.get_n_splits(sat_data, labels) # Initialize output confusion matrix and kappa knn_all_confmat = np.zeros((n_classes, n_classes)) knn_all_kappa = [] # Use split for train_index, test_index in skf.split(sat_data, labels): # Split into training and test set y_train, y_test = labels[train_index], labels[test_index] X_train, X_test = sat_data[train_index], sat_data[test_index] # Fit classifier knn_all.fit(X_train, y_train) # Do prediction y_pred = knn_all.predict(X_test) # Calculate confusion matrix conf_mat_temp = confusion_matrix(y_test, y_pred) # Add contribution to overall confusion matrix
def search_models(training_i, x, train_columns):
    """Random-search four baseline model families and keep the best of each.

    For every baseline ("Logistic Regression", "KNN", "LGBM", "SVM") the
    function draws 30 hyperparameter sets through ``choose_baseline`` and
    evaluates each with stratified 3-fold cross-validation on
    ``training_i[train_columns]`` against the target column ``x``.  The two
    sets with the highest mean validation AUROC are kept.

    Returns:
        dict mapping the baseline name to
        ``[fitted fold models, mean AUROC, mean AUPRC, validation index
        arrays, training index arrays]`` for those top hyperparameter sets.
    """
    baselines = ["Logistic Regression", "KNN", "LGBM", "SVM"]
    baselines_need_transforms = ["Logistic Regression"]
    baselines_need_transforms2 = ["KNN", "SVM"]
    X = training_i[train_columns]
    Y = training_i[x]
    n_iterations = 30  # number of iterations for random search
    top_n = 2  # select top n parameter sets
    all_ = {}

    # Stratified cross-validation; the fixed seed gives the same folds for
    # every hyperparameter set.
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    print("Stratified K Fold into 3 splits ...")

    for base in baselines:
        print(f"Search in progress : {base}")
        roc_auc_mean, auprc_mean, dict_list, model_list, train_list, val_list = (
            [] for i in range(6))
        models_1, vals_1, trains_1 = ([] for i in range(3))

        for i in range(0, n_iterations):
            if (i + 1) % 10 == 0:
                print(f"Random search {i+1}...")
            param_dictionary, model, X_ = choose_baseline(base, X)
            roc_in_k, pr_in_k, clf_in_k, train_in_k, val_in_k = (
                [] for i in range(5))

            for train_index, val_index in skf.split(X, Y):
                X_train = X_.iloc[train_index]
                y_train = Y.iloc[train_index]
                X_val = X_.iloc[val_index]
                y_val = Y.iloc[val_index]

                # Decide on scaling by the baseline *name*.  `model` is the
                # callable estimator, so comparing it with these name lists
                # could never match and the scalers were silently skipped.
                if base in baselines_need_transforms:
                    X_val, X_train = apply_transforms_MinMax_Scaler(
                        X_val, X_train, train_columns)
                if base in baselines_need_transforms2:
                    X_val, X_train = apply_transforms_STD_Scaler(
                        X_val, X_train, train_columns)

                clf = model(**param_dictionary)
                clf = clf.fit(X_train, y_train)
                # predicting
                y_pred = clf.predict_proba(X_val)[:, 1]
                # performance on this fold
                roc_in_k.append(roc_auc_score(y_val, y_pred))
                pr_in_k.append(average_precision_score(y_val, y_pred))
                clf_in_k.append(clf)
                train_in_k.append(train_index)
                val_in_k.append(val_index)

            # Record the fold averages for this hyperparameter set.
            roc_auc_mean.append(np.asarray(roc_in_k).mean())
            auprc_mean.append(np.asarray(pr_in_k).mean())
            dict_list.append(param_dictionary)
            val_list.append(val_in_k)
            train_list.append(train_in_k)
            model_list.append(clf_in_k)
            gc.collect()

        # Storing results for this model
        print(f"Storing results of top models for {base}")
        results_pd = pd.DataFrame({
            "avg_roc_auc": roc_auc_mean,
            "avg_auprc": auprc_mean,
            "clf_s": model_list,
            "validation_sets": val_list,
            "train_sets": train_list
        })
        results_pd.sort_values("avg_roc_auc", ascending=False, axis=0,
                               inplace=True)
        top_pd = results_pd.head(top_n)
        models_1.append(top_pd['clf_s'].values[0:3][:6])
        vals_1.append(top_pd['validation_sets'].values[0:3][:6])
        trains_1.append(top_pd['train_sets'].values[0:3][:6])
        param_df = pd.DataFrame()
        param_df["models"] = models_1
        param_df["vals"] = vals_1
        param_df["trains"] = trains_1
        param_df["auc_val"] = top_pd.avg_roc_auc.mean()
        param_df["auprc_val"] = top_pd.avg_auprc.mean()

        # Flatten the per-set lists of folds into single lists.
        val_sets, train_sets, models_ = ([] for i in range(3))
        for i in range(len(param_df.vals[0])):
            for j in (param_df.vals[0][i]):
                val_sets.append(j)
        for i in range(len(param_df.trains[0])):
            for j in (param_df.trains[0][i]):
                train_sets.append(j)
        for i in range(len(param_df.models[0])):
            for j in (param_df.models[0][i]):
                models_.append(j)

        # storing top models and performance for this type of model
        all_[base] = [
            models_, param_df["auc_val"].values,
            param_df["auprc_val"].values, val_sets, train_sets
        ]
    return (all_)
def predict():
    """Train the text classifiers and classify the submitted article.

    Loads ``data.csv``, cleans the ``Body`` text, fits a TF-IDF vectorizer
    and three classifiers (naive Bayes, logistic regression, random forest)
    on a random 67/33 split and prints their accuracies.  The logistic
    regression model is pickled to ``log_model`` and loaded back; for a POST
    request the article in the form is vectorized, classified with that
    model and ``result.html`` is rendered with the prediction.

    NOTE(review): all of the training runs on every call; consider loading a
    pre-trained model and vectorizer instead.  Requests other than POST fall
    through without a response, as before.
    """
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier

    # Import the data set as a corpus.
    data = pd.read_csv('data.csv')
    # Replace missing values; fillna returns a new frame, so keep the result.
    data = data.fillna('Article unavailable')

    def clean_text_round1(text):
        """Lower-case and drop bracketed text, punctuation and digit words."""
        text = str(text).lower()  # making all text lowercase
        text = re.sub(r'\[.*?\]', '', text)  # removing text in square brackets
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub(r'\w*\d\w*', '', text)  # removing words containing digits
        return text

    def clean_text_round2(text):
        """Second pass: drop leftover quote/period characters and newlines."""
        # NOTE(review): this pattern is two adjacent literals forming the
        # class ["".] -- it may originally have listed curly quotes and an
        # ellipsis; confirm.
        text = re.sub('[' '""...]', '', text)
        text = re.sub('\n', '', text)
        return text

    # Two rounds of cleaning, stored next to the original text.
    data['clean_body'] = data['Body'].apply(clean_text_round1).apply(
        clean_text_round2)

    # Extract features and target.  Only the cleaned text is a feature: the
    # second positional argument of np.array is a dtype, not extra data.
    X = np.array(data['clean_body'])  # feature variable
    y = [int(label) for label in data['Label']]  # target variable

    # Split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # Create a document-term matrix for the train and test data, removing
    # all English stop words.
    tf = TfidfVectorizer(stop_words='english', max_df=0.7)
    tf_train = tf.fit_transform(X_train)
    tf_test = tf.transform(X_test)

    # Naive Bayes classifier
    nb = MultinomialNB()
    nb.fit(tf_train, y_train)
    nb_score = nb.score(tf_test, y_test)
    print('accuracy: %0.3f' % nb_score)

    # Logistic regression
    lr = LogisticRegression()
    lr.fit(tf_train, y_train)
    lr_score = lr.score(tf_test, y_test)
    print('accuracy: %0.3f' % lr_score)

    # Random forest
    rf = RandomForestClassifier()
    rf.fit(tf_train, y_train)
    rf_score = rf.score(tf_test, y_test)
    print('accuracy: %0.3f' % rf_score)

    # Save the logistic regression model and load it back for prediction.
    with open('log_model', 'wb') as f:
        pickle.dump(lr, f)
    with open('log_model', 'rb') as f:
        lr_model = pickle.load(f)

    if request.method == 'POST':
        article = request.form['article']
        input_article = [article]
        vect = tf.transform(input_article).toarray()
        lr_predict = lr_model.predict(vect)
        return render_template('result.html', prediction=lr_predict)
from sklearn.preprocessing import MinMaxScaler,StandardScaler scaler = StandardScaler() X_name = scaler.fit_transform(X_name) acc_sum_color = 0 acc_sum_face = 0 acc_sum_name = 0 acc_sum_hybrid = 0 kf = StratifiedKFold(n_splits=10) kf.get_n_splits(X_color,Y) X_hybrid = [None]*(count) for train_index, test_index in kf.split(X_color,Y): X_color_train, X_color_test = X_color[train_index], X_color[test_index] X_face_train, X_face_test = X_face[train_index], X_face[test_index] X_name_train, X_name_test = X_name[train_index], X_name[test_index] y_train, y_test = Y[train_index], Y[test_index] pnn = algorithms.PNN() pnn.fit(X_color_train, y_train) predicted_color_prob = pnn.predict_proba(X_color_test) pnn = algorithms.PNN() pnn.fit(X_face_train, y_train) predicted_face_prob = pnn.predict_proba(X_face_test)
# Getting the mean accuracy and standard deviation of the accuracy scores
mean_score = np.mean(cvscores)
std_score = np.std(cvscores)

# Printing the results.  Every print uses the single-argument call form,
# which prints the same on Python 2 and is valid syntax on Python 3.
print("####################################")
print("Accuracy:")
print(mean_score)
print("+/-")
print(std_score)
print("####################################")
print("Confusion Matrix:")
print(conf)
print("####################################")
print("ROC AND AUC")

# Chance diagonal, then the mean ROC curve averaged over the folds.
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
mean_tpr /= kfold.get_n_splits(x, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC CURVE FOR SIFT D = ' + str(p) + ' pixels')
plt.legend(loc="lower right")
plt.show()
print("##################################")
def poly(data, label, n_folds=10, scale=True, exclude=None,
         feature_selection=False, save=True, scoring='auc',
         project_name='', concurrency=1, verbose=True):
    '''
    Cross-validate every configured classifier, plus a voter built from them.

    Input
    data = numpy matrix with as many rows as samples
    label = numpy vector that labels each data row
    n_folds = number of folds to run
    scale = whether to scale data or not
    exclude = list of classifiers to exclude from the analysis
              (None, the default, excludes nothing)
    feature_selection = whether to use feature selection or not (anova)
    save = whether to save intermediate steps or not
    scoring = Type of score to use ['auc', 'f1']
    project_name = prefix used to save the intermediate steps
    concurrency = number of parallel jobs to run
    verbose = whether to print or not results

    Output (returned wrapped in a Report)
    scores = matrix with scores for each fold and classifier
    confusions = confusion matrix for each classifier
    predictions = Cross validated predictions for each classifier
    test_prob = cross validated probabilities for each classifier
    coefficients = per-fold coefficients for each classifier
    '''
    assert label.shape[0] == data.shape[0],\
        "Label dimesions do not match data number of rows"

    # A list literal as default would be shared between calls.
    exclude = [] if exclude is None else exclude

    _le = LabelEncoder()
    _le.fit(label)
    label = _le.transform(label)
    n_class = len(np.unique(label))
    n_samples = data.shape[0]

    if save:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs('poly_{}/models'.format(project_name), exist_ok=True)

    if not verbose:
        logger.setLevel(logging.ERROR)
    logger.info('Building classifiers ...')
    classifiers = build_classifiers(exclude, scale,
                                    feature_selection,
                                    data.shape[1])

    scores = pd.DataFrame(columns=pd.MultiIndex.from_product(
        [classifiers.keys(), ['train', 'test']]),
        index=range(n_folds))
    predictions = pd.DataFrame(columns=classifiers.keys(),
                               index=range(n_samples))
    test_prob = pd.DataFrame(columns=classifiers.keys(),
                             index=range(n_samples))
    confusions = {}
    coefficients = {}

    logger.info('Initialization, done.')

    # No shuffling is requested, so the folds are deterministic and a seed has
    # no effect; current scikit-learn rejects random_state without
    # shuffle=True, hence it is not passed.
    skf = StratifiedKFold(n_splits=n_folds)
    kf = list(skf.split(np.zeros(n_samples), label))

    # Parallel processing of tasks: the inputs live in a manager list so every
    # worker receives a proxy instead of its own pickled copy of the data.
    manager = Manager()
    args = manager.list()
    args.append({})  # Store inputs
    shared = args[0]
    shared['kf'] = kf
    shared['X'] = data
    shared['y'] = label
    args[0] = shared  # re-assign so the proxy notices the mutation

    # One task per (classifier, fold), classifier-major; the gather loop
    # below relies on this order.
    args2 = [(args, clf_name, val, n_fold, project_name, save, scoring)
             for clf_name, val in classifiers.items()
             for n_fold in range(n_folds)]

    if concurrency == 1:
        result = list(starmap(fit_clf, args2))
    else:
        pool = Pool(processes=concurrency)
        try:
            result = pool.starmap(fit_clf, args2)
        finally:
            # Release the workers even when a task raised.
            pool.close()
            pool.join()

    fitted_clfs = {key: [] for key in classifiers}

    # Gather results (consumed in the same order the tasks were queued).
    pending = iter(result)
    for clf_name in classifiers:
        coefficients[clf_name] = []
        temp = np.zeros((n_class, n_class))
        temp_pred = np.zeros((n_samples, ))
        temp_prob = np.zeros((n_samples, ))
        clfs = fitted_clfs[clf_name]
        for n in range(n_folds):
            (train_score, test_score, prediction, prob, confusion,
             coefs, fitted_clf) = next(pending)
            clfs.append(fitted_clf)
            scores.loc[n, (clf_name, 'train')] = train_score
            scores.loc[n, (clf_name, 'test')] = test_score
            temp += confusion
            # kf[n][1] holds the test indices of fold n.
            temp_prob[kf[n][1]] = prob
            temp_pred[kf[n][1]] = _le.inverse_transform(prediction)
            coefficients[clf_name].append(coefs)

        confusions[clf_name] = temp
        predictions[clf_name] = temp_pred
        test_prob[clf_name] = temp_prob

    # Voting
    fitted_clfs = pd.DataFrame(fitted_clfs)
    scores['Voting', 'train'] = np.zeros((n_folds, ))
    scores['Voting', 'test'] = np.zeros((n_folds, ))
    temp = np.zeros((n_class, n_class))
    temp_pred = np.zeros((n_samples, ))
    for n, (train, test) in enumerate(kf):
        # Row n of fitted_clfs holds every classifier fitted on fold n.
        clf = MyVoter(fitted_clfs.loc[n].values)
        X, y = data[train, :], label[train]
        scores.loc[n, ('Voting', 'train')] = _scorer(clf, X, y)
        X, y = data[test, :], label[test]
        scores.loc[n, ('Voting', 'test')] = _scorer(clf, X, y)
        # NOTE(review): unlike the per-classifier predictions above, these are
        # not passed through _le.inverse_transform -- confirm whether the
        # voter output should be mapped back to the original labels.
        temp_pred[test] = clf.predict(X)
        temp += confusion_matrix(y, temp_pred[test])

    confusions['Voting'] = temp
    predictions['Voting'] = temp_pred
    # NOTE(review): the voter's hard predictions are stored in the probability
    # table as well; no probability is computed for it here.
    test_prob['Voting'] = temp_pred

    # saving confusion matrices
    if save:
        with open('poly_' + project_name + '/confusions.pkl', 'wb') as f:
            p.dump(confusions, f, protocol=2)

    if verbose:
        print(scores.astype('float').describe().transpose()
              [['mean', 'std', 'min', 'max']])
    return Report(scores, confusions, predictions, test_prob, coefficients)
# Excerpt: outer cross-validation loop that undersamples each test fold.
# Per-fold metric collectors.
test_acc = []
test_f1 = []
test_precision = []
test_recall = []
# Per-fold class counts, each appended as [count, count].
all_num_trainval = []
all_num_test = []
all_num_under_test = []
# constructing stratified K-Folds for Outer-Cross-Validation (OCV)
# splitting into trainval & test datasets
skf = StratifiedKFold(n_splits=TEST_FOLD, random_state=RANDOM_STATE, shuffle=True)
# The return value (number of splits) is discarded here.
skf.get_n_splits(X, y)
test_count = 1
for trainval_index, test_index in skf.split(X, y):
    # NOTE(review): split() yields positional indices; y is used as a pandas
    # object below (value_counts), where plain [] indexing is label-based --
    # confirm X and y are indexed 0..n-1 or switch to .iloc.
    X_trainval, X_test = X[trainval_index], X[test_index]
    y_trainval, y_test = y[trainval_index], y[test_index]
    # undersampling test set
    print("-" * 70)
    print("START PROCESSING TEST SET")
    # value_counts() orders by frequency, so "class0" here is the more
    # frequent class of the fold, presumably label 0 -- TODO confirm.
    num_class0, num_class1 = y_test.value_counts()
    all_num_test.append([num_class0, num_class1])
    # Surplus of the larger class; handed to undersampling() below.
    num_us_instance = num_class0 - num_class1
    sample_test = pd.concat([y_test, X_test], axis=1)
    under_sample_test = undersampling(sample_test, num_us_instance)
    # Class counts after undersampling, read from the 'y' column.
    num_class0, num_class1 = under_sample_test['y'].value_counts()
def load_data_stratified(filename, fold, ispca, n_component):
    """Load a CSV data set and split it into 10 stratified folds.

    Every row of ``filename`` is read with ``csv.reader``; all columns except
    the last are converted to ``float`` and the last column is kept as the
    class label.  Timing information is printed and also written to a
    time-stamped file under ``extract/``.

    Args:
        filename: path of the CSV file to read.
        fold: currently unused; the number of folds is fixed at 10.
            NOTE(review): confirm whether this was meant to be ``n_splits``.
        ispca: when true, each fold is projected with a PCA fitted on that
            fold's training part only.
        n_component: number of PCA components (used only when ``ispca``).

    Returns:
        Four lists with one entry per fold: training features, training
        labels, test features, test labels.

    Raises:
        ValueError: if the file contains no rows.
    """
    tanggal = strftime("%d%m%y-%H%M%S")
    # The log is opened first (as before) but is now guaranteed to be closed.
    with open("extract/data-" + tanggal + ".txt", "w") as text_file:
        t0 = time()
        # NOTE(review): binary mode is the Python 2 csv convention; on
        # Python 3 this must become text mode with newline=''.
        with open(filename, 'rb') as csvfile:
            dataset = list(csv.reader(csvfile))
        if not dataset:
            raise ValueError("no rows found in %s" % filename)

        n_features = len(dataset[0]) - 1
        trainSet = []
        trainLabel = []
        for row in dataset:
            # Rows are converted in place, so `dataset` holds floats too.
            for col in range(n_features):
                row[col] = float(row[col])
            trainSet.append(row[:-1])
            trainLabel.append(row[-1])

        trainingSetFold = []
        trainingSetTFold = []
        testSetDFold = []
        testSetTFold = []

        skf = StratifiedKFold(n_splits=10)
        for train_index, test_index in skf.split(trainSet, trainLabel):
            trainingSetD = [trainSet[i] for i in train_index]
            trainingSetT = [trainLabel[i] for i in train_index]
            testSetD = [trainSet[i] for i in test_index]
            testSetT = [trainLabel[i] for i in test_index]

            if ispca:
                from sklearn.decomposition import PCA
                # Separate timer: reusing t0 here made the final "Load time"
                # cover only the span since the last PCA started.
                pca_t0 = time()
                pca = PCA(n_components=n_component)
                trainingSetD = pca.fit_transform(trainingSetD).tolist()
                testSetD = pca.transform(testSetD).tolist()
                timepreprocesss = ("%0.3fs" % (time() - pca_t0))
                print("PCA from " + str(n_features) + " to "
                      + str(n_component) + " done in %s" % timepreprocesss)
                text_file.write(
                    "PCA from %s to %s done in %s\n"
                    % (str(n_features), str(n_component), timepreprocesss))

            trainingSetFold.append(trainingSetD)
            trainingSetTFold.append(trainingSetT)
            testSetDFold.append(testSetD)
            testSetTFold.append(testSetT)

        timeload = ("%0.5fs" % (time() - t0))
        # Single-argument print(): same output on Python 2 and Python 3.
        print("Load time > " + timeload + ", Dimension > "
              + str(len(dataset)) + "*" + str(len(dataset[0])))
        text_file.write("Load time > %s ---- Dimension > %s * %s\n"
                        % (timeload, str(len(dataset)), str(len(dataset[0]))))
    return trainingSetFold, trainingSetTFold, testSetDFold, testSetTFold
] fig = plt.figure() ax = fig.add_subplot(1, 1, 1) for name, pipeline in pipelines: mean_tpr = 0.0 mean_fpr = np.linspace(0, 1, 100) for train, test in cv.split(X, y): probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test]) fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) mean_tpr += interp(mean_fpr, fpr, tpr) mean_tpr[0] = 0.0 roc_auc = auc(fpr, tpr) mean_tpr /= cv.get_n_splits(X, y) mean_tpr[-1] = 1.0 mean_auc = auc(mean_fpr, mean_tpr) plt.plot(mean_fpr, mean_tpr, linestyle='--', label='{} (area = %0.2f)'.format(name) % mean_auc, lw=LW) plt.plot([0, 1], [0, 1], linestyle='--', lw=LW, color='k', label='Luck') # make nice plotting ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left() ax.spines['left'].set_position(('outward', 10)) ax.spines['bottom'].set_position(('outward', 10))
if __name__ == '__main__': with open(op_file, 'w') as of: x_data, y_data = read_data(ip_txt_file) ext_feature = read_external_features(ip_txt_file, ip_feat_file) cv_count = 0 k_score = [] # Stratified cross-validation skf = StratifiedKFold(n_splits=sys_params['cross_val']) skf.get_n_splits(x_data, y_data) # Run the model for each splits for train_index, test_index in skf.split(x_data, y_data): cv_count += 1 print '\nRunning Stratified Cross Validation: {0}/{1}...'.format( cv_count, sys_params['cross_val']) x_train, x_test = x_data[train_index], x_data[test_index] y_train, y_test = y_data[train_index], y_data[test_index] # Convert the class labels into categorical y_train, y_test = to_categorical(y_train), to_categorical(y_test) # Reshape the data for CNN x_train = x_train.reshape(x_train.shape[0], x_train.shape[1],
# Excerpt: plot one ROC curve per cross-validation fold plus their mean.
# mean_tpr, mean_fpr, colors, lw, cv and classifier are defined outside this
# excerpt.
i = 0
# NOTE(review): zip() stops at the shorter input, so if `colors` has fewer
# entries than there are folds the extra folds are skipped silently while the
# mean below still divides by the full number of splits.
for (train, test), color in zip(cv.split(X, y), colors):
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    assert isinstance(probas_, np.ndarray)
    print(probas_.shape)
    # Compute ROC curve and area the curve
    # Column 1 of predict_proba is used as the score for the ROC curve.
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    # Accumulate this fold's TPR, interpolated onto the common FPR grid.
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    i += 1
# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
# Turn the accumulated sum into a mean and pin the curve's end at (1, 1).
mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.savefig(os.path.join(local_path, 'plot_roc_crossval.png'))
def main():
    """Run 5-fold stratified cross-validation of an RBF SVM and print metrics.

    Features are read from ``x_1170.txt`` and labels from ``y.txt`` (one
    character per sample).  20% of the data is split off as an independent
    test set that takes no part in the cross-validation.  For every fold the
    accuracy, AUC, sensitivity (Sn), specificity (Sp), F1 and MCC are printed,
    followed by their averages over all folds.
    """
    n_splits = 5

    feature_array_all = np.loadtxt('x_1170.txt', dtype=np.float32)
    with open("y.txt", "rb") as f:
        label_text = f.read().decode()
    # One character per label; whitespace such as a trailing newline is
    # skipped because it cannot be converted to a number.
    label_vector = np.array([ch for ch in label_text if not ch.isspace()],
                            dtype=np.float32)

    # The independent testing dataset is taken out and cannot participate in 5-CV
    X_trainset, X_testset, y_trainset, y_testset = train_test_split(
        feature_array_all, label_vector, test_size=0.2, random_state=0,
        stratify=label_vector)
    X = X_trainset
    y = y_trainset

    skf = StratifiedKFold(n_splits=n_splits, random_state=2, shuffle=True)

    metric_names = ('ACC', 'AUC', 'Sn', 'Sp', 'F1', 'MCC')
    totals = dict.fromkeys(metric_names, 0)

    for cnt, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = svm.SVC(probability=True, C=14.251026703029963,
                      gamma=0.007196856730011528)
        clf = clf.fit(X_train, y_train)
        predict_y_test = clf.predict(X_test)

        # Confusion counts for the binary 0/1 labels.
        y_validation = np.array(y_test, dtype=int)
        y_hat = np.array(predict_y_test, dtype=int)
        TP = int(np.sum((y_validation == 1) & (y_hat == 1)))
        FN = int(np.sum((y_validation == 1) & (y_hat == 0)))
        TN = int(np.sum((y_validation == 0) & (y_hat == 0)))
        FP = int(np.sum((y_validation == 0) & (y_hat == 1)))

        fold = {}
        fold['Sn'] = float(TP) / (TP + FN)
        fold['Sp'] = float(TN) / (TN + FP)
        fold['ACC'] = float(TP + TN) / (TP + TN + FP + FN)

        # Column 1 of predict_proba is the score for the positive class.
        predictions_test = clf.predict_proba(X_test)[:, 1]
        fpr, tpr, thresholds = metrics.roc_curve(
            y_validation, predictions_test, pos_label=1)
        fold['AUC'] = auc(fpr, tpr)
        fold['F1'] = metrics.f1_score(y_validation, y_hat)
        fold['MCC'] = metrics.matthews_corrcoef(y_validation, y_hat)

        print('Times=%s' % cnt)
        for name in metric_names:
            print('svm %s:%s' % (name, fold[name]))
            totals[name] += fold[name]

    print('')
    print('5-Fold cross validation_Conclusion')
    for name in metric_names:
        # Average over the folds actually configured above.
        print('svm %s:%s' % (name, totals[name] / n_splits))
def identification(data,data_flip,labels,thread_cnt,data_filename):
    """Run 5-fold identification with a linear SVM and print pooled results.

    Each fold trains on the original training rows plus their flipped
    counterparts from ``data_flip``, z-scores train and test with the training
    statistics, and predicts the held-out original rows.  The predictions of
    all folds are pooled into one confusion matrix from which TP/FP/FN/TN,
    accuracy, FAR and FRR are summed over the classes.

    Args:
        data: 2-D array of feature rows.
        data_flip: array aligned row-by-row with ``data``; used for training
            only.
        labels: class label for every row of ``data``.
        thread_cnt: unused.
        data_filename: printed as the header of the result block.
    """
    print("Identification")

    # Get k-fold split of dataset (k=5).  No shuffling is requested, so a
    # seed has no effect; current scikit-learn rejects random_state together
    # with shuffle=False, hence it is not passed.
    cv = StratifiedKFold(n_splits=5, shuffle=False)

    ### Perform k-fold cross validation
    y_prob_list = []
    true_parts = []
    pred_parts = []
    for k, (train_index, test_index) in enumerate(cv.split(data, labels)):
        print(" Fold - " + str(k))

        # Get training and testing sets
        train = np.vstack([data[train_index, :], data_flip[train_index, :]])
        train_labels = np.append(labels[train_index], labels[train_index])
        test = data[test_index, :]
        test_labels = labels[test_index]

        # Normalize to z-scores
        mu = np.mean(train, axis=0)
        std = np.std(train, axis=0)
        # A constant feature has std 0; dividing by it would turn the whole
        # column into NaN, so leave such columns merely centred.
        std[std == 0] = 1.0
        train = (train - mu) / std
        test = (test - mu) / std

        ### TRAINING
        model = SVC(kernel='linear', probability=True)
        model.fit(train, train_labels)

        ### TESTING
        prediction = model.predict(test)
        prob = model.predict_proba(test)

        # predict_proba columns follow model.classes_, so look the column up
        # instead of assuming the labels are exactly 1..K.
        column_of = {cls: col for col, cls in enumerate(model.classes_)}
        for i, label in enumerate(test_labels):
            col = column_of.get(label)
            # A class never seen in training gets no probability mass.
            y_prob_list.append(prob[i, col] if col is not None else 0.0)

        true_parts.append(test_labels)
        pred_parts.append(prediction)
    print()

    ### OVERALL RESULTS
    y_true = np.concatenate(true_parts)
    y_pred = np.concatenate(pred_parts)
    cm = metrics.confusion_matrix(y_true, y_pred)

    # Rows of the matrix are true classes and columns are predicted classes.
    tp_per_class = np.diag(cm)
    fn_per_class = cm.sum(axis=1) - tp_per_class
    fp_per_class = cm.sum(axis=0) - tp_per_class
    tn_per_class = cm.sum() - tp_per_class - fp_per_class - fn_per_class

    TP = tp_per_class.sum()
    FP = fp_per_class.sum()
    FN = fn_per_class.sum()
    TN = tn_per_class.sum()

    ACC = (TP + TN) / (TP + TN + FP + FN)
    FAR = FP / (FP + TN)
    FRR = FN / (FN + TP)

    # Print results
    print(data_filename)
    print("--------------------------------------------------------------------------------------")
    print("Identification Results:")
    print("TP: " + str(TP) + "\n" +
          "FP: " + str(FP) + "\n" +
          "FN: " + str(FN) + "\n" +
          "TN: " + str(TN) + "\n" +
          "ACC: " + str(ACC) + "\n" +
          "FAR: " + str(FAR) + "\n" +
          "FRR: " + str(FRR))
    # Lowest probability assigned to the true class of any test sample.
    print(str(min(y_prob_list)))
    print()
def _build_cnn_layers(conv_activation, kernel_size, num_filters, pool_size,
                      dense_layer_size, num_dense_layers, dense_activation,
                      num_conv_layers, num_pool_layers, image_width, stride):
    """Return a list of new, untrained Keras layers for one CNN instance."""
    layers = []
    for i in range(num_conv_layers):
        conv_kwargs = dict(kernel_size=kernel_size,
                           activation=conv_activation,
                           strides=(stride, stride))
        if i == 0:
            # Only the first layer declares the input shape.
            conv_kwargs['input_shape'] = (image_width, image_width, 1)
        layers.append(Conv2D(num_filters, **conv_kwargs))
        if i < num_pool_layers:
            layers.append(MaxPooling2D(pool_size=(pool_size, pool_size),
                                       strides=(stride, stride)))
        # Shrink the kernel by one per conv layer, never below 3.
        if kernel_size > 3:
            kernel_size -= 1
    for i in range(num_dense_layers):
        if i == 0:
            layers.append(Flatten())
        layers.append(Dense(dense_layer_size, activation=dense_activation))
    # NOTE(review): a sigmoid output is paired with categorical_crossentropy
    # in make_cnn; softmax is the usual match -- left unchanged, confirm.
    layers.append(Dense(2, activation='sigmoid'))
    return layers


def make_cnn(params):
    """Objective function: negative mean 5-fold accuracy of a CNN.

    Args:
        params: dict with the keys 'activ', 'kern', 'nfilt', 'pool_size',
            'dense_size', 'num_dense_layers', 'dense_activ', 'num_conv',
            'num_pool', 'train_images', 'train_labels', 'image_width' and
            'stride'.  'num_images' is no longer needed because fold sizes
            are taken from the folds themselves.

    Returns:
        Minus the accuracy averaged over the validation folds, or 1000 when
        building, training or evaluating the network raises.

    Raises:
        KeyError: if a required key is missing from ``params``.
    """
    conv_activation = params['activ']
    kernel_size = params['kern']
    num_filters = params['nfilt']
    pool_size = params['pool_size']
    dense_layer_size = params['dense_size']
    num_dense_layers = params['num_dense_layers']
    dense_activation = params['dense_activ']
    num_conv_layers = params['num_conv']
    num_pool_layers = params['num_pool']
    train_images = params['train_images']
    train_labels = params['train_labels']
    image_width = params['image_width']
    stride = params['stride']

    try:
        images = np.asarray(train_images)
        labels = np.asarray(train_labels)

        # measure 5-fold classification accuracy
        num_folds = 5
        kfold = StratifiedKFold(n_splits=num_folds, shuffle=True,
                                random_state=1)
        average_train_performance = 0
        for train_indices, test_indices in kfold.split(train_images,
                                                       train_labels):
            # Build new layers for every fold: reusing the same layer objects
            # would carry trained weights over from the previous fold and
            # contaminate the validation score.
            cnn_model = Sequential(_build_cnn_layers(
                conv_activation, kernel_size, num_filters, pool_size,
                dense_layer_size, num_dense_layers, dense_activation,
                num_conv_layers, num_pool_layers, image_width, stride))
            cnn_model.summary()
            cnn_model.compile(optimizer='sgd',
                              loss='categorical_crossentropy',
                              metrics=['accuracy'])

            # -1 lets numpy infer the fold size; stratified folds differ in
            # size when the sample count is not a multiple of the fold count.
            image_shape = [-1, image_width, image_width, 1]
            fit_images = np.reshape(images[train_indices], image_shape)
            fit_labels = np.reshape(
                to_categorical(labels[train_indices], num_classes=2),
                [-1, 2])
            val_images = np.reshape(images[test_indices], image_shape)
            val_labels = np.reshape(
                to_categorical(labels[test_indices], num_classes=2),
                [-1, 2])

            cnn_model.fit(fit_images, fit_labels, epochs=20, batch_size=16)

            # measure classification accuracy for the validation fold
            train_performance = cnn_model.evaluate(val_images, val_labels)
            average_train_performance += train_performance[1]
            print("Accuracy on Train set: {0}".format(train_performance[1]))

        # return avg classification accuracy over the folds
        average_train_performance /= num_folds
        return -average_train_performance
    # if an error happens, return a big number. this can be triggered
    # sometimes, for instance if there are more pool layers than conv layers
    except Exception as e:
        print("ERROR: {}".format(e))
        return 1000
ycat.name = ycat.name + '_cat'; ########################################################################################## # <PLACEHOLDER FOR NON-GENERIC CODE: INSERT CODE HERE> X = X.dropna(); y = np.log(200+y); ycat = pd.qcut(y, quantiles); ycat.name = ycat.name + '_cat'; # <PLACEHOLDER FOR NON-GENERIC CODE: INSERT CODE HERE> ########################################################################################## # Get first iteration of the k-fold indices, use it for the train-validation split # Other iterations may be used later #print 'Splitting training data into training and validation sets...'; skf = StratifiedKFold(n_splits=int(1./validation_size), shuffle=True); skf.get_n_splits(X, y); train_indices, valid_indices = next(iter(skf.split(X, ycat))); # Scale the numeric columns if required. X = X.join(pd.Series('TRAIN', index=train_indices, name = 'rowtype').append(pd.Series('VALID', index=valid_indices, name = 'rowtype'))); X_test=test_dataset.join(pd.Series('TEST', index=test_dataset.index, name = 'rowtype')); # Combine train, valid and test covariates to create a consolidated covariate set covariates = pd.concat([X, X_test], axis=0, ignore_index=True); # If id column does not exist, create one. if (idcol is None) or ( len(idcol) == 0 ): idcol = 'id'; covariates=covariates.join(pd.Series( range(1, len(covariates) + 1,1), index=covariates.index, name = idcol )); # Find and add columns with zero std deviation to irrelevant columns- These add no information. irrelevant_cols = irrelevant_cols + (covariates.std(axis=0, numeric_only=True) < 0.5)[(covariates.std(axis=0) == 0.0)].index.tolist();
def main():
    """Learning-curve experiment over growing fractions of the training set.

    For each fraction in 0.1, 0.2, ..., 1.0 a subset of the training data is
    drawn, evaluated with 10-fold stratified cross-validation, and the
    per-fold accuracies and AUCs are printed.
    """
    CID = opts.cluster
    if opts.load != 'none':
        CID = opts.load

    X_train, X_test, Y_train, Y_test, X, X2, X3, enc = f.get_data_pro(
        testsize=0.2)

    for fraction in np.linspace(.1, 1.0, 10):
        # Keep `fraction` of the training data; the remainder is discarded.
        subset = train_test_split(
            X_train, Y_train, test_size=1 - fraction, random_state=0)
        x_train, y_train = subset[0], subset[2]

        splitter = StratifiedKFold(n_splits=10)
        # Stratify on the decoded labels.
        strata = decode_y(y_train)
        splitter.get_n_splits(x_train, strata)

        fold_accuracies = []
        fold_aucs = []
        for fit_idx, val_idx in splitter.split(x_train, strata):
            print("TRAIN:", fit_idx, "TEST:", val_idx)
            x_cvtrain = x_train[fit_idx]
            x_validate = x_train[val_idx]
            y_cvtrain = y_train[fit_idx]
            y_validate = y_train[val_idx]

            model = bulid_model(x_cvtrain, x_validate, y_cvtrain, y_validate,
                                X, X2, X3, CID, fromfile=opts.load)

            true_labels = decode_y(y_validate, features=enc.active_features_)
            raw_pred = model.predict(x_validate)
            scores = model.predict_proba(x_validate)
            # Column 1 of predict_proba is used as the ROC score.
            roc_points = roc_curve(true_labels, scores[:, 1])
            fpr, tpr = roc_points[0], roc_points[1]
            pred_labels = decode_y(raw_pred, features=enc.active_features_)

            fold_accuracies.append(accuracy_score(pred_labels, true_labels))
            fold_aucs.append(auc(fpr, tpr))

        print("###########################")
        print(fold_accuracies)
        print(fold_aucs)

    # Disabled single-run evaluation, kept for reference:
    # model, history = bulid_model(
    #     X_train, X_test, Y_train, Y_test, X, X2, X3, CID, fromfile=opts.load)
    # #newData = X_test.reshape(X_test.shape[0], 1, 100, 20)
    # Y_score = model.predict_proba(X_test)
    # roc.roc_plot(
    #     Y_test,
    #     Y_score,
    #     2,
    #     filepath=os.path.join('figures', CID + opts.title + 'roc.svg'),
    #     fmt='svg',
    #     title=opts.title)
    # plt.close()
    # print(history.history.keys())
    # # summarize history for accuracy
    # plt.plot(history.history['acc'])
    # plt.plot(history.history['val_acc'])
    # plt.title('model accuracy')
    # plt.ylabel('accuracy')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.savefig(os.path.join('figures', CID + opts.title + 'learning-c.svg'),format='svg')
    # plt.close()
    #Y_de = decode_y(Y_test, features=enc.active_features_)
    #Y_pred = model.predict(X_test)
    #Y_depred = decode_y(Y_pred, features=enc.active_features_)
    #print(classification_report(Y_de, Y_depred))
    return