def validateseq2(X_all, y, features, clf, score, v=False, esr=50, sk=5):
    # Users who ordered in the 62 days before day 336 form the validation set.
    temp_user = target_order[(target_order.o_day_series < 336) &
                             (target_order.o_day_series >= 274)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 336
    print('before delete: {}'.format(X_all.shape))
    X = temp_user.merge(X_all, on=['user_id', 'CreateGroup'], how='left')
    print('after delete: {}'.format(X.shape))
    # Users who ordered in the 91 days before day 306 form the first-level training set.
    temp_user = target_order[(target_order.o_day_series < 306) &
                             (target_order.o_day_series >= 215)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 306
    print('before delete: {}'.format(X_all.shape))
    X2 = temp_user.merge(X_all, on=['user_id', 'CreateGroup'], how='left')
    print('after delete: {}'.format(X2.shape))
    kf = KFold(n_splits=sk)
    print(len(features))
    # First level: train on X2 and stack averaged fold probabilities onto X.
    X['Prob_x'] = 0
    for train_index, test_index in kf.split(X2):
        X_train, X_test = X2.iloc[train_index, :], X2.iloc[test_index, :]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X2.iloc[train_index, :].buy, X2.iloc[test_index, :].buy
        clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
                eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        X['Prob_x'] = X['Prob_x'] + clf.predict_proba(X[features])[:, 1] / sk
    # Second level: cross-validate on X with the stacked probability as an extra feature.
    Performance = []
    features.append('Prob_x')
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X.iloc[train_index, :].buy, X.iloc[test_index, :].buy
        clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
                eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        pred = clf.predict_proba(X_test)[:, 1]
        Performance.append(roc_auc_score(y_test, pred))
    print("Mean Score: {}".format(np.mean(Performance)))
    return np.mean(Performance), clf
def test_cross_val_multiscore():
    """Test cross_val_multiscore for computing scores on decoding over time."""
    from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
    from sklearn.linear_model import LogisticRegression, LinearRegression

    # compare to cross-val-score
    X = np.random.rand(20, 3)
    y = np.arange(20) % 2
    clf = LogisticRegression()
    cv = KFold(2, random_state=0)
    assert_array_equal(cross_val_score(clf, X, y, cv=cv),
                       cross_val_multiscore(clf, X, y, cv=cv))

    # Test with search light
    X = np.random.rand(20, 4, 3)
    y = np.arange(20) % 2
    clf = SlidingEstimator(LogisticRegression(), scoring='accuracy')
    scores_acc = cross_val_multiscore(clf, X, y, cv=cv)
    assert_array_equal(np.shape(scores_acc), [2, 3])

    # check values
    scores_acc_manual = list()
    for train, test in cv.split(X, y):
        clf.fit(X[train], y[train])
        scores_acc_manual.append(clf.score(X[test], y[test]))
    assert_array_equal(scores_acc, scores_acc_manual)

    # check scoring metric
    # raise an error if scoring is defined at cross-val-score level and
    # search light, because search light does not return a 1-dimensional
    # prediction.
    assert_raises(ValueError, cross_val_multiscore, clf, X, y, cv=cv,
                  scoring='roc_auc')
    clf = SlidingEstimator(LogisticRegression(), scoring='roc_auc')
    scores_auc = cross_val_multiscore(clf, X, y, cv=cv, n_jobs=1)
    scores_auc_manual = list()
    for train, test in cv.split(X, y):
        clf.fit(X[train], y[train])
        scores_auc_manual.append(clf.score(X[test], y[test]))
    assert_array_equal(scores_auc, scores_auc_manual)

    # indirectly test that cross_val_multiscore rightly detects the type of
    # estimator and generates a StratifiedKFold for classifiers and a KFold
    # otherwise
    X = np.random.randn(1000, 3)
    y = np.r_[np.zeros(500), np.ones(500)]
    clf = LogisticRegression(random_state=0)
    reg = LinearRegression()
    for cross_val in (cross_val_score, cross_val_multiscore):
        manual = cross_val(clf, X, y, cv=StratifiedKFold(2))
        auto = cross_val(clf, X, y, cv=2)
        assert_array_equal(manual, auto)
        assert_raises(ValueError, cross_val, clf, X, y, cv=KFold(2))
        manual = cross_val(reg, X, y, cv=KFold(2))
        auto = cross_val(reg, X, y, cv=2)
        assert_array_equal(manual, auto)
def predict2(X_all, X_new, features, clf, score, v=False, esr=50, sk=3, fn='submission'):
    temp_user = target_order[(target_order.o_day_series < 336) &
                             (target_order.o_day_series >= 274)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 336
    print('before delete: {}'.format(X_all.shape))
    X = temp_user.merge(X_all, on=['user_id', 'CreateGroup'], how='left')
    print('after delete: {}'.format(X.shape))
    temp_user = target_order[(target_order.o_day_series < 366) &
                             (target_order.o_day_series >= 366 - 74)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 366
    print(-1 in temp_user.user_id)
    print(4366 in temp_user.user_id)
    print('before delete: {}'.format(X_new.shape))
    X_new = temp_user.merge(X_new, on=['user_id', 'CreateGroup'], how='left')
    temp_user = target_order[(target_order.o_day_series < 306) &
                             (target_order.o_day_series >= 215)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 306
    print('before delete: {}'.format(X_all.shape))
    X2 = temp_user.merge(X_all, on=['user_id', 'CreateGroup'], how='left')
    print('Train: {}'.format(X_new.shape))
    kf = KFold(n_splits=sk)
    print(len(features))
    Performance = []
    X_new['Prob'] = 0
    X_new['Prob_x'] = 0
    X['Prob_x'] = 0
    # First level: train on X2, stack averaged fold probabilities onto X and X_new.
    for train_index, test_index in kf.split(X2):
        X_train, X_test = X2.iloc[train_index, :], X2.iloc[test_index, :]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X2.iloc[train_index, :].buy, X2.iloc[test_index, :].buy
        clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
                eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        X_new['Prob_x'] = X_new['Prob_x'] + clf.predict_proba(X_new[features])[:, 1] / sk
        X['Prob_x'] = X['Prob_x'] + clf.predict_proba(X[features])[:, 1] / sk
    # Second level: cross-validate on X and accumulate test-set predictions.
    features.append('Prob_x')
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X.iloc[train_index, :].buy, X.iloc[test_index, :].buy
        clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
                eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        pred = clf.predict_proba(X_test)[:, 1]
        X_new['Prob'] = X_new['Prob'] + clf.predict_proba(X_new[features])[:, 1] / sk
        Performance.append(roc_auc_score(y_test, pred))
    print("Mean Score: {}".format(np.mean(Performance)))
    importantlist = []
    for i, j in zip(features, clf.feature_importances_):
        importantlist.append([j, i])
    print(sorted(importantlist)[::-1])
    first_day = datetime.datetime.strptime('2017-08-31 00:00:00', '%Y-%m-%d %H:%M:%S')
    X_new['Days'] = np.random.randint(15, size=len(X_new))
    X_new['pred_date'] = X_new['Days'].apply(
        lambda x: (datetime.timedelta(days=x) + first_day).strftime("%Y-%m-%d"))
    X_new.sort_values(by=['Prob'], ascending=False, inplace=True)
    X_new[['user_id', 'Prob']].to_csv('prob_{}.csv'.format(fn), index=None)
    X_new[['user_id', 'pred_date']][:50000].to_csv('{}.csv'.format(fn), index=None)
    return np.mean(Performance), clf
def KFold_method(self):
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(self.FeatureSet):
        X_train = []
        X_test = []
        y_train = []
        y_test = []
        for trainid in train_index.tolist():
            X_train.append(self.FeatureSet[trainid])
            y_train.append(self.Label[trainid])
        for testid in test_index.tolist():
            X_test.append(self.FeatureSet[testid])
            y_test.append(self.Label[testid])
        #clf = tree.DecisionTreeClassifier()
        #clf = clf.fit(X_train, y_train)
        #pre_labels = clf.predict(X_test)
        clf = AdaBoostClassifier(n_estimators=100)
        clf = clf.fit(X_train, y_train)
        pre_labels = clf.predict(X_test)
        # Model evaluation
        ACC = metrics.accuracy_score(y_test, pre_labels)
        MCC = metrics.matthews_corrcoef(y_test, pre_labels)
        SN = self.performance(y_test, pre_labels)
        print(ACC, SN)
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train) >= far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0

        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
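# A minimal driver sketch for calculate_val() above, assuming a facenet-style
# setup in which calculate_val_far() is defined alongside it. The embedding
# data, the threshold grid, and the 1e-3 FAR target below are illustrative
# assumptions, not values taken from the source.
import numpy as np

rng = np.random.RandomState(0)
embeddings1 = rng.randn(100, 128)     # hypothetical pair embeddings
embeddings2 = rng.randn(100, 128)
actual_issame = rng.rand(100) > 0.5   # hypothetical same/different labels

thresholds = np.arange(0, 4, 0.01)    # sweep on squared L2 distance
val_mean, val_std, far_mean = calculate_val(
    thresholds, embeddings1, embeddings2, actual_issame,
    far_target=1e-3, nrof_folds=10)
print('VAL: %.3f+-%.3f @ FAR=%.3f' % (val_mean, val_std, far_mean))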
def cross_validate(self, values_labels, folds=10, processes=1):
    """
    Trains and tests the model against folds of labeled data.

    :Parameters:
        values_labels : [( `<feature_values>`, `<label>` )]
            an iterable of labeled data where `<feature_values>` is an
            ordered collection of predictive values that correspond to
            the `Feature` s provided to the constructor
        folds : `int`
            the number of cross-validation folds
        processes : `int`
            When set to 1, cross-validation will run in the parent
            thread. When set to 2 or greater, a
            :class:`multiprocessing.Pool` will be created.
    """
    folds_i = KFold(n_splits=folds, shuffle=True, random_state=0)
    if processes == 1:
        mapper = map
    else:
        pool = Pool(processes=processes or cpu_count())
        mapper = pool.map
    results = mapper(self._cross_score,
                     ((i, [values_labels[i] for i in train_i],
                       [values_labels[i] for i in test_i])
                      for i, (train_i, test_i) in enumerate(
                          folds_i.split(values_labels))))
    agg_score_labels = []
    for score_labels in results:
        agg_score_labels.extend(score_labels)

    self.info['statistics'].fit(agg_score_labels)

    return self.info['statistics']
def test_cross_val_predict_with_method():
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=0)
    classes = len(set(y))
    kfold = KFold(len(iris.target))
    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        predictions = cross_val_predict(est, X, y, method=method)
        assert_equal(len(predictions), len(y))

        expected_predictions = np.zeros([len(y), classes])
        func = getattr(est, method)

        # Naive loop (should be same as cross_val_predict):
        for train, test in kfold.split(X, y):
            est.fit(X[train], y[train])
            expected_predictions[test] = func(X[test])

        predictions = cross_val_predict(est, X, y, method=method, cv=kfold)
        assert_array_almost_equal(expected_predictions, predictions)
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy(
                threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(
            thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])

    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy
def CV_mean(X_slct, y, test_slct, model_name='RandomForest', model_obj=sk_ens.RandomForestRegressor,
            model_params=rf_params, eval_func=r2_score, nFolds=5, gen_rand_func=gen_rand):
    k_fold = KFold(n_splits=nFolds, shuffle=True, random_state=gen_rand_func())
    cv_scores = []
    model_li = []
    preds = []
    for train_index, test_index in k_fold.split(X_slct, y):
        X_train, X_test = X_slct[train_index, :], X_slct[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        # re-seed the model for every fold
        if 'random_state' in model_params:
            model_params['random_state'] = gen_rand_func()
        elif 'seed' in model_params:
            model_params['seed'] = gen_rand_func()
        model = model_obj(**model_params)
        model.fit(X_train, y_train)
        scr = eval_func(y_test, model.predict(X_test))
        print('Score of ' + model_name + ':', scr)
        model_li.append(model)
        cv_scores.append(scr)
        pred = model.predict(test_slct)
        preds.append(pred)
    plt.plot(cv_scores)
    plt.show()
    winner_pred = preds[cv_scores.index(max(cv_scores))]
    print('CV_mean ' + model_name + ':', np.mean(cv_scores))
    return np.mean(cv_scores), winner_pred
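# A hedged call sketch for CV_mean() above, assuming X_slct/test_slct are
# NumPy matrices and that the module-level defaults (rf_params, gen_rand)
# exist as the signature implies; the parameter dict here is illustrative.
import sklearn.ensemble as sk_ens
from sklearn.metrics import r2_score

mean_score, winner_pred = CV_mean(
    X_slct, y, test_slct,
    model_name='RandomForest',
    model_obj=sk_ens.RandomForestRegressor,
    model_params={'n_estimators': 200, 'random_state': 0},  # illustrative params
    eval_func=r2_score, nFolds=5)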
def compute_matrices_for_gradient_totalcverr(self, train_x, train_y, train_z):
    if self.kernelX_use_median:
        sigmax = self.kernelX.get_sigma_median_heuristic(train_x)
        self.kernelX.set_width(float(sigmax))
    if self.kernelY_use_median:
        sigmay = self.kernelY.get_sigma_median_heuristic(train_y)
        self.kernelY.set_width(float(sigmay))
    kf = KFold(n_splits=self.K_folds)
    matrix_results = [[[None] for _ in range(self.K_folds)] for _ in range(8)]
    # xx=[[None]*10]*6 would give the same id to xx[0][0] and xx[1][0] etc., as
    # that command simply copies [None] many times. The above gives different ids.
    count = 0
    for train_index, test_index in kf.split(np.ones((self.num_samples, 1))):
        X_tr, X_tst = train_x[train_index], train_x[test_index]
        Y_tr, Y_tst = train_y[train_index], train_y[test_index]
        Z_tr, Z_tst = train_z[train_index], train_z[test_index]
        matrix_results[0][count] = self.kernelX.kernel(X_tst, X_tr)   # Kx_tst_tr
        matrix_results[1][count] = self.kernelX.kernel(X_tr, X_tr)    # Kx_tr_tr
        matrix_results[2][count] = self.kernelX.kernel(X_tst, X_tst)  # Kx_tst_tst
        matrix_results[3][count] = self.kernelY.kernel(Y_tst, Y_tr)   # Ky_tst_tr
        matrix_results[4][count] = self.kernelY.kernel(Y_tr, Y_tr)    # Ky_tr_tr
        matrix_results[5][count] = self.kernelY.kernel(Y_tst, Y_tst)  # Ky_tst_tst
        matrix_results[6][count] = cdist(Z_tst, Z_tr, 'sqeuclidean')  # D_tst_tr: squared distance matrix
        matrix_results[7][count] = cdist(Z_tr, Z_tr, 'sqeuclidean')   # D_tr_tr: squared distance matrix
        count = count + 1
    return matrix_results
def kFolds(dataSet, k=10):
    """
    This is the k-fold method
    :param dataSet: of type DataFrame
    :param k: number of subsets to choose
    """
    df_mx = dataSet.to_numpy()  # .as_matrix() was removed in recent pandas
    X = df_mx[:, 1:16]
    Y = df_mx[:, 0:1]
    lm = svm.SVC(gamma=0.001, C=100.)  # Support Vector Machine
    kf = KFold(n_splits=k)  # Define the split - into k folds
    i = 0
    accuracies = numpy.zeros(kf.get_n_splits(X))
    for train_index, test_index in kf.split(X):
        print("{}. TRAIN: {} TEST: {}".format(i + 1, train_index, test_index))
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        # train using X_train
        model = lm.fit(X_train, Y_train.ravel())
        # evaluate against X_test
        predictions = lm.predict(X_test)
        # save accuracy
        accuracies[i] = model.score(X_test, Y_test)
        i = i + 1
    # find mean accuracy over all rounds
    print("Average accuracy of K-Folds (k={}): {}%".format(k, numpy.mean(accuracies) * 100))
def predict_model_kfold(name, path, features_type, label_name, data):
    kfold = KFold(n_splits=10, shuffle=True)
    # RandomForest -I 1000 -K 0 -S 1 -num-slots 1
    model = BalancedRandomForestClassifier(n_estimators=1000, max_depth=5)
    index = 0
    size = data.shape[0]
    all_predictions = 0
    x = data.drop('hasBug', axis=1)
    y = data['hasBug']
    num_of_bugs = data.loc[data['hasBug'] == 1].shape[0]
    num_of_all_instances = data.shape[0]
    bug_precent = float(num_of_bugs) / float(num_of_all_instances)
    for train, test in kfold.split(data):
        index += 1
        prediction_train = model.fit(x.iloc[train], y.iloc[train]).predict(x.iloc[test])
        all_predictions += create_all_eval_results(False, y.iloc[test], prediction_train, name,
                                                   "training", features_type, num_of_bugs,
                                                   num_of_all_instances, bug_precent, None)
    all_predictions /= index
    start_list = [name, "training", features_type, "sklearn - python"]
    result_list = start_list + all_predictions.tolist()
    global results_all_projects
    results_all_projects.loc[len(results_all_projects)] = result_list
    model.fit(x, y)
    return model
class CorpusLoader(object):

    def __init__(self, reader, folds=12, shuffle=True, categories=None):
        self.reader = reader
        self.folds = KFold(n_splits=folds, shuffle=shuffle)
        self.files = np.asarray(self.reader.fileids(categories=categories))

    def fileids(self, idx=None):
        if idx is None:
            return self.files
        return self.files[idx]

    def documents(self, idx=None):
        for fileid in self.fileids(idx):
            yield list(self.reader.docs(fileids=[fileid]))

    def labels(self, idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(idx)
        ]

    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)
            X_test = self.documents(test_index)
            y_test = self.labels(test_index)
            yield X_train, X_test, y_train, y_test
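# A usage sketch for CorpusLoader above, assuming a categorized corpus reader
# exposing the fileids(categories=...), docs(fileids=...), and
# categories(fileids=...) methods the class already calls; `reader` and
# `model` are placeholders, not names from the source.
loader = CorpusLoader(reader, folds=12, shuffle=True)

# Each iteration yields one train/test split; documents() returns a generator,
# so a fold's documents are only read when the estimator consumes them.
for X_train, X_test, y_train, y_test in loader:
    model.fit(list(X_train), y_train)
    print(model.score(list(X_test), y_test))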
def _iter_test_masks(self, X, y=None, groups=None):
    # yields mask array for test splits
    n_samples = X.shape[0]

    # if groups is not specified, the entire data is treated as one group
    if groups is None:
        groups = np.zeros(n_samples, dtype=int)

    # constants
    indices = np.arange(n_samples)
    test_fold = np.empty(n_samples, dtype=bool)
    rng = check_random_state(self.random_state)
    group_indices = np.unique(groups)
    iters = np.empty(group_indices.shape[0], dtype=object)

    # generate one KFold iterator per group
    cv = KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=rng)
    for i, g in enumerate(group_indices):
        group_member = indices[groups == g]
        iters[i] = cv.split(group_member)

    # generate training and test splits
    for fold in range(self.n_splits):
        test_fold[:] = False
        for i, g in enumerate(group_indices):
            group_train_i, group_test_i = next(iters[i])
            test_fold[indices[groups == g][group_test_i]] = True
        yield test_fold
def select(self):
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # Implement model selection using CV
    NB_SPLITS = 3
    mean_scores = []
    # shuffle=True is required for random_state to take effect in recent scikit-learn
    split_method = KFold(n_splits=NB_SPLITS, shuffle=True, random_state=self.random_state)
    n_components = range(self.min_n_components, self.max_n_components + 1)
    try:
        for n_component in n_components:
            model = self.base_model(n_component)
            kfold_scores = []
            for _, test_idx in split_method.split(self.sequences):
                test_X, test_length = combine_sequences(test_idx, self.sequences)
                kfold_scores.append(model.score(test_X, test_length))
            mean_scores.append(np.mean(kfold_scores))
    except Exception as e:
        pass

    if len(mean_scores) > 0:
        states = n_components[np.argmax(mean_scores)]
    else:
        states = self.n_constant

    return self.base_model(states)
def original_data():
    for target in TARGETS:
        for algo_str in ALGORITHMS:
            algorithm = importlib.import_module('src.multi_class.' + algo_str)
            encoded_data = input_preproc.readFromDataset(
                INPUT_DIR + ORIGINAL_DATA_FILE,
                INPUT_COLS['original'],
                target
            )
            # Split into predictors and target
            X = np.array(encoded_data[encoded_data.columns.difference([target])])
            y = np.array(encoded_data[target])
            kf = KFold(n_splits=CROSS_VALIDATION_K, shuffle=True)

            f1s = []

            for train_index, test_index in kf.split(X):
                X_train, y_train = X[train_index], y[train_index]
                X_test, y_test = X[test_index], y[test_index]

                scaler = preprocessing.StandardScaler()
                X_train = pd.DataFrame(scaler.fit_transform(X_train))  # , columns=X_train.columns)
                X_test = scaler.transform(X_test)

                precision, recall, f1_score, accuracy = algorithm.runClassifier(X_train, X_test, y_train, y_test)
                f1s.append(f1_score)

            final_f1 = sum(f1s) / len(f1s)
            print("\n================================")
            print("%s, %s, F1 Score: %.6f" % (target, algo_str, final_f1))
            print("================================\n")
def test_regression_with_custom_objective():
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    def objective_ls(y_true, y_pred):
        grad = (y_pred - y_true)
        hess = np.ones(len(y_true))
        return grad, hess

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor(objective=objective_ls).fit(
            X[train_index], y[train_index]
        )
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        assert mean_squared_error(preds, labels) < 25

    # Test that the custom objective function is actually used
    class XGBCustomObjectiveException(Exception):
        pass

    def dummy_objective(y_true, y_pred):
        raise XGBCustomObjectiveException()

    xgb_model = xgb.XGBRegressor(objective=dummy_objective)
    np.testing.assert_raises(XGBCustomObjectiveException, xgb_model.fit, X, y)
def test_multiclass_classification():
    from sklearn.datasets import load_iris
    from sklearn.model_selection import KFold

    def check_pred(preds, labels, output_margin):
        if output_margin:
            # with output_margin=True, predictions are per-class scores; take argmax
            err = sum(1 for i in range(len(preds))
                      if preds[i].argmax() != labels[i]) / float(len(preds))
        else:
            err = sum(1 for i in range(len(preds))
                      if preds[i] != labels[i]) / float(len(preds))
        assert err < 0.4

    iris = load_iris()
    y = iris['target']
    X = iris['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        # test other params in XGBClassifier().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]

        check_pred(preds, labels, output_margin=False)
        check_pred(preds2, labels, output_margin=True)
        check_pred(preds3, labels, output_margin=True)
        check_pred(preds4, labels, output_margin=False)
def split_data(root_path, num_splits=4):
    mask_list = []
    for ext in ('*.mhd', '*.hdr', '*.nii'):
        mask_list.extend(sorted(glob(join(root_path, 'masks', ext))))

    assert len(mask_list) != 0, 'Unable to find any files in {}'.format(join(root_path, 'masks'))

    outdir = join(root_path, 'split_lists')
    try:
        mkdir(outdir)
    except:
        pass

    kf = KFold(n_splits=num_splits)
    n = 0
    for train_index, test_index in kf.split(mask_list):
        # 'w' with newline='' replaces the Python 2 'wb' mode for csv writers
        with open(join(outdir, 'train_split_' + str(n) + '.csv'), 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for i in train_index:
                writer.writerow([basename(mask_list[i])])
        with open(join(outdir, 'test_split_' + str(n) + '.csv'), 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for i in test_index:
                writer.writerow([basename(mask_list[i])])
        n += 1
def test_boston_housing_regression():
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])

        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]

        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 350
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 350
def cross_validation(train_data, train_labels, k_range=np.arange(1, 16)):
    '''
    Perform 10-fold cross validation to find the best value for k

    Note: Previously this function took knn as an argument instead of
    train_data, train_labels. The intention was for students to take the
    training data from the knn object - this should be clearer from the
    new function signature.
    '''
    folds = 10
    kf = KFold(n_splits=folds)
    best_k = 1
    average_accuracy_for_best_k = 0
    for k in k_range:
        accuracy_sum = 0
        for train_index, test_index in kf.split(train_data):
            X_train, X_test = train_data[train_index], train_data[test_index]
            y_train, y_test = train_labels[train_index], train_labels[test_index]
            knn = KNearestNeighbor(X_train, y_train)
            validation_accuracy = classification_accuracy(knn, k, X_test, y_test)
            accuracy_sum += validation_accuracy
        average_accuracy = accuracy_sum / folds
        if average_accuracy > average_accuracy_for_best_k:
            average_accuracy_for_best_k = average_accuracy
            best_k = k
    return best_k, average_accuracy_for_best_k
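# A short usage sketch for cross_validation() above, assuming train_data and
# train_labels are NumPy arrays and that KNearestNeighbor and
# classification_accuracy are defined as the function expects.
best_k, best_acc = cross_validation(train_data, train_labels,
                                    k_range=np.arange(1, 16))
print('best k = {}, mean validation accuracy = {:.4f}'.format(best_k, best_acc))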
def learn_decision_tree(data_set, label):
    # Create depths
    depths = list(range(1, 14))
    # Initialize the best model
    best_model = [None, 0, float("-inf")]
    # Create 13-fold CV; each fold is paired with one candidate depth
    kf = KFold(n_splits=13)
    track = []
    for (train, test), cdepth in zip(kf.split(data_set), depths):
        # Get training set
        train_set = [data_set[i] for i in train]
        train_label = [label[i] for i in train]
        # Get validation set
        valid_set = [data_set[i] for i in test]
        valid_label = [label[i] for i in test]
        # Learn the decision tree from data
        clf = tree.DecisionTreeClassifier(max_depth=cdepth)
        clf = clf.fit(train_set, train_label)
        # Get accuracy from the model
        accuraclabel = clf.score(valid_set, valid_label)
        # Compare accuracies
        track.append([cdepth, accuraclabel])
        if accuraclabel > best_model[2]:
            # Update the best model
            best_model = [clf, cdepth, accuraclabel]
    # Plot the graph
    fig = plt.figure()
    x = [x[0] for x in track]
    y = [x[1] for x in track]
    plt.xlabel('Depth')
    plt.ylabel('Accuracy')
    plt.title('Decision Tree')
    plt.plot(x, y)
    plt.savefig('decision_tree.png')
    return best_model
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target,
                  nrof_folds=10, distance_metric=0, subtract_mean=False):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
        else:
            mean = 0.0
        dist = distance(embeddings1 - mean, embeddings2 - mean, distance_metric)

        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train) >= far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0

        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
def KFold_method(self):
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(self.FeatureSet):
        X_train = []
        X_test = []
        y_train = []
        y_test = []
        for trainid in train_index.tolist():
            X_train.append(self.FeatureSet[trainid])
            y_train.append(self.Label[trainid])
        for testid in test_index.tolist():
            X_test.append(self.FeatureSet[testid])
            y_test.append(self.Label[testid])
        tree = self.buildtree(X_train)
        #self.post_pruning(tree, 0.3)
        pre_labels = self.predict(X_test, tree)
        # Model evaluation
        ACC = metrics.accuracy_score(y_test, pre_labels)
        # MCC = metrics.matthews_corrcoef(y_test, pre_labels)
        SN = self.performance(y_test, pre_labels)
        # print SP, SN
        print(ACC, SN)
def model_train(self, X_train, y_train, ignore_neutral=False):
    if ignore_neutral:
        X_train = X_train[y_train != 0]
        y_train = y_train[y_train != 0]
    self.ignore_neutral = ignore_neutral

    model = LinearSVC()
    classifier = model.fit(X_train, y_train)
    # pred = classifier.predict(X_train)
    # accu = np.mean(pred == y_train)
    # print('The accuracy of training data is {}'.format(accu))
    # print(confusion_matrix(y_train, pred))

    # k-fold
    kfold = KFold(n_splits=5)
    for i, (train_index, test_index) in enumerate(kfold.split(X_train)):
        X_split_train = X_train[train_index]
        y_split_train = y_train[train_index]
        X_split_valid = X_train[test_index]
        y_split_valid = y_train[test_index]
        classifier = model.fit(X_split_train, y_split_train)
        pred = classifier.predict(X_split_valid)
        accu = np.mean(pred == y_split_valid)
        print('Fold {} : the accuracy of validation data is {}'.format(i + 1, accu))

    return classifier
def Get_KFolds(data, y_label, num_folds, scale):
    # Creates num_folds folds from the train/test set, each with separate training and test sets
    folds = []
    kf = KFold(n_splits=num_folds)
    for train_index, test_index in kf.split(data):
        training = []
        test = []
        tempdf = Normalize_Scale(data, scale)
        train_x = tempdf.drop([y_label], axis=1).values
        train_y = tempdf[y_label].values

        # Creates a training set within the fold
        x = []
        y = []
        for index in train_index:
            x.append(train_x[index])
            y.append(train_y[index])
        training = [x, y]

        # Creates a test set within the fold
        x = []
        y = []
        for index in test_index:
            x.append(train_x[index])
            y.append(train_y[index])
        test = [x, y]

        folds.append([training, test])
    return folds
def computing_cv_accuracy_LDA(in_path=None, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    for idx_train, idx_test in kf.split(y):
        print("---k-FOLD-new-executing--")
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lda.fit(X_cv_train, y_cv_train)
        n_test = len(idx_test)
        sum_u65, sum_u80 = 0, 0
        for i, test in enumerate(X_cv_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    print("--->", mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
def hyperopt_obj(self, param, train_X, train_y):
    # 3-fold cross-validation error
    # ret = xgb.cv(param, dtrain, num_boost_round=param['num_round'])
    kf = KFold(n_splits=3)
    errors = []
    r2 = []
    int_params = ['max_depth', 'num_round']
    for item in int_params:
        param[item] = int(param[item])
    for train_ind, test_ind in kf.split(train_X):
        train_valid_x, train_valid_y = train_X[train_ind], train_y[train_ind]
        test_valid_x, test_valid_y = train_X[test_ind], train_y[test_ind]
        dtrain = xgb.DMatrix(train_valid_x, label=train_valid_y)
        dtest = xgb.DMatrix(test_valid_x)
        pred_model = xgb.train(param, dtrain, num_boost_round=int(param['num_round']))
        pred_test = pred_model.predict(dtest)
        errors.append(mean_squared_error(test_valid_y, pred_test))
        r2.append(r2_score(test_valid_y, pred_test))
    all_dtrain = xgb.DMatrix(train_X, label=train_y)
    print('training score:')
    pred_model = xgb.train(param, all_dtrain, num_boost_round=int(param['num_round']))
    all_dtest = xgb.DMatrix(train_X)
    pred_train = pred_model.predict(all_dtest)
    print(str(r2_score(train_y, pred_train)))
    print(np.mean(r2))
    print('\n')
    return {'loss': np.mean(errors), 'status': STATUS_OK}
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame,
                  nrof_folds=10, distance_metric=0, subtract_mean=False):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))

    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
        else:
            mean = 0.0
        dist = distance(embeddings1 - mean, embeddings2 - mean, distance_metric)

        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy(
                threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(
            thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])

    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy
def computing_cv_accuracy_imprecise(in_path=None, ell_optimal=0.1, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    mean_u65, mean_u80 = 0, 0
    lqa = LinearDiscriminant(init_matlab=True)
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    for idx_train, idx_test in kf.split(y):
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lqa.learn(X_cv_train, y_cv_train, ell=ell_optimal)
        sum_u65, sum_u80 = 0, 0
        n_test, _ = X_cv_test.shape
        for i, test in enumerate(X_cv_test):
            print("--TESTING-----", i, ell_optimal)
            evaluate, _ = lqa.evaluate(test)
            print(evaluate, "-----", y_cv_test[i])
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    mean_u65 = mean_u65 / cv_n_fold
    mean_u80 = mean_u80 / cv_n_fold
    print("--ell-->", ell_optimal, "--->", mean_u65, mean_u80)
# let's normalize the datasets
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X2 = scaler.fit_transform(X1)
X_test1 = scaler.fit_transform(x_test1)

# In[17]:

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

scores = []
rf = RandomForestClassifier(random_state=42, n_estimators=1400, criterion='gini')
# random_state is omitted because it has no effect when shuffle=False
cv = KFold(n_splits=10, shuffle=False)
for train_index, test_index in cv.split(X2):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)
    X_train, X_test, y_train, y_test = X2[train_index], X2[test_index], y[train_index], y[test_index]
    rf.fit(X_train, y_train)
    scores.append(rf.score(X_test, y_test))

# In[18]:

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())
def validate():
    """
    run KFOLD method for regression
    """
    # defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    # cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 630
    y = 631

    # empty dataframe for model validation
    df = pd.DataFrame(columns=['tg', 'lon', 'lat', 'num_year',
                               'num_95pcs', 'corrn', 'rmse'])

    # looping through
    for tg in range(x, y):
        os.chdir(dir_in)
        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        ##########################################
        # check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            return "file already analyzed!"
        os.chdir(dir_in)

        # load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        # add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms] ** 2
        wnd_cbd = pred[wndTerms] ** 3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        # standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat),
                                        columns=dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        # load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        # remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis=0, inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        # adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis=1)

        # merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2],
                              on='date', how='right')
        pred_surge.sort_values(by='date', inplace=True)

        # find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        # in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue

        pred_surge['date'] = pd.DataFrame(list(map(time_stamp,
                                                   pred_surge['date'])),
                                          columns=['date'])

        # prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        # apply PCA, keeping 95% of the variance
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        # apply 10 fold cross validation
        # random_state is dropped because it has no effect without shuffle
        kf = KFold(n_splits=10)

        metric_corr = []
        metric_rmse = []
        # combo = pd.DataFrame(columns=['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            # train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)

            # predictions
            predictions = lm.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)),
            #                       pd.DataFrame(np.array(y_test))], axis=1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis=0)

            # evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

        # number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0] - 1] -
                     pred_surge['date'][0]).days / 365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  # number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)

        print('num_year = ', num_years, ' num_pc = ', num_pc,
              'avg_corr = ', np.mean(metric_corr),
              ' - avg_rmse (m) = ', np.mean(metric_rmse), '\n')

        # original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years,
                               num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year',
                          'num_95pcs', 'corrn', 'rmse']
        df = pd.concat([df, new_df], axis=0)

        # save df as csv - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)

        # cd to dir_in
        os.chdir(dir_in)
result_file = 'knn_report.csv'
output = open(result_file, 'a')

# set the proportion of training data and validation data
kf = KFold(n_splits=10)
# set a variable to record the number of cross-validation rounds performed
rd = 0
print("start")
# set a variable to record the average confusion matrix
cm_avg = np.zeros((n_classes, n_classes))

# train classifier for 10-fold cross-validation
for train, valid in kf.split(X):
    t0 = time()
    rd += 1
    print(rd)
    # split data into training data and validation data
    X_train, X_valid, y_train, y_valid = X[train], X[valid], y[train], y[valid]

    # Perform PCA on data
    n_components = 100
    pca = PCA(n_components=n_components, svd_solver='randomized', whiten=True).fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_valid_pca = pca.transform(X_valid)
    'i': [],
    'gamma': [],
    'j': [],
    'error_MSE_table': [],
    'error_MSE_rel_table': [],
    'error_lambda_1_table': [],
    'error_lambda_2_table': [],
    'lambda_1_estim': [],
    'lambda_2_estim': []
}

for i, gamma in enumerate(gamma_values):
    print('=== {} ==='.format(iteration))
    print('Gamma = ', gamma)

    j = 0
    for train_idx, test_idx in kfold.split(X_u_observed):
        print('Fold :', j)

        X_u_train, X_u_test = X_u_observed[train_idx], X_u_observed[test_idx]
        u_train, u_test = u_observed[train_idx], u_observed[test_idx]
        # u_train = u_train + noise*np.std(u_train)*np.random.randn(u_train.shape[0], u_train.shape[1])
        # reconstructed from a garbled line in the original; applies multiplicative noise
        u_train = u_train * (1 + noise * np.random.randn(u_train.shape[0], u_train.shape[1]))

        model = PhysicsInformedNN(X_u_train, u_train, layers, lb, ub, gamma)
        model.train(0)

        u_pred, f_pred = model.predict(X_u_test)
    'count': train_df['gender'].count(),
    'mad': train_df['gender'].mad()
}

##########################################################
# split into 5 folds for target-encoding feature extraction
enc_stats = ['mean', 'std', 'mad', 'median', 'max', 'min', 'skew', 'count']
skf = KFold(n_splits=5, shuffle=True, random_state=2020)
# /ssd/wa.pkl
for f in tqdm(['ad_id']):
    enc_dict = {}
    for stat in enc_stats:
        enc_dict['{}_target_{}'.format(f, stat)] = stat
        train_df['{}_target_{}'.format(f, stat)] = 0
        test_df['{}_target_{}'.format(f, stat)] = 0
        enc_cols.append('{}_target_{}'.format(f, stat))
    for i, (trn_idx, val_idx) in enumerate(skf.split(train_df, train_df['gender'])):
        trn_x, val_x = train_df.iloc[trn_idx].reset_index(drop=True), train_df.iloc[val_idx].reset_index(drop=True)
        enc_df = trn_x.groupby(f, as_index=False)['gender'].agg(enc_dict)
        val_x = val_x[[f]].merge(enc_df, on=f, how='left')
        test_x = test_df[[f]].merge(enc_df, on=f, how='left')
        for stat in enc_stats:
            val_x['{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            test_x['{}_target_{}'.format(f, stat)] = test_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            # the original is truncated mid-assignment; completed with the matching column
            train_df.loc[val_idx, '{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].values
    identityMatrix[features - 1, features - 1] = 0
    lambdaMulIdentity = identityMatrix * lambdaV
    leftHandSideMatrix = np.add(xTx, lambdaMulIdentity)
    rightHandSideMatrix = np.dot(inputTranspose, outputArray)
    return np.linalg.solve(leftHandSideMatrix, rightHandSideMatrix)


lambdaListFinal = list()
indexForFinalList = 0
# center_standardize_input_data(inputArray)
inputArray = np.c_[inputArray, np.ones(inputArray.shape[0])]
for lambVal in range(0, 110, 10):
    indexForLambdaList = 0
    lambdaErrorValList = list()
    for train_index_tuple, valid_index_tuple in kf.split(inputArray):
        copyInputArray = np.copy(inputArray)
        copyOutputArray = np.copy(outputArray)
        inputListForIndex = list(copyInputArray)
        outputListForIndex = list(copyOutputArray)
        inputlistTrain = list()
        outputListTrain = list()
        inputListValid = list()
        outputListValid = list()
        for index in train_index_tuple:
            inputlistTrain.append(inputListForIndex[index])
            outputListTrain.append(outputListForIndex[index])
        for index in valid_index_tuple:
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini


train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

target = train['target']
train = train.drop(['id', 'target'], axis=1)
test = test.drop(['id'], axis=1)

pca = PCA(n_components=10, random_state=42)
train = pca.fit_transform(train)
test = pca.transform(test)
train = pd.DataFrame(train)
test = pd.DataFrame(test)

lr = LogisticRegression(C=100)
# shuffle=True is required for random_state to take effect in recent scikit-learn
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for i, (train_index, test_index) in enumerate(kf.split(train)):
    X_train, X_valid = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_valid = target.iloc[train_index], target.iloc[test_index]
    lr.fit(X_train, y_train)
    pred = lr.predict_proba(X_valid)[:, 1]
    print(eval_gini(pred, y_valid))
def main():
    train_x_NMBAC = pd.read_csv('train_EDT4000.csv')
    test_x_NMBAC = pd.read_csv('test_EDT4000.csv')
    train_x_NMBAC = np.array(train_x_NMBAC)
    test_x_NMBAC = np.array(test_x_NMBAC)
    train_x_NMBAC = np.delete(train_x_NMBAC, 0, axis=1)
    test_x_NMBAC = np.delete(test_x_NMBAC, 0, axis=1)
    train_x = train_x_NMBAC
    test_x = test_x_NMBAC
    print(train_x.shape)
    print(test_x.shape)

    pro_y = pd.read_csv('train_lable.csv')
    pro_py = pd.read_csv('test_lable.csv')
    pro_y = np.array(pro_y)
    pro_py = np.array(pro_py)
    pro_y = np.delete(pro_y, 0, axis=1)
    pro_py = np.delete(pro_py, 0, axis=1)
    pro_y = pd.DataFrame(pro_y)
    pro_py = pd.DataFrame(pro_py)
    pro_y = pro_y.values.ravel()
    pro_py = pro_py.values.ravel()

    x_all = np.vstack((train_x, test_x))
    x_all = Norm(x_all)
    pro_x = x_all[0:1491, :]
    pro_px = x_all[1491:, :]

    CC = []
    gammas = []
    for i in range(-5, 15, 2):
        CC.append(2 ** i)
    for i in range(3, -15, -2):
        gammas.append(2 ** i)
    param_grid = {"C": CC, "gamma": gammas}
    kf = KFold(n_splits=5, shuffle=True, random_state=123)
    gs = GridSearchCV(SVC(probability=True), param_grid, cv=kf)  # grid search
    gs.fit(pro_x, pro_y)
    print(gs.best_estimator_)
    print(gs.best_score_)
    clf = gs.best_estimator_

    acc = []
    sn = []
    sp = []
    f1 = []
    mcc = []
    for t in range(100):
        print('Running 5-fold CV, round %d......' % t)
        cv = KFold(n_splits=5, shuffle=True)
        probass_y = []
        NBtest_index = []
        pred_y = []
        pro_y1 = []
        for train, test in cv.split(pro_x):  # train/test are index arrays
            x_train, x_test = pro_x[train], pro_x[test]
            y_train, y_test = pro_y[train], pro_y[test]
            NBtest_index.extend(test)
            probas_ = clf.fit(x_train, y_train).predict_proba(x_test)
            y_train_pred = clf.predict(x_test)
            y_train_probas = clf.predict_proba(x_test)
            probass_y.extend(y_train_probas[:, 1])
            pred_y.extend(y_train_pred)
            pro_y1.extend(y_test)
        cm = confusion_matrix(pro_y1, pred_y)
        tn, fp, fn, tp = cm.ravel()
        ACC = (tp + tn) / (tp + tn + fp + fn)
        SN = tp / (tp + fn)
        SP = tn / (tn + fp)
        PR = tp / (tp + fp)
        MCC = (tp * tn - fp * fn) / math.sqrt(
            (tp + fn) * (tp + fp) * (tn + fp) * (tn + fn))
        F1 = (2 * SN * PR) / (SN + PR)
        # print(MCC)
        acc.append(ACC)
        sn.append(SN)
        sp.append(SP)
        f1.append(F1)
        mcc.append(MCC)
    print(len(acc))
    print('meanACC:', np.mean(acc))
    print('meanSN:', np.mean(sn))
    print('meanSP:', np.mean(sp))
    print('meanF1:', np.mean(f1))
    print('meanMCC:', np.mean(mcc))
    print('stdACC:', np.std(acc))
    print('stdSN:', np.std(sn))
    print('stdSP:', np.std(sp))
    print('stdF1:', np.std(f1))
    print('stdMCC:', np.std(mcc))
import pickle
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston

rng = np.random.RandomState(31337)

print("Boston Housing: regression")
boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(mean_squared_error(actuals, predictions))
def main():
    predicted_accuracy = []
    predicted_f1 = []
    num_splits = 4
    for i in range(10):
        print("RUN {}".format(i + 1))
        # Building Phase
        # X, Y, X_train, X_test, y_train, y_test = splitdataset(data)
        data = importdata()
        kf = KFold(n_splits=num_splits)
        split_num = 1
        accuracies = []
        f1s = []
        for training_indices, testing_indices in kf.split(data):
            print("Split {}/{}".format(split_num, num_splits))
            trainset, testset = update_train_test_sets(data, training_indices, testing_indices)
            clf_entropy = train_using_entropy(trainset[:, 1:1868], testset[:, 1:1868], trainset[:, 0])

            # Operational Phase
            print("Results Using Entropy:")
            # Prediction using entropy
            y_pred_entropy = prediction(testset[:, 1:1868], clf_entropy)
            # cal_accuracy(testset[:, 0], y_pred_entropy)
            # report = classification_report(testset[:, 0], y_pred_entropy, output_dict=True)
            # F1 = report["weighted avg"]["f1-score"]
            TN = 0
            TP = 0
            FN = 0
            FP = 0
            # j is used here so the outer run counter i is not shadowed
            for j in range(0, len(testset[:, 0])):
                predicted = y_pred_entropy[j]
                label = testset[j, 0]
                if predicted == label:
                    if predicted == 0 or predicted == 2:
                        TN += 1
                    else:
                        TP += 1
                else:
                    if predicted == 0:
                        if label == 2:
                            TN += 1
                        else:
                            FN += 1
                    elif predicted == 2:
                        if label == 0:
                            TN += 1
                        else:
                            FN += 1
                    elif predicted == 5:
                        if label == 0 or label == 2:
                            FP += 1
                        else:
                            TP += 1
                    elif predicted == 10:
                        if label == 0 or label == 2:
                            FP += 1
                        else:
                            TN += 1
                    elif predicted == 15:
                        if label == 0 or label == 2:
                            FP += 1
                        else:
                            TP += 1
            F1 = 2 * TP / (2 * TP + FP + FN)
            Accuracy = accuracy_score(testset[:, 0], y_pred_entropy) * 100
            accuracies.append(Accuracy)
            f1s.append(F1)
            split_num += 1
        Average_Acc = statistics.mean(accuracies)
        Average_F1 = statistics.mean(f1s)
        print("Average Accuracy: {}".format(round(Average_Acc, 2)))
        print("Average F1: {}\n".format(round(Average_F1, 2)))
        predicted_accuracy.append(Average_Acc)
        predicted_f1.append(Average_F1)
    print("\nPredicted Accuracy: {}".format(round(statistics.mean(predicted_accuracy), 2)))
    print("Predicted F1: {}".format(round(statistics.mean(predicted_f1), 2)))
                        cases, traffic, days, pred_type='cases',
                        model_type='ridge', folds=2, Q=5, K='N/A')

# TRAFFIC ==> CASES
kf = KFold(n_splits=5)
plt.figure(5)
plt.plot(days, cases)
y = []
p = []
for train, test in kf.split(traffic):
    a = 1 / 2 * 10
    model = Ridge(alpha=a).fit(traffic[train], cases[train])
    predictions = model.predict(traffic[test])
    predictions = [round(num[0]) for num in predictions]
    plt.plot(days[test], predictions, c="lime")
    y = y + cases[test].tolist()
    p = p + predictions
evaluate.evaluate_model(pred_type='cases', model_type='Ridge', y=y, y_pred=p)
plt.title("Ridge Model using traffic to predict cases")
plt.xlabel("Days")
plt.ylabel("Cases")
plt.legend(["training cases", "predicted cases"])
def model(features, test_features, encoding='ohe', n_folds=5):
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']
    labels = features['TARGET']
    features = features.drop('SK_ID_CURR', axis=1)
    features = features.drop('TARGET', axis=1)  # df.drop('A', axis=1)
    test_features = test_features.drop('SK_ID_CURR', axis=1)

    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        features, test_features = features.align(test_features, join='inner', axis=1)
        cat_indices = 'auto'
    elif encoding == 'le':
        label_encoder = LabelEncoder()
        cat_indices = []
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                features[col] = label_encoder.fit_transform(
                    np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(
                    np.array(test_features[col].astype(str)).reshape((-1,)))
                cat_indices.append(i)
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    # print('Training Data Shape: ', features.shape)
    # print('Testing Data Shape: ', test_features.shape)

    feature_names = list(features.columns)
    features = np.array(features)
    test_features = np.array(test_features)

    k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=50)
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])
    out_of_fold = np.zeros(features.shape[0])
    valid_scores = []
    train_scores = []

    for train_indices, valid_indices in k_fold.split(features):
        train_features, train_labels = features[train_indices], labels[train_indices]
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        model = lgb.LGBMClassifier(n_estimators=10000, objective='binary',
                                   class_weight='balanced', learning_rate=0.05,
                                   reg_alpha=0.1, reg_lambda=0.1,
                                   subsample=0.8, n_jobs=-1, random_state=50)
        model.fit(train_features, train_labels, eval_metric='auc',
                  eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names=['valid', 'train'],
                  categorical_feature=cat_indices,
                  early_stopping_rounds=10, verbose=200)

        if True:
            fName = 'QmSKSPPLcLJYKaS1gz4VB1jR59VRrLGoYBtu4svxAHeQuA'
            wget('https://ipfs.io/ipfs/' + fName)
            model = joblib.load(fName)
            print(fName)
            # model = joblib.load('lgb.pkl')

        best_iteration = model.best_iteration_
        test_predictions += model.predict_proba(test_features, num_iteration=best_iteration)[:, 1]
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration=best_iteration)[:, 1]

        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        joblib.dump(model, 'lgb.pkl')

        gc.enable()
        del model, train_features, valid_features
        gc.collect()

    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    feature_importances = pd.DataFrame({'feature': feature_names,
                                        'importance': feature_importance_values})

    valid_auc = roc_auc_score(labels, out_of_fold)
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    fold_names = list(range(n_folds))
    fold_names.append('overall')

    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, feature_importances, metrics
# callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=3, min_lr=0.001)
checkpoint_filepath = '/tmp/checkpoint'
model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,
                                            save_weights_only=True,
                                            monitor='val_acc',
                                            mode='max',
                                            save_best_only=True)

# train further
kfold = KFold(n_splits=4)
for i in range(1, 4):
    for train, test in kfold.split(X_train):
        loaded_model.fit([X_train[train], X_sent_train[train]], y_train[train],
                         epochs=1, batch_size=size_batch, verbose=1,
                         validation_data=([X_train[test], X_sent_train[test]], y_train[test]),
                         callbacks=[reduce_lr, model_checkpoint_callback])

# Save model again after training.
model_json = loaded_model.to_json()
with open("model_sentemo.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
loaded_model.save_weights("model_sentemo.h5")
print("Saved model to disk")
for dim in batch_1:
    # create directory
    directory = '{}'.format(dim)
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Cross Validation
    kf = KFold(n_splits=10)
    print(kf.get_n_splits(index_subjects))
    print("number of splits:", kf)
    print("number of features:", dimensions)
    cvscores_mse_test = []
    cvscores_rmse_test = []
    cvscores_mse_train = []
    cvscores_rmse_train = []
    fold = 0
    for train_index, test_index in kf.split(index_subjects):
        fold += 1
        # create directory
        directory = '{}/fold_{}'.format(dim, fold)
        if not os.path.exists(directory):
            os.makedirs(directory)
        print(f"Fold #{fold}")
        print("TRAIN:", index_subjects[train_index], "TEST:", index_subjects[test_index])
        # load training and testing data
        print('Load training data... (view {})'.format(view))
        train_data = np.concatenate(
            [load_data(sub, view) for sub in index_subjects[train_index]])
        print("Shape of the training data:", train_data.shape)
        print('Load test data... (view {})'.format(view))
        test_data = np.concatenate(
algorithms[RANDOM_FOREST_ID] = {"train": random_forest}  # trailing comma removed; it wrapped the dict in a 1-tuple

if __name__ == "__main__":
    data_frame = pandas.read_csv(PREPROCESSED_DATA_FILE)
    data = data_frame[DATA_KEY].to_numpy()
    target = data_frame[TARGET_KEY].to_numpy()
    kFolder = KFold(n_splits=N_FOLDS)
    fold_count = 0
    most_frequent_terms = []
    for train_index, test_index in kFolder.split(data):
        # originally Indonesian: "Memproses Tweet ke {} - {}"
        print("Processing tweets {} - {}".format(
            test_index[0] + 1, test_index[-1] + 1
        ))
        data_train, target_train = data[train_index], target[train_index]
        bow_pipeline = Pipeline([
            ('count_vectorizer', CountVectorizer(min_df=5, max_df=0.7)),
            ('tf_idf_transformer', TfidfTransformer())
        ]).fit(data_train)
        pandas.DataFrame(
            bow_pipeline['count_vectorizer'].stop_words_
        ).sort_values(
np_resampled_y = np.asarray(np.unique(y_resampled.astype(int), return_counts=True))
df_resampled_y = pd.DataFrame(np_resampled_y.T, columns=['Class', 'Sum'])
print("\nNumber of samples after over sampling:\n{0}".format(df_resampled_y))

# initialize the classifier
clf = DecisionTreeClassifier(random_state=args.randomseed)
print("\nClassifier parameters:")
print(clf.get_params())

# cross-validation
rs = KFold(n_splits=args.kfolds, shuffle=True, random_state=args.randomseed)
# generate k-fold train/test indices
resampled_index_set = rs.split(y_resampled)
k_fold_step = 1  # fold counter
# cache each fold's selected test set and corresponding predictions
test_cache = pred_cache = np.array([], dtype=int)  # np.int was removed in NumPy 1.24

# iterate over the k-fold cross-validation splits
for train_index, test_index in resampled_index_set:
    print("\nFold:", k_fold_step)
    clf.fit(x_resampled[train_index], y_resampled[train_index])
    # validate on the test set (drop fake oversampled data via index)
    real_test_index = test_index[test_index < X.shape[0]]
    batch_test_x = x_resampled[real_test_index]
    batch_test_y = y_resampled[real_test_index]
    batch_test_size = len(real_test_index)
    # validate on the test set
    y_pred = clf.predict(batch_test_x)
    # compute test-set ACC
iris = datasets.load_iris() features = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid'] df = pd.DataFrame(iris['data'], columns=features) df['target'] = iris['target'] df['iris'] = df.target.map(lambda x: iris['target_names'][x]) svc = svm.SVC() accuracy = [] f1 = [] precision = [] recall = [] kf = KFold(n_splits=5) for train, test in kf.split(df.target): df_train = df.loc[train, :].copy() df_train.index = range(len(df_train)) df_test = df.loc[test, :].copy() df_test.index = range(len(df_test)) svc.fit(df_train[features], df_train.iris) df_test['prediction'] = svc.predict(df_test[features]) accuracy.append(skmetrics.accuracy_score(df_test.iris, df_test.prediction)) f1.append( skmetrics.f1_score(df_test.iris, df_test.prediction, average='weighted')) precision.append( skmetrics.precision_score(df_test.iris,
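# The manual metric loop above can be reproduced with sklearn's cross_validate
# and multiple scorers -- a sketch assuming the same df/features/svm/KFold
# names already in scope:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(svm.SVC(), df[features], df.iris, cv=KFold(n_splits=5),
                            scoring=['accuracy', 'f1_weighted',
                                     'precision_weighted', 'recall_weighted'])
print(cv_results['test_accuracy'].mean())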
def main(argv=None): np.random.seed(81) word2id, embedding = load_embeddings(fp=os.path.join( FLAGS.dir, "glove.6B." + str(FLAGS.embedding_size) + "d.txt"), embedding_size=FLAGS.embedding_size) with open(os.path.join(FLAGS.dir, 'word2id.json'), 'w') as fout: json.dump(word2id, fp=fout) # vocab_size = embedding.shape[0] # embedding_size = embedding.shape[1] ids, post_texts, truth_classes, post_text_lens, truth_means, target_descriptions, target_description_lens, image_features = read_data( word2id=word2id, fps=[ os.path.join(FLAGS.dir, FLAGS.training_file), os.path.join(FLAGS.dir, FLAGS.validation_file) ], y_len=FLAGS.y_len, use_target_description=FLAGS.use_target_description, use_image=FLAGS.use_image) post_texts = np.array(post_texts) truth_classes = np.array(truth_classes) post_text_lens = np.array(post_text_lens) truth_means = np.array(truth_means) shuffle_indices = np.random.permutation(np.arange(len(post_texts))) post_texts = post_texts[shuffle_indices] truth_classes = truth_classes[shuffle_indices] post_text_lens = post_text_lens[shuffle_indices] truth_means = truth_means[shuffle_indices] max_post_text_len = max(post_text_lens) print max_post_text_len post_texts = pad_sequences(post_texts, max_post_text_len) target_descriptions = np.array(target_descriptions) target_description_lens = np.array(target_description_lens) target_descriptions = target_descriptions[shuffle_indices] target_description_lens = target_description_lens[shuffle_indices] max_target_description_len = max(target_description_lens) print max_target_description_len target_descriptions = pad_sequences(target_descriptions, max_target_description_len) image_features = np.array(image_features) data = np.array( list( zip(post_texts, truth_classes, post_text_lens, truth_means, target_descriptions, target_description_lens, image_features))) kf = KFold(n_splits=5) round = 1 val_scores = [] val_accs = [] for train, validation in kf.split(data): train_data, validation_data = data[train], data[validation] g = tf.Graph() with g.as_default() as g: tf.set_random_seed(81) with tf.Session(graph=g) as sess: if FLAGS.model == "DAN": model = DAN(x1_maxlen=max_post_text_len, y_len=len(truth_classes[0]), x2_maxlen=max_target_description_len, embedding=embedding, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, hidden_size=FLAGS.hidden_size, state_size=FLAGS.state_size, x3_size=len(image_features[0])) if FLAGS.model == "CNN": model = CNN(x1_maxlen=max_post_text_len, y_len=len(truth_classes[0]), x2_maxlen=max_target_description_len, embedding=embedding, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, hidden_size=FLAGS.hidden_size, state_size=FLAGS.state_size, x3_size=len(image_features[0])) if FLAGS.model == "BiRNN": model = BiRNN(x1_maxlen=max_post_text_len, y_len=len(truth_classes[0]), x2_maxlen=max_target_description_len, embedding=embedding, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, hidden_size=FLAGS.hidden_size, state_size=FLAGS.state_size, x3_size=len(image_features[0])) if FLAGS.model == "SAN": model = SAN(x1_maxlen=max_post_text_len, y_len=len(truth_classes[0]), x2_maxlen=max_target_description_len, embedding=embedding, filter_sizes=list( map(int, FLAGS.filter_sizes.split(","))), num_filters=FLAGS.num_filters, hidden_size=FLAGS.hidden_size, state_size=FLAGS.state_size, x3_size=len(image_features[0]), attention_size=2 * FLAGS.state_size) global_step = tf.Variable(0, name="global_step", 
trainable=False) optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate) grads_and_vars = optimizer.compute_gradients(model.loss) if FLAGS.gradient_clipping_value: grads_and_vars = [ (tf.clip_by_value(grad, -FLAGS.gradient_clipping_value, FLAGS.gradient_clipping_value), var) for grad, var in grads_and_vars ] train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) out_dir = os.path.join(FLAGS.dir, "runs", FLAGS.timestamp) # loss_summary = tf.summary.scalar("loss", model.loss) # acc_summary = tf.summary.scalar("accuracy", model.accuracy) # train_summary_op = tf.summary.merge([loss_summary, acc_summary]) # train_summary_dir = os.path.join(out_dir, "summaries", "train") # train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) # val_summary_op = tf.summary.merge([loss_summary, acc_summary]) # val_summary_dir = os.path.join(out_dir, "summaries", "validation") # val_summary_writer = tf.summary.FileWriter(val_summary_dir, sess.graph) checkpoint_dir = os.path.join(out_dir, "checkpoints") checkpoint_prefix = os.path.join(checkpoint_dir, FLAGS.model + str(round)) if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) def train_step(input_x1, input_y, input_x1_len, input_z, input_x2, input_x2_len, input_x3): feed_dict = { model.input_x1: input_x1, model.input_y: input_y, model.input_x1_len: input_x1_len, model.input_z: input_z, model.dropout_rate_hidden: FLAGS.dropout_rate_hidden, model.dropout_rate_cell: FLAGS.dropout_rate_cell, model.dropout_rate_embedding: FLAGS.dropout_rate_embedding, model.batch_size: len(input_x1), model.input_x2: input_x2, model.input_x2_len: input_x2_len, model.input_x3: input_x3 } _, step, loss, mse, accuracy = sess.run([ train_op, global_step, model.loss, model.mse, model.accuracy ], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, mse {:g}, acc {:g}".format( time_str, step, loss, mse, accuracy)) # train_summary_writer.add_summary(summaries, step) def validation_step(input_x1, input_y, input_x1_len, input_z, input_x2, input_x2_len, input_x3, writer=None): feed_dict = { model.input_x1: input_x1, model.input_y: input_y, model.input_x1_len: input_x1_len, model.input_z: input_z, model.dropout_rate_hidden: 0, model.dropout_rate_cell: 0, model.dropout_rate_embedding: 0, model.batch_size: len(input_x1), model.input_x2: input_x2, model.input_x2_len: input_x2_len, model.input_x3: input_x3 } step, loss, mse, accuracy = sess.run( [global_step, model.loss, model.mse, model.accuracy], feed_dict) time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, mse {:g}, acc {:g}".format( time_str, step, loss, mse, accuracy)) # if writer: # writer.add_summary(summaries, step) return mse, accuracy print("\nValidation: ") post_text_val, truth_class_val, post_text_len_val, truth_mean_val, target_description_val, target_description_len_val, image_feature_val = zip( *validation_data) validation_step(post_text_val, truth_class_val, post_text_len_val, truth_mean_val, target_description_val, target_description_len_val, image_feature_val) print("\n") min_mse_val = np.inf acc = np.inf for i in range(FLAGS.epochs): batches = get_batch(train_data, FLAGS.batch_size) for batch in batches: post_text_batch, truth_class_batch, post_text_len_batch, truth_mean_batch, target_description_batch, target_description_len_batch, image_feature_batch = zip( *batch) train_step(post_text_batch, truth_class_batch, post_text_len_batch, 
truth_mean_batch, target_description_batch, target_description_len_batch, image_feature_batch) print("\nValidation: ") mse_val, acc_val = validation_step( post_text_val, truth_class_val, post_text_len_val, truth_mean_val, target_description_val, target_description_len_val, image_feature_val) print("\n") if mse_val < min_mse_val: min_mse_val = mse_val acc = acc_val # saver.save(sess, checkpoint_prefix) round += 1 val_scores.append(min_mse_val) val_accs.append(acc) print np.mean(val_scores) print np.mean(val_accs)
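# Zipping all the arrays into one np.array (as above) forces dtype=object; an
# alternative sketch, reusing the same variable names, splits index arrays with
# KFold and slices each feature array per fold instead:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
for train_idx, val_idx in kf.split(post_texts):
    x1_tr, x1_val = post_texts[train_idx], post_texts[val_idx]
    y_tr, y_val = truth_classes[train_idx], truth_classes[val_idx]
    len_tr, len_val = post_text_lens[train_idx], post_text_lens[val_idx]
    # ...and likewise for truth_means, target_descriptions, image_features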
# - # #### Model building and validation # Build the model with LightGBM. # + printTime('Start building the model') va_pred_list = [] va_weight_list = [] pred_list = [] # split the training data into training and validation sets kf = KFold(n_splits=4, shuffle=True, random_state=71) for tr_idx, va_idx in kf.split(train_x): tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] # 2020/05/30 Target encoding Start # 2020/05/30 Target encoding End # convert the features and target into LightGBM data structures lgb_train = lgb.Dataset(tr_x, tr_y) lgb_eval = lgb.Dataset(va_x, va_y) # set the hyperparameters params = { 'boosting_type': 'gbdt', 'objective': 'regression_l2',
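# A minimal sketch of the training call that typically follows the Dataset setup
# above; num_boost_round and the early-stopping value are illustrative, not taken
# from the original notebook (the callback API assumes LightGBM >= 3.3):
model = lgb.train(params,
                  lgb_train,
                  num_boost_round=1000,
                  valid_sets=[lgb_train, lgb_eval],
                  callbacks=[lgb.early_stopping(stopping_rounds=50)])
va_pred_list.append(model.predict(va_x, num_iteration=model.best_iteration))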
def classifier(model, emb_mean, emb_std, embeddings_index): train = pd.read_csv('./input/TIL_NLP_train1_dataset.csv') test = pd.read_csv('./input/TIL_NLP_unseen_dataset.csv') print('running classifier') max_features = 4248 print(max_features) maxlen = 200 embed_size = 100 train = shuffle(train) X_train = train["word_representation"].fillna("fillna").values y_train = train[[ "outwear", "top", "trousers", "women dresses", "women skirts" ]].values X_test = test["word_representation"].fillna("fillna").values y_test = test[[ "outwear", "top", "trousers", "women dresses", "women skirts" ]].values y_test = y_test.tolist() print('preprocessing start') tokenizer = text.Tokenizer(num_words=max_features) tokenizer.fit_on_texts(list(X_train) + list(X_test)) X_train = tokenizer.texts_to_sequences(X_train) X_test = tokenizer.texts_to_sequences(X_test) x_train = sequence.pad_sequences(X_train, maxlen=maxlen) x_test = sequence.pad_sequences(X_test, maxlen=maxlen) del X_train, X_test, train, test gc.collect() word_index = tokenizer.word_index nb_words = min(max_features, len(word_index)) embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) for word, i in word_index.items(): if i >= max_features: continue embedding_vector = embeddings_index.get(word) if embedding_vector is not None: embedding_matrix[i - 1] = embedding_vector print('preprocessing done') # session_conf = tf.ConfigProto(intra_op_parallelism_threads=4, inter_op_parallelism_threads=4) # K.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf)) #model #wrote out all the blocks instead of looping for simplicity filter_nr = 64 filter_size = 3 max_pool_size = 3 max_pool_strides = 2 dense_nr = 256 spatial_dropout = 0.2 dense_dropout = 0.5 train_embed = False conv_kern_reg = regularizers.l2(0.00001) conv_bias_reg = regularizers.l2(0.00001) comment = Input(shape=(maxlen, )) emb_comment = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=train_embed)(comment) block1 = Bidirectional(LSTM(embed_size))(emb_comment) block1 = Dense(embed_size, activation='linear')(block1) output = Dense(5, activation='sigmoid')(block1) """ emb_comment = SpatialDropout1D(spatial_dropout)(emb_comment) block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment) block1 = BatchNormalization()(block1) block1 = PReLU()(block1) block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block1) block1 = BatchNormalization()(block1) block1 = PReLU()(block1) #we pass embedded comment through conv1d with filter size 1 because it needs to have the same shape as block output #if you choose filter_nr = embed_size (300 in this case) you don't have to do this part and can add emb_comment directly to block1_output resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment) resize_emb = PReLU()(resize_emb) block1_output = add([block1, resize_emb]) block1_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block1_output) block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block1_output) block2 = BatchNormalization()(block2) block2 = PReLU()(block2) block2 = Conv1D(filter_nr, kernel_size=filter_size, 
padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block2) block2 = BatchNormalization()(block2) block2 = PReLU()(block2) block2_output = add([block2, block1_output]) block2_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block2_output) block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block2_output) block3 = BatchNormalization()(block3) block3 = PReLU()(block3) block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block3) block3 = BatchNormalization()(block3) block3 = PReLU()(block3) block3_output = add([block3, block2_output]) block3_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block3_output) block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block3_output) block4 = BatchNormalization()(block4) block4 = PReLU()(block4) block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block4) block4 = BatchNormalization()(block4) block4 = PReLU()(block4) block4_output = add([block4, block3_output]) block4_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block4_output) block5 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block4_output) block5 = BatchNormalization()(block5) block5 = PReLU()(block5) block5 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block5) block5 = BatchNormalization()(block5) block5 = PReLU()(block5) block5_output = add([block5, block4_output]) block5_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block5_output) block6 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block5_output) block6 = BatchNormalization()(block6) block6 = PReLU()(block6) block6 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block6) block6 = BatchNormalization()(block6) block6 = PReLU()(block6) block6_output = add([block6, block5_output]) block6_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block6_output) block7 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block6_output) block7 = BatchNormalization()(block7) block7 = PReLU()(block7) block7 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block7) block7 = BatchNormalization()(block7) block7 = PReLU()(block7) block7_output = add([block7, block6_output]) output = GlobalMaxPooling1D()(block7_output) output = Dense(dense_nr, activation='linear')(output) output = BatchNormalization()(output) output = PReLU()(output) output = Dropout(dense_dropout)(output) output = Dense(5, activation='sigmoid')(output) """ #model = Model(comment, output) # print("Correct model: ", type(model)) 
model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(), metrics=['accuracy']) num_folds = 5 num = 0 kfold = KFold(n_splits=num_folds, shuffle=True) for train, test in kfold.split(x_train, y_train): print("Training Fold number: ", num) batch_size = 128 epochs = 20 lr = callbacks.LearningRateScheduler(schedule) ra_val = RocAucEvaluation(validation_data=(x_train[test], y_train[test]), interval=1) es = EarlyStopping(monitor='val_loss', verbose=1, patience=5, restore_best_weights=True, mode='min') mc = ModelCheckpoint('best_model_rnn.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True) model.fit(x_train[train], y_train[train], batch_size=batch_size, epochs=epochs, validation_data=(x_train[test], y_train[test]), callbacks=[lr, ra_val, es, mc], verbose=1) num += 1 y_pred = model.predict(x_test) y_pred = [[1 if i > 0.5 else 0 for i in r] for r in y_pred] accuracy = sum([y_pred[i] == y_test[i] for i in range(len(y_pred))]) / len(y_pred) * 100 print([y_pred[i] == y_test[i] for i in range(len(y_pred))]) print(accuracy, "%") print(f1(y_pred, y_test)) """ submission = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv') submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred submission.to_csv('dpcnn_test_preds.csv', index=False) """ return model
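# The threshold-and-compare block above can also be written with numpy and
# sklearn's multilabel metrics -- a sketch assuming probability outputs from
# model.predict and a binary 0/1 y_test of the same shape:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

probs = model.predict(x_test)
y_bin = (probs > 0.5).astype(int)
y_true = np.asarray(y_test)
print('exact-match accuracy:', accuracy_score(y_true, y_bin))  # subset accuracy
print('micro F1:', f1_score(y_true, y_bin, average='micro'))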
def main(_): print('METHOD:', FLAGS.method) print('Norm factor:', FLAGS.norm_factor) DEBUG = FLAGS.debug idir = FLAGS.idir if not DEBUG: FLAGS.infer = True FLAGS.num_folds = 1 #FLAGS.num_grids = 10 # first id, second content .. idx = 2 valid_files = glob.glob(f'{idir}/*.valid.csv') valid_files = [x for x in valid_files if not 'ensemble' in x] if not DEBUG: print('VALID then INFER') infer_files = glob.glob(f'{idir}/*.infer.csv.debug') else: print('Debug mode INFER will write result using valid ids, just for test') infer_files = glob.glob(f'{idir}/*.valid.csv') infer_files = [x for x in infer_files if not 'ensemble' in x] print('num_ensembles', len(valid_files), 'num_infers', len(infer_files)) assert len(valid_files) == len(infer_files), infer_files global num_ensembles num_ensembles = len(valid_files) # need global ? even only read? global class_weights #print('-----------', class_weights) print('loading all valid csv') dfs = [] for file_ in tqdm(valid_files, ascii=True): df = pd.read_csv(file_) df = df.sort_values('id') dfs.append(df) if FLAGS.num_folds > 1: kf = KFold(n_splits=FLAGS.num_folds, shuffle=True, random_state=FLAGS.seed) dataset = kf.split(dfs[0]) else: ids = dfs[0]['id'].values dataset = [(ids, ids)] logits_f1_list = [] logits_adjusted_f1_list = [] probs_f1_list = [] probs_adjusted_f1_list = [] grids_logits_adjusted_f1_list = [] logits_predict_list = [] logits_adjusted_predict_list = [] probs_predict_list = [] probs_adjusted_predict_list = [] grids_logits_adjusted_predict_list = [] labels_list = [] results_list = [] def split_train_valid(x): if FLAGS.num_folds == 1: return x, x else: total = 15000 assert total % FLAGS.num_folds == 0 num_valid = int(total / FLAGS.num_folds) num_train = total - num_valid return x[:num_train], x[num_train:] for fold, (train_index, valid_index) in enumerate(dataset): print('FOLD_%s---------------------------' % fold) print('train:', train_index, 'valid:', valid_index) class_factors = np.ones([num_attrs, num_classes]) class_weights = ori_class_weights # logits sum results results = None # prob sum results results2 = None weights = [] scores_list = [] for fid, df in enumerate(dfs): file_ = valid_files[fid] train = df.iloc[train_index] valid = df.iloc[valid_index] #if fid == 0: train_labels = train.iloc[:, idx:idx+num_attrs].values valid_labels = valid.iloc[:, idx:idx+num_attrs].values labels = np.concatenate([train_labels, valid_labels], 0) train_predicts = train.iloc[:, idx+num_attrs:idx+2*num_attrs].values valid_predicts = valid.iloc[:, idx+num_attrs:idx+2*num_attrs].values predicts = np.concatenate([train_predicts, valid_predicts], 0) train_scores = train['score'] valid_scores = valid['score'] scores = np.concatenate([train_scores, valid_scores], 0) scores = [parse(score) for score in scores] scores = np.array(scores) scores_list.append(scores) train_labels, valid_labels = split_train_valid(labels) train_predicts, valid_predicts = split_train_valid(predicts) train_scores, valid_scores = split_train_valid(scores) f1s = calc_f1s(train_labels, train_predicts) f1s_adjusted = calc_f1s(train_labels, to_predict(train_scores, is_single=True)) train_probs = gezi.softmax(train_scores.reshape([-1, NUM_ATTRIBUTES, NUM_CLASSES])) aucs = calc_aucs(train_labels + 2, train_probs) losses = calc_losses(train_labels + 2, train_probs) f1 = np.mean(f1s) f1_adjusted = np.mean(f1s_adjusted) print('%-3d' % fid, '%-100s' % file_, '%.5f' % f1, '%.5f' % f1_adjusted, '%.5f' % np.mean(aucs), '%.5f' % np.mean(losses)) if FLAGS.weight_by == 'loss': weight = np.reshape(1 /
losses, [num_attrs, 1]) elif FLAGS.weight_by == 'auc': weight = np.reshape(aucs, [num_attrs, 1]) else: weight = np.reshape(f1s_adjusted, [num_attrs, 1]) weights.append(weight) weights = np.array(weights) scores_list = np.array(scores_list) weights = blend(weights, FLAGS.norm_factor) sum_weights = np.sum(weights, 0) # print('weights\n', weights) # print('sum_weights\n', sum_weights) # if DEBUG: # print(weights) print('-----------calc weight and score') for fid in tqdm(range(len(valid_files)), ascii=True): scores = scores_list[fid] if results is None: results = np.zeros([len(scores), num_attrs * num_classes]) results2 = np.zeros([len(scores), num_attrs * num_classes]) weight = weights[fid] #print(fid, valid_files[fid], '\n', ['%.5f' % x for x in np.reshape(weight, [-1])]) if FLAGS.method == 'avg' or FLAGS.method == 'mean': weight = 1. for i, score in enumerate(scores): score = np.reshape(score, [num_attrs, num_classes]) * weight score = np.reshape(score, [-1]) results[i] += score # notice softmax([1,2]) = [0.26894142, 0.73105858] softmax([2,4]) = [0.11920292, 0.88079708] score = np.reshape(score, [num_attrs, num_classes]) # this not work because *weight already.. #score *= FLAGS.logits_factor score = gezi.softmax(score, -1) #score *= class_weights score = np.reshape(score, [-1]) results2[i] += score train_results, valid_results = split_train_valid(results) train_results2, valid_results2 = split_train_valid(results2) print('-----------using prob ensemble') adjusted_predict_prob = to_predict(valid_results2, sum_weights, adjust=False) adjusted_f1_prob = calc_f1(valid_labels, adjusted_predict_prob) valid_results2 = np.reshape(valid_results2, [-1, num_attrs, num_classes]) predicts2 = np.argmax(valid_results2, -1) - 2 f1_prob = calc_f1(valid_labels, predicts2) probs_f1_list.append(f1_prob) probs_adjusted_f1_list.append(adjusted_f1_prob) probs_predict_list.append(predicts2) probs_adjusted_predict_list.append(adjusted_predict_prob) print('%-40s' % 'f1_prob:', '%.5f' % f1_prob) print('%-40s' % 'adjusted f1_prob:', '%.5f' % adjusted_f1_prob) # print('-----------detailed f1 infos (ensemble by prob)') # _, adjusted_f1_probs, class_f1s = calc_f1_alls(valid_labels, to_predict(results2[num_train:], sum_weights, adjust=False)) # for i, attr in enumerate(ATTRIBUTES): # print(attr, adjusted_f1_probs[i]) # for i, cls in enumerate(CLASSES): # print(cls, class_f1s[i]) print('-----------using logits ensemble') adjusted_predict = to_predict(valid_results, sum_weights) adjusted_f1 = calc_f1(valid_labels, adjusted_predict) valid_results = np.reshape(valid_results, [-1, num_attrs, num_classes]) predicts = np.argmax(valid_results, -1) - 2 f1 = calc_f1(valid_labels, predicts) logits_f1_list.append(f1) logits_adjusted_f1_list.append(adjusted_f1) logits_predict_list.append(predicts) logits_adjusted_predict_list.append(adjusted_predict) results_list.append(valid_results) labels_list.append(valid_labels) print('%-40s' % 'f1:', '%.5f' % f1) print('%-40s' % 'adjusted f1:', '%.5f' % adjusted_f1) if FLAGS.show_detail: print('-----------detailed f1 infos (ensemble by logits)') _, adjusted_f1s, class_f1s = calc_f1_alls(valid_labels, to_predict(valid_results, sum_weights)) for i, attr in enumerate(ATTRIBUTES): print('%-40s' % attr, '%.5f' % adjusted_f1s[i]) for i, cls in enumerate(CLASSES): print('%-40s' % cls, '%.5f' % class_f1s[i]) print('%-40s' % 'f1:', '%.5f' % f1) print('%-40s' % 'f1 prob:', '%.5f' % f1_prob) print('%-40s' % 'adjusted f1 prob:', '%.5f' % adjusted_f1_prob) print('%-40s' % 'adjusted f1:', '%.5f' % adjusted_f1) 
if FLAGS.num_grids: print('------------grid search num_grids', FLAGS.num_grids) class_factors = grid_search_class_factors(gezi.softmax(np.reshape(train_results, [-1, num_attrs, num_classes]) * (FLAGS.logits_factor / sum_weights)), train_labels, class_weights, num_grids=FLAGS.num_grids) if FLAGS.show_detail: print('class_factors1 with num_grids', FLAGS.num_grids) print(class_factors) # adjust class weights to get better result from grid search class_weights = class_weights * class_factors adjusted_f1_before_grids = adjusted_f1 print('after dynamic adjust class factors') adjusted_predict = to_predict(valid_results, sum_weights) adjusted_f1 = calc_f1(valid_labels, adjusted_predict) valid_results = np.reshape(valid_results, [-1, num_attrs, num_classes]) grids_logits_adjusted_f1_list.append(adjusted_f1) grids_logits_adjusted_predict_list.append(adjusted_predict) print('-----------using logits ensemble') print('%-40s' % 'adjusted f1 before grids:', '%.5f' % adjusted_f1_before_grids) print('%-40s' % 'adjusted f1:', '%.5f' % adjusted_f1) if FLAGS.show_detail: print('-----------detailed f1 infos (ensemble by logits)') _, adjusted_f1s, class_f1s = calc_f1_alls(valid_labels, to_predict(valid_results, sum_weights)) for i, attr in enumerate(ATTRIBUTES): print('%-40s' % attr, '%.5f' % adjusted_f1s[i]) for i, cls in enumerate(CLASSES): print('%-40s' % cls, '%.5f' % class_f1s[i]) print('%-40s' % 'adjusted f1 before grids:', '%.5f' % adjusted_f1_before_grids) print('%-40s' % 'adjusted f1:', '%.5f' % adjusted_f1) # print('-------------------------------------OVERALL mean') # print('ensemble by probs') # print('%-40s' % 'f1', '%.5f' % np.mean(probs_f1_list)) # print('%-40s' % 'adjusted f1', '%.5f' % np.mean(probs_adjusted_f1_list)) # print('ensemble by logits') # print('%-40s' % 'f1:', '%.5f' % np.mean(logits_f1_list)) # print('%-40s' % 'adjusted f1:', '%.5f' % np.mean(logits_adjusted_f1_list)) # if FLAGS.num_grids: # print('ensemble by logits after grid search') # print('%-40s' % 'adjusted f1', '%.5f' % np.mean(grids_logits_adjusted_f1_list)) print('-------------------------------------OVERALL recalc') labels = np.concatenate(labels_list, 0) print('ensemble by probs') print('%-40s' % 'f1', '%.5f' % calc_f1(labels, np.concatenate(probs_predict_list, 0))) print('%-40s' % 'adjusted f1', '%.5f' % calc_f1(labels, np.concatenate(probs_adjusted_predict_list, 0))) print('ensemble by logits') predicts = np.concatenate(logits_predict_list, 0) print('%-40s' % 'f1:', '%.5f' % calc_f1(labels, predicts)) adjusted_predicts = np.concatenate(logits_adjusted_predict_list, 0) print('%-40s' % 'adjusted f1:', '%.5f' % calc_f1(labels, adjusted_predicts)) if FLAGS.num_grids: print('ensemble by logits after grid search') grids_predicts = np.concatenate(grids_logits_adjusted_predict_list, 0) print('%-40s' % 'adjusted f1 after grid search', '%.5f' % calc_f1(labels, grids_predicts)) _, adjusted_f1s, class_f1s = calc_f1_alls(labels, adjusted_predicts) for i, attr in enumerate(ATTRIBUTES): print('%-40s' % attr, '%.5f' % adjusted_f1s[i]) for i, cls in enumerate(CLASSES): print('%-40s' % cls, '%.5f' % class_f1s[i]) print('%-40s' % 'f1', '%.5f' % calc_f1(labels, predicts)) print('%-40s' % 'adjusted f1', '%.5f' % calc_f1(labels, adjusted_predicts)) if FLAGS.num_grids: print('%-40s' % 'adjusted f1 after grid search', '%.5f' % calc_f1(labels, grids_predicts)) results = np.concatenate(results_list, 0) results = results.reshape([-1, NUM_ATTRIBUTES, NUM_CLASSES]) #factor = FLAGS.logits_factor / sum_weights #print('%-40s' % '* factor loss', '%.5f' % calc_loss(labels, gezi.softmax(results * factor))) ## directly do softmax on results since sum weights is 1 loss = calc_loss(labels, gezi.softmax(results)) print('%-40s' % 'loss', '%.5f' % loss) print('f1:class predictions distribution') counts = get_distribution(predicts) for attr, count in zip(ATTRIBUTES, counts): print('%-40s' % attr, ['%.5f' % (x / len(predicts)) for x in count]) #print_confusion_matrix(labels, predicts) print('adjusted f1:class predictions distribution') counts = get_distribution(adjusted_predicts) for attr, count in zip(ATTRIBUTES, counts): print('%-40s' % attr, ['%.5f' % (x / len(predicts)) for x in count]) #print_confusion_matrix(labels, adjusted_predicts) if FLAGS.num_grids: print('adjusted f1:class predictions distribution after grid search') counts = get_distribution(grids_predicts) for attr, count in zip(ATTRIBUTES, counts): print('%-40s' % attr, ['%.5f' % (x / len(grids_predicts)) for x in count]) #print_confusion_matrix(labels, grids_predicts) DEBUG = FLAGS.debug if FLAGS.infer: print('------------infer') ofile = os.path.join(idir, 'ensemble.infer.csv') file_ = gezi.strip_suffix(file_, '.debug') df = pd.read_csv(file_) idx = 2 results = None results2 = None for fid, file_ in enumerate(infer_files): df = pd.read_csv(file_) df = df.sort_values('id') print(fid, file_, len(df)) if not FLAGS.debug: assert len(df) == 200000 if results is None: results = np.zeros([len(df), num_attrs * num_classes]) results2 = np.zeros([len(df), num_attrs * num_classes]) scores = df['score'] scores = [parse(score) for score in scores] scores = np.array(scores) weight = weights[fid] if FLAGS.method == 'avg' or FLAGS.method == 'mean': weight = 1. for i, score in enumerate(scores): score = np.reshape(np.reshape(score, [num_attrs, num_classes]) * weight, [-1]) results[i] += score score = gezi.softmax(np.reshape(score, [num_attrs, num_classes]), -1) score = np.reshape(score, [-1]) results2[i] += score #predicts = to_predict(results2, sum_weights) predicts = to_predict(results, sum_weights) counts = get_distribution(predicts) for attr, count in zip(ATTRIBUTES, counts): print('%-40s' % attr, ['%.5f' % (x / len(predicts)) for x in count]) if not DEBUG: columns = df.columns[idx:idx + num_attrs].values else: columns = df.columns[idx + num_attrs:idx + 2 * num_attrs].values if not DEBUG: ofile = os.path.join(idir, 'ensemble.infer.csv') else: ofile = os.path.join(idir, 'ensemble.valid.csv') if not DEBUG: file_ = gezi.strip_suffix(file_, '.debug') print('temp csv used for write', file_) df = pd.read_csv(file_) else: print('debug test using file', valid_files[-1]) df = pd.read_csv(valid_files[-1]) # for safety, must sort by id df = df.sort_values('id') # TODO better ? not using loop ? for i, column in enumerate(columns): df[column] = predicts[:, i] if DEBUG: print('check blend result', calc_f1(df.iloc[:, idx:idx + num_attrs].values, predicts)) print(f'adjusted f1_prob:[{adjusted_f1_prob}]') print(f'adjusted f1:[{adjusted_f1}]') print(f'loss:[{loss}]') print('out:', ofile) if not DEBUG: df.to_csv(ofile, index=False, encoding="utf_8_sig") print('---------------results', results.shape) df['score'] = [x for x in results] if not DEBUG: ofile = os.path.join(idir, 'ensemble.infer.debug.csv') else: ofile = os.path.join(idir, 'ensemble.valid.csv') print('out debug:', ofile) df.to_csv(ofile, index=False, encoding="utf_8_sig")
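# The core ensembling above -- a weighted sum of per-model logits, softmax over
# the class axis, then argmax with the -2 label offset -- in isolation, as a
# numpy-only sketch with made-up shapes:
import numpy as np
from scipy.special import softmax

logits = np.random.randn(3, 100, 4, 5)           # (models, samples, attrs, classes)
weights = np.array([0.5, 0.3, 0.2])              # per-model blend weights, sum to 1
blended = np.tensordot(weights, logits, axes=1)  # -> (samples, attrs, classes)
probs = softmax(blended, axis=-1)
predicts = np.argmax(probs, axis=-1) - 2         # classes encoded as -2..2 above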
from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.neural_network import MLPClassifier classifier = DecisionTreeClassifier() model = classifier.fit(X_train,y_train) # TODO: Make predictions on the test data y_pred = model.predict(X_test) # TODO: Calculate the accuracy and assign it to the variable acc on the test data. from sklearn.metrics import accuracy_score,r2_score,median_absolute_error acc = accuracy_score(y_test, y_pred) yP = model.predict(X_test) score_r2 = r2_score(y_test, yP) score_MedAE = median_absolute_error(y_test, yP) # do the same thing via sklearn from sklearn.model_selection import KFold, cross_val_score svc = SVC(C=1, kernel='linear') svc.fit(X_train,y_train).score(X_train,y_train) k_fold = KFold(n_splits=3) [svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test]) for train, test in k_fold.split(X_digits)] # even simplify to one line cross_val_score(svc, X_digits, y_digits, cv=k_fold, n_jobs=-1) # n_jobs=-1 means that the computation will be dispatched on all the CPUs of the computer.
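# X_digits / y_digits above are assumed to be defined earlier in the script; for
# a self-contained run they could come from sklearn's digits dataset:
from sklearn.datasets import load_digits
X_digits, y_digits = load_digits(return_X_y=True)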
def main(): ######################################################### # Test my decision tree classifier ######################################################### classifier = decisionTreeClassifier() uciCarEvaluationDataObject = UciCarEvaluation() data = uciCarEvaluationDataObject.data labels = uciCarEvaluationDataObject.targets label_names = uciCarEvaluationDataObject.target_names model = classifier.fit(data, labels) accuracy_list = [] kf = KFold(n_splits=10) for train_index, test_index in kf.split(data, labels): # Build the data/target lists training_data = data.iloc[train_index] training_labels = labels.iloc[train_index] testing_data = data.iloc[test_index] testing_labels = labels.iloc[test_index] # Build the model model = classifier.fit(training_data, training_labels) # # Predict predicted_classes = model.predict(testing_data) accuracy_list.append(accuracy_score(testing_labels, predicted_classes)) print "The custom decision tree predicted the auto dataset's classes with an average of", print sum(accuracy_list) / float(len(accuracy_list)) * 100, print "percent accuracy." ######################################################### # Compare the SK-learn decision tree classifier ######################################################### classifier = tree.DecisionTreeClassifier() model = classifier.fit(data, labels) accuracy_list = [] kf = KFold(n_splits=10) for train_index, test_index in kf.split(data, labels): # Build the data/target lists training_data = data.iloc[train_index] training_labels = labels.iloc[train_index] testing_data = data.iloc[test_index] testing_labels = labels.iloc[test_index] # Build the model model = classifier.fit(training_data, training_labels) # # Predict predicted_classes = model.predict(testing_data) accuracy_list.append(accuracy_score(testing_labels, predicted_classes)) print "The sk-learn decision tree predicted the auto dataset's classes with an average of", print sum(accuracy_list) / float(len(accuracy_list)) * 100, print "percent accuracy."
print('===========================================================') print('===========================================================') csv = [] for k in range(1,10): print("\n\nEvaluating with k-mer:", k, " ==========================") data = fe.generateLabeledData(dataset_path + dataset + "/data.fa", dataset_path + dataset + "/class.csv") kf = KFold(n_splits = 5, shuffle = True, random_state=1) i = 0 metrics_castor = [] metrics_kameris = [] for train_index, test_index in kf.split(np.zeros(len(data))): data_train = split_data(data, train_index) data_test = split_data(data, test_index) acc, pre, recall, fscore, number_features = kameris(data_train, data_test, k, dimention_reduction) metrics_kameris.append([acc, pre, recall, fscore, number_features]) print("k-fold ", i, "metrics_kameris: ", acc, pre, recall, fscore) acc, pre, recall, fscore, number_features = castor(data_train, data_test, k, dimention_reduction) metrics_castor.append([acc, pre, recall, fscore, number_features]) print("k-fold ", i, "metrics_castor: ", acc, pre, recall, fscore) i += 1 metrics_kameris = np.matrix(metrics_kameris)
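# The per-fold metric rows collected above are typically reduced to a mean and
# standard deviation per column -- a sketch using np.asarray (np.matrix is
# deprecated in recent NumPy):
metrics_kameris = np.asarray(metrics_kameris)
print("kameris mean [acc, pre, recall, fscore, n_features]:", metrics_kameris.mean(axis=0))
print("kameris std :", metrics_kameris.std(axis=0))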
# param_grid=param_grid, scoring=None, # cv=n_folds, n_jobs=n_jobs, verbose=verbose_grid) #gs = gs.fit(X_new, y) #print(gs.scorer_) #print('best score from grid search: %.3f' % gs.best_score_) #print(gs.best_params_) #best = gs.best_params_ #n_estimators_gs = best['n_estimators'] #max_depth_gs = best['max_depth'] #max_features_gs = best['max_features'] # run some cross validation print('running cross validation to determine accuracy of model...') scores = [] splits = KFold(n_splits=n_folds, shuffle=True, random_state=random_state) for train, test in splits.split(X): tree.fit(X[train], y[train]) predicted = tree.predict(X[test]) score = mean_absolute_error(y[test], predicted) scores.append(score) print(scores) # determine which features to write to the file n_estimators = n_estimators_def max_depth = max_depth_def max_features = max_features_def score = np.mean(scores) print('writing the data to file...') params = (n_folds, n_estimators, max_depth, max_features, score) write_hyperparams(params, hyperParamFile)
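# An equivalent one-liner for the MAE loop above, assuming the same tree/X/y and
# KFold splitter (sklearn scorers are negated losses, hence the leading minus):
from sklearn.model_selection import cross_val_score

mae_scores = -cross_val_score(tree, X, y, cv=splits, scoring='neg_mean_absolute_error')
print(mae_scores, mae_scores.mean())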
input_x.append([float(x) / 255 for x in row[:len(row) - 1]]) input_y.append(float(row[len(row) - 1])) print("Gone through all the data") datax = np.array(input_x) number_of_features = datax.shape[1] datax_temp = np.array(input_x) datay = np.array(input_y) datay_temp = np.array(input_y) number_of_training = datax.shape[0] gamma = [0.00001, 0.001, 1, 5, 10] c1 = 0 kf = KFold(n_splits=5) kf.get_n_splits(datax) score_max = 0 index = 0 for train_index, test_index in kf.split(datax): X_train, X_test = datax[train_index], datax[test_index] y_train, y_test = datay[train_index], datay[test_index] clf1 = SVC(kernel='rbf', gamma=gamma[c1]) clf1.fit(X_train, y_train) y_pred = clf1.predict(X_test) score1 = metrics.accuracy_score(y_test, y_pred) if (score1 > score_max): score_max = score1 index = c1 c1 += 1 print(score1) clf1 = SVC(kernel='rbf', gamma=gamma[index]) clf1.fit(datax, datay) end_training_time = time.time() - start_time
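# The manual gamma loop above is what GridSearchCV automates, including the
# best-parameter bookkeeping -- a sketch over the same gamma grid:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

gs = GridSearchCV(SVC(kernel='rbf'), {'gamma': [0.00001, 0.001, 1, 5, 10]}, cv=5)
gs.fit(datax, datay)
print(gs.best_params_, gs.best_score_)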
def fSplitDataset(allPatches, allY, allPats, sSplitting, patchSize, patchOverlap, split_ratio, sFolder, nfolds=0): # TODO: adapt path iReturn = expecting() if len(patchSize) == 3: if allPatches.shape[0] == patchSize[0] and allPatches.shape[ 1] == patchSize[1] and allPatches.shape[2] == patchSize[2]: allPatches = np.transpose(allPatches, (3, 0, 1, 2)) print(allPatches.shape) else: if allPatches.shape[0] == patchSize[0] and allPatches.shape[ 1] == patchSize[1]: allPatches = np.transpose(allPatches, (2, 0, 1)) print(allPatches.shape) if sSplitting == "normal": print("Done") nPatches = allPatches.shape[0] dVal = math.floor(split_ratio * nPatches) rand_num = np.random.permutation(np.arange(nPatches)) rand_num = rand_num[0:int(dVal)].astype(int) print(rand_num) if len(patchSize) == 3: X_test = allPatches[rand_num, :, :, :] else: X_test = allPatches[rand_num, :, :] y_test = allY[rand_num] X_train = allPatches X_train = np.delete(X_train, rand_num, axis=0) y_train = allY y_train = np.delete(y_train, rand_num) print(X_train.shape) print(X_test.shape) print(y_train.shape) print(y_test.shape) if iReturn == 0: if len(patchSize) == 3: folder = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) + str(patchSize[2]) Path = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) + str( patchSize[2]) + os.sep + 'normal_' + str( patchSize[0]) + str(patchSize[1]) + '.h5' else: folder = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) Path = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) + os.sep + 'normal_' + str( patchSize[0]) + str(patchSize[1]) + '.h5' if os.path.isdir(folder): pass else: os.makedirs(folder) print(Path) with h5py.File(Path, 'w') as hf: hf.create_dataset('X_train', data=X_train) hf.create_dataset('X_test', data=X_test) hf.create_dataset('y_train', data=y_train) hf.create_dataset('y_test', data=y_test) hf.create_dataset('patchSize', data=patchSize) hf.create_dataset('patchOverlap', data=patchOverlap) else: return [X_train], [y_train], [X_test], [y_test ] # embed in a 1-fold list elif sSplitting == "crossvalidation_data": if nfolds == 0: kf = KFold(n_splits=len(np.unique(allPats))) else: kf = KFold(n_splits=nfolds) ind_split = 0 X_trainFold = [] X_testFold = [] y_trainFold = [] y_testFold = [] for train_index, test_index in kf.split(allPatches): X_train, X_test = allPatches[train_index], allPatches[test_index] y_train, y_test = allY[train_index], allY[test_index] if iReturn == 0: if len(patchSize) == 3: folder = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) + str(patchSize[2]) Path = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) + str( patchSize[2]) + os.sep + 'crossVal_data' + str( ind_split) + '_' + str(patchSize[0]) + str( patchSize[1]) + str(patchSize[2]) + '.h5' else: folder = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) Path = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) + os.sep + 'crossVal_data' + str( ind_split) + '_' + str(patchSize[0]) + str( patchSize[1]) + '.h5' if os.path.isdir(folder): pass else: os.makedirs(folder) with h5py.File(Path, 'w') as hf: hf.create_dataset('X_train', data=X_train) hf.create_dataset('X_test', data=X_test) hf.create_dataset('y_train', data=y_train) hf.create_dataset('y_test', data=y_test) hf.create_dataset('patchSize', data=patchSize) hf.create_dataset('patchOverlap', data=patchOverlap) else: X_trainFold.append(X_train) X_testFold.append(X_test) y_trainFold.append(y_train) y_testFold.append(y_test) ind_split += 1 X_trainFold = np.asarray(X_trainFold) X_testFold = 
np.asarray(X_testFold) y_trainFold = np.asarray(y_trainFold) y_testFold = np.asarray(y_testFold) if iReturn > 0: return X_trainFold, y_trainFold, X_testFold, y_testFold elif sSplitting == "crossvalidation_patient": unique_pats = np.unique(allPats) X_trainFold = [] X_testFold = [] y_trainFold = [] y_testFold = [] for ind_split in unique_pats: train_index = np.where(allPats != ind_split)[0] test_index = np.where(allPats == ind_split)[0] X_train, X_test = allPatches[train_index], allPatches[test_index] y_train, y_test = allY[train_index], allY[test_index] if iReturn == 0: if len(patchSize) == 3: folder = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) + str(patchSize[2]) Path = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) + str( patchSize[2]) + os.sep + 'crossVal' + str( ind_split) + '_' + str(patchSize[0]) + str( patchSize[1]) + str(patchSize[2]) + '.h5' else: folder = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) Path = sFolder + os.sep + str(patchSize[0]) + str( patchSize[1]) + os.sep + 'crossVal' + str( ind_split) + '_' + str(patchSize[0]) + str( patchSize[1]) + '.h5' if os.path.isdir(folder): pass else: os.makedirs(folder) with h5py.File(Path, 'w') as hf: hf.create_dataset('X_train', data=X_train) hf.create_dataset('X_test', data=X_test) hf.create_dataset('y_train', data=y_train) hf.create_dataset('y_test', data=y_test) hf.create_dataset('patchSize', data=patchSize) hf.create_dataset('patchOverlap', data=patchOverlap) else: X_trainFold.append(X_train) X_testFold.append(X_test) y_trainFold.append(y_train) y_testFold.append(y_test) X_trainFold = np.asarray(X_trainFold, dtype='f') X_testFold = np.asarray(X_testFold, dtype='f') y_trainFold = np.asarray(y_trainFold, dtype='f') y_testFold = np.asarray(y_testFold, dtype='f') if iReturn > 0: return X_trainFold, y_trainFold, X_testFold, y_testFold
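# The mixed-patient 'crossvalidation_data' branch above can leak the same
# patient into both train and test; sklearn's GroupKFold is a common alternative
# that keeps each patient on one side of the split -- a sketch reusing the
# function's own allPatches/allY/allPats names (assumes nfolds >= 2):
from sklearn.model_selection import GroupKFold

gkf = GroupKFold(n_splits=nfolds)
for train_index, test_index in gkf.split(allPatches, allY, groups=allPats):
    X_train, X_test = allPatches[train_index], allPatches[test_index]
    y_train, y_test = allY[train_index], allY[test_index]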
# Filter down to most common 100 questions questions = data_temp.groupby( 'question')['student'].nunique().sort_values().iloc[-100:].index.values data_temp = data_temp[data_temp['question'].isin(questions)].drop( columns='index') # Set up the algorithms glicko = Glicko2() irt_1pl = IRT3PL(**{'model_name': '1pl'}) algorithms = [glicko, irt_1pl] # Now do a five fold cross validation on the data kf = KFold(n_splits=5, random_state=42, shuffle=True) fold = 0 for train_idx, test_idx in kf.split(data_temp): df_train = data_temp.iloc[train_idx] df_test = data_temp.iloc[test_idx] for alg in algorithms: alg.fit(df_train) df_out = alg.predict(df_test) df_out = df_out.merge(df_test) accuracy = 1 - np.mean( np.abs(np.round(df_out['p']) - df_out['score'])) dft = pd.DataFrame({ 'course': [course], 'test_idx': [fold], 'algorithm': [alg.model_name], 'accuracy': [accuracy] })
print(cm_df) for i, cls in enumerate(clf_LinearSVC.classes_): print("\nFeature weights for class : ", cls, "\n") df = pd.DataFrame(data={ "Features": vec.feature_names_, "weights": clf_LinearSVC.coef_[i] }) df = df.sort_values(axis=0, by='weights', ascending=False) print(df[:50]) kfold = KFold(n_splits=5, shuffle=True, random_state=1234) preds = [] truths = [] #y = np.array(labels) for train, test in kfold.split(X): gnb = LogisticRegression() #C=0.001,fit_intercept=True,max_iter=20,solver="lbfgs",class_weight = 'balanced') #MLPClassifier(hidden_layer_sizes=(5,10),solver='sgd',learning_rate = 'adaptive',activation='logistic') clf = gnb.fit(X[train], Y[train]) #print(clf.class_count_) preds.extend(clf.predict(X[test])) truths.extend(Y[test]) acc = accuracy_score(truths, preds) print('accuracy : %0.3f' % acc) cnf_matrix = confusion_matrix( truths, preds, labels=['Not Relevant', 'Deceptive', 'Relevant']) print(cnf_matrix) #evaluate_combinations(X_train, Y_train)
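# The accumulate-per-fold pattern above (preds/truths extended fold by fold) is
# also available as sklearn's cross_val_predict -- a sketch assuming the same X
# and Y; note cv=5 defaults to a non-shuffled (Stratified)KFold, unlike the
# shuffled KFold above:
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

oof_preds = cross_val_predict(LogisticRegression(), X, Y, cv=5)
print('accuracy : %0.3f' % accuracy_score(Y, oof_preds))
print(confusion_matrix(Y, oof_preds, labels=['Not Relevant', 'Deceptive', 'Relevant']))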
def fSplitDatasetCorrection(sSplitting, dRefPatches, dArtPatches, allPats, split_ratio, nfolds, test_index): """ Split dataset with three options: 1. normal: randomly split data according to the split_ratio without cross validation 2. crossvalidation_data: perform crossvalidation with mixed patient data 3. crossvalidation_patient: perform crossvalidation with separate patient data @param sSplitting: splitting mode 'normal', 'crossvalidation_data' or 'crossvalidation_patient' @param dRefPatches: reference patches @param dArtPatches: artifact patches @param allPats: patient index @param split_ratio: the ratio to split test data @param nfolds: folds for cross validation @return: testing and training data for both reference and artifact images """ train_ref_fold = [] test_ref_fold = [] train_art_fold = [] test_art_fold = [] # normal splitting if sSplitting == 'normal': nPatches = dRefPatches.shape[0] dVal = math.floor(split_ratio * nPatches) rand_num = np.random.permutation(np.arange(nPatches)) rand_num = rand_num[0:int(dVal)].astype(int) test_ref_fold.append(dRefPatches[rand_num, :, :]) train_ref_fold.append(np.delete(dRefPatches, rand_num, axis=0)) test_art_fold.append(dArtPatches[rand_num, :, :]) train_art_fold.append(np.delete(dArtPatches, rand_num, axis=0)) # crossvalidation with mixed patient if sSplitting == "crossvalidation_data": if nfolds == 0: kf = KFold(n_splits=len(np.unique(allPats))) else: kf = KFold(n_splits=nfolds) for train_index, test_index in kf.split(dRefPatches): train_ref, test_ref = dRefPatches[train_index], dRefPatches[ test_index] train_art, test_art = dArtPatches[train_index], dArtPatches[ test_index] train_ref_fold.append(train_ref) train_art_fold.append(train_art) test_ref_fold.append(test_ref) test_art_fold.append(test_art) # crossvalidation with separate patient elif sSplitting == 'crossvalidation_patient': if test_index == -1: unique_pats = np.unique(allPats) else: unique_pats = [test_index] for ind_split in unique_pats: train_index = np.where(allPats != ind_split)[0] test_index = np.where(allPats == ind_split)[0] train_ref, test_ref = dRefPatches[train_index], dRefPatches[ test_index] train_art, test_art = dArtPatches[train_index], dArtPatches[ test_index] train_ref_fold.append(train_ref) train_art_fold.append(train_art) test_ref_fold.append(test_ref) test_art_fold.append(test_art) train_ref_fold = np.asarray(train_ref_fold, dtype='f') train_art_fold = np.asarray(train_art_fold, dtype='f') test_ref_fold = np.asarray(test_ref_fold, dtype='f') test_art_fold = np.asarray(test_art_fold, dtype='f') return train_ref_fold, test_ref_fold, train_art_fold, test_art_fold
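# The 'crossvalidation_patient' branch above (one held-out patient per fold when
# test_index == -1) matches sklearn's LeaveOneGroupOut -- a sketch reusing the
# function's own dRefPatches/allPats names:
from sklearn.model_selection import LeaveOneGroupOut

logo = LeaveOneGroupOut()
for train_index, test_index in logo.split(dRefPatches, groups=allPats):
    train_ref, test_ref = dRefPatches[train_index], dRefPatches[test_index]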