def __init__(self, categorical_features_indices, params):
    """Initialise the instance.

    Forwards ``params`` to the parent initialiser, then records the
    categorical feature indices, a display name, the cross-validation
    splitter and the metric functions.

    :param categorical_features_indices: stored unchanged on the instance.
    :param params: passed straight through to the parent ``__init__``.
    """
    super().__init__(params)

    self.name = 'CatBoostRegressor'
    self.categorical_features_indices = categorical_features_indices

    # Metric label -> scoring callable.
    self.metrics = dict(MAE=mae_score, RMSE=rmse_score)

    # Shuffled 5-fold split with a fixed seed so runs are repeatable.
    self.cv = KFold(n_splits=5, shuffle=True, random_state=1)
res = [] for md in max_depths: # print() for nest in n_estimators: mdl = RandomForestClassifier(max_depth=md, n_estimators=nest, oob_score=True, max_features='auto', random_state=88) ''' Kfold cross validation ''' scores_test, scores_train = [], [] kf = KFold(n_splits=k_fold) for train_index, test_index in kf.split(X): X_train = X[train_index] X_test = X[test_index] y_train, y_test = y[train_index], y[test_index] mdl.fit(X_train, y_train) score_test = mdl.score(X_test, y_test) score_train = mdl.score(X_train, y_train) # print("\t score test {:.3f} train {:.3f} ".format(score_test, score_train)) scores_test.append(score_test) scores_train.append(score_train) # print("md {}, nest {}, test {:.3f} +- {:.2f} train {:.3f}".format(md,nest,np.mean(scores_test), np.std(scores_test), np.mean(scores_train) )) res.append({
def __init__(self, n_splits=2, shuffle=False):
    """Store the split configuration and build the K-fold splitter.

    :param n_splits: number of folds. A splitter is only built when this
        is greater than 1.
    :param shuffle: forwarded to ``KFold`` when a splitter is built.

    ``self.k_fold`` is always defined: it is a ``KFold`` instance when
    ``n_splits > 1`` and ``None`` otherwise, so reading the attribute
    never raises ``AttributeError`` on a partially initialised object.
    """
    self.n_splits = n_splits
    self.k_fold = None
    if self.n_splits > 1:
        self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle)
def train_LSTM(X, y, model, inp_dim, weights, epochs=EPOCHS, batch_size=BATCH_SIZE): cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42) print cv_object p, r, f1 = 0., 0., 0. p1, r1, f11 = 0., 0., 0. sentence_len = X.shape[1] for train_index, test_index in cv_object.split(X): if INITIALIZE_WEIGHTS_WITH == "glove": model.layers[0].set_weights([weights]) elif INITIALIZE_WEIGHTS_WITH == "random": shuffle_weights(model) else: print "ERROR!" return X_train, y_train = X[train_index], y[train_index] X_test, y_test = X[test_index], y[test_index] y_train = y_train.reshape((len(y_train), 1)) X_temp = np.hstack((X_train, y_train)) for epoch in xrange(epochs): for X_batch in batch_gen(X_temp, batch_size): x = X_batch[:, :sentence_len] y_temp = X_batch[:, sentence_len] class_weights = None if SCALE_LOSS_FUN: class_weights = {} class_weights[0] = np.where( y_temp == 0)[0].shape[0] / float(len(y_temp)) class_weights[1] = np.where( y_temp == 1)[0].shape[0] / float(len(y_temp)) class_weights[2] = np.where( y_temp == 2)[0].shape[0] / float(len(y_temp)) try: y_temp = np_utils.to_categorical(y_temp, nb_classes=3) except Exception as e: print e print y_temp print x.shape, y.shape loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights) print loss, acc y_pred = model.predict_on_batch(X_test) y_pred = np.argmax(y_pred, axis=1) print classification_report(y_test, y_pred) print precision_recall_fscore_support(y_test, y_pred) print y_pred p += precision_score(y_test, y_pred, average='weighted') p1 += precision_score(y_test, y_pred, average='micro') r += recall_score(y_test, y_pred, average='weighted') r1 += recall_score(y_test, y_pred, average='micro') f1 += f1_score(y_test, y_pred, average='weighted') f11 += f1_score(y_test, y_pred, average='micro') print "macro results are" print "average precision is %f" % (p / NO_OF_FOLDS) print "average recall is %f" % (r / NO_OF_FOLDS) print "average f1 is %f" % (f1 / NO_OF_FOLDS) print "micro results are" print 
"average precision is %f" % (p1 / NO_OF_FOLDS) print "average recall is %f" % (r1 / NO_OF_FOLDS) print "average f1 is %f" % (f11 / NO_OF_FOLDS)
# Training features as float32, scaled into the [0, 1] range.
X = X_train_df.values.astype('float32')
X_scaler = MinMaxScaler(feature_range=(0, 1))
X = X_scaler.fit_transform(X)
X_test = X_df.loc[test_dates_in_X_df]

Y_df = pd.read_csv('swell_Y.csv', index_col=[0])
# Select the labels with the training index itself so the rows of Y line up
# one-to-one with the rows of X. Indexing with a set gives an arbitrary row
# order and is rejected by recent pandas versions.
Y_train_df = Y_df.loc[X_train_df.index]
Y = Y_train_df.values  # Y values shaped like 100101011... over 24 hours

number_of_var = len(X_train_df.columns)
# Number of unordered pairs of input variables: n * (n - 1) / 2.
first_layer_node_cnt = int(number_of_var * (number_of_var - 1) / 2)
print("first_layer_node_cnt %d" % first_layer_node_cnt)

epochs = 300
patience_num = 100
n_fold = 10
kf = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

# Empty accuracy list
accuracy = []
filename = os.path.basename(os.path.realpath(sys.argv[0]))

# Model configuration, compilation and execution
for train_index, validation_index in kf.split(X):
    # From here on: train the model, then test it.
    print("TRAIN:", train_index, "TEST:", validation_index)
    X_train, X_Validation = X[train_index], X[validation_index]
    Y_train, Y_Validation = Y[train_index], Y[validation_index]
    model = Sequential()
    model.add(
        Dense(first_layer_node_cnt, input_dim=number_of_var, activation='relu'))
    model.add(Dense(int(first_layer_node_cnt / 2), activation='relu'))
def calc_model():
    """Build the word features, train a LinearSVC classifier and report accuracy.

    Reads one document per line from ``positive.txt`` (label 1) and
    ``negative.txt`` (label 0), shuffles them, derives the word-feature list
    from the first 5000 distinct lower-cased tokens, pickles that list via
    ``save_pickle`` and fits an ``SklearnClassifier(LinearSVC())``.

    Side effects: rebinds the module-level ``word_features`` and
    ``classifier``, writes the pickle, and prints progress and the accuracy.
    """
    global word_features, classifier, word_features_2gram

    documents = []
    with open("positive.txt", 'r') as csv_file:
        for record in csv_file:
            documents.append((word_tokenize(record), 1))
    with open("negative.txt", 'r') as csv_file:
        for record in csv_file:
            documents.append((word_tokenize(record), 0))
    random.shuffle(documents)

    lowered_words = []
    for tokens, _label in documents:
        lowered_words.extend(w.lower() for w in tokens)
    all_words = nltk.FreqDist(lowered_words)

    print("getting features")
    # NOTE(review): this takes the first 5000 keys in insertion order, not
    # the 5000 most frequent words -- confirm that is what is wanted.
    word_features = list(all_words.keys())[:5000]
    save_pickle(pickle_word_features, word_features)
    print("saved word features")

    print("setting features per tweet")
    feature_sets = [(find_features(rev), category)
                    for (rev, category) in documents]

    # NOTE(review): the original bound feature_sets[1900:] to "testing_set"
    # and feature_sets[:1900] to "training_set", then trained on the former
    # and scored on the latter. The data flow is kept exactly as it was; only
    # the names now say what each slice is used for. Confirm which slice is
    # meant to be the training data.
    fit_set = feature_sets[1900:]
    eval_set = feature_sets[:1900]

    linear_svc_classifier = SklearnClassifier(LinearSVC())
    classifier = linear_svc_classifier.train(fit_set)
    accuracy = nltk.classify.util.accuracy(classifier, eval_set)
    print('LinearSVC_classifier average accuracy:', accuracy)
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    """Train LightGBM with K-fold cross-validation and average test predictions.

    Rows with a non-null ``TARGET`` are used for training and validation;
    rows with a null ``TARGET`` form the test set. One classifier is fitted
    per fold, out-of-fold predictions are collected for the training rows,
    and the test predictions are averaged over the folds.

    :param df: DataFrame holding the features plus ``TARGET`` and
        ``SK_ID_CURR`` columns.
    :param num_folds: number of cross-validation folds.
    :param stratified: use ``StratifiedKFold`` instead of ``KFold``.
    :param debug: when true, skip writing the submission file.
    :return: DataFrame with columns ``feature``, ``importance`` and ``fold``
        (one row per feature per fold).
    """
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True,
                                random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)

    # Create arrays to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    # Gathered per fold and concatenated once after the loop; this avoids
    # concatenating onto an empty DataFrame and re-copying on every fold.
    fold_importances = []
    feats = [
        f for f in train_df.columns if f not in
        ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x = train_df[feats].iloc[train_idx]
        train_y = train_df['TARGET'].iloc[train_idx]
        valid_x = train_df[feats].iloc[valid_idx]
        valid_y = train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(nthread=4,
                             n_estimators=10000,
                             learning_rate=0.02,
                             num_leaves=32,
                             colsample_bytree=0.9497036,
                             subsample=0.8715623,
                             max_depth=8,
                             reg_alpha=0.04,
                             reg_lambda=0.073,
                             min_split_gain=0.0222415,
                             min_child_weight=40,
                             silent=-1,
                             verbose=-1)

        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                verbose=100,
                early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Each fold contributes an equal share to the averaged prediction.
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)

        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    if fold_importances:
        feature_importance_df = pd.concat(fold_importances, axis=0)
    else:
        feature_importance_df = pd.DataFrame()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

    # Write submission file and plot feature importance
    if not debug:
        # Build a fresh frame rather than assigning a column onto test_df,
        # which is a filtered slice of the caller's DataFrame.
        submission = test_df[['SK_ID_CURR']].assign(TARGET=sub_preds)
        submission.to_csv(submission_file_name, index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
def train_classifier(name, classifier, parameters, data, seed, k=10, plot=False):
    """
    Evaluate a classifier with an outer k-fold cross validation, tuning its
    hyper-parameters on every fold with an inner 5-fold grid search.

    :param name: Name of the classifier (used in log output and plot file name).
    :param classifier: Estimator handed to the grid search.
    :param parameters: Hyper-parameter grid to search.
    :param data: Pair ``(X, y)`` of samples and classes.
    :param seed: Random seed for the shuffled outer k-fold split.
    :param k: Number of outer folds (default 10).
    :param plot: If exactly True, save a plot of the tuning scores per fold.
    :return: Tuple of three lists (accuracy, f1, roc_auc), one entry per
             outer fold, measured with the refitted best estimator.
    :raises NotImplementedError: when plotting is requested and the grid has
             more or fewer than one hyper-parameter.
    """
    samples, classes = data

    searches = []            # fitted grid search of every outer fold
    accuracy_per_fold = []
    f1_per_fold = []
    roc_auc_per_fold = []

    outer_split = KFold(n_splits=k, shuffle=True, random_state=seed)
    for fold_no, (train_index, test_index) in enumerate(
            outer_split.split(samples, classes), start=1):
        print("Training {}... Fold {}".format(name, fold_no))

        X_train, X_test = samples[train_index], samples[test_index]
        y_train, y_test = classes[train_index], classes[test_index]

        # inner 5-fold search scored on F1; the best setting is refitted
        search = GridSearchCV(classifier, parameters, scoring="f1", cv=5,
                              n_jobs=5, refit=True)
        search.fit(X_train, y_train)
        searches.append(search)

        predicted = search.predict(X_test)
        accuracy_per_fold.append(accuracy_score(y_test, predicted))
        f1_per_fold.append(f1_score(y_test, predicted))
        roc_auc_per_fold.append(roc_auc_score(y_test, predicted))

    results = accuracy_per_fold, f1_per_fold, roc_auc_per_fold
    if plot is not True:
        return results

    # only a single hyper-parameter can be drawn on one axis
    if len(parameters.keys()) != 1:
        raise NotImplementedError(
            "The number of hyper-parameters is not equal to 1. "
            "I do not know how to plot them.")
    parameter_name = list(parameters.keys())[0]
    column = "param_" + parameter_name

    # every fold must have searched the very same grid values
    assert len(searches) == k
    x_axis_values = searches[0].cv_results_[column]
    for search in searches:
        np.testing.assert_array_equal(x_axis_values,
                                      search.cv_results_[column])

    # one curve of mean inner-CV scores per outer fold
    curves = [search.cv_results_["mean_test_score"] for search in searches]
    for fold_no, curve in enumerate(curves, start=1):
        plt.semilogx(x_axis_values, curve, label="Fold " + str(fold_no))

    plt.title(name + " - tuning of the best value for " + parameter_name)
    plt.xlabel(parameter_name + " parameter")
    plt.ylabel("F1 scores (on the test set)")
    plt.legend(loc=4)
    plt.savefig(name.lower().replace(" ", "_") + ".png")
    plt.close()

    return results
# The following figure illustrates k-fold cross-validation with k=4. There are some other schemes to divide the training set, we'll look at them briefly later. # ### K-Fold Cross Validation # It is a statistical technique which enables us to make extremely efficient use of available data It divides the data into several pieces, or 'folds', and uses each piece as test data one at a time # In[103]: from sklearn.model_selection import cross_val_score from sklearn.model_selection import KFold from sklearn.model_selection import GridSearchCV # In[104]: xgb = XGBClassifier(n_estimators=18) scores = cross_val_score(xgb, X_smote, y_smote, scoring='r2', cv=5) scores # In[105]: # the other way of doing the same thing (more explicit) # create a KFold object with 5 splits folds = KFold(n_splits=5, shuffle=True, random_state=100) scores_1 = cross_val_score(xgb, X_smote, y_smote, scoring='r2', cv=folds) scores_1 # We used several methods to predict the default the best result we got by using XGboost on data which was sampled using SMOTE the Accuracy of XGB on the testing dataset is :0.981. Also the important features are:V4,V14,V12,V16,V11. Also by performing logistic regression we got a good score of Accuracy: 97.6780% Recall: 87.8378% ROC AUC: 92.7664% For classification model. # In[ ]: