def run_regression(train_embeds, train_labels, test_embeds, test_labels):
    """Fit a logistic-loss SGD classifier and print accuracy figures.

    Three values are printed, each under its own heading: the classifier's
    accuracy on the test split, its accuracy on the training split, and the
    test accuracy of a ``DummyClassifier`` built with default settings.

    Args:
        train_embeds: Feature matrix used for fitting both models.
        train_labels: Labels aligned with ``train_embeds``.
        test_embeds: Feature matrix used for evaluation.
        test_labels: Labels aligned with ``test_embeds``.
    """
    # Seed NumPy's global RNG before any estimator is created or fitted.
    np.random.seed(1)

    from sklearn.linear_model import SGDClassifier
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import accuracy_score

    baseline = DummyClassifier()
    baseline.fit(train_embeds, train_labels)

    classifier = SGDClassifier(loss="log", n_jobs=55)
    classifier.fit(train_embeds, train_labels)

    print("Test scores")
    test_accuracy = accuracy_score(test_labels, classifier.predict(test_embeds))
    print(test_accuracy)

    print("Train scores")
    train_accuracy = accuracy_score(train_labels, classifier.predict(train_embeds))
    print(train_accuracy)

    print("Random baseline")
    baseline_accuracy = accuracy_score(test_labels, baseline.predict(test_embeds))
    print(baseline_accuracy)
def get_scores(X, y):
    """Score an SVM and a majority-class baseline per feature-subset size.

    For every subset size ``k`` from 1 to the number of columns in ``X``,
    each ``k``-combination of columns is evaluated on 40 stratified shuffle
    splits (5% held out). On each split a grid search over ``C`` and
    ``kernel`` is fitted on *all* columns, and its best estimator is then
    re-fitted and scored on the selected columns only.

    Args:
        X: 2-D feature array indexable as ``X[rows, :]``.
        y: Label array indexable by the split indices.

    Returns:
        Tuple ``(stest, strain, sdummy)`` of lists holding, per subset size,
        the mean test accuracy, mean train accuracy and mean baseline test
        accuracy over all combinations and splits of that size.
    """
    nfolds = 40
    cv = StratifiedShuffleSplit(y, n_iter=nfolds, test_size=.05)
    dumb = DummyClassifier(strategy='most_frequent')
    base_svm = svm.SVC(class_weight='auto')
    param_dist = {"C": [.1, 1, 10],
                  "kernel": ['rbf', 'linear', 'poly']}
    search = GridSearchCV(base_svm, param_grid=param_dist,
                          scoring='mean_absolute_error')

    n_columns = X.shape[1]
    stest, strain, sdummy = [], [], []
    for subset_size in range(1, n_columns + 1):
        test_scores, train_scores, dummy_scores = [], [], []
        # Every possible combination of `subset_size` feature columns.
        for my_feats in itertools.combinations(range(n_columns), subset_size):
            idx = np.array(my_feats)
            for train, test in cv:
                y_train, y_test = y[train], y[test]
                X_train, X_test = X[train, :], X[test, :]
                # Hyper-parameters are chosen on the full feature set ...
                search.fit(X_train, y_train)
                best = search.best_estimator_
                # ... and the winner is re-fitted on the chosen columns.
                best.fit(X_train[:, idx], y_train)
                train_scores.append(
                    accuracy_score(best.predict(X_train[:, idx]), y_train))
                test_scores.append(
                    accuracy_score(best.predict(X_test[:, idx]), y_test))
                dumb.fit(X_train[:, idx], y_train)
                dummy_scores.append(
                    accuracy_score(dumb.predict(X_test[:, idx]), y_test))
        sdummy.append(np.mean(dummy_scores))
        strain.append(np.mean(train_scores))
        stest.append(np.mean(test_scores))
    return stest, strain, sdummy
def do_cross_validation(labels):
    """Run a stratified k-fold cross validation with a majority-class model.

    A ``DummyClassifier(strategy='most_frequent')`` is fitted on the training
    labels of every fold (no features are used) and asked to predict the
    test instances of that fold.

    Keyword arguments:
    labels -- all labels (array indexable by the fold index arrays)

    Returns a pair ``(single_predictions, classification_result)``:
    single_predictions -- one row per test instance with the columns
        fold number, test index, true label, predicted label
    classification_result -- array of shape ``(NO_OF_FOLDS + 1, 5)``; row
        ``k`` holds ``get_classification_result`` for fold ``k``. The extra
        last row is allocated but not written by this function.
    """
    skf = StratifiedKFold(labels, NO_OF_FOLDS)

    # One block of rows per fold; stacked into a single array at the end.
    per_fold_rows = []

    # Per-fold results plus one spare row for the entire task.
    classification_result = np.zeros((NO_OF_FOLDS + 1, 5))

    for cur_fold, (train_idx, test_idx) in enumerate(skf):
        true_labels = labels[test_idx]

        model = DummyClassifier(strategy='most_frequent')
        model.fit(None, labels[train_idx])
        # The placeholder input only fixes the number of predictions.
        pred_labels = model.predict(np.zeros(true_labels.shape[0]))

        fold_array = np.empty(test_idx.shape[0])
        fold_array[:] = cur_fold

        stacked = np.vstack((fold_array, test_idx, true_labels, pred_labels))
        per_fold_rows.append(np.transpose(stacked))

        classification_result[cur_fold, :] = get_classification_result(
            cur_fold, true_labels, pred_labels)

    single_predictions = np.vstack(per_fold_rows)
    return single_predictions, classification_result
def get_scores(X, y):
    """Evaluate a grid-searched logistic regression against a baseline.

    Runs 200 stratified shuffle splits (20% held out). On each split a grid
    search over the regularisation strength ``C`` is fitted on the training
    part, the selected parameters are printed, and the best estimator and a
    majority-class ``DummyClassifier`` are scored on the held-out part.

    Args:
        X: 2-D feature array indexable as ``X[rows, :]``.
        y: Label array indexable by the split indices.

    Returns:
        Tuple ``(test_scores, train_scores, dummy_scores, preds,
        true_labels)``. The first three are per-split accuracy lists; the
        last two are the concatenated test predictions and matching true
        labels over all splits.
    """
    nfolds = 200
    cv = StratifiedShuffleSplit(y, n_iter=nfolds, test_size=0.2)
    dumb = DummyClassifier(strategy="most_frequent")
    # Only the logistic-regression configuration was ever used; the SVC
    # estimator and its grid were dead assignments and have been removed.
    param_dist = {"C": [1e6, 1e5, 1e4, 1e3, 1e2, 10, 1, 0.1, 0.01, 0.001]}
    search = GridSearchCV(linear_model.LogisticRegression(),
                          param_grid=param_dist,
                          scoring="mean_absolute_error")

    test_scores, train_scores, dummy_scores = [], [], []
    preds, true_labels = [], []
    for train, test in cv:
        y_train, y_test = y[train], y[test]
        X_train, X_test = X[train, :], X[test, :]

        search.fit(X_train, y_train)
        clf = search.best_estimator_
        print(search.best_params_)
        clf.fit(X_train, y_train)

        # Predict the held-out part once and reuse it below.
        test_pred = clf.predict(X_test)
        train_scores.append(accuracy_score(y_train, clf.predict(X_train)))
        test_scores.append(accuracy_score(y_test, test_pred))

        dumb.fit(X_train, y_train)
        dummy_scores.append(accuracy_score(y_test, dumb.predict(X_test)))

        preds += list(test_pred)
        true_labels += list(y_test)
    return test_scores, train_scores, dummy_scores, preds, true_labels
def run_ML_leave_one_subject_out(config, filename, question, clf, cols, return_arr=None, return_index=-1):
    """Score ``clf`` and two dummy baselines with leave-one-subject-out splits.

    The data is loaded with ``load_data`` and split by the ``'User'`` column
    through ``leave_one_subject_out``. For every split ``clf``, a
    most-frequent baseline and a stratified baseline are fitted on the
    training part and scored on the held-out part. The per-split scores are
    averaged and rounded to two decimals.

    Args:
        config: Mapping providing ``'DATA_DIRECTORY'``.
        filename: Passed through to ``load_data``.
        question: Passed through to ``load_data``.
        clf: Estimator exposing ``fit`` and ``score``.
        cols: Passed through to ``load_data``.
        return_arr: Optional indexable container that receives the result
            when ``return_index`` is not ``-1``.
        return_index: Slot of ``return_arr`` to write; ``-1`` means "return
            the result instead".

    Returns:
        ``(score, score_dummy_mf, score_dummy_sf)`` when ``return_index`` is
        ``-1``; otherwise ``None`` and the tuple is stored in
        ``return_arr[return_index]``.
    """
    working_directory = config['DATA_DIRECTORY']
    data_X, data_y = load_data(working_directory, filename, cols, question)
    data = leave_one_subject_out(data_X, data_y, 'User')

    # `strategy` is keyword-only in current scikit-learn releases.
    dummy_clf_mf = DummyClassifier(strategy='most_frequent')
    dummy_clf_sf = DummyClassifier(strategy='stratified')

    score = 0
    score_dummy_mf = 0
    score_dummy_sf = 0
    for (training_X, training_y), (testing_X, testing_y) in data:
        clf.fit(training_X, training_y)
        dummy_clf_mf.fit(training_X, training_y)
        dummy_clf_sf.fit(training_X, training_y)

        single_score = clf.score(testing_X, testing_y)
        single_score_dummy_mf = dummy_clf_mf.score(testing_X, testing_y)
        single_score_dummy_sf = dummy_clf_sf.score(testing_X, testing_y)

        score = score + single_score.mean()
        score_dummy_mf = score_dummy_mf + single_score_dummy_mf.mean()
        score_dummy_sf = score_dummy_sf + single_score_dummy_sf.mean()

    # Average over the number of splits.
    n_splits = len(data)
    score = round(float(score / n_splits), 2)
    score_dummy_mf = round(float(score_dummy_mf / n_splits), 2)
    score_dummy_sf = round(float(score_dummy_sf / n_splits), 2)

    if return_index == -1:
        return score, score_dummy_mf, score_dummy_sf
    else:
        return_arr[return_index] = (score, score_dummy_mf, score_dummy_sf)
def _run_dummy_detection(x_train, x_test, y_train, y_test):
    """Fit a most-frequent-class baseline and print its test-set score."""
    baseline = DummyClassifier(strategy='most_frequent')

    print("Training Dummy...")
    baseline.fit(x_train, y_train)

    print("Predicting Test Set...")
    test_score = baseline.score(x_test, y_test)
    print("Score for test set: {}".format(test_score))
def test_dummy_classifier_on_nan_value():
    """A NaN feature value must not prevent fitting or predicting."""
    # `np.nan` is the canonical spelling; the `np.NaN` alias no longer
    # exists in NumPy 2.0.
    X = [[np.nan]]
    y = [1]
    y_expected = [1]
    clf = DummyClassifier()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_equal(y_pred, y_expected)
def test_most_frequent_strategy():
    """The most_frequent strategy predicts the modal label for every row."""
    features = [[0], [0], [0], [0]]  # ignored
    targets = [1, 2, 1, 1]

    clf = DummyClassifier(strategy="most_frequent", random_state=0)
    clf.fit(features, targets)

    predicted = clf.predict(features)
    assert_array_equal(predicted, np.ones(len(features)))
    _check_predict_proba(clf, features, targets)
def test_constant_strategy_multioutput():
    """The constant strategy returns one fixed value per output column."""
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[2, 3],
                  [1, 3],
                  [2, 3],
                  [2, 0]])
    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0,
                          constant=[1, 0])
    clf.fit(X, y)

    # First output is always 1, second output is always 0.
    first_column = np.ones((n_samples, 1))
    second_column = np.zeros((n_samples, 1))
    assert_array_equal(clf.predict(X),
                       np.hstack([first_column, second_column]))
    _check_predict_proba(clf, X, y)
def test_dummy_classifier_on_3D_array():
    """Fitting and predicting must work with a 3-D array of strings as X."""
    X = np.array([[['foo']], [['bar']], [['baz']]])
    y = [2, 2, 2]
    expected_labels = [2, 2, 2]
    expected_proba = [[1], [1], [1]]

    clf = DummyClassifier()
    clf.fit(X, y)

    predicted_labels = clf.predict(X)
    predicted_proba = clf.predict_proba(X)
    assert_array_equal(predicted_labels, expected_labels)
    assert_array_equal(predicted_proba, expected_proba)
def test_constant_strategy_sparse_target():
    """With a sparse target, the constant strategy predicts sparse output."""
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[0, 1],
                                [4, 0],
                                [1, 1],
                                [1, 4],
                                [1, 1]]))
    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0,
                          constant=[1, 0])
    clf.fit(X, y)
    y_pred = clf.predict(X)

    # Plain assert instead of the nose-era `assert_true` helper.
    assert sp.issparse(y_pred)
    assert_array_equal(y_pred.toarray(),
                       np.hstack([np.ones((n_samples, 1)),
                                  np.zeros((n_samples, 1))]))
def test_stratified_strategy():
    """Stratified predictions follow the training class frequencies."""
    X = [[0]] * 5  # ignored
    y = [1, 2, 1, 1, 2]
    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(X, y)

    # Predict on many samples so the empirical frequencies are stable.
    X = [[0]] * 1000
    y_pred = clf.predict(X)
    frequencies = np.bincount(y_pred) / float(len(X))

    for label, expected in ((1, 3. / 5), (2, 2. / 5)):
        assert_almost_equal(frequencies[label], expected, decimal=1)
    _check_predict_proba(clf, X, y)
def test_uniform_strategy():
    """Uniform predictions hit each class about equally often."""
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(X, y)

    # Predict on many samples so the empirical frequencies are stable.
    X = [[0]] * 500
    y_pred = clf.predict(X)
    frequencies = np.bincount(y_pred) / float(len(X))

    for label in (1, 2):
        assert_almost_equal(frequencies[label], 0.5, decimal=1)
    _check_predict_proba(clf, X, y)
def test_most_frequent_and_prior_strategy_multioutput():
    """Both strategies predict the per-column modal label (1 and 0)."""
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[1, 0],
                  [2, 0],
                  [1, 0],
                  [1, 3]])
    n_samples = len(X)

    for strategy in ("prior", "most_frequent"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)

        predicted = clf.predict(X)
        expected = np.hstack([np.ones((n_samples, 1)),
                              np.zeros((n_samples, 1))])
        assert_array_equal(predicted, expected)

        _check_predict_proba(clf, X, y)
        _check_behavior_2d(clf)
def test_classifier_prediction_independent_of_X(strategy):
    """Two models differing only in their feature values predict alike."""
    y = [0, 2, 1, 1]

    zeros_X = [[0]] * 4
    zeros_model = DummyClassifier(strategy=strategy, random_state=0,
                                  constant=0)
    zeros_model.fit(zeros_X, y)
    zeros_predictions = zeros_model.predict(zeros_X)

    ones_X = [[1]] * 4
    ones_model = DummyClassifier(strategy=strategy, random_state=0,
                                 constant=0)
    ones_model.fit(ones_X, y)
    ones_predictions = ones_model.predict(ones_X)

    assert_array_equal(zeros_predictions, ones_predictions)
def test_most_frequent_and_prior_strategy_sparse_target():
    """With a sparse target, both strategies predict sparse modal labels."""
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[1, 0],
                                [1, 3],
                                [4, 0],
                                [0, 1],
                                [1, 0]]))
    n_samples = len(X)
    y_expected = np.hstack([np.ones((n_samples, 1)),
                            np.zeros((n_samples, 1))])

    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        y_pred = clf.predict(X)

        # Plain assert instead of the nose-era `assert_true` helper.
        assert sp.issparse(y_pred)
        assert_array_equal(y_pred.toarray(), y_expected)
def main(training_set, language, gold_standard, gazetteer):
    """ Searches for the best hyperparameters

    Builds the training features from ``training_set`` (one JSON document
    per row), grid-searches a ``LinearSVC`` with 10-fold cross validation
    and weighted F1 scoring, and logs the best score and parameters. When
    ``gold_standard`` is given, the best model and a stratified dummy model
    are additionally scored on it.
    """
    if gazetteer:
        gazetteer = reverse_gazetteer(json.load(gazetteer))
    else:
        gazetteer = {}

    logger.info('Building training set')
    extractor = FactExtractorFeatureExtractor(language)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['fes'],
                                   add_unknown=True, gazetteer=gazetteer)

    logger.info('Finalizing training set')
    x, y = extractor.get_features()

    logger.info('Searching for the best model parameters')
    param_grid = [{
        'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
        'multi_class': ['ovr', 'crammer_singer'],
    }]
    search = GridSearchCV(LinearSVC(), param_grid=param_grid,
                          scoring='f1_weighted', cv=10)
    search.fit(x, y)
    logger.info('The best model (weighted-averaged F1 of %.4f) has parameters %s',
                search.best_score_, search.best_params_)

    if not gold_standard:
        logger.info('Skipping gold standard evaluation')
        return

    logger.info('Evaluating on the gold standard')
    for row in gold_standard:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['fes'])
    x_gold, y_gold = extractor.get_features()

    # Baseline: stratified dummy fitted on the training features.
    dummy = DummyClassifier(strategy='stratified')
    dummy.fit(x, y)
    y_dummy = dummy.predict(x_gold)
    dummy_f1 = metrics.f1_score(y_gold, y_dummy, average='weighted')
    logger.info('Dummy model has a weighted-averaged F1 on the gold standard of %.4f',
                dummy_f1)

    y_best = search.predict(x_gold)
    best_f1 = metrics.f1_score(y_gold, y_best, average='weighted')
    logger.info('Best model has a weighted-averaged F1 on the gold standard of %.4f',
                best_f1)
def test_most_frequent_and_prior_strategy():
    """Both strategies predict the modal label; only probabilities differ.

    ``prior`` is expected to report the class prior as probabilities, while
    ``most_frequent`` is expected to report a one-hot row for the modal
    class.
    """
    X = [[0], [0], [0], [0]]  # ignored
    y = [1, 2, 1, 1]

    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), np.ones(len(X)))
        _check_predict_proba(clf, X, y)

        # Wrap the single sample in a list so predict_proba receives a
        # 2-D (1 sample x 1 feature) input rather than a 1-D sequence.
        single_sample = [X[0]]
        prior_row = clf.class_prior_.reshape((1, -1))
        if strategy == "prior":
            assert_array_equal(clf.predict_proba(single_sample), prior_row)
        else:
            assert_array_equal(clf.predict_proba(single_sample),
                               prior_row > 0.5)
def test_most_frequent_and_prior_strategy_with_2d_column_y():
    """A 1-D target and the same target as a 2-D column predict alike."""
    # non-regression test added in
    # https://github.com/scikit-learn/scikit-learn/pull/13545
    X = [[0], [0], [0], [0]]
    y_1d = [1, 2, 1, 1]
    y_2d = [[1], [2], [1], [1]]

    for strategy in ("most_frequent", "prior"):
        clf_1d = DummyClassifier(strategy=strategy, random_state=0)
        clf_2d = DummyClassifier(strategy=strategy, random_state=0)

        clf_1d.fit(X, y_1d)
        clf_2d.fit(X, y_2d)

        predictions_1d = clf_1d.predict(X)
        predictions_2d = clf_2d.predict(X)
        assert_array_equal(predictions_1d, predictions_2d)
def find_best_dummy_classification(X, y, test_size=0.3, random_state=0, thresh=0.5, target_names=None, n=1):
    """Try all dummy models.

    ``X`` is flattened to 2-D and split into train/test parts. In each of
    ``n`` rounds, four ``DummyClassifier`` strategies and two
    ``DummyRegressor`` strategies are fitted and evaluated. Targets and
    predictions are binarised with ``> thresh`` before the Matthews
    correlation coefficient and the classification report are computed.

    Returns:
        ``(df, best)`` where ``df`` is a DataFrame with one row per fitted
        model, sorted by descending ``matthews_corrcoef``, and ``best`` is
        the first row of that frame as a dict.
    """
    X = X.reshape((len(X), -1))
    # y = y.reshape((len(y) ,-1))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    def _evaluate(model, label):
        # Fit one dummy model and collect its metrics on the test part.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = model.score(X_test, y_test)
        mcc = sklearn.metrics.matthews_corrcoef(y_test > thresh,
                                                y_pred > thresh)
        report = parse_classification_report(
            sklearn.metrics.classification_report(
                y_test > thresh, y_pred > thresh,
                target_names=target_names))
        return collections.OrderedDict(
            strategy=label,
            matthews_corrcoef=mcc,
            score=score,
            report=report,
        )

    classifier_strategies = ['most_frequent', 'uniform', 'prior', 'stratified']
    regressor_strategies = ['mean', 'median']

    dummy_scores = []
    for _ in range(n):
        for strategy in classifier_strategies:
            dummy_scores.append(
                _evaluate(DummyClassifier(strategy=strategy),
                          'classifier_' + strategy))
        for strategy in regressor_strategies:
            dummy_scores.append(
                _evaluate(DummyRegressor(strategy=strategy),
                          'regressor_' + strategy))

    df = pd.DataFrame(dummy_scores)
    df = df.sort_values('matthews_corrcoef', ascending=False)
    best = df[:1].iloc[0].to_dict()
    return df, best
def test_constant_strategy():
    """The constant strategy returns the configured label, int or str."""
    cases = (
        # (targets, constant, expected predictions)
        ([2, 1, 2, 2], 1, np.ones(4)),
        (['two', 'one', 'two', 'two'], 'one', np.array(['one'] * 4)),
    )
    for y, constant, expected in cases:
        X = [[0], [0], [0], [0]]  # ignored
        clf = DummyClassifier(strategy="constant", random_state=0,
                              constant=constant)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), expected)
        _check_predict_proba(clf, X, y)
def main(training_set, language, gold_standard, gazetteer, n_folds, n_jobs, scoring, output, test, word2vec_model, independent_lus):
    """ Searches for the best hyperparameters

    Runs a multi-model grid search over the candidate training sets, logs
    the winning configuration and dumps ``(best_model, best_training_meta)``
    to ``output``. When ``gold_standard`` is given, the best model and a
    stratified dummy model are additionally scored on it.
    """
    logger.info('Searching for the best model and parameters')

    training_sets = get_training_sets(training_set, language, gazetteer,
                                      word2vec_model, independent_lus)
    models = get_models(test)

    search = MultimodelGridSearchCV(*models, cv=n_folds, n_jobs=n_jobs,
                                    scoring=Scorer(scoring, True))
    outcome = search.fit(training_sets)
    (x_tr, y_tr, best_training_meta), best_score, best_params, best_model = outcome

    logger.info('Evaluation Results')
    logger.info(' Best model: %s', best_model.__class__.__name__)
    logger.info(' Score: %f', best_score)
    logger.info(' Parameters: %s', best_params)
    logger.info(' Gazetteer: %s', best_training_meta['gazetteer'])
    logger.info(' Extractor: %s', best_training_meta['extractor_cls'].__name__)
    logger.info(' Extractor args: %s', best_training_meta['extractor_args'])

    joblib.dump((best_model, best_training_meta), output)
    logger.info("Done, dumped model to '%s'", output)

    if not gold_standard:
        logger.info('Skipping gold standard evaluation')
        return

    logger.info('Evaluating on the gold standard')
    # Reuse the extractor and gazetteer that produced the best training set.
    extractor = best_training_meta['extractor']
    gazetteer = best_training_meta['gazetteer']

    extractor.start()
    for row in gold_standard:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['lu'], data['fes'],
                                   add_unknown=False, gazetteer=gazetteer)
    x_gold, y_gold = extractor.get_features(refit=False)

    dummy = DummyClassifier(strategy='stratified')
    dummy.fit(x_tr, y_tr)

    dummy_score = Scorer(scoring, True)(dummy, x_gold, y_gold)
    logger.info('Dummy model has a weighted-averaged F1 on the gold standard of %.4f',
                dummy_score)

    best_model_score = Scorer(scoring, True)(best_model, x_gold, y_gold)
    logger.info('Best model has a weighted-averaged F1 on the gold standard of %.4f',
                best_model_score)
def test_dtype_of_classifier_probas(strategy):
    """predict_proba must return float64 probabilities for the strategy."""
    targets = [0, 2, 1, 1]
    features = np.zeros(4)

    model = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    fitted = model.fit(features, targets)
    probas = fitted.predict_proba(features)

    assert probas.dtype == np.float64
def test_uniform_strategy_multioutput():
    """Uniform predictions are balanced in every output column."""
    X = [[0]] * 4  # ignored
    y = np.array([[2, 1],
                  [2, 2],
                  [1, 2],
                  [1, 1]])
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(X, y)

    # Predict on many samples so the empirical frequencies are stable.
    X = [[0]] * 500
    y_pred = clf.predict(X)
    n_outputs = y.shape[1]

    for output in range(n_outputs):
        frequencies = np.bincount(y_pred[:, output]) / float(len(X))
        for label in (1, 2):
            assert_almost_equal(frequencies[label], 0.5, decimal=1)
        _check_predict_proba(clf, X, y)

    _check_behavior_2d(clf)
def test_stratified_strategy_sparse_target():
    """With a sparse target, stratified predictions follow class frequencies."""
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[4, 1],
                                [0, 0],
                                [1, 1],
                                [1, 4],
                                [1, 1]]))
    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(X, y)

    # Predict on many samples so the empirical frequencies are stable.
    X = [[0]] * 500
    y_pred = clf.predict(X)

    # Plain assert instead of the nose-era `assert_true` helper.
    assert sp.issparse(y_pred)
    y_pred = y_pred.toarray()

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 3.0 / 5, decimal=1)
        assert_almost_equal(p[0], 1.0 / 5, decimal=1)
        assert_almost_equal(p[4], 1.0 / 5, decimal=1)
def compare_dummy_classification(self):
    """ Compares classifier to dummy classifiers.

    Fits a most-frequent, a stratified and a uniform dummy classifier on the
    training vectors and scores each on the test vectors.

    Return a list of three result tuples, each of the form
    (name, "N.A.", "N.A.", "N.A.", "N.A.", precision_recall_fscore_support).
    """
    X_train = self.train_vectors
    y_train = self.train_tweetclasses
    X_test = self.test_vectors
    y_test = self.test_tweetclasses

    baselines = (
        ("dummy freq", "most_frequent"),
        ("dummy strat", "stratified"),
        ("dummy uni", "uniform"),
    )

    dummy_results = []
    for name, strategy in baselines:
        dummy = DummyClassifier(strategy=strategy, random_state=0)
        dummy.fit(X_train, y_train)
        y_true, y_pred = y_test, dummy.predict(X_test)
        scores = precision_recall_fscore_support(y_true, y_pred)
        dummy_results.append((name, "N.A.", "N.A.", "N.A.", "N.A.", scores))
    return dummy_results
def compare_dummy(self):
    """ Compares classifier to dummy classifiers

    Fits a most-frequent, a stratified and a uniform dummy classifier on the
    training vectors and returns their precision/recall/F-score/support
    results on the test vectors as a 3-tuple, in that order.
    """
    X_train = self.train_vectors
    y_train = self.train_tweetclasses
    X_test = self.test_vectors
    y_test = self.test_tweetclasses

    per_strategy = []
    for strategy in ('most_frequent', 'stratified', 'uniform'):
        dummy = DummyClassifier(strategy=strategy, random_state=0)
        dummy.fit(X_train, y_train)
        y_true, y_pred = y_test, dummy.predict(X_test)
        per_strategy.append(precision_recall_fscore_support(y_true, y_pred))

    return tuple(per_strategy)
def eval_against_dumm(FS, aut_target, myclf, folder):
    """Compare a classifier's fold accuracy with three dummy baselines.

    A private deep copy of ``myclf`` is fitted and scored on every fold, so
    the caller's estimator is left untouched. On the same folds a
    stratified, a most-frequent and a uniform ``DummyClassifier`` are fitted
    and scored.

    Args:
        FS: 2-D feature array indexable as ``FS[rows, :]``.
        aut_target: Label array indexable by the fold indices.
        myclf: Estimator exposing ``fit`` and ``predict``.
        folder: Iterable of ``(train_index, test_index)`` pairs.

    Returns:
        Mean accuracy over the folds as a 4-tuple: real classifier,
        stratified dummy, most-frequent dummy, uniform dummy.
    """
    def _fold_accuracy(model, train_index, test_index):
        # Fit on the training part and return accuracy on the test part.
        model.fit(FS[train_index, :], aut_target[train_index])
        labels = np.asarray(model.predict(FS[test_index, :]))
        return np.mean(aut_target[test_index] == labels)

    # Keep the real classifier under its own name. It used to share `clf`
    # with the dummies, so every fold after the first evaluated the last
    # dummy instead of the real model.
    real_clf = copy.deepcopy(myclf)

    real_acc = []
    dummy1_acc, dummy2_acc, dummy3_acc = [], [], []
    baselines = (
        ("stratified", dummy1_acc),
        ("most_frequent", dummy2_acc),
        ("uniform", dummy3_acc),
    )

    for train_index, test_index in folder:
        real_acc.append(_fold_accuracy(real_clf, train_index, test_index))
        for strategy, accuracies in baselines:
            # `strategy` is keyword-only in current scikit-learn releases.
            baseline = DummyClassifier(strategy=strategy)
            accuracies.append(
                _fold_accuracy(baseline, train_index, test_index))

    return np.mean(real_acc), np.mean(dummy1_acc), np.mean(dummy2_acc),\
        np.mean(dummy3_acc)
def kfolds_evaluation(folds, model, scoring, skip_majority, x, y):
    """Log k-fold scores of ``model`` next to a default dummy classifier.

    ``x``/``y`` are split into ``folds`` shuffled folds. On each fold the
    model and a fresh ``DummyClassifier`` are fitted on the training part
    and scored with ``Scorer(scoring, skip_majority)``. Minimum, maximum,
    average and median of the test, dummy and training scores are logged at
    INFO level; the full score lists are logged at DEBUG level.
    """
    kf = KFold(x.shape[0], folds, shuffle=True)
    scorer = Scorer(scoring, skip_majority)

    scores_dummy, scores_test, scores_train = [], [], []
    for train_index, test_index in kf:
        x_train, y_train = x[train_index], y[train_index]
        x_test, y_test = x[test_index], y[test_index]

        model.fit(x_train, y_train)
        dummy = DummyClassifier()
        dummy.fit(x_train, y_train)

        scores_test.append(scorer(model, x_test, y_test))
        scores_dummy.append(scorer(dummy, x_test, y_test))
        scores_train.append(scorer(model, x_train, y_train))

    logger.info("%d-folds cross evaluation results", folds)

    # One summary line per aggregate, always in test/dummy/training order.
    summaries = (
        (" minimum test %f dummy %f training %f", min),
        (" maximum test %f dummy %f training %f", max),
        (" average test %f dummy %f training %f", np.average),
        (" median test %f dummy %f training %f", np.median),
    )
    for message, aggregate in summaries:
        logger.info(message, aggregate(scores_test),
                    aggregate(scores_dummy), aggregate(scores_train))

    logger.debug("full test scores: %s", scores_test)
    logger.debug("full dummy scores: %s", scores_dummy)
    logger.debug("full train scores: %s", scores_train)
def get_xs_ys_predictions(embeddings_dict, classifier):
    """
    Run a classifier of type 'classifier' (one of: majority vote baseline,
    stratified sampling baseline, 10-NN classifier) on the embeddings. The
    model is fitted and evaluated on the same data.

    Return:
    - xs: the word embeddings, ordered by sorted word
    - ys: the gold standard labels (integer code of each word's POS tag)
    - y_pred: the predicted labels
    """
    assert classifier in ['majority_vote', 'stratified', '10-NN']

    pos_ints = {'v': 0, 'n': 1, 'adj': 2, 'fn': 3}

    words = sorted(embeddings_dict)
    xs = [embeddings_dict[word] for word in words]
    # Look up the integer code of each word's POS tag.
    ys = [pos_ints[get_pos_tag(word)] for word in words]

    # Build only the requested model.
    factories = {
        'majority_vote': lambda: DummyClassifier(strategy='most_frequent',
                                                 random_state=0),
        'stratified': lambda: DummyClassifier(strategy='stratified',
                                              random_state=0),
        '10-NN': lambda: KNeighborsClassifier(n_neighbors=10,
                                              algorithm='ball_tree'),
    }
    clf = factories[classifier]()

    clf.fit(xs, ys)
    y_pred = clf.predict(xs)
    return xs, ys, y_pred
# Train and test data split from training data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # print(X_train.shape, y_train.shape) # print(X_test.shape, y_test.shape) from sklearn.dummy import DummyClassifier # create model model_dummy = DummyClassifier(strategy='most_frequent', random_state=0) # train model model_dummy.fit(X_train, y_train) # print(f'Accuracy for baseline model : {model_dummy.score(X_test, y_test)}') # performance metrics from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score # print(f'Accuracy for baseline model : {accuracy_score(y_test, model_dummy.predict(X_test) )}') # print(f'Accuracy for baseline model : {confusion_matrix(y_test, model_dummy.predict(X_test) )}') # print(f'Accuracy for baseline model : {precision_score(y_test, model_dummy.predict(X_test) )}') # print(f'Accuracy for baseline model : {recall_score(y_test, model_dummy.predict(X_test) )}') # import logistic regression from sklearn from sklearn.linear_model import LogisticRegression # create model
def get_dummy_classifier(self):
    """
    Return the result of fitting a default ``DummyClassifier`` on
    ``self.x_train`` and ``self.y_train``.
    """
    fitted = DummyClassifier().fit(self.x_train, self.y_train)
    return fitted
y_temp = y_temp.to_numpy() y = [] for i in range(len(y_temp)): if np.array_equal(y_temp[i], np.array([0, 0])): y.append(0) elif np.array_equal(y_temp[i], np.array([1, 0])): y.append(1) elif np.array_equal(y_temp[i], np.array([0, 1])): y.append(2) elif np.array_equal(y_temp[i], np.array([1, 1])): y.append(3) y = np.array(y) model = DummyClassifier(strategy="most_frequent") model.fit(X, y) y_pred = model.predict(X) accuracy1 = accuracy_score(y, y_pred) print('Base Accuracy', accuracy1, file=f) model = DummyClassifier(strategy="stratified") model.fit(X, y) y_pred = model.predict(X) accuracy2 = accuracy_score(y, y_pred) print('Stratified Class Base Accuracy', accuracy2, file=f) conf_matrix_list_of_arrays = [] scores = [] for i in range(10): for fold_ind, (train_index, test_index) in enumerate( stratified_group_k_fold(X, y, ids, k=8)):
def _print_ablation_scores(clf, feature_name, X_train_mod, X_test_mod, y_train, y_test):
    """Refit ``clf`` on reduced feature matrices and print accuracy and F1."""
    clf.fit(X_train_mod, y_train)
    y_train_pred = clf.predict(X_train_mod)
    y_test_pred = clf.predict(X_test_mod)
    print("%s:" % (feature_name,))
    print("\tAccuracy: ")
    print("\t\tTrain: %.6f" % (clf.score(X_train_mod, y_train)))
    print("\t\tTest: %.6f" % (clf.score(X_test_mod, y_test)))
    print("\tF1 Score:")
    print("\t\tTrain: %.6f" % (performance(
        y_train, y_train_pred, metric="f1_score")))
    print("\t\tTest: %.6f" % (performance(
        y_test, y_test_pred, metric="f1_score")))


def main():
    """Train RBF and linear SVMs, compare to a baseline, ablate features.

    Steps, all reported on stdout:

    1. Load the data, set the sample weights and make a stratified 80/20
       train/test split.
    2. Fit a weighted most-frequent ``DummyClassifier`` and an RBF ``SVC``
       with fixed ``C``/``gamma``; print the metrics in ``metric_list``,
       the confusion matrix and train/test accuracies.
    3. Fit a linear ``SVC`` and rank features by its first coefficient row.
    4. Refit the RBF ``SVC`` with each of the ten top-ranked features
       removed one at a time.
    5. Refit the RBF ``SVC`` with the ten bottom-ranked features removed
       cumulatively (first one, then the first two, and so on).
    """
    data = util.load_data(filenameX, filenamey, header=1)
    X, y = data.X, data.y
    print(data.Xnames)
    set_data_weights(y, data)

    metric_list = [
        "accuracy", "f1_score", "auroc", "precision", "sensitivity",
        "specificity"
    ]

    # The parameter search is not run here; fixed values are used below.
    max_f1_linear = 0
    print("max_f1_linear %s" % (max_f1_linear,))
    C, gamma = (10.0, 0.1)

    X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
        X, y, data.weights, test_size=0.2, stratify=y)

    dumclf = DummyClassifier(strategy="most_frequent")
    dumclf.fit(X_train, y_train, sample_weight=weight_train)

    rbfclf = SVC(C=C, gamma=gamma, class_weight="balanced")
    rbfclf.fit(X_train, y_train)
    y_pred = rbfclf.predict(X_test)

    # compute classifier performance
    for metric in metric_list:
        print("%s: %s" % (metric, performance(y_test, y_pred, metric)))

    svc_train_score = rbfclf.score(X_train, y_train)
    svc_test_score = rbfclf.score(X_test, y_test)
    dummy_train_score = dumclf.score(X_train, y_train, weight_train)
    dummy_test_score = dumclf.score(X_test, y_test, weight_test)

    print(metrics.confusion_matrix(y_test, y_pred, labels=[1, 0],
                                   sample_weight=weight_test))
    print("RBFSVC train accuracy: %.6f" % (svc_train_score))
    print("Dummy train accuracy: %.6f" % (dummy_train_score))
    print("RBFSVC test accuracy: %.6f" % (svc_test_score))
    print("Dummy test accuracy: %.6f" % (dummy_test_score))

    max_f1_linear = 10.0
    linclf = SVC(C=max_f1_linear, kernel="linear")
    linclf.fit(X_train, y_train)
    print("Linear SVC train accuracy: %.6f" % (linclf.score(X_train, y_train)))
    print("Linear SVC test accuracy: %.6f" % (linclf.score(X_test, y_test)))
    y_pred = linclf.predict(X_test)
    print("Linear SVC test F1 score: %.6f" % (performance(
        y_test, y_pred, metric="f1_score")))

    print("Top ten features (probably) in order from largest to smallest:")
    indices = linclf.coef_[0].argsort()[-10:][::-1]
    print([data.Xnames[i] for i in indices])

    print("RBF F1/accuracy score with each of the top ten features removed.")
    for i in indices:
        _print_ablation_scores(
            rbfclf, data.Xnames[i],
            np.delete(X_train, i, 1), np.delete(X_test, i, 1),
            y_train, y_test)

    print("RBF Least predictive ten features (probably) in order from smallest to largest:")
    indices = linclf.coef_[0].argsort()[:10]
    print([data.Xnames[i] for i in indices])

    print("F1/accuracy score with each of the bottom ten features removed cumulatively")
    for position, i in enumerate(indices):
        # Slice by loop position, not by the feature index `i`, so that
        # step k removes exactly the first k + 1 ranked features.
        removed = indices[:position + 1]
        _print_ablation_scores(
            rbfclf, data.Xnames[i],
            np.delete(X_train, removed, 1), np.delete(X_test, removed, 1),
            y_train, y_test)
# Slice features, labels and ids into the train and test parts given by
# train_idx / test_idx.
train_X, test_X = X[train_idx], X[test_idx]
train_Y, test_Y = Y[train_idx], Y[test_idx]
train_m_id, test_m_id = m_id[train_idx], m_id[test_idx]
# Keep the training ids as a set.
train_m_id = set(train_m_id.tolist())
print len(train_X), len(test_X)

# Label frequencies and number of distinct labels per part.
train_label_freq = Counter(train_Y)
print train_label_freq, len(train_label_freq)
test_label_freq = Counter(test_Y)
print test_label_freq, len(test_label_freq)

# Majority classifier: always predicts the most frequent training label.
maj_clf = DummyClassifier(strategy='most_frequent')
maj_clf.fit(train_X, train_Y)
maj_pred_Y = maj_clf.predict(test_X)
# Every prediction is the same label, so the first one is the majority label.
maj_label = maj_pred_Y[0]
print pred_eval(test_Y, maj_pred_Y), maj_label
# print Counter(maj_pred_Y)

# Linear SVM with L2 penalty, one-vs-rest.
svc_clf = LinearSVC(penalty='l2', C=10.0, dual=False, multi_class='ovr')
svc_clf.fit(train_X, train_Y)
svc_pred_Y = svc_clf.predict(test_X)
print pred_eval(test_Y, svc_pred_Y)
# Frequencies of the labels predicted by the SVM.
svc_label_freq = Counter(svc_pred_Y)
# 5-fold cross validation of the current classifier on the full data set.
scores = cross_val_score(clf, X, y, cv=5)
end = time.time()
# Record hold-out accuracy and mean cross-validation score for this model.
accuracy_all.append(accuracy_score(prediction,test_labels))
cvs_all.append(np.mean(scores))
#print("1-GaussianNB accuracy:",clf.score(test,test_labels))
print("1-GaussianNB accuracy :",accuracy_score(prediction,test_labels))
# The reported interval is plus/minus two standard deviations of the fold scores.
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))
print("Execution time: {0:.5} seconds \n".format(end-start))

# 2-Initialize our dummy classifier
start = time.time()
dummy=DummyClassifier()
# Train our classifier
dummy.fit(train, train_labels)
prediction = dummy.predict(test)
# 5-fold cross validation of the dummy classifier on the full data set.
scores = cross_val_score(dummy, X, y, cv=5)
end = time.time()
# Record hold-out accuracy and mean cross-validation score for this model.
accuracy_all.append(accuracy_score(prediction,test_labels))
cvs_all.append(np.mean(scores))
#print("2-Dummy accuracy:",dummy.score(test,test_labels))
print("2-Dummy Accuracy: {0:.2%}".format(accuracy_score(prediction,test_labels)))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))
print("Execution time: {0:.5} seconds \n".format(end-start))

# 3-Initialize our KNeighbors classifier
start = time.time()
clf = neighbors.KNeighborsClassifier()
y_temp=y_temp.to_numpy()
# Encode each two-element target row as one class id:
# [0, 0] -> 0, [1, 0] -> 1, [0, 1] -> 2, [1, 1] -> 3.
# NOTE(review): a row matching none of the four patterns is skipped, which
# would leave y shorter than y_temp - confirm this cannot happen.
y=[]
for i in range(len(y_temp)):
    if np.array_equal(y_temp[i],np.array([0,0])):
        y.append(0)
    elif np.array_equal(y_temp[i],np.array([1,0])):
        y.append(1)
    elif np.array_equal(y_temp[i],np.array([0,1])):
        y.append(2)
    elif np.array_equal(y_temp[i],np.array([1,1])):
        y.append(3)
y=np.array(y)

# Baseline 1: always predict the most frequent class. It is fitted and
# scored on the same data.
model = DummyClassifier(strategy="most_frequent")
model.fit(X, y)
y_pred = model.predict(X)
accuracy1 = accuracy_score(y, y_pred)
print('Majority Class Base Accuracy',accuracy1,file=f)

# Baseline 2: stratified strategy, again fitted and scored on the same data.
model = DummyClassifier(strategy="stratified")
model.fit(X, y)
y_pred = model.predict(X)
accuracy2 = accuracy_score(y, y_pred)
print('Stratified Class Base Accuracy',accuracy2,file=f)

# Resampling utilities from imbalanced-learn for the following steps.
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
# model_mem = InMemoryModel(model.predict_proba, examples=test_data) # interpreter.feature_importance.plot_feature_importance(model_mem, ascending=False, ax=ax) # ax.set_title(f"{title} on fold {fold}") # print("\n") # modelno += 1 # fold += 1 # plt.tight_layout() for train_index, test_index in kFold.split(yeastAttrib): print(f"------------" f"Fold {fold}") modelno = 1 train_data, train_target = yeastAttrib[train_index], yeastTarget[ train_index] test_data, test_target = yeastAttrib[test_index], yeastTarget[ test_index] dummy.fit(train_data, train_target) prediction = dummy.predict(test_data) print("Dummy prediction") print(classification_report(test_target, prediction)) for model, title in zip(models, titles): clf = model.fit(train_data, train_target) prediction = clf.predict(test_data) print(f"{title}") print(classification_report(test_target, prediction)) print( f"Confusion Matrix: \n {confusion_matrix(test_target, prediction)}" ) # ax = axs[modelno - 1, fold - 1] interpreter = Interpretation(test_data, feature_names=featureNames[1:9])
kfold = KFold(10, True, 1) fold_number = 1 for train, test in kfold.split(data): print "........... Fold %d ..........." % fold_number training_corpus = build_corpus(train) train_labels = build_labels(train) test_corpus = build_corpus(test) test_labels = build_labels(test) #Generating dummy accuracies for each fold. dummy_clf.fit(training_corpus, train_labels) dummy_accuracies.append(dummy_clf.score(test_corpus, test_labels) * 100) vectorizer = CountVectorizer(ngram_range=args.ngrange, stop_words=stop_words, binary=args.onehot, analyzer='word', token_pattern=r'\b[^\W\d]+\b') vectors = vectorizer.fit_transform(training_corpus).toarray() sums = vectors.sum(axis=0) j = 0 print "Cleaning up the TDM according to the supplied cutoff value:" for i in tqdm(vectorizer.vocabulary_.items()):
def ModelRandomGuessing(hog_features, labels, pp):
    """Fit a default ``DummyClassifier`` baseline and persist it together with ``pp``.

    The fitted classifier and ``pp`` are dumped as a ``(clf, pp)`` tuple to
    ``model0randomguessing.pkl`` in the current working directory
    (joblib, ``compress=3``).

    Args:
        hog_features: Feature matrix passed to ``fit``.
        labels: Target labels passed to ``fit``.
        pp: Object stored next to the classifier; it is not used for fitting.

    Returns:
        A ``(model_name, classifier)`` tuple whose name is ``"RandomGuessing"``.
    """
    baseline = DummyClassifier()
    baseline.fit(hog_features, labels)
    joblib.dump((baseline, pp), "model0randomguessing.pkl", compress=3)
    return ("RandomGuessing", baseline)
def train_rf(self, features, labels):
    """Cross-validate a random-forest regressor and a second-stage classifier.

    For every outer fold the regressor ``self.model`` is fitted and scored
    against a ``DummyRegressor``.  A Theil-Sen regressor (``self.lr1``) is then
    fitted on the second label column versus the forest prediction, and test
    samples whose prediction lies within 0.2 of that fit are flagged (``dy``).
    An inner K-fold trains ``self.model2`` to predict that flag and reports
    precision against a ``DummyClassifier``.  Scatter plots of truth versus
    prediction are shown per outer fold.

    Args:
        features: Indexable feature matrix; ``len(features[0])`` fixes the
            forests' ``max_depth``.
        labels: Targets; columns 0 and 1 are accessed via ``transpose()``.

    Returns:
        None.  Results are printed and plotted; ``self.model``, ``self.model2``,
        ``self.lr0`` and ``self.lr1`` are (re)assigned.
    """
    # NOTE(review): the source reached review with its indentation flattened;
    # the loop nesting below is a reconstruction — confirm against the
    # original file before merging.
    print('Training random forest ...')
    # scikit-learn validates max_depth as an integer, np.ceil returns a float.
    max_depth = int(np.ceil(len(features[0]) / 5))
    self.model = RandomForestRegressor(n_estimators=100,
                                       max_features='sqrt',
                                       max_depth=max_depth,
                                       min_samples_leaf=3,
                                       n_jobs=-1)
    self.model2 = RandomForestClassifier(n_estimators=100,
                                         max_features='sqrt',
                                         max_depth=max_depth,
                                         min_samples_leaf=3,
                                         n_jobs=-1)
    self.lr0 = linear_model.TheilSenRegressor()
    self.lr1 = linear_model.TheilSenRegressor()
    reg_dummy = DummyRegressor()
    clf_dummy = DummyClassifier()
    kfold = KFold(n_splits=self.kfold, shuffle=True)
    kfold2 = KFold(n_splits=self.kfold, shuffle=True)
    features, labels = shuffle(features, labels)

    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set(style='whitegrid', context='paper')

    for ifold, (train, test) in enumerate(kfold.split(labels)):
        # Stage 1: regression forest versus the dummy regressor.
        self.model.fit(features[train], labels[train])
        score_train = self.model.score(features[train], labels[train])
        score_test = self.model.score(features[test], labels[test])
        reg_dummy.fit(features[train], labels[train])
        score_dummy = reg_dummy.score(features[test], labels[test])
        print('Fold %d: %.4f / %.4f (%.4f)' %
              (ifold, score_test, score_train, score_dummy))

        labels_t = labels.transpose()
        y_pred = self.model.predict(features)
        y_pred_t = y_pred.transpose()
        # self.lr0.fit(labels_t[0][train].reshape(-1, 1), y_pred_t[0][train])
        self.lr1.fit(labels_t[1][train].reshape(-1, 1), y_pred_t[1][train])
        y_lr = self.lr1.predict(labels_t[1][test].reshape(-1, 1))
        # True where the forest prediction is within 0.2 of the linear fit.
        dy = np.abs(y_pred_t[1][test] - y_lr) < 0.2
        print('\t%d / %d' % (np.sum(dy), np.sum(1 - dy)))

        # Stage 2: classify the "close to the fit" flag on the outer test split.
        for jfold, (train2, test2) in enumerate(kfold2.split(dy)):
            self.model2.fit(features[test[train2]], dy[train2])
            y_pred2 = self.model2.predict(features[test[test2]])
            score_train2 = precision_score(
                dy[train2],
                self.model2.predict(features[test[train2]]),
                average='binary')
            score_test2 = precision_score(dy[test2], y_pred2, average='binary')
            clf_dummy.fit(features[test[train2]], dy[train2])
            score_dummy = precision_score(
                dy[test2],
                clf_dummy.predict(features[test[test2]]),
                average='binary')
            print('\tFold %d: %.4f / %.4f (%.4f)' %
                  (jfold, score_test2, score_train2, score_dummy))

            # Regression score restricted to samples the classifier accepted.
            score_final_train = self.model.score(features[test[train2]],
                                                 labels[test[train2]])
            score_final_test = self.model.score(
                features[test[test2[y_pred2]]],
                labels[test[test2[y_pred2]]])
            print('\tFinal: %.4f / %.4f' %
                  (score_final_test, score_final_train))

        fig, axs = plt.subplots(2, 2)
        train_truth = labels[train].transpose()
        train_pred = self.model.predict(features[train]).transpose()
        test_truth = labels[test].transpose()
        test_pred = y_pred[test].transpose()
        sns.scatterplot(x=train_truth[0], y=train_pred[0], ax=axs[0, 0])
        sns.scatterplot(x=train_truth[1], y=train_pred[1], ax=axs[0, 1])
        sns.scatterplot(x=test_truth[0][test2[y_pred2]],
                        y=test_pred[0][test2[y_pred2]],
                        ax=axs[1, 0])
        sns.scatterplot(x=test_truth[1][test2[y_pred2]],
                        y=test_pred[1][test2[y_pred2]],
                        ax=axs[1, 1])
        plt.draw()
        plt.show()
    return
class ModelDummyClassifier:
    """Baseline model wrapping ``DummyClassifier(strategy='prior')``.

    Exposes a train / predict workflow driven by ``self.vardict``, which
    names the target column and the feature columns grouped by kind.
    ``vardict["into_model"]`` is only populated by
    ``preprocessing_training``; ``train`` and ``predict`` both read it.
    """

    def __init__(self):
        today = datetime.datetime.today().strftime("%Y%m%d")
        self.version = 'dummy_classifier__' + today
        self.global_config = dict()
        self.vardict = self.get_model_vardict()
        self.model_config = {'strategy': 'prior'}
        self.model = DummyClassifier(**self.model_config)
        self.time_for_training = 0

    def get_model_vardict(self):
        """Return the column-name dictionary (target plus features by kind)."""
        return {
            # Target
            "target": "result",
            # Numerical
            "numerical": [
                # "nb_characters_german",
                # "nb_characters_english",
                "levenshtein_distance_german_english",
                # "previous_score",
                # "previous_question_time",
                "difficulty_category",
            ],
            # Difference in time
            "diff_time": [
                # "days_since_last_occurrence_same_language",
                "days_since_first_occur_any_language",
            ],
            # Boolean
            "boolean": [
                # "previous_result",
                "is_noun",
            ],
            # Categorical
            "categorical": [
                # "language_asked",
                "previous_language_asked",
            ],
        }

    def preprocessing_training(self, dataset):
        """Register the model's input columns and return ``dataset`` unchanged."""
        feature_groups = ('numerical', 'diff_time', 'boolean', 'categorical')
        selected = []
        for group in feature_groups:
            selected = selected + self.vardict[group]
        self.vardict["into_model"] = selected
        return dataset

    def train(self, dataset):
        """Fit the dummy model and record the wall-clock fit duration in seconds."""
        features = dataset[self.vardict["into_model"]]
        target = dataset[self.vardict["target"]]

        started = time.time()
        self.model.fit(features, target)
        self.time_for_training = time.time() - started

    def preprocessing_inference(self, dataset):
        """Return ``dataset`` unchanged (no inference-time preprocessing)."""
        return dataset

    def predict(self, dataset, target_present=False):
        """Return a copy of the feature columns with prediction columns added.

        Adds ``y_pred`` (predicted class) and ``y_proba`` (second column of
        ``predict_proba``); when ``target_present`` is true, ``y_true`` is
        copied over from the target column.
        """
        features = dataset[self.vardict["into_model"]].copy()
        output = features.copy()

        output["y_pred"] = self.model.predict(features)
        output["y_proba"] = [
            row[1] for row in self.model.predict_proba(features)
        ]

        if target_present:
            output["y_true"] = dataset[self.vardict["target"]].copy()

        return output
# We will look at confusion matrices of the different predictions

# %% [markdown]
# |                   |Predicted Negative|Predicted Positive|
# |-------------------|------------------|------------------|
# |**Actual Negative**|True Negative     |False Positive    |
# |**Actual Positive**|False Negative    |True Positive     |

# %% [markdown]
# ### Dummy Classifier

# %% [markdown]
# **Fit and predict**

# %%
dummy = DummyClassifier()

## fit on the training data, then predict the test data
dummy_test_pred = dummy.fit(X_train, y_train).predict(X_test)

## refit the same estimator on the scaled training data, then predict the scaled test data
dummy_test_pred_scale = dummy.fit(X_train_scale, y_train).predict(X_test_scale)

# %% [markdown]
# **Confusion matrix**

# %%
dummy_matrix = confusion_matrix(y_test, dummy_test_pred)
def _train_local_classifier(self, X, y, node_id):
    """Fit and attach the local classifier for one node of the class hierarchy.

    The training rows come from the node's own ``"X"`` attribute in
    ``self.graph_`` (the ``X`` argument is overwritten and therefore not
    used); the targets in ``y`` are rolled up relative to ``node_id``.

    Outcomes:
      * leaf node while ``self.algorithm == "lcpn"``: nothing is trained;
      * no rows materialised: a warning is logged, nothing is trained;
      * exactly one distinct target: a constant ``DummyClassifier`` is fitted;
      * otherwise: the estimator from ``self._base_estimator_for`` is fitted.

    The fitted estimator is stored under ``CLASSIFIER`` on the node.
    """
    is_leaf = self.graph_.out_degree(node_id) == 0
    if is_leaf and self.algorithm == "lcpn":
        # Leaf nodes do not get a classifier assigned in LCPN algorithm mode.
        self.logger.debug(
            "_train_local_classifier() - skipping leaf node %s when algorithm is 'lcpn'",
            node_id,
        )
        return

    X = self.graph_.node[node_id]["X"]
    nonzero_rows = nnz_rows_ix(X)
    X_ = X[nonzero_rows, :]

    rolled_up = rollup_nodes(
        graph=self.graph_,
        source=node_id,
        targets=[y[row] for row in nonzero_rows],
    )

    if not self.is_tree_:
        # Class hierarchy graph is a DAG
        X_, y_ = apply_rollup_Xy(X_, rolled_up)
    else:
        y_ = flatten_list(rolled_up)

    num_targets = len(np.unique(y_))

    self.logger.debug(
        "_train_local_classifier() - Training local classifier for node: %s, X_.shape: %s, len(y): %s, n_targets: %s",  # noqa:E501
        node_id,
        X_.shape,
        len(y_),
        num_targets,
    )

    if X_.shape[0] == 0:
        # No training data could be materialized for current node
        # TODO: support a 'strict' mode flag to explicitly enable/disable fallback logic here?
        self.logger.warning(
            "_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node %s",  # noqa:E501
            node_id,
        )
        return

    if num_targets != 1:
        estimator = self._base_estimator_for(node_id)
    else:
        # Training data could be materialized for only a single target at current node
        # TODO: support a 'strict' mode flag to explicitly enable/disable fallback logic here?
        only_target = y_[0]
        self.logger.debug(
            "_train_local_classifier() - only a single target (child node) available to train classifier for node %s, Will trivially predict %s",  # noqa:E501
            node_id,
            only_target,
        )
        estimator = DummyClassifier(strategy="constant", constant=only_target)

    estimator.fit(X=X_, y=y_)
    self.graph_.node[node_id][CLASSIFIER] = estimator
def _print_test_report(fitted_model, Xtest, ytest):
    """Print the confusion matrix and classification report on the test split."""
    predicted = fitted_model.predict(Xtest)
    print(confusion_matrix(ytest, predicted))
    print(classification_report(ytest, predicted))


def optimal_models_performance(X, y, optimal_k, optimal_C, y_label):
    '''
    Grid search for optimal nlp classifier models (svm and knn).
    Plot ROC curves, generate confusion matrices and classification report.

    Args:
        X: Raw documents fed to ``CountVectorizer``.
        y: Targets. The ROC section reads column 1 of ``predict_proba``, so a
            binary target is assumed — TODO confirm with callers.
        optimal_k: ``n_neighbors`` for the k-NN pipeline.
        optimal_C: Regularisation ``C`` for ``LinearSVC``; ``None`` keeps the
            scikit-learn default.
        y_label: Name of the target, used in the plot title and file name.

    Side effects:
        Prints reports, saves the figure to ``./roc_<y_label>`` and shows it.
    '''
    #1. Split the data
    testSizeX = 0.33  #67:33 split
    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y, test_size=testSizeX, random_state=42)

    # Load the stop-word list once and share it between both pipelines.
    english_stopwords = nltk.corpus.stopwords.words('english')

    #SVM
    # Previously optimal_C was accepted but silently ignored.
    svm_clf = LinearSVC() if optimal_C is None else LinearSVC(C=optimal_C)
    svm_model = Pipeline([('vect', CountVectorizer(stop_words=english_stopwords)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', svm_clf)])

    #Knn
    knn_model = Pipeline([('vect', CountVectorizer(stop_words=english_stopwords)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', KNeighborsClassifier(n_neighbors=optimal_k,
                                                       weights='uniform'))])

    #Dummy classifier (fitted once; it is reused for the report and the ROC curve)
    dummy_model = DummyClassifier(strategy='most_frequent').fit(Xtrain, ytrain)

    #Grid search
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False)}

    #************************************************
    #Svm: Train svm model
    svm_gs = GridSearchCV(svm_model, parameters, n_jobs=-1)

    #Performance - best performing
    print('*********************************************')
    print('====== \n Results for svm grid search model:')
    svm_gs = svm_gs.fit(Xtrain, ytrain)
    print(svm_gs.best_params_)
    _print_test_report(svm_gs, Xtest, ytest)

    #************************************************
    #Train knn model
    knn_gs = GridSearchCV(knn_model, parameters, n_jobs=-1)

    #Performance - best performing
    print('*********************************************')
    print('====== \n Results for knn grid search model:')
    knn_gs = knn_gs.fit(Xtrain, ytrain)
    print(knn_gs.best_params_)
    _print_test_report(knn_gs, Xtest, ytest)

    #**********************************************
    #Dummy model
    print('*********************************************')
    print('====== \n Results for dummy model:')
    _print_test_report(dummy_model, Xtest, ytest)

    #**********************************************
    #ROC plots
    plt.figure()

    #svm model (LinearSVC has no predict_proba, so decision scores are used)
    scores = svm_gs.decision_function(Xtest)
    fpr, tpr, _ = roc_curve(ytest, scores)
    plt.plot(fpr, tpr, label='SVM')
    print('SVM AUC = {}'.format(auc(fpr, tpr)))

    #knn model
    scores = knn_gs.predict_proba(Xtest)[:, 1]
    fpr, tpr, _ = roc_curve(ytest, scores)
    plt.plot(fpr, tpr, color='r', label='knn')
    print('knn AUC = {}'.format(auc(fpr, tpr)))

    #Baseline Model
    scores_bl = dummy_model.predict_proba(Xtest)
    fpr, tpr, _ = roc_curve(ytest, scores_bl[:, 1])
    plt.plot(fpr, tpr, color='orange', label='baseline model')
    print('AUC = {}'.format(auc(fpr, tpr)))

    #Random Choice
    plt.plot([0, 1], [0, 1], 'g--')

    #Labels
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curve. X = review text, y = {}'.format(y_label))
    # Legend entries follow the plotting order above.
    plt.legend(['Svm', 'Knn', 'Baseline (most freq)', 'Random Classifier'])
    plt.savefig('./roc_{}'.format(y_label))
    plt.show()
def eval_zero_rule(args):
    """Log zero-rule (dummy classifier) metrics for every class of the test set.

    For each strategy in ``uniform``, ``stratified``, ``most_frequent`` and
    ``constant`` (always predicting 1) a ``DummyClassifier`` is fitted per
    class column, predictions are drawn 10 times, and accuracy, precision,
    recall and F1 for label 1 are averaged.  The first class, the last class
    and the mean over all classes are logged.

    Args:
        args: Object with ``samples_dir``, ``class_count``, ``sent_count``
            and ``split_dir`` attributes.
    """
    samples_dir_path = args.samples_dir
    class_count = args.class_count
    sent_count = args.sent_count
    split_dir_path = args.split_dir

    #
    # Check that (input) POWER Samples Directory exists
    #
    logging.info('Check that (input) POWER Samples Directory exists ...')
    samples_dir = SamplesDir(Path(samples_dir_path))
    samples_dir.check()

    #
    # Check that (input) POWER Split Directory exists
    #
    logging.info('Check that (input) POWER Split Directory exists ...')
    split_dir = SplitDir(Path(split_dir_path))
    split_dir.check()

    #
    # Load entity/relation labels (loaded, but not used further below)
    #
    logging.info('Load entity/relation labels ...')
    split_dir.entities_tsv.load()
    split_dir.relations_tsv.load()

    #
    # Load datasets
    #
    logging.info('Load test dataset ...')
    test_set = samples_dir.test_samples_tsv.load(class_count, sent_count)

    #
    # Calc class frequencies
    #
    logging.info('Calc class frequencies ...')
    test_classes_stack = list(zip(*test_set))[2]
    class_matrix = np.array(test_classes_stack)
    test_freqs = class_matrix.mean(axis=0)

    #
    # Evaluate
    #
    logging.info(f'test_freqs = {test_freqs}')

    strategies = ('uniform', 'stratified', 'most_frequent', 'constant')
    for strategy in strategies:
        logging.info(strategy)

        per_class_means = []
        # One column of the transposed matrix per class.
        for _, gt in tqdm(enumerate(class_matrix.T)):
            if strategy != 'constant':
                classifier = DummyClassifier(strategy=strategy)
                classifier.fit(gt, gt)
            else:
                classifier = DummyClassifier(strategy='constant', constant=1)
                classifier.fit([0, 1], [0, 1])

            repeated_runs = []
            for _ in range(10):
                pred = classifier.predict(gt)
                acc = accuracy_score(gt, pred)
                prec, recall, f1, _ = precision_recall_fscore_support(
                    gt, pred, labels=[1], zero_division=1)
                repeated_runs.append((acc, prec[0], recall[0], f1[0]))

            per_class_means.append(np.mean(repeated_runs, axis=0))

        logging.info(per_class_means[0])
        logging.info(per_class_means[-1])
        logging.info(np.mean(per_class_means, axis=0))
def test_string_labels():
    """``most_frequent`` must predict the modal label when labels are strings."""
    n_samples = 5
    features = [[0]] * n_samples
    cities = ["paris", "paris", "tokyo", "amsterdam", "berlin"]

    baseline = DummyClassifier(strategy="most_frequent")
    baseline.fit(features, cities)

    expected = ["paris"] * n_samples
    assert_array_equal(baseline.predict(features), expected)
print(npz_file.keys()) with np.load('mnist-6k.npz', allow_pickle=False) as npz_file: X = npz_file['data'] y = npz_file['labels'] X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, test_size=1 / 6, random_state=0) print(X_tr.shape, X_te.shape, y_tr.shape, y_te.shape) # Dummy classifier dummy = DummyClassifier(strategy='most_frequent') dummy.fit(X_tr, y_tr) # Accuracy on test set accuracy = dummy.score(X_te, y_te) print('Baseline accuracy: {:.3f}'.format(accuracy)) # k-NN classifier scaler = StandardScaler() # grid search for optimal k: k_values = np.arange(1, 50, 5) test_curve = [] for k in k_values:
# Hold-out split seeded by ``i`` (``i`` is defined outside this fragment).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.67, random_state=i)
# Bag-of-words features: vocabulary is learned on the training split only and
# then applied to the test split.
vect = CountVectorizer(stop_words='english', analyzer="word", min_df=2, max_df=0.8)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
# NOTE(review): newer scikit-learn releases expose get_feature_names_out()
# instead of get_feature_names() — confirm the pinned version.
feat_dtm = vect.get_feature_names()
# Dummy baseline accuracy for this split, collected in arr_Accu.
clf = DummyClassifier()
clf.fit(X_train_dtm, y_train)
y_pred = clf.predict(X_test_dtm)
accuracy = metrics.accuracy_score(y_test, y_pred)
#print(accuracy)
arr_Accu.append(accuracy)
#Vectorize
# Same vectoriser settings and the same split as above are applied again here.
vect = CountVectorizer(stop_words='english', analyzer="word", min_df=2, max_df=0.8)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
feat_dtm = vect.get_feature_names()
# Each line of the episode's vector file is a Python literal; the last
# element of every parsed row is the label.
with open(VECTORS_FOLDER + (EP_NUMBER_FORMAT % ep_num), 'r') as file:
    vector_strs = file.read().splitlines()
test_data_vec.extend(ast.literal_eval(line) for line in vector_strs)

test_data = np.array(test_data_vec)
test_data_vecs = test_data[:, :-1]
test_data_labels = test_data[:, -1]

# set up logistic regression classifier and fit to data
log_reg_clf = LogisticRegression(
    solver='newton-cg',
    max_iter=50,
    random_state=0,
    multi_class='multinomial',
    verbose=0,
)
log_reg_clf = log_reg_clf.fit(train_data_vecs, train_data_labels)
log_reg_accuracy = log_reg_clf.score(test_data_vecs, test_data_labels)
print('Logistic Regression Accuracy: ' + str(log_reg_accuracy))

ridge_clf = RidgeClassifier(solver='auto')
ridge_clf.fit(train_data_vecs, train_data_labels)
ridge_accuracy = ridge_clf.score(test_data_vecs, test_data_labels)
print('Ridge Regression Accuracy: ' + str(ridge_accuracy))

# Chance-level reference: predictions drawn from the training label distribution.
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(train_data_vecs, train_data_labels)
dummy_accuracy = dummy_clf.score(test_data_vecs, test_data_labels)
print('Stratified Random Accuracy: ' + str(dummy_accuracy))
stratify=y) modelo = LinearSVC(random_state=SEED) print("Treinaremos com %d elementos e testaremos com %d elementos" % (len(treino_x), len(teste_x))) modelo.fit(treino_x, treino_y) previsoes = modelo.predict(teste_x) taxa_de_acerto = accuracy_score(teste_y, previsoes) print("Taxa de acerto %.2f%%" % (taxa_de_acerto * 100)) dummy = DummyClassifier() dummy.fit(treino_x, treino_y) previsoes = dummy.predict(teste_x) acuracia = accuracy_score(teste_y, previsoes) print("A acurácia do algoritmo Dummy foi de %.2f%%" % (acuracia * 100)) dummy = DummyClassifier(random_state=SEED) dummy.fit(treino_x, treino_y) acuracia = dummy.score(teste_x, teste_y) print("A acurácia do algoritmo Dummy foi de %.2f%%" % (acuracia * 100)) SEED = 8
random_state=0) #Feature Scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() x_train = sc.fit_transform(x_train) x_test = sc.transform(x_test) """#Handling the imbalance dataset from imblearn.combine import SMOTETomek smk= SMOTETomek() x_res,y_res = smk.fit_sample(x,y)""" #Building Model #K_Nearest reg = DummyClassifier() reg.fit(x_train, y_train) y_pred = reg.predict(x_test) #confusion matrix from sklearn.metrics import confusion_matrix con = confusion_matrix(y_test, y_pred) print(con) #checking Accuracy accuracy = accuracy_score(y_test, y_pred) print("Accuracy: %.2f%%" % (accuracy * 100.0)) #Classification report from sklearn.metrics import classification_report print(classification_report(y_test, y_pred))
def run_classifier(feature_path, labelled_path):
    """Compare a random forest with a most-frequent baseline, then run the custom CV.

    Steps:
      1. Load features and labels from the two CSV files.
      2. Cross-validate both models with a 3-split ``ShuffleSplit`` and print
         mean / standard deviation of the scores.
      3. Shuffle the samples, fit both models on the first 400 rows and print
         their scores on the remaining rows.
      4. Reload a filtered feature set through ``prepare_data_for_classifier``
         and call ``run_custom_cross`` several times.

    Args:
        feature_path: CSV file holding the feature table.
        labelled_path: CSV file with an integer-convertible ``label`` column.

    Returns:
        ``(None, None)``; all results are printed.
    """
    training_data = pd.read_csv(feature_path)
    labelled_data = pd.read_csv(labelled_path)

    labels_df = np.array(labelled_data['label'], dtype=int)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same array.
    feature_df = training_data.values

    num_trees = 100
    max_features = int(math.sqrt(feature_df.shape[1]))
    model = RandomForestClassifier(n_estimators=num_trees,
                                   max_features=max_features)
    dummy_model = DummyClassifier(constant=None,
                                  random_state=0,
                                  strategy='most_frequent')

    scores = cross_val_score(model,
                             feature_df,
                             labels_df,
                             cv=ShuffleSplit(n_splits=3,
                                             train_size=0.7,
                                             random_state=0))
    # scores = cross_val_score(model, feature_df , labels_df, cv=10, verbose=1)
    dummy_scores = cross_val_score(dummy_model,
                                   feature_df,
                                   labels_df,
                                   cv=ShuffleSplit(n_splits=3,
                                                   train_size=0.7,
                                                   random_state=0))

    print('randomforest_scores=', np.mean(scores), np.std(scores))
    print('dummy_scores=', np.mean(dummy_scores), np.std(dummy_scores))

    # Shuffle features AND labels with one shared permutation.  Shuffling the
    # feature rows alone (as before) detached every row from its label, so
    # the hold-out scores below were computed on mismatched pairs.
    order = np.random.permutation(feature_df.shape[0])
    feature_df = feature_df[order]
    labels_df = labels_df[order]

    dummy_model = dummy_model.fit(feature_df[:400, :], labels_df[:400])
    model = model.fit(feature_df[:400, :], labels_df[:400])
    # print(model.predict(feature_df[:100,:]))
    print('randomforest', model.score(feature_df[400:, :], labels_df[400:]))
    print('dummy', dummy_model.score(feature_df[400:, :], labels_df[400:]))
    # return scores, model

    ########################
    # MANU ADDED from HERE #
    ########################

    # We set the features we want to use
    # features_to_use = ['var_mgw', 'motif_scores', 'bDNA','compA_d', 'compT_d', 'compG_d', 'compC_d', 'compA_u', 'compT_u', 'compG_u', 'compC_u', 'tfs_D_fw', 'tfs_D_rv', 'tfs_U_fw', 'tfs_U_fw.1', 'intergenetic']
    features_to_use = [
        'bDNA', 'var_mgw', 'motif_scores', 'tfs_D_fw', 'tfs_D_rv', 'tfs_U_fw',
        'tfs_U_fw.1'
    ]
    seqs = ['seq' + str(i) for i in range(21)]
    features_to_use += seqs

    # Load features from files in the format required for scikit, filtering
    # desired features
    features, labels, headers = prepare_data_for_classifier(
        feature_path,
        labelled_path,
        randomize=True,
        only_columns=features_to_use)

    # DIFFERENT WAYS of running cross-validation!
    # Running the custom cross-validation just one time, with feature
    # importance analysis
    run_custom_cross(features, labels, headers, single_run=True)

    # We can also run it in verbose mode (will print all the confusion
    # matrices and scores)
    # NOTE(review): this call is identical to the previous one and passes no
    # verbosity argument — confirm the intended signature of run_custom_cross.
    run_custom_cross(features, labels, headers, single_run=True)

    # Running the custom cross-validation multiple times (using
    # single_run=False). This can be used to calculate average behaviour
    for i in range(10):
        print(run_custom_cross(features, labels, headers, single_run=False))

    return None, None
print(" Tf-idf, Balanced accuracy score = " + str(balanced_accuracy_score(y_test, pred_tfidf_balanced)))
print(" Tf-idf, Accuracy score = " + str(accuracy_score(y_test, pred_tfidf_balanced)))
report_tfidf = classification_report(y_test, pred_tfidf_balanced)

# Recorded results:
# count balanced accuracy: 42
# count accuracy: 43.5
# tf-idf balanced accuracy: 43.5
# tf-idf accuracy: 41.5

''' -------------------- Baseline Models ------------------- '''

# Change ``strategy`` to "stratified" or "uniform" to reproduce the other
# baselines recorded below.  (A stray ``DummyClassifier(strategy='uniform')``
# expression that built and discarded an estimator was removed here.)
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
# The two results below are not stored or printed; they are only visible
# when the lines are evaluated interactively.
dummy_clf.predict(X_train)
dummy_clf.score(X_test, y_test)

# Recorded baseline results:
# Multilabel
# Most frequent: 19.7 %
# Stratified: 14.2 %
# Uniform: 11.3
# Binary 50.46 %

''' ------------------- Feature importance ----------------- '''
y = pd.read_csv(path)
print(y.shape)
print(X.shape)

"""##### KFold cross validation"""

kf = KFold(n_splits=10, shuffle=True, random_state=4)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=4)

for train_index, test_index in kf.split(X, y):
    # KFold.split yields POSITIONAL indices, so select rows by position;
    # label-based .loc only matches when the index is the default 0..n-1.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
# NOTE(review): as reconstructed from the flattened source, the loop body is
# only the two assignments above, so everything below sees the LAST fold's
# split — confirm that is intended.

"""###### Baseline Model"""

dummy_model = DummyClassifier(strategy='most_frequent', random_state=0)
dummy_model.fit(X_train, y_train)
# Predict once and reuse the result for every metric.
baseline_pred = dummy_model.predict(X_test)

print('score for baseline model : {0:.2f}'.format(
    dummy_model.score(X_test, y_test)))
print('accuracy for baseline model : {0:.2f}'.format(
    accuracy_score(y_test, baseline_pred)))
print('confusion matrix for baseline model: \n {0}'.format(
    confusion_matrix(y_test, baseline_pred)))
print('precision for baseline model : {0:.2f}'.format(
    precision_score(y_test, baseline_pred)))
print('recall for baseline model : {0:.2f}'.format(
    recall_score(y_test, baseline_pred)))

"""###### Logistic regression model"""
test_report = classification_report(y_test, y_pred, output_dict=True)

# Score the training split as well; the prediction time is added to the
# previously measured training_time.
start_time = time.time()
y_pred = model.predict(X_train)
print(classification_report(y_train, y_pred))
# print('Accuracy score:', accuracy_score(y_train, y_pred))
trainingtime = (time.time() - start_time + training_time)
train_report = classification_report(y_train, y_pred, output_dict=True)

metric_list = ['precision', 'recall', 'f1-score']
avg_list = ['micro avg', 'macro avg', 'weighted avg']


def _report_cells(report):
    """Tab-terminated 3-decimal cells, metric-major then average-minor."""
    return "".join(
        f"{report[avg][metric]:.3f}\t"
        for metric in metric_list
        for avg in avg_list
    )


# One tab-separated row per split: model tag, size, split name, nine metric
# cells and the elapsed time.
test_str_output = ("DT_sentiment\t" + f"{size}\t" + "test\t"
                   + _report_cells(test_report) + f"{testtime:.4f}")
train_str_output = ("DT_sentiment\t" + f"{size}\t" + "train\t"
                    + _report_cells(train_report) + f"{trainingtime:.4f}")
print(test_str_output.rstrip())
print(train_str_output.rstrip())

# Majority-class baseline evaluated on the test split.
baselineClf = DummyClassifier(strategy="most_frequent")
baseline = baselineClf.fit(X_train, y_train)
y_pred_base = baseline.predict(X_test)
print(classification_report(y_test, y_pred_base))
print('Accuracy score:', accuracy_score(y_test, y_pred_base))
def _holdout_accuracy(estimator, train_vectors, train_labels, test_vectors,
                      test_labels):
    """Fit ``estimator`` on the training split and return its test accuracy."""
    estimator.fit(train_vectors, train_labels)
    prediction = estimator.predict(test_vectors)
    return metrics.accuracy_score(test_labels, prediction)


def create_models(headlines):
    """Train several text classifiers on headlines and report their accuracy.

    A 90/10 split is drawn for each random state 1..19 and the state whose
    split gives the highest ``DummyClassifier`` accuracy is kept.  On that
    split a binary bag-of-words representation (1000 features) is built and
    a series of classifiers is fitted and scored.

    Args:
        headlines: Mapping / DataFrame with ``'headline'`` (text) and
            ``'label'`` columns.

    Returns:
        dict with the keys ``"bayes_accuracy"`` (MultinomialNB) and
        ``"Logistic_regression"``; the other accuracies are only printed.
    """
    headline = headlines['headline']
    label = headlines['label']

    arr_Accu = []
    results = dict()

    for i in range(1, 20):
        headline_train, headline_test, label_train, label_test = train_test_split(
            headline, label, test_size=0.10, random_state=i)

        vect = CountVectorizer(max_features=1000, binary=True)
        headline_train_vector = vect.fit_transform(headline_train)
        headline_test_vector = vect.transform(headline_test)

        # Note: balancing the dataset with SMOTE was tried here, but the
        # accuracy of the experiments further down did not improve.

        arr_Accu.append(
            _holdout_accuracy(DummyClassifier(), headline_train_vector,
                              label_train, headline_test_vector, label_test))

    print(max(arr_Accu))
    # Random states start at 1, list positions at 0.
    max_random_state = arr_Accu.index(max(arr_Accu)) + 1
    print(max_random_state)

    for j, accuracy in enumerate(arr_Accu, start=1):
        print("Random State : ", j, " Accuracy : ", accuracy)

    # Note: a k-fold search (k = 3..14, cross_val_score on the dummy model)
    # for the K giving maximum accuracy was also tried; accuracy there was
    # worse than with the hold-out splits above.

    # Model building starts here, using the best random state.
    headline_train, headline_test, label_train, label_test = train_test_split(
        headline, label, test_size=0.10, random_state=max_random_state)
    print("random state chosen: ")
    print(max_random_state)

    # The training headlines are fit_transform-ed (vocabulary is learned
    # there); the test headlines are only transform-ed.
    vect = CountVectorizer(max_features=1000, binary=True)
    headline_train_vector = vect.fit_transform(headline_train)
    headline_test_vector = vect.transform(headline_test)

    def score(estimator):
        # Evaluate on the test split that was held out above.
        return _holdout_accuracy(estimator, headline_train_vector, label_train,
                                 headline_test_vector, label_test)

    # Multinomial Bayes (stored, not printed)
    results["bayes_accuracy"] = score(MultinomialNB())

    accuracy = score(LogisticRegression())
    print('LogisticRegression Accuracy : ', accuracy)
    results["Logistic_regression"] = accuracy

    # The remaining models are printed only; they are not part of ``results``.
    printed_only = (
        ('DecisionTree', DecisionTreeClassifier(criterion='entropy')),
        ('RandomForestClassifier', RandomForestClassifier(criterion='entropy')),
        ('Adaboost', AdaBoostClassifier()),
        ('BernoulliNB', BernoulliNB()),
        ('Linear_SVC', LinearSVC()),
    )
    for title, estimator in printed_only:
        print(f'{title} Accuracy : ', score(estimator))

    # PassiveAggressiveClassifier was evaluated the same way at one point and
    # is currently disabled.

    return results
raw_counts = pd.Series(Counter(y_train))
class_counts = raw_counts / raw_counts.sum()
class_counts

# %% [markdown]
# We can observe that the positive class, `'donated'`, comprises only 24% of
# the samples. The good accuracy of our classifier is then linked
# to its ability to predict correctly the negative class `'not donated'`
# which may or may not be relevant, depending on the application. We can
# illustrate the issue using a dummy classifier as a baseline.

# %%
from sklearn.dummy import DummyClassifier

dummy_classifier = DummyClassifier(strategy="constant",
                                   constant="not donated")
dummy_classifier.fit(X_train, y_train)
dummy_classifier.score(X_test, y_test)

# %% [markdown]
# With the dummy classifier, which always predicts the negative class
# `'not donated'`,
# we obtain an accuracy score of 76%. Therefore, it means that this classifier,
# without learning anything from the data `X`, is capable of predicting as
# accurately as our logistic regression model.
#
# The problem illustrated above is also known as the class imbalance problem.
# When the classes are imbalanced, accuracy should not be used. In this case,
# one should either use
# the precision, recall, or F1 score as presented above or the balanced
# accuracy score instead of accuracy.

# %%
def denseNN_grid_search(*,
    dataset_name,
    method_name,
    module_name,
    PATH_encoded,
    train_subset_names,
    test_subset_names,
    # ...
    class_encoding,
    grid,
    store_predictions=True,
    track_progres=True,
    verbose=False,
    plot_history=False  # applied only if verbose==True,
):
    """Train one dense Keras network per parameter set and collect the results.

    For every ``params`` dict in ``grid`` the function
      1. changes directory to ``PATH_encoded`` and loads the encoded
         train / test batches through the first matching ``*_logfile.csv``,
      2. splits the training data into train / validation parts,
      3. scores a most-frequent ``DummyClassifier`` baseline,
      4. builds the ``"one_layer"`` or ``"two_layers"`` model named in
         ``params["model"]`` and fits it with early stopping,
      5. stores accuracies, the training history and (optionally) predictions.

    Args:
        dataset_name, module_name: Combined as
            ``<module_name>_<dataset_name>*_logfile.csv`` to find the logfile.
        method_name: Copied into the result records.
        PATH_encoded: Directory with the encoded batches (``os.chdir`` target).
        train_subset_names, test_subset_names: Passed to
            ``load_encoded_imgbatch_using_logfile`` as ``load_datasetnames``.
        class_encoding: Mapping class name -> integer code.
        grid: Iterable of parameter dicts.
        store_predictions: Store per-dataset predictions when ``True``.
        track_progres: Print start / progress / end markers when ``True``.
        verbose: Print per-model details when ``True``.
        plot_history: Plot loss / accuracy curves (only if ``verbose``).

    Returns:
        ``(model_acc_and_parameters_list, model_predictions_dict,
        model_history_dict)``; both dicts are keyed by model id (0, 1, ...).
    """
    # NOTE(review): the source reached review with its indentation flattened;
    # block nesting and comment boundaries below are a reconstruction —
    # confirm against the original file before merging.

    # containers for the results,
    model_acc_and_parameters_list = list()
    model_predictions_dict = dict()
    model_history_dict = dict()
    # reverse of class_encoding (code -> class name),
    class_decoding = dict(zip(list(class_encoding.values()),
                              list(class_encoding.keys())))

    # ..
    if track_progres==True:
        print(f"{module_name} _________________________________________ {pd.to_datetime('now')}")

    # Grid search,
    model_ID = -1  # starts at -1 so that the first model gets id 0
    for params in grid:

        # PARAMETERS, ...................................
        model_ID += 1
        # internal names of the datasets created in this function (not the
        # names of the loaded subsets, which may or may not coincide),
        Xy_names = ["train", "valid", "test"]

        if track_progres==True:
            print('.', end="")

        # LOAD & PREPARE THE DATA ,......................

        # find any logfile created while saving img files,
        os.chdir(PATH_encoded)
        logfiles = []
        for file in glob.glob(f"{''.join([module_name,'_',dataset_name])}*_logfile.csv"):
            logfiles.append(file)

        # Load train data,
        X_tot, batch_labels = load_encoded_imgbatch_using_logfile(
            logfile_name=logfiles[0], load_datasetnames=train_subset_names)
        # builtin float replaces the np.float alias, which was removed in NumPy 1.24
        X_tot = X_tot.astype(float)
        y_tot = pd.Series(batch_labels.classname).map(class_encoding).values.astype("int")

        # Load test data,
        X_te, batch_labels = load_encoded_imgbatch_using_logfile(
            logfile_name=logfiles[0], load_datasetnames=test_subset_names)
        X_te = X_te.astype(float)
        y_te = pd.Series(batch_labels.classname).map(class_encoding).values.astype("int")
        idx_y_te = np.arange(y_te.shape[0])  # kept for compatibility

        # ... Split data into train/validation sets
        # (done here to prepare the script for future applications)
        X_tr, X_valid, y_tr, y_valid = train_test_split(
            X_tot, y_tot,
            train_size=params["train_test_split__train_size"],
            test_size=(1-params["train_test_split__train_size"]),
            random_state=params["random_state"]
        )

        # ... repeat the split with identical arguments on a row-index
        # array, to identify raw images in the train/valid datasets,
        _, _, idx_y_tr, idx_y_valid = train_test_split(
            X_tot, np.arange(X_tot.shape[0], dtype="int"),
            train_size=params["train_test_split__train_size"],
            test_size=(1-params["train_test_split__train_size"]),
            random_state=params["random_state"]
        )

        # place all in dict,
        X_dct = dict(zip(Xy_names, [X_tr, X_valid, X_te]))
        y_dct = dict(zip(Xy_names, [y_tr, y_valid, y_te]))
        idx_y_dct = dict(zip(Xy_names, [idx_y_tr, idx_y_valid, idx_y_te]))

        # SHUFFLE , ...................................
        # (only relevant because X_tot is used for NN training)
        # Shuffle the samples in tot - otherwise a batch would often hold
        # samples of a single class, which quickly leads to overfitting with
        # low accuracy and huge loss on the validation set.
        idx = np.arange(X_tot.shape[0])
        my_seed = np.random.RandomState(params["random_state"])
        idx_mix = my_seed.choice(a=idx, size=idx.shape[0], replace=False)
        X_tot = X_tot[idx_mix,:].copy()
        y_tot = y_tot[idx_mix].copy()

        # INFO , ...................................
        if verbose==True:
            print(f"\n{''.join(['-']*40)}"); print(f"{''.join(['-']*40)}"); print(f"{''.join(['-']*40)}")
            print(f'{model_ID}: {module_name}, logfie: {logfiles[0]}'); print(f"{''.join(['-']*40)}")
            print("PARAMETERS:"); print(f'{model_ID}: {params}')
            print("INPUT DATA DIMENSIONS:")
            for xyname in Xy_names:
                print(f"{xyname}: {X_dct[xyname].shape}")

        # BASELINE, ...............................
        # Most-frequent baseline - kept mainly for backward compatibility.
        dummy = DummyClassifier(strategy='most_frequent')
        dummy.fit(X_dct["train"].astype(float), y_dct["train"].astype(int))
        # ..
        baseline_acc = dict()
        for xyname in Xy_names:
            baseline_acc[f"baseline_acc_{xyname}"] = dummy.score(X_dct[xyname], y_dct[xyname])

        if verbose==True:
            print(" --- ", model_ID, baseline_acc)

        # CREATE AND TRAIN THE MODEL ,................
        # (the params dict provides the parameter values)

        # from keras import backend as K
        K.clear_session()

        # create model
        if params["model"]=="one_layer":
            model = create_keras_one_layer_dense_model(
                input_size = X_tot.shape[1],
                output_size = len(list(class_encoding.keys())),
                verbose = verbose,
                **params
            )

        if params["model"]=="two_layers":
            model = create_keras_two_layer_dense_model(
                input_size = X_tot.shape[1],
                output_size = len(list(class_encoding.keys())),
                verbose = verbose,
                **params
            )

        # define early stopping - End training when acc stops improving (optional)
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=params["EarlyStopping__patience"],
            restore_best_weights=True
        )

        # Fit model
        history = model.fit(
            x=X_tot,  # samples are subdivided internally,
            y=y_tot,
            validation_split=params['fit__validation_split'],
            batch_size=params['fit__batch_size'],
            epochs=params["fit__epoch"],
            shuffle=True,  # Shuffle training samples
            callbacks=[early_stopping],
            verbose=0  # no info,
        )

        # EVALUATE MODEL ACC, ..........................
        model_acc = dict()

        # ... mean over the last n epochs (n = early-stopping patience),
        n = params["EarlyStopping__patience"]
        acc_results = pd.DataFrame(history.history).iloc[-n::,:].mean(axis=0)
        model_acc["model_acc_train"] = acc_results.loc["acc"]
        model_acc["model_acc_valid"] = acc_results.loc["val_acc"]
        model_acc["model_loss_train"] = acc_results.loc["loss"]
        model_acc["model_loss_valid"] = acc_results.loc["val_loss"]

        # ...
        loss, acc = model.evaluate(X_dct["test"], y_dct["test"], verbose=0)
        model_acc["model_acc_test"] = acc
        model_acc["model_loss_test"] = loss

        # COLLECT THE RESULTS ,..............................
        # acc_restuls_and_params is attached to every result object so the
        # origin of each result can always be traced.

        # 1. acc_restuls_and_params
        acc_restuls_and_params = {
            "random_state_nr": params["random_state"],  # for backward compatibility,
            "model_ID": model_ID,
            "method": method_name,
            "module": module_name,
            **baseline_acc,
            **model_acc,
            **params
        }
        model_acc_and_parameters_list.append(acc_restuls_and_params)  # in list, so it can be used as pd.df immediately,

        # 2. save model history,
        model_history_dict[model_ID] = {
            "model_history": pd.DataFrame(history.history),
            "acc_restuls_and_params": acc_restuls_and_params}

        # 3. Model predictions,
        # collected for train, valid and test datasets, to allow comparing
        # errors and problematic files,
        if store_predictions==True:
            one_model_predictions = dict()
            for xyname in Xy_names:
                # make predictions and decode them,
                # NOTE(review): predict_classes / predict_proba are not
                # available on recent tf.keras models — confirm the pinned
                # TensorFlow version.
                predictions = model.predict_classes(X_dct[xyname])
                decoded_predictions = pd.Series(predictions).map(class_decoding).values
                model_predictions_proba = model.predict_proba(X_dct[xyname])
                decoded_y_labels = pd.Series(y_dct[xyname]).map(class_decoding).values
                # ...
                one_model_predictions[xyname] = {
                    "idx_in_batch": idx_y_dct[xyname],
                    "original_labels": decoded_y_labels,
                    "model_predictions": decoded_predictions,
                    "model_predictions_proba": model_predictions_proba,
                    "acc_restuls_and_params": acc_restuls_and_params,
                    "class_decoding": class_decoding
                }  # added, in case of doubts about the origin of the results,

            # and finally, add this to the big dict with all the results,
            model_predictions_dict[model_ID] = one_model_predictions

        else:
            model_predictions_dict[model_ID] = None

        # PLOT THE RESULTS ,......................
        if verbose==True and plot_history==True:

            #.. figure, axes,
            fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
            fig.suptitle(f"{params}")

            #.. Plot loss values
            ax1.plot(history.history['loss'], label='train loss')
            ax1.plot(history.history['val_loss'], label='val loss')
            ax1.set_title('Validation loss {:.3f} (mean last 3)'.format(
                np.mean(history.history['val_loss'][-3:])  # last three values
            ))
            ax1.set_xlabel('epoch')
            ax1.set_ylabel('loss value')
            ax1.grid(ls="--", color="grey")
            ax1.legend()

            #.. Plot accuracy values
            ax2.plot(history.history['acc'], label='train acc')
            ax2.plot(history.history['val_acc'], label='val acc')
            ax2.set_title('Validation accuracy {:.3f} (mean last 3)'.format(
                np.mean(history.history['val_acc'][-3:])  # last three values
            ))
            ax2.set_xlabel('epoch')
            ax2.set_ylabel('accuracy')
            ax2.set_ylim(0,1)
            ax2.grid(ls="--", color="grey")
            ax2.legend()
            plt.show()

    if track_progres==True:
        print(f"\nDONE _________________________________________ {pd.to_datetime('now')}",end="\n\n")

    # ..................................................
    return model_acc_and_parameters_list, model_predictions_dict, model_history_dict
def _score_summary(scores):
    """Return (max, min, mean, variance) of a sequence of per-fold scores."""
    score_array = np.array(scores)
    return (score_array.max(), score_array.min(),
            np.mean(score_array), np.var(score_array))


def run_custom_cross(features, labels, headers, single_run=True, verbose=False,
                     *, number_of_trees=300, max_number_of_features='sqrt',
                     class_weight='balanced', dummy_strategy='stratified',
                     number_of_splits=5, random_state=None):
    """Cross-validate a random forest against a dummy baseline using MCC.

    A RandomForestClassifier and a DummyClassifier are fitted on every fold
    of a shuffled StratifiedKFold split.  For each fold the Matthews
    correlation coefficient, the confusion matrix and (forest only) the
    feature importances with their per-tree standard deviation are collected.

    Arguments:
    features -- feature matrix; indexed with the integer index arrays
                produced by the splitter (presumably a numpy array -- confirm
                against the caller)
    labels -- class labels, indexed the same way as `features`
    headers -- feature names; printed and forwarded to
               feature_importance_analysis
    single_run -- when true, print the summary and the report for the best
                  performing fold
    verbose -- when true (together with single_run), also print the scores
               and confusion matrices of every fold

    Keyword-only arguments (defaults reproduce the former hard-coded values):
    number_of_trees -- number of trees in the forest
    max_number_of_features -- `max_features` of the forest
                              ('sqrt', 'log2', ...)
    class_weight -- `class_weight` of the forest.  'balanced' weights classes
                    inversely proportional to their frequencies as
                    n_samples / (n_classes * np.bincount(y));
                    'balanced_subsample' does the same on the bootstrap
                    sample of every tree; None gives every class weight one
    dummy_strategy -- `strategy` of the DummyClassifier baseline
    number_of_splits -- number of cross-validation folds
    random_state -- seed for the fold shuffling and the forest; None keeps
                    the previous non-deterministic behaviour

    Returns:
    Tuple (max, min, average, variance) of the forest MCC scores over folds.
    """
    if single_run:
        print("Using features:")
        print(headers)

    # We create TWO estimators
    forest = RandomForestClassifier(n_estimators=number_of_trees,
                                    max_features=max_number_of_features,
                                    class_weight=class_weight,
                                    random_state=random_state)
    dummy = DummyClassifier(constant=None, random_state=0,
                            strategy=dummy_strategy)

    # We create a crossvalidator
    cross_stratified_kfold = StratifiedKFold(n_splits=number_of_splits,
                                             shuffle=True,
                                             random_state=random_state)

    # Some lists to store the per-fold results
    forest_scores_list = []
    dummy_scores_list = []
    forest_cmatrix_list = []
    dummy_cmatrix_list = []
    forest_features_importance_list = []
    forest_features_std_list = []

    # Now we do the actual training and classification
    for train_index, test_index in cross_stratified_kfold.split(features,
                                                                labels):
        # Training and test subsets, as dictated by the cross-validator
        train_features = features[train_index]
        train_labels = labels[train_index]
        test_features = features[test_index]
        test_labels = labels[test_index]

        # We fit the models
        forest = forest.fit(train_features, train_labels)
        dummy = dummy.fit(train_features, train_labels)

        # We use them to classify
        predicted_labels_forest = forest.predict(test_features)
        predicted_labels_dummy = dummy.predict(test_features)

        # MCC scores (1 is perfect classification, 0 is random,
        # -1 is inverse prediction)
        forest_score = matthews_corrcoef(test_labels, predicted_labels_forest,
                                         sample_weight=None)
        dummy_score = matthews_corrcoef(test_labels, predicted_labels_dummy,
                                        sample_weight=None)

        # ROC of the forest, computed from its hard class predictions.  The
        # curve is only drawn on the current matplotlib figure; this function
        # never calls plt.show() itself.
        fpr, tpr, _ = roc_curve(test_labels, predicted_labels_forest)
        plt.plot(fpr, tpr)

        # We generate the Confusion Matrix
        # True negatives is C_{0,0}
        # False negatives is C_{1,0}
        # True positives is C_{1,1}
        # False positives is C_{0,1}
        forest_matrix = confusion_matrix(test_labels, predicted_labels_forest,
                                         labels=None, sample_weight=None)
        dummy_matrix = confusion_matrix(test_labels, predicted_labels_dummy,
                                        labels=None, sample_weight=None)

        # We store everything in the appropriate lists
        forest_scores_list.append(forest_score)
        dummy_scores_list.append(dummy_score)
        forest_cmatrix_list.append(forest_matrix)
        dummy_cmatrix_list.append(dummy_matrix)
        forest_features_importance_list.append(forest.feature_importances_)
        forest_feature_std = np.std(
            [tree.feature_importances_ for tree in forest.estimators_],
            axis=0)
        forest_features_std_list.append(forest_feature_std)

        # Logical `and`, not bitwise `&`: the latter is wrong for truthy
        # non-bool flags (e.g. 2 & 1 == 0).
        if verbose and single_run:
            # We print everything
            print("Forest score", forest_score)
            print("Dummy score", dummy_score)
            print("\nForest Matrix")
            print(forest_matrix)
            print("\nDummy Matrix")
            print(dummy_matrix)

    # Now we calculate summarizing scores:
    # best, worst, average and variance of MCC for both classifiers
    forest_scores_tuple = _score_summary(forest_scores_list)
    dummy_scores_tuple = _score_summary(dummy_scores_list)

    # NOTE(review): the best-fold report below is assumed to be gated by
    # `single_run`, like every other print in this function -- confirm.
    if single_run:
        print('# Scores from cross-validation')
        print(('max', 'min', 'average', 'variance'))
        print('Forest: ')
        print(forest_scores_tuple)
        print('Dummy: ')
        print(dummy_scores_tuple)

        # We pick the best run and we extract the importance of features
        bestp_index = np.argmax(np.array(forest_scores_list))
        bestp_features_imp = forest_features_importance_list[bestp_index]
        bestp_features_std = forest_features_std_list[bestp_index]

        # And we print the Feature Importance (with plot)
        print('The following data is from best performing Forest')
        feature_importance_analysis(bestp_features_imp, bestp_features_std,
                                    features, headers, True)

        # And we also print the confusion matrix for that forest and the
        # dummy of the same fold
        print('Best Forest matrix:')
        print(forest_cmatrix_list[bestp_index])
        print(forest_scores_list[bestp_index])
        print('Correspondent Dummy matrix:')
        print(dummy_cmatrix_list[bestp_index])
        print(dummy_scores_list[bestp_index])

    return forest_scores_tuple