def run_regression(train_embeds, train_labels, test_embeds, test_labels):
    """Fit an SGD classifier and a dummy baseline, then print their accuracies.

    Both estimators are fitted on the training embeddings.  Three accuracy
    figures are printed: the SGD model on the test set, the SGD model on the
    training set, and the dummy baseline on the test set.  Nothing is returned.
    """
    # Seed NumPy's global generator before anything else runs.
    np.random.seed(1)
    from sklearn.linear_model import SGDClassifier
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import accuracy_score

    baseline = DummyClassifier()
    baseline.fit(train_embeds, train_labels)

    # NOTE(review): recent scikit-learn releases spell this loss "log_loss";
    # confirm which version this script targets.
    model = SGDClassifier(loss="log", n_jobs=55)
    model.fit(train_embeds, train_labels)

    # (heading, estimator, inputs, expected labels) in the order they print.
    reports = (
        ("Test scores", model, test_embeds, test_labels),
        ("Train scores", model, train_embeds, train_labels),
        ("Random baseline", baseline, test_embeds, test_labels),
    )
    for heading, estimator, embeds, labels in reports:
        print(heading)
        print(accuracy_score(labels, estimator.predict(embeds)))
def get_scores(X, y):
    """Score a grid-searched logistic regression over repeated stratified splits.

    For each of the 200 shuffle splits (20% held out) a ``GridSearchCV`` over
    ``C`` is fitted on the training part, and the refitted best estimator is
    scored with accuracy on both parts.  A most-frequent ``DummyClassifier``
    is scored on the same held-out part.

    Parameters
    ----------
    X : indexable as ``X[rows, :]``
        Feature matrix.
    y : indexable as ``y[rows]``
        Class labels.

    Returns
    -------
    test_scores, train_scores, dummy_scores : list
        One accuracy per split.
    preds, true_labels : list
        Held-out predictions and matching true labels, concatenated over all
        splits.
    """
    nfolds = 200
    cv = StratifiedShuffleSplit(y, n_iter=nfolds, test_size=0.2)
    dumb = DummyClassifier(strategy="most_frequent")
    param_dist = {"C": [1e6, 1e5, 1e4, 1e3, 1e2, 10, 1, 0.1, 0.01, 0.001]}
    # NOTE(review): candidates are ranked with the "mean_absolute_error"
    # scorer although accuracy is what gets reported -- confirm intended.
    search = GridSearchCV(
        linear_model.LogisticRegression(),
        param_grid=param_dist,
        scoring="mean_absolute_error",
    )

    test_scores, train_scores, dummy_scores = [], [], []
    preds, true_labels = [], []
    for train, test in cv:
        y_train, y_test = y[train], y[test]
        X_train, X_test = X[train, :], X[test, :]

        # GridSearchCV refits the winning candidate on the whole training
        # part (refit=True is the default), so no second fit is needed.
        search.fit(X_train, y_train)
        clf = search.best_estimator_
        print(search.best_params_)

        # Predict the held-out part once and reuse it for score and output.
        test_pred = clf.predict(X_test)
        train_scores.append(accuracy_score(clf.predict(X_train), y_train))
        test_scores.append(accuracy_score(test_pred, y_test))

        dumb.fit(X_train, y_train)
        dummy_scores.append(accuracy_score(dumb.predict(X_test), y_test))

        preds.extend(test_pred)
        true_labels.extend(y_test)
    return test_scores, train_scores, dummy_scores, preds, true_labels
def get_scores(X, y):
    """Score a grid-searched SVM on every non-empty feature subset of ``X``.

    For each subset size ``1 .. X.shape[1]`` every column combination of that
    size is evaluated over 40 stratified shuffle splits (5% held out).  The
    grid search is fitted on the selected columns only, so hyper-parameters
    are tuned on the same features the model is scored on.  A most-frequent
    ``DummyClassifier`` is scored on the same splits.

    Parameters
    ----------
    X : 2-D array indexable as ``X[rows, :][:, cols]``
        Feature matrix.
    y : indexable as ``y[rows]``
        Class labels.

    Returns
    -------
    stest, strain, sdummy : list
        One mean accuracy per subset size, averaged over all combinations of
        that size and all splits (test, train and dummy respectively).
    """
    nfolds = 40
    cv = StratifiedShuffleSplit(y, n_iter=nfolds, test_size=.05)
    dumb = DummyClassifier(strategy='most_frequent')
    param_dist = {"C": [.1, 1, 10], "kernel": ['rbf', 'linear', 'poly']}
    # NOTE(review): candidates are ranked with the 'mean_absolute_error'
    # scorer although accuracy is what gets reported -- confirm intended.
    search = GridSearchCV(svm.SVC(class_weight='auto'),
                          param_grid=param_dist,
                          scoring='mean_absolute_error')

    stest, strain, sdummy = [], [], []
    n_features = X.shape[1]
    for subset_size in range(1, n_features + 1):
        test_scores, train_scores, dummy_scores = [], [], []
        # Every combination of `subset_size` feature columns.
        for my_feats in itertools.combinations(range(n_features), subset_size):
            idx = np.array(my_feats)  # invariant across the splits below
            for train, test in cv:
                y_train, y_test = y[train], y[test]
                X_train = X[train, :][:, idx]
                X_test = X[test, :][:, idx]

                # Tune on the selected columns; refit=True (the default)
                # leaves best_estimator_ already fitted on X_train.
                search.fit(X_train, y_train)
                clf = search.best_estimator_

                train_scores.append(
                    accuracy_score(clf.predict(X_train), y_train))
                test_scores.append(
                    accuracy_score(clf.predict(X_test), y_test))

                dumb.fit(X_train, y_train)
                dummy_scores.append(
                    accuracy_score(dumb.predict(X_test), y_test))
        sdummy.append(np.mean(dummy_scores))
        strain.append(np.mean(train_scores))
        stest.append(np.mean(test_scores))
    return stest, strain, sdummy
def _run_dummy_detection(x_train, x_test, y_train, y_test):
    """Fit a most-frequent DummyClassifier and print its test-set score.

    The classifier is fitted on ``(x_train, y_train)`` and the value of
    ``clf.score(x_test, y_test)`` is printed.  Nothing is returned.
    """
    clf = DummyClassifier(strategy='most_frequent')
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("Training Dummy...")
    clf.fit(x_train, y_train)
    print("Predicting Test Set...")
    print("Score for test set: {}".format(clf.score(x_test, y_test)))
def test_dtype_of_classifier_probas(strategy):
    """``predict_proba`` must return float64 values for the given strategy."""
    labels = [0, 2, 1, 1]
    features = np.zeros(4)
    fitted = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    fitted = fitted.fit(features, labels)
    assert fitted.predict_proba(features).dtype == np.float64
def do_cross_validation(labels):
    """Run a most-frequent baseline through stratified k-fold cross validation.

    A ``DummyClassifier(strategy='most_frequent')`` is fitted on the training
    labels of each of the ``NO_OF_FOLDS`` folds and used to predict that
    fold's test instances.

    Keyword arguments:
    labels -- all labels (indexable with the fold index arrays)

    Returns a pair ``(single_predictions, classification_result)``:
    single_predictions -- one row per test instance with the columns
        (fold number, instance index, true label, predicted label)
    classification_result -- array of shape (NO_OF_FOLDS + 1, 5); row ``i``
        holds ``get_classification_result`` for fold ``i``.  The final row is
        left at zero by this function.
    """
    folds = StratifiedKFold(labels, NO_OF_FOLDS)
    per_fold_rows = []  # one (n_test, 4) array per fold
    classification_result = np.zeros((NO_OF_FOLDS + 1, 5))

    for cur_fold, (train_idx, test_idx) in enumerate(folds):
        truth = labels[test_idx]

        # The dummy model ignores its inputs, hence None / zeros below.
        baseline = DummyClassifier(strategy='most_frequent')
        baseline.fit(None, labels[train_idx])
        pred_labels = baseline.predict(np.zeros(truth.shape[0]))

        fold_column = np.full(test_idx.shape[0], float(cur_fold))
        per_fold_rows.append(
            np.column_stack((fold_column, test_idx, truth, pred_labels)))

        classification_result[cur_fold, :] = get_classification_result(
            cur_fold, truth, pred_labels)

    single_predictions = np.vstack(per_fold_rows)
    return single_predictions, classification_result
def test_dummy_classifier_on_nan_value():
    """A NaN feature value must not stop DummyClassifier fitting/predicting."""
    # np.nan is the canonical spelling; the np.NaN alias is gone in NumPy 2.0.
    X = [[np.nan]]
    y = [1]
    y_expected = [1]
    clf = DummyClassifier()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_equal(y_pred, y_expected)
def test_most_frequent_strategy():
    """The most_frequent strategy predicts label 1 for every sample here."""
    X = [[0], [0], [0], [0]]  # ignored
    y = [1, 2, 1, 1]
    expected = np.ones(len(X))

    clf = DummyClassifier(strategy="most_frequent", random_state=0)
    clf.fit(X, y)
    predicted = clf.predict(X)

    assert_array_equal(predicted, expected)
    _check_predict_proba(clf, X, y)
def test_constant_strategy_multioutput():
    """The constant strategy repeats ``constant=[1, 0]`` for each output."""
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[2, 3], [1, 3], [2, 3], [2, 0]])
    n_samples = len(X)

    # First output column is all ones, second is all zeros.
    ones_column = np.ones((n_samples, 1))
    zeros_column = np.zeros((n_samples, 1))
    expected = np.hstack([ones_column, zeros_column])

    clf = DummyClassifier(strategy="constant", random_state=0,
                          constant=[1, 0])
    clf.fit(X, y)

    assert_array_equal(clf.predict(X), expected)
    _check_predict_proba(clf, X, y)
def test_constant_strategy_sparse_target():
    """With a sparse target the constant strategy returns sparse predictions."""
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[0, 1], [4, 0], [1, 1], [1, 4], [1, 1]]))
    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0,
                          constant=[1, 0])
    clf.fit(X, y)
    y_pred = clf.predict(X)

    # Plain assert instead of the nose-style assert_true helper.
    assert sp.issparse(y_pred)
    assert_array_equal(
        y_pred.toarray(),
        np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]))
def test_dummy_classifier_on_3D_array():
    """DummyClassifier accepts a 3-D array of strings as ``X``."""
    X = np.array([[['foo']], [['bar']], [['baz']]])
    y = [2, 2, 2]
    y_expected = [2, 2, 2]
    y_proba_expected = [[1], [1], [1]]

    cls = DummyClassifier()
    cls.fit(X, y)
    predicted_labels = cls.predict(X)
    predicted_probas = cls.predict_proba(X)

    assert_array_equal(predicted_labels, y_expected)
    assert_array_equal(predicted_probas, y_proba_expected)
def test_most_frequent_and_prior_strategy_multioutput():
    """prior and most_frequent both predict the per-output modal label."""
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])
    n_samples = len(X)

    # Modal label is 1 in the first output column and 0 in the second.
    expected = np.hstack([np.ones((n_samples, 1)),
                          np.zeros((n_samples, 1))])

    for strategy in ("prior", "most_frequent"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), expected)
        _check_predict_proba(clf, X, y)
        _check_behavior_2d(clf)
def test_stratified_strategy():
    """Stratified predictions follow the training class frequencies."""
    X = [[0]] * 5  # ignored
    y = [1, 2, 1, 1, 2]
    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(X, y)

    # Draw many predictions so the empirical frequencies are stable.
    X = [[0]] * 1000
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))

    # Label 1 appears 3 times out of 5 in training, label 2 twice.
    for label, expected in ((1, 3. / 5), (2, 2. / 5)):
        assert_almost_equal(p[label], expected, decimal=1)
    _check_predict_proba(clf, X, y)
def test_uniform_strategy():
    """Uniform predictions give each observed class about half the draws."""
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))

    for label in (1, 2):
        assert_almost_equal(p[label], 0.5, decimal=1)
    _check_predict_proba(clf, X, y)
def test_most_frequent_and_prior_strategy_sparse_target():
    """most_frequent and prior keep sparse targets sparse when predicting."""
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[1, 0], [1, 3], [4, 0], [0, 1], [1, 0]]))
    n_samples = len(X)
    y_expected = np.hstack([np.ones((n_samples, 1)),
                            np.zeros((n_samples, 1))])

    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # Plain assert instead of the nose-style assert_true helper.
        assert sp.issparse(y_pred)
        assert_array_equal(y_pred.toarray(), y_expected)
def run_ML_leave_one_subject_out(config, filename, question, clf, cols, return_arr=None, return_index=-1):
    """Average ``clf`` and two dummy baselines over leave-one-subject-out splits.

    Data is loaded with ``load_data`` from ``config['DATA_DIRECTORY']`` and
    split by ``leave_one_subject_out(..., 'User')``.  For every split ``clf``,
    a most-frequent dummy and a stratified dummy are fitted on the training
    part and scored on the testing part.

    Parameters
    ----------
    config : mapping
        Must contain the key ``'DATA_DIRECTORY'``.
    filename, question, cols
        Forwarded unchanged to ``load_data``.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``.
    return_arr : indexable, optional
        Destination for the result when ``return_index`` is not -1.
    return_index : int
        -1 to return the result; otherwise the slot of ``return_arr`` to fill.

    Returns
    -------
    tuple or None
        ``(score, score_dummy_mf, score_dummy_sf)``, each rounded to two
        decimals, when ``return_index == -1``.  Otherwise the tuple is stored
        in ``return_arr[return_index]`` and ``None`` is returned.
    """
    import numpy as np

    working_directory = config['DATA_DIRECTORY']
    data_X, data_y = load_data(working_directory, filename, cols, question)
    data = leave_one_subject_out(data_X, data_y, 'User')

    score = 0
    score_dummy_mf = 0
    score_dummy_sf = 0

    # `strategy` is passed by keyword: newer scikit-learn releases make
    # estimator constructor arguments keyword-only.
    dummy_clf_mf = DummyClassifier(strategy='most_frequent')
    dummy_clf_sf = DummyClassifier(strategy='stratified')

    for (training_X, training_y), (testing_X, testing_y) in data:
        clf.fit(training_X, training_y)
        dummy_clf_mf.fit(training_X, training_y)
        dummy_clf_sf.fit(training_X, training_y)

        # np.mean accepts both plain Python floats and NumPy scalars/arrays,
        # whereas calling .mean() on a plain float raises AttributeError.
        score += np.mean(clf.score(testing_X, testing_y))
        score_dummy_mf += np.mean(dummy_clf_mf.score(testing_X, testing_y))
        score_dummy_sf += np.mean(dummy_clf_sf.score(testing_X, testing_y))

    n_splits = len(data)
    score = round(float(score / n_splits), 2)
    score_dummy_mf = round(float(score_dummy_mf / n_splits), 2)
    score_dummy_sf = round(float(score_dummy_sf / n_splits), 2)

    if return_index == -1:
        return score, score_dummy_mf, score_dummy_sf
    else:
        return_arr[return_index] = (score, score_dummy_mf, score_dummy_sf)
def _evaluate_dummy_baseline(clf, label, X_train, X_test, y_train, y_test, thresh, target_names):
    """Fit one dummy estimator and return its metrics as an ordered row."""
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = clf.score(X_test, y_test)
    # Targets and predictions are binarised at `thresh` for the class metrics.
    truth, guess = y_test > thresh, y_pred > thresh
    mcc = sklearn.metrics.matthews_corrcoef(truth, guess)
    report = parse_classification_report(
        sklearn.metrics.classification_report(
            truth, guess, target_names=target_names))
    return collections.OrderedDict(
        strategy=label,
        matthews_corrcoef=mcc,
        score=score,
        report=report,
    )


def find_best_dummy_classification(X, y, test_size=0.3, random_state=0, thresh=0.5, target_names=None, n=1):
    """Try all dummy models.

    ``X`` is flattened to two dimensions and split once with
    ``train_test_split``.  Every ``DummyClassifier`` strategy is evaluated
    ``n`` times; the ``DummyRegressor`` strategies ('mean', 'median') are
    evaluated once.

    Parameters
    ----------
    X : array with a ``reshape`` method
        Features; reshaped to ``(len(X), -1)``.
    y : array-like
        Targets; compared against ``thresh`` for the classification metrics.
    test_size, random_state
        Forwarded to ``train_test_split``.
    thresh : float
        Threshold used to binarise targets and predictions.
    target_names : list of str, optional
        Forwarded to ``classification_report``.
    n : int
        Number of repetitions of the classifier strategies.

    Returns
    -------
    df : pandas.DataFrame
        One row per evaluated model, sorted by ``matthews_corrcoef``
        descending.
    best : dict
        The first row of ``df`` as a dictionary.
    """
    X = X.reshape((len(X), -1))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    split = (X_train, X_test, y_train, y_test)

    dummy_scores = []
    for _ in range(n):
        for strategy in ['most_frequent', 'uniform', 'prior', 'stratified']:
            dummy_scores.append(_evaluate_dummy_baseline(
                DummyClassifier(strategy=strategy),
                'classifier_' + strategy,
                *split, thresh=thresh, target_names=target_names))

    for strategy in ['mean', 'median']:
        dummy_scores.append(_evaluate_dummy_baseline(
            DummyRegressor(strategy=strategy),
            'regressor_' + strategy,
            *split, thresh=thresh, target_names=target_names))

    df = pd.DataFrame(dummy_scores)
    df = df.sort_values('matthews_corrcoef', ascending=False)
    return df, df.iloc[0].to_dict()
def main(training_set, language, gold_standard, gazetteer):
    """ Searches for the best hyperparameters

    Builds features from ``training_set`` (one JSON document per row), grid
    searches a ``LinearSVC`` with 10-fold cross validation and, when
    ``gold_standard`` is given, logs the weighted F1 of a stratified dummy
    model and of the best model on the gold-standard features.
    """
    if gazetteer:
        gazetteer = reverse_gazetteer(json.load(gazetteer))
    else:
        gazetteer = {}

    logger.info('Building training set')
    extractor = FactExtractorFeatureExtractor(language)
    for row in training_set:
        sample = json.loads(row)
        extractor.process_sentence(sample['sentence'], sample['fes'],
                                   add_unknown=True, gazetteer=gazetteer)

    logger.info('Finalizing training set')
    x, y = extractor.get_features()

    logger.info('Searching for the best model parameters')
    param_grid = [{
        'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
        'multi_class': ['ovr', 'crammer_singer'],
    }]
    search = GridSearchCV(LinearSVC(), param_grid=param_grid,
                          scoring='f1_weighted', cv=10)
    search.fit(x, y)
    logger.info('The best model (weighted-averaged F1 of %.4f) has parameters %s',
                search.best_score_, search.best_params_)

    if not gold_standard:
        logger.info('Skipping gold standard evaluation')
        return

    logger.info('Evaluating on the gold standard')
    for row in gold_standard:
        sample = json.loads(row)
        extractor.process_sentence(sample['sentence'], sample['fes'])
    x_gold, y_gold = extractor.get_features()

    # The baseline is fitted on the training features, then scored on gold.
    dummy = DummyClassifier(strategy='stratified')
    dummy.fit(x, y)
    dummy_f1 = metrics.f1_score(y_gold, dummy.predict(x_gold),
                                average='weighted')
    logger.info('Dummy model has a weighted-averaged F1 on the gold standard of %.4f',
                dummy_f1)

    best_f1 = metrics.f1_score(y_gold, search.predict(x_gold),
                               average='weighted')
    logger.info('Best model has a weighted-averaged F1 on the gold standard of %.4f',
                best_f1)
def test_uniform_strategy_sparse_target_warning():
    """Fitting the uniform strategy on a sparse target emits a UserWarning."""
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[2, 1], [2, 2], [1, 4], [4, 2], [1, 1]]))

    clf = DummyClassifier(strategy="uniform", random_state=0)
    assert_warns_message(UserWarning,
                         "the uniform strategy would not save memory",
                         clf.fit, X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        # Float literal keeps the expected value at one third even where
        # `/` between ints is integer division (Python 2).
        for label in (1, 2, 4):
            assert_almost_equal(p[label], 1.0 / 3, decimal=1)
def main(training_set, language, gold_standard, gazetteer, n_folds, n_jobs, scoring, output, test, word2vec_model, independent_lus):
    """ Searches for the best hyperparameters

    Runs a ``MultimodelGridSearchCV`` over the candidate training sets and
    models, logs and dumps the winner to ``output`` and, when
    ``gold_standard`` is given, logs the scores of a stratified dummy model
    and of the best model on the gold-standard features.
    """
    logger.info('Searching for the best model and parameters')

    training_sets = get_training_sets(training_set, language, gazetteer,
                                      word2vec_model, independent_lus)
    models = get_models(test)
    search = MultimodelGridSearchCV(*models, cv=n_folds, n_jobs=n_jobs,
                                    scoring=Scorer(scoring, True))
    best_training, best_score, best_params, best_model = search.fit(training_sets)
    x_tr, y_tr, best_training_meta = best_training

    logger.info('Evaluation Results')
    logger.info(' Best model: %s', best_model.__class__.__name__)
    logger.info(' Score: %f', best_score)
    logger.info(' Parameters: %s', best_params)
    logger.info(' Gazetteer: %s', best_training_meta['gazetteer'])
    logger.info(' Extractor: %s', best_training_meta['extractor_cls'].__name__)
    logger.info(' Extractor args: %s', best_training_meta['extractor_args'])

    joblib.dump((best_model, best_training_meta), output)
    logger.info("Done, dumped model to '%s'", output)

    if not gold_standard:
        logger.info('Skipping gold standard evaluation')
        return

    logger.info('Evaluating on the gold standard')
    # Reuse the extractor and gazetteer that produced the winning training set.
    extractor = best_training_meta['extractor']
    gazetteer = best_training_meta['gazetteer']
    extractor.start()
    for row in gold_standard:
        sample = json.loads(row)
        extractor.process_sentence(sample['sentence'], sample['lu'],
                                   sample['fes'], add_unknown=False,
                                   gazetteer=gazetteer)
    x_gold, y_gold = extractor.get_features(refit=False)

    dummy = DummyClassifier(strategy='stratified')
    dummy.fit(x_tr, y_tr)

    dummy_score = Scorer(scoring, True)(dummy, x_gold, y_gold)
    logger.info('Dummy model has a weighted-averaged F1 on the gold standard of %.4f',
                dummy_score)
    best_model_score = Scorer(scoring, True)(best_model, x_gold, y_gold)
    logger.info('Best model has a weighted-averaged F1 on the gold standard of %.4f',
                best_model_score)
def test_uniform_strategy_multioutput():
    """Uniform predictions are balanced independently for each output."""
    X = [[0]] * 4  # ignored
    y = np.array([[2, 1], [2, 2], [1, 2], [1, 1]])
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    n_draws = float(len(X))

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / n_draws
        for label in (1, 2):
            assert_almost_equal(p[label], 0.5, decimal=1)
        _check_predict_proba(clf, X, y)

    _check_behavior_2d(clf)
def train_on_data(self, train):
    """Train the model on ``train``, an already loaded DataSet object.

    The data set is vectorized, features are filtered, and the classifier is
    fitted (with instance weights when ``self.use_weights`` is set).  When the
    class attribute has a single value, a most-frequent ``DummyClassifier``
    replaces the classifier and the feature filter is disabled.  Sets
    ``self.classifier_trained`` to True on completion.
    """
    log_info('Preparing data set...')
    self.data_headers = train.get_headers()
    self.attr_mask = self.get_attr_mask()
    train_vect = self.__vectorize(train)
    train_classes = self.get_classes(train)

    # if all the training data have the same class, use a dummy classifier
    single_class = train.get_attrib(self.class_attr).num_values == 1
    if single_class:
        self.feature_filter = None
        self.classifier = DummyClassifier(strategy='most_frequent')

    # filter features
    log_info('Filtering...')
    train_filt = self.__filter_features(train_vect, train_classes)

    # train the classifier; instance weights are only read when requested
    log_info('Training...')
    fit_options = {}
    if self.use_weights:
        fit_options['sample_weight'] = train.inst_weights
    self.classifier.fit(train_filt, train_classes, **fit_options)

    self.classifier_trained = True
    log_info('Training done.')
def test_stratified_strategy_sparse_target():
    """Stratified predictions on a sparse target stay sparse and calibrated."""
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[4, 1], [0, 0], [1, 1], [1, 4], [1, 1]]))

    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    # Plain assert instead of the nose-style assert_true helper.
    assert sp.issparse(y_pred)
    y_pred = y_pred.toarray()

    # In each output column label 1 occurs 3/5 of the time, 0 and 4 once each.
    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 3.0 / 5, decimal=1)
        assert_almost_equal(p[0], 1.0 / 5, decimal=1)
        assert_almost_equal(p[4], 1.0 / 5, decimal=1)
def main(args):
    """Print feature correlations, plot a decision tree and compare it to a dummy.

    Steps, in order:
    1. load ``X, y, names`` with ``loadData(args.mat)``;
    2. print the Pearson correlation and p-value of every pair of feature
       columns, prefixed with ``debugCor``;
    3. fit a depth-6 decision tree on all data and write it as a PDF to
       ``args.plotFile``;
    4. print a classification report for a depth-6 tree on each of five
       stratified shuffle splits (10% held out);
    5. print the same report for a default ``DummyClassifier``.
    """
    X, y, names = loadData(args.mat)

    named_columns = list(zip(names, numpy.transpose(X)))
    for (name1, row1), (name2, row2) in itertools.combinations(named_columns, 2):
        c, p = scipy.stats.pearsonr(row1, row2)
        # Joined by hand so the line is identical under Python 2 and 3.
        print(' '.join(str(v) for v in ('debugCor', name1, name2, c, p)))

    depth = 6
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf = clf.fit(X, y)
    dot_data = StringIO()
    tree.export_graphviz(clf, feature_names=names, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf(args.plotFile)

    sss = StratifiedShuffleSplit(y, 5, test_size=0.1, random_state=442)

    for train_index, test_index in sss:
        clf = tree.DecisionTreeClassifier(max_depth=depth)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = clf.fit(X_train, y_train)
        # Predict once and reuse the result for the report.
        preds = clf.predict(X_test)
        print(metrics.classification_report(y_test, preds))

    print('\ndummy\n')
    for train_index, test_index in sss:
        clf = DummyClassifier()
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        # Same output as the former `print 'dummy',` followed by the report.
        print('dummy ' + metrics.classification_report(y_test, preds))
def test_classifier_prediction_independent_of_X(strategy):
    """Two classifiers fitted on different ``X`` must predict identically."""
    y = [0, 2, 1, 1]

    X1 = [[0]] * 4
    clf1 = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    clf1.fit(X1, y)

    X2 = [[1]] * 4
    clf2 = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    clf2.fit(X2, y)

    assert_array_equal(clf1.predict(X1), clf2.predict(X2))
def train_clf(vectorizer, classifier, train, test, topic):
    """Train ``classifier`` for one topic and print its test-set metrics.

    Parameters
    ----------
    vectorizer : object with ``transform``
        Applied to the ``"sentence"`` entries of ``train`` and ``test``; the
        result must support ``toarray()``.
    classifier : estimator
        Fitted on the vectorized training sentences.
    train, test : mapping-like
        Indexed with ``"sentence"`` for features and ``topic`` for labels.
    topic : str
        Label column name; also used as the prefix of every printed line.

    Prints the most-frequent dummy accuracy, the classifier accuracy and the
    classifier confusion matrix.  Nothing is returned.
    """
    train_X = vectorizer.transform(train["sentence"]).toarray()
    test_X = vectorizer.transform(test["sentence"]).toarray()
    train_y = train[topic]
    test_y = test[topic]

    # Most-frequent baseline, for comparison with the real classifier.
    dummy_clf = DummyClassifier(strategy="most_frequent").fit(train_X, train_y)
    clf = classifier.fit(train_X, train_y)

    dummy_accuracy = accuracy_score(test_y, dummy_clf.predict(test_X))
    # Predict once; accuracy and confusion matrix share the same predictions.
    predictions = clf.predict(test_X)
    accuracy = accuracy_score(test_y, predictions)
    cm = confusion_matrix(test_y, predictions)

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print(topic + " Dummy Accuracy: " + str(dummy_accuracy))
    print(topic + " Accuracy: " + str(accuracy))
    print(topic + " Confusion Matrix: ")
    print(cm)
    print("")
def test_most_frequent_and_prior_strategy_with_2d_column_y():
    """A single-column 2-D ``y`` must give the same predictions as 1-D ``y``."""
    # non-regression test added in
    # https://github.com/scikit-learn/scikit-learn/pull/13545
    X = [[0], [0], [0], [0]]
    y_1d = [1, 2, 1, 1]
    y_2d = [[1], [2], [1], [1]]

    for strategy in ("most_frequent", "prior"):
        clf_1d = DummyClassifier(strategy=strategy, random_state=0)
        clf_2d = DummyClassifier(strategy=strategy, random_state=0)
        clf_1d.fit(X, y_1d)
        clf_2d.fit(X, y_2d)

        flat_predictions = clf_1d.predict(X)
        column_predictions = clf_2d.predict(X)
        assert_array_equal(flat_predictions, column_predictions)
def svm_ssp_metrics(inputfile):
    """
    Print the performance metrics of an SVM and of a stratified dummy baseline.

    The samples are loaded with ``load_csv_svm``, scaled, optionally reduced
    with PCA (module setting ``USE_PCA``) and split 70/30.  For both models
    the classification report, accuracy and confusion matrix on the test part
    are printed; with a linear kernel the SVM feature weights are printed too.
    Nothing is returned.

    :type inputfile: string
    :param inputfile: samples file
    """
    def print_metrics(model, predictions):
        # Report, accuracy and confusion matrix for one model's predictions.
        print("\nClassification report for classifier %s:\n\n%s" % (model, metrics.classification_report(y_test, predictions)))
        print('Accuracy: {0}\n'.format(accuracy_score(y_test, predictions)))
        print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, predictions))

    x, y, labels = load_csv_svm(inputfile)
    x_scaled = preprocessing.scale(x)

    if not USE_PCA:
        x = x_scaled
    else:
        pca = PCA(n_components=PCA_COMPONENTS)
        x = pca.fit_transform(x_scaled)
        print(pca.explained_variance_ratio_)

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=1234, test_size=0.3)

    clf = svm.SVC(gamma=GAMMA, C=C, class_weight=WEIGHT, kernel=KERNEL,
                  cache_size=400)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Other available baseline strategies: most_frequent, uniform.
    dummy_clf = DummyClassifier(strategy='stratified', random_state=0)
    dummy_clf.fit(x_train, y_train)
    dummy_y_pred = dummy_clf.predict(x_test)

    print_metrics(clf, y_pred)
    if KERNEL == 'linear':
        print('\nfeature_weights: {0}'.format(clf.coef_))
    print_metrics(dummy_clf, dummy_y_pred)
def test_constant_strategy():
    """The constant strategy always predicts the configured label."""
    X = [[0], [0], [0], [0]]  # ignored

    # (training labels, constant to predict, expected predictions)
    cases = (
        ([2, 1, 2, 2], 1, np.ones(len(X))),
        (['two', 'one', 'two', 'two'], 'one', np.array(['one'] * 4)),
    )
    for y, constant, expected in cases:
        clf = DummyClassifier(strategy="constant", random_state=0,
                              constant=constant)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), expected)
        _check_predict_proba(clf, X, y)
def kfolds_evaluation(folds, model, scoring, skip_majority, x, y):
    """Log k-fold scores of ``model`` next to a default DummyClassifier.

    ``x`` is split into ``folds`` shuffled folds.  On each fold ``model`` and
    a fresh ``DummyClassifier`` are fitted on the training part and scored
    with ``Scorer(scoring, skip_majority)``: the model on the test and
    training parts, the dummy on the test part.  Minimum, maximum, average
    and median are logged at INFO level and the raw scores at DEBUG level.
    Nothing is returned.
    """
    splitter = KFold(x.shape[0], folds, shuffle=True)
    scorer = Scorer(scoring, skip_majority)

    scores_dummy, scores_test, scores_train = [], [], []
    for train_index, test_index in splitter:
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(x_train, y_train)
        dummy = DummyClassifier()
        dummy.fit(x_train, y_train)

        scores_test.append(scorer(model, x_test, y_test))
        scores_dummy.append(scorer(dummy, x_test, y_test))
        scores_train.append(scorer(model, x_train, y_train))

    logger.info("%d-folds cross evaluation results", folds)
    logger.info(" minimum test %f dummy %f training %f",
                min(scores_test), min(scores_dummy), min(scores_train))
    logger.info(" maximum test %f dummy %f training %f",
                max(scores_test), max(scores_dummy), max(scores_train))
    logger.info(" average test %f dummy %f training %f",
                np.average(scores_test), np.average(scores_dummy),
                np.average(scores_train))
    logger.info(" median test %f dummy %f training %f",
                np.median(scores_test), np.median(scores_dummy),
                np.median(scores_train))

    logger.debug("full test scores: %s", scores_test)
    logger.debug("full dummy scores: %s", scores_dummy)
    logger.debug("full train scores: %s", scores_train)
# Build the pipeline: imputer, scaler, then the SVM classifier
svm_step = SVC(random_state=random_state, class_weight=class_weight)
pipeline = Pipeline([
    ('imputer', imputer),
    ('scaler', scaler),
    ('svm', svm_step),
])

# Inner CV: stratified 2-fold repeated 5 times, used by the grid search to
# pick the best parameters
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5,
                               random_state=random_state)
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                           scoring=scoring, cv=rskf)

# Outer CV: 5-fold cross validation around the whole grid search, used to
# estimate the score
scores = cross_validate(estimator=grid_search, X=X, y=y, cv=5,
                        error_score='raise', return_estimator=True,
                        scoring=scoring)
print('Scores: {}'.format(scores['test_score']))
print('Mean score: {}'.format(np.mean(scores['test_score'])))

# Baseline: a most-frequent dummy classifier evaluated with the same 5-fold
# cross validation, so the comparison is made on equal terms
dummy_clf = DummyClassifier(strategy='most_frequent',
                            random_state=random_state)
dummy_scores = cross_validate(estimator=dummy_clf, X=X, y=y, cv=5,
                              error_score='raise', return_estimator=True,
                              scoring=scoring)
print('Dummy scores: {}'.format(dummy_scores['test_score']))
print('Dummy mean score: {}'.format(np.mean(dummy_scores['test_score'])))

# Confusion matrix from out-of-fold predictions of the grid search
results = cross_val_predict(grid_search, X=X, y=y, cv=5)
conf_m = confusion_matrix(y, results, labels=[1, 0])
print(conf_m)

# F1 score on the same out-of-fold predictions
print(f1_score(y, results))
header=0) X = feature_vectors_df.drop(columns=['class', 'buggy'], axis=1) y = feature_vectors_df.buggy X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2) ''' Default version ''' clf = DecisionTreeClassifier() gnb = GaussianNB() lsvc = LinearSVC() mlpc = MLPClassifier() rfc = RandomForestClassifier() biased = DummyClassifier(strategy='constant', constant=1) ''' Fine-tuned version ''' # clf = DecisionTreeClassifier(criterion='entropy', splitter='random', presort=True) # gnb = GaussianNB(var_smoothing=1e-3) # lsvc = LinearSVC(loss='hinge', random_state=1) # mlpc = MLPClassifier(hidden_layer_sizes=1000, activation='tanh', solver='sgd', learning_rate='adaptive') # rfc = RandomForestClassifier(criterion='entropy', oob_score=True, warm_start=True) # biased = DummyClassifier(strategy='constant', constant=1) y_pred_clf = clf.fit(X_train, y_train).predict(X_test) y_pred_gnb = gnb.fit(X_train, y_train).predict(X_test) y_pred_lsvc = lsvc.fit(X_train, y_train).predict(X_test) y_pred_mlpc = mlpc.fit(X_train, y_train).predict(X_test) y_pred_rfc = rfc.fit(X_train, y_train).predict(X_test) y_pred_biased = biased.fit(X_train, y_train).predict(X_test)
ypred = model.predict(X[test]) accuracies.append(accuracy_score(y[test], ypred)) avg_accuracy = sum(accuracies) / len(accuracies) print("Decision Tree Model average accuracy: ", avg_accuracy) if curr_best[0] is None or curr_best[1] < avg_accuracy: curr_best = (model, avg_accuracy, lang) model_names.append("DecisionTree") model_accuracies.append(avg_accuracy) # Dummy Model accuracies = [] kf = KFold(n_splits=5) for train, test in kf.split(X): model = DummyClassifier(strategy="most_frequent").fit( X[train], y[train]) ypred = model.predict(X[test]) accuracies.append(accuracy_score(y[test], ypred)) avg_accuracy = sum(accuracies) / len(accuracies) dummy_models.append((avg_accuracy, lang)) print("Dummy Model average accuracy: ", avg_accuracy) if curr_best[0] is None or curr_best[1] < avg_accuracy: curr_best = (model, avg_accuracy, lang) model_names.append("Dummy") model_accuracies.append(avg_accuracy) # RidgeClassifier Model best_ridge_accuracy = (-1, -1) mean_error = []
"\n\n") ## Naive Bayes Classification print("Number of classes used:", nr_classes) from sklearn.naive_bayes import GaussianNB from sklearn.metrics import f1_score from sklearn.metrics import accuracy_score from sklearn.metrics import precision_score priors = np.array([hist / y.shape[0] for hist in histo]) nb_clf = GaussianNB(priors=priors) nb_score = cross_val_score(nb_clf, X, binned_y, cv=3) print("Naive Bayes Scores:\n ", nb_score) ## KNN Classification from sklearn.neighbors import KNeighborsClassifier knn_clf = KNeighborsClassifier(nr_classes) knn_score = cross_val_score(knn_clf, X, binned_y, cv=5) print("\n\nK-Nearest Neighbor Scores: \n", knn_score) # # ## Dummy classifier from sklearn.dummy import DummyClassifier dummy_clf = DummyClassifier(strategy="most_frequent") dummy_score = cross_val_score(dummy_clf, X, binned_y, cv=5) print("\n\nPredict most frequent class: \n", dummy_score) ## Plot histogram of data binning # plt.bar([x for x in range(1,histo.shape[0]+1)],histo) # plt.xlabel('bins') # plt.ylabel('number of training examples') # plt.savefig('histogram.jpg')
# Load the adult census data set from OpenML.
df = pd.read_csv(
    "https://www.openml.org/data/get_csv/1595261/adult-census.csv")

# %%
# Separate the target column and keep only integer/float feature columns.
target_name = "class"
target = df[target_name].to_numpy()
data = df.drop(columns=[target_name, "fnlwgt"])
numerical_columns = [
    name for name, dtype in data.dtypes.items()
    if dtype.kind in ("i", "f")]
data_numeric = data[numerical_columns]

# %%
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the " >50K" class.
high_revenue_clf = DummyClassifier(strategy="constant", constant=" >50K")
scores = cross_val_score(high_revenue_clf, data_numeric, target)
print(f"{scores.mean():.3f} +/- {scores.std():.3f}")

# %%
# Baseline that always predicts the " <=50K" class.
low_revenue_clf = DummyClassifier(strategy="constant", constant=" <=50K")
scores = cross_val_score(low_revenue_clf, data_numeric, target)
print(f"{scores.mean():.3f} +/- {scores.std():.3f}")

# %%
# Baseline that predicts whichever class is most frequent in training.
most_freq_revenue_clf = DummyClassifier(strategy="most_frequent")
scores = cross_val_score(most_freq_revenue_clf, data_numeric, target)
print(f"{scores.mean():.3f} +/- {scores.std():.3f}")

# %% [markdown]
random_state=1, solver='liblinear', multi_class='ovr') lr.fit(train_x, train_y) accuracy = lr.score(validate_x, validate_y) print("Accuracy: %.3f" % accuracy) print("\nModel 4: SVM, C=1.0") svm = SVC(kernel='linear', C=1.0, random_state=1) svm.fit(train_x, train_y) accuracy = svm.score(validate_x, validate_y) print("Accuracy: %.3f" % accuracy) print("\n>>> Beginning Baseline model training...") print("Baseline Model 1: Strategy = \"stratified\"") dummy = DummyClassifier(strategy="stratified") dummy.fit(train_x, train_y) dummy.predict(validate_x) accuracy = dummy.score(validate_x, validate_y) print("Accuracy : %.3f" % accuracy) print("\nBaseline Model 2: Strategy = \"uniform\"") dummy = DummyClassifier(strategy="uniform") dummy.fit(train_x, train_y) dummy.predict(validate_x) accuracy = dummy.score(validate_x, validate_y) print("Accuracy : %.3f" % accuracy) print( "\n>>> Model Analysis: Logistic Regression and SVM give very similar results; best model is most likely Model 1" )
# Init selected scenario
# Each entry: (scenario number, scenario name, zero-argument classifier
# factory).  Factories keep construction lazy, so only the selected
# classifier is ever instantiated.
scenarios = (
    (1, 'naive_bayes', GaussianNB),
    (2, 'knn', lambda: KNeighborsClassifier(n_neighbors=KNN_N_NEIGH)),
    (3, 'random_forrest', RandomForestClassifier),
    (4, 'adaboost', AdaBoostClassifier),
    (5, 'decision_tree', DecisionTreeClassifier),
    (6, 'most_frequent', lambda: DummyClassifier(strategy='most_frequent')),
    (7, 'random', lambda: DummyClassifier(strategy='uniform')),
)
for scenario_id, scenario_name, build_clf in scenarios:
    if SCENARIO == scenario_id:
        clf = build_clf()
        break
else:
    # No entry matched: warn and fall back to Naive Bayes.
    print("\n!!!")
    print("Selected invalid scenario, defaulting to Naive Bayes.")
    print("!!!\n")
    scenario_name = 'naive_bayes'
    clf = GaussianNB()

evaluate_model(clf, x, y, scenario_name, 'default_params', SAMPLE_SIZE, scale)
rfresults = {} for eval in evaluation_scores: dummyresults[eval[0]] = [] lrresults[eval[0]] = [] lsvmresults[eval[0]] = [] mlpresults[eval[0]] = [] rfresults[eval[0]] = [] for train_index, test_index in skf.split(X, y): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] print(X_train.shape, y_train.shape) print(X_test.shape, y_test.shape) print() dummy_clf = DummyClassifier(strategy="most_frequent") dummy_clf.fit(X_train, y_train) y_pred_dummy = dummy_clf.predict(X_test) for (evalname, evaluator) in evaluation_scores: print("DUMMY " + evalname + ":", evaluator(y_test, y_pred_dummy)) dummyresults[evalname].append(evaluator(y_test, y_pred_dummy)) lr_clf = LogisticRegression(solver="lbfgs") lr_clf.fit(X_train, y_train) y_pred_lr = lr_clf.predict(X_test) for evalname, evaluator in evaluation_scores: print("Logistic regression " + evalname + ":", evaluator(y_test, y_pred_lr)) lrresults[evalname].append(evaluator(y_test, y_pred_lr))
# Dataset file prefix used in the stored result file names, per interval.
_BASELINE_DATASET_FILES = {
    "Day": "BTCUSD_1Day.csv",
    "Hour": "bitfinex_tBTCUSD_1h.csv",
    "Minute": "bitfinex_tBTCUSD_1m.csv",
}

# (sub-directory, RELATIONS flag in the file name) in plotting order.
_BASELINE_MODEL_VARIANTS = (
    ("default", "FALSE"),
    ("rf_optimized", "FALSE"),
    ("no_relations", "FALSE"),
    ("relations", "TRUE"),
)


def _print_regression_errors(y_true, y_pred):
    """Print MAE, MSE and RMSE of ``y_pred`` against ``y_true``; return the MAE."""
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    return mae


def _print_classification_scores(y_true, y_pred):
    """Print accuracy (percent, 2 decimals) and the report; return the accuracy."""
    print('Accuracy score:')
    accuracy = round(accuracy_score(y_true, y_pred) * 100, 2)
    print(accuracy)
    print('Classification report:')
    print(classification_report(y_true, y_pred))
    return accuracy


def _read_model_results(intervalType, MachineLearningMethod, FeeModel):
    """Read the stored scores of the four trained model variants, in plot order."""
    if MachineLearningMethod == "Regression":
        # The fee model only exists for classification results.
        root, task = '../accuraciesOutput', 'Regression_Difference'
    elif FeeModel == "ON":
        root, task = '../accuraciesOutput_feeModel', 'Classification'
    else:
        root, task = '../accuraciesOutput', 'Classification'
    dataset = _BASELINE_DATASET_FILES[intervalType]
    return [
        readTxtResults('%s/%s/%s_%s_%s_RELATIONS.txt' %
                       (root, variant, dataset, task, flag))
        for variant, flag in _BASELINE_MODEL_VARIANTS
    ]


def calculateBaselines(df, target, intervalType, MachineLearningMethod,
                       FeeModel):
    """Evaluate naive baselines and box-plot them next to the stored model scores.

    The last 20% of the rows (chronological split, no shuffling) are used as
    the test set. Baseline scores are printed, the predictions of one baseline
    (mean for regression, most-frequent for classification) are exported to
    ``../PredictedModels/baselineModels/<interval>_<method>.csv`` and a box
    plot is written to ``histogram.pgf``.

    Parameters
    ----------
    df : DataFrame with at least the columns Open, Low, High, Close, Volume,
        Timestamp and Target. A ``Predicted`` column is added to it in place.
    target : Series aligned with ``df`` holding the value/label to predict.
    intervalType : "Day", "Hour" or "Minute".
    MachineLearningMethod : "Regression" or "Classification".
    FeeModel : "OFF" or "ON"; only consulted for classification.

    Raises
    ------
    ValueError
        If ``intervalType``, ``MachineLearningMethod`` or (for classification)
        ``FeeModel`` is not one of the supported values.

    Notes
    -----
    The function ends by calling ``exit()`` and therefore never returns.
    """
    # Fail early with a clear message instead of a late NameError/KeyError.
    if MachineLearningMethod not in ("Regression", "Classification"):
        raise ValueError(
            "Unknown MachineLearningMethod: %r" % (MachineLearningMethod,))
    if intervalType not in _BASELINE_DATASET_FILES:
        raise ValueError("Unknown intervalType: %r" % (intervalType,))
    if MachineLearningMethod == "Classification" and FeeModel not in ("OFF",
                                                                      "ON"):
        raise ValueError("Unknown FeeModel: %r" % (FeeModel,))

    X = df[["Open", "Low", "High", "Close", "Volume"]]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        target,
                                                        test_size=0.2,
                                                        train_size=0.8,
                                                        shuffle=False,
                                                        random_state=42)
    n_train = len(X_train)

    # BASELINES FOR REGRESSION
    if MachineLearningMethod == "Regression":
        print(
            '------------- No-Change baseline: Predict same Target as last timestamp ------------- '
        )
        # No-change baseline: predict the target of the previous timestamp.
        df['Predicted'] = target.shift(1)
        y_predict_NOCHANGE = df['Predicted'][n_train:]
        MAE_nochange = _print_regression_errors(y_test, y_predict_NOCHANGE)

        print(
            '------------- DUMMY BASELINE MODEL “mean”: always predicts the mean of the training set ------------- '
        )
        dummy_reg = DummyRegressor(strategy="mean").fit(X_train, y_train)
        y_predict_MEAN = dummy_reg.predict(X_test)
        MAE_mean = _print_regression_errors(y_test, y_predict_MEAN)

        print(
            '------------- DUMMY BASELINE MODEL “median”: always predicts the median of the training set ------------- '
        )
        dummy_reg = DummyRegressor(strategy="median").fit(X_train, y_train)
        y_predict_median = dummy_reg.predict(X_test)
        MAE_median = _print_regression_errors(y_test, y_predict_median)

        exported_prediction = y_predict_MEAN
        baseline_scores = [MAE_nochange, MAE_mean, MAE_median]
        xLabelNames = [
            'Default', 'RF Optimized', '68 Hyp..', '14 Hyp..', 'No-change',
            'Mean', 'Median'
        ]

    # BASELINES FOR CLASSIFICATION
    else:
        print(
            '------------- No-Change baseline: Predict same Target as last timestamp ------------- '
        )
        # No-change baseline: True if the previous timestamp was also True.
        df['Predicted'] = np.where(target.shift(1) == True, True, False)
        y_predict_NOCHANGE = df['Predicted'][n_train:]
        accuracy_nochange = _print_classification_scores(
            y_test, y_predict_NOCHANGE)

        print(
            '------------- DUMMY BASELINE MODEL (Stratified) generates predictions by respecting the training set’s class distribution. Random ------------- '
        )
        dummy_clf = DummyClassifier(strategy="stratified", random_state=42)
        y_predict_STRATIFIED = dummy_clf.fit(X_train, y_train).predict(X_test)
        accuracy_stratified = _print_classification_scores(
            y_test, y_predict_STRATIFIED)

        print(
            '------------- DUMMY BASELINE MODEL (Most Frequent) always predicts the most frequent label in the training set. ------------- '
        )
        dummy_clf = DummyClassifier(strategy="most_frequent", random_state=42)
        y_predict_MOSTFREQ = dummy_clf.fit(X_train, y_train).predict(X_test)
        accuracy_mostfrequent = _print_classification_scores(
            y_test, y_predict_MOSTFREQ)

        print(
            '------------- DUMMY BASELINE MODEL (Prior) always predicts the class that maximizes the class prior (like “most_frequent”) and predict_proba returns the class prior. ------------- '
        )
        dummy_clf = DummyClassifier(strategy="prior", random_state=42)
        y_predict_PRIOR = dummy_clf.fit(X_train, y_train).predict(X_test)
        accuracy_prior = _print_classification_scores(y_test, y_predict_PRIOR)

        exported_prediction = y_predict_MOSTFREQ
        baseline_scores = [
            accuracy_nochange, accuracy_stratified, accuracy_mostfrequent,
            accuracy_prior
        ]
        xLabelNames = [
            'Default', 'RF Optimized', '68 Hyp..', '14 Hyp..', 'No-change',
            'Stratified', 'Most Frequent', 'Prior'
        ]

    # Box-plot data: stored results of our own models, then one single-value
    # "box" per baseline.
    data = _read_model_results(intervalType, MachineLearningMethod, FeeModel)
    data += [[score] for score in baseline_scores]

    # Export the predicted model. Copy the test slice so the column
    # assignments below act on an independent frame rather than on a slice.
    df = df[n_train:].copy()
    df['Predicted'] = exported_prediction
    df['Change'] = df['Close'].pct_change(periods=1)  # percentage change
    df = df[['Timestamp', 'Close', 'Change', 'Target', 'Predicted']]
    nameOfExportedModel = intervalType + "_" + MachineLearningMethod
    df.to_csv("../PredictedModels/baselineModels/" + nameOfExportedModel +
              ".csv")

    matplotlib.use("pgf")
    matplotlib.rcParams.update({
        "pgf.texsystem": "pdflatex",
        'font.family': 'serif',
        'text.usetex': True,
        'pgf.rcfonts': False,
    })
    fig, ax = plt.subplots(figsize=(6.69, 4))
    ax.set_title(MachineLearningMethod + ' Baselines for interval: ' +
                 intervalType)
    ax.boxplot(data)
    # boxplot() installs its own tick locations and labels, so the names must
    # be applied afterwards; set beforehand they were silently replaced.
    ax.set_xticklabels(xLabelNames)
    plt.savefig('histogram.pgf')
    plt.show()
    exit()
# --- Classification comparison: Random Forest vs. a stratified dummy ---
imprime_titulo("Comparación de clasificación")

# Random Forest appended as the final step after the shared ``preprocesado``
# steps (presumably a list of (name, transformer) pairs -- confirm upstream).
randomf_clasif = [("Random Forest", RandomForestClassifier(n_estimators=100))]
clasificador_randomf = Pipeline(preprocesado + randomf_clasif)

# ``mensaje`` is used as a context manager around the fit.
with mensaje("Ajustando modelo de clasificación Random Forest"):
    clasificador_randomf.fit(digits_tra_x, digits_tra_y)
y_clasif_randomf = clasificador_randomf.predict(digits_test_x)
muestra_confusion(digits_test_y, y_clasif_randomf, "Random Forest")
estima_error_clasif(clasificador_randomf, digits_tra_x, digits_tra_y, digits_test_x, digits_test_y, "RandomForest")

# Baseline: dummy classifier with the "stratified" strategy, fitted on the raw
# training data (it is not wrapped in the preprocessing pipeline).
dummy_clasif = DummyClassifier(strategy="stratified")
dummy_clasif.fit(digits_tra_x, digits_tra_y)
estima_error_clasif(dummy_clasif, digits_tra_x, digits_tra_y, digits_test_x, digits_test_y, "Estratificado (Dummy)")
espera()

# --- Regression comparison: same pipeline layout on the airfoil data ---
imprime_titulo("Comparación de regresión")
randomf_regr = [("Random Forest", RandomForestRegressor(n_estimators=100))]
regresor_randomf = Pipeline(preprocesado + randomf_regr)
with mensaje("Ajustando modelo de regresión Random Forest"):
    regresor_randomf.fit(airfoil_tra_x, airfoil_tra_y)
estima_error_regresion(regresor_randomf, airfoil_tra_x, airfoil_tra_y, airfoil_test_x, airfoil_test_y, "RandomForest")
print(spam)
print(messages.groupby('label').describe())
print('\nВыборка несбалансированна, неспам - 4825, спам - 747, примеров спама гораздо меньше')

# Convert the labels from str to int (ham -> 0, spam -> 1).
messages['label'] = messages['label'].map({'ham': 0, 'spam': 1}).astype(int)

# Vectorization: learn the vocabulary and build the bag-of-words matrix in a
# single pass. The result of fit_transform used to be thrown away and the
# whole corpus transformed a second time.
bow = CountVectorizer()
bowed_messages = bow.fit_transform(messages['message'])

# Train the DummyClassifier (always predicts the most frequent class).
clf = DummyClassifier(strategy='most_frequent', random_state=0)
clf = clf.fit(bowed_messages, messages['label'])

# Report the results of the dummy classifier on the data it was fitted on.
print(classification_report(messages['label'], clf.predict(bowed_messages)))
print('Dummy classifier, который будет всем новым наблюдениям присваивать класс ham, получит 75% precission и 87 - recall, 80 - f-score')

# print('\nNaive Bayes 1')
# naive_model = MultinomialNB()
# naive_model.fit(bowed_messages, messages['label'])
# # print(len(msg_train), len(msg_test))
# cv_results = cross_val_score(naive_model, bowed_messages, messages['label'], cv=10, scoring='accuracy')
# print(cv_results.mean(), cv_results.std())
# print(classification_report(messages['label'], naive_model.predict(bowed_messages)))

# Split the raw messages and labels 80:20 into train and test parts.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'], messages['label'], test_size=0.2)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from nlp4musa2020.dataloaders.alf200k import ALF200KLoader, genre_target_labels
import nlp4musa2020.evaluators as evaluators
from nlp4musa2020.models.simplenn_genre import SimpleGenreNN

# Data: load only the 'explicitness' feature group from the pickled dataset,
# without text vectorizers, using the genre labels as the target.
dataloader = ALF200KLoader('data/processed/dataset-lfm-genres.pickle',
                           load_feature_groups=[
                               'explicitness',
                           ],
                           text_vectorizers=None,
                           target=genre_target_labels())

# Model: standard-scaled features fed to a uniformly random dummy classifier.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', DummyClassifier(strategy="uniform")),
])

# Evaluation: a one-point grid that only fixes the dummy's random seed.
# NOTE(review): ``GridEvaluator`` is not imported in the lines visible here
# (only the ``evaluators`` module is) -- confirm it is imported elsewhere.
evaluator = GridEvaluator(
    parameters={
        "model__random_state": [42],
    },
    grid_parameters=evaluators.grid_parameters_genres(),
)

# NOTE(review): the right-hand side reads ``result_handlers`` before this
# assignment, so a module of that name must already be imported elsewhere;
# the assignment then rebinds the name to a list -- confirm this is intended.
result_handlers = [
    result_handlers.print_gridsearch_results,
]
# In[107]: from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(dataset, y, test_size=0.2, random_state=0) print(X_train.shape, y_train.shape) print(X_test.shape, y_test.shape) # ## Dummy classifier # In[109]: clfDummy = DummyClassifier(strategy='most_frequent', random_state=0) clfDummy.fit(X_train, y_train) # In[110]: clfDummy.score(X_test, y_test) # In[111]: dump(clfDummy, 'dummyClf.joblib') # ## Ridge classifier # In[122]: #hyperparameter tuning
if __name__ == '__main__': ########################################################### # Settings #!!! USED_EXAMPLES_NUMBER = None # 'None' means that all examples are used; otherwise randomly selected #!!! OBJECTIVE_NAME = 'Sex' # e.g. 'BMIgr', 'Sex', 'cl_sleep_interval' #!!!! sample_name = OBJECTIVE_NAME + '_1' # train-test filename SEED = 0 classifiers = [ ( "Dummy", DummyClassifier(strategy='stratified') ), # see http://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html ("Nearest Neighbors", KNeighborsClassifier(3)), # ("Linear SVM", SVC(kernel="linear", C=0.025)), # ("RBF SVM", SVC(gamma=2, C=1)), # ("Decision Tree", DecisionTreeClassifier(max_depth=5)), ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)), # ("AdaBoost", AdaBoostClassifier()), ("Naive Bayes", GaussianNB()) ] # TODO: xgboost ############################################################### # Initial configuration np.random.seed(SEED) logg.configure_logging(
# Label each row: 1 = buy, -1 = sell, 0 = hold, from the future return.
df['Target'] = 0
df.loc[df['FutureReturn'] > buy_threshold, 'Target'] = 1
df.loc[df['FutureReturn'] < sell_threshold, 'Target'] = -1

# Train/test split
# Drop the last two columns by position: 'Target' (just appended) and the
# column before it (presumably 'FutureReturn' -- confirm the column order).
# ``DataFrame.ix`` was removed in pandas 1.0; ``iloc`` is the positional
# equivalent of the old call.
X = df.iloc[:, :-2]
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0)  # Default: train 75%, test 25%

# Fit ensemble classifier
clf = VotingClassifier([('knn', KNeighborsClassifier()),
                        ('rfor', RandomForestClassifier(random_state=0)),
                        ('lsvc', LinearSVC())]).fit(X_train, y_train)
# Baseline that draws labels from the training class distribution.
dummy = DummyClassifier(strategy='stratified').fit(X_train, y_train)

print('Accuracy of Ensemble classifier on training set: {:.2f}'.format(
    clf.score(X_train, y_train)))
print('Accuracy of Ensemble classifier on test set: {:.2f}'.format(
    clf.score(X_test, y_test)))
print('Prediction Spread:', Counter(clf.predict(X_test)))
print('Accuracy of Dummy classifier on test set: {:.2f}'.format(
    dummy.score(X_test, y_test)))
print('Dummy Prediction Spread:', Counter(dummy.predict(X_test)))

# Plot predictions
X_test = X_test.sort_index()
# A Series indexed like the test rows aligns on the index when assigned, so
# rows outside the test set become NaN.
df['predictions'] = pd.Series(clf.predict(X_test), index=X_test.index)
fig, ax = plt.subplots()
k: int(v / self.factor) for k, v in counts.items() }) X_resampled, y_resampled = self.undersampler_.fit_resample(X, y) if self.oversampler is not None: self.oversampler_ = clone(self.oversampler).set_params( random_state=self.random_state, sampling_strategy=dict(counts)) X_resampled, y_resampled = self.oversampler_.fit_resample( X_resampled, y_resampled) return X_resampled, y_resampled SCORERS['geometric_mean_score'] = make_scorer(geometric_mean_score) CONFIG = { 'classifiers': [ ('CONSTANT CLASSIFIER', DummyClassifier(strategy='constant', constant=0), {}), ('LR', LogisticRegression(solver='liblinear', multi_class='auto'), {}), ('KNN', KNeighborsClassifier(), { 'n_neighbors': [3, 5] }), ('DT', DecisionTreeClassifier(), { 'max_depth': [3, 6] }), ('GBC', GradientBoostingClassifier(), { 'max_depth': [3, 6], 'n_estimators': [50, 100] }), ], 'scoring': ['accuracy', 'geometric_mean_score'], 'n_splits': 5,
# Cross-validate a trivial baseline on every benchmark problem and print the
# mean score per problem.
benchmark_dir = os.environ["AMM_DATASET_DIR"]

for p in BENCHMARK_FULL_SET:
    pname = p["name"]
    print("Loading {}".format(pname))
    df = pd.read_pickle(os.path.join(benchmark_dir, p["data_pickle"]))
    target = p["target"]
    ltype = p["problem_type"]

    # Anything that is neither regression nor classification is rejected.
    if ltype not in (AMM_REG_NAME, AMM_CLF_NAME):
        raise ValueError("problem type {} is not known.".format(ltype))

    if ltype == AMM_REG_NAME:
        # Regression: mean predictor; the scorer is negated, so flip its sign.
        kf = KFold(n_splits=5, random_state=18012019, shuffle=True)
        estimator = DummyRegressor(strategy="mean")
        scoring, multiplier = "neg_mean_absolute_error", -1
    else:
        # Classification: stratified random predictor, scored with ROC AUC.
        kf = StratifiedKFold(n_splits=5, random_state=18012019, shuffle=True)
        estimator = DummyClassifier(strategy="stratified")
        scoring, multiplier = "roc_auc", 1

    features = df.drop(columns=[target])
    cvs = multiplier * cross_val_score(
        estimator, features, y=df[target], scoring=scoring, cv=kf)
    mean_cvs = np.mean(cvs)
    print(pname, mean_cvs)
def test_classifier_exceptions():
    """An unknown strategy makes fit, predict and predict_proba raise ValueError."""
    clf = DummyClassifier(strategy="unknown")
    # (method name, positional arguments), checked in this order.
    failing_calls = (
        ("fit", ([], [])),
        ("predict", ([],)),
        ("predict_proba", ([],)),
    )
    for method_name, call_args in failing_calls:
        assert_raises(ValueError, getattr(clf, method_name), *call_args)
from sklearn.feature_extraction.text import CountVectorizer c = CountVectorizer(stop_words='english') from sklearn.linear_model import LogisticRegression lr = LogisticRegression() predict(X, y, c, lr) """#### Accuracy is around 93.9% - not bad. However we notice that some of those significant coefficients are not meaningful, e.g. 280mg.""" from sklearn.dummy import DummyClassifier ### calling function for dummy classifier text_fit(X, y, c, DummyClassifier(), 0) """#### Logistic regression model on TFIDF""" from sklearn.feature_extraction.text import TfidfVectorizer tfidf = TfidfVectorizer(stop_words='english') text_fit(X, y, tfidf, LogisticRegression()) from sklearn.feature_extraction.text import TfidfVectorizer tfidf = TfidfVectorizer(stop_words='english') predict(X, y, tfidf, LogisticRegression()) """Accurany is roughly the same - 93.5%. However we notice that the significant words make much more sense now, with higher coefficient magnitude as well!""" """#### Upvote prediction We will be focusing on score 5 reviews, and get rid of comments with neutral votes
def test_classifier_score_with_None(y, y_test):
    """A most-frequent dummy fitted and scored with ``X=None`` scores 0.5."""
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(None, y)
    accuracy = clf.score(None, y_test)
    assert_equal(accuracy, 0.5)
knn.fit(X_train, y_train) prediction = knn.predict_proba(X_test) fpr, tpr, _ = roc_curve(y_test, prediction[:, 1]) auc_score = roc_auc_score(y_test, prediction[:, 1]) print("AUC Score:", auc_score) plt.plot(fpr, tpr, color='red', label='K-Neighbours') knn.fit(X_train, y_train) preds_train = knn.predict(X_train) preds_test = knn.predict(X_test) print_results(preds_train, y_train, "KNN train") print_results(preds_test, y_test, "KNN test") # baseline classifier dummy = DummyClassifier(strategy='most_frequent') dummy.fit(X_train, y_train) preds_train = dummy.predict(X_train) preds_test = dummy.predict(X_test) print_results(preds_train, y_train, "Dummy train") print_results(preds_test, y_test, "Dummy test") # baseline confusion matrix for plotting point matrix = confusion_matrix(y_train, preds_train) most_freq_fpr = matrix[0][1] / (matrix[0][1] + matrix[0][0]) # FP / (FP + TN) most_freq_tpr = matrix[1][1] / (matrix[1][1] + matrix[1][0]) # TP / (TP + FN) plt.plot(most_freq_fpr, most_freq_tpr,
def test_string_labels():
    """The most-frequent strategy works with string class labels."""
    n_samples = 5
    X = [[0]] * n_samples
    # "paris" occurs twice, every other city once.
    y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]

    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(X, y)

    expected = ["paris"] * n_samples
    assert_array_equal(clf.predict(X), expected)
pickle.dump(clf, open('multi_tfidf.sav', 'wb')) pred_tfidf_balanced = clf.predict(vect_tfidf.transform(X_test)) print(" Tf-idf, Balanced accuracy score = " + str(balanced_accuracy_score(y_test, pred_tfidf_balanced))) print(" Tf-idf, Accuracy score = " + str(accuracy_score(y_test, pred_tfidf_balanced))) report_tfidf = classification_report(y_test, pred_tfidf_balanced) # count balanced accuracy: 42 # count accuracy: 43.5 # tf-idf balanced accuracy: 43.5 # tf-idf accuracy: 41.5 ''' -------------------- Baseline Models ------------------- ''' dummy_clf = DummyClassifier(strategy="most_frequent") dummy_clf.fit(X_train, y_train) DummyClassifier(strategy='uniform') dummy_clf.predict(X_train) dummy_clf.score(X_test, y_test) # Multilabel # Most frequent: 19.7 % # Stratified: 14.2 % # Uniform: 11.3 # Binary 50.46 % ''' ------------------- Feature importance -----------------
action = args.action

# Load the full data set, then organize instances and outcomes together.
dataset, instances, outcomes = get_data(filename)
instances, outcomes = data_organizer(instances, outcomes)
assert len(instances) == len(outcomes)

# Derive the label array from the outcomes.
labels = generate_labels(outcomes)

# Hold out 30% of the data as the dev/test set.
size_of_test_set = 0.3
instance_train, instance_test, labels_train, labels_test = train_test_split(
    instances, labels, test_size=size_of_test_set)
assert len(instance_train) == len(labels_train)
assert len(instance_test) == len(labels_test)

# Real classifier, trained on the training split.
classifier = NBclassify(instance_train, labels_train)

# Baseline that guesses uniformly at random among the training labels.
baseline = DummyClassifier(strategy='uniform')
dumb_clf = baseline.fit(instance_train, labels_train)

# Compare both models on the held-out split.
evaluate(classifier, dumb_clf, instance_test, labels_test)
# Load the digits data and print how many samples each class has.
dataset=load_digits()
X,y = dataset.data, dataset.target
for class_name,class_count in zip(dataset.target_names, np.bincount(dataset.target)):
    print(class_name, class_count)

# Build an imbalanced binary problem: digit 1 stays 1, every other digit
# becomes 0.
y_binary_imbalanced = y.copy()
y_binary_imbalanced[y_binary_imbalanced !=1] = 0
print('Original labels:\t', y[1:30])
print('New binary labels:\t', y_binary_imbalanced[1:30])
np.bincount(y_binary_imbalanced)

X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)

# RBF-kernel SVM on the imbalanced labels; the bare expression shows its test
# accuracy when run in a notebook.
svm = SVC(kernel="rbf", C=1).fit(X_train,y_train)
svm.score(X_test,y_test)

# Dummy classifiers serve as a sanity check; they are not real classifiers.
from sklearn.dummy import DummyClassifier
# Available strategies: most_frequent, stratified (random draws following the
# training class distribution), uniform (uniformly random), constant (useful
# when the positive class is the minority).
dummy_majority= DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
y_dummy_pred=dummy_majority.predict(X_test)
y_dummy_pred
dummy_majority.score(X_test,y_test)

### dummy classifier: for sanity check
### dummy regressors
## strategy: mean, median, quantile, constant

# Confusion matrix of the majority-class baseline.
from sklearn.metrics import confusion_matrix
dummy_majority = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
y_majority_pred = dummy_majority.predict(X_test)
confusion_dummy =confusion_matrix(y_test, y_majority_pred)
confusion_dummy
# Layout for binary labels (rows = true class, columns = predicted class):
# [ [TN, FP], [FN, TP] ]
from sklearn.dummy import DummyClassifier

# Baseline with the default strategy; report its accuracy on the training data.
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
print(accuracy_score(y_true=y_train, y_pred=dummy.predict(X_train)))
def fit(self, X, y, sample_weight=None):
    """Fit all base estimators.

    Every estimator is fitted once per cross-validation fold; the
    out-of-fold predictions are scored with the configured metric and the
    per-fold scores, their mean and standard deviation are stored. With
    ``variant='B'`` each estimator is additionally fitted on the full
    training set.

    Parameters
    ----------
    X : 2d numpy array or sparse matrix of shape [n_samples, n_features]
        Training data
    y : 1d numpy array of shape [n_samples]
        Target values.
    sample_weight : 1d numpy array of shape [n_samples]
        Individual weights for each sample.
        Passed to fit method of each estimator.
        Note: will be split automatically for each fold.

    Returns
    -------
    self : object
        Fitted StackingTransformer instance.

    Raises
    ------
    ValueError
        If ``estimators`` is an empty list, if ``sample_weight`` is given
        but an estimator's ``fit`` does not accept it, or if ``variant``,
        ``n_folds`` or ``verbose`` hold an unsupported value.
    """
    # ---------------------------------------------------------------------
    # Validation: input data
    # ---------------------------------------------------------------------
    # ``check_estimator`` does not allow ``force_all_finite=False``
    X, y = check_X_y(
        X,
        y,
        accept_sparse=['csr'],  # allow csr, cast all others to csr
        force_all_finite=True,  # do not allow nan and inf
        multi_output=False)  # allow only one column in y_train

    # X is already checked, but we need it to compare length of sample_weight
    if sample_weight is not None:
        X, sample_weight = check_X_y(X,
                                     sample_weight,
                                     accept_sparse=['csr'],
                                     force_all_finite=True,
                                     multi_output=False)

    # ---------------------------------------------------------------------
    # Validation: ``estimators``
    # ---------------------------------------------------------------------
    if self.estimators is None:
        # No estimators given: fall back to a single constant dummy model.
        if self.regression:
            self.estimators_ = [('dumregr',
                                 DummyRegressor(strategy='constant',
                                                constant=5.5))]
        else:
            self.estimators_ = [('dumclf',
                                 DummyClassifier(strategy='constant',
                                                 constant=1))]
        # warnings.warn('No estimators were specified. '
        #               'Using single dummy estimator as demo.', UserWarning)
    else:
        if 0 == len(self.estimators):
            raise ValueError('List of estimators is empty')
        else:
            # Clone so that the user's estimator objects are never fitted
            self.estimators_ = [(name, clone(estim))
                                for name, estim in self.estimators]
            # Check names of estimators
            names, estims = zip(*self.estimators_)
            self._validate_names(names)
            # Check if all estimators support ``sample_weight``
            if sample_weight is not None:
                for name, estim in self.estimators_:
                    if not has_fit_parameter(estim, 'sample_weight'):
                        raise ValueError(
                            'Underlying estimator [%s] does not '
                            'support sample weights.' % name)

    # ---------------------------------------------------------------------
    # Validation: other StackingTransformer parameters
    # ---------------------------------------------------------------------
    # ``variant``
    if self.variant not in ['A', 'B']:
        raise ValueError('Parameter ``variant`` must be set properly')
    # ``n_folds``
    if not isinstance(self.n_folds, int):
        raise ValueError('Parameter ``n_folds`` must be integer')
    if not self.n_folds > 1:
        raise ValueError('Parameter ``n_folds`` must be not less than 2')
    # ``verbose``
    if self.verbose not in [0, 1, 2]:
        raise ValueError('Parameter ``verbose`` must be 0, 1, or 2')

    # If ``regression=True`` we ignore classification-specific
    # parameters and issue user warning
    if self.regression and (self.needs_proba or self.stratified):
        # The first fragment needs its trailing space; without it the message
        # read "classification-specificparameters".
        warn_str = ('This is regression task hence classification-specific '
                    'parameters set to ``True`` were ignored:')
        if self.needs_proba:
            self.needs_proba = False
            warn_str += ' ``needs_proba``'
        if self.stratified:
            self.stratified = False
            warn_str += ' ``stratified``'
        warnings.warn(warn_str, UserWarning)

    # ---------------------------------------------------------------------
    # Compute attributes (basic properties of data, number of estimators, etc.)
    # ---------------------------------------------------------------------
    self.train_shape_ = X.shape
    self.n_train_examples_ = X.shape[0]
    self.n_features_ = X.shape[1]
    if not self.regression:
        self.n_classes_ = len(np.unique(y))
    else:
        self.n_classes_ = None
    self.n_estimators_ = len(self.estimators_)
    self.train_footprint_ = self._get_footprint(X)

    # ---------------------------------------------------------------------
    # Specify default metric
    # ---------------------------------------------------------------------
    if self.metric is None and self.regression:
        self.metric_ = mean_absolute_error
    elif self.metric is None and not self.regression:
        if self.needs_proba:
            self.metric_ = log_loss
        else:
            self.metric_ = accuracy_score
    else:
        self.metric_ = self.metric

    # ---------------------------------------------------------------------
    # Create report header strings and print report header
    # ---------------------------------------------------------------------
    if self.verbose > 0:
        if self.regression:
            task_str = 'task: [regression]'
        else:
            task_str = 'task: [classification]'
            n_classes_str = 'n_classes: [%d]' % self.n_classes_
        metric_str = 'metric: [%s]' % self.metric_.__name__
        variant_str = 'variant: [%s]' % self.variant
        n_estimators_str = 'n_estimators: [%d]' % self.n_estimators_

        print(task_str)
        if not self.regression:
            print(n_classes_str)
        print(metric_str)
        print(variant_str)
        print(n_estimators_str + '\n')

    # ---------------------------------------------------------------------
    # Initialize cross-validation split
    # Stratified can be used only for classification
    # ---------------------------------------------------------------------
    if not self.regression and self.stratified:
        self.kf_ = StratifiedKFold(n_splits=self.n_folds,
                                   shuffle=self.shuffle,
                                   random_state=self.random_state)
        # Save target to be able to create stratified split in ``transform``
        # method. This is more efficient than to save split indices
        self._y_ = y.copy()
    else:
        self.kf_ = KFold(n_splits=self.n_folds,
                         shuffle=self.shuffle,
                         random_state=self.random_state)
        self._y_ = None

    # ---------------------------------------------------------------------
    # Compute implicit number of classes to create appropriate empty arrays.
    # !!! Important. In order to unify array creation
    # variable ``n_classes_implicit_`` is always equal to 1, except the case
    # when we performing classification task with ``needs_proba=True``
    # ---------------------------------------------------------------------
    if not self.regression and self.needs_proba:
        # Classification branch, so ``n_classes_`` was computed above.
        self.n_classes_implicit_ = self.n_classes_
        self.action_ = 'predict_proba'
    else:
        self.n_classes_implicit_ = 1
        self.action_ = 'predict'

    # ---------------------------------------------------------------------
    # Create empty numpy array for train predictions (OOF)
    # !!! Important. We have to implicitly predict during fit
    # in order to compute CV scores, because
    # the most reasonable place to print out CV scores is fit method
    # ---------------------------------------------------------------------
    S_train = np.zeros(
        (X.shape[0], self.n_estimators_ * self.n_classes_implicit_))

    # ---------------------------------------------------------------------
    # Prepare (clone) estimators for fitting and storing
    # We need models_A_ for both variant A and variant B
    # We need models_B_ for variant B only
    # (in variant A attribute models_B_ is None)
    # ---------------------------------------------------------------------
    self.models_A_ = []
    self.models_B_ = None
    for n, est in self.estimators_:
        self.models_A_.append([clone(est) for _ in range(self.n_folds)])
    if self.variant in ['B']:
        self.models_B_ = [clone(est) for n, est in self.estimators_]

    # ---------------------------------------------------------------------
    # Create empty numpy array to store scores for each estimator and each fold
    # ---------------------------------------------------------------------
    self.scores_ = np.zeros((self.n_estimators_, self.n_folds))

    # ---------------------------------------------------------------------
    # Create empty list to store name, mean and std for each estimator
    # ---------------------------------------------------------------------
    self.mean_std_ = []

    # ---------------------------------------------------------------------
    # MAIN FIT PROCEDURE
    # ---------------------------------------------------------------------
    # Loop across estimators
    # ---------------------------------------------------------------------
    for estimator_counter, (name, estimator) in enumerate(self.estimators_):
        if self.verbose > 0:
            estimator_str = 'estimator %2d: [%s: %s]' % (
                estimator_counter, name, estimator.__class__.__name__)
            print(estimator_str)

        # -----------------------------------------------------------------
        # Loop across folds
        # -----------------------------------------------------------------
        for fold_counter, (tr_index,
                           te_index) in enumerate(self.kf_.split(X, y)):
            # Split data and target
            X_tr = X[tr_index]
            y_tr = y[tr_index]
            X_te = X[te_index]
            y_te = y[te_index]

            # Split sample weights accordingly (if passed)
            if sample_weight is not None:
                sample_weight_tr = sample_weight[tr_index]
                # sample_weight_te = sample_weight[te_index]
            else:
                sample_weight_tr = None
                # sample_weight_te = None

            # Fit estimator
            _ = self._estimator_action(
                self.models_A_[estimator_counter][fold_counter],
                X_tr,
                y_tr,
                None,
                sample_weight=sample_weight_tr,
                action='fit',
                transform=self.transform_target)

            # Predict out-of-fold part of train set.
            # With probabilities each estimator owns a block of
            # ``n_classes_implicit_`` columns, otherwise a single column.
            if 'predict_proba' == self.action_:
                col_slice_estimator = slice(
                    estimator_counter * self.n_classes_implicit_,
                    estimator_counter * self.n_classes_implicit_ +
                    self.n_classes_implicit_)
            else:
                col_slice_estimator = estimator_counter
            S_train[te_index, col_slice_estimator] = self._estimator_action(
                self.models_A_[estimator_counter][fold_counter],
                None,
                None,
                X_te,
                action=self.action_,
                transform=self.transform_pred)

            # Compute score
            score = self.metric_(y_te, S_train[te_index, col_slice_estimator])
            self.scores_[estimator_counter, fold_counter] = score

            # Print fold score
            if self.verbose > 1:
                fold_str = ' fold %2d: [%.8f]' % (fold_counter, score)
                print(fold_str)

        # Compute mean and std and save them with the estimator name
        estim_name = self.estimators_[estimator_counter][0]
        estim_mean = np.mean(self.scores_[estimator_counter])
        estim_std = np.std(self.scores_[estimator_counter])
        self.mean_std_.append((estim_name, estim_mean, estim_std))

        if self.verbose > 1:
            sep_str = ' ----'
            print(sep_str)

        # Compute mean + std (and full)
        if self.verbose > 0:
            mean_str = ' MEAN: [%.8f] + [%.8f]\n' % (estim_mean, estim_std)
            print(mean_str)

        # Fit estimator on full train set
        if self.variant in ['B']:
            if self.verbose > 0:
                print(' Fitting on full train set...\n')
            _ = self._estimator_action(self.models_B_[estimator_counter],
                                       X,
                                       y,
                                       None,
                                       sample_weight=sample_weight,
                                       action='fit',
                                       transform=self.transform_target)

    # ---------------------------------------------------------------------
    # Return fitted StackingTransformer instance
    # ---------------------------------------------------------------------
    return self
def classify(data, descriptors, descriptors_to_labels, print_predictions,
             print_results):
    """Fit classifiers on a random split of ``data`` and score them.

    The rows of ``data`` are shuffled, the first ``TRAINING_SET_PROP``
    fraction is used for training and the rest for testing.  A multinomial
    logistic regression is scored per test row and, by summing its decision
    values over all test rows that share a descriptor, per descriptor.  A
    ridge classifier and a stratified dummy classifier are scored on the
    same split for comparison.

    Relies on the module-level names ``TRAINING_SET_PROP`` and
    ``label_nums`` (label name -> label number).

    Args:
        data: 2-D numpy array.  The last two columns hold the label number
            and the index into ``descriptors``; the remaining columns are
            the feature vector.  NOTE: the array is shuffled in place.
        descriptors: sequence of descriptor strings, indexed by the value
            stored in the last column of ``data``.
        descriptors_to_labels: mapping from descriptor to its true label
            number.
        print_predictions: if true, print every per-row and per-descriptor
            prediction.
        print_results: if true, print the four summary scores.

    Returns:
        dict with the keys ``"ridge"``, ``"log_reg"``, ``"desc_classify"``
        and ``"dummy"``, each mapping to an accuracy.
    """
    # number of data points
    num_data = data.shape[0]

    # randomly shuffle the rows; this reorders the caller's array as well
    np.random.shuffle(data)

    # split the columns into feature vectors, labels and descriptor indices
    vectors = data[:, :-2]
    labels = data[:, -2]
    indices = data[:, -1]

    # the first 'num_training' rows train the models, the rest test them
    num_training = int(TRAINING_SET_PROP * num_data)
    num_test = num_data - num_training

    training_vectors = vectors[:num_training, :]
    training_labels = labels[:num_training]

    test_vectors = vectors[num_training:, :]
    test_labels = labels[num_training:]
    test_indices = indices[num_training:]

    # set up logistic regression classifier and fit to data
    log_reg_clf = LogisticRegression(
        solver='newton-cg', max_iter=50, random_state=0,
        multi_class='multinomial', verbose=0,
    ).fit(training_vectors, training_labels)

    # classifier's predictions and decision values for each test vector
    predictions = log_reg_clf.predict(test_vectors)
    confidences = log_reg_clf.decision_function(test_vectors)

    # label number -> label name, built once instead of scanning label_nums
    # for every row; setdefault keeps the first name if numbers repeat
    label_names = {}
    for name, num in label_nums.items():
        label_names.setdefault(num, name)

    # number of correctly classified instances
    num_correct_inst = 0

    # descriptor -> list of (predicted label, confidence) pairs
    descriptor_classifications = {}

    for actual, prediction, decision_row, raw_index in zip(
            test_labels, predictions, confidences, test_indices):
        actual_str = label_names[actual]
        prediction_str = label_names[prediction]

        # NOTE(review): assumes label numbers coincide with the column
        # positions of decision_function -- confirm against label_nums
        confidence = decision_row[int(prediction)]

        # The descriptor of this row is given by its stored index, not by
        # its position in the shuffled test split.
        descriptor = descriptors[int(raw_index)]

        is_correct = prediction == actual
        if is_correct:
            num_correct_inst += 1

        if print_predictions:
            if is_correct:
                print("---Correct Prediction: (" + str(confidence) + "):"
                      + descriptor)
            else:
                print("***Incorrect Prediction (" + str(confidence) + "):"
                      + descriptor)
            print("\t Actual - " + actual_str)
            print("\t Predicted - " + prediction_str)

        # record this prediction, with its confidence, for the descriptor
        descriptor_classifications.setdefault(descriptor, []).append(
            (prediction, float(confidence)))

    # number of correctly classified descriptors
    num_correct_desc = 0

    for descriptor, classifications in descriptor_classifications.items():
        # sum of the confidences for each label, starting from 0
        confidences_per_label = {num: 0 for num in label_nums.values()}
        for label_num, label_confidence in classifications:
            confidences_per_label[label_num] += label_confidence

        # take a confidence vote: the label with the largest summed value
        descriptor_label_prediction = max(confidences_per_label,
                                          key=confidences_per_label.get)
        confidence = confidences_per_label[descriptor_label_prediction]

        actual_label = descriptors_to_labels[descriptor]
        actual_str = label_names[actual_label]
        prediction_str = label_names[descriptor_label_prediction]

        is_correct = descriptor_label_prediction == actual_label
        if is_correct:
            num_correct_desc += 1

        if print_predictions:
            if is_correct:
                print("-Correct Prediction (" + str(confidence) + "):"
                      + descriptor)
            else:
                print("*Incorrect Prediction (" + str(confidence) + "):"
                      + descriptor)
            print("\t Actual - " + actual_str)
            print("\t Predicted - " + prediction_str)

    log_reg_result = float(num_correct_inst) / float(num_test)
    desc_classify_result = float(num_correct_desc) / \
        float(len(descriptor_classifications))

    # baselines scored on the same split
    dummy_clf = DummyClassifier(strategy='stratified')
    dummy_clf.fit(training_vectors, training_labels)
    dummy_result = dummy_clf.score(test_vectors, test_labels)

    ridge_clf = RidgeClassifier(solver='auto')
    ridge_clf.fit(training_vectors, training_labels)
    ridge_result = ridge_clf.score(test_vectors, test_labels)

    if print_results:
        print('Ridge result: ' + str(ridge_result))
        print("Logistic Regression result: " + str(log_reg_result))
        print("Descriptor classification result: " + str(desc_classify_result))
        print('Random result: ' + str(dummy_result))

    # return relevant scores
    return {
        "ridge": ridge_result,
        "log_reg": log_reg_result,
        "desc_classify": desc_classify_result,
        "dummy": dummy_result,
    }
print(
    f"The mean cross-validation accuracy is: {scores.mean():.3f} "
    f"+/- {scores.std():.3f}"
)

# %% [markdown] tags=["solution"]
# Using an arbitrary mapping from string labels to integers as done here causes
# the linear model to make bad assumptions on the relative ordering of
# categories.
#
# This prevents the model from learning anything predictive enough and the
# cross-validated score is even lower than the baseline we obtained by ignoring
# the input data and just constantly predicting the most frequent class:

# %% tags=["solution"]
from sklearn.dummy import DummyClassifier

# Baseline: cross-validate a classifier that always predicts the most
# frequent class.
cv_results = cross_validate(
    DummyClassifier(strategy="most_frequent"),
    data_categorical,
    target,
)
scores = cv_results["test_score"]
print(
    f"The mean cross-validation accuracy is: {scores.mean():.3f} "
    f"+/- {scores.std():.3f}"
)

# %% [markdown]
# Now, we would like to compare the generalization performance of our previous
# model with a new model where instead of using an `OrdinalEncoder`, we will
# use a `OneHotEncoder`. Repeat the model evaluation using cross-validation.
# Compare the score of both models and conclude on the impact of choosing a
# specific encoding strategy when using a linear model.

# %%
from sklearn.preprocessing import OneHotEncoder
final_decision='only', allow_empty='False', pretrained_weights='scibert_scivocab_uncased', remove_duplicates=True, remove_stopwords=False) embeddings_input = data_loader.read_embeddigns_from_file() number_of_reviews = torch.tensor( [reviews.shape[0] for reviews in embeddings_input]) embeddings_input = rnn.pad_sequence( embeddings_input, batch_first=True).numpy() # pad the reviews to form a tensor print(embeddings_input.shape) labels = data_loader.read_labels().numpy() majoriy_clf = DummyClassifier(strategy='most_frequent') preds = cross_val_predict(majoriy_clf, embeddings_input, labels, cv=5) print('5-CV Majority Classifier:\n', classification_report(labels, preds, output_dict=True)) valid_size = 0.1 num_train = embeddings_input.shape[0] indices = list(range(num_train)) split = int(np.floor(valid_size * num_train)) train_idx, test_idx = indices[split:], indices[:split] test_embeddings_input = embeddings_input[test_idx, :, :]