def test_eigenpro_classification_conflict_data():
    """Make sure that the classifier doesn't crash when given conflicting
    input data."""
    X, y = make_classification(random_state=1)
    X, y = np.concatenate([X, X]), np.concatenate([y, 1 - y])
    # Make sure we don't throw an error when fitting or predicting
    EigenProClassifier(
        kernel="linear", n_epoch=5, random_state=1
    ).fit(X, y).predict(X)
def test_eigenpro_classification_duplicate_data():
    """
    Make sure that the classifier correctly handles cases where some
    data is repeated.
    """
    X, y = make_classification(n_features=200, n_repeated=50, random_state=1)
    prediction = (
        EigenProClassifier(kernel="rbf", n_epoch=60, gamma=0.002,
                           random_state=1)
        .fit(X, y)
        .predict(X)
    )
    assert_allclose(prediction, y, rtol=5e-3)
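# Illustrative sketch (not part of the original test suite): the tests above
# exercise the same chained fit/predict round trip on degenerate label sets.
# The helper below shows that pattern in isolation; the parameter values are
# assumptions chosen for a fast run, not tuned settings.
def example_eigenpro_fit_predict():
    X, y = make_classification(n_samples=200, random_state=0)
    clf = EigenProClassifier(kernel="rbf", n_epoch=10, gamma=0.01,
                             random_state=0)
    # Chained fit/predict, mirroring the usage in the tests above.
    return clf.fit(X, y).predict(X)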
eig_err = []
svc_fit_times = []
svc_pred_times = []
svc_err = []
train_sizes = [2000, 5000, 10000, 20000, 50000]

gamma = 0.005
for train_size in train_sizes:
    for name, estimator in [
        (
            "EigenPro",
            EigenProClassifier(
                n_epoch=3,
                gamma=gamma,
                n_components=30,
                subsample_size=1000,
                random_state=rng,
            ),
        ),
        ("SupportVector", SVC(C=5, gamma=gamma)),
    ]:
        stime = time()
        estimator.fit(x_train[:train_size], y_train[:train_size])
        fit_t = time() - stime

        stime = time()
        y_pred_test = estimator.predict(x_test)
        pred_t = time() - stime

        err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test)
# Tests for FastKernelClassification
@pytest.mark.parametrize(
    "data, estimator",
    [
        # Test rbf kernel
        (
            gen_classification({"n_samples": 10, "hypercube": False}),
            EigenProClassifier(
                batch_size=9,
                kernel="rbf",
                gamma=0.08,
                n_epoch=100,
                random_state=1,
            ),
        ),
        # Test laplacian kernel
        (
            gen_classification({}),
            EigenProClassifier(
                kernel="laplace", n_epoch=100, gamma=0.003, random_state=1
            ),
        ),
        # Test cauchy kernel
        (
            gen_classification({}),
            EigenProClassifier(
                kernel="cauchy", n_epoch=100, gamma=0.005, random_state=1
            ),
for n_features in feature_counts:
    x, y = make_classification(
        n_samples=train_size + test_size,
        n_features=n_features,
        random_state=rng,
    )
    x_train = x[:train_size]
    y_train = y[:train_size]
    x_test = x[train_size:]
    y_test = y[train_size:]

    for name, estimator in [
        (
            "EigenPro",
            EigenProClassifier(
                n_epoch=2, gamma=gamma, n_components=400, random_state=rng
            ),
        ),
        ("SupportVector", SVC(gamma=gamma, random_state=rng)),
    ]:
        stime = time()
        estimator.fit(x_train, y_train)
        fit_t = time() - stime

        stime = time()
        y_pred_test = estimator.predict(x_test)
        pred_t = time() - stime

        err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test)

        if name == "EigenPro":
            eig_fit_times.append(fit_t)
svc_pred_times = []
svc_err = []
train_sizes = [500, 1000, 2000]
print("Train Sizes: " + str(train_sizes))
bandwidth = 5.0

# Fit models to data
for train_size in train_sizes:
    for name, estimator in [
        (
            "EigenPro",
            EigenProClassifier(
                n_epoch=2, bandwidth=bandwidth, random_state=rng
            ),
        ),
        (
            "SupportVector",
            SVC(
                C=5, gamma=1.0 / (2 * bandwidth * bandwidth),
                random_state=rng
            ),
        ),
    ]:
        stime = time()
        estimator.fit(x_train[:train_size], y_train[:train_size])
        fit_t = time() - stime

        stime = time()
        y_pred_test = estimator.predict(x_test)
x, y = make_classification(
    n_samples=train_size + test_size,
    n_features=n_features,
    random_state=rng,
)
x_train = x[:train_size]
y_train = y[:train_size]
x_test = x[train_size:]
y_test = y[train_size:]

for name, estimator in [
    (
        "EigenPro",
        EigenProClassifier(
            n_epoch=2,
            bandwidth=bandwidth,
            n_components=400,
            random_state=rng,
        ),
    ),
    (
        "SupportVector",
        SVC(gamma=1.0 / (2 * bandwidth * bandwidth), random_state=rng),
    ),
]:
    stime = time()
    estimator.fit(x_train, y_train)
    fit_t = time() - stime

    stime = time()
    y_pred_test = estimator.predict(x_test)
    pred_t = time() - stime
eig_fit_times = []
eig_pred_times = []
eig_err = []
svc_fit_times = []
svc_pred_times = []
svc_err = []
train_sizes = [500, 1000, 2000, 5000, 10000, 20000, 40000, 60000]
gamma = 0.02

# Fit models to data
for train_size in train_sizes:
    for name, estimator in [
        (
            "EigenPro",
            EigenProClassifier(n_epoch=2, gamma=gamma, random_state=rng),
        ),
        ("SupportVector", SVC(C=5, gamma=gamma, random_state=rng)),
    ]:
        stime = time()
        estimator.fit(x_train[:train_size], y_train[:train_size])
        fit_t = time() - stime

        stime = time()
        y_pred_test = estimator.predict(x_test)
        pred_t = time() - stime

        err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test)

        if name == "EigenPro":
            eig_fit_times.append(fit_t)
            eig_pred_times.append(pred_t)
    ).fit(X, y).predict(X)


# Tests for FastKernelClassification
@pytest.mark.parametrize(
    "data, estimator",
    [
        # Test rbf kernel
        (
            gen_classification({"n_samples": 10, "hypercube": False}),
            EigenProClassifier(
                batch_size=9,
                kernel="rbf",
                bandwidth=2.5,
                n_epoch=100,
                random_state=1,
            ),
        ),
        # Test laplacian kernel
        (
            gen_classification({}),
            EigenProClassifier(
                kernel="laplace", n_epoch=100, bandwidth=13, random_state=1
            ),
        ),
        # Test cauchy kernel
        (
            gen_classification({}),
            EigenProClassifier(
eig_err = []
svc_fit_times = []
svc_pred_times = []
svc_err = []
train_sizes = [2000, 5000, 10000, 20000, 50000]

bandwidth = 10.0
for train_size in train_sizes:
    for name, estimator in [
        (
            "EigenPro",
            EigenProClassifier(
                n_epoch=3,
                bandwidth=bandwidth,
                n_components=30,
                subsample_size=1000,
                random_state=rng,
            ),
        ),
        ("SupportVector", SVC(C=5, gamma=1.0 / (2 * bandwidth * bandwidth))),
    ]:
        stime = time()
        estimator.fit(x_train[:train_size], y_train[:train_size])
        fit_t = time() - stime

        stime = time()
        y_pred_test = estimator.predict(x_test)
        pred_t = time() - stime

        err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test)
def instanciate_estimators(clf_type, classifiers, clf_seed, y=None, **kw):
    score_metric, _ = get_score_metric(clf_type)
    param_grid_LGBM = {
        'learning_rate': [0.1, .05, .5],
        'num_leaves': [7, 15, 31],
    }
    param_grid_XGB = {'learning_rate': [0.1, .05, .3], 'max_depth': [3, 6, 9]}
    param_grid_MLP = {
        'learning_rate_init': [.001, .0005, .005],
        'hidden_layer_sizes': [(30,), (50,), (100,),
                               (30, 30), (50, 50), (100, 100)],
    }
    param_grid_EigenProGaussian = {'bandwidth': [1, 5, 25]}
    n_components_eigenpro = 160
    param_grid_nystroem_ridgecv = {
        'kernel_approx__n_components': [1000, 3000],
        'kernel_approx__degree': [2, 3],
    }
    if clf_type == 'binary':
        print('Fraction by class: True: %0.2f; False: %0.2f'
              % (list(y).count(True) / len(y),
                 list(y).count(False) / len(y)))
        cw = 'balanced'
        clfs = {
            'L2RegularizedLinearModel': linear_model.LogisticRegressionCV(
                class_weight=cw, max_iter=100, solver='sag', penalty='l2',
                n_jobs=1, cv=3, multi_class='multinomial'),
            'GradientBoosting': ensemble.GradientBoostingClassifier(
                n_estimators=100),
            'LGBM': GridSearchCV(
                estimator=LGBMClassifier(n_estimators=100, n_jobs=1,
                                         is_unbalance=True),
                param_grid=param_grid_LGBM, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'XGB': GridSearchCV(
                estimator=XGBClassifier(n_estimators=100, n_jobs=1),
                param_grid=param_grid_XGB, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'MLP': MLPClassifier(
                hidden_layer_sizes=(30, 30), activation='relu',
                solver='adam', alpha=0.0001, batch_size='auto',
                learning_rate='constant', learning_rate_init=0.001,
                power_t=0.5, max_iter=200, shuffle=True, random_state=None,
                tol=0.0001, verbose=False, warm_start=False, momentum=0.9,
                nesterovs_momentum=True, early_stopping=False,
                validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                epsilon=1e-08, n_iter_no_change=10),
            'MLPGridSearchCV': GridSearchCV(
                estimator=MLPClassifier(
                    hidden_layer_sizes=(30, 30), activation='relu',
                    solver='adam', alpha=0.0001, batch_size='auto',
                    learning_rate='adaptive', learning_rate_init=0.001,
                    power_t=0.5, max_iter=200, shuffle=True,
                    random_state=None, tol=0.0001, verbose=False,
                    warm_start=False, momentum=0.9,
                    nesterovs_momentum=True, early_stopping=False,
                    validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                    epsilon=1e-08, n_iter_no_change=10),
                param_grid=param_grid_MLP, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'EigenProPolynomial': EigenProClassifier(
                batch_size="auto", n_epoch=10,
                n_components=n_components_eigenpro, subsample_size="auto",
                kernel="polynomial", bandwidth=5, gamma=None, degree=2,
                coef0=1, kernel_params=None, random_state=None),
            'EigenProGaussian160': GridSearchCV(
                estimator=EigenProClassifier(
                    batch_size="auto", n_epoch=10,
                    n_components=n_components_eigenpro,
                    subsample_size="auto", kernel="gaussian", gamma=None,
                    degree=2, coef0=1, kernel_params=None,
                    random_state=None),
                param_grid=param_grid_EigenProGaussian, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'EigenProGaussian1000': GridSearchCV(
                estimator=EigenProClassifier(
                    batch_size="auto", n_epoch=10, n_components=1000,
                    subsample_size="auto", kernel="gaussian", gamma=None,
                    degree=2, coef0=1, kernel_params=None,
                    random_state=None),
                param_grid=param_grid_EigenProGaussian, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'NystroemRidgeCV': GridSearchCV(
                estimator=Pipeline([
                    ('kernel_approx', Nystroem(
                        kernel="polynomial", n_components=None,
                        random_state=clf_seed, degree=2)),
                    ('classifier', linear_model.LogisticRegressionCV(
                        class_weight=cw, max_iter=100, solver='sag',
                        penalty='l2', n_jobs=1, cv=3,
                        multi_class='multinomial')),
                ]),
                param_grid=param_grid_nystroem_ridgecv, cv=3,
                scoring=metrics.make_scorer(score_metric)),
        }
    elif clf_type == 'multiclass':
        print('fraction of the most frequent class:',
              max([list(y).count(x) for x in set(list(y))]) / len(list(y)))
        clfs = {
            'L2RegularizedLinearModel': linear_model.LogisticRegressionCV(
                penalty='l2', n_jobs=1, cv=3, multi_class='multinomial',
                solver='sag', max_iter=100),
            'GradientBoosting': ensemble.GradientBoostingClassifier(
                n_estimators=100),
            'LGBM': GridSearchCV(
                estimator=LGBMClassifier(n_estimators=100, n_jobs=1),
                param_grid=param_grid_LGBM, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'XGB': GridSearchCV(
                estimator=XGBClassifier(n_estimators=100, n_jobs=1,
                                        objective='multi:softmax',
                                        num_class=len(np.unique(y))),
                param_grid=param_grid_XGB, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'MLP': MLPClassifier(
                hidden_layer_sizes=(30, 30), activation='relu',
                solver='adam', alpha=0.0001, batch_size='auto',
                learning_rate='constant', learning_rate_init=0.001,
                power_t=0.5, max_iter=200, shuffle=True, random_state=None,
                tol=0.0001, verbose=False, warm_start=False, momentum=0.9,
                nesterovs_momentum=True, early_stopping=False,
                validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                epsilon=1e-08, n_iter_no_change=10),
            'MLPGridSearchCV': GridSearchCV(
                estimator=MLPClassifier(
                    hidden_layer_sizes=(30, 30), activation='relu',
                    solver='adam', alpha=0.0001, batch_size='auto',
                    learning_rate='adaptive', learning_rate_init=0.001,
                    power_t=0.5, max_iter=200, shuffle=True,
                    random_state=None, tol=0.0001, verbose=False,
                    warm_start=False, momentum=0.9,
                    nesterovs_momentum=True, early_stopping=False,
                    validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                    epsilon=1e-08, n_iter_no_change=10),
                param_grid=param_grid_MLP, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'EigenProPolynomial': EigenProClassifier(
                batch_size="auto", n_epoch=10,
                n_components=n_components_eigenpro, subsample_size="auto",
                kernel="polynomial", gamma=None, degree=2, coef0=1,
                kernel_params=None, random_state=None),
            'EigenProGaussian160': GridSearchCV(
                estimator=EigenProClassifier(
                    batch_size="auto", n_epoch=10,
                    n_components=n_components_eigenpro,
                    subsample_size="auto", kernel="gaussian", gamma=None,
                    degree=2, coef0=1, kernel_params=None,
                    random_state=None),
                param_grid=param_grid_EigenProGaussian, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'EigenProGaussian1000': GridSearchCV(
                estimator=EigenProClassifier(
                    batch_size="auto", n_epoch=10, n_components=1000,
                    subsample_size="auto", kernel="gaussian", gamma=None,
                    degree=2, coef0=1, kernel_params=None,
                    random_state=None),
                param_grid=param_grid_EigenProGaussian, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'NystroemRidgeCV': GridSearchCV(
                estimator=Pipeline([
                    ('kernel_approx', Nystroem(
                        kernel="polynomial", n_components=None,
                        random_state=clf_seed, degree=2)),
                    ('classifier', linear_model.LogisticRegressionCV(
                        penalty='l2', n_jobs=1, cv=3,
                        multi_class='multinomial', solver='sag',
                        max_iter=100)),
                ]),
                param_grid=param_grid_nystroem_ridgecv, cv=3,
                scoring=metrics.make_scorer(score_metric)),
        }
    elif clf_type == 'regression':
        clfs = {
            'L2RegularizedLinearModel': linear_model.RidgeCV(cv=3),
            'GradientBoosting': ensemble.GradientBoostingRegressor(
                n_estimators=100),
            'LGBM': GridSearchCV(
                estimator=LGBMRegressor(n_estimators=100, n_jobs=1),
                param_grid=param_grid_LGBM, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'XGB': GridSearchCV(
                estimator=XGBRegressor(n_estimators=100, n_jobs=1),
                param_grid=param_grid_XGB, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'MLP': MLPRegressor(
                hidden_layer_sizes=(30, 30), activation='relu',
                solver='adam', alpha=0.0001, batch_size='auto',
                learning_rate='constant', learning_rate_init=0.001,
                power_t=0.5, max_iter=200, shuffle=True, random_state=None,
                tol=0.0001, verbose=False, warm_start=False, momentum=0.9,
                nesterovs_momentum=True, early_stopping=False,
                validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                epsilon=1e-08, n_iter_no_change=10),
            'MLPGridSearchCV': GridSearchCV(
                estimator=MLPRegressor(
                    hidden_layer_sizes=(30, 30), activation='relu',
                    solver='adam', alpha=0.0001, batch_size='auto',
                    learning_rate='adaptive', learning_rate_init=0.001,
                    power_t=0.5, max_iter=200, shuffle=True,
                    random_state=None, tol=0.0001, verbose=False,
                    warm_start=False, momentum=0.9,
                    nesterovs_momentum=True, early_stopping=False,
                    validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                    epsilon=1e-08, n_iter_no_change=10),
                param_grid=param_grid_MLP, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'EigenProPolynomial': EigenProRegressor(
                batch_size="auto", n_epoch=10,
                n_components=n_components_eigenpro, subsample_size="auto",
                kernel="polynomial", bandwidth=5, gamma=None, degree=2,
                coef0=1, kernel_params=None, random_state=None),
            'EigenProGaussian160': GridSearchCV(
                estimator=EigenProRegressor(
                    batch_size="auto", n_epoch=10,
                    n_components=n_components_eigenpro,
                    subsample_size="auto", kernel="gaussian", gamma=None,
                    degree=2, coef0=1, kernel_params=None,
                    random_state=None),
                param_grid=param_grid_EigenProGaussian, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'EigenProGaussian1000': GridSearchCV(
                estimator=EigenProRegressor(
                    batch_size="auto", n_epoch=10, n_components=1000,
                    subsample_size="auto", kernel="gaussian", gamma=None,
                    degree=2, coef0=1, kernel_params=None,
                    random_state=None),
                param_grid=param_grid_EigenProGaussian, cv=3,
                scoring=metrics.make_scorer(score_metric)),
            'NystroemRidgeCV': GridSearchCV(
                estimator=Pipeline([
                    ('kernel_approx', Nystroem(
                        kernel="polynomial", n_components=None,
                        random_state=clf_seed, degree=2)),
                    ('classifier', linear_model.RidgeCV(cv=3)),
                ]),
                param_grid=param_grid_nystroem_ridgecv, cv=3,
                scoring=metrics.make_scorer(score_metric)),
        }
    else:
        raise ValueError("{} not recognized".format(clf_type))
    clfs = [clfs[clf] for clf in classifiers]
    for clf in clfs:
        try:
            if 'random_state' in clf.estimator.get_params():
                clf.estimator.set_params(random_state=clf_seed)
        except AttributeError:
            if 'random_state' in clf.get_params():
                clf.set_params(random_state=clf_seed)
    return clfs
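# Illustrative usage sketch (assumed call site, not part of the original
# module): build a seeded subset of the binary estimators configured above.
# `y` is expected to hold the boolean training labels used for the class
# balance printout inside instanciate_estimators; the classifier names must
# match keys defined there.
def example_instanciate_binary(y, clf_seed=42):
    return instanciate_estimators(
        clf_type='binary',
        classifiers=['L2RegularizedLinearModel', 'GradientBoosting',
                     'EigenProGaussian160'],
        clf_seed=clf_seed,
        y=y,
    )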