def make_sparse_data(use_feature_hashing=False):
    """
    Function to create sparse data with two features always zero
    in the training set and a different one always zero in the test set.
    """
    # Create training data
    X, y = make_classification(n_samples=500, n_features=3,
                               n_informative=3, n_redundant=0,
                               n_classes=2, random_state=1234567890)

    # we need features to be non-negative since we will be
    # using naive bayes later
    X = np.abs(X)

    # make sure that none of the features are zero
    X[np.where(X == 0)] += 1

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 501)]

    # create a list of dictionaries as the features,
    # with f1 and f5 always 0
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        row = [0] + row.tolist() + [0]
        features.append(dict(zip(feature_names, row)))

    # use a FeatureHasher if we are asked to do feature hashing
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    train_fs = FeatureSet('train_sparse', ids, features=features,
                          labels=y, vectorizer=vectorizer)

    # now create the test set with f4 always 0 but nothing else
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=2, random_state=1234567890)
    X = np.abs(X)
    X[np.where(X == 0)] += 1
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 101)]

    # create a list of dictionaries as the features,
    # with f4 always 0
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        row = row.tolist()
        row = row[:3] + [0] + row[3:]
        features.append(dict(zip(feature_names, row)))

    test_fs = FeatureSet('test_sparse', ids, features=features,
                         labels=y, vectorizer=vectorizer)

    return train_fs, test_fs
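# A minimal usage sketch (not part of the original source): it assumes the
# make_sparse_data helper above is importable together with numpy,
# sklearn.feature_extraction.FeatureHasher, and skll.data.FeatureSet.
train_fs, test_fs = make_sparse_data()
assert len(train_fs.ids) == 500   # f1 and f5 are always zero here
assert len(test_fs.ids) == 100    # f4 is always zero here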
def test_select_kbest_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the k best heuristic
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='k_best',
                                   param=5).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert_true(sparse.issparse(X_r2inv))
    support_mask = safe_mask(X_r2inv, support)
    assert_equal(X_r2inv.shape, X.shape)
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert_equal(X_r2inv.getnnz(), X_r.getnnz())
def test_linearsvc_parameters():
    """
    Test possible parameter combinations in LinearSVC
    """
    # Generate list of possible parameter combinations
    losses = ['hinge', 'squared_hinge', 'logistic_regression', 'foo']
    penalties, duals = ['l1', 'l2', 'bar'], [True, False]

    X, y = make_classification(n_samples=5, n_features=5)

    for loss, penalty, dual in itertools.product(losses, penalties, duals):
        clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual)
        if ((loss, penalty) == ('hinge', 'l1') or
                (loss, penalty, dual) == ('hinge', 'l2', False) or
                (penalty, dual) == ('l1', True) or
                loss == 'foo' or penalty == 'bar'):
            assert_raises_regexp(ValueError,
                                 "Unsupported set of arguments.*penalty='%s.*"
                                 "loss='%s.*dual=%s" % (penalty, loss, dual),
                                 clf.fit, X, y)
        else:
            clf.fit(X, y)

    # Incorrect loss value - test if explicit error message is raised
    assert_raises_regexp(ValueError, ".*loss='l3' is not supported.*",
                         svm.LinearSVC(loss="l3").fit, X, y)
def test_grid_search_precomputed_kernel_error_kernel_function():
    """Test that grid search returns an error when using a kernel_function"""
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    kernel_function = lambda x1, x2: np.dot(x1, x2.T)
    clf = SVC(kernel=kernel_function)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    assert_raises(ValueError, cv.fit, X_, y_)
def test_f_classif_multi_class():
    """
    Test whether the F test yields meaningful results
    on a simple simulated classification problem
    """
    X, Y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    F, pv = f_classif(X, Y)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.0e-5).all()
def test_mismatch_labels_features():
    """
    Test to catch a mismatch between the shape of the labels vector
    and the feature matrix.
    """
    # get 100 instances with 4 features but ignore the labels we
    # get from here
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=3, random_state=1234567890)

    # double-stack y to ensure we don't match the number of feature rows
    y2 = np.hstack([y, y])

    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # get 100 ids
    ids = ['EXAMPLE_{}'.format(i) for i in range(100)]

    # This should raise a ValueError
    FeatureSet('test', ids, features=features, labels=y2)
def make_scaling_data(use_feature_hashing=False):
    X, y = make_classification(n_samples=1000, n_classes=2,
                               n_features=5, n_informative=5,
                               n_redundant=0, random_state=1234567890)

    # we want to arbitrarily scale the various features to test the scaling
    scalers = np.array([1, 10, 100, 1000, 10000])
    X = X * scalers

    # since we want to use SKLL's FeatureSet class, we need to
    # create a list of IDs
    ids = ['EXAMPLE_{}'.format(n) for n in range(1, 1001)]

    # create a list of dictionaries as the features
    feature_names = ['f{}'.format(n) for n in range(1, 6)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # split everything into training and testing portions
    train_features, test_features = features[:800], features[800:]
    train_y, test_y = y[:800], y[800:]
    train_ids, test_ids = ids[:800], ids[800:]

    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    train_fs = FeatureSet('train_scaling', train_ids,
                          features=train_features, labels=train_y,
                          vectorizer=vectorizer)
    test_fs = FeatureSet('test_scaling', test_ids,
                         features=test_features, labels=test_y,
                         vectorizer=vectorizer)

    return (train_fs, test_fs)
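# A minimal usage sketch (not part of the original source), assuming the
# make_scaling_data helper above and its SKLL/sklearn dependencies are in scope:
train_fs, test_fs = make_scaling_data()
assert len(train_fs.ids) == 800   # 80/20 split of the 1000 generated examples
assert len(test_fs.ids) == 200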
def test_deprecated_score_func():
    # test that old deprecated way of passing a score / loss function is still
    # supported
    X, y = make_classification(n_samples=200, n_features=100, random_state=0)
    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X[:180], y[:180])
    y_pred = cv.predict(X[180:])
    C = cv.best_estimator_.C

    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    with warnings.catch_warnings(record=True):
        # catch deprecation warning
        cv.fit(X[:180], y[:180])
    y_pred_func = cv.predict(X[180:])
    C_func = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred_func)
    assert_equal(C, C_func)

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, loss_func=f1_loss)
    with warnings.catch_warnings(record=True):
        # catch deprecation warning
        cv.fit(X[:180], y[:180])
    y_pred_loss = cv.predict(X[180:])
    C_loss = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred_loss)
    assert_equal(C, C_loss)
def test_select_heuristics_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the fdr, fwe and fpr heuristics
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = GenericUnivariateSelect(f_classif, mode=mode,
                                       param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_almost_equal(support, gtruth)
def test_select_kbest_all():
    # Test whether k="all" correctly returns all features.
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)
    univariate_filter = SelectKBest(f_classif, k="all")
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_array_equal(X, X_r)
def test_linearsvc_parameters():
    """
    Test possible parameter combinations in LinearSVC
    """
    # Generate list of possible parameter combinations
    losses = ['hinge', 'squared_hinge', 'logistic_regression', 'foo']
    penalties, duals = ['l1', 'l2', 'bar'], [True, False]

    X, y = make_classification(n_samples=5, n_features=5)

    for loss, penalty, dual in itertools.product(losses, penalties, duals):
        clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual)
        if ((loss, penalty) == ('hinge', 'l1') or
                (loss, penalty, dual) == ('hinge', 'l2', False) or
                (penalty, dual) == ('l1', True) or
                loss == 'foo' or penalty == 'bar'):
            assert_raises_regexp(ValueError,
                                 "Unsupported set of arguments.*penalty='%s.*"
                                 "loss='%s.*dual=%s" % (penalty, loss, dual),
                                 clf.fit, X, y)
        else:
            clf.fit(X, y)

    # Incorrect loss value - test if explicit error message is raised
    assert_raises_regexp(ValueError, ".*loss='L3' is not supported.*",
                         svm.LinearSVC(loss="L3").fit, X, y)
def test_mutual_info_classif():
    X, y = make_classification(n_samples=100, n_features=5,
                               n_informative=1, n_redundant=1,
                               n_repeated=0, n_classes=2,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_classif, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_classif, mode="k_best",
                                   param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_classif, percentile=40)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_classif, mode="percentile",
                                   param=40).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
def test_f_classif():
    # Test whether the F test yields meaningful results
    # on a simple simulated classification problem
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    F, pv = f_classif(X, y)
    F_sparse, pv_sparse = f_classif(sparse.csr_matrix(X), y)
    assert_true((F > 0).all())
    assert_true((pv > 0).all())
    assert_true((pv < 1).all())
    assert_true((pv[:5] < 0.05).all())
    assert_true((pv[5:] > 1.0e-4).all())
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)
def test_grid_search_precomputed_kernel():
    """Test that grid search works when the input features are given in the
    form of a precomputed kernel matrix
    """
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    # compute the training kernel matrix corresponding to the linear kernel
    K_train = np.dot(X_[:180], X_[:180].T)
    y_train = y_[:180]

    clf = SVC(kernel='precomputed')
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(K_train, y_train)

    assert_true(cv.best_score_ >= 0)

    # compute the test kernel matrix
    K_test = np.dot(X_[180:], X_[:180].T)
    y_test = y_[180:]

    y_pred = cv.predict(K_test)

    assert_true(np.mean(y_pred == y_test) >= 0)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
def test_select_fwe_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fwe heuristic
    """
    X, Y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="fwe",
                                   param=0.01).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert np.sum(np.abs(support - gtruth)) < 2
def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
    # Smoke test the score
    # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]),
    #                            cv.score(X_[:180], y[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = Scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C
    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Helper function for the two unit tests for FeatureSet.from_data_frame().
    Since labels are optional, run two tests, one with, one without.
    """
    # First, set up the test data.
    # get 100 instances with 4 features each
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=3, random_state=1234567890)

    # Not using 0 - 100 here because that would be pandas' default index
    # names anyway. So let's make sure pandas is using the ids we supply.
    ids = list(range(100, 200))

    featureset_name = 'test'

    # if use_feature_hasher, run these tests with a vectorizer
    feature_bins = 4
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hasher else None)

    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # Now, create a FeatureSet object.
    if with_labels:
        expected = FeatureSet(featureset_name, ids, features=features,
                              labels=y, vectorizer=vectorizer)
    else:
        expected = FeatureSet(featureset_name, ids, features=features,
                              vectorizer=vectorizer)

    # Also create a DataFrame and then create a FeatureSet from it.
    df = pd.DataFrame(features, index=ids)
    if with_labels:
        df['y'] = y
        current = FeatureSet.from_data_frame(df, featureset_name,
                                             labels_column='y',
                                             vectorizer=vectorizer)
    else:
        current = FeatureSet.from_data_frame(df, featureset_name,
                                             vectorizer=vectorizer)

    return (expected, current)
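# A hypothetical pair of tests built on the helper above (not from the
# original source); it assumes SKLL's FeatureSet supports value equality
# via __eq__, so the two construction paths can be compared directly.
def test_featureset_creation_from_dataframe_with_labels():
    expected, current = featureset_creation_from_dataframe_helper(True, False)
    assert expected == current

def test_featureset_creation_from_dataframe_without_labels():
    expected, current = featureset_creation_from_dataframe_helper(False, False)
    assert expected == current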
def test_randomized_search():
    # very basic smoke test
    X, y = make_classification(n_samples=200, n_features=100, random_state=0)

    params = dict(C=distributions.expon())
    search = RandomizedSearchCV(LinearSVC(), param_distributions=params)
    search.fit(X, y)
    assert_equal(len(search.cv_scores_), 10)
def test_crammer_singer_binary():
    """Test Crammer-Singer formulation in the binary case"""
    X, y = make_classification(n_classes=2, random_state=0)
    acc = svm.LinearSVC(random_state=0).fit(X, y).score(X, y)
    acc2 = svm.LinearSVC(multi_class="crammer_singer",
                         random_state=0).fit(X, y).score(X, y)
    assert_almost_equal(acc, 0.66)
    assert_almost_equal(acc2, 0.68)
def test_grid_search_error():
    """Test that grid search will capture errors on data with different
    length"""
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    assert_raises(ValueError, cv.fit, X_[:180], y_)
def generate_random_dataset(data_type, amount=500, features=10):
    # map each supported data type to a generated (X, y) pair; the original
    # ignored `amount` and always returned None, which defeated the purpose
    _valid_datatype = {
        'Normal Distribution Classified': make_classification(
            n_samples=amount, n_features=features)
    }
    # return the requested dataset, or None for an unknown type
    return _valid_datatype.get(data_type)
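# Hypothetical usage of the fixed helper above (not in the original source):
dataset = generate_random_dataset('Normal Distribution Classified',
                                  amount=200, features=5)
X, y = dataset           # make_classification returns a (features, labels) pair
print(X.shape, y.shape)  # (200, 5) (200,)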
def test_on_synthetic_data():
    X, _ = make_classification(n_samples=10000, n_features=2,
                               n_redundant=0, weights=[0.99, 0.01])
    for i in [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]:
        labels, cluster_novelty = novelty_detector(X[:i], X[i:i + 1000])
def loadData():
    # generating data
    X, y = samples_generator.make_classification(n_features=20,
                                                 n_informative=3,
                                                 n_redundant=0,
                                                 n_classes=4,
                                                 n_clusters_per_class=2)
    return X, y
def test_gradient_boosting_estimator_with_binomial_deviance_loss():
    np.random.seed(0)
    X, y = make_classification(n_classes=2)
    loss_function = BinomialDeviance(2)
    model = Booster(Earth(max_degree=2, use_fast=True, max_terms=10),
                    loss_function)
    model.fit(X, y)
    assert_greater(np.sum(model.predict(X) == y) / float(y.shape[0]), .90)
    assert_true(np.all(0 <= model.predict_proba(X)))
    assert_true(np.all(1 >= model.predict_proba(X)))
def load_blob(classes=3, features=10, samples=10, random_state=0,
              noise_dims=0, noise_scale=1.0, noise_pattern=None, clusters=1):
    """
    Creates a Gaussian blob classification dataset with the given parameters.

    Parameters
    ----------
    classes : number of classes
    features : number of columns
    samples : number of rows
    random_state : random seed
    noise_dims : number of redundant noise dimensions to add
    noise_scale : scale of the added noise dimensions
    noise_pattern : pattern of the added noise dimensions
    clusters : number of blobs for each class

    Returns
    -------
    dict of noise-added, standardized/normalized train and test samples
    and labels
    """
    # samples = int(samples / .7)  # 30% for test
    print("values:", features, classes, clusters)
    xs, ys = make_classification(n_samples=samples, n_features=features,
                                 n_informative=features, n_redundant=0,
                                 n_repeated=0, n_classes=classes,
                                 n_clusters_per_class=clusters)
    ys = np.float32(ys)
    if noise_dims > 0:
        # print("here", noise_dims)
        xs = add_noisy_dims(xs, noise_dims, noise_scale, noise_pattern)

    X_train, X_test, y_train, y_test = train_test_split(
        xs, ys.squeeze(), test_size=0.3, random_state=random_state)

    # now normalize all
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return dict(X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
                num_examples_train=X_train.shape[0],
                num_examples_test=X_test.shape[0])
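# A minimal usage sketch (not part of the original source), assuming the
# load_blob helper above and its sklearn dependencies are importable:
blob = load_blob(classes=3, features=10, samples=100)
print(blob['num_examples_train'], blob['num_examples_test'])  # 70 30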
def test_crammer_singer_binary():
    """Test Crammer-Singer formulation in the binary case"""
    X, y = make_classification(n_classes=2, random_state=0)

    for fit_intercept in (True, False):
        acc = svm.LinearSVC(fit_intercept=fit_intercept,
                            multi_class="crammer_singer",
                            random_state=0).fit(X, y).score(X, y)
        assert_greater(acc, 0.9)
def genRandomData(numVariables, numMuestras):
    """
    Generate a random classification dataset with numMuestras samples
    and numVariables features.
    """
    from sklearn.datasets import samples_generator
    X, y = samples_generator.make_classification(n_samples=numMuestras,
                                                 n_features=numVariables,
                                                 n_informative=2,
                                                 n_redundant=0,
                                                 random_state=77)
    return X, y
def test_sklearn2code_export():
    np.random.seed(0)
    X, y = make_classification(n_classes=2)
    X = DataFrame(X, columns=['x%d' % i for i in range(X.shape[1])])
    loss_function = BinomialDeviance(2)
    model = Booster(Earth(max_degree=2, use_fast=True, max_terms=10),
                    loss_function)
    model.fit(X, y)
    code = sklearn2code(model, ['predict', 'predict_proba', 'transform'],
                        numpy_flat)
    module = exec_module('test_module', code)
    assert_correct_exported_module(model, module,
                                   ['predict', 'predict_proba', 'transform'],
                                   dict(X=X), X)
def classification(self):
    from sklearn.datasets.samples_generator import make_classification
    # X1 holds the sample features and Y1 the class labels: 400 samples
    # with 2 features each, 3 output classes, no redundant features,
    # and one cluster per class
    X1, Y1 = make_classification(n_samples=400, n_features=2, n_redundant=0,
                                 n_clusters_per_class=1, n_classes=3)
    plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
    plt.show()
def make_cv_folds_data(num_examples_per_fold=100, num_folds=3,
                       use_feature_hashing=False):
    """
    Create data for pre-specified CV folds tests
    with or without feature hashing
    """
    num_total_examples = num_examples_per_fold * num_folds

    # create the numeric features and the binary labels
    X, _ = make_classification(n_samples=num_total_examples,
                               n_features=3, n_informative=3,
                               n_redundant=0, n_classes=2,
                               random_state=1234567890)
    y = np.array([0, 1] * int(num_total_examples / 2))

    # the folds mapping: the first num_examples_per_fold examples
    # are in fold '0', the next num_examples_per_fold are in
    # fold '1', and so on
    foldgen = ([str(i)] * num_examples_per_fold for i in range(num_folds))
    folds = list(itertools.chain(*foldgen))

    # now create the list of feature dictionaries
    # and add the binary features that depend on
    # the class and fold number
    feature_names = ['f{}'.format(i) for i in range(1, 4)]
    features = []
    for row, classid, foldnum in zip(X, y, folds):
        string_feature_name = 'is_{}_{}'.format(classid, foldnum)
        string_feature_value = 1
        feat_dict = dict(zip(feature_names, row))
        feat_dict.update({string_feature_name: string_feature_value})
        features.append(feat_dict)

    # create the example IDs
    ids = ['EXAMPLE_{}'.format(num_examples_per_fold * k + i)
           for k in range(num_folds)
           for i in range(num_examples_per_fold)]

    # create the cross-validation feature set with or without feature hashing
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    cv_fs = FeatureSet('cv_folds', ids, features=features, labels=y,
                       vectorizer=vectorizer)

    # make the custom cv folds dictionary
    custom_cv_folds = dict(zip(ids, folds))

    return (cv_fs, custom_cv_folds)
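# A minimal usage sketch (not part of the original source): the helper above
# returns a FeatureSet plus a dict mapping each example ID to the fold label
# ('0', '1', '2') it was assigned to.
cv_fs, custom_cv_folds = make_cv_folds_data()
assert len(custom_cv_folds) == 300                   # 3 folds x 100 examples
assert set(custom_cv_folds.values()) == {'0', '1', '2'}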
def prepare_data(self):
    data, target = make_classification(n_samples=200, n_features=2,
                                       n_redundant=0, n_informative=2,
                                       n_classes=2)
    X, y = shuffle(data, target, random_state=42)
    X = X.astype(np.float32)
    y = y.reshape(-1, 1)
    data = np.concatenate((X, y), axis=1)
    return data
def toydata2(rng):
    from sklearn.datasets import samples_generator
    n_samples = 1000
    n_features = 2
    n_classes = 2
    n_informative = 2
    n_clusters_per_class = int((2 ** n_informative) // n_classes)
    hypercube = False
    samplekw = dict(
        flip_y=0.00,
        class_sep=1.0,
        shift=[-10, 10],
        scale=1.0,
        n_redundant=0,
        n_repeated=0,
        hypercube=hypercube,
        n_samples=n_samples,
        n_informative=n_informative,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters_per_class,
        weights=None,
        shuffle=True,
        n_features=n_features,
        random_state=rng,
    )
    X_true, y = samples_generator.make_classification(**samplekw)
    with_extra = ut.get_argflag('--extra')

    # make very informative nan dimension
    if with_extra:
        n_informative_nan = 100
        # extra_x = (rng.randn(n_informative_nan, 2) / 2 + [[12, -8]])
        extra_x = rng.randn(n_informative_nan, 2) / 2 + [[10, -12]]
        X_true = np.vstack((X_true, extra_x))
        y = np.append(y, [0] * n_informative_nan)

    # Randomly drop datapoints
    X = X_true.copy()
    nanrate = ut.get_argval('--nanrate', default=0.01)
    if nanrate:
        # TODO:
        # * informative nan
        # * random nan
        # * random nan + informative nan
        X.ravel()[rng.rand(X.size) < nanrate] = np.nan

    if with_extra:
        if True:
            X.T[1][-n_informative_nan:] = np.nan
        else:
            X.T[0][-n_informative_nan:-n_informative_nan // 2] = np.nan
            X.T[1][-n_informative_nan // 2:] = np.nan
    return X_true, X, y
def test_weight():
    """
    Test class weights
    """
    X_, y_ = make_classification(n_samples=200, n_features=100,
                                 weights=[0.833, 0.167], random_state=0)

    X_ = sparse.csr_matrix(X_)
    for clf in (linear_model.LogisticRegression(C=180),
                svm.LinearSVC(C=180),
                svm.SVC(C=180)):
        clf.fit(X_[:180], y_[:180], class_weight={0: 5})
        y_pred = clf.predict(X_[180:])
        assert_true(np.sum(y_pred == y_[180:]) >= 11)
def test_select_kbest_zero():
    # Test whether k=0 correctly returns no features.
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)
    univariate_filter = SelectKBest(f_classif, k=0)
    univariate_filter.fit(X, y)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10, dtype=bool)
    assert_array_equal(support, gtruth)
    X_selected = assert_warns_message(UserWarning, "No features were selected",
                                      univariate_filter.transform, X)
    assert_equal(X_selected.shape, (20, 0))
def create_data(self):
    X, labels = make_classification(n_samples=100, n_features=2,
                                    n_redundant=0, n_informative=2,
                                    random_state=1, n_clusters_per_class=2)
    labels = labels.reshape((-1, 1))
    offset = int(X.shape[0] * 0.9)
    X_train, y_train = X[:offset], labels[:offset]
    X_test, y_test = X[offset:], labels[offset:]
    return X_train, y_train, X_test, y_test
def test_pipeline_estimator(self):
    self.X, self.y = samples_generator.make_classification(
        n_informative=5, n_redundant=0, random_state=42)
    anova_filter = SelectKBest(f_regression, k=5)
    self.mdl = Pipeline([('anova', anova_filter),
                         ('svc', SVC(kernel='linear'))])
    self.mdl.set_params(anova__k=10, svc__C=.1)
    try:
        self._port_model()
    except Exception as e:
        # str(e) replaces the Python 2-only e.message attribute
        self.fail('Unexpected exception raised: {}'.format(e))
    finally:
        self._clear_model()
def test_select_kbest_zero():
    """
    Test whether k=0 correctly returns no features.
    """
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)
    univariate_filter = SelectKBest(f_classif, k=0)
    univariate_filter.fit(X, y).transform(X)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10, dtype=bool)
    assert_array_equal(support, gtruth)
def make_skewed_data(n_samples=5000, n_features=20, n_classes=2):
    from sklearn.datasets.samples_generator import (make_classification,
                                                    make_regression)
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_classes=n_classes,
                               n_clusters_per_class=2,
                               n_informative=8,
                               n_redundant=2,
                               random_state=1)
    # create unbalanced classes by keeping only 1 in 25 positive examples;
    # tolist() keeps random.sample happy on newer Pythons, and np.sort must
    # be assigned back since it is not in-place
    plus = np.where(y > 0)[0]
    minus = np.where(y <= 0)[0]
    plus_sel = random.sample(plus.tolist(), int(len(plus) / 25))
    sel = np.sort(np.r_[minus, plus_sel])
    return X[sel, :], y[sel]
def test_grid_search_one_grid_point():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
    param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]}

    clf = SVC()
    cv = GridSearchCV(clf, param_dict)
    cv.fit(X_, y_)

    clf = SVC(C=1.0, kernel="rbf", gamma=0.1)
    clf.fit(X_, y_)

    assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_)
def make_skewed_data(random_state=None):
    X, y = make_classification(n_samples=5000,
                               n_features=20,
                               n_classes=2,
                               n_clusters_per_class=2,
                               n_informative=8,
                               n_redundant=2,
                               random_state=random_state)
    # create unbalanced classes by keeping only 1 in 25 positive examples;
    # tolist() keeps random.sample happy on newer Pythons, and np.sort must
    # be assigned back since it is not in-place
    plus = np.where(y > 0)[0]
    minus = np.where(y <= 0)[0]
    plus_sel = random.sample(plus.tolist(), int(len(plus) / 25))
    sel = np.sort(np.r_[minus, plus_sel])
    return X[sel, :], y[sel]
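# A quick check of the skew (not in the original source): with roughly one
# positive kept for every 25, class 0 should dominate the returned labels.
X_skew, y_skew = make_skewed_data(random_state=0)
print(np.bincount(y_skew))   # e.g. a large count for class 0, a small one for 1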
def make_dataset(dataset, n_rows, n_cols, n_classes=2):
    np.random.seed(137)
    if dataset == 'classification1':
        X, y = make_classification(n_rows, n_cols, n_informative=2,
                                   n_redundant=0, n_classes=n_classes,
                                   n_clusters_per_class=1)
    elif dataset == 'classification2':
        X, y = make_classification(n_rows, n_cols, n_informative=2,
                                   n_redundant=0, n_classes=n_classes,
                                   n_clusters_per_class=2)
    elif dataset == 'gaussian':
        X, y = make_gaussian_quantiles(n_samples=n_rows, n_features=n_cols,
                                       n_classes=n_classes)
    elif dataset == 'blobs':
        X, y = make_blobs(n_samples=n_rows, n_features=n_cols,
                          centers=n_classes)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    # correct case when not all classes made it into the training set
    if np.unique(y_train).size < n_classes:
        for i in range(n_classes):
            y_train[i] = i
    return X_train, X_test, y_train, y_test
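# Example call (not part of the original source): build a small two-blob
# dataset and confirm the default 75/25 split shapes line up.
X_train, X_test, y_train, y_test = make_dataset('blobs', 200, 5)
print(X_train.shape, X_test.shape)   # (150, 5) (50, 5)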
def main():
    from sklearn import svm
    from sklearn.datasets import samples_generator
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    from sklearn.preprocessing import MinMaxScaler

    X, y = samples_generator.make_classification(n_samples=1000,
                                                 n_informative=5,
                                                 n_redundant=4,
                                                 random_state=_random_state)
    anova_filter = SelectKBest(f_regression, k=5)
    scaler = MinMaxScaler()
    clf = svm.SVC(kernel='linear')
    steps = [scaler, anova_filter, clf]
    cached_run(steps, X, y)
def test_f_classif_multi_class():
    # Test whether the F test yields meaningful results
    # on a simple simulated classification problem
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    F, pv = f_classif(X, y)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.e-4).all()
def prepare_data(self):
    # prepare the data: generate classification samples, where n_samples is
    # the number of samples, n_features the number of features per sample,
    # and n_classes the number of classes; note the constraints
    # n_features = n_redundant + n_informative + n_repeated and
    # n_clusters_per_class * n_classes <= 2 ** n_informative
    data, target = make_classification(n_samples=200, n_features=2,
                                       n_redundant=0, n_informative=2,
                                       n_classes=2)
    X, y = shuffle(data, target, random_state=42)
    X = X.astype(np.float32)
    y = y.reshape(-1, 1)
    data = np.concatenate((X, y), axis=1)
    return data
def test_classification_2classes_big():
    X, y = make_classification(n_samples=200000, n_features=20, n_classes=2,
                               n_informative=3, weights=[0.7, 0.3],
                               random_state=0)
    X = pd.DataFrame(X)
    y = pd.Series(y)

    cls = MALSS(X, y, 'classification', n_jobs=3)
    cls.execute()
    # cls.make_report('test_classification_2classes_big')

    assert len(cls.algorithms) == 1
    assert cls.algorithms[0].best_score is not None
def test_weight():
    """
    Test class weights
    """
    X_, y_ = make_classification(n_samples=200, n_features=100,
                                 weights=[0.833, 0.167], random_state=0)

    X_ = sparse.csr_matrix(X_)
    for clf in (linear_model.LogisticRegression(),
                svm.LinearSVC(),
                svm.SVC()):
        clf.set_params(class_weight={0: 5})
        clf.fit(X_[:180], y_[:180])
        y_pred = clf.predict(X_[180:])
        assert_true(np.sum(y_pred == y_[180:]) >= 11)
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Helper function for the two unit tests for FeatureSet.from_data_frame().
    Since labels are optional, run two tests, one with, one without.
    """
    import pandas

    # First, set up the test data.
    # get 100 instances with 4 features each
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=3, random_state=1234567890)

    # Not using 0 - 100 here because that would be pandas' default index
    # names anyway. So let's make sure pandas is using the ids we supply.
    ids = list(range(100, 200))

    featureset_name = 'test'

    # if use_feature_hasher, run these tests with a vectorizer
    feature_bins = 4
    vectorizer = (FeatureHasher(n_features=feature_bins)
                  if use_feature_hasher else None)

    # convert the features into a list of dictionaries
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = []
    for row in X:
        features.append(dict(zip(feature_names, row)))

    # Now, create a FeatureSet object.
    if with_labels:
        expected = FeatureSet(featureset_name, ids, features=features,
                              labels=y, vectorizer=vectorizer)
    else:
        expected = FeatureSet(featureset_name, ids, features=features,
                              vectorizer=vectorizer)

    # Also create a DataFrame and then create a FeatureSet from it.
    df = pandas.DataFrame(features, index=ids)
    if with_labels:
        df['y'] = y
        current = FeatureSet.from_data_frame(df, featureset_name,
                                             labels_column='y',
                                             vectorizer=vectorizer)
    else:
        current = FeatureSet.from_data_frame(df, featureset_name,
                                             vectorizer=vectorizer)

    return (expected, current)