def test_sample_weight_invariance(n_samples=50):
    random_state = check_random_state(0)

    # binary
    y_true = random_state.randint(0, 2, size=(n_samples, ))
    y_pred = random_state.randint(0, 2, size=(n_samples, ))
    y_score = random_state.random_sample(size=(n_samples,))
    for name in ALL_METRICS:
        if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                name in METRIC_UNDEFINED_BINARY):
            continue
        metric = ALL_METRICS[name]
        if name in THRESHOLDED_METRICS:
            yield _named_check(check_sample_weight_invariance, name), name,\
                metric, y_true, y_score
        else:
            yield _named_check(check_sample_weight_invariance, name), name,\
                metric, y_true, y_pred

    # multiclass
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 5, size=(n_samples, ))
    y_pred = random_state.randint(0, 5, size=(n_samples, ))
    y_score = random_state.random_sample(size=(n_samples, 5))
    for name in ALL_METRICS:
        if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                name in METRIC_UNDEFINED_BINARY_MULTICLASS):
            continue
        metric = ALL_METRICS[name]
        if name in THRESHOLDED_METRICS:
            yield _named_check(check_sample_weight_invariance, name), name,\
                metric, y_true, y_score
        else:
            yield _named_check(check_sample_weight_invariance, name), name,\
                metric, y_true, y_pred

    # multilabel indicator
    _, ya = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=0, n_samples=100,
                                           allow_unlabeled=False)
    _, yb = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=1, n_samples=100,
                                           allow_unlabeled=False)
    y_true = np.vstack([ya, yb])
    y_pred = np.vstack([ya, ya])
    y_score = random_state.randint(1, 4, size=y_true.shape)

    for name in (MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS +
                 MULTIOUTPUT_METRICS):
        if name in METRICS_WITHOUT_SAMPLE_WEIGHT:
            continue
        metric = ALL_METRICS[name]
        if name in THRESHOLDED_METRICS:
            yield (_named_check(check_sample_weight_invariance, name), name,
                   metric, y_true, y_score)
        else:
            yield (_named_check(check_sample_weight_invariance, name), name,
                   metric, y_true, y_pred)
def test_normalize_option_multilabel_classification():
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100

    # for both random_state 0 and 1, y_true and y_pred have at least one
    # unlabelled entry
    _, y_true = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=0,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)
    _, y_pred = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=1,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)

    # To make sure at least one empty label is present
    y_true += [0]*n_classes
    y_pred += [0]*n_classes

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]
        measure = metrics(y_true, y_pred, normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        assert_almost_equal(metrics(y_true, y_pred, normalize=False)
                            / n_samples, measure,
                            err_msg="Failed with %s" % name)
def test_multilabel_classification_report():
    n_classes = 4
    n_samples = 50

    _, y_true = make_multilabel_classification(n_features=1,
                                               n_samples=n_samples,
                                               n_classes=n_classes,
                                               random_state=0)

    _, y_pred = make_multilabel_classification(n_features=1,
                                               n_samples=n_samples,
                                               n_classes=n_classes,
                                               random_state=1)

    expected_report = """\
             precision    recall  f1-score   support

          0       0.50      0.67      0.57        24
          1       0.51      0.74      0.61        27
          2       0.29      0.08      0.12        26
          3       0.52      0.56      0.54        27

avg / total       0.45      0.51      0.46       104
"""

    report = classification_report(y_true, y_pred)
    assert_equal(report, expected_report)
def test_normalize_option_multilabel_classification():
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100
    _, y_true = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=0,
                                               n_samples=n_samples)
    _, y_pred = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=1,
                                               n_samples=n_samples)

    # Be sure to have at least one empty label
    y_true += ([],)
    y_pred += ([],)
    n_samples += 1

    lb = LabelBinarizer().fit([range(n_classes)])
    y_true_binary_indicator = lb.transform(y_true)
    y_pred_binary_indicator = lb.transform(y_pred)

    for name, metrics in METRICS_WITH_NORMALIZE_OPTION.items():
        # List of list of labels
        measure = metrics(y_true, y_pred, normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        assert_almost_equal(metrics(y_true, y_pred, normalize=False)
                            / n_samples, measure,
                            err_msg="Failed with %s" % name)

        # Indicator matrix format
        measure = metrics(y_true_binary_indicator,
                          y_pred_binary_indicator, normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        assert_almost_equal(metrics(y_true_binary_indicator,
                                    y_pred_binary_indicator, normalize=False)
                            / n_samples, measure,
                            err_msg="Failed with %s" % name)
def test_multilabel_representation_invariance():
    # Generate some data
    n_classes = 4
    n_samples = 50

    _, y1 = make_multilabel_classification(n_features=1, n_classes=n_classes,
                                           random_state=0,
                                           n_samples=n_samples,
                                           allow_unlabeled=True)
    _, y2 = make_multilabel_classification(n_features=1, n_classes=n_classes,
                                           random_state=1,
                                           n_samples=n_samples,
                                           allow_unlabeled=True)

    # To make sure at least one empty label is present
    y1 += [0] * n_classes
    y2 += [0] * n_classes

    y1_sparse_indicator = sp.coo_matrix(y1)
    y2_sparse_indicator = sp.coo_matrix(y2)

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]

        # XXX cruel hack to work with partial functions
        if isinstance(metric, partial):
            metric.__module__ = "tmp"
            metric.__name__ = name

        measure = metric(y1, y2)

        # Check representation invariance
        assert_almost_equal(metric(y1_sparse_indicator, y2_sparse_indicator),
                            measure,
                            err_msg="%s failed representation invariance "
                                    "between dense and sparse indicator "
                                    "formats." % name)
def benchmark(metrics=tuple(v for k, v in sorted(METRICS.items())),
              formats=tuple(v for k, v in sorted(FORMATS.items())),
              samples=1000, classes=4, density=.2,
              n_times=5):
    """Times metric calculations for a number of inputs

    Parameters
    ----------
    metrics : array-like of callables (1d or 0d)
        The metric functions to time.

    formats : array-like of callables (1d or 0d)
        These may transform a dense indicator matrix into multilabel
        representation.

    samples : array-like of ints (1d or 0d)
        The number of samples to generate as input.

    classes : array-like of ints (1d or 0d)
        The number of classes in the input.

    density : array-like of floats (1d or 0d)
        The density of positive labels in the input.

    n_times : int
        Time calling the metric n_times times.

    Returns
    -------
    array of floats shaped like (metrics, formats, samples, classes, density)
        Time in seconds.
    """
    metrics = np.atleast_1d(metrics)
    samples = np.atleast_1d(samples)
    classes = np.atleast_1d(classes)
    density = np.atleast_1d(density)
    formats = np.atleast_1d(formats)
    out = np.zeros((len(metrics), len(formats), len(samples), len(classes),
                    len(density)), dtype=float)
    it = itertools.product(samples, classes, density)
    for i, (s, c, d) in enumerate(it):
        _, y_true = make_multilabel_classification(n_samples=s, n_features=1,
                                                   n_classes=c,
                                                   n_labels=d * c,
                                                   return_indicator=True,
                                                   random_state=42)
        _, y_pred = make_multilabel_classification(n_samples=s, n_features=1,
                                                   n_classes=c,
                                                   n_labels=d * c,
                                                   return_indicator=True,
                                                   random_state=84)
        for j, f in enumerate(formats):
            f_true = f(y_true)
            f_pred = f(y_pred)
            for k, metric in enumerate(metrics):
                t = timeit(partial(metric, f_true, f_pred), number=n_times)
                out[k, j].flat[i] = t
    return out
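# Usage sketch for the benchmark helper above. The METRICS and FORMATS
# dictionaries here are illustrative assumptions, not part of the original
# snippet; the helper only requires them to exist (as name-to-callable
# mappings) before its definition is evaluated.
from functools import partial

import scipy.sparse as sp
from sklearn.metrics import f1_score, hamming_loss

METRICS = {'hamming': hamming_loss,
           'f1-micro': partial(f1_score, average='micro')}
FORMATS = {'dense': lambda y: y, 'csr': sp.csr_matrix}

# times.shape == (n_metrics, n_formats, n_samples, n_classes, n_density)
times = benchmark(samples=(100, 1000), classes=(4, 16))
print(times.shape)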
def test_ovr_fit_predict_sparse():
    for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix,
                   sp.lil_matrix]:
        base_clf = MultinomialNB(alpha=1)

        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=3,
                                                       length=50,
                                                       allow_unlabeled=True,
                                                       random_state=0)

        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]

        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train))
        Y_pred_sprs = clf_sprs.predict(X_test)

        assert_true(clf.multilabel_)
        assert_true(sp.issparse(Y_pred_sprs))
        assert_array_equal(Y_pred_sprs.toarray(), Y_pred)

        # Test predict_proba
        Y_proba = clf_sprs.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > 0.5
        assert_array_equal(pred, Y_pred_sprs.toarray())

        # Test decision_function
        clf_sprs = OneVsRestClassifier(svm.SVC()).fit(X_train, sparse(Y_train))
        dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int)
        assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
def test_ovr_multilabel_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    for au in (False, True):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=3,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       return_indicator=True,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

        # decision function only estimator. Fails in current implementation.
        decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert_raises(AttributeError, decision_only.predict_proba, X_test)

        # Estimator with predict_proba disabled, depending on parameters.
        decision_only = OneVsRestClassifier(svm.SVC(probability=False))
        decision_only.fit(X_train, Y_train)
        assert_raises(AttributeError, decision_only.predict_proba, X_test)

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > .5
        assert_array_equal(pred, Y_pred)
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
                                          n_samples=20, random_state=0):
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    # Score with ties
    y_score = sparse_random_matrix(n_components=y_true.shape[0],
                                   n_features=y_true.shape[1],
                                   random_state=random_state)

    if hasattr(y_score, "toarray"):
        y_score = y_score.toarray()
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)

    # Uniform score
    random_state = check_random_state(random_state)
    y_score = random_state.uniform(size=(n_samples, n_classes))
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)
def get_multilabel(self):
    return make_multilabel_classification(n_samples=100, n_features=10,
                                          n_classes=5, n_labels=5,
                                          return_indicator=True,
                                          random_state=1)
def testMultiClassification(self):
    """TODO(ilblackdragon): Implement multi-output classification.
    """
    random.seed(42)
    n_classes = 5
    X, y = datasets.make_multilabel_classification(n_classes=n_classes,
                                                   random_state=42)
def get_codes(self):
    X, Y = make_multilabel_classification(n_samples=15, n_labels=8,
                                          n_classes=8, random_state=0)
    self.classifier_labels = Y
    self.classifier_error_codes = LabelBinarizer().fit_transform(Y)
    print(self.classifier_labels)
    print(self.classifier_error_codes)

    # Write the label rows, then the error-code rows, to a text file.
    with open('ecoc_classifiers', 'w') as f:
        for row in self.classifier_labels:
            str_op = '['
            for label in row:
                str_op += str(label) + ','
            str_op += ']'
            f.write(str_op)
        f.write('\n')
        for row in self.classifier_error_codes:
            str_op = '['
            for label in row:
                str_op += str(label) + ','
            str_op += ']'
            f.write(str_op)
        f.flush()
    return
def test_sparse_input():
    X, y = datasets.make_multilabel_classification(random_state=0,
                                                   n_samples=50)

    for name, sparse_matrix in product(FOREST_ESTIMATORS,
                                       (csr_matrix, csc_matrix, coo_matrix)):
        yield check_sparse_input, name, X, sparse_matrix(X), y
def test_random_hasher_sparse_data():
    X, y = datasets.make_multilabel_classification(return_indicator=True,
                                                   random_state=0)
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X_transformed = hasher.fit_transform(X)
    X_transformed_sparse = hasher.fit_transform(csc_matrix(X))
    assert_array_equal(X_transformed_sparse.toarray(),
                       X_transformed.toarray())
def test_output_transformer():
    X, y = datasets.make_multilabel_classification(return_indicator=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # Check that the random_states are different
    transformer = GaussianRandomProjection(n_components=5, random_state=None)
    for name, ForestEstimator in FOREST_ESTIMATORS.items():
        est = ForestEstimator(random_state=5, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)

        random_state = [sub.output_transformer_.random_state
                        for sub in est.estimators_]
        assert_equal(len(set(random_state)), est.n_estimators)

    # Check that the random_states are equal
    transformer = FixedStateTransformer(GaussianRandomProjection(
        n_components=5), random_seed=0)
    for name, ForestEstimator in FOREST_ESTIMATORS.items():
        est = ForestEstimator(random_state=5, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)

        random_state = [sub.output_transformer_.random_state
                        for sub in est.estimators_]
        assert_equal(len(set(random_state)), 1)
        assert_equal(random_state[0], 0)
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator
    X, y = make_multilabel_classification(random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(y.shape[0], random_state=0)

    estimators = [DecisionTreeRegressor(random_state=0),
                  DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        for parameters, _, cv_validation_scores in grid_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])

    # Test with a randomized search
    for est in estimators:
        random_search = RandomizedSearchCV(est, est_parameters,
                                           cv=cv, n_iter=3)
        random_search.fit(X, y)
        for parameters, _, cv_validation_scores in random_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])
def test_multilabel_classification():
    """Test that multi-label classification works as expected."""
    # test fit method
    X, y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)
    elm = ELMClassifier(weight_scale=100)
    elm.fit(X, y)
    assert_greater(elm.score(X, y), 0.95)
def assertClassifierWorksWithSparsity(self, classifier,
                                      sparsity_indicator='sparse'):
    feed_sparse = sparsity_indicator == 'sparse'
    X, y = make_multilabel_classification(sparse=feed_sparse,
                                          return_indicator=sparsity_indicator)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42)

    classifier.fit(X_train, y_train)
    result = classifier.predict(X_test)

    self.assertEqual(result.shape, y_test.shape)
def test_ovr_multilabel_decision_function():
    X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                   n_features=20,
                                                   n_classes=5,
                                                   n_labels=3,
                                                   length=50,
                                                   allow_unlabeled=True,
                                                   random_state=0)
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
    assert_array_equal((clf.decision_function(X_test) > 0).astype(int),
                       clf.predict(X_test))
def test_sparse_input(EstimatorClass, sparse_matrix):
    # Note the deliberate swap: the 20-column label indicator matrix is used
    # as X (a binary matrix well suited to sparse representations), while the
    # single generated feature becomes the target.
    y, X = datasets.make_multilabel_classification(random_state=0,
                                                   n_samples=50,
                                                   n_features=1,
                                                   n_classes=20)
    y = y[:, 0]

    check_sparse_input(EstimatorClass, X, sparse_matrix(X), y)
def test_make_multilabel_classification_return_indicator():
    for allow_unlabeled, min_length in zip((True, False), (0, 1)):
        X, Y = make_multilabel_classification(n_samples=25, n_features=20,
                                              n_classes=3, random_state=0,
                                              return_indicator=True,
                                              allow_unlabeled=allow_unlabeled)
        assert_equal(X.shape, (25, 20), "X shape mismatch")
        assert_equal(Y.shape, (25, 3), "Y shape mismatch")
        assert_true(np.all(np.sum(Y, axis=0) > min_length))
def test_multilabel_sample_weight_invariance(name):
    # multilabel indicator
    random_state = check_random_state(0)
    _, ya = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=0, n_samples=100,
                                           allow_unlabeled=False)
    _, yb = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=1, n_samples=100,
                                           allow_unlabeled=False)
    y_true = np.vstack([ya, yb])
    y_pred = np.vstack([ya, ya])
    y_score = random_state.randint(1, 4, size=y_true.shape)

    metric = ALL_METRICS[name]
    if name in THRESHOLDED_METRICS:
        check_sample_weight_invariance(name, metric, y_true, y_score)
    else:
        check_sample_weight_invariance(name, metric, y_true, y_pred)
def test_make_multilabel_classification_return_indicator_sparse():
    for allow_unlabeled, min_length in zip((True, False), (0, 1)):
        X, Y = make_multilabel_classification(n_samples=25, n_features=20,
                                              n_classes=3, random_state=0,
                                              return_indicator='sparse',
                                              allow_unlabeled=allow_unlabeled)
        assert_equal(X.shape, (25, 20), "X shape mismatch")
        assert_equal(Y.shape, (25, 3), "Y shape mismatch")
        assert_true(sp.issparse(Y))
def test_scorer_sample_weight():
    # Test that scorers support sample_weight or raise sensible errors

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    split = train_test_split(X, y, y_ml, random_state=0)
    X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split

    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy="median")
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    estimator = dict([(name, sensible_regr)
                      for name in REGRESSION_SCORERS] +
                     [(name, sensible_clf)
                      for name in CLF_SCORERS] +
                     [(name, sensible_ml_clf)
                      for name in MULTILABEL_ONLY_SCORERS])

    for name, scorer in SCORERS.items():
        if name in MULTILABEL_ONLY_SCORERS:
            target = y_ml_test
        else:
            target = y_test
        try:
            weighted = scorer(estimator[name], X_test, target,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)
            assert_not_equal(weighted, unweighted,
                             msg="scorer {0} behaves identically when "
                                 "called with sample weights: {1} vs "
                                 "{2}".format(name, weighted, unweighted))
            assert_almost_equal(weighted, ignored,
                                err_msg="scorer {0} behaves differently when "
                                        "ignoring samples and setting "
                                        "sample_weight to 0: {1} vs {2}"
                                        .format(name, weighted, ignored))

        except TypeError as e:
            assert_true("sample_weight" in str(e),
                        "scorer {0} raises unhelpful exception when called "
                        "with sample weights: {1}".format(name, str(e)))
def test_class_type(self):
    """
    Test that the target type must be either binary or multiclass
    """
    X, y = make_multilabel_classification()
    model = RandomForestClassifier()
    model.fit(X, y)

    with self.assertRaises(YellowbrickValueError):
        visualizer = ClassPredictionError(model)
        visualizer.score(X, y)
def test_output_transformer():
    X, y = datasets.make_multilabel_classification(return_indicator=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    transformer = GaussianRandomProjection(n_components=10)
    for name, TreeEstimator in ALL_TREES.items():
        est = TreeEstimator(random_state=0, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)
def test():
    X, Y_list = make_multilabel_classification()
    Y = LabelBinarizer().fit_transform(Y_list)
    Y[Y == 0] = -1

    clf = OneVsRestClassifier(LinearSVC())
    # clf = MultilabelLR(L0=1, λ1=0.1, λ2=0.1, γ=0.1, μ=0.1)
    clf.fit(X, Y)
    Y_hat = clf.predict(X)
    print(roc_auc_score(Y.flat, Y_hat.flat))
def test_make_multilabel_classification():
    for allow_unlabeled, min_length in zip((True, False), (0, 1)):
        X, Y = make_multilabel_classification(n_samples=100, n_features=20,
                                              n_classes=3, random_state=0,
                                              allow_unlabeled=allow_unlabeled)
        assert_equal(X.shape, (100, 20), "X shape mismatch")
        if not allow_unlabeled:
            assert_equal(max([max(y) for y in Y]), 2)
        assert_equal(min([len(y) for y in Y]), min_length)
        assert_true(max([len(y) for y in Y]) <= 3)
def setup_module():
    # Create some memory mapped data
    global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
    TEMP_FOLDER = tempfile.mkdtemp(prefix='sklearn_test_score_objects_')
    X, y = make_classification(n_samples=30, n_features=5, random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    filename = os.path.join(TEMP_FOLDER, 'test_data.pkl')
    joblib.dump((X, y, y_ml), filename)
    X_mm, y_mm, y_ml_mm = joblib.load(filename, mmap_mode='r')
    ESTIMATORS = _make_estimators(X_mm, y_mm, y_ml_mm)
def test_make_multilabel_classification_return_indicator():
    for allow_unlabeled, min_length in zip((True, False), (0, 1)):
        X, Y = make_multilabel_classification(n_samples=25, n_features=20,
                                              n_classes=3, random_state=0,
                                              allow_unlabeled=allow_unlabeled)
        assert_equal(X.shape, (25, 20), "X shape mismatch")
        assert_equal(Y.shape, (25, 3), "Y shape mismatch")
        assert_true(np.all(np.sum(Y, axis=0) > min_length))

    # Also test return_distributions and return_indicator with True
    X2, Y2, p_c, p_w_c = make_multilabel_classification(
        n_samples=25, n_features=20, n_classes=3, random_state=0,
        allow_unlabeled=allow_unlabeled, return_distributions=True)

    assert_array_equal(X, X2)
    assert_array_equal(Y, Y2)
    assert_equal(p_c.shape, (3,))
    assert_almost_equal(p_c.sum(), 1)
    assert_equal(p_w_c.shape, (20, 3))
    assert_almost_equal(p_w_c.sum(axis=0), [1] * 3)
def test_ovr_fit_predict_sparse():
    for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix,
                   sp.lil_matrix]:
        base_clf = MultinomialNB(alpha=1)

        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=3,
                                                       length=50,
                                                       allow_unlabeled=True,
                                                       random_state=0)

        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]

        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train))
        Y_pred_sprs = clf_sprs.predict(X_test)

        assert_true(clf.multilabel_)
        assert_true(sp.issparse(Y_pred_sprs))
        assert_array_equal(Y_pred_sprs.toarray(), Y_pred)

        # Test predict_proba
        Y_proba = clf_sprs.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > .5
        assert_array_equal(pred, Y_pred_sprs.toarray())

        # Test decision_function
        clf_sprs = OneVsRestClassifier(svm.SVC()).fit(X_train, sparse(Y_train))
        dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int)
        assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
def test_ovr_multilabel_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    for au in (False, True):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=3,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

        # Decision function only estimator.
        decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert not hasattr(decision_only, 'predict_proba')

        # Estimator with predict_proba disabled, depending on parameters.
        decision_only = OneVsRestClassifier(svm.SVC(probability=False))
        assert not hasattr(decision_only, 'predict_proba')
        decision_only.fit(X_train, Y_train)
        assert not hasattr(decision_only, 'predict_proba')
        assert hasattr(decision_only, 'decision_function')

        # Estimator which can get predict_proba enabled after fitting
        gs = GridSearchCV(svm.SVC(probability=False),
                          param_grid={'probability': [True]})
        proba_after_fit = OneVsRestClassifier(gs)
        assert not hasattr(proba_after_fit, 'predict_proba')
        proba_after_fit.fit(X_train, Y_train)
        assert hasattr(proba_after_fit, 'predict_proba')

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > .5
        assert_array_equal(pred, Y_pred)
def test_grid_search_with_multioutput_data():
    """Test search with multi-output estimator"""
    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(y.shape[0], random_state=0)

    estimators = [DecisionTreeRegressor(random_state=0),
                  DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        for parameters, _, cv_validation_scores in grid_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])

    # Test with a randomized search
    for est in estimators:
        random_search = RandomizedSearchCV(est, est_parameters,
                                           cv=cv, n_iter=3)
        random_search.fit(X, y)
        for parameters, _, cv_validation_scores in random_search.grid_scores_:
            est.set_params(**parameters)
            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])
def test_multioutput():
    X, y = make_multilabel_classification(n_samples=100, n_labels=1,
                                          n_classes=5, random_state=0,
                                          return_indicator=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    est = LazyBaggingClassifier(random_state=0, n_estimators=10,
                                bootstrap=False)
    est.fit(X_train, y_train)
    assert_almost_equal(est.score(X_train, y_train), 1.)

    y_proba = est.predict_proba(X_test)
    y_log_proba = est.predict_log_proba(X_test)
    for p, log_p in zip(y_proba, y_log_proba):
        assert_array_almost_equal(p, np.exp(log_p))

    est = LazyBaggingRegressor(random_state=0, n_estimators=10,
                               bootstrap=False)
    est.fit(X_train, y_train)
    assert_almost_equal(est.score(X_train, y_train), 1.)
def multilLabel():
    # Raw multilabel targets: each sample carries a set of class labels
    y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]
    # Preprocess the labels: y_mb becomes an N x K matrix
    # (N: number of samples, K: number of classes)
    mb = MultiLabelBinarizer()
    y_mb = mb.fit_transform(y)

    # Multiclass learning: targets look like [0, 0, 1, 1, 2, 2]
    data = load_iris()
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=0)
    binary_model = SVC(kernel="linear", random_state=1)
    # One-vs-rest: handles both multiclass and multilabel problems;
    # fit(X, y) accepts y.shape == [n_samples] or [n_samples, n_classes]
    multi_model = OneVsRestClassifier(binary_model).fit(X_train, y_train)
    # One-vs-one: multiclass problems only; fit(X, y) requires
    # y.shape == [n_samples]
    # multi_model = OneVsOneClassifier(binary_model).fit(X_train, y_train)
    y_pred = multi_model.predict(X_test)
    print("True Labels:   ", y_test)
    print("Predict Labels:", y_pred)
    print("Accuracy:      ", accuracy_score(y_test, y_pred))

    # Multilabel classification
    ml_X, ml_y = make_multilabel_classification()
    print("Multilabel training targets:\n", ml_y[:5])
    ml_X_train, ml_X_test, ml_y_train, ml_y_test = train_test_split(
        ml_X, ml_y, test_size=0.1)
    # one-vs-rest
    clf = OneVsRestClassifier(SVC(kernel="linear"))
    clf.fit(ml_X_train, ml_y_train)
    pred_y = clf.predict(ml_X_test)
    print("True Labels:   \n", ml_y_test)
    print("Predict Labels:\n", pred_y)
    print("Hamming_loss:  ", hamming_loss(ml_y_test, pred_y))
    print("Accuracy:      ", accuracy_score(ml_y_test, pred_y))
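# For reference, the MultiLabelBinarizer step above turns the ragged label
# lists into a dense N x K indicator matrix. A minimal standalone sketch of
# what fit_transform produces for the sample y:
from sklearn.preprocessing import MultiLabelBinarizer

y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]
print(MultiLabelBinarizer().fit_transform(y))
# [[0 0 1 1 1]
#  [0 0 1 0 0]
#  [1 1 0 1 0]
#  [1 1 1 1 1]
#  [1 1 1 0 0]]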
def test_evenly_distributes_unlabelled(self):
    cv = IterativeStratifiedKFold(n_splits=5, shuffle=False, random_state=0)
    X, y = make_multilabel_classification(100, 20, n_labels=5,
                                          random_state=0,
                                          allow_unlabeled=False)
    y[[0, 1, 2, 3, 4], :] = 0  # Make the first five samples unlabelled.
    folds = list(cv.split(X, y))

    for train_idx, valid_idx in folds:
        unlabelled_in_train = np.where(
            np.sum(y[train_idx, :], axis=1) == 0)[0].shape[0]
        unlabelled_in_valid = np.where(
            np.sum(y[valid_idx, :], axis=1) == 0)[0].shape[0]
        self.assertEqual(unlabelled_in_train, 4)
        self.assertEqual(unlabelled_in_valid, 1)
def generate_classification(self, num_classes, num_features, num_samples,
                            test_split=0.1, seed=0):
    # X, Y = make_classification(n_samples=800, n_features=num_feats,
    #                            n_classes=num_classes, n_informative=4)
    X, y = make_multilabel_classification(n_samples=num_samples,
                                          n_features=num_features,
                                          n_classes=num_classes,
                                          n_labels=0.01,
                                          length=50,
                                          allow_unlabeled=False,
                                          sparse=False,
                                          return_indicator='dense',
                                          return_distributions=False,
                                          random_state=seed)
    Y = np.argmax(y, axis=1)
    self.categorical_features = [False] * num_features
    self.problem_type = ProblemType.FeatureClassification
    self.X, self.Y = X, Y
    self._split_data(test_split, seed)
def dataset(request):
    X, y = make_multilabel_classification(
        n_samples=int(request.param['n_samples'] * 1.2),
        n_features=request.param['n_features'],
        n_classes=request.param['n_classes'],
        n_labels=request.param['n_classes'],
        length=request.param['n_targets'])

    # Keep only samples with at least n_targets positive labels, trimming
    # each kept label set down to exactly n_targets labels.
    new_x = []
    new_y = []
    for i in range(y.shape[0]):
        a = np.argwhere(y[i] == 1)[:, 0]
        if len(a) >= request.param['n_targets']:
            new_x.append(i)
            np.random.shuffle(a)
            a = a[:request.param['n_targets']]
            new_y.append(a)
        if len(new_x) >= request.param['n_samples']:
            break
    X = X[new_x]
    y = np.array(new_y)

    return train_test_split(X, y, test_size=0.33)
def test_check_classifiers_multilabel_output_format_predict():
    n_samples, test_size, n_outputs = 100, 25, 5
    _, y = make_multilabel_classification(
        n_samples=n_samples,
        n_features=2,
        n_classes=n_outputs,
        n_labels=3,
        length=50,
        allow_unlabeled=True,
        random_state=0,
    )
    y_test = y[-test_size:]

    class MultiLabelClassifierPredict(_BaseMultiLabelClassifierMock):
        def predict(self, X):
            return self.response_output

    # 1. inconsistent array type
    clf = MultiLabelClassifierPredict(response_output=y_test.tolist())
    err_msg = (r"MultiLabelClassifierPredict.predict is expected to output a "
               r"NumPy array. Got <class 'list'> instead.")
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict(
            clf.__class__.__name__, clf)

    # 2. inconsistent shape
    clf = MultiLabelClassifierPredict(response_output=y_test[:, :-1])
    err_msg = (r"MultiLabelClassifierPredict.predict outputs a NumPy array "
               r"of shape \(25, 4\) instead of \(25, 5\).")
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict(
            clf.__class__.__name__, clf)

    # 3. inconsistent dtype
    clf = MultiLabelClassifierPredict(
        response_output=y_test.astype(np.float64))
    err_msg = (r"MultiLabelClassifierPredict.predict does not output the "
               r"same dtype than the targets.")
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict(
            clf.__class__.__name__, clf)
def test_predict_proba_multilabel():
    # Test that predict_proba works as expected for multilabel.
    # Multilabel should not use softmax which makes probabilities sum to 1
    X, Y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)
    n_samples, n_classes = Y.shape

    clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=30,
                        random_state=0)
    clf.fit(X, Y)
    y_proba = clf.predict_proba(X)

    assert_equal(y_proba.shape, (n_samples, n_classes))
    assert_array_equal(y_proba > 0.5, Y)

    y_log_proba = clf.predict_log_proba(X)
    proba_max = y_proba.argmax(axis=1)
    proba_log_max = y_log_proba.argmax(axis=1)

    assert_greater((y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1), 1e-10)
    assert_array_equal(proba_max, proba_log_max)
    assert_array_equal(y_log_proba, np.log(y_proba))
def test_ovr_multilabel_dataset():
    base_clf = MultinomialNB(alpha=1)
    for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=2,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        assert_true(clf.multilabel_)
        assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
                            prec, decimal=2)
        assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
                            recall, decimal=2)
def test_sparse_input(EstimatorClass, sparse_matrix):
    # Note the deliberate swap: the 20-column label indicator matrix is used
    # as X (a binary matrix well suited to sparse representations), while the
    # single generated feature becomes the target.
    y, X = datasets.make_multilabel_classification(
        random_state=0, n_samples=50, n_features=1, n_classes=20
    )
    y = y[:, 0]
    X_sparse = sparse_matrix(X)

    dense = EstimatorClass(
        n_estimators=10, random_state=0, max_depth=2,
        min_impurity_decrease=1e-7
    ).fit(X, y)
    sparse = EstimatorClass(
        n_estimators=10, random_state=0, max_depth=2,
        min_impurity_decrease=1e-7
    ).fit(X_sparse, y)

    assert_array_almost_equal(sparse.apply(X), dense.apply(X))
    assert_array_almost_equal(sparse.predict(X), dense.predict(X))
    assert_array_almost_equal(sparse.feature_importances_,
                              dense.feature_importances_)

    assert_array_almost_equal(sparse.predict(X_sparse), dense.predict(X))
    assert_array_almost_equal(dense.predict(X_sparse), sparse.predict(X))

    if issubclass(EstimatorClass, GradientBoostingClassifier):
        assert_array_almost_equal(sparse.predict_proba(X),
                                  dense.predict_proba(X))
        assert_array_almost_equal(sparse.predict_log_proba(X),
                                  dense.predict_log_proba(X))
        assert_array_almost_equal(sparse.decision_function(X_sparse),
                                  sparse.decision_function(X))
        assert_array_almost_equal(dense.decision_function(X_sparse),
                                  sparse.decision_function(X))
        for res_sparse, res in zip(sparse.staged_decision_function(X_sparse),
                                   sparse.staged_decision_function(X)):
            assert_array_almost_equal(res_sparse, res)
def test_actually_works_on_proper_params(self):
    X, y = make_multilabel_classification(sparse=True,
                                          return_indicator='sparse')
    assert sp.issparse(y)

    # Flatten the original six nested loops into one product over the grid.
    param_grid = itertools.product(
        [True, False],            # allow_overlap
        [True, False],            # weighted
        [True, False],            # include_self_edges
        [True, False, None],      # use_degree_corr
        ['mean_field', 'bethe'],  # model_selection_criterium
        [True, False],            # verbose
    )
    for (allow_overlap, weighted, include_self_edges, use_degree_corr,
         model_selection_criterium, verbose) in param_grid:
        clusterer = GraphToolCooccurenceClusterer(
            weighted=weighted,
            allow_overlap=allow_overlap,
            include_self_edges=include_self_edges,
            n_iters=2,
            n_init_iters=2,
            use_degree_corr=use_degree_corr,
            model_selection_criterium=model_selection_criterium,
            verbose=verbose)

        self.assertEqual(clusterer.allow_overlap, allow_overlap)
        self.assertEqual(clusterer.is_weighted, weighted)
        self.assertEqual(clusterer.include_self_edges, include_self_edges)
        self.assertEqual(clusterer.n_iters, 2)
        self.assertEqual(clusterer.n_init_iters, 2)
        self.assertEqual(clusterer.model_selection_criterium,
                         model_selection_criterium)
        self.assertEqual(clusterer.verbose, verbose)

        partition = clusterer.fit_predict(X, y)
        self.assertIsInstance(partition, np.ndarray)
def test_evenly_distributes_label_with_multilabel(self):
    X, y = make_multilabel_classification(100, 20, n_labels=18,
                                          random_state=0,
                                          allow_unlabeled=False,
                                          n_classes=18)
    d_idx = 0
    c_idx = 1
    y[:, d_idx] = 0
    y[:, c_idx] = 0
    y[[0, 10, 20, 12, 4], d_idx] = 1
    y[[1, 11, 21, 12, 4], c_idx] = 1

    # With shuffle
    iskf = IterativeStratifiedKFold(n_splits=3, shuffle=True,
                                    random_state=42)
    cv = list(iskf.split(X, y))
    for train, valid in cv:
        self.assertIn(y[train].sum(axis=0)[d_idx], (4, 3))
        self.assertIn(y[valid].sum(axis=0)[d_idx], (1, 2))
    for train, valid in cv:
        self.assertIn(y[train].sum(axis=0)[c_idx], (4, 3))
        self.assertIn(y[valid].sum(axis=0)[c_idx], (1, 2))

    # Without shuffle
    iskf = IterativeStratifiedKFold(n_splits=3, shuffle=False,
                                    random_state=42)
    cv = list(iskf.split(X, y))
    for train, valid in cv:
        self.assertIn(y[train].sum(axis=0)[d_idx], (4, 3))
        self.assertIn(y[valid].sum(axis=0)[d_idx], (1, 2))
    for train, valid in cv:
        self.assertIn(y[train].sum(axis=0)[c_idx], (4, 3))
        self.assertIn(y[valid].sum(axis=0)[c_idx], (1, 2))
def __configure(self):
    """ __configure

    Uses the make_multilabel_classification function from scikit-learn
    to generate a multilabel classification problem. This problem will
    be kept in memory and provided as demanded.

    """
    self.X, self.y = make_multilabel_classification(
        n_samples=self.n_samples, n_features=self.n_features,
        n_classes=self.n_targets, n_labels=self.n_labels,
        random_state=self.random_state)
    self.target_names = ["target_" + str(i) for i in range(self.n_targets)]
    self.feature_names = ["att_num_" + str(i)
                          for i in range(self.n_num_features)]
    self.target_values = np.unique(self.y).tolist() \
        if self.n_targets == 1 \
        else [np.unique(self.y[:, i]).tolist()
              for i in range(self.n_targets)]
def setUp(self):
    self.binary_problem_instance = make_classification(
        n_classes=2, n_samples=100, n_features=10, n_informative=8,
        n_redundant=0, random_state=0, shuffle=False)
    self.Xb = self.binary_problem_instance[0]
    self.yb = self.binary_problem_instance[1]
    self.Xb, self.yb = helpers.binary_to_regression(self.Xb, self.yb)

    self.multiClass_problem_instance = make_classification(
        n_classes=4, n_samples=100, n_features=10, n_informative=8,
        n_redundant=0, random_state=0, shuffle=False)
    self.Xmc = self.multiClass_problem_instance[0]
    self.ymc = self.multiClass_problem_instance[1]
    self.Xmc, self.ymc = helpers.multiClass_to_regression(
        self.Xmc, self.ymc, 4)

    self.multiLabel_problem_instance = make_multilabel_classification(
        n_classes=5, n_labels=2, n_samples=100, n_features=10)
    self.Xml = self.multiLabel_problem_instance[0]
    self.yml = self.multiLabel_problem_instance[1]
    self.Xml, self.yml = helpers.multiLabel_to_regression(
        self.Xml, self.yml, 5)
def test_sparse():
    """ Validate running LinearExplainer on scipy sparse data """
    import sklearn.linear_model
    from sklearn.datasets import make_multilabel_classification
    from scipy.special import expit

    np.random.seed(0)
    n_features = 20
    X, y = make_multilabel_classification(n_samples=100,
                                          sparse=True,
                                          n_features=n_features,
                                          n_classes=1,
                                          n_labels=2)

    # train a linear model
    model = sklearn.linear_model.LogisticRegression()
    model.fit(X, y)

    # explain the model's predictions using SHAP values
    explainer = shap.LinearExplainer(model, X)
    shap_values = explainer.shap_values(X)
    assert np.max(np.abs(
        expit(explainer.expected_value + shap_values.sum(1)) -
        model.predict_proba(X)[:, 1])) < 1e-6
def get_data():
    # x: features, y: class labels
    x, y = make_multilabel_classification(n_samples=20, n_features=2,
                                          n_labels=1, n_classes=1,
                                          random_state=2)
    # # Create a spreadsheet
    # wb = workbook.Workbook()
    # # Active sheet handle
    # wa = wb.active
    # for i in range(len(x)):
    #     # print(list(x[i]) + list(y[i]))
    #     wa.append(list(x[i]) + list(y[i]))
    # wb.save('data.xlsx')
    # read_excel_xlsx('data.xlsx')

    # Split the samples by class
    # Indices of class 1
    index1 = np.array([index for (index, value) in enumerate(y)
                       if value == 0])
    # print(index1)
    # Indices of class 2
    index2 = np.array([index for (index, value) in enumerate(y)
                       if value == 1])
    c1 = x[index1]
    c2 = x[index2]
    return x, np.array([c1, c2])
def test_multilabel_classification():
    # Test that multi-label classification works as expected.
    # test fit method
    X, y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)
    mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50, alpha=1e-5,
                        max_iter=150, random_state=0, activation='logistic',
                        learning_rate_init=0.2)
    mlp.fit(X, y)
    assert mlp.score(X, y) > 0.97

    # test partial fit method
    mlp = MLPClassifier(solver='sgd', hidden_layer_sizes=50, max_iter=150,
                        random_state=0, activation='logistic', alpha=1e-5,
                        learning_rate_init=0.2)
    for i in range(100):
        mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4])
    assert mlp.score(X, y) > 0.9

    # Make sure early stopping still works now that splitting is stratified
    # by default (it is disabled for multilabel classification)
    mlp = MLPClassifier(early_stopping=True)
    mlp.fit(X, y).predict(X)
def generate_classification(self, num_classes, num_features, num_samples,
                            test_split=0.1, seed=0):
    """Generate a classification task

    Arguments:
        num_classes {int} -- Number of classes
        num_features {int} -- Number of features
        num_samples {int} -- Number of samples

    Keyword Arguments:
        test_split {float} -- Size of test split (default: {0.1})
        seed {int} -- A random seed (default: {0})
    """
    # X, Y = make_classification(n_samples=800, n_features=num_feats,
    #                            n_classes=num_classes, n_informative=4)
    X, y = make_multilabel_classification(
        n_samples=num_samples, n_features=num_features,
        n_classes=num_classes, n_labels=0.01, length=50,
        allow_unlabeled=False, sparse=False, return_indicator='dense',
        return_distributions=False, random_state=seed)
    Y = np.argmax(y, axis=1)
    self.categorical_features = [False] * num_features
    self.problem_type = ProblemType.FeatureClassification
    self.X, self.Y = X, Y
    self._split_data(test_split, seed)
def test_shuffle_shuffles_splits(self):
    X, y = make_multilabel_classification(100, 20, n_labels=5,
                                          random_state=0,
                                          allow_unlabeled=False)

    # With shuffle
    iskf = IterativeStratifiedKFold(n_splits=3, shuffle=True,
                                    random_state=42)
    cv1 = list(iskf.split(X, y))

    # Without shuffle
    iskf = IterativeStratifiedKFold(n_splits=3, shuffle=False,
                                    random_state=42)
    cv2 = list(iskf.split(X, y))

    for train_shuff, valid_shuff in cv1:
        for train_no_shuff, valid_no_shuff in cv2:
            self.assertNotEqual(list(train_shuff), list(train_no_shuff))
            self.assertNotEqual(list(valid_shuff), list(valid_no_shuff))
def test_pca_fit(datatype, input_type, name, use_handle):
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)
    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    cupca.fit(X)
    cupca.handle.sync()

    for attr in ['singular_values_', 'components_', 'explained_variance_',
                 'explained_variance_ratio_']:
        with_sign = False if attr in ['components_'] else True
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = (getattr(cupca, attr))
        if type(cuml_res) == np.ndarray:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
def __configure(self, n_samples, n_features, n_targets, n_labels):
    """ __configure

    Uses the make_multilabel_classification function from scikit-learn
    to generate a multilabel classification problem. This problem will
    be kept in memory and provided as demanded.

    Parameters
    ----------
    n_samples: int
        Total amount of samples to generate.

    n_features: int
        Number of features to generate.

    n_targets: int
        Number of targeting tasks to generate.

    n_labels: int
        Number of labels to generate.

    """
    self.X, self.y = make_multilabel_classification(
        n_samples=n_samples, n_features=n_features, n_classes=n_targets,
        n_labels=n_labels, random_state=self.random_state)
    self.num_samples = n_samples
    self.num_features = n_features
    self.num_target_tasks = n_targets
    self.num_labels = n_labels
    self.num_numerical_attributes = n_features
    self.class_header = ["label_" + str(i) for i in range(self.num_labels)]
    self.attributes_header = ["att_num_" + str(i)
                              for i in range(self.num_numerical_attributes)]
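# As a quick illustration of the generation step inside __configure above, a
# standalone call with arbitrary parameter values; a recent scikit-learn
# returns a dense label indicator matrix by default.
from sklearn.datasets import make_multilabel_classification

X, y = make_multilabel_classification(n_samples=200, n_features=10,
                                      n_classes=4, n_labels=2,
                                      random_state=0)
print(X.shape)  # (200, 10) -> n_samples x n_features
print(y.shape)  # (200, 4)  -> n_samples x n_targets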
def setup_mlc_dataset(self):
    X, Y = datasets.make_multilabel_classification(n_features=5,
                                                   random_state=1126)
    return Dataset(X, Y)
def check_classifier_on_multilabel_or_multioutput_targets(name, Estimator):
    estimator = Estimator()
    X, y = make_multilabel_classification(n_samples=30)

    msg = "Multilabel and multioutput targets are not supported."
    with pytest.raises(ValueError, match=msg):
        estimator.fit(X, y)
def test_sparse_input(name, sparse_matrix):
    X, y = datasets.make_multilabel_classification(random_state=0,
                                                   n_samples=50)
    check_sparse_input(name, X, sparse_matrix(X), y)
def test_random_hasher_sparse_data():
    X, y = datasets.make_multilabel_classification(random_state=0)
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X_transformed = hasher.fit_transform(X)
    X_transformed_sparse = hasher.fit_transform(csc_matrix(X))
    assert_array_equal(X_transformed_sparse.toarray(),
                       X_transformed.toarray())
# # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
#
# # X holds the news articles, but let's check in what format.

# In[1]:

# Parallelizing using Pool.apply()
import multiprocessing as mp
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification

X, _ = make_multilabel_classification(random_state=0)
print(X)
print(X.shape)
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(X)
lda.transform(X[-2:])

# OK, so it seems X is a NumPy array containing 100 rows with 20 columns
# each: one row vector per news article, holding the count of occurrences of
# each word (out of a total of 20 words). This is what is called a
# "Bag of Words".
#
# In our case we have to take all the news articles, remove stopword-type
# words (words like "of", "the", "a", ...) and build a list of all the
# unique words appearing across the whole set of articles (which will surely
# be more than 20, but still); a CountVectorizer sketch follows below.

# ## Read and clean the CSV

# In[1]:
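# The bag-of-words construction described above can be sketched with
# scikit-learn's CountVectorizer; the sample documents and the stop-word
# choice are illustrative assumptions, not the notebook's actual data or
# pipeline.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat", "a dog chased the cat"]
vectorizer = CountVectorizer(stop_words='english')  # drops "the", "a", "on", ...
X_bow = vectorizer.fit_transform(docs)  # sparse document-term count matrix
print(vectorizer.get_feature_names_out())  # the unique vocabulary
print(X_bow.toarray())  # one row of word counts per document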
def dump_multilabel_classification(
        model, suffix="", folder=None, allow_failure=None, verbose=False,
        label_string=False, first_class=0, comparable_outputs=None,
        target_opset=None):
    """
    Trains and dumps a model for a multilabel classification problem.
    The function trains a model and calls
    :func:`dump_data_and_model`.

    Every created filename will follow the pattern:
    ``<folder>/<prefix><task><classifier-name><suffix>.<data|expected|model|onnx>.<pkl|onnx>``.
    """
    X = [[0, 1], [1, 1], [2, 0], [0.5, 0.5], [1.1, 1.1], [2.1, 0.1]]
    X = numpy.array(X, dtype=numpy.float32)
    if label_string:
        y = [["l0"], ["l1"], ["l2"], ["l0", "l1"], ["l1"], ["l2"]]
    else:
        y = [[0 + first_class], [1 + first_class], [2 + first_class],
             [0 + first_class, 1 + first_class], [1 + first_class],
             [2 + first_class]]
    y = MultiLabelBinarizer().fit_transform(y)
    model.fit(X, y)
    if verbose:
        print("[make_multilabel_classification] model '{}'".format(
            model.__class__.__name__))
    model_onnx, prefix = convert_model(
        model, "multi-class classifier",
        [("input", FloatTensorType([None, 2]))],
        target_opset=target_opset)
    if verbose:
        print("[make_multilabel_classification] model was converted")
    dump_data_and_model(
        X.astype(numpy.float32), model, model_onnx, folder=folder,
        allow_failure=allow_failure,
        basename=prefix + "Mcl" + model.__class__.__name__ + suffix,
        verbose=verbose, comparable_outputs=comparable_outputs)

    X, y = make_multilabel_classification(40, n_features=4, random_state=42,
                                          n_classes=3)
    X = X[:, :2]
    model.fit(X, y)
    if verbose:
        print("[make_multilabel_classification] model '{}'".format(
            model.__class__.__name__))
    model_onnx, prefix = convert_model(
        model, "multi-class classifier",
        [("input", FloatTensorType([None, 2]))])
    if verbose:
        print("[make_multilabel_classification] model was converted")
    dump_data_and_model(
        X[:10].astype(numpy.float32), model, model_onnx, folder=folder,
        allow_failure=allow_failure,
        basename=prefix + "RndMla" + model.__class__.__name__ + suffix,
        verbose=verbose, comparable_outputs=comparable_outputs)
filename = input("Введите путь файла: ") # Define the color maps for plots color_map = plt.cm.get_cmap('RdYlBu') color_map_discrete = matplotlib.colors.LinearSegmentedColormap.from_list("", ["red","cyan","magenta","blue"]) fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(18, 7)) plt_ind_list = np.arange(3) + 131 dataset_x = [] dataset_y = [] dataset_sparse = [] labels = [1,2,4] for label, plt_ind in zip(labels, plt_ind_list): x, y = dt.make_multilabel_classification(n_samples=1000, n_features=4, n_labels=label, n_classes=5, random_state=rand_state) target = np.sum(y * [1,1,1,1,1], axis=1) dataset_x.append(x) dataset_y.append(y) plt.subplot(plt_ind) my_scatter_plot = plt.scatter(x[:, 0], x[:, 1], c=target, vmin=min(target), vmax=max(target), cmap=color_map) plt.title('n_labels: ' + str(label)) n_ds_x = np.concatenate(dataset_x) n_ds_y = np.concatenate(dataset_y)