Пример #1
0
def test_ovr():
    """Check that a one-vs-rest fit recovers three distinct row patterns.

    The training set is three fixed feature rows, one per label, repeated
    100 times; the fitted model is then asked to label the first three rows.
    """
    row_patterns = np.array([[0, 0, 1, 1], [1, 1, 0, 0], [-1, -1, -1, -1]])
    labels = np.array([0, 1, 2])

    # Stack 100 copies of the patterns / labels, keeping their 0, 1, 2 order.
    X = np.tile(row_patterns, (100, 1))
    y = np.tile(labels, 100)

    base_estimator = LogisticRegression(solver="liblinear")
    ovr = DistOneVsRestClassifier(base_estimator)
    ovr.fit(X, y)

    preds = ovr.predict(X[:3])
    assert np.allclose(preds, labels)
Пример #2
0
def test_multiclass(spark_session):
    """Fit a distributed one-vs-rest model on the digits data.

    The model is given the SparkContext taken from ``spark_session``; the
    test only checks that the predictions have the same shape as the
    held-out labels.
    """
    # load sample data (digit labels, i.e. a multiclass target)
    digits = load_digits()
    features, labels = digits["data"], digits["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=10
    )

    # distributed one vs rest: the SparkContext is passed to the classifier
    base_estimator = LogisticRegression(solver="liblinear")
    classifier = DistOneVsRestClassifier(
        base_estimator, spark_session.sparkContext
    )
    # distributed fitting with spark
    classifier.fit(X_train, y_train)
    # predictions on the driver
    predicted = classifier.predict(X_test)

    assert predicted.shape == y_test.shape
Пример #3
0
# Example: one-vs-rest classification of the digits data with
# DistOneVsRestClassifier, followed by weighted scoring and a pickle round trip.
# NOTE(review): `sc` is not defined in this snippet; presumably a SparkContext
# created earlier -- confirm.

# settings
scoring_average = "weighted"
solver = "liblinear"
test_size = 0.2

# load sample data (digit labels, i.e. a multiclass target)
data = load_digits()
X, y = data["data"], data["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=10
)

### distributed one vs rest
model = DistOneVsRestClassifier(LogisticRegression(solver=solver), sc)
# distributed fitting with spark
model.fit(X_train, y_train)
# predictions on the driver: class labels, then class probabilities
preds = model.predict(X_test)
probs = model.predict_proba(X_test)

# results: each metric is computed and printed in turn
print("-- One Vs Rest --")
for template, metric in (
    ("Weighted F1: {0}", f1_score),
    ("Precision: {0}", precision_score),
    ("Recall: {0}", recall_score),
):
    print(template.format(metric(y_test, preds, average=scoring_average)))
# serialize the fitted model, load it back, and print the restored object
print(pickle.loads(pickle.dumps(model)))
Пример #4
0
# Example: grid search over the wrapped estimator's `C` with a multiclass
# wrapper nested inside DistGridSearchCV.
# NOTE(review): `sc`, `params`, `cv` and `scoring` are not defined in this
# snippet; presumably set up earlier in the file -- confirm.

# create dataset: 100000 samples, 40 features, 40 classes
X, y = make_classification(
    n_samples=100000, n_features=40, n_informative=36, n_redundant=1,
    n_repeated=1, n_classes=40, n_clusters_per_class=1, random_state=5,
)

# one nested example: one-vs-rest inside the grid search
ovr_estimator = DistOneVsRestClassifier(
    LogisticRegression(solver="liblinear"), sc=sc
)
model = DistGridSearchCV(
    ovr_estimator, {"estimator__C": params}, cv=cv, scoring=scoring
)
model.fit(X, y)
# show the mean test score next to each tried value of C
cv_results = pd.DataFrame(model.cv_results_)
print(cv_results[["mean_test_score", "param_estimator__C"]])

# another nested example: one-vs-one inside the grid search
ovo_estimator = DistOneVsOneClassifier(
    LogisticRegression(solver="liblinear"), sc=sc
)
model = DistGridSearchCV(
    ovo_estimator, {"estimator__C": params}, cv=cv, scoring=scoring
)
model.fit(X, y)