示例#1
0
def test_equivalence_subsemble():
    """[SequentialEnsemble] Test ensemble equivalence with Subsemble.

    A ``Subsemble`` and a ``SequentialEnsemble`` carrying one ``'subsemble'``
    layer are given the same estimators (``ECM``) and the same ``dtype``;
    after fitting on ``(X, y)`` their predictions on ``X`` must be identical.
    """
    reference = Subsemble(n_jobs=1)
    sequential = SequentialEnsemble(n_jobs=1)

    # Both ensembles receive exactly one layer built from the same estimators.
    reference.add(ECM, dtype=np.float64)
    sequential.add('subsemble', ECM, dtype=np.float64)

    expected = reference.fit(X, y).predict(X)
    actual = sequential.fit(X, y).predict(X)

    np.testing.assert_array_equal(actual, expected)
示例#2
0
def test_equivalence_subsemble():
    """[Sequential] Test ensemble equivalence with Subsemble.

    A default ``Subsemble`` and a default ``SequentialEnsemble`` holding one
    ``'subset'`` layer are built from the same estimators (``ECM``); after
    fitting on ``(X, y)`` their predictions on ``X`` must be identical.
    """
    reference = Subsemble()
    sequential = SequentialEnsemble()

    # Both ensembles receive exactly one layer built from the same estimators.
    reference.add(ECM)
    sequential.add('subset', ECM)

    expected = reference.fit(X, y).predict(X)
    actual = sequential.fit(X, y).predict(X)

    np.testing.assert_array_equal(actual, expected)
示例#3
0
def test_subset_equiv():
    """[Subsemble] Test equivalence with SuperLearner for J=1.

    A ``Subsemble`` restricted to a single partition and a default
    ``SuperLearner`` are given the same estimators (``ECM``); after fitting
    on ``(X, y)`` their predictions on ``X`` must be identical.
    """
    single_partition = Subsemble(partitions=1)
    super_learner = SuperLearner()

    single_partition.add(ECM, dtype=np.float64)
    super_learner.add(ECM, dtype=np.float64)

    subsemble_pred = single_partition.fit(X, y).predict(X)
    super_learner_pred = super_learner.fit(X, y).predict(X)

    np.testing.assert_array_equal(super_learner_pred, subsemble_pred)
示例#4
0
def test_subset_fit():
    """[Subsemble] 'fit' and 'predict' runs correctly.

    The expected output comes from an ``OLS`` learner fitted on the
    module-level fixture ``F`` and applied to ``P`` (presumably the
    precomputed layer outputs for fit and predict -- confirm against the
    fixture setup). A ``Subsemble`` with an ``OLS`` meta learner must
    reproduce it exactly.
    """
    reference_meta = OLS()
    reference_meta.fit(F, y)
    expected = reference_meta.predict(P)

    ensemble = Subsemble()
    ensemble.add(ECM, partitions=2, folds=3, dtype=np.float64)
    ensemble.add_meta(OLS(), dtype=np.float64)
    ensemble.fit(X, y)

    actual = ensemble.predict(X)
    np.testing.assert_array_equal(actual, expected)
示例#5
0
def build_clustered_subsemble(estimator):
    """Build a two-partition subsemble whose partitions come from ``estimator``.

    ``estimator`` is forwarded as ``partition_estimator``. The ensemble uses
    two folds, verbosity level 2, an ``SVC`` plus ``LogisticRegression`` base
    layer and an ``SVC`` meta learner. The unfitted ensemble is returned.
    """
    ensemble = Subsemble(
        partitions=2,
        partition_estimator=estimator,
        folds=2,
        verbose=2,
    )

    base_learners = [SVC(), LogisticRegression()]
    ensemble.add(base_learners)
    ensemble.add_meta(SVC())
    return ensemble
def add_subsemble(name, models, X_train, Y_train, X_test, Y_test):
    """Fit a Subsemble with an SVC meta learner and report accuracy and runtime.

    Parameters
    ----------
    name
        Label stored under ``"Ensemble"`` in the returned record.
    models
        Base learners handed to ``Subsemble.add`` as the first layer.
    X_train, Y_train
        Data the ensemble is fitted on.
    X_test, Y_test
        Data the ensemble predicts on and is scored against.

    Returns
    -------
    dict
        ``"Ensemble"`` (``name``), ``"Meta_Classifier"`` (always ``"SVC"``),
        ``"Accuracy_Score"`` and ``"Runtime"`` (elapsed seconds covering fit,
        predict and scoring).
    """
    # ``seed`` is read from module scope; it is not a parameter of this function.
    ensemble = Subsemble(scorer=accuracy_score, random_state=seed)

    ensemble.add(models)
    # Attach the final meta estimator
    ensemble.add(SVC(), meta=True)

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments the way a time.time() difference can.
    start = time.perf_counter()
    ensemble.fit(X_train, Y_train)
    preds = ensemble.predict(X_test)
    # Conventional (y_true, y_pred) argument order.
    acc_score = accuracy_score(Y_test, preds)
    elapsed = time.perf_counter() - start

    return {
        "Ensemble": name,
        "Meta_Classifier": "SVC",
        "Accuracy_Score": acc_score,
        "Runtime": elapsed
    }
示例#7
0
                        'machine-learning-databases/'
                        'poker/poker-hand-testing.data')
    else:
        raise ValueError("Not valid data option.")

    X = np.loadtxt(out, delimiter=",")
    y = X[:, -1]
    X = X[:, :-1]
    return X, y


# Load both splits; get_data returns the feature columns and the last column
# of the loaded table as the target.
xtrain, ytrain = get_data('train')
xtest, ytest = get_data('test')

# One default-configured instance of each ensemble class, keyed by a short name.
estimators = {
    'subsemble': Subsemble(),
    'super_learner': SuperLearner(),
    'blend_ensemble': BlendEnsemble()
}

# NOTE(review): base_learners is not referenced in the visible part of this
# file; the loop below adds freshly constructed default estimators instead.
# Confirm whether the tuned learners were meant to be used there.
base_learners = [
    RandomForestClassifier(n_estimators=500,
                           max_depth=10,
                           min_samples_split=50,
                           max_features=0.6),
    LogisticRegression(C=1e5),
    GradientBoostingClassifier()
]

# Give every ensemble the same three-estimator layer.
for clf in estimators.values():
    clf.add([RandomForestClassifier(), LogisticRegression(), MLPClassifier()])
示例#8
0
def build_subsemble():
    """Build a subsemble with three partitions and two folds.

    No partition estimator is supplied, so partitioning is left to the
    ``Subsemble`` default. The single layer holds an ``SVC`` and a
    ``LogisticRegression``; the unfitted ensemble is returned.
    """
    ensemble = Subsemble(partitions=3, folds=2)
    base_learners = [SVC(), LogisticRegression()]
    ensemble.add(base_learners)
    return ensemble
示例#9
0
    def __init__(self):
        """Create the partitioner; it takes no arguments and stores no state."""
        pass

    def our_custom_function(self, X, y=None):
        """Label each row 1 if its feature sum exceeds the mean row sum, else 0.

        ``y`` is accepted for interface compatibility but is not used.
        """
        row_totals = X.sum(axis=1)
        above_average = row_totals > row_totals.mean()
        # Labels should be numerical, so the boolean mask is multiplied by 1.
        return 1 * above_average

    def get_params(self, deep=False):
        """Return an empty parameter mapping; ``deep`` has no effect.

        The constructor takes no arguments, so there is nothing to report.
        """
        return dict()


# Note that the number of partitions the estimator creates *must* match the
# ``partitions`` argument passed to the subsemble.

# partitions=2 matches the two labels (0 and 1) that our_custom_function emits.
sub = Subsemble(partitions=2, folds=3, verbose=1)
# ``attr`` names the method to call on the partition estimator.
# NOTE(review): fit_estimator=False presumably skips fitting the partitioner
# before that call -- confirm against the Subsemble API documentation.
sub.add([SVC(), LogisticRegression()],
        partition_estimator=SimplePartitioner(),
        fit_estimator=False,
        attr="our_custom_function")

# X and y are defined earlier in the file, outside this excerpt.
sub.fit(X, y)

##############################################################################
# A final word of caution. When implementing custom estimators from scratch, some
# care needs to be taken if you plan on copying the Subsemble. It is advised that
# the estimator inherits the :class:`sklearn.base.BaseEstimator` class to
# provide a Scikit-learn compatible interface. For further information,
# see the :ref:`API` documentation of the :class:`Subsemble`
# and :class:`mlens.base.indexer.ClusteredSubsetIndex`.
#
示例#10
0
# determine if we are building a classifier model
# array_equal compares shape as well as values, so a target whose distinct
# values are anything other than exactly 0 and 1 is treated as regression
# without relying on an elementwise comparison against a two-element list
# (which is a shape mismatch for any other number of distinct values).
classifier = np.array_equal(np.unique(Y.to_numpy()), [0, 1])
outputs = Y.shape[1]

# separate the data into training and testing
# one fifth of the rows (rounded down) is held out for testing
n_rows = X.shape[0]
n_test = int(n_rows / 5)
if TIME_SERIES:
    # take the last n_test rows; an explicit start offset avoids the
    # ``[-0:]`` pitfall, which would select every row when n_test is 0
    test_idx = X.index.values[n_rows - n_test:]
else:
    np.random.seed(1)
    test_idx = np.random.choice(a=X.index.values, size=n_test, replace=False)
# setdiff1d yields the remaining index labels as sorted unique values, giving
# a reproducible training order instead of arbitrary set iteration order
train_idx = np.setdiff1d(X.index.values, test_idx)

# set up the model
# NOTE(review): each estimator is passed in its own ``add`` call; confirm
# whether that stacks separate layers or was meant to be a single layer.
if classifier:
    model = Subsemble(partitions=2, random_state=42, n_jobs=1)
    model.add(KNeighborsClassifier())
    model.add(RandomForestClassifier())
    model.add(GaussianNB())
    model.add_meta(LogisticRegression(penalty="l1", solver="saga"))
else:
    model = Subsemble(partitions=2, random_state=42, n_jobs=1)
    model.add(KNeighborsRegressor())
    model.add(RandomForestRegressor())
    model.add(BayesianRidge())
    model.add_meta(Lasso())

# train and predict
# empty frames that the per-target loop below fills in
train_predict = pd.DataFrame()
test_predict = pd.DataFrame()
for j in Y.columns: