Exemplo n.º 1
0
def test_classifier_chain_vs_independent_models():
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models
    # NOTE(review): fetch_mldata was removed in scikit-learn 0.22; this
    # snippet only runs on older versions (fetch_openml is the successor).
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    # Targets come back label-major; transpose to (n_samples, n_labels).
    Y = yeast['target'].transpose().toarray()
    # Fixed split: first 2000 samples train, remainder test.
    X_train = X[:2000, :]
    X_test = X[2000:, :]
    Y_train = Y[:2000, :]
    Y_test = Y[2000:, :]

    # Baseline: one independent logistic regression per label.
    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    # Chain with a hand-picked label order; each link sees the previous
    # links' predictions as extra features, exploiting label correlations.
    chain = ClassifierChain(LogisticRegression(),
                            order=np.array([0, 2, 4, 6, 8, 10,
                                            12, 1, 3, 5, 7, 9,
                                            11, 13]))
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    # The chain must beat the independent models on Jaccard similarity.
    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
Exemplo n.º 2
0
def test_classifier_chain_fit_and_predict_with_sparse_data_and_cv():
    """A chain fitted through cross_val_predict must accept sparse input."""
    features, labels = generate_multilabel_dataset_with_correlations()
    sparse_features = sp.csr_matrix(features)
    chain = ClassifierChain(LogisticRegression(), cv=3)
    chain.fit(sparse_features, labels)
    predictions = chain.predict(sparse_features)
    assert_equal(predictions.shape, labels.shape)
Exemplo n.º 3
0
def test_classifier_chain_fit_and_predict_with_linear_svc():
    """Chain around LinearSVC: decision_function drives predict."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LinearSVC())
    chain.fit(X, Y)

    predictions = chain.predict(X)
    assert_equal(predictions.shape, Y.shape)

    # Thresholding the decision values at zero must reproduce predict().
    decisions = chain.decision_function(X)
    assert_array_equal(decisions >= 0, predictions)

    # LinearSVC has no probabilities, so the chain must not expose them.
    assert not hasattr(chain, 'predict_proba')
Exemplo n.º 4
0
def test_classifier_chain_fit_and_predict_with_logistic_regression():
    """Chain around LogisticRegression: proba threshold matches predict."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LogisticRegression())
    chain.fit(X, Y)

    predictions = chain.predict(X)
    assert_equal(predictions.shape, Y.shape)

    probabilities = chain.predict_proba(X)
    assert_array_equal(probabilities >= .5, predictions)

    # Link i sees the original features plus i earlier labels, so the
    # coefficient count grows by one per position in the chain.
    expected_sizes = list(range(X.shape[1], X.shape[1] + Y.shape[1]))
    assert_equal([link.coef_.size for link in chain.estimators_],
                 expected_sizes)
Exemplo n.º 5
0
def test_naiveBayes(df, truth, eval_type):
    """10-fold cross-validate a multinomial Naive Bayes classifier chain.

    Parameters
    ----------
    df : DataFrame whose ``.values`` is the feature matrix.
    truth : array-like multilabel target.
    eval_type : scoring string forwarded to cross_val_score.

    Returns
    -------
    (["NB"], [scores]) : model-name list and per-fold score arrays.
    """
    clf = MultinomialNB()
    classifier = ClassifierChain(clf)
    # Bug fix: shuffle=True is required for random_state to take effect;
    # recent scikit-learn raises ValueError when random_state is set while
    # shuffle is False (older versions silently ignored it).
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)
    scores = cross_val_score(classifier,
                             df.values,
                             truth,
                             cv=kfold,
                             scoring=eval_type)

    return ["NB"], [scores]
Exemplo n.º 6
0
def test_classifier_chain_vs_independent_models():
    """A classifier chain must beat independent per-label models.

    Scoring uses jaccard_score(average='samples') because the legacy
    jaccard_similarity_score alias was deprecated in scikit-learn 0.21 and
    removed in 0.23 (other snippets in this file already use jaccard_score).
    """
    X, Y = generate_multilabel_dataset_with_correlations()
    X_train = X[:600, :]
    X_test = X[600:, :]
    Y_train = Y[:600, :]
    Y_test = Y[600:, :]

    # Baseline: one independent logistic regression per label.
    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    # The chain can exploit the label correlations built into the dataset.
    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_score(Y_test, Y_pred_chain, average='samples'),
                   jaccard_score(Y_test, Y_pred_ovr, average='samples'))
Exemplo n.º 7
0
def test_classifier_chain_vs_independent_models():
    """Sample-wise Jaccard: the chain must outperform independent models."""
    X, Y = generate_multilabel_dataset_with_correlations()
    split = 600
    X_train, X_test = X[:split, :], X[split:, :]
    Y_train, Y_test = Y[:split, :], Y[split:, :]

    independent = OneVsRestClassifier(LogisticRegression())
    independent.fit(X_train, Y_train)
    Y_pred_ovr = independent.predict(X_test)

    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    score_chain = jaccard_score(Y_test, Y_pred_chain, average='samples')
    score_ovr = jaccard_score(Y_test, Y_pred_ovr, average='samples')
    assert_greater(score_chain, score_ovr)
Exemplo n.º 8
0
def test_best_AdaBoost(df, truth, eval_type):
    """10-fold cross-validate an AdaBoost-based classifier chain.

    Parameters
    ----------
    df : DataFrame whose ``.values`` is the feature matrix.
    truth : array-like multilabel target.
    eval_type : scoring string forwarded to cross_val_score.

    Returns
    -------
    ndarray of the 10 per-fold scores.
    """
    clf = AdaBoostClassifier(n_estimators=50)
    classifier = ClassifierChain(clf)
    # Bug fix: shuffle=True is required for random_state to take effect;
    # recent scikit-learn raises ValueError when random_state is set while
    # shuffle is False (older versions silently ignored it).
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)
    print("Start crossvalidation...")
    scores = cross_val_score(classifier,
                             df.values,
                             truth,
                             cv=kfold,
                             scoring=eval_type)
    print(f"Crossvalidation done. Mean: {np.mean(scores)}")
    return scores
Exemplo n.º 9
0
def test_base_chain_fit_and_predict_with_sparse_data_and_cv():
    """Both chain flavours must handle sparse input when cv is set."""
    X, Y = generate_multilabel_dataset_with_correlations()
    sparse_X = sp.csr_matrix(X)
    for chain in (ClassifierChain(LogisticRegression(), cv=3),
                  RegressorChain(Ridge(), cv=3)):
        chain.fit(sparse_X, Y)
        assert chain.predict(sparse_X).shape == Y.shape
Exemplo n.º 10
0
def test_classifier_chain_fit_and_predict_with_sparse_data():
    """Sparse and dense input must yield identical chain predictions."""
    X, Y = generate_multilabel_dataset_with_correlations()

    sparse_chain = ClassifierChain(LogisticRegression())
    sparse_chain.fit(sp.csr_matrix(X), Y)
    Y_pred_sparse = sparse_chain.predict(sp.csr_matrix(X))

    dense_chain = ClassifierChain(LogisticRegression())
    dense_chain.fit(X, Y)
    Y_pred_dense = dense_chain.predict(X)

    assert_array_equal(Y_pred_sparse, Y_pred_dense)
Exemplo n.º 11
0
def XGBoostChain(X_train, y_train, X_test):
    """Fit an ensemble of 10 XGBoost classifier chains and average them.

    Each chain uses a different random label order so the ensemble is not
    tied to a single ordering.

    Returns
    -------
    ndarray of shape (n_samples, n_labels) with the averaged per-label
    probabilities.  (Bug fix: the original only printed this value and
    implicitly returned None, making the function useless to callers.)
    """
    print("fitting the data")
    # Fitting X-Gradient boosting
    gbc = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

    chains = [ClassifierChain(gbc, order='random', random_state=i)
              for i in range(10)]
    for chain in chains:
        chain.fit(X_train, y_train)

    Y_pred_chains = np.array([chain.predict_proba(X_test) for chain in chains])
    Y_pred_ensemble = Y_pred_chains.mean(axis=0)
    print(Y_pred_ensemble)
    return Y_pred_ensemble
def test_classifier_chain_random_order():
    """order='random' must fit a true permutation and behave exactly like
    the equivalent fixed-order chain."""
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_random = ClassifierChain(LogisticRegression(),
                                              order='random',
                                              random_state=42)
    classifier_chain_random.fit(X, Y)
    Y_pred_random = classifier_chain_random.predict(X)

    # Bug fix: the original compared the constructor parameter ``order``
    # (the string 'random') against range(4), which is trivially unequal
    # and tested nothing.  The fitted permutation lives in ``order_``.
    assert_not_equal(list(classifier_chain_random.order_), list(range(4)))
    assert_equal(len(classifier_chain_random.order_), 4)
    assert_equal(len(set(classifier_chain_random.order_)), 4)

    classifier_chain_fixed = \
        ClassifierChain(LogisticRegression(),
                        order=classifier_chain_random.order_)
    classifier_chain_fixed.fit(X, Y)
    Y_pred_fixed = classifier_chain_fixed.predict(X)

    # Randomly ordered chain should behave identically to a fixed order chain
    # with the same order.
    assert_array_equal(Y_pred_random, Y_pred_fixed)
Exemplo n.º 13
0
def test_classifier_chain_vs_independent_models():
    """On the yeast data a hand-ordered chain beats per-label models."""
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    # Targets arrive label-major; transpose to (n_samples, n_labels).
    Y = yeast['target'].transpose().toarray()
    split = 2000
    X_train, X_test = X[:split, :], X[split:, :]
    Y_train, Y_test = Y[:split, :], Y[split:, :]

    independent = OneVsRestClassifier(LogisticRegression())
    independent.fit(X_train, Y_train)
    Y_pred_ovr = independent.predict(X_test)

    label_order = np.array([0, 2, 4, 6, 8, 10, 12, 1, 3, 5, 7, 9, 11, 13])
    chain = ClassifierChain(LogisticRegression(), order=label_order)
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
Exemplo n.º 14
0
def test_best_rf(df, truth, eval_type):
    """10-fold cross-validate a random-forest-based classifier chain.

    Parameters
    ----------
    df : DataFrame whose ``.values`` is the feature matrix.
    truth : array-like multilabel target.
    eval_type : scoring string forwarded to cross_val_score.

    Returns
    -------
    ndarray of the 10 per-fold scores.
    """
    clf = RandomForestClassifier(n_estimators=200,
                                 max_depth=50,
                                 max_features='auto',
                                 criterion='entropy')
    classifier = ClassifierChain(clf)
    # Bug fix: shuffle=True is required for random_state to take effect;
    # recent scikit-learn raises ValueError when random_state is set while
    # shuffle is False (older versions silently ignored it).
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)
    print("Start crossvalidation...")
    scores = cross_val_score(classifier,
                             df.values,
                             truth,
                             cv=kfold,
                             scoring=eval_type)
    print(f"Crossvalidation done. Mean: {np.mean(scores)}")
    return scores
Exemplo n.º 15
0
def test_base_chain_random_order():
    """Random-order chains must fit a true permutation and match the
    equivalent fixed-order chain coefficient-for-coefficient."""
    X, Y = generate_multilabel_dataset_with_correlations()
    for chain in [ClassifierChain(LogisticRegression()), RegressorChain(Ridge())]:
        chain_random = clone(chain).set_params(order="random", random_state=42)
        chain_random.fit(X, Y)
        chain_fixed = clone(chain).set_params(order=chain_random.order_)
        chain_fixed.fit(X, Y)
        assert_array_equal(chain_fixed.order_, chain_random.order_)
        # Bug fix: ``order`` is the constructor parameter (the string
        # "random"); the fitted permutation is ``order_``.  Comparing the
        # string against range(4) was trivially true and tested nothing.
        assert list(chain_random.order_) != list(range(4))
        assert len(chain_random.order_) == 4
        assert len(set(chain_random.order_)) == 4
        # Randomly ordered chain should behave identically to a fixed order
        # chain with the same order.
        for est1, est2 in zip(chain_random.estimators_, chain_fixed.estimators_):
            assert_array_almost_equal(est1.coef_, est2.coef_)
Exemplo n.º 16
0
def _get_training_pipeline(X_train, y_train_binarized):
    """Assemble the default training pipeline.

    TODO: Externalize the parameters of this default configuration to a file
    """
    preprocessor = FunctionTransformer(text_prepare,
                                       kw_args={"join_symbol": " "})
    vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem,
                                 ngram_range=(1, 3), norm="l2",
                                 max_df=0.9, min_df=5)
    classifier = ClassifierChain(
        LogisticRegression(C=10, penalty="l1", dual=False,
                           solver="liblinear"))

    steps = [("preprocessor", preprocessor),
             ("tfidf", vectorizer),
             ("clf", classifier)]
    return Pipeline(steps, verbose=True)
Exemplo n.º 17
0
def test_base_chain_fit_and_predict():
    """Both chain flavours fit, predict the right shape, and grow one
    coefficient per chain position."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chains = [RegressorChain(Ridge()), ClassifierChain(LogisticRegression())]
    expected_sizes = list(range(X.shape[1], X.shape[1] + Y.shape[1]))
    for chain in chains:
        chain.fit(X, Y)
        Y_pred = chain.predict(X)
        assert Y_pred.shape == Y.shape
        assert [link.coef_.size for link in chain.estimators_] == expected_sizes

    # For the classifier chain (the last one fitted above), thresholding
    # the probabilities at 0.5 must reproduce predict().
    Y_prob = chains[1].predict_proba(X)
    assert_array_equal(Y_prob >= .5, Y_pred)

    assert isinstance(chains[1], ClassifierMixin)
Exemplo n.º 18
0
def test_classifier_chain_crossval_fit_and_predict():
    """cv-fitted chain predicts well but differently from a plain fit."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chain_cv = ClassifierChain(LogisticRegression(), cv=3)
    chain_cv.fit(X, Y)

    chain_plain = ClassifierChain(LogisticRegression())
    chain_plain.fit(X, Y)

    Y_pred_cv = chain_cv.predict(X)
    Y_pred = chain_plain.predict(X)

    assert_equal(Y_pred_cv.shape, Y.shape)
    # cross_val_predict-derived chain features still give a decent score...
    assert_greater(jaccard_similarity_score(Y, Y_pred_cv), 0.4)
    # ...but not the exact same predictions as fitting on all the data.
    assert_not_equal(jaccard_similarity_score(Y, Y_pred_cv),
                     jaccard_similarity_score(Y, Y_pred))
def train_cc(trainX_embedded, trainY_pruned, num_chains=FLAGS.num_chains):
    """Train ``num_chains`` SVM-based classifier chains with random orders.

    The base SVM kernel is chosen through FLAGS.mode; targets are binarized
    via ``trainY_pruned == 1`` before fitting.  Returns the fitted chains.
    """
    if FLAGS.mode == 'linear':
        base_svm = OneVsRestClassifier(SVC(kernel='linear', probability=False))
    elif FLAGS.mode == 'rbf':
        base_svm = OneVsRestClassifier(
            SVC(kernel='rbf', C=FLAGS.C, gamma=FLAGS.gamma, probability=False))
    else:
        # Everything default: rbf kernel with default C and gamma, see
        # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
        base_svm = OneVsRestClassifier(SVC())

    chains = [ClassifierChain(base_svm, order='random', random_state=seed)
              for seed in range(num_chains)]
    for position, chain in enumerate(chains, start=1):
        chain.fit(trainX_embedded, trainY_pruned == 1)
        print('chain', position, 'out of', num_chains, 'done')
    return chains
Exemplo n.º 20
0
def test_features(df, truth, eval_type):
    """Cross-validate a ClassifierChain(LDA) over the feature matrix.

    Returns
    -------
    (names, res) : model-name list and matching per-fold score arrays.
    """
    models = [('LDA', LinearDiscriminantAnalysis())]

    res = []
    names = []
    for mod_name, model in models:
        # Bug fix: shuffle=True is required for random_state to take
        # effect; recent scikit-learn raises ValueError when random_state
        # is set while shuffle is False.
        kfold = KFold(n_splits=10, shuffle=True, random_state=7)
        cv_results = cross_val_score(ClassifierChain(model),
                                     df.values,
                                     truth,
                                     cv=kfold,
                                     scoring=eval_type)
        res.append(cv_results)
        names.append(mod_name)
        msg = "%s: %f (%f)" % (mod_name, cv_results.mean(), cv_results.std())
        print(msg)
    return names, res
Exemplo n.º 21
0
def test_base_chain_crossval_fit_and_predict():
    """cv-fitted chains must differ from plain fits yet still score well."""
    X, Y = generate_multilabel_dataset_with_correlations()

    templates = (ClassifierChain(LogisticRegression()), RegressorChain(Ridge()))
    for template in templates:
        template.fit(X, Y)
        cv_variant = clone(template).set_params(cv=3)
        cv_variant.fit(X, Y)

        pred_cv = cv_variant.predict(X)
        pred_plain = template.predict(X)

        assert pred_cv.shape == pred_plain.shape
        # cross_val_predict features must change at least one prediction.
        assert not np.all(pred_plain == pred_cv)
        if isinstance(template, ClassifierChain):
            assert jaccard_score(Y, pred_cv, average="samples") > 0.4
        else:
            assert mean_squared_error(Y, pred_cv) < 0.25
Exemplo n.º 22
0
def CC_Fit(clfs, X_train, y_train, X_test, y_test, evaluate):
    """Fit 10 random-order chains per base classifier and score both the
    individual chains and their averaged ensemble for each metric."""
    metrics_cc = {}
    for key, clf in clfs.items():
        print('Fitting Chain %s' % key)
        chains = [ClassifierChain(clf, order='random', random_state=seed)
                  for seed in range(10)]
        for chain in chains:
            chain.fit(X_train, y_train)

        Y_pred_chains = np.array([chain.predict(X_test) for chain in chains])
        pred_ens = Y_pred_chains.mean(axis=0)

        # Per-chain scores and the thresholded-ensemble score per metric.
        for m in evaluate:
            metrics_cc[key + ' ' + m] = [
                scores(m, y_test, y_pred >= .5) for y_pred in Y_pred_chains
            ]
            metrics_cc[key + ' ' + m + ' ensemble'] = scores(
                m, y_test, pred_ens >= .5)
    return metrics_cc
Exemplo n.º 23
0
def multilabel_loo(data, y, lian):
    """10-fold CV of a two-level stacked multilabel model.

    Per fold, ``lian`` random-order classifier chains (each wrapping an
    extra-trees one-vs-rest ensemble) predict on the train and test split;
    their column-stacked outputs are reduced by ``wy.fea_extra`` and fed to
    a second-level one-vs-rest extra-trees model whose metrics are printed.

    NOTE(review): assumes ``data`` is a sequence of >= 10 feature matrices
    (indexed cyclically via ``i % 10``) and ``y`` the label matrix -- TODO
    confirm against the caller.
    """
    kf = KFold(n_splits=10)
    chain = OneVsRestClassifier(ExtraTreesClassifier(bootstrap=True,
                                                     n_estimators=120),
                                n_jobs=8)
    chains = [ClassifierChain(chain, order="random") for i in range(lian)]
    model = OneVsRestClassifier(ExtraTreesClassifier(bootstrap=True,
                                                     n_estimators=200),
                                n_jobs=8)
    metrics_total = []
    fea_train = np.array([])
    fea_test = np.array([])
    # for train_idx, test_idx in loo.split(data[0]):
    for train_idx, test_idx in kf.split(data[0], y):
        y_train, y_test = y[train_idx], y[test_idx]
        # First level: stack every chain's train/test predictions column-wise.
        for i in range(lian):
            X_train, X_test = data[i % 10][train_idx], data[i % 10][test_idx]
            clf = chains[i]
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            if i == 0:
                fea_train = clf.predict(X_train)
                fea_test = y_pred
            else:
                fea_train = np.hstack([fea_train, clf.predict(X_train)])
                fea_test = np.hstack([fea_test, y_pred])
        print(fea_train.shape)
        # Feature extraction/selection over the stacked first-level outputs.
        fea_train, fea_test = wy.fea_extra(fea_train, y_train, fea_test,
                                           y_test)
        print(fea_train.shape)

        # Second level: fit the meta-model on the reduced features.
        model.fit(fea_train, y_train)
        y_pred = model.predict(fea_test)
        y_score = model.predict_proba(fea_test)
        metrics = print_metrics(y_test, y_pred, y_score)
        print(y_test.shape, y_pred[:3])
        metrics_total.append(metrics)
    print([
        "Accuracy", "Average_precision", "coverage", "Hamming_loss",
        "One_error", "Ranking_loss", "Precision", "Recall"
    ])
    print(lian, '\n', metrics_total)
    print(np.mean(metrics_total, axis=0))
Exemplo n.º 24
0
def test_classifier_chain_fit_and_predict_with_linear_svc():
    """LinearSVC chain: predictions follow the sign of decision_function."""
    features, labels = generate_multilabel_dataset_with_correlations()
    svc_chain = ClassifierChain(LinearSVC())
    svc_chain.fit(features, labels)

    predicted = svc_chain.predict(features)
    assert_equal(predicted.shape, labels.shape)

    decision_values = svc_chain.decision_function(features)
    thresholded = decision_values >= 0
    assert_array_equal(thresholded, predicted)

    # LinearSVC exposes no predict_proba, so neither must the chain.
    assert not hasattr(svc_chain, 'predict_proba')
Exemplo n.º 25
0
def test_classifier_chain_random_order():
    """order='random' must fit a true permutation and match the
    equivalent fixed-order chain's predictions exactly."""
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_random = ClassifierChain(LogisticRegression(),
                                              order='random',
                                              random_state=42)
    classifier_chain_random.fit(X, Y)
    Y_pred_random = classifier_chain_random.predict(X)

    # Bug fix: ``order`` is the constructor parameter (the string
    # 'random'); the fitted permutation is ``order_``.  Comparing the
    # string against range(4) was trivially true and tested nothing.
    assert_not_equal(list(classifier_chain_random.order_), list(range(4)))
    assert_equal(len(classifier_chain_random.order_), 4)
    assert_equal(len(set(classifier_chain_random.order_)), 4)

    classifier_chain_fixed = \
        ClassifierChain(LogisticRegression(),
                        order=classifier_chain_random.order_)
    classifier_chain_fixed.fit(X, Y)
    Y_pred_fixed = classifier_chain_fixed.predict(X)

    # Randomly ordered chain should behave identically to a fixed order chain
    # with the same order.
    assert_array_equal(Y_pred_random, Y_pred_fixed)
Exemplo n.º 26
0
def test_classifier_chain_fit_and_predict_with_logistic_regression():
    """Logistic chain: proba>=0.5 equals predict; coef sizes grow by one."""
    features, labels = generate_multilabel_dataset_with_correlations()
    lr_chain = ClassifierChain(LogisticRegression())
    lr_chain.fit(features, labels)

    predicted = lr_chain.predict(features)
    assert_equal(predicted.shape, labels.shape)

    probabilities = lr_chain.predict_proba(features)
    assert_array_equal(probabilities >= .5, predicted)

    # Each link sees one more feature (the previous label) than the last.
    n_features, n_labels = features.shape[1], labels.shape[1]
    coef_sizes = [est.coef_.size for est in lr_chain.estimators_]
    assert_equal(coef_sizes, list(range(n_features, n_features + n_labels)))
def build_model():
    '''Build a grid-searched NLP pipeline ending in an AdaBoost chain.

    The implicit metric is strict full-row accuracy: a message only counts
    as correct when every category is predicted correctly.  During model
    selection a ClassifierChain beat MultiOutputClassifier here because it
    models category dependencies instead of fitting each label
    independently of the others.

    OUTPUT:
    GridSearchCV- a GridSearchCV object to a pipeline
    '''
    steps = [('bag_of_words', CountVectorizer(tokenizer=tokenize)),
             ('tdif', TfidfTransformer()),
             ('ClassifierChain', ClassifierChain(AdaBoostClassifier()))]
    pipeline = Pipeline(steps)

    parameters = {
        'ClassifierChain__base_estimator__n_estimators': [100, 150, 200],
        'ClassifierChain__base_estimator__learning_rate': [1, 0.8, 0.5]
    }

    return GridSearchCV(pipeline, parameters, verbose=3)
def build_model(X_training_set, Y_training_set):
    '''
    build_model - train and tune an NLP classifier-chain pipeline
    input: Training dataset and labels
    output: the best estimator found by randomized search
    '''
    vectorizer_grid = {
        'tfidfvect__ngram_range': ((1, 1), (1, 2)),
        'tfidfvect__max_df': (0.5, 0.75, 1.0),
        'tfidfvect__max_features': (None, 100, 500, 2000)
    }

    pipeline = Pipeline([
        ('tfidfvect', TfidfVectorizer(tokenizer=tokenize)),
        ('classifierchain', ClassifierChain(LGBMClassifier(n_jobs=-1)))
    ])

    print("build_model - model {} \n\n on X_train {} and Y_train {}".format(
        pipeline, X_training_set.shape, Y_training_set.shape))

    # Weighted F1 balances the per-category label frequencies.
    weighted_f1 = make_scorer(f1_score, average='weighted')

    # Randomized search over the vectorizer settings only.
    search = RandomizedSearchCV(pipeline,
                                vectorizer_grid,
                                scoring=weighted_f1,
                                n_jobs=-1)
    fitted = search.fit(X_training_set, Y_training_set)

    # summarize result
    print('"build_model - best Score: %s' % fitted.best_score_)
    print('"build_model - best Hyperparameters: %s' % fitted.best_params_)

    return (search.best_estimator_)
#X_mean, y_mean, mlb_mean = create_labels(data_mean.loc[data_mean.index.get_level_values(level=1)<200,:])
#X, y, mlb = create_labels(data.loc[data.index.get_level_values(level=1)<200,:])

# Use only 10% of the data to keep the SVC grid search tractable.
data_percentage = 0.1
X_train, X_test, y_train, y_test = train_test_split(
    #X, y,
    X_mixed,
    y_mixed,
    #X_mean, y_mean,
    # 67/33 split of the sub-sampled fraction.
    test_size=0.33 * data_percentage,
    train_size=0.67 * data_percentage,
    random_state=42)

scaler = StandardScaler()
#clf = MultiOutputClassifier(SVC())
# Chain of SVCs: each label's classifier also sees earlier predictions.
clf = ClassifierChain(base_estimator=SVC())
#clf = MLTSVM()

pca = PCA()

def to_Sparse(X):
    """Convert *X* to a SciPy CSR sparse matrix for the pipeline step."""
    return csr_matrix(X)


# Wrap the converter so it can be used as a pipeline step.
to_Sparse_transformed = FunctionTransformer(func=to_Sparse)

# Scale -> reduce dimensionality -> classify.
pipe = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('clf', clf)])

param_grid = {
    'pca__n_components': list(np.arange(5, 36, 5)),
# Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Baseline: independent one-vs-rest logistic regressions per label.
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_train, Y_train)
pred_ovr = ovr.predict(X_test)

from sklearn.metrics import jaccard_similarity_score
# NOTE(review): jaccard_similarity_score was removed in scikit-learn 0.23;
# jaccard_score(..., average='samples') is the modern equivalent.
ovr_score = jaccard_similarity_score(Y_test, pred_ovr)
ovr_score

from sklearn.multioutput import ClassifierChain

# Single classifier chain with one random label order.
cc = ClassifierChain(LogisticRegression(), order='random', random_state=42)
cc.fit(X_train, Y_train)
pred_cc = cc.predict(X_test)
cc_score = jaccard_similarity_score(Y_test, pred_cc)
cc_score

# Ensemble of 10 chains, each with a different random order.
chains = [
    ClassifierChain(LogisticRegression(), order='random', random_state=42 + i)
    for i in range(10)
]
for chain in chains:
    chain.fit(X_train, Y_train)

pred_chains = np.array([chain.predict(X_test) for chain in chains])
chain_scores = [
    jaccard_similarity_score(Y_test, pred_chain) for pred_chain in pred_chains
def build_model(X_train, y_train):
    """
    Build three candidate NLP pipelines, cross-validate each on the
    training data with weighted F1, and return the best one.

    Parameters:

    X_train, y_train : training messages and multilabel targets

    Returns:

    (best_pipeline, f1_results) : the winning (unfitted) pipeline and a
    dict of mean F1 scores keyed by pipeline name

    """

    # Random Forest Classifier - pipeline
    pipeline_rf = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=5)))
    ])

    # LGBMC
    pipeline_lgb = Pipeline([
        ('tfidfvect', TfidfVectorizer(tokenizer=tokenize)),
        ('multiclassifier', MultiOutputClassifier(LGBMClassifier(n_jobs=5)))
    ])

    # Classifier-chain variant of the LGBM pipeline (models label deps).
    pipeline_chain_lgb = Pipeline([
        ('tfidfvect', TfidfVectorizer(tokenizer=tokenize)),
        ('classifierchain', ClassifierChain(LGBMClassifier(n_jobs=5)))
    ])

    # Pipeline dictionary
    pipeline_dict = {'pipeline_rf': pipeline_rf, \
                     'pipeline_lgb ': pipeline_lgb, \
                     'pipeline_chain_lgb': pipeline_chain_lgb}

    # cross validation for F1 score
    f1_results = {}

    # Cross-validation of each pipeline
    for pipename, pipevalue in pipeline_dict.items():
        start_time = time.time()
        print("Training pipeline : {} ...".format(pipename))
        scores = cross_val_score(pipevalue,
                                 X_train,
                                 y_train,
                                 scoring='f1_weighted',
                                 cv=5)
        f1_results[pipename] = scores.mean()
        print("Pipeline : {} F1 mean score {}".format(pipename, scores.mean()))
        time_taken = round(((time.time() - start_time) / 60), 3)
        print("--- " + str(time_taken) + " minutes ---")

    # Best pipeline
    best_pipeline_name = max(f1_results, key=f1_results.get)
    best_pipeline = pipeline_dict[best_pipeline_name]

    return best_pipeline, f1_results
Exemplo n.º 32
0
# Build bag-of-words style count dictionaries for train and validation sets.
features = [item.split(" ") for item in train_df]
col_dicts = [make_dict(entry) for entry in features]

features_val = [item.split(" ") for item in val_df]
col_dicts_val = [make_dict(entry) for entry in features_val]

features_df = pd.DataFrame(col_dicts)
features_df_val = pd.DataFrame(col_dicts_val)

# Tokens absent from a document appear as NaN; treat them as count 0.
features_df = features_df.fillna(0)
features_df_val = features_df_val.fillna(0)
print('done cleanning')
X_train = np.array(features_df)
Y_train = np.array(encoded_labels_df)
x_val = np.array(features_df_val)
y_val = np.array(encoded_labels_df_val)

base_lr = LogisticRegression(max_iter=MAX_ITER, n_jobs=-1, verbose=1)

int_rand = np.random.randint(1000)
chain = ClassifierChain(base_lr, order='random', random_state=int_rand)

chain.fit(X_train, Y_train)
# Bug fix: the original read ``MAX_ITER + "_" + int_ran + ".pickle"`` --
# ``int_ran`` is undefined (NameError), and both values are ints so the
# concatenation would raise TypeError anyway.
filename = f"{MAX_ITER}_{int_rand}.pickle"
pickle.dump(chain, open(filename, 'wb'))

#loaded_model = pickle.load(open(filename, 'rb'))
print('start predict')
# Bug fix: ``chains`` was undefined -- only the single ``chain`` fitted
# above exists in this script.
Y_pred_chains = np.array([chain.predict_proba(x_val)])
Exemplo n.º 33
0
def getFitness(individual, X, y):
    """
    Feature subset fitness function

    ``individual`` is a 0/1 mask over the columns of ``X``: columns whose
    gene is 0 are dropped before evaluation.  Returns a 1-tuple holding
    the mean 5-fold cross-validation score (DEAP-style fitness).
    """

    if individual.count(0) != len(individual):
        # get index with value 0
        cols = [index for index in range(
            len(individual)) if individual[index] == 0]

        # get features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
        X_subset = pd.get_dummies(X_parsed)

        # X_subset = X
        #
        # for col in cols:
        #     X_subset[col].values[:] = 0

        # apply classification algorithm
        # NOTE(review): ``clf`` is rebound on every line below, so only the
        # LAST assignment (VotingClassifier()) is ever used -- the rest are
        # leftovers from manual experimentation.  As written this crashes:
        # VotingClassifier requires its ``estimators`` argument, and several
        # of the meta-estimators above also need a base estimator.  Keep
        # exactly one working classifier and remove the rest.
        clf = AdaBoostClassifier()
        clf = BaggingClassifier()
        clf = BernoulliNB()

        clf = CalibratedClassifierCV()
        clf = CategoricalNB()
        clf = ClassifierChain()
        clf = ComplementNB()

        clf = DecisionTreeClassifier()
        clf = DummyClassifier()

        clf = ExtraTreeClassifier()
        clf = ExtraTreesClassifier()

        clf = GaussianNB()
        clf = GaussianProcessClassifier()
        clf = GradientBoostingClassifier()

        # clf = HistGradientBoostingClassifier()

        clf = KNeighborsClassifier()

        clf = LabelPropagation()
        clf = LabelSpreading()
        clf = LinearDiscriminantAnalysis()
        clf = LinearSVC()
        clf = LogisticRegression()
        clf = LogisticRegressionCV()

        clf = MLPClassifier()
        clf = MultiOutputClassifier()
        clf = MultinomialNB()

        clf = NearestCentroid()
        clf = NuSVC()

        clf = OneVsOneClassifier()
        clf = OneVsRestClassifier()
        clf = OutputCodeClassifier()

        clf = PassiveAggressiveClassifier()
        clf = Perceptron()

        clf = QuadraticDiscriminantAnalysis()

        clf = RadiusNeighborsClassifier()
        clf = RandomForestClassifier()
        clf = RidgeClassifier()
        clf = RidgeClassifierCV()

        clf = SGDClassifier()
        clf = SVC()
        clf = StackingClassifier()

        clf = VotingClassifier()

        # clf.fit(X, y)
        # clf.fit(X_subset, y_train)
        clf.fit(X_subset, y)

        # y_pred_ANN = clf.predict(X_test)
        # y_pred = clf.predict(X_subset)

        # score = cross_val_score(clf, X, y, cv=5)
        #
        # print(max(score), min(score))

        return (avg(cross_val_score(clf, X_subset, y, cv=5)),)
        # return (avg(score),)
        # return accuracy_score(y, y_pred_ANN)
    else:
        # All genes zero: no features selected -> worst possible fitness.
        return (0,)
Exemplo n.º 34
0
        chain_cv.fit(X, Y)
        Y_pred_cv = chain_cv.predict(X)
        Y_pred = chain.predict(X)

        assert Y_pred_cv.shape == Y_pred.shape
        assert not np.all(Y_pred == Y_pred_cv)
        if isinstance(chain, ClassifierChain):
            assert jaccard_score(Y, Y_pred_cv, average='samples') > .4
        else:
            assert mean_squared_error(Y, Y_pred_cv) < .25


@pytest.mark.parametrize('estimator', [
    RandomForestClassifier(n_estimators=2),
    MultiOutputClassifier(RandomForestClassifier(n_estimators=2)),
    ClassifierChain(RandomForestClassifier(n_estimators=2))
])
def test_multi_output_classes_(estimator):
    # Tests classes_ attribute of multioutput classifiers
    # RandomForestClassifier supports multioutput out-of-the-box
    # X, y, n_outputs and classes are module-level fixtures defined
    # elsewhere in this test module.
    estimator.fit(X, y)
    assert isinstance(estimator.classes_, list)
    assert len(estimator.classes_) == n_outputs
    # Each per-output classes_ entry must match the expected label set.
    for estimator_classes, expected_classes in zip(classes,
                                                   estimator.classes_):
        assert_array_equal(estimator_classes, expected_classes)


class DummyRegressorWithFitParams(DummyRegressor):
    # Test double recording the fit_params it receives so tests can assert
    # they were routed through the multioutput wrapper.
    def fit(self, X, y, sample_weight=None, **fit_params):
        # NOTE(review): only stores the params; the visible snippet never
        # calls super().fit(), so the estimator stays unfitted here --
        # confirm the class is not truncated in this chunk.
        self._fit_params = fit_params
Exemplo n.º 35
0
# NOTE(review): fetch_mldata was removed in scikit-learn 0.22; fetch_openml
# is the modern replacement.
yeast = fetch_mldata('yeast')
X = yeast['data']
# Targets arrive label-major; transpose to (n_samples, n_labels).
Y = yeast['target'].transpose().toarray()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,
                                                    random_state=0)

# Fit an independent logistic regression model for each class using the
# OneVsRestClassifier wrapper.
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
ovr_jaccard_score = jaccard_similarity_score(Y_test, Y_pred_ovr)

# Fit an ensemble of logistic regression classifier chains and take the
# average prediction of all the chains.
chains = [ClassifierChain(LogisticRegression(), order='random', random_state=i)
          for i in range(10)]
for chain in chains:
    chain.fit(X_train, Y_train)

Y_pred_chains = np.array([chain.predict(X_test) for chain in
                          chains])
# Score each chain's (already binary) predictions individually.
chain_jaccard_scores = [jaccard_similarity_score(Y_test, Y_pred_chain >= .5)
                        for Y_pred_chain in Y_pred_chains]

# Average of the chains' predictions, thresholded back to binary labels.
Y_pred_ensemble = Y_pred_chains.mean(axis=0)
ensemble_jaccard_score = jaccard_similarity_score(Y_test,
                                                  Y_pred_ensemble >= .5)

model_scores = [ovr_jaccard_score] + chain_jaccard_scores
model_scores.append(ensemble_jaccard_score)
Exemplo n.º 36
0
print("Hamming loss for decision tree classifier: ", loss_tree)

from sklearn.multiclass import OneVsRestClassifier
# Time and score a one-vs-rest wrapper around the kNN base estimator.
# NOTE(review): time.clock() was removed in Python 3.8; use
# time.perf_counter() on modern interpreters.
t0 = clock()
onerest = OneVsRestClassifier(knn)
onerest.fit(X_train, Y_train)
Y_pred = onerest.predict(X_test)
t_onerest = clock() - t0
#print(Y_test)
#print(Y_pred)
# Element-wise mismatch rate == Hamming loss for binary label indicators.
loss_onerest = np.mean(Y_pred != Y_test)
print("Hamming loss for One vs Rest classifier: ", loss_onerest)

from sklearn.multioutput import ClassifierChain
# Same timing/scoring protocol for a classifier chain over kNN.
t0 = clock()
classfierchain = ClassifierChain(knn)
classfierchain.fit(X_train, Y_train)
Y_pred = classfierchain.predict(X_test)
t_chain = clock() - t0
#print(Y_test)
#print(Y_pred)
loss_chain = np.mean(Y_pred != Y_test)
print("Hamming loss for classifier chain: ", loss_chain)

# One x-tick per 10 epochs of the recorded training-time history.
arr_epoch = np.arange(1, len(time_h) + 1) * 10

plt.figure(figsize=(12, 9))
plt.plot(arr_epoch, time_h, label='my network', c='k')
plt.axhline(t_nn, c='r', label='Default network of Sklearn')
plt.axhline(
    t_knn,
Exemplo n.º 37
0
# 80/20 split over the configured feature and class columns.
X_train, X_test, Y_train, Y_test = train_test_split(df[features_list], df[classes_list], test_size=.2,
                                                    random_state=0)


# Fit an independent logistic regression model for each class using the
# OneVsRestClassifier wrapper.
base_lr = LogisticRegression(solver='lbfgs')
ovr = OneVsRestClassifier(base_lr)
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
# NOTE(review): jaccard_similarity_score was removed in scikit-learn 0.23;
# jaccard_score(..., average='samples') is the replacement.
ovr_jaccard_score = jaccard_similarity_score(Y_test, Y_pred_ovr)

# Fit an ensemble of logistic regression classifier chains and take the
# average prediction of all the chains.
chains = [ClassifierChain(base_lr, order='random', random_state=i)
          for i in range(10)]
for chain in chains:
    chain.fit(X_train, Y_train)

Y_pred_chains = np.array([chain.predict(X_test) for chain in
                          chains])
# Score each chain's (already binary) predictions individually.
chain_jaccard_scores = [jaccard_similarity_score(Y_test, Y_pred_chain >= .5)
                        for Y_pred_chain in Y_pred_chains]

# Average of the chains' predictions, thresholded back to binary labels.
Y_pred_ensemble = Y_pred_chains.mean(axis=0)
ensemble_jaccard_score = jaccard_similarity_score(Y_test,
                                                  Y_pred_ensemble >= .5)

model_scores = [ovr_jaccard_score] + chain_jaccard_scores
model_scores.append(ensemble_jaccard_score)
Exemplo n.º 38
0
    mlpregressor=MLPRegressor,
    stackingregressor=StackingRegressor,
    mlxtendstackingregressor=StackingRegressor,
    mlxtendstackingcvregressor=StackingCVRegressor,
    votingregressor=VotingRegressor)

# Registry of estimator families, keyed by task name.
_ESTIMATOR_DICT = dict(regression=_REGRESSOR_DICT)

# Gaussian-process kernels available by lowercase name.
_KERNEL_DICT = dict(dotproduct=DotProduct, rbf=RBF, whitekernel=WhiteKernel)

# We need to identify chaining as the fit method capitalizes
# the target Y, whereas other estimators conventionally write
# the target as y.
_CHAIN_FLAG = [
    RegressorChain(base_estimator=DummyRegressor()).__class__,
    ClassifierChain(base_estimator=DummyClassifier()).__class__
]

# We need to identify CatBoost as the predict method utilizes the
# parameter data for the conventional design matrix parameter X.
_CATBOOST_FLAG = cat.CatBoostRegressor().__class__

# Target types (as reported by sklearn's type_of_target) that are
# treated as multi-target problems.
_MULTI_TARGET = ['continuous-multioutput', 'multiclass-multioutput']

# Optimizers accepted by scipy.optimize.minimize, plus 'custom'.
_OPTIMIZE_METHOD = [
    'Nelder-Mead', 'Powell', 'CG', 'BFGS', 'Newton-CG', 'L-BFGS-B', 'TNC',
    'COBYLA', 'SLSQP', 'trust-constr', 'dogleg', 'trust-ncg', 'trust-exact',
    'trust-krylov', 'custom'
]

# Short names of the supported scoring choices.
_SCORE_CHOICE = ['mae', 'mse', 'rmse', 'r2', 'ev', 'msle']