# Example 1: distributed vs. local LGBMClassifier (including predict_proba)
def test_classifier(output, centers, client, listen_port):
    """The distributed classifier should match local LightGBM on easy data."""
    X, y, w, dX, dy, dw = _create_data(objective='classification',
                                       output=output,
                                       centers=centers)

    dask_model = dlgbm.DaskLGBMClassifier(
        time_out=5,
        local_listen_port=listen_port,
        n_estimators=10,
        num_leaves=10,
    )
    dask_model = dask_model.fit(dX, dy, sample_weight=dw, client=client)
    pred_dask = dask_model.predict(dX)
    proba_dask = dask_model.predict_proba(dX).compute()
    # Score on the lazy prediction before materializing it.
    acc_dask = accuracy_score(dy, pred_dask)
    pred_dask = pred_dask.compute()

    local_model = lightgbm.LGBMClassifier(n_estimators=10, num_leaves=10)
    local_model.fit(X, y, sample_weight=w)
    pred_local = local_model.predict(X)
    proba_local = local_model.predict_proba(X)
    acc_local = local_model.score(X, y)

    # Both paths should classify this dataset perfectly and agree with
    # each other; probabilities only need to be roughly close.
    assert_eq(acc_dask, acc_local)
    assert_eq(pred_dask, pred_local)
    assert_eq(y, pred_dask)
    assert_eq(y, pred_local)
    assert_eq(proba_dask, proba_local, atol=0.3)

    client.close()
# Example 2: distributed vs. local LGBMClassifier with default model parameters
def test_classifier(output, centers, client, listen_port):
    """Dask classifier with default params should match local LightGBM."""
    X, y, w, dX, dy, dw = _create_data('classification',
                                       output=output,
                                       centers=centers)

    dask_model = dlgbm.DaskLGBMClassifier(time_out=5,
                                          local_listen_port=listen_port)
    dask_model = dask_model.fit(dX, dy, sample_weight=dw, client=client)
    pred_dask = dask_model.predict(dX)
    # Score on the lazy prediction before materializing it.
    acc_dask = accuracy_score(dy, pred_dask)
    pred_dask = pred_dask.compute()

    local_model = lightgbm.LGBMClassifier()
    local_model.fit(X, y, sample_weight=w)
    pred_local = local_model.predict(X)
    acc_local = local_model.score(X, y)

    assert_eq(acc_dask, acc_local)

    # Predictions from both training paths should be identical and correct.
    assert_eq(pred_dask, pred_local)
    assert_eq(y, pred_dask)
    assert_eq(y, pred_local)
# Example 3: same comparison driven from an explicit test cluster/client
def test_classifier(loop, output, listen_port, centers):
    """Compare distributed and local LightGBM inside a throwaway cluster."""
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:
            X, y, w, dX, dy, dw = _create_data('classification',
                                               output=output,
                                               centers=centers)

            dask_model = dlgbm.LGBMClassifier(local_listen_port=listen_port)
            dask_model = dask_model.fit(dX, dy, sample_weight=dw)
            p_dask = dask_model.predict(dX, client=client)
            # Score on the lazy prediction before materializing it.
            acc_dask = accuracy_score(dy, p_dask)
            p_dask = p_dask.compute()

            local_model = lightgbm.LGBMClassifier()
            local_model.fit(X, y, sample_weight=w)
            p_local = local_model.predict(X)
            acc_local = local_model.score(X, y)
            print(confusion_matrix(y, p_dask))
            print(confusion_matrix(y, p_local))

            assert_eq(acc_dask, acc_local)
            print(acc_dask)

            assert_eq(p_dask, p_local)
            assert_eq(y, p_dask)
            assert_eq(y, p_local)
    def generate_clusters(self, logger, cur_it, data_tr, data_val):
        """Cluster the W space and log train/validation clustering accuracy.

        Runs k-means and HDBSCAN over the train and validation features,
        scores each clustering against the true labels with
        ``accuracy_score``, writes the results through *logger*, and prints
        them to stdout.

        :param logger: summary writer exposing
            ``summarize(it, summarizer=..., summaries_dict=...)``
        :param cur_it: current iteration, used as the summary step
        :param data_tr: training split exposing ``.x`` (features) and ``.labels``
        :param data_val: validation split with the same interface
        """
        # Generating W space
        print('Generating W space ...')

        # The kmeans and hdbscan passes were previously copy-pasted; run
        # them through one loop instead (output order is unchanged).
        for alg in ('kmeans', 'hdbscan'):
            cluster_tr = self.do_clustering(data_tr.x, alg=alg)
            cluster_val = self.do_clustering(data_val.x, alg=alg)

            accuracy_tr = accuracy_score(data_tr.labels, cluster_tr)
            accuracy_val = accuracy_score(data_val.labels, cluster_val)

            logger.summarize(cur_it,
                             summarizer='train',
                             summaries_dict={f'{alg}_cluster_acc': accuracy_tr})
            logger.summarize(cur_it,
                             summarizer='test',
                             summaries_dict={f'{alg}_cluster_acc': accuracy_val})

            print(f'TRAIN | {alg} Clustering Acc: ', accuracy_tr)
            print(f'VALID | {alg} Clustering Acc: ', accuracy_val)

        # Drop the (potentially large) intermediates before returning.
        del cluster_tr, cluster_val, accuracy_tr, accuracy_val
        gc.collect()
# Example 5: dask-based metric computation helper
def _calc_score_dask(y_true, y_preds, y_proba=None, metrics=('accuracy',), task=const.TASK_BINARY, pos_label=1,
                     classes=None, average=None):
    """Compute the requested *metrics* on dask collections.

    Supports the string metrics ``'accuracy'`` and ``'logloss'`` plus any
    callable ``metric(y_true, y_preds)``; unknown strings are logged and
    skipped. Returns a dict mapping each metric to its score.
    """
    import dask_ml.metrics as dm_metrics
    from ._toolbox import DaskToolBox

    def _as_array(value):
        # Normalize dask DataFrame/Series and (n, 1) arrays to 1-d arrays
        # with known chunk sizes; None passes through untouched.
        if value is None:
            return None
        if isinstance(value, (dd.DataFrame, dd.Series)):
            value = value.values
        if len(value.shape) == 2 and value.shape[-1] == 1:
            value = value.reshape(-1)
        return DaskToolBox.make_chunk_size_known(value)

    y_true = _as_array(y_true)
    y_preds = _as_array(y_preds)
    y_proba = _as_array(y_proba)

    # Align prediction chunks with the ground-truth chunking.
    if y_true.chunks[0] != y_preds.chunks[0]:
        logger.debug(f'rechunk y_preds with {y_true.chunks[0]}')
        y_preds = y_preds.rechunk(chunks=y_true.chunks[0])

    if y_proba is None:
        # Fall back to hard predictions when no probabilities were given.
        y_proba = y_preds
    elif y_true.chunks[0] != y_proba.chunks[0]:
        # Only the first axis is realigned; extra axes keep their chunking.
        if len(y_proba.chunks) > 1:
            chunks = (y_true.chunks[0],) + y_proba.chunks[1:]
        else:
            chunks = y_true.chunks
        logger.debug(f'rechunk y_proba with {chunks}')
        y_proba = y_proba.rechunk(chunks=chunks)

    score = {}
    for metric in metrics:
        if callable(metric):
            score[metric.__name__] = metric(y_true, y_preds)
            continue
        key = metric.lower()
        if key == 'accuracy':
            score[metric] = dm_metrics.accuracy_score(y_true, y_preds)
        elif key == 'logloss':
            ll = dm_metrics.log_loss(y_true, y_proba, labels=classes)
            if hasattr(ll, 'compute'):
                ll = ll.compute()
            score[metric] = ll
        else:
            logger.warning(f'unknown metric: {metric}')
    return score
# Example 6: distributed vs. local LGBMClassifier, flat fixture-based variant
def test_classifier(output, centers, client, listen_port):  # noqa
    """Distributed training should reproduce the local LightGBM results."""
    X, y, w, dX, dy, dw = _create_data('classification',
                                       output=output,
                                       centers=centers)

    dask_model = dlgbm.LGBMClassifier(time_out=5, local_listen_port=listen_port)
    dask_model = dask_model.fit(dX, dy, sample_weight=dw, client=client)
    p_dask = dask_model.predict(dX, client=client)
    # Score on the lazy prediction before materializing it.
    acc_dask = accuracy_score(dy, p_dask)
    p_dask = p_dask.compute()

    local_model = lightgbm.LGBMClassifier()
    local_model.fit(X, y, sample_weight=w)
    p_local = local_model.predict(X)
    acc_local = local_model.score(X, y)
    print(confusion_matrix(y, p_dask))
    print(confusion_matrix(y, p_local))

    assert_eq(acc_dask, acc_local)
    print(acc_dask)

    assert_eq(p_dask, p_local)
    assert_eq(y, p_dask)
    assert_eq(y, p_local)
# Example 7: accuracy metric wrapper that accepts dask Delayed predictions
    def compute(self, y_true, y_pred, sample_weight=None, **kwargs):
        """Return the accuracy of *y_pred* against *y_true*.

        Parameters
        ----------
        :param y_true : 1d array-like, or label indicator array / sparse matrix
            Ground truth (correct) labels.
        :param y_pred : 1d array-like, or label indicator array / sparse matrix
            Predicted labels, as returned by a classifier. A dask ``Delayed``
            is materialized with ``.compute()`` before scoring.
        :param sample_weight : array-like of shape = [n_samples], optional
            Sample weights.
        :param normalize: bool, optional(default=True)
            If ``False``, return the number of correctly classified samples.
            Otherwise, return the fraction of correctly classified samples.

        Returns
        -------
        :return score : float
        """
        normalize = kwargs.pop("normalize", True)
        # BUG FIX: sample_weight was previously hard-coded to None here,
        # silently discarding the weights supplied by the caller.
        return metrics.accuracy_score(
            y_true,
            y_pred.compute() if isinstance(y_pred, Delayed) else y_pred,
            normalize=normalize,
            sample_weight=sample_weight)
# Example 8: dask-xgboost script fragment — fit, inspect importances, score
# Fit the previously configured estimator on the training partition.
# NOTE(review): `est`, `train`, `train_labels`, `test`, `test_labels` and
# `data` are defined earlier in the original script — not visible here.
model = est.fit(train, train_labels)

# Rank features by the fitted model's importance scores.
import pandas as pd
featureimp = pd.DataFrame(model.feature_importances_)
featureimp.columns = ['classifier_feature_importance']
featureimp["variable"] = data['feature_names']
print("\n\n === Xgboost Classifier Feature Importance: === ")
print(
    featureimp.sort_values(by="classifier_feature_importance",
                           ascending=False))
#featureimp.to_csv()

# Lazy predictions on the held-out set (dask collection, not yet computed).
ypred = model.predict(test)

# Sample a few predictions to eyeball the output.
print("\n Sample initial five predictions: ")
print(ypred[[0, 1, 2, 3, 4]].compute())

# Sanity check: the model predicts classes other than 0.
print("\n Check classes other than zero predicted: ")
print(ypred[ypred > 0].compute())

# Accuracy on the test set via dask-ml's distributed metric.
from dask_ml import metrics
print("\n\n Model Accuracy: ")
print(metrics.accuracy_score(test_labels, model.predict(test)))

print("\n === End Dask Xgboost === \n")