def test_classifier(output, centers, client, listen_port):
    """Train a DaskLGBMClassifier and verify it matches a local LGBMClassifier.

    Compares predictions, probabilities (loose tolerance), and accuracy
    between the distributed and the single-process models.
    """
    X, y, w, dX, dy, dw = _create_data(objective='classification', output=output, centers=centers)

    # Distributed fit/predict.
    dask_clf = dlgbm.DaskLGBMClassifier(
        time_out=5,
        local_listen_port=listen_port,
        n_estimators=10,
        num_leaves=10,
    )
    dask_clf = dask_clf.fit(dX, dy, sample_weight=dw, client=client)
    preds_dask = dask_clf.predict(dX)
    proba_dask = dask_clf.predict_proba(dX).compute()
    score_dask = accuracy_score(dy, preds_dask)
    preds_dask = preds_dask.compute()

    # Local reference fit/predict with matching hyperparameters.
    local_clf = lightgbm.LGBMClassifier(n_estimators=10, num_leaves=10)
    local_clf.fit(X, y, sample_weight=w)
    preds_local = local_clf.predict(X)
    proba_local = local_clf.predict_proba(X)
    score_local = local_clf.score(X, y)

    assert_eq(score_dask, score_local)
    assert_eq(preds_dask, preds_local)
    assert_eq(y, preds_dask)
    assert_eq(y, preds_local)
    # Probabilities only need to roughly agree between the two trainings.
    assert_eq(proba_dask, proba_local, atol=0.3)

    client.close()
def test_classifier(output, centers, client, listen_port):
    """Verify DaskLGBMClassifier predictions/accuracy agree with local LightGBM."""
    X, y, w, dX, dy, dw = _create_data('classification', output=output, centers=centers)

    # Fit the distributed classifier through the Dask client.
    dask_clf = dlgbm.DaskLGBMClassifier(time_out=5, local_listen_port=listen_port)
    dask_clf = dask_clf.fit(dX, dy, sample_weight=dw, client=client)
    preds_dask = dask_clf.predict(dX)
    score_dask = accuracy_score(dy, preds_dask)
    preds_dask = preds_dask.compute()

    # Fit the single-process reference model on the concrete arrays.
    local_clf = lightgbm.LGBMClassifier()
    local_clf.fit(X, y, sample_weight=w)
    preds_local = local_clf.predict(X)
    score_local = local_clf.score(X, y)

    assert_eq(score_dask, score_local)
    assert_eq(preds_dask, preds_local)
    assert_eq(y, preds_dask)
    assert_eq(y, preds_local)
def test_classifier(loop, output, listen_port, centers):
    """Compare dask-lightgbm and local LightGBM inside a throwaway test cluster."""
    with cluster() as (scheduler, [worker_a, worker_b]):
        with Client(scheduler['address'], loop=loop) as client:
            X, y, w, dX, dy, dw = _create_data('classification', output=output, centers=centers)

            # Distributed model: fit on dask collections, score lazily.
            dask_clf = dlgbm.LGBMClassifier(local_listen_port=listen_port)
            dask_clf = dask_clf.fit(dX, dy, sample_weight=dw)
            preds_dask = dask_clf.predict(dX, client=client)
            score_dask = accuracy_score(dy, preds_dask)
            preds_dask = preds_dask.compute()

            # Local reference model on the concrete arrays.
            local_clf = lightgbm.LGBMClassifier()
            local_clf.fit(X, y, sample_weight=w)
            preds_local = local_clf.predict(X)
            score_local = local_clf.score(X, y)

            # Diagnostic output for debugging mismatches.
            print(confusion_matrix(y, preds_dask))
            print(confusion_matrix(y, preds_local))

            assert_eq(score_dask, score_local)
            print(score_dask)

            assert_eq(preds_dask, preds_local)
            assert_eq(y, preds_dask)
            assert_eq(y, preds_local)
def generate_clusters(self, logger, cur_it, data_tr, data_val):
    """Cluster the W-space embeddings with each algorithm and log the accuracies.

    For every clustering algorithm, clusters the train and validation
    features (``data_tr.x`` / ``data_val.x``), scores the cluster
    assignments against the true labels, sends the scores to the summary
    logger, and prints them.

    Defect fixed: the original duplicated the entire cluster/score/log block
    once per algorithm ('kmeans' and 'hdbscan'); the duplication is collapsed
    into a single loop with identical behavior.

    :param logger: summary logger exposing ``summarize(it, summarizer=..., summaries_dict=...)``
    :param cur_it: current iteration index passed to the logger
    :param data_tr: training split with ``.x`` features and ``.labels``
    :param data_val: validation split with ``.x`` features and ``.labels``
    """
    # Generating W space
    print('Generating W space ...')
    for alg in ('kmeans', 'hdbscan'):
        cluster_tr = self.do_clustering(data_tr.x, alg=alg)
        cluster_val = self.do_clustering(data_val.x, alg=alg)
        accuracy_tr = accuracy_score(data_tr.labels, cluster_tr)
        accuracy_val = accuracy_score(data_val.labels, cluster_val)
        # Log train and validation accuracy under '<alg>_cluster_acc'.
        logger.summarize(cur_it, summarizer='train',
                         summaries_dict={f'{alg}_cluster_acc': accuracy_tr})
        logger.summarize(cur_it, summarizer='test',
                         summaries_dict={f'{alg}_cluster_acc': accuracy_val})
        print(f'TRAIN | {alg} Clustering Acc: ', accuracy_tr)
        print(f'VALID | {alg} Clustering Acc: ', accuracy_val)
    # Release the (potentially large) cluster assignments eagerly.
    del cluster_tr, cluster_val, accuracy_tr, accuracy_val
    gc.collect()
def _calc_score_dask(y_true, y_preds, y_proba=None, metrics=('accuracy',), task=const.TASK_BINARY,
                     pos_label=1, classes=None, average=None):
    """Compute the requested metrics on dask collections.

    Inputs are normalized to 1-D dask arrays with known chunk sizes, and
    ``y_preds`` / ``y_proba`` are rechunked to align with ``y_true`` before
    scoring with ``dask_ml.metrics``. Callables in *metrics* are invoked
    directly on ``(y_true, y_preds)``; string metrics support 'accuracy'
    and 'logloss'. Returns a dict mapping metric name -> score.
    """
    import dask_ml.metrics as dm_metrics
    from ._toolbox import DaskToolBox

    def _as_known_array(arr):
        # None passes through; dataframes/series become arrays; (n, 1)
        # columns are flattened; chunk sizes are made concrete.
        if arr is None:
            return None
        if isinstance(arr, (dd.DataFrame, dd.Series)):
            arr = arr.values
        if len(arr.shape) == 2 and arr.shape[-1] == 1:
            arr = arr.reshape(-1)
        return DaskToolBox.make_chunk_size_known(arr)

    y_true = _as_known_array(y_true)
    y_preds = _as_known_array(y_preds)
    y_proba = _as_known_array(y_proba)

    # Align prediction chunks with the ground-truth chunks.
    if y_true.chunks[0] != y_preds.chunks[0]:
        logger.debug(f'rechunk y_preds with {y_true.chunks[0]}')
        y_preds = y_preds.rechunk(chunks=y_true.chunks[0])

    if y_proba is None:
        # Fall back to hard predictions when probabilities are absent.
        y_proba = y_preds
    elif y_true.chunks[0] != y_proba.chunks[0]:
        # Only the first axis is realigned; class axes keep their chunking.
        if len(y_proba.chunks) > 1:
            chunks = (y_true.chunks[0],) + y_proba.chunks[1:]
        else:
            chunks = y_true.chunks
        logger.debug(f'rechunk y_proba with {chunks}')
        y_proba = y_proba.rechunk(chunks=chunks)

    score = {}
    for metric in metrics:
        if callable(metric):
            score[metric.__name__] = metric(y_true, y_preds)
            continue
        key = metric.lower()
        if key == 'accuracy':
            score[metric] = dm_metrics.accuracy_score(y_true, y_preds)
        elif key == 'logloss':
            ll = dm_metrics.log_loss(y_true, y_proba, labels=classes)
            # log_loss may return a lazy scalar; materialize it.
            score[metric] = ll.compute() if hasattr(ll, 'compute') else ll
        else:
            logger.warning(f'unknown metric: {metric}')
    return score
def test_classifier(output, centers, client, listen_port):  # noqa
    """Check the dask-lightgbm classifier against a local LightGBM baseline."""
    X, y, w, dX, dy, dw = _create_data('classification', output=output, centers=centers)

    # Distributed training and prediction via the supplied client.
    dask_clf = dlgbm.LGBMClassifier(time_out=5, local_listen_port=listen_port)
    dask_clf = dask_clf.fit(dX, dy, sample_weight=dw, client=client)
    preds_dask = dask_clf.predict(dX, client=client)
    score_dask = accuracy_score(dy, preds_dask)
    preds_dask = preds_dask.compute()

    # Local baseline on the concrete arrays.
    local_clf = lightgbm.LGBMClassifier()
    local_clf.fit(X, y, sample_weight=w)
    preds_local = local_clf.predict(X)
    score_local = local_clf.score(X, y)

    # Diagnostics for debugging mismatches.
    print(confusion_matrix(y, preds_dask))
    print(confusion_matrix(y, preds_local))

    assert_eq(score_dask, score_local)
    print(score_dask)

    assert_eq(preds_dask, preds_local)
    assert_eq(y, preds_dask)
    assert_eq(y, preds_local)
def compute(self, y_true, y_pred, sample_weight=None, **kwargs):
    """Compute the accuracy score, materializing a Delayed prediction first.

    Defect fixed: the original accepted and documented ``sample_weight``
    but passed a hard-coded ``sample_weight=None`` to
    ``metrics.accuracy_score``, silently ignoring caller-supplied weights.
    The received value is now forwarded.

    Parameters
    ----------
    :param y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) _labels.
    :param y_pred : 1d array-like, or label indicator array / sparse matrix
        Predicted _labels, as returned by a classifier. May be a dask
        ``Delayed``, in which case it is computed before scoring.
    :param sample_weight : array-like of shape = [n_samples], optional
        Sample weights.
    :param normalize: bool, optional(default=True)
        If ``False``, return the number of misclassifications.
        Otherwise, return the fraction of misclassifications.

    Returns
    -------
    :return score : float
    """
    normalize = kwargs.pop("normalize", True)
    # Materialize lazy dask results before handing them to sklearn.
    if isinstance(y_pred, Delayed):
        y_pred = y_pred.compute()
    return metrics.accuracy_score(
        y_true, y_pred, normalize=normalize, sample_weight=sample_weight)
model = est.fit(train, train_labels) #which features contribute most import pandas as pd featureimp = pd.DataFrame(model.feature_importances_) featureimp.columns = ['classifier_feature_importance'] featureimp["variable"] = data['feature_names'] print("\n\n === Xgboost Classifier Feature Importance: === ") print( featureimp.sort_values(by="classifier_feature_importance", ascending=False)) #featureimp.to_csv() #predictions ypred = model.predict(test) #sample some predictions print("\n Sample initial five predictions: ") print(ypred[[0, 1, 2, 3, 4]].compute()) #ensure model is predicting all classes - not just 0 print("\n Check classes other than zero predicted: ") print(ypred[ypred > 0].compute()) #check accuracy on test set from dask_ml import metrics print("\n\n Model Accuracy: ") print(metrics.accuracy_score(test_labels, model.predict(test))) print("\n === End Dask Xgboost === \n")