def testRocCurve(self):
    import numpy as np
    import pandas as pd
    import mars.dataframe as md
    from mars.learn.metrics import roc_curve, auc
    from sklearn.metrics import roc_curve as sklearn_roc_curve, auc as sklearn_auc

    client = self.odps.create_mars_cluster(1, 4, 8, name=str(uuid.uuid4()))
    try:
        rs = np.random.RandomState(0)
        raw = pd.DataFrame({
            'a': rs.randint(0, 10, (10,)),
            'b': rs.rand(10),
        })

        df = md.DataFrame(raw)
        y = df['a'].to_tensor().astype('int')
        pred = df['b'].to_tensor().astype('float')

        # pos_label=2 treats label 2 as the positive class among the 0-9 labels.
        fpr, tpr, thresholds = roc_curve(y, pred, pos_label=2)
        m = auc(fpr, tpr)

        sk_fpr, sk_tpr, sk_thresholds = sklearn_roc_curve(
            raw['a'].to_numpy().astype('int'),
            raw['b'].to_numpy().astype('float'),
            pos_label=2)
        expect_m = sklearn_auc(sk_fpr, sk_tpr)
        self.assertAlmostEqual(m.fetch(), expect_m)
    finally:
        client.stop_server()

from sklearn.metrics import roc_curve as sklearn_roc_curve


def roc_curve(predictions, targets):
    r"""Compute an ROC curve from tensor inputs by delegating to scikit-learn.

    Expects ``predictions`` and ``targets`` to support ``.view()`` and
    ``.numpy()`` (e.g. torch tensors).
    """
    # Reshape to column vectors and convert to NumPy before calling scikit-learn.
    predictions = predictions.view(-1, 1).numpy()
    targets = targets.view(-1, 1).numpy()
    fpr, tpr, _ = sklearn_roc_curve(targets, predictions)
    return fpr, tpr

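# A quick usage sketch for the wrapper above (not from the original source;
# assumes PyTorch is installed, and the toy data is made up).
import torch

targets = torch.tensor([0., 0., 1., 1., 0., 1.])
predictions = torch.tensor([0.1, 0.4, 0.35, 0.8, 0.2, 0.9])

fpr, tpr = roc_curve(predictions, targets)
print(fpr, tpr)
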
def testRocCurveAuc(self):
    service_ep = 'http://127.0.0.1:' + self.web_port
    timeout = 120 if 'CI' in os.environ else -1
    with new_session(service_ep) as sess:
        run_kwargs = {'timeout': timeout}

        rs = np.random.RandomState(0)
        raw = pd.DataFrame({
            'a': rs.randint(0, 10, (10,)),
            'b': rs.rand(10),
        })

        df = md.DataFrame(raw)
        y = df['a'].to_tensor().astype('int')
        pred = df['b'].to_tensor().astype('float')

        fpr, tpr, thresholds = roc_curve(y, pred, pos_label=2,
                                         session=sess, run_kwargs=run_kwargs)
        m = auc(fpr, tpr, session=sess, run_kwargs=run_kwargs)

        sk_fpr, sk_tpr, sk_thresholds = sklearn_roc_curve(
            raw['a'].to_numpy().astype('int'),
            raw['b'].to_numpy().astype('float'),
            pos_label=2)
        expect_m = sklearn_auc(sk_fpr, sk_tpr)
        self.assertAlmostEqual(m.fetch(session=sess), expect_m)

def roc_curve(y_true, y_pred_proba):
    """Given labels and classifier predicted probabilities, compute and return
    the data representing a Receiver Operating Characteristic (ROC) curve.
    Works with binary or multiclass problems.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True labels.
        y_pred_proba (ww.DataColumn, pd.Series or np.ndarray): Predictions from a
            classifier, before thresholding has been applied.

    Returns:
        list(dict): A list of dictionaries, one per class. Binary classification
            problems return a list with one dictionary. Each dictionary contains
            the metrics needed to generate an ROC plot, with the following keys:

            * `fpr_rates`: False positive rates.
            * `tpr_rates`: True positive rates.
            * `thresholds`: Threshold values used to produce each pair of
              true/false positive rates.
            * `auc_score`: The area under the ROC curve.
    """
    y_true = _convert_to_woodwork_structure(y_true)
    y_pred_proba = _convert_to_woodwork_structure(y_pred_proba)
    if isinstance(y_pred_proba, ww.DataTable):
        y_pred_proba = _convert_woodwork_types_wrapper(y_pred_proba.to_dataframe()).to_numpy()
    else:
        y_pred_proba = _convert_woodwork_types_wrapper(y_pred_proba.to_series()).to_numpy()
    y_true = _convert_woodwork_types_wrapper(y_true.to_series()).to_numpy()

    # Reduce binary problems to a single column of positive-class probabilities.
    if len(y_pred_proba.shape) == 1:
        y_pred_proba = y_pred_proba.reshape(-1, 1)
    if y_pred_proba.shape[1] == 2:
        y_pred_proba = y_pred_proba[:, 1].reshape(-1, 1)

    # Drop rows where either the label or any predicted probability is NaN.
    nan_indices = np.logical_or(pd.isna(y_true), np.isnan(y_pred_proba).any(axis=1))
    y_true = y_true[~nan_indices]
    y_pred_proba = y_pred_proba[~nan_indices]

    # One-hot encode the labels so that one ROC curve can be computed per class.
    lb = LabelBinarizer()
    lb.fit(np.unique(y_true))
    y_one_hot_true = lb.transform(y_true)
    n_classes = y_one_hot_true.shape[1]

    curve_data = []
    for i in range(n_classes):
        fpr_rates, tpr_rates, thresholds = sklearn_roc_curve(y_one_hot_true[:, i], y_pred_proba[:, i])
        auc_score = sklearn_auc(fpr_rates, tpr_rates)
        curve_data.append({
            'fpr_rates': fpr_rates,
            'tpr_rates': tpr_rates,
            'thresholds': thresholds,
            'auc_score': auc_score,
        })

    return curve_data

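# A minimal, self-contained sketch of the same one-vs-rest approach used above,
# with plain NumPy and scikit-learn and without the Woodwork conversion helpers
# (the toy data and names here are illustrative only).
import numpy as np
from sklearn.metrics import roc_curve as sklearn_roc_curve, auc as sklearn_auc
from sklearn.preprocessing import LabelBinarizer

rng = np.random.RandomState(0)
y_true = np.array([0, 1, 2, 2, 1, 0, 1, 2])
y_pred_proba = rng.rand(8, 3)
y_pred_proba /= y_pred_proba.sum(axis=1, keepdims=True)

# One-hot encode the labels and compute one ROC curve (and AUC) per class.
lb = LabelBinarizer().fit(np.unique(y_true))
y_one_hot_true = lb.transform(y_true)
for i in range(y_one_hot_true.shape[1]):
    fpr, tpr, thresholds = sklearn_roc_curve(y_one_hot_true[:, i], y_pred_proba[:, i])
    print(f'class {i}: auc={sklearn_auc(fpr, tpr):.3f}')
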
def test_dataframe_roc_curve_auc(setup):
    rs = np.random.RandomState(0)
    raw = pd.DataFrame({'a': rs.randint(0, 10, (10,)),
                        'b': rs.rand(10)})

    df = md.DataFrame(raw)
    y = df['a'].to_tensor().astype('int')
    pred = df['b'].to_tensor().astype('float')

    fpr, tpr, thresholds = roc_curve(y, pred, pos_label=2)
    m = auc(fpr, tpr)

    sk_fpr, sk_tpr, sk_thresholds = sklearn_roc_curve(
        raw['a'].to_numpy().astype('int'),
        raw['b'].to_numpy().astype('float'),
        pos_label=2)
    expect_m = sklearn_auc(sk_fpr, sk_tpr)
    assert pytest.approx(m.fetch()) == expect_m

def testLearnInLocalCluster(self, *_):
    from mars.learn.neighbors import NearestNeighbors
    from sklearn.neighbors import NearestNeighbors as SkNearestNeighbors
    from mars.learn.metrics import roc_curve, auc
    from sklearn.metrics import roc_curve as sklearn_roc_curve, auc as sklearn_auc

    with new_cluster(scheduler_n_process=2, worker_n_process=3,
                     shared_memory='20M') as cluster:
        # Nearest neighbors: compare Mars results against scikit-learn.
        rs = np.random.RandomState(0)
        raw_X = rs.rand(10, 5)
        raw_Y = rs.rand(8, 5)
        X = mt.tensor(raw_X, chunk_size=7)
        Y = mt.tensor(raw_Y, chunk_size=(5, 3))

        nn = NearestNeighbors(n_neighbors=3)
        nn.fit(X)
        ret = nn.kneighbors(Y, session=cluster.session)

        snn = SkNearestNeighbors(n_neighbors=3)
        snn.fit(raw_X)
        expected = snn.kneighbors(raw_Y)

        result = [r.fetch() for r in ret]
        np.testing.assert_almost_equal(result[0], expected[0])
        np.testing.assert_almost_equal(result[1], expected[1])

        # ROC curve and AUC: compare Mars results against scikit-learn.
        rs = np.random.RandomState(0)
        raw = pd.DataFrame({
            'a': rs.randint(0, 10, (10,)),
            'b': rs.rand(10),
        })

        df = md.DataFrame(raw)
        y = df['a'].to_tensor().astype('int')
        pred = df['b'].to_tensor().astype('float')

        fpr, tpr, thresholds = roc_curve(y, pred, pos_label=2)
        m = auc(fpr, tpr)

        sk_fpr, sk_tpr, sk_thresholds = sklearn_roc_curve(
            raw['a'].to_numpy().astype('int'),
            raw['b'].to_numpy().astype('float'),
            pos_label=2)
        expect_m = sklearn_auc(sk_fpr, sk_tpr)
        self.assertAlmostEqual(m.fetch(), expect_m)

import numpy as np

# precision_recall_curve, roc_curve and roc_auc_score below are the tinyml
# implementations being compared against scikit-learn.
y_true = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
pred_prob = np.array([0.7, 0.9, 0.2, 0.8, 0.3, 0.64, 0.53, 0.12, 0.34, 0.52, 0.98, 0.03,
                      0.32, 0.4, 0.8, 0.21, 0.01, 0.67, 0.32, 0.08, 0.05, 0.8, 0.34, 0.8])

import matplotlib.pyplot as plt

# Precision-recall curve: tinyml vs. scikit-learn.
Ps, Rs = precision_recall_curve(y_true, pred_prob)
plt.plot(Rs, Ps, label='tinyml')
from sklearn.metrics import precision_recall_curve as sklearn_pr_curve
Ps, Rs, _ = sklearn_pr_curve(y_true, pred_prob)
plt.plot(Rs, Ps, label='sklearn')
plt.legend()
plt.title('PRC')
plt.show()

# ROC curve and AUC: tinyml vs. scikit-learn.
FPR, TPR = roc_curve(y_true, pred_prob)
plt.plot(FPR, TPR, label='tinyml')
print('tinyml_auc:', roc_auc_score(y_true, pred_prob))
from sklearn.metrics import roc_curve as sklearn_roc_curve
from sklearn.metrics import roc_auc_score as sklearn_roc_auc_score
FPR, TPR, _ = sklearn_roc_curve(y_true, pred_prob)
plt.plot(FPR, TPR, label='sklearn')
plt.legend()
plt.title('ROC')
plt.show()
print('sklearn auc:', sklearn_roc_auc_score(y_true, pred_prob))

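# Cross-check sketch (not part of the tinyml script above): the AUC reported by
# roc_auc_score equals the trapezoidal area under the ROC curve points, which
# sklearn.metrics.auc computes directly from (fpr, tpr).
import numpy as np
from sklearn.metrics import roc_curve as sk_roc_curve, roc_auc_score as sk_roc_auc_score, auc as sk_auc

y_true = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
pred_prob = np.array([0.7, 0.9, 0.2, 0.8, 0.3, 0.64, 0.53, 0.12, 0.34, 0.52, 0.98, 0.03,
                      0.32, 0.4, 0.8, 0.21, 0.01, 0.67, 0.32, 0.08, 0.05, 0.8, 0.34, 0.8])

fpr, tpr, _ = sk_roc_curve(y_true, pred_prob)
print('auc from curve points:', sk_auc(fpr, tpr))
print('roc_auc_score:', sk_roc_auc_score(y_true, pred_prob))
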