def _test_determenistic(distance, algorithm, weight, k):
    """Check that daal4py kNN is deterministic across runs and within quality
    thresholds relative to scikit-learn.

    Fits both a scikit-learn and a daal4py classifier N_TRIES times with the
    same hyperparameters, asserting per-run accuracy / log-loss / ROC-AUC
    ratios, then verifies the daal4py distances, indices and predictions are
    identical across every run.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        IRIS.data, IRIS.target, test_size=0.33, random_state=31)

    run_outputs = []
    # Both estimators receive identical hyperparameters on every trial.
    params = dict(n_neighbors=k, weights=weight, algorithm=algorithm,
                  leaf_size=30, p=2, metric=distance)

    for _ in range(N_TRIES):
        scikit_model = ScikitKNeighborsClassifier(**params)
        daal_model = DaalKNeighborsClassifier(**params)

        scikit_model.fit(x_train, y_train)
        daal_model.fit(x_train, y_train)

        scikit_predict = scikit_model.predict(x_test)
        daal_predict = daal_model.predict(x_test)
        distances, indices = daal_model.kneighbors(x_test)
        run_outputs.append((distances, indices, daal_predict))

        # Accuracy must not degrade below the configured ratio.
        scikit_accuracy = accuracy_score(y_test, scikit_predict)
        daal_accuracy = accuracy_score(y_test, daal_predict)
        ratio = daal_accuracy / scikit_accuracy
        assert ratio >= ACCURACY_RATIO,\
            f'kNN accuracy: scikit_accuracy={scikit_accuracy},daal_accuracy={daal_accuracy}, ratio={ratio}'

        scikit_predict_proba = scikit_model.predict_proba(x_test)
        daal_predict_proba = daal_model.predict_proba(x_test)

        # Log loss is a "lower is better" metric, hence the <= comparison.
        scikit_log_loss = log_loss(y_test, scikit_predict_proba)
        daal_log_loss = log_loss(y_test, daal_predict_proba)
        ratio = daal_log_loss / scikit_log_loss
        assert ratio <= LOG_LOSS_RATIO,\
            f'kNN log_loss: scikit_log_loss={scikit_log_loss},daal_log_loss={daal_log_loss}, ratio={ratio}'

        # Multi-class ROC AUC via one-vs-rest.
        scikit_roc_auc = roc_auc_score(y_test, scikit_predict_proba, multi_class='ovr')
        daal_roc_auc = roc_auc_score(y_test, daal_predict_proba, multi_class='ovr')
        ratio = daal_roc_auc / scikit_roc_auc
        assert ratio >= ROC_AUC_RATIO,\
            f'kNN roc_auc: scikit_roc_auc={scikit_roc_auc},daal_roc_auc={daal_roc_auc}, ratio={ratio}'

    # Every later run must reproduce the first run exactly (element-wise).
    baseline = run_outputs[0]
    for trial in run_outputs[1:]:
        for j, res in enumerate(trial):
            assert (res == baseline[j]).mean() == 1, \
                f'Results are different between runs for {algorithm}, {weight}, {distance}, k={k}'
def kfold_function_template(self, data_transform_function):
    """Verify repeated kNN fits over 10-fold splits do not leak memory.

    Generates classification data, applies ``data_transform_function`` to
    convert it to the format under test (ndarray or DataFrame), then fits a
    fresh KNeighborsClassifier per fold while tracking allocations with
    tracemalloc. Asserts the net growth stays below 25% of the input size.

    Fixes over the previous version:
    - ``tracemalloc.stop()`` is now guaranteed via try/finally, so a failing
      fold no longer leaves tracing enabled for subsequent tests.
    - An unsupported container type now raises an explicit ``TypeError``
      instead of a confusing ``NameError`` on undefined ``x_train``.

    :param data_transform_function: callable ``(x, y) -> (x, y)`` converting
        the generated data into the format being tested.
    """
    tracemalloc.start()
    try:
        x, y, data_memory_size = self.gen_clsf_data()
        kf = KFold(n_splits=10)
        x, y = data_transform_function(x, y)
        # Baseline is taken after data generation/transformation so only
        # per-fold allocations are counted.
        mem_before, _ = tracemalloc.get_traced_memory()
        for train_index, test_index in kf.split(x):
            if isinstance(x, np.ndarray):
                x_train, x_test = x[train_index], x[test_index]
                y_train, y_test = y[train_index], y[test_index]
            elif isinstance(x, pd.core.frame.DataFrame):
                x_train, x_test = x.iloc[train_index], x.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            else:
                raise TypeError(
                    f'Unsupported data type: {type(x).__name__}')
            knn = KNeighborsClassifier()
            knn.fit(x_train, y_train)
            # Drop per-fold objects so they are not counted as leaked memory.
            del knn, x_train, x_test, y_train, y_test
        mem_after, _ = tracemalloc.get_traced_memory()
    finally:
        tracemalloc.stop()
    self.assertTrue(
        mem_after - mem_before < 0.25 * data_memory_size,
        'Size of extra allocated memory is greater than 25% of input data')
def check_determenistic(distance, algorithm, weight, k):
    """Ensure kNN produces identical distances, indices and labels across
    repeated fit/predict cycles on the same split, and that accuracy stays
    above CHECK_RATIO_KNN on every run.
    """
    x_train, x_test, y_train, y_test = make_dataset()

    outputs = []
    for _ in range(N_TRIES):
        clf = KNeighborsClassifier(n_neighbors=k, weights=weight,
                                   algorithm=algorithm, leaf_size=30,
                                   p=2, metric=distance)
        clf.fit(x_train, y_train)
        neigh_dist, neigh_ind = clf.kneighbors(x_test)
        labels = clf.predict(x_test)
        outputs.append((neigh_dist, neigh_ind, labels))

        # Sanity threshold on classification quality for this run.
        accuracy = accuracy_score(labels, y_test)
        assert accuracy >= CHECK_RATIO_KNN,\
            'kNN classifier:accuracy={}'.format(accuracy)

    # All runs must match the first run exactly, element-wise.
    reference = outputs[0]
    for run_idx in range(1, N_TRIES):
        for j, res in enumerate(outputs[run_idx]):
            assert (res == reference[j]).mean() == 1, \
                ('Results are different between runs for %s, %s, %s, k=%d'\
                 % (algorithm, weight, distance, k))
def test_KNeighborsClassifier(self):
    """Run scikit-learn's estimator API compliance checks against the
    kd_tree-based KNeighborsClassifier."""
    estimator = KNeighborsClassifier(algorithm='kd_tree')
    check_estimator(estimator)
test_size=0.5, shuffle=True): x, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, random_state=777) return train_test_split(x, y, random_state=777, test_size=test_size, shuffle=shuffle) ESTIMATORS = { 'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=10), 'DaalRandomForestClassifier': DaalRandomForestClassifier(n_estimators=10, random_state=777), 'DaalRandomForestRegressor': DaalRandomForestRegressor(n_estimators=10, random_state=777), } ORDERS = ['C', 'F'] DATA_FORMATS = [pd.DataFrame, np.array] def check_data_formats_diff(name): x_train, x_test, y_train, y_test = make_dataset() alg_results = [] for data_format in DATA_FORMATS: for order in ORDERS: