def test_categorization_score():
    idx = [1, 2, 3, 4, 5, 6]
    y = [1, 1, -1, -1, -1, 1]
    idx_ref = [10, 5, 3, 2, 6]
    y_ref = [0, 1, 0, 1, 1]

    scores = categorization_score(idx_ref, y_ref, idx, y)

    assert_allclose(scores['precision'], 1.0)
    assert_allclose(scores['recall'], 0.66666666, rtol=1e-4)

    # make sure permutations don't affect the result
    idx_ref2 = [10, 5, 2, 3, 6]
    y_ref2 = [0, 1, 1, 0, 1]
    scores2 = categorization_score(idx_ref2, y_ref2, idx, y)
    assert scores['average_precision'] == scores2['average_precision']

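# Worked check of the expected values above (not part of the original test,
# and assuming categorization_score scores only the document ids shared by
# both sets and treats positive labels as relevant, which is consistent with
# the asserted numbers): the shared ids are {2, 3, 5, 6}; the predictions flag
# ids 2 and 6 as relevant and both are relevant in the reference, so
# precision = 2 / 2 = 1.0; of the three relevant shared ids {2, 5, 6}, id 5 is
# missed, so recall = 2 / 3 ~= 0.6667. Ids 1, 4 (prediction only) and 10
# (reference only) do not contribute.
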
def test_unique_label():
    """Check that testing works with only one label in the training set"""
    np.random.seed(10)

    Nshape = ground_truth.file_path.values.shape
    is_relevant = np.zeros(Nshape).astype(int)

    idx = np.arange(len(is_relevant), dtype='int')

    scores = categorization_score(idx, is_relevant,
                                  idx, np.random.rand(*Nshape))

def test_categorization(use_lsi, method, cv):

    if 'CIRCLECI' in os.environ and cv == 'fast' and \
            method in ['LinearSVC', 'xgboost']:
        # Circle CI is too slow and times out
        raise SkipTest

    if method == 'xgboost':
        try:
            import xgboost
        except ImportError:
            raise SkipTest

    if not use_lsi:
        uuid = vect_uuid
    else:
        uuid = lsi.mid

    cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=uuid,
                              cv_n_folds=2)
    index = cat.fe.db._search_filenames(ground_truth.file_path.values)

    try:
        model, Y_train = cat.train(index, ground_truth.is_relevant.values,
                                   method=method, cv=cv)
    except OptionalDependencyMissing:
        raise SkipTest

    Y_pred, md = cat.predict()
    X_pred = np.arange(cat.fe.n_samples_, dtype='int')
    idx_gt = cat.fe.db._search_filenames(ground_truth.file_path.values)

    scores = categorization_score(idx_gt,
                                  ground_truth.is_relevant.values,
                                  X_pred, np.argmax(Y_pred, axis=1))

    assert cat.get_params() is not None
    assert Y_pred.shape == (cat.fe.n_samples_,
                            len(np.unique(ground_truth.is_relevant.values)))

    if method == 'NearestNeighbor':
        assert md.shape == Y_pred.shape
    else:
        assert md is None

    if method in ['xgboost', 'ensemble-stacking']:
        # these methods fail the score checks for some reason so far...
        return

    assert_allclose(scores['precision'], 1, rtol=0.5)
    assert_allclose(scores['recall'], 1, rtol=0.68)
    cat.delete()

def test_lsi():
    basename = os.path.dirname(__file__)

    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    # TODO unused variable (overwritten on the next line)
    uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                         n_features=n_features)
    uuid, filenames = fe.transform()

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = LSI(cache_dir=cache_dir, dsid=uuid)
    lsi_res, exp_var = lsi.transform(n_components=100)  # TODO unused variables
    lsi_id = lsi.mid
    assert lsi.get_dsid(fe.cache_dir, lsi_id) == uuid
    assert lsi.get_path(lsi_id) is not None
    assert lsi._load_pars(lsi_id) is not None
    lsi.load(lsi_id)

    idx_gt = lsi.fe.search(ground_truth.index.values)
    idx_all = np.arange(lsi.fe.n_samples_, dtype='int')

    # other accumulation modes: 'nearest-diff', 'nearest-combine', 'stacking'
    for accumulate in ['nearest-max', 'centroid-max']:
        _, Y_train, Y_pred, ND_train = lsi.predict(
            idx_gt, ground_truth.is_relevant.values, accumulate=accumulate)
        scores = categorization_score(idx_gt,
                                      ground_truth.is_relevant.values,
                                      idx_all, Y_pred)
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.3)

    lsi.list_models()
    lsi.delete()

seed_filenames = ds['seed_filenames']
seed_y = ds['seed_y']
ground_truth_file = ds['ground_truth_file']  # (optional)

fe_opts = {'data_dir': data_dir,
           'stop_words': 'english', 'chunk_size': 2000, 'n_jobs': -1,
           'use_idf': 1, 'sublinear_tf': 0, 'binary': 0, 'n_features': 50001,
           'analyzer': 'word', 'ngram_range': (1, 1), 'norm': 'l2'}

fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess(**fe_opts)
uuid, filenames = fe.transform()

seed_index = fe.search(seed_filenames)

cat = Categorizer(cache_dir=cache_dir, dsid=uuid)
cat.train(seed_index, seed_y)
predictions = cat.predict()

gt = parse_ground_truth_file(ground_truth_file)

idx_ref = cat.fe.search(gt.index.values)
idx_res = np.arange(cat.fe.n_samples_, dtype='int')

scores = categorization_score(idx_ref, gt.is_relevant.values,
                              idx_res, predictions)

print(' => Test scores: MAP = {average_precision:.3f}, '
      'ROC-AUC = {roc_auc:.3f}'.format(**scores))

def test_features_hashing(use_hashing, use_lsi, method):
    # check that models work both with and without hashing

    cache_dir = check_cache()
    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                         n_features=n_features, use_hashing=use_hashing)
    uuid, filenames = fe.transform()

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid)
    lsi_res, exp_var = lsi.fit_transform(n_components=100)  # TODO unused variables
    assert lsi._load_pars() is not None
    lsi._load_model()

    if method == 'Categorization':
        if use_lsi:
            parent_id = lsi.mid
            method = 'NearestNeighbor'
        else:
            parent_id = uuid
            method = 'LogisticRegression'
        cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=parent_id,
                                  cv_n_folds=2)
        index = cat.fe.db._search_filenames(ground_truth.file_path.values)

        try:
            coefs, Y_train = cat.train(index, ground_truth.is_relevant.values,
                                       method=method)
        except OptionalDependencyMissing:
            raise SkipTest

        Y_pred, md = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.db._search_filenames(ground_truth.file_path.values)

        scores = categorization_score(idx_gt,
                                      ground_truth.is_relevant.values,
                                      X_pred, np.argmax(Y_pred, axis=1))
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.7)
        cat.delete()
    elif method == 'DuplicateDetection':
        dd = _DuplicateDetectionWrapper(cache_dir=cache_dir, parent_id=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            if use_lsi:
                parent_id = lsi.mid
                method = 'birch'
            else:
                parent_id = uuid
                method = 'k_means'
            cat = _ClusteringWrapper(cache_dir=cache_dir, parent_id=parent_id)
            cm = getattr(cat, method)
            labels, htree = cm(2)
            terms = cat.compute_labels(n_top_words=10)
        else:
            with pytest.raises(NotImplementedError):
                _ClusteringWrapper(cache_dir=cache_dir, parent_id=uuid)
    else:
        raise ValueError

def test_features_hashing(use_hashing, method):
    # check that models work both with and without hashing

    cache_dir = check_cache()
    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                         n_features=n_features, use_hashing=use_hashing)
    uuid, filenames = fe.transform()

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    if method == 'Categorization':
        cat = Categorizer(cache_dir=cache_dir, dsid=uuid, cv_n_folds=2)
        index = cat.fe.search(ground_truth.index.values)

        try:
            coefs, Y_train = cat.train(index,
                                       ground_truth.is_relevant.values)
        except OptionalDependencyMissing:
            raise SkipTest

        Y_pred = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.search(ground_truth.index.values)

        scores = categorization_score(idx_gt,
                                      ground_truth.is_relevant.values,
                                      X_pred, Y_pred)
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.5)
        cat.delete()
    elif method == 'LSI':
        lsi = LSI(cache_dir=cache_dir, dsid=uuid)
        lsi_res, exp_var = lsi.transform(n_components=100)  # TODO unused variables
        lsi_id = lsi.mid
        assert lsi.get_dsid(fe.cache_dir, lsi_id) == uuid
        assert lsi.get_path(lsi_id) is not None
        assert lsi._load_pars(lsi_id) is not None
        lsi.load(lsi_id)

        idx_gt = lsi.fe.search(ground_truth.index.values)
        idx_all = np.arange(lsi.fe.n_samples_, dtype='int')

        # other accumulation modes: 'nearest-diff', 'nearest-combine', 'stacking'
        for accumulate in ['nearest-max', 'centroid-max']:
            _, Y_train, Y_pred, ND_train = lsi.predict(
                idx_gt, ground_truth.is_relevant.values,
                accumulate=accumulate)
            scores = categorization_score(idx_gt,
                                          ground_truth.is_relevant.values,
                                          idx_all, Y_pred)
            assert_allclose(scores['precision'], 1, rtol=0.5)
            assert_allclose(scores['recall'], 1, rtol=0.3)
    elif method == 'DuplicateDetection':
        dd = DuplicateDetection(cache_dir=cache_dir, dsid=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            cat = Clustering(cache_dir=cache_dir, dsid=uuid)
            cm = getattr(cat, 'k_means')
            labels, htree = cm(2, lsi_components=20)
            terms = cat.compute_labels(n_top_words=10)
        else:
            with pytest.raises(NotImplementedError):
                Clustering(cache_dir=cache_dir, dsid=uuid)
    else:
        raise ValueError

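# The test_features_hashing variants above take their arguments (use_hashing,
# use_lsi, method) from pytest parametrization defined elsewhere in the test
# module. A minimal sketch of what such a decorator could look like, kept as a
# comment because the actual parameter grid is an assumption based only on the
# branches handled above:
#
#     @pytest.mark.parametrize('use_hashing, method',
#                              [(False, 'Categorization'),
#                               (True, 'Categorization'),
#                               (False, 'LSI'),
#                               (False, 'DuplicateDetection'),
#                               (False, 'Clustering'),
#                               (True, 'Clustering')])
#     def test_features_hashing(use_hashing, method):
#         ...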