def test_features_hashing(use_hashing, method):
    """Check that each model type works both with and without feature hashing.

    Parameters
    ----------
    use_hashing : bool
        Whether the vectorizer uses the hashing trick.
    method : str
        One of 'Categorization', 'LSI', 'DuplicateDetection', 'Clustering'.
    """
    cache_dir = check_cache()
    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    # NOTE: raw string — '\d' in a plain string literal is an invalid escape
    # sequence (SyntaxWarning on modern Python).
    uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                         n_features=n_features, use_hashing=use_hashing)
    uuid, filenames = fe.transform()

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    if method == 'Categorization':
        cat = Categorizer(cache_dir=cache_dir, dsid=uuid, cv_n_folds=2)
        index = cat.fe.search(ground_truth.index.values)
        try:
            coefs, Y_train = cat.train(index,
                                       ground_truth.is_relevant.values)
        except OptionalDependencyMissing:
            # the backing estimator (e.g. xgboost) is not installed
            raise SkipTest

        Y_pred = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        # same lookup as `index` above — no need to re-run the search
        idx_gt = index

        scores = categorization_score(idx_gt,
                                      ground_truth.is_relevant.values,
                                      X_pred, Y_pred)
        # loose tolerances: we only check the model is roughly sane
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.5)
        cat.delete()
    elif method == 'LSI':
        lsi = LSI(cache_dir=cache_dir, dsid=uuid)
        lsi_res, exp_var = lsi.transform(n_components=100)  # TODO unused variables
        lsi_id = lsi.mid
        assert lsi.get_dsid(fe.cache_dir, lsi_id) == uuid
        assert lsi.get_path(lsi_id) is not None
        assert lsi._load_pars(lsi_id) is not None
        lsi.load(lsi_id)

        idx_gt = lsi.fe.search(ground_truth.index.values)
        idx_all = np.arange(lsi.fe.n_samples_, dtype='int')

        for accumulate in ['nearest-max', 'centroid-max']:
            # 'nearest-diff', 'nearest-combine', 'stacking']:
            _, Y_train, Y_pred, ND_train = lsi.predict(
                idx_gt,
                ground_truth.is_relevant.values,
                accumulate=accumulate)
            scores = categorization_score(idx_gt,
                                          ground_truth.is_relevant.values,
                                          idx_all, Y_pred)
            assert_allclose(scores['precision'], 1, rtol=0.5)
            assert_allclose(scores['recall'], 1, rtol=0.3)
    elif method == 'DuplicateDetection':
        dd = DuplicateDetection(cache_dir=cache_dir, dsid=uuid)
        try:
            dd.fit()
        except ImportError:
            # optional near-duplicate backend not installed
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            cat = Clustering(cache_dir=cache_dir, dsid=uuid)
            cm = getattr(cat, 'k_means')
            labels, htree = cm(2, lsi_components=20)
            terms = cat.compute_labels(n_top_words=10)
        else:
            # clustering is not implemented on top of hashed features
            with pytest.raises(NotImplementedError):
                Clustering(cache_dir=cache_dir, dsid=uuid)
    else:
        raise ValueError('Unknown method: {}'.format(method))
from freediscovery.engine.categorization import _CategorizerWrapper from freediscovery.engine.lsi import _LSIWrapper from freediscovery.io import parse_ground_truth_file from freediscovery.metrics import categorization_score from freediscovery.exceptions import OptionalDependencyMissing, WrongParameter from freediscovery.tests.run_suite import check_cache basename = Path(__file__).parent cache_dir = check_cache() EPSILON = 1e-4 data_dir = basename / ".." / ".." / "data" / "ds_001" / "raw" ground_truth = parse_ground_truth_file( str(data_dir / ".." / "ground_truth_file.txt")) fe = FeatureVectorizer(cache_dir=cache_dir, mode='w') vect_uuid = fe.setup() fe.ingest(str(data_dir), file_pattern='.*\d.txt') lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid, mode='w') lsi.fit_transform(n_components=6) _test_cases = itertools.product([False, True], [ "LinearSVC", "LogisticRegression", 'xgboost', "NearestNeighbor", "NearestCentroid" ], [None, 'fast']) # 'MLPClassifier', 'ensemble-stacking' not supported in production the moment _test_cases = filter(lambda x: not (x[1].startswith("Nearest") and x[2]),