def test_categorization(use_lsi, method, cv):
    if 'CIRCLECI' in os.environ and cv == 'fast' \
            and method in ['LinearSVC', 'xgboost']:
        raise SkipTest  # Circle CI is too slow and times out

    if method == 'xgboost':
        try:
            import xgboost  # noqa
        except ImportError:
            raise SkipTest

    if not use_lsi:
        uuid = vect_uuid
    else:
        uuid = lsi.mid

    cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=uuid,
                              cv_n_folds=2)
    cat.fe.db_.filenames_ = cat.fe.filenames_
    index = cat.fe.db_._search_filenames(ground_truth.file_path.values)

    try:
        model, Y_train = cat.fit(index, ground_truth.is_relevant.values,
                                 method=method, cv=cv)
    except OptionalDependencyMissing:
        raise SkipTest
    except WrongParameter:
        if method in ['NearestNeighbor', 'NearestCentroid']:
            return
        else:
            raise

    Y_pred, md = cat.predict()
    X_pred = np.arange(cat.fe.n_samples_, dtype='int')
    idx_gt = cat.fe.db_._search_filenames(ground_truth.file_path.values)

    scores = categorization_score(idx_gt,
                                  ground_truth.is_relevant.values,
                                  X_pred, np.argmax(Y_pred, axis=1))

    assert cat.get_params() is not None
    assert Y_pred.shape == (cat.fe.n_samples_,
                            len(np.unique(ground_truth.is_relevant.values)))

    if method == 'NearestNeighbor':
        assert md.shape == Y_pred.shape
    else:
        assert md is None

    if method in ['xgboost', 'ensemble-stacking']:
        # these methods fail the score checks for some reason so far...
        return

    assert_allclose(scores['precision'], 1, rtol=0.5)
    assert_allclose(scores['recall'], 1, rtol=0.68)
    cat.delete()
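
# Note: a minimal, self-contained sketch of the scoring pattern exercised
# above. `Y_pred` carries one probability column per class, so
# `np.argmax(Y_pred, axis=1)` recovers hard labels before scoring.
# `_toy_categorization_score` is a hypothetical stand-in for
# `categorization_score`, shown only to illustrate the expected shapes;
# it is not the freediscovery implementation. The leading underscore keeps
# pytest from collecting it as a test.
def _toy_categorization_score(y_true, Y_pred_proba):
    from sklearn.metrics import precision_score, recall_score
    # Y_pred_proba: (n_samples, n_classes) probabilities -> hard labels
    y_pred = np.argmax(Y_pred_proba, axis=1)
    return {'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred)}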
def test_pipeline(n_steps):
    """ Test 2- and 3-step pipelines with
    vectorizer (+ lsi) + classifier """
    if n_steps == 2:
        uuid = vect_uuid
    elif n_steps == 3:
        uuid = lsi.mid
    else:
        raise ValueError

    cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=uuid,
                              cv_n_folds=2)
    cat.fe.db_.filenames_ = cat.fe.filenames_
    index = cat.fe.db_._search_filenames(ground_truth.file_path.values)
    coefs, Y_train = cat.fit(index, ground_truth.is_relevant.values)

    cat.predict()

    assert len(cat.pipeline) == n_steps - 1

    # additional tests
    if n_steps == 3:
        pf = PipelineFinder.by_id(cat.mid, cache_dir)

        assert list(pf.keys()) == ['vectorizer', 'lsi', 'categorizer']
        assert list(pf.parent.keys()) == ['vectorizer', 'lsi']
        assert list(pf.parent.parent.keys()) == ['vectorizer']

        assert pf.mid == cat.mid
        assert pf.parent.mid == lsi.mid
        assert pf.parent.parent.mid == vect_uuid

        with pytest.raises(ValueError):
            pf.parent.parent.parent

        for estimator_type, mid in pf.items():
            path = str(pf.get_path(mid, absolute=False))
            if estimator_type == 'vectorizer':
                assert re.match('ediscovery_cache.*', path)
            elif estimator_type == 'lsi':
                assert re.match('ediscovery_cache.*lsi', path)
            elif estimator_type == 'categorizer':
                assert re.match('ediscovery_cache.*lsi.*categorizer', path)
            else:
                raise ValueError
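
# A toy illustration of the parent-chain pattern asserted above (assumed
# behaviour, not the actual PipelineFinder API): each pipeline step keeps a
# reference to its parent, and asking the root step for a parent raises
# ValueError, just like `pf.parent.parent.parent` in the 3-step case.
class _ToyPipelineStep:
    def __init__(self, name, parent=None):
        self.name = name
        self._parent = parent

    @property
    def parent(self):
        if self._parent is None:
            raise ValueError('the root step has no parent')
        return self._parent


def _toy_pipeline_chain_example():
    # vectorizer <- lsi <- categorizer, mirroring the 3-step pipeline above
    vect = _ToyPipelineStep('vectorizer')
    lsi_step = _ToyPipelineStep('lsi', parent=vect)
    cat_step = _ToyPipelineStep('categorizer', parent=lsi_step)
    assert cat_step.parent.parent is vect
    with pytest.raises(ValueError):
        vect.parent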
def test_explain_categorization():
    from freediscovery.categorization import binary_sensitivity_analysis

    uuid = vect_uuid
    cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=uuid,
                              cv_n_folds=2)
    cat.fe.db_.filenames_ = cat.fe.filenames_
    index = cat.fe.db_._search_filenames(ground_truth.file_path.values)
    model, _ = cat.fit(index, ground_truth.is_relevant.values,
                       method='LogisticRegression')
    X = cat.fe._load_features()
    vect = cat.fe.vect_

    weights = binary_sensitivity_analysis(model, vect.vocabulary_, X[0, :])
    # not all vocabulary keys are returned
    assert len(list(weights.keys())) < len(vect.vocabulary_)
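
# Rough sketch of what `binary_sensitivity_analysis` computes for a linear
# binary model (assumed behaviour, not the actual freediscovery code):
# per-token weight = model coefficient * feature value, so only tokens with
# a non-zero feature value in the document get a weight. That is why fewer
# keys than `vect.vocabulary_` are returned in the assertion above.
def _toy_word_weights(coef, vocabulary, x_row):
    # coef: 1-D array of coefficients (e.g. model.coef_.ravel())
    # vocabulary: {token: column index}; x_row: 1-D dense feature vector
    weights = {}
    for token, idx in vocabulary.items():
        if x_row[idx] != 0:  # skip tokens absent from the document
            weights[token] = coef[idx] * x_row[idx]
    return weights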
def test_features_hashing(use_hashing, use_lsi, method):
    # check that models work both with and without hashing
    cache_dir = check_cache()

    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(n_features=n_features, use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid)
    lsi_res, exp_var = lsi.fit_transform(n_components=100)
    assert lsi._load_pars() is not None
    lsi._load_model()

    if method == 'Categorization':
        if use_lsi:
            parent_id = lsi.mid
            method = 'NearestNeighbor'
        else:
            parent_id = uuid
            method = 'LogisticRegression'
        cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=parent_id,
                                  cv_n_folds=2)
        cat.fe.db_.filenames_ = cat.fe.filenames_
        index = cat.fe.db_._search_filenames(ground_truth.file_path.values)
        try:
            coefs, Y_train = cat.fit(index, ground_truth.is_relevant.values,
                                     method=method)
        except OptionalDependencyMissing:
            raise SkipTest
        Y_pred, md = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.db_._search_filenames(ground_truth.file_path.values)
        scores = categorization_score(idx_gt,
                                      ground_truth.is_relevant.values,
                                      X_pred, np.argmax(Y_pred, axis=1))
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.7)
        cat.delete()
    elif method == 'DuplicateDetection':
        dd = _DuplicateDetectionWrapper(cache_dir=cache_dir, parent_id=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            if use_lsi:
                parent_id = lsi.mid
                method = 'birch'
            else:
                parent_id = uuid
                method = 'k_means'
            cat = _ClusteringWrapper(cache_dir=cache_dir,
                                     parent_id=parent_id)
            cm = getattr(cat, method)
            labels = cm(2)

            htree = cat._get_htree(cat.pipeline.data)
            terms = cat.compute_labels(n_top_words=10)
        else:
            with pytest.raises(NotImplementedError):
                _ClusteringWrapper(cache_dir=cache_dir, parent_id=uuid)
    else:
        raise ValueError
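
# The NotImplementedError branch above exists because hashed features keep no
# explicit vocabulary from which cluster labels could be computed. A minimal
# illustration of that trade-off using the standard scikit-learn vectorizers
# (separate from the freediscovery wrappers):
def _toy_hashing_vs_vocabulary():
    from sklearn.feature_extraction.text import (HashingVectorizer,
                                                 TfidfVectorizer)
    docs = ['first document', 'second document']
    hv = HashingVectorizer(n_features=2 ** 10)
    hv.fit_transform(docs)
    assert not hasattr(hv, 'vocabulary_')  # no token -> column mapping
    tv = TfidfVectorizer().fit(docs)
    assert 'document' in tv.vocabulary_    # explicit token -> column mapping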