def test_custom_mid():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    mid_orig = "sklds"

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid,
                      mid=mid_orig, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    lsi._load_features()
    assert lsi.mid == mid_orig

    # re-creating a model with an existing mid in 'w' mode fails
    with pytest.raises(WrongParameter):
        lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid,
                          mid=mid_orig, mode='w')
        lsi.fit_transform(n_components=2, alpha=1.0)

    # 'fw' (force write) mode allows overwriting an existing mid
    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid,
                      mid=mid_orig, mode='fw')
    lsi.fit_transform(n_components=2, alpha=1.0)

    # special characters are not allowed in a mid
    with pytest.raises(WrongParameter):
        lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid,
                          mid='?', mode='fw')
        lsi.fit_transform(n_components=2, alpha=1.0)
def test_feature_extraction_storage():
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')
    db = pd.read_pickle(os.path.join(cache_dir, 'ediscovery_cache',
                                     uuid, 'db'))
    assert 'file_path' not in db.columns
def fd_setup(**fe_options):
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "..", "data", "ds_001", "raw")
    n_features = 110000
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(n_features=n_features, use_hashing=True,
                    stop_words='english', **fe_options)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')
    return cache_dir, uuid, fe.filenames_, fe
def test_search_filenames(use_hashing):
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    assert fe.db_ is not None

    for low, high, step in [(0, 1, 1), (0, 4, 1), (3, 1, -1)]:
        idx_slice = list(range(low, high, step))
        filenames_slice = [fe.filenames_[idx] for idx in idx_slice]
        idx0 = fe.db_._search_filenames(filenames_slice)
        assert_equal(idx0, idx_slice)
        assert_equal(filenames_slice, fe[idx0])

    with pytest.raises(NotFound):
        fe.db_._search_filenames(['DOES_NOT_EXIST.txt'])

    if not use_hashing:
        n_top_words = 5
        terms = fe.query_features([2, 3, 5], n_top_words=n_top_words)
        assert len(terms) == n_top_words

    fe.list_datasets()
def __init__(self, cache_dir='/tmp/', parent_id=None, mid=None,
             load_model=False, mode='r'):
    if parent_id is None and mid is None:
        raise WrongParameter('At least one of parent_id or mid '
                             'should be provided!')

    # validate mode before it is used below
    if mode not in ['r', 'w', 'fw']:
        raise WrongParameter(
            'mode={} must be one of "r", "w", "fw"'.format(mode))
    self.mode = mode

    if self._wrapper_type == 'lsi' and self.mode in ['w', 'fw']:
        # lsi supports explicitly providing mid at creation
        if parent_id is None:
            raise WrongParameter(('parent_id={} must be provided for '
                                  'model creation!').format(parent_id))
        else:
            validate_mid(parent_id)
            self.pipeline = PipelineFinder.by_id(parent_id, cache_dir)
            if mid is not None:
                validate_mid(mid)
            self.mid = mid
    else:
        if parent_id is None and mid is not None:
            validate_mid(mid)
            self.pipeline = PipelineFinder.by_id(mid, cache_dir).parent
            self.mid = mid
        elif parent_id is not None:
            validate_mid(parent_id)
            self.pipeline = PipelineFinder.by_id(parent_id, cache_dir)
            self.mid = None  # this only affects LSI

    # this is an alias that should be deprecated
    self.fe = FeatureVectorizer(cache_dir=cache_dir,
                                dsid=self.pipeline['vectorizer'])

    self.model_dir = self.pipeline.get_path() / self._wrapper_type

    if self._wrapper_type == 'search':
        # no data needs to be stored on disk
        return

    if not self.model_dir.exists():
        self.model_dir.mkdir()

    if self.mid is not None and self.mode == 'r':
        self._pars = self._load_pars()
    else:
        self._pars = None

    if load_model:
        if self.mid is not None and self.mode == 'r':
            self.cmod = self._load_model()
        else:
            self.cmod = None
def fd_setup():
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    np.random.seed(1)
    data_dir = os.path.join(basename, "..", "..", "data", "ds_001", "raw")
    n_features = 110000
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    dsid = fe.setup(n_features=n_features, use_hashing=False,
                    stop_words='english', min_df=0.1, max_df=0.9)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')
    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid, mode='w')
    lsi.fit_transform(n_components=6)
    return cache_dir, dsid, fe.filenames_, lsi.mid
def _list(args):
    cache_dir = _parse_cache_dir(args.cache_dir)
    fe = FeatureVectorizer(cache_dir)
    res = fe.list_datasets()
    res = sorted(res, key=lambda row: row['creation_date'], reverse=True)
    for row in res:
        print(' * Processed dataset {}'.format(row['id']))
        print('    - data_dir: {}'.format(row['data_dir']))
        print('    - creation_date: {}'.format(row['creation_date']))
        for method in ['lsi', 'categorizer', 'cluster', 'dupdet',
                       'threading']:
            dpath = os.path.join(fe.cache_dir, row['id'], method)
            if not os.path.exists(dpath):
                continue
            mid_list = os.listdir(dpath)
            if mid_list:
                print('    # {}'.format(method))
                for mid in mid_list:
                    print('      * {}'.format(mid))
        print(' ')
def test_email_parsing():
    data_dir = os.path.join(basename, "..", "..", "data",
                            "fedora-devel-list-2008-October")
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    email_md = fe.parse_email_headers()
    assert len(fe.filenames_) == len(email_md)

    fe.delete()
def test_search_wrapper(kind):
    # check for syntax errors etc. in the wrapper
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    vect_uuid = fe.setup()
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    if kind == 'semantic':
        lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid,
                          mode='w')
        lsi.fit_transform(n_components=20)
        parent_id = lsi.mid
    else:
        parent_id = vect_uuid

    sw = _SearchWrapper(cache_dir=cache_dir, parent_id=parent_id)
    dist = sw.search("so that I can reserve a room")
    assert dist.shape == (fe.n_samples_,)
    # document 1 found by
    #   grep -rn "so that I can reserve a room" \
    #       freediscovery/data/ds_001/raw/
    assert dist.argmax() == 1
def test_lsi():
    cache_dir = check_cache()
    n_components = 2
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=n_components,
                                         alpha=1.0)
    assert lsi_res.components_.shape[0] == 5
    assert lsi_res.components_.shape[1] == fe.n_features_
    assert lsi._load_pars() is not None
    lsi._load_model()
    X_lsi = lsi._load_features()
    assert_allclose(normalize(X_lsi), X_lsi)
    lsi.list_models()
    lsi.delete()
def test_ingestion_csv():
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    fe.setup(column_ids=[1, 3])
    fe.ingest(dataset_definition=[
        {'file_path': os.path.join(csv_data_dir, 'example.csv')}
    ])
    X = fe._load_features()
    assert X.shape[0] == 4
    assert len(fe.filenames_) == X.shape[0]
    assert X.shape[0] == fe.n_samples_
def test_feature_extraction_weighting(weighting, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(weighting=weighting, use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()
    assert_allclose(normalize(res2).data, res2.data)  # data is l2 normalized
    fe.delete()
def test_feature_extraction_cyrillic(use_hashing):
    data_dir = os.path.join(basename, "..", "..", "data", "ds_002", "raw")
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    res2 = fe._load_features(uuid)

    filenames = fe.filenames_
    fe._filenames = None
    filenames2 = fe.filenames_

    assert_equal(filenames2, filenames)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()
    fe.delete()
def test_ingestion_csv_wrong_params():
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    fe.setup(column_ids=[1, 3])
    # the regex below matches the error message verbatim as emitted by
    # the library (including its "privided" spelling)
    with pytest.raises(
            ValueError,
            match=".*can only be privided using `dataset_definition.*"):
        fe.ingest(csv_data_dir)

    with pytest.raises(ValueError,
                       match=".*one CSV can be provided at a time.*"):
        fe.ingest(dataset_definition=[
            {'file_path': os.path.join(csv_data_dir, 'example.csv')},
            {'file_path': os.path.join(csv_data_dir, 'example.csv')},
        ])
def test_feature_extraction_nfeatures(n_features, weighting, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(n_features=n_features, weighting=weighting,
                    use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()
    assert res2.shape[1] == fe.n_features_
    fe.delete()
def test_lsi_remove_documents():
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0] - 2
def test_threading():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir=data_dir)
    fe.parse_email_headers()

    cat = _EmailThreadingWrapper(cache_dir=cache_dir, parent_id=uuid)

    tree = cat.thread()
    cat.get_params()

    tree_ref = [{
        'id': 0, 'parent': None, 'children': [
            {'id': 1, 'children': [], 'parent': 0},
            {'id': 2, 'parent': 0, 'children': [
                {'id': 3, 'children': [], 'parent': 2},
                {'id': 4, 'children': [], 'parent': 2},
            ]},
        ]
    }]

    assert [el.to_dict() for el in tree] == tree_ref
    assert len(fe.filenames_) == sum([el.tree_size for el in tree])
    assert len(fe.filenames_) == 5
    assert len(tree[0].flatten()) == 5
def test_lsi_append_documents():
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()
    n_samples = fe.n_samples_

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path',
                               'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0]*2
    # the appended documents are copies of the original ones,
    # so their LSI features must be identical
    assert_equal(X_lsi_new[:n_samples], X_lsi_new[n_samples:])
from freediscovery.metrics import categorization_score
from freediscovery.exceptions import (OptionalDependencyMissing,
                                      WrongParameter)
from freediscovery.tests.run_suite import check_cache

basename = Path(__file__).parent
cache_dir = check_cache()

EPSILON = 1e-4

data_dir = basename / ".." / ".." / "data" / "ds_001" / "raw"

ground_truth = parse_ground_truth_file(
    str(data_dir / ".." / "ground_truth_file.txt"))

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
vect_uuid = fe.setup()
fe.ingest(str(data_dir), file_pattern=r'.*\d.txt')

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid, mode='w')
lsi.fit_transform(n_components=6)

_test_cases = itertools.product(
    [False, True],
    ["LinearSVC", "LogisticRegression", 'xgboost',
     "NearestNeighbor", "NearestCentroid"],
    [None, 'fast'])
# 'MLPClassifier' and 'ensemble-stacking' are not supported in production
# at the moment
_test_cases = filter(lambda x: not (x[1].startswith("Nearest") and x[2]),
                     _test_cases)
def test_non_random_dsid():
    data_dir = os.path.join(basename, "..", "..", "data", "ds_002", "raw")
    cache_dir = check_cache()
    dsid = 'test-dataset'
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w', dsid=dsid)
    uuid = fe.setup()
    assert dsid == uuid
    fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)

    # writing with the same name fails
    with pytest.raises(WrongParameter):
        FeatureVectorizer(cache_dir=cache_dir, mode='w', dsid=dsid)

    # while reading or force-writing works
    FeatureVectorizer(cache_dir=cache_dir, mode='r', dsid=dsid)
    FeatureVectorizer(cache_dir=cache_dir, mode='fw', dsid=dsid)

    # special characters are not allowed
    with pytest.raises(WrongParameter):
        fh = FeatureVectorizer(cache_dir=cache_dir, mode='fw',
                               dsid='?+ds$$')
        uuid = fh.setup()
def test_ingestion_content():
    data_dir = Path(basename, "..", "..", "data", "ds_002", "raw")
    dd = []
    for idx, fname in enumerate(sorted(data_dir.glob('*txt'))):
        with fname.open('rt', encoding='utf-8') as fh:
            dd.append({'document_id': idx + 19,
                       'content': fh.read()})

    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(dataset_definition=dd, vectorize=True)

    assert len(fe.filenames_) == 6
    assert fe.filenames_[0] == '000000000_0.txt'
    X = fe._load_features()
    assert X.shape[0] == 6
    assert fe.db_.data.shape[0] == len(fe.filenames_)

    # ingesting the same documents from disk yields identical features
    fe2 = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    fe2.setup()
    fe2.ingest(data_dir=str(data_dir))

    X2 = fe2._load_features()
    assert X.shape == X2.shape
    assert_array_equal(X.indices, X2.indices)
    assert_array_equal(X.data, X2.data)
def test_ingestion_batches():
    data_dir = os.path.join(basename, "..", "..", "data", "ds_002", "raw")
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    with pytest.raises(ValueError):
        fe.ingest(vectorize=True)   # no ingested files

    fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt', vectorize=False)
    fe.ingest(vectorize=True)

    assert fe.db_.data.shape[0] == len(fe.filenames_)
    assert len(fe.filenames_) == 6 * 3
    X = fe._load_features()
    assert X.shape[0] == 6 * 3

    with pytest.raises(ValueError):
        fe.ingest(vectorize=True)   # already vectorized
def test_sampling_filenames():
    cache_dir = check_cache()
    fe_pars = {'weighting': 'bnn'}

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    with pytest.warns(UserWarning):
        # there is a warning because we don't use norm='l2'
        uuid = fe.setup(use_hashing=True, **fe_pars)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')
    X = fe._load_features(uuid)

    # don't use any sampling
    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
                                    sampling_filenames=None)
    X_s = fes._load_features(uuid)

    pars = fe.pars_
    fnames = fe.filenames_
    fnames_s = fes.filenames_
    assert_array_equal(fnames, fnames_s)
    assert_array_equal(X.data, X_s.data)
    assert fes.n_samples_ == len(fnames)

    # reverse the filenames order
    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
                                    sampling_filenames=fnames[::-1])
    assert fes.sampling_index is not None
    X_s = fes._load_features(uuid)
    pars_s = fes.pars_
    fnames_s = fes.filenames_

    assert_array_equal(fnames[::-1], fnames_s)
    assert_array_equal(X[::-1, :].data, X_s.data)
    for key in pars:
        if key == 'filenames':
            assert pars[key][::-1] == pars_s[key]
        else:
            assert pars[key] == pars_s[key]

    # repeat twice the filenames
    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
                                    sampling_filenames=(fnames + fnames))
    assert fes.sampling_index is not None
    X_s = fes._load_features(uuid)
    pars_s = fes.pars_
    fnames_s = fes.filenames_

    assert_array_equal(fnames + fnames, fnames_s)
    assert_array_equal(X.data, X_s[:len(fnames)].data)
    assert_array_equal(X.data, X_s[len(fnames):].data)
    assert fes.n_samples_ == len(fnames) * 2
    # for key in pars:
    #     assert pars[key] == pars_s[key]

    # downsample the filenames
    N = len(fnames) // 2
    np.random.seed(1)
    idx = np.random.choice(fe.n_samples_, size=(N,))
    fnames_s_in = np.array(fnames)[idx].tolist()
    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
                                    sampling_filenames=fnames_s_in)
    assert fes.sampling_index is not None
    X_s = fes._load_features(uuid)
    pars_s = fes.pars_
    fnames_s = fes.filenames_

    assert_array_equal(fnames_s_in, fnames_s)
    assert_array_equal(X[idx].data, X_s.data)
    assert fes.n_samples_ == N
    fe.delete()
def test_remove_documents():
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_.data
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])

    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] - 2
    assert fe.db_.data.shape[0] == db.shape[0] - 2
    assert len(fe.filenames_) == len(filenames) - 2

    dbn = fe.db_.data
    assert_equal(db.iloc[[0, 1, 3, 5]]['document_id'].values,
                 dbn['document_id'].values)
    # check that internal id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check the number of samples is consistent
    del fe._pars
    assert fe.n_samples_ == n_samples - 2

    fe.delete()
def test_append_documents():
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path',
                               'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] * 2
    assert fe.db_.data.shape[0] == db.data.shape[0] * 2
    assert len(fe.filenames_) == len(filenames) * 2

    dbn = fe.db_.data
    assert_equal(dbn.iloc[:n_samples]['document_id'].values,
                 dbn.iloc[n_samples:]['document_id'].values - 10)
    # check that internal id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check the number of samples is consistent
    del fe._pars
    assert fe.n_samples_ == n_samples * 2

    fe.delete()
def test_df_filtering(use_hashing, min_df, max_df):
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(min_df=min_df, max_df=max_df, use_hashing=use_hashing)
    fe.ingest(data_dir)
    X = fe._load_features(uuid)

    fe2 = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid2 = fe2.setup(use_hashing=use_hashing)
    fe2.ingest(data_dir)
    X2 = fe2._load_features(uuid2)

    if use_hashing:
        # min/max_df does not affect the number of features
        assert X.shape[1] == X2.shape[1]
    else:
        # min/max_df removes some features
        assert X.shape[1] < X2.shape[1]

    fe.delete()
def test_features_hashing(use_hashing, use_lsi, method):
    # check that models work both with and without hashing
    cache_dir = check_cache()
    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(n_features=n_features, use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    ground_truth = parse_ground_truth_file(os.path.join(
        data_dir, "..", "ground_truth_file.txt"))

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=100)
    assert lsi._load_pars() is not None
    lsi._load_model()

    if method == 'Categorization':
        if use_lsi:
            parent_id = lsi.mid
            method = 'NearestNeighbor'
        else:
            parent_id = uuid
            method = 'LogisticRegression'
        cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=parent_id,
                                  cv_n_folds=2)
        cat.fe.db_.filenames_ = cat.fe.filenames_
        index = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        try:
            coefs, Y_train = cat.fit(index, ground_truth.is_relevant.values,
                                     method=method)
        except OptionalDependencyMissing:
            raise SkipTest

        Y_pred, md = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        scores = categorization_score(idx_gt,
                                      ground_truth.is_relevant.values,
                                      X_pred, np.argmax(Y_pred, axis=1))
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.7)
        cat.delete()
    elif method == 'DuplicateDetection':
        dd = _DuplicateDetectionWrapper(cache_dir=cache_dir, parent_id=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            if use_lsi:
                parent_id = lsi.mid
                method = 'birch'
            else:
                parent_id = uuid
                method = 'k_means'
            cat = _ClusteringWrapper(cache_dir=cache_dir,
                                     parent_id=parent_id)
            cm = getattr(cat, method)
            labels = cm(2)
            htree = cat._get_htree(cat.pipeline.data)
            terms = cat.compute_labels(n_top_words=10)
        else:
            with pytest.raises(NotImplementedError):
                _ClusteringWrapper(cache_dir=cache_dir, parent_id=uuid)
    else:
        raise ValueError