import pytest
from numpy.testing import assert_equal

# The imports below assume FreeDiscovery's package layout; `check_cache`
# and `data_dir` are helpers defined elsewhere in the test suite
# (`check_cache` returns a temporary cache directory, `data_dir` points
# at a directory of sample text files).
from freediscovery.text import FeatureVectorizer
from freediscovery.exceptions import NotFound


# Parametrization assumed so pytest supplies the `use_hashing` argument.
@pytest.mark.parametrize('use_hashing', [True, False])
def test_search_filenames(use_hashing):
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(use_hashing=use_hashing)
    # Raw string so the `\d` regex escape survives as written
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    assert fe.db_ is not None

    # Check that filename -> index lookups round-trip for several slices,
    # including a reversed one
    for low, high, step in [(0, 1, 1),
                            (0, 4, 1),
                            (3, 1, -1)]:
        idx_slice = list(range(low, high, step))
        filenames_slice = [fe.filenames_[idx] for idx in idx_slice]
        idx0 = fe.db_._search_filenames(filenames_slice)
        assert_equal(idx0, idx_slice)
        assert_equal(filenames_slice, fe[idx0])

    # Looking up a filename that was never ingested must fail
    with pytest.raises(NotFound):
        fe.db_._search_filenames(['DOES_NOT_EXIST.txt'])

    if not use_hashing:
        # Feature names are only available without the hashing trick
        n_top_words = 5
        terms = fe.query_features([2, 3, 5], n_top_words=n_top_words)
        assert len(terms) == n_top_words

    fe.list_datasets()
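# Illustrative sketch, not FreeDiscovery's implementation: the round-trip
# asserted above amounts to a filename -> internal row index lookup. A toy
# stand-in for `fe.db_` (the class name and storage are assumptions) could
# look like this:

class _ToyDB:
    """Hypothetical stand-in mapping ingested filenames to row indices."""
    def __init__(self, filenames):
        self._idx = {name: i for i, name in enumerate(filenames)}

    def _search_filenames(self, filenames):
        # Raises KeyError for unknown names (the real db raises NotFound)
        return [self._idx[name] for name in filenames]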
import os


def _list(args):
    cache_dir = _parse_cache_dir(args.cache_dir)
    fe = FeatureVectorizer(cache_dir)
    res = fe.list_datasets()
    # Show the most recently created datasets first
    res = sorted(res, key=lambda row: row['creation_date'], reverse=True)
    for row in res:
        print(' * Processed dataset {}'.format(row['id']))
        print('    - data_dir: {}'.format(row['data_dir']))
        print('    - creation_date: {}'.format(row['creation_date']))
        # List any models computed on top of this dataset
        for method in ['lsi', 'categorizer', 'cluster', 'dupdet', 'threading']:
            dpath = os.path.join(fe.cache_dir, row['id'], method)
            if not os.path.exists(dpath):
                continue
            mid_list = os.listdir(dpath)
            if mid_list:
                print('    # {}'.format(method))
                for mid in mid_list:
                    print('        * {}'.format(mid))
        print(' ')
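# Hypothetical wiring for the `list` subcommand: `_list` expects an
# argparse-style namespace with a `cache_dir` attribute, so it could be
# hooked up as below. The parser name, flag default, and entry point are
# assumptions, not the package's actual CLI setup.
import argparse


def _setup_parser():
    parser = argparse.ArgumentParser(description='FeatureVectorizer dataset tools')
    subparsers = parser.add_subparsers()
    list_parser = subparsers.add_parser('list', help='list processed datasets')
    list_parser.add_argument('--cache-dir', default='/tmp/',
                             help='cache directory used by FeatureVectorizer')
    list_parser.set_defaults(func=_list)
    return parser


if __name__ == '__main__':
    args = _setup_parser().parse_args()
    args.func(args)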