Example #1
# Imports as in the FreeDiscovery test suite; exact module paths may differ
# between FreeDiscovery versions.
import pytest
from numpy.testing import assert_equal

from freediscovery.text import FeatureVectorizer
from freediscovery.exceptions import NotFound

from .run_suite import check_cache  # test helper returning the cache directory

# `data_dir` is a module-level path to the sample documents in the original test file.


@pytest.mark.parametrize('use_hashing', [False, True])
def test_search_filenames(use_hashing):
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')  # raw string: '\d' is an invalid escape otherwise

    assert fe.db_ is not None

    # Forward and backward slices through the ingested documents: searching
    # for a list of filenames must return the matching indices, in order.
    for low, high, step in [(0, 1, 1), (0, 4, 1), (3, 1, -1)]:
        idx_slice = list(range(low, high, step))
        filenames_slice = [fe.filenames_[idx] for idx in idx_slice]
        idx0 = fe.db_._search_filenames(filenames_slice)
        assert_equal(idx0, idx_slice)
        assert_equal(filenames_slice, fe[idx0])

    # A filename that was never ingested must raise NotFound
    with pytest.raises(NotFound):
        fe.db_._search_filenames(['DOES_NOT_EXIST.txt'])

    # query_features requires an explicit vocabulary, so it only applies
    # when hashing is disabled
    if not use_hashing:
        n_top_words = 5
        terms = fe.query_features([2, 3, 5], n_top_words=n_top_words)
        assert len(terms) == n_top_words

    fe.list_datasets()
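
The test exercises the full ingestion flow: create a vectorizer in write mode, set it up, ingest a directory, then query the filename database. A minimal sketch of the same flow outside pytest, assuming a folder of .txt documents; `cache_dir` and `doc_dir` are placeholder paths, and only calls demonstrated in the test above are used:

# Minimal usage sketch (not from the FreeDiscovery sources): cache_dir and
# doc_dir are hypothetical paths.
cache_dir = '/tmp/fd_cache'   # hypothetical cache location
doc_dir = '/tmp/documents'    # hypothetical folder of .txt files

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
fe.setup(use_hashing=False)          # plain vocabulary, so query_features works
fe.ingest(doc_dir, file_pattern=r'.*\.txt')

# Map a couple of filenames back to their internal indices, then back to filenames
idx = fe.db_._search_filenames(fe.filenames_[:2])
print(fe[idx])

# Top 5 terms for the first two documents
print(fe.query_features([0, 1], n_top_words=5))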
Example #2
import os


def _list(args):
    """Print all processed datasets found in the cache directory."""
    # _parse_cache_dir is a helper defined alongside this function in the
    # original CLI module; FeatureVectorizer is imported at module level there.
    cache_dir = _parse_cache_dir(args.cache_dir)
    fe = FeatureVectorizer(cache_dir)
    res = fe.list_datasets()
    # Most recently created datasets first
    res = sorted(res, key=lambda row: row['creation_date'], reverse=True)
    for row in res:
        print(' * Processed dataset {}'.format(row['id']))
        print('    - data_dir: {}'.format(row['data_dir']))
        print('    - creation_date: {}'.format(row['creation_date']))
        # For each processing step, list the model ids stored under this dataset
        for method in ['lsi', 'categorizer', 'cluster', 'dupdet', 'threading']:
            dpath = os.path.join(fe.cache_dir, row['id'], method)
            if not os.path.exists(dpath):
                continue
            mid_list = os.listdir(dpath)
            if mid_list:
                print('     # {}'.format(method))
                for mid in mid_list:
                    print('       * {}'.format(mid))
        print(' ')
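
For reference, a hedged sketch of calling `_list` directly; it assumes only that `args` exposes a `cache_dir` attribute, as the function above requires. The path is a placeholder; in the real CLI, the argument parser builds `args` from the command line and dispatches here.

import argparse

# Hypothetical direct invocation with a placeholder cache path
args = argparse.Namespace(cache_dir='/tmp/fd_cache')
_list(args)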