Example #1
def test_api_dupdetection(app, kind, options):

    if kind == 'simhash':
        try:
            import simhash  # noqa: F401 -- optional dependency
        except ImportError:
            from unittest import SkipTest
            raise SkipTest('simhash is not installed')

    dsid, pars, _ = get_features_cached(app, hashed=False)

    method = V01 + "/feature-extraction/{}".format(dsid)
    data = app.get_check(method)

    url = V01 + "/duplicate-detection"
    pars = {'parent_id': dsid,
            'method': kind}
    data = app.post_check(url, json=pars)
    assert dict2type(data) == {'id': 'str'}
    mid = data['id']

    url += '/{}'.format(mid)
    data = app.get_check(url, query_string=options)

    assert dict2type(data, max_depth=1) == {'data': 'list'}
    for row in data['data']:
        assert dict2type(row, max_depth=1) == {'cluster_id': 'int',
                                               'cluster_similarity': 'float',
                                               'documents': 'list'}

    app.delete_check(url)
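For context, the REST flow this test drives can be sketched as a plain HTTP session. The server address and the /api/v0 prefix (the value of V01) are assumptions, and dsid stands in for the id returned by the feature-extraction step:

import requests

BASE = "http://localhost:5001/api/v0"   # assumed deployment address
dsid = "<feature-extraction id>"        # returned when the dataset was vectorized

# create a duplicate-detection model on top of an existing feature set
res = requests.post(BASE + "/duplicate-detection",
                    json={"parent_id": dsid, "method": "simhash"})
mid = res.json()["id"]

# fetch the clusters; each row carries cluster_id, cluster_similarity, documents
data = requests.get(BASE + "/duplicate-detection/{}".format(mid)).json()
for row in data["data"]:
    print(row["cluster_id"], row["cluster_similarity"], len(row["documents"]))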
Example #2
def test_dictkey2type():
    from freediscovery.utils import dict2type

    assert dict2type('djsk') == 'str'
    assert dict2type(['t', 1]) == ['str', 'int']
    assert dict2type({'t': {'b': 0.1}}) == {'t': {'b': 'float'}}

    # make sure we don't change the original object
    x = {'x': {'a': 3}}
    dict2type(x)
    assert x == {'x': {'a': 3}}
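Reading these assertions together: dict2type maps scalars to their type names, converts lists element-wise, rewrites dict values while keeping keys, and never mutates its input. A minimal sketch consistent with that behavior (not the actual freediscovery.utils implementation), also covering the max_depth and collapse_lists keywords used in the other examples:

def dict2type_sketch(obj, max_depth=None, collapse_lists=False, _depth=0):
    """Recursively replace values by their type names (illustrative only)."""
    if max_depth is not None and _depth >= max_depth:
        return type(obj).__name__
    if isinstance(obj, dict):
        return {key: dict2type_sketch(val, max_depth, collapse_lists, _depth + 1)
                for key, val in obj.items()}
    if isinstance(obj, list):
        res = [dict2type_sketch(val, max_depth, collapse_lists, _depth + 1)
               for val in obj]
        # with collapse_lists=True a list of identical specs collapses to one
        if collapse_lists and len(set(map(str, res))) == 1:
            return res[:1]
        return res
    return type(obj).__name__

assert dict2type_sketch({'t': {'b': 0.1}}) == {'t': {'b': 'float'}}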
Example #3
def test_stop_words_integration(app, hashed):
    url = V01 + '/stop-words/'

    sw_name = 'test1w'
    pars = {'name': sw_name,
            'stop_words': ['and', 'or', 'in']}

    res = app.post_check(url, json=pars)
    assert dict2type(res, collapse_lists=True) == {'name': 'str'}
    assert res['name'] == sw_name

    res = app.get_check(url + sw_name)
    assert dict2type(res, collapse_lists=True) == {'name': 'str',
                                                   'stop_words': ['str']}
    assert res['name'] == sw_name
    assert res['stop_words'] == pars['stop_words']

    dsid, pars, _ = get_features(app, hashed=hashed, stop_words=sw_name)
Example #4
def test_get_search_filenames(app):

    dsid, _, _ = get_features_cached(app)

    method = V01 + "/feature-extraction/{}/id-mapping".format(dsid)

    def _filter_dict(x, filter_field):
        return {key: val for key, val in x.items() if key == filter_field}

    response_ref = {
        'internal_id': 'int',
        'file_path': 'str',
        'document_id': 'int'
    }

    # Query 1
    file_path_obj = [{'file_path': val} for val in ['00401.txt', '00506.txt']]
    data = app.post_check(method, json={'data': file_path_obj})
    data = data['data']

    for idx in range(len(data)):
        assert dict2type(data[idx]) == response_ref
    assert [_filter_dict(row, 'file_path') for row in data] == file_path_obj
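    # in the cached fixture, document_id is generated as internal_id ** 2,
    # which is what the squared comparison below checks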
    assert_equal(
        np.asarray([row['internal_id'] for row in data])**2,
        [row['document_id'] for row in data])

    with pytest.raises(NotFound):
        res = app.post(method, json={'data': [{'file_path': '00400.txt'}]})

    # Query 2
    file_path_obj = [{'document_id': 4}, {'document_id': 9}]
    data = app.post_check(method, json={'data': file_path_obj})
    data = data['data']

    for idx in range(len(data)):
        assert dict2type(data[idx]) == response_ref
    assert [_filter_dict(row, 'document_id') for row in data] == file_path_obj
    assert_equal(
        np.asarray([row['internal_id'] for row in data])**2,
        [row['document_id'] for row in data])
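The id-mapping endpoint accepts either identifier as the lookup key. As a standalone client call (server address and /api/v0 prefix assumed):

import requests

BASE = "http://localhost:5001/api/v0"   # assumed deployment address
dsid = "<feature-extraction id>"
url = BASE + "/feature-extraction/{}/id-mapping".format(dsid)

# look up documents by file_path ...
res = requests.post(url, json={"data": [{"file_path": "00401.txt"}]})

# ... or by document_id; every returned row maps all three identifiers
res = requests.post(url, json={"data": [{"document_id": 4}]})
print(res.json()["data"])  # [{'internal_id': ..., 'file_path': ..., 'document_id': ...}]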
Example #5
def test_append_documents(app, ingestion_method):

    method = V01 + "/feature-extraction/"
    data = app.post_check(method)
    dsid = data['id']
    method += dsid
    app.post_check(method, json={'data_dir': data_dir})

    data = app.get_check(method)

    # check that file_path is returned by the id-mapping only when requested
    data = app.post_check(method + '/id-mapping',
                          json={'return_file_path': False})
    assert dict2type(data['data'][0]) == \
        {'document_id': 'int', 'internal_id': 'int'}

    data = app.post_check(method + '/id-mapping',
                          json={'return_file_path': True})
    assert dict2type(data['data'][0]) == \
        {'document_id': 'int', 'file_path': 'str', 'internal_id': 'int'}
    db_old = data['data']

    dataset_definition = []
    for idx, row in enumerate(db_old):
        row_out = {'document_id': idx + 10}
        if ingestion_method == 'file_path':
            row_out['file_path'] = os.path.join(data_dir, row['file_path'])
        elif ingestion_method == 'content':
            with Path(data_dir, row['file_path']).open('rt',
                                                       encoding='utf-8') as fh:
                row_out['content'] = fh.read()
        dataset_definition.append(row_out)

    app.post_check(method + '/append',
                   json={'dataset_definition': dataset_definition})
    data = app.post_check(method + '/id-mapping',
                          json={'return_file_path': True})

    db_old = pd.DataFrame(db_old)
    db_new = pd.DataFrame(data['data'])
    assert db_old.shape[0] * 2 == db_new.shape[0]
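The payload built above allows two ingestion shapes per entry. A minimal hand-written dataset_definition showing both (paths and text here are hypothetical; the field names come from the test):

dataset_definition = [
    # ingest by pointing at a file on disk ...
    {"document_id": 10, "file_path": "/data/corpus/00401.txt"},
    # ... or by passing the raw text inline
    {"document_id": 11, "content": "raw document text ingested directly"},
]
# POSTed as {"dataset_definition": dataset_definition} to
# /feature-extraction/<dsid>/append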
Example #6
def test_get_feature_extraction(app, hashed):
    dsid, _, _ = get_features_cached(app, hashed=hashed)
    method = V01 + "/feature-extraction/{}".format(dsid)
    data = app.get_check(method)
    assert dict2type(data, collapse_lists=True) == {
        'analyzer': 'str', 'ngram_range': ['int'], 'stop_words': 'NoneType',
        'n_jobs': 'int', 'chunk_size': 'int', 'norm': 'str',
        'data_dir': 'str', 'n_samples': 'int', 'n_features': 'int',
        'use_idf': 'bool', 'binary': 'bool', 'sublinear_tf': 'bool',
        'use_hashing': 'bool', 'filenames': ['str'], 'max_df': 'float',
        'min_df': 'float', 'parse_email_headers': 'bool',
        'n_samples_processed': 'int'}
Example #7
def test_stop_words(app):
    name = "test_acstw"
    tested_stop_words = ['one', 'two', 'three', 'foure', 'five', 'six']

    method = V01 + "/stop-words/"
    pars = dict(name=name, stop_words=tested_stop_words)
    data = app.post_check(method, json=pars)

    method = V01 + "/stop-words/{}".format(name)
    data = app.get_check(method)

    assert dict2type(data, collapse_lists=True) == {
        'name': 'str',
        'stop_words': ['str']
    }
    assert data["stop_words"] == tested_stop_words

    method = V01 + "/stop-words/{}".format(name)
    app.delete_check(method)
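Outside the test client, the create/get/delete cycle above maps onto three plain HTTP calls (server address and /api/v0 prefix assumed):

import requests

BASE = "http://localhost:5001/api/v0"   # assumed deployment address

# create a named stop-words list
requests.post(BASE + "/stop-words/",
              json={"name": "test_acstw",
                    "stop_words": ["one", "two", "three"]})

# retrieve it by name, then delete it
words = requests.get(BASE + "/stop-words/test_acstw").json()["stop_words"]
requests.delete(BASE + "/stop-words/test_acstw")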
Example #8
def test_get_feature_extraction(app, hashed, weighting):
    norm_alpha = 0.5
    dsid, _, _ = get_features_cached(app,
                                     hashed=hashed,
                                     weighting=weighting,
                                     norm_alpha=norm_alpha)
    method = V01 + "/feature-extraction/{}".format(dsid)
    data = app.get_check(method)
    assert dict2type(data, collapse_lists=True) == {
        'analyzer': 'str',
        'ngram_range': ['int'],
        'stop_words': 'str',
        'n_jobs': 'int',
        'chunk_size': 'int',
        'data_dir': 'str',
        'n_samples': 'int',
        'n_features': 'int',
        'weighting': 'str',
        'norm_alpha': 'float',
        'use_hashing': 'bool',
        'filenames': ['str'],
        'max_df': 'float',
        'min_df': 'float',
        'parse_email_headers': 'bool',
        'n_samples_processed': 'int',
        'preprocess': [],
        'column_ids': 'NoneType',
        'column_separator': 'str'
    }

    assert data['use_hashing'] == hashed
    assert data['weighting'] == weighting
    assert data['norm_alpha'] == norm_alpha

    vect = joblib.load(
        os.path.join(CACHE_DIR, 'ediscovery_cache', dsid, 'vectorizer'))
    assert (data['use_hashing'] is True) == ('hashing'
                                             in type(vect).__name__.lower())