예제 #1
0
def test_ingestion_metadata(n_fields):
    """Check the columns and file paths of an index built from metadata.

    ``n_fields`` controls how many keys each input record carries:
    ``file_path`` only (1), plus ``document_id`` (2), plus
    ``rendition_id`` (3).
    """
    records = []
    for position, path in enumerate(fnames_in_abs):
        record = {'file_path': path}
        if n_fields >= 2:
            record['document_id'] = 'a{}'.format(position + 100)
        if n_fields >= 3:
            record['rendition_id'] = 1
        records.append(record)

    index = DocumentIndex.from_list(records)
    index._make_relative_paths()
    base_dir = index.data_dir
    stored_names = index.filenames_
    frame = index.data

    # The expected column names are listed in sorted order. Note that
    # 'document_id' and 'internal_id' are expected even when the input
    # records only carried 'file_path'.
    if n_fields in (1, 2):
        expected_columns = ['document_id', 'file_path', 'internal_id']
    elif n_fields == 3:
        expected_columns = ['document_id', 'file_path', 'internal_id',
                            'rendition_id']

    assert_array_equal(sorted(frame.columns.values), expected_columns)

    # Paths rebuilt from the stored file names must agree with the paths
    # rebuilt from the 'file_path' column of the underlying table.
    paths_from_names = [os.path.normpath(os.path.join(base_dir, name))
                        for name in stored_names]
    paths_from_frame = [os.path.join(base_dir, path)
                        for path in frame.file_path.values]
    assert_array_equal(paths_from_names, paths_from_frame)
예제 #2
0
def test_search_document_id():
    """Search an index by ``internal_id`` and by ``document_id``."""
    path_and_doc_id = [('/test', 2),
                       ('/test2', 1),
                       ('/test3', 7),
                       ('/test8', 9),
                       ('/test9', 4)]
    # internal_id is simply the position of the record in the list
    records = [{'file_path': path, 'document_id': doc_id, 'internal_id': row}
               for row, (path, doc_id) in enumerate(path_and_doc_id)]

    index = DocumentIndex.from_list(records)
    index._make_relative_paths()

    # Query by internal_id only; a repeated id is expected to be returned
    # once per query row, in query order.
    by_internal = pd.DataFrame([{'internal_id': iid} for iid in (1, 2, 1)])
    found = index.search(by_internal)
    assert_equal(found.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(found.columns),
                       ['document_id', 'file_path', 'internal_id'])

    # When both ids are given, internal_id is expected to take precedence:
    # document_id is 2 on every row, yet the result follows internal_id.
    by_both = pd.DataFrame([{'internal_id': iid, 'document_id': 2}
                            for iid in (1, 2, 1)])
    found = index.search(by_both)
    assert_equal(found.internal_id.values, [1, 2, 1])

    # Query by document_id only; results map back to the record positions.
    by_document = pd.DataFrame([{'document_id': doc_id}
                                for doc_id in (4, 9, 2)])
    found = index.search(by_document)
    assert_equal(found.internal_id.values, [4, 3, 0])
예제 #3
0
def test_bad_search_document_rendition_id():
    """A query mixing ``internal_id`` and ``document_id`` rows must fail.

    The search is expected to raise ``NotFound``.
    """
    fixture_rows = [('/test', 0, 0),
                    ('/test2', 0, 1),
                    ('/test3', 1, 0),
                    ('/test8', 2, 0),
                    ('/test9', 3, 0)]
    records = [{'file_path': path,
                'document_id': doc_id,
                'rendition_id': rendition,
                'internal_id': row}
               for row, (path, doc_id, rendition) in enumerate(fixture_rows)]

    index = DocumentIndex.from_list(records)

    # Two rows identify documents by internal_id, the last one only by
    # document_id, so no single key is present on every row.
    mixed_query = pd.DataFrame([{'internal_id': 1},
                                {'internal_id': 2},
                                {'document_id': 1}])
    with pytest.raises(NotFound):
        index.search(mixed_query)
예제 #4
0
def test_ingestion_render(return_file_path):
    """Check ``render_dict`` / ``render_list`` with and without file paths.

    ``return_file_path`` is compared against the string
    ``'return_file_path'`` to obtain the boolean passed on to the
    rendering methods.
    """
    # reduce the incoming value to a plain boolean flag
    return_file_path = (return_file_path == 'return_file_path')

    def _as_frame(rendered):
        # Convert the rendered output to a DataFrame; when file paths were
        # requested, check the column is present and drop it so the frame
        # can be compared against path-free expectations.
        frame = pd.DataFrame(rendered)
        if not return_file_path:
            return frame
        assert 'file_path' in frame.columns
        del frame['file_path']
        return frame

    path_and_doc_id = [('/test', 2),
                       ('/test2', 1),
                       ('/test3', 7),
                       ('/test8', 9),
                       ('/test9', 4)]
    records = [{'file_path': path, 'document_id': doc_id, 'internal_id': row}
               for row, (path, doc_id) in enumerate(path_and_doc_id)]

    index = DocumentIndex.from_list(records)

    expected_rows = [{'a': 2, 'internal_id': 3, 'document_id': 9},
                     {'a': 4, 'internal_id': 1, 'document_id': 1}]
    query = pd.DataFrame([{'a': 2, 'internal_id': 3},
                          {'a': 4, 'internal_id': 1}])
    expected = pd.DataFrame(expected_rows)

    # render_dict with a query: the extra column 'a' is kept and
    # document_id is filled in
    rendered = _as_frame(
        index.render_dict(query, return_file_path=return_file_path))
    assert_frame_equal(rendered, expected)

    # render_dict without a query: one entry per ingested record
    rendered = _as_frame(
        index.render_dict(return_file_path=return_file_path))
    first_row = pd.DataFrame([{'internal_id': 0, 'document_id': 2}])
    assert_frame_equal(rendered.loc[[0]], first_row)
    assert len(rendered) == len(records)

    # render_list is given the already complete frame
    rendered = _as_frame(
        index.render_list(expected, return_file_path=return_file_path))
    assert sorted(rendered.keys()) == ['a', 'document_id', 'internal_id']
    assert_frame_equal(pd.DataFrame(rendered), pd.DataFrame(expected_rows))

    # render_list with no arguments exposes only the two id fields
    rendered = index.render_list()
    assert sorted(rendered.keys()) == ['document_id', 'internal_id']
예제 #5
0
def test_search_document_rendition_id():
    """Search an index whose records carry both document and rendition ids."""
    fixture_rows = [('/test', 0, 0),
                    ('/test2', 0, 1),
                    ('/test3', 1, 0),
                    ('/test8', 2, 0),
                    ('/test9', 3, 0)]
    records = [{'file_path': path,
                'document_id': doc_id,
                'rendition_id': rendition,
                'internal_id': row}
               for row, (path, doc_id, rendition) in enumerate(fixture_rows)]

    index = DocumentIndex.from_list(records)
    index._make_relative_paths()

    # Searching by internal_id works here as well
    by_internal = pd.DataFrame([{'internal_id': iid} for iid in (1, 2, 1)])
    found = index.search(by_internal)
    assert_equal(found.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(found.columns),
                       ['document_id', 'file_path',
                        'internal_id', 'rendition_id'])

    # document_id alone is ambiguous in this fixture (document_id 0 has two
    # renditions); such a query is expected to raise ValueError.
    by_document = pd.DataFrame([{'document_id': doc_id}
                                for doc_id in (0, 1, 2)])
    with pytest.raises(ValueError):
        index.search(by_document)

    # Adding rendition_id to the query makes the lookup succeed
    by_document_and_rendition = pd.DataFrame(
        [{'document_id': doc_id, 'rendition_id': 0} for doc_id in (0, 1, 2)])
    found = index.search(by_document_and_rendition)
    assert_equal(found.internal_id.values, [0, 2, 3])