def test_ingestion_metadata(n_fields):
    """Build a DocumentIndex from per-file metadata; check columns and paths.

    ``n_fields`` controls how many keys each metadata record carries:
    ``file_path`` only, plus ``document_id`` when ``n_fields >= 2``, plus
    ``rendition_id`` when ``n_fields >= 3``.
    """
    records = []
    for position, path in enumerate(fnames_in_abs):
        record = {'file_path': path}
        if n_fields >= 2:
            record['document_id'] = 'a' + str(position + 100)
            if n_fields >= 3:
                record['rendition_id'] = 1
        records.append(record)

    index = DocumentIndex.from_list(records)
    index._make_relative_paths()
    base_dir = index.data_dir
    relative_names = index.filenames_
    frame = index.data

    # Expected column names, listed in sorted order.
    if n_fields in [1, 2]:
        expected_columns = ['document_id', 'file_path', 'internal_id']
    elif n_fields == 3:
        expected_columns = ['document_id', 'file_path', 'internal_id',
                            'rendition_id']

    assert_array_equal(sorted(frame.columns.values), expected_columns)

    # Paths rebuilt from ``filenames_`` must match those stored in the frame.
    paths_from_names = [os.path.normpath(os.path.join(base_dir, name))
                        for name in relative_names]
    paths_from_frame = [os.path.join(base_dir, stored)
                        for stored in frame.file_path.values]
    assert_array_equal(paths_from_names, paths_from_frame)
def test_search_document_id():
    """Search an index by ``internal_id``, by both ids, and by ``document_id``."""
    # (file_path, document_id); internal_id is the position in this table.
    table = [('/test', 2), ('/test2', 1), ('/test3', 7),
             ('/test8', 9), ('/test9', 4)]
    records = [
        {'file_path': path, 'document_id': doc_id, 'internal_id': position}
        for position, (path, doc_id) in enumerate(table)
    ]

    index = DocumentIndex.from_list(records)
    index._make_relative_paths()

    # Lookup by internal_id only; repeated ids are returned in query order.
    by_internal = pd.DataFrame([{'internal_id': value} for value in (1, 2, 1)])
    found = index.search(by_internal)
    assert_equal(found.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(found.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    # make sure we use internal id first
    by_both = pd.DataFrame([{'internal_id': value, 'document_id': 2}
                            for value in (1, 2, 1)])
    found = index.search(by_both)
    assert_equal(found.internal_id.values, [1, 2, 1])

    # Lookup by document_id only.
    by_document = pd.DataFrame([{'document_id': value} for value in (4, 9, 2)])
    found = index.search(by_document)
    assert_equal(found.internal_id.values, [4, 3, 0])
def test_bad_search_document_rendition_id():
    """A query whose rows mix different id columns must raise ``NotFound``."""
    # (file_path, document_id, rendition_id); internal_id is the position.
    table = [('/test', 0, 0), ('/test2', 0, 1), ('/test3', 1, 0),
             ('/test8', 2, 0), ('/test9', 3, 0)]
    records = [
        {'file_path': path, 'document_id': doc_id,
         'rendition_id': rendition, 'internal_id': position}
        for position, (path, doc_id, rendition) in enumerate(table)
    ]

    index = DocumentIndex.from_list(records)

    # Two rows carry internal_id, the last one only document_id.
    mixed_query = pd.DataFrame([{'internal_id': 1},
                                {'internal_id': 2},
                                {'document_id': 1}])
    with pytest.raises(NotFound):
        index.search(mixed_query)
def test_ingestion_render(return_file_path):
    """Check ``render_dict`` / ``render_list`` with and without file paths.

    ``return_file_path`` is compared against the string
    ``'return_file_path'`` to obtain the boolean flag passed to the index.
    """
    # make it a binary variable
    with_file_path = (return_file_path == 'return_file_path')

    def _as_frame(rendered):
        # Normalize the rendered output to a DataFrame without 'file_path'.
        frame = pd.DataFrame(rendered)
        if with_file_path:
            assert 'file_path' in frame.columns
            del frame['file_path']
        return frame

    # (file_path, document_id); internal_id is the position in this table.
    table = [('/test', 2), ('/test2', 1), ('/test3', 7),
             ('/test8', 9), ('/test9', 4)]
    records = [
        {'file_path': path, 'document_id': doc_id, 'internal_id': position}
        for position, (path, doc_id) in enumerate(table)
    ]
    index = DocumentIndex.from_list(records)

    query = pd.DataFrame([{'a': 2, 'internal_id': 3},
                          {'a': 4, 'internal_id': 1}])
    expected_rows = [{'a': 2, 'internal_id': 3, 'document_id': 9},
                     {'a': 4, 'internal_id': 1, 'document_id': 1}]
    expected = pd.DataFrame(expected_rows)

    # render_dict with an explicit query.
    rendered = _as_frame(
        index.render_dict(query, return_file_path=with_file_path))
    assert_frame_equal(rendered, expected)

    # render_dict without a query: one row per indexed record.
    rendered = _as_frame(index.render_dict(return_file_path=with_file_path))
    first_row = pd.DataFrame([{'internal_id': 0, 'document_id': 2}])
    assert_frame_equal(rendered.loc[[0]], first_row)
    assert len(rendered) == len(records)

    # render_list with an explicit query.
    rendered = _as_frame(
        index.render_list(expected, return_file_path=with_file_path))
    assert sorted(rendered.keys()) == sorted(['internal_id', 'document_id',
                                              'a'])
    assert_frame_equal(pd.DataFrame(rendered), pd.DataFrame(expected_rows))

    # render_list with default arguments.
    rendered = index.render_list()
    assert sorted(rendered.keys()) == sorted(['internal_id', 'document_id'])
def test_search_document_rendition_id():
    """Search an index whose records carry both document and rendition ids."""
    # (file_path, document_id, rendition_id); internal_id is the position.
    table = [('/test', 0, 0), ('/test2', 0, 1), ('/test3', 1, 0),
             ('/test8', 2, 0), ('/test9', 3, 0)]
    records = [
        {'file_path': path, 'document_id': doc_id,
         'rendition_id': rendition, 'internal_id': position}
        for position, (path, doc_id, rendition) in enumerate(table)
    ]

    index = DocumentIndex.from_list(records)
    index._make_relative_paths()

    # can always index with internal_id
    by_internal = pd.DataFrame([{'internal_id': value} for value in (1, 2, 1)])
    found = index.search(by_internal)
    assert_equal(found.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(found.columns),
                       sorted(['internal_id', 'file_path', 'document_id',
                               'rendition_id']))

    # A query on document_id alone is expected to be rejected here
    # (document_id 0 appears with two different rendition ids).
    by_document = pd.DataFrame([{'document_id': value} for value in (0, 1, 2)])
    with pytest.raises(ValueError):
        index.search(by_document)

    # Supplying both document_id and rendition_id resolves each row.
    by_pair = pd.DataFrame([{'document_id': value, 'rendition_id': 0}
                            for value in (0, 1, 2)])
    found = index.search(by_pair)
    assert_equal(found.internal_id.values, [0, 2, 3])