def _read_documents_impl(sample_document_files):
    """
    Exercise ``read_documents`` over a set of sample files.

    :param sample_document_files: iterable of ``(filepath, num_docs)`` pairs,
        where ``filepath`` may be a str or ``pathlib.Path`` and ``num_docs``
        is the expected number of documents inside that file.
    """
    # Case 1: default mode — locations are returned as native paths.
    for filepath, num_docs in sample_document_files:
        all_docs = list(read_documents(filepath))
        assert len(all_docs) == num_docs
        for path, doc in all_docs:
            assert isinstance(doc, dict)
        # Every returned location should stringify to the input path.
        assert set(str(f) for f, _ in all_docs) == {filepath}

    # Case 2: uri=True — locations are returned as URI strings.
    for filepath, num_docs in sample_document_files:
        all_docs = list(read_documents(filepath, uri=True))
        assert len(all_docs) == num_docs
        for uri, doc in all_docs:
            assert isinstance(doc, dict)
            assert isinstance(uri, str)

        url = as_url(filepath)
        if num_docs > 1:
            # Multi-document files get a '#part=<index>' fragment per document.
            # (The original wrapped `url` in a second, redundant as_url() call;
            # as_url is identity on URLs, so it is dropped here.)
            expect_uris = [url + '#part={}'.format(i) for i in range(num_docs)]
        else:
            expect_uris = [url]

        assert [f for f, _ in all_docs] == expect_uris
def load_documents(path):
    """
    Load document/s from the specified path.

    At the moment can handle:

     - JSON and YAML locally and remotely.
     - Compressed JSON and YAML locally
     - Data Cube Dataset Documents inside local NetCDF files.

    :param path: path or URI to load documents from
    :return: generator of dicts
    """
    path = str(path)
    url = as_url(path)
    scheme = urlparse(url).scheme
    compressed = url.endswith('.gz')

    if scheme == 'file' and path.endswith('.nc'):
        # Local NetCDF: bypass the generic opener/parser machinery.
        path = uri_to_local_path(url)
        yield from load_from_netcdf(path)
    else:
        with _PROTOCOL_OPENERS[scheme](url) as fh:
            if compressed:
                # Strip '.gz' so the parser is chosen from the inner suffix.
                # Context-manage the gzip wrapper too — previously it was
                # never closed (only the underlying fh was).
                path = path[:-3]
                with gzip.open(fh) as gz_fh:
                    parser = _PARSERS[Path(path).suffix]
                    yield from parser(gz_fh)
            else:
                parser = _PARSERS[Path(path).suffix]
                yield from parser(fh)
def _test_read_docs_impl(sample_documents: Iterable[Tuple[str, int]]):
    """
    Exercise ``read_documents(..., uri=True)`` over a set of sample documents.

    :param sample_documents: iterable of ``(doc_url, num_docs)`` pairs, where
        ``num_docs`` is the expected number of documents at that URL.
    """
    for doc_url, num_docs in sample_documents:
        all_docs = list(read_documents(doc_url, uri=True))
        assert len(all_docs) == num_docs
        for uri, doc in all_docs:
            assert isinstance(doc, dict)
            assert isinstance(uri, str)

        url = as_url(doc_url)
        if num_docs > 1:
            # Multi-document sources get a '#part=<index>' fragment.
            # (The original applied as_url() to `url` a second time; as_url
            # is identity on URLs, so the redundant call is dropped.)
            expect_uris = [url + '#part={}'.format(i) for i in range(num_docs)]
        else:
            expect_uris = [url]

        assert [f for f, _ in all_docs] == expect_uris
def process_file(path):
    """Yield ``(location, document)`` pairs for every document at *path*.

    In plain mode the location is *path* itself; in URI mode it is the
    path's URL, with a ``#part=<index>`` fragment appended when the file
    holds more than one document.

    NOTE(review): ``uri`` is a free variable resolved from an enclosing
    scope — this looks like a nested helper of a ``read_documents``-style
    function; confirm ``uri`` is defined there.
    """
    documents = load_documents(path)

    if not uri:
        # Plain mode: pair every document with the original path.
        for document in documents:
            yield path, document
        return

    base_url = as_url(path)

    def single(item):
        # Exactly one document: no '#part=' fragment is needed.
        _, document = item
        return base_url, document

    def numbered(item):
        # Several documents: tag each with its part index.
        index, document = item
        return mk_part_uri(base_url, index), document

    yield from map_with_lookahead(enumerate(documents),
                                  if_one=single,
                                  if_many=numbered)
def test_is_url(test_input, expected):
    """Verify URL classification, and that ``as_url`` is identity on URLs."""
    got = is_url(test_input)
    assert got == expected
    if expected:
        # For anything already recognised as a URL, as_url must hand back
        # the very same object unchanged.
        assert as_url(test_input) is test_input