def test_url_pipeline():
    """Fetch a page over HTTP, run it through a parse-and-store pipeline,
    and verify the run statistics and the stored result."""
    source_document = Document.from_url("http://www.google.com")
    store = LocalDocumentStore()

    # Fluent pipeline: parse the raw text, then persist into the local store.
    stats = (
        Pipeline(source_document)
        .add_step(TextParser(encoding='ISO-8859-1'))
        .add_step(DocumentStoreWriter(store))
        .run()
        .statistics
    )

    # Exactly one document should have flowed through with no failures.
    assert stats.documents_processed == 1
    assert stats.document_exceptions == 0
    assert store.count() == 1

    stored_document = store.get_latest_document("http://www.google.com")
    print(stored_document.content_node.get_all_content())
def from_url(url, headers=None, *args, **kwargs):
    """Build a new pipeline whose input is a document created from the given URL.

    Args:
        url: The URL, e.g. https://www.google.com
        headers: A dictionary of HTTP headers to send (Default value = None)
        *args: Additional positional arguments forwarded to the Pipeline constructor
        **kwargs: Additional keyword arguments forwarded to the Pipeline constructor

    Returns:
        A new instance of a pipeline
    """
    # Fetch the URL into a document first, then wrap it in a pipeline.
    source_document = Document.from_url(url, headers)
    return Pipeline(source_document, *args, **kwargs)
def test_get_source():
    """Smoke test: open the source stream behind a URL-backed document and dump it."""
    document = Document.from_url('https://www.google.com')

    # get_source yields a file-like handle; the context manager closes it for us.
    with get_source(document) as source_stream:
        payload = source_stream.read()
    print(payload)