예제 #1
0
    def __next__(self):
        """Produce the document for the next entry in ``self.files``.

        The cursor ``self.index`` is advanced on every successful call.
        When ``self.unpack`` is truthy the file is handed to
        ``Document.from_kdxa``; otherwise a new ``Document`` is built whose
        metadata and source describe the file and this connector.

        Raises:
            StopIteration: once ``self.index`` has moved past the last file.
        """
        if self.index >= len(self.files):
            raise StopIteration

        self.index += 1
        current_file = self.files[self.index - 1]

        if self.unpack:
            return Document.from_kdxa(current_file)

        metadata = DocumentMetadata({
            "source_path": current_file,
            "connector": self.get_name(),
            # guess_type() returns a (type, encoding) tuple; it is stored as-is.
            "mime_type": mimetypes.guess_type(current_file),
            "connector_options": {
                "path": self.path,
                "file_filter": self.file_filter,
            },
        })
        document = Document(metadata)

        document.source.original_filename = os.path.basename(current_file)
        document.source.original_path = self.path
        document.source.connector = self.get_name()

        # TODO we need to get the checksum and last_updated and created times
        return document
예제 #2
0
def get_test_document():
    """Build a two-node fixture document.

    The root node has type ``foo`` and content ``cheese``; it receives one
    child of type ``bar`` with content ``fishstick``.
    """
    document = Document(DocumentMetadata())

    root = document.create_node(type='foo')
    root.content = "cheese"
    document.content_node = root

    # Resolve the parent first, then create the child, mirroring the
    # evaluation order of a nested call.
    parent = document.content_node
    child = document.create_node(type='bar', content='fishstick')
    parent.add_child(child)

    return document
예제 #3
0
def create_document():
    """Build a fixture document: a ``foo`` root with a single ``bar`` child.

    The root's content is ``cheese`` and the child's content is
    ``fishstick``; both are assigned after node creation.
    """
    document = Document(DocumentMetadata())

    root = document.create_node(type='foo')
    root.content = "cheese"
    document.content_node = root

    child = document.create_node(type='bar')
    child.content = "fishstick"
    document.content_node.add_child(child)

    return document
예제 #4
0
def create_document():
    """Build a fixture document with a source filename and two nodes.

    ``source.original_filename`` is set to ``test.doc``. The root node
    (``foo`` / ``cheese``) gets one child (``bar`` / ``fishstick``).
    """
    document = Document(DocumentMetadata())
    document.source.original_filename = "test.doc"

    root = document.create_node(node_type='foo')
    root.content = "cheese"
    document.content_node = root

    child = document.create_node(node_type='bar')
    child.content = "fishstick"
    document.content_node.add_child(child)

    return document
예제 #5
0
 def __next__(self):
     """Return a new ``Document`` describing ``self.file``.

     Raises:
         StopIteration: when ``self.completed`` is truthy.
     """
     if self.completed:
         raise StopIteration

     # NOTE(review): ``self.completed`` is not updated in this method;
     # presumably it is set elsewhere - confirm, otherwise iteration
     # over this connector never terminates.
     metadata = DocumentMetadata({
         "source_path": self.file,
         "connector": self.get_name(),
         # guess_type() returns a (type, encoding) tuple; stored as-is.
         "mime_type": mimetypes.guess_type(self.file),
         "connector_options": {"file": self.file},
     })
     return Document(metadata)
예제 #6
0
 def __next__(self):
     """Yield exactly one ``Document`` carrying the URL connector settings.

     The first call marks the connector as completed and returns the
     document; later calls raise ``StopIteration``.
     """
     if self.completed:
         raise StopIteration

     self.completed = True
     metadata = DocumentMetadata({
         "connector": self.get_name(),
         "connector_options": {"url": self.url, "headers": self.headers},
     })
     return Document(metadata)
예제 #7
0
def get_test_document_with_three_children():
    """Build a fixture document whose root node has three ``bar`` children.

    The root has type ``foo`` and content ``cheese``. The children are
    appended in order with contents ``fishstick``, ``cheeseburger`` and
    ``beans``.
    """
    document = Document(DocumentMetadata())

    root = document.create_node(type='foo')
    root.content = "cheese"
    document.content_node = root

    for child_content in ('fishstick', 'cheeseburger', 'beans'):
        document.content_node.add_child(
            document.create_node(type='bar', content=child_content))

    return document
예제 #8
0
def test_virtual_navigation_with_no_0_index():
    """Check navigation across child positions that were never populated.

    A single child is added at index 2, so indexes 0 and 1 have no real
    node. The test expects ``get_node_at_index(0)`` and its ``next_node()``
    to report ``None`` content, and the following ``next_node()`` to reach
    the real child with content ``banana2``.
    """
    document = Document(DocumentMetadata())
    document.add_mixin('core')
    node = document.create_node(type='loopy')
    node.content = "banana"
    document.content_node = node

    document.content_node.add_child(document.create_node(type='loopy',
                                                         content='banana2'),
                                    index=2)

    assert document.content_node.get_node_at_index(0).content is None
    assert document.content_node.get_node_at_index(
        0).next_node().content is None
    # Compare by value: ``is`` against a string literal only passes when the
    # interpreter happens to intern both strings, and it triggers a
    # SyntaxWarning on Python 3.8+.
    assert document.content_node.get_node_at_index(
        0).next_node().next_node().content == 'banana2'
예제 #9
0
    def __next__(self):
        """Yield one ``Document`` for ``self.url``, then stop.

        The document's metadata records the connector name and its options
        (url and headers); the same details are copied onto
        ``document.source``.

        Raises:
            StopIteration: on every call after the first.
        """
        if self.completed:
            raise StopIteration
        self.completed = True

        metadata = DocumentMetadata({
            "connector": self.get_name(),
            "connector_options": {"url": self.url, "headers": self.headers},
        })
        document = Document(metadata)

        document.source.connector = self.get_name()
        document.source.original_path = self.url
        document.source.headers = self.headers
        return document
예제 #10
0
 def __next__(self):
     """Return a ``Document`` for the next entry in ``self.files``.

     ``self.index`` is advanced by one on each successful call.

     Raises:
         StopIteration: once ``self.index`` has moved past the last file.
     """
     if self.index >= len(self.files):
         raise StopIteration

     self.index += 1
     current_file = self.files[self.index - 1]

     metadata = DocumentMetadata({
         "source_path": current_file,
         "connector": self.get_name(),
         # guess_type() returns a (type, encoding) tuple; stored as-is.
         "mime_type": mimetypes.guess_type(current_file),
         "connector_options": {
             "path": self.path,
             "file_filter": self.file_filter,
         },
     })
     return Document(metadata)
예제 #11
0
    def __next__(self):
        """Return a ``Document`` describing ``self.file``.

        Besides the metadata, the document's ``source`` is populated with
        the file name/path and the connector name.

        Raises:
            StopIteration: when ``self.completed`` is truthy.
        """
        if self.completed:
            raise StopIteration

        # NOTE(review): ``self.completed`` is not updated in this method;
        # presumably it is set elsewhere - confirm.
        metadata = DocumentMetadata({
            "source_path": self.file,
            "connector": self.get_name(),
            # guess_type() returns a (type, encoding) tuple; stored as-is.
            "mime_type": mimetypes.guess_type(self.file),
            "connector_options": {"file": self.file},
        })
        document = Document(metadata)

        # NOTE(review): the filename receives the full value of self.file
        # while the path receives only its basename - this looks inverted;
        # verify against the code that reads these fields before changing.
        document.source.original_filename = self.file
        document.source.original_path = os.path.basename(self.file)
        document.source.connector = self.get_name()

        # TODO we need to get the checksum and last_updated and created times
        return document
예제 #12
0
def test_url_pipeline():
    """Run a one-document pipeline from a URL-backed document into a JSON store.

    The document's metadata names the ``url`` connector with
    ``http://www.google.com``. The pipeline applies a ``TextParser`` with
    ISO-8859-1 encoding and sinks into a freshly initialised
    ``JsonDocumentStore``. The test expects exactly one processed document,
    no document exceptions and one stored document, then prints the stored
    document's content.
    """
    url_metadata = DocumentMetadata({
        "connector": "url",
        "connector_options": {"url": "http://www.google.com"},
    })
    document = Document(url_metadata)
    new_document_store = JsonDocumentStore("/tmp/test-json-store",
                                           force_initialize=True)

    # Same fluent chain as a one-liner, spelled out step by step.
    pipeline = Pipeline(document)
    pipeline = pipeline.add_step(TextParser(encoding='ISO-8859-1'))
    pipeline = pipeline.set_sink(new_document_store)
    stats = pipeline.run().statistics

    assert stats.documents_processed == 1
    assert stats.document_exceptions == 0
    assert new_document_store.count() == 1

    stored_document = new_document_store.get_document(0)
    stored_document.add_mixin('core')
    print(stored_document.content_node.get_all_content())
예제 #13
0
def test_virtual_navigation_with_no_0_index():
    """Check navigation over unpopulated child indexes, before and after kddb.

    A single child is added at index 2. Positions 0 and 1 are expected to
    report ``None`` content and the third position ``banana2``. The same
    expectations are then checked on a document rebuilt via
    ``to_kddb`` / ``Document.from_kddb``.
    """

    def check_sparse_children(doc):
        # Each assertion starts again from get_node_at_index(0), exactly as
        # three independent lookups would.
        first = doc.content_node.get_node_at_index(0)
        assert first.content is None

        second = doc.content_node.get_node_at_index(0).next_node()
        assert second.content is None

        third = doc.content_node.get_node_at_index(0).next_node().next_node()
        assert third.content == 'banana2'

    document = Document(DocumentMetadata())
    root = document.create_node(node_type='loopy')
    root.content = "banana"
    document.content_node = root

    parent = document.content_node
    sparse_child = document.create_node(node_type='loopy', content='banana2')
    parent.add_child(sparse_child, index=2)

    check_sparse_children(document)

    test_kddb = document.to_kddb()
    new_kddb = Document.from_kddb(test_kddb)

    check_sparse_children(new_kddb)