import outline


def test_indexer():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()

    doc = outline.DocumentIndex(text)
    assert doc.lookup(100) == {
        "section": "I: LAUDED AND GLORIFIED ART THOU, O LORD, MY...",
        "paragraph": 1,
        "section_seq": 1
    }
    assert doc.lookup(1000) == {
        "section": "I: LAUDED AND GLORIFIED ART THOU, O LORD, MY...",
        "paragraph": 2,
        "section_seq": 1
    }
    assert doc.lookup(10000) == {
        "section": "VI: BEHOLD, HOW THE DIVERS PEOPLES AND KINDREDS...",
        "paragraph": 19,
        "section_seq": 6
    }
    assert doc.lookup(100000) == {
        "section": "XXIX: THE PURPOSE OF GOD IN CREATING MAN HATH...",
        "paragraph": 132,
        "section_seq": 29
    }
    # offset past the end of the document: lookup clamps to the final
    # section and paragraph (the terminal datum)
    assert doc.lookup(500000) == {
        "section": "CLXVI: WHOSO LAYETH CLAIM TO A REVELATION DIRECT...",
        "paragraph": 718,
        "section_seq": 166
    }


def test_para_getter():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)

    span = doc.get_paragraph_span(2)
    assert span == (349, 1081)


def test_sect_getter():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)

    span = doc.get_section_span(4)
    assert span == (4850, 5875)


def test_simple_getters():
    with open('texts/gleanings.txt', 'r', encoding='utf8') as myfile:
        text = myfile.read()
    doc = outline.DocumentIndex(text)

    assert doc.get_number_of_paragraphs() == 718
    assert doc.get_number_of_sections() == 166
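
The assertions above pin down the outline.DocumentIndex interface: a constructor that takes raw text, an offset-based lookup that clamps out-of-range offsets to the last paragraph, 1-based span getters, and simple counters. Below is a minimal sketch of how such an offset index could work, using bisect over precomputed paragraph start offsets. The class name, the blank-line paragraph convention, and the omission of section handling are all assumptions for illustration, not the project's actual implementation.

import bisect


class ParagraphOffsetIndex:
    """Hypothetical sketch in the spirit of outline.DocumentIndex."""

    def __init__(self, text):
        # Record a (start, end) character span for every paragraph,
        # assuming paragraphs are separated by blank lines ('\n\n').
        self.spans = []
        start = 0
        for block in text.split('\n\n'):
            end = start + len(block)
            if block.strip():
                self.spans.append((start, end))
            start = end + 2  # skip the two-character delimiter
        self.starts = [s for s, _ in self.spans]

    def lookup(self, offset):
        # Binary-search for the paragraph containing `offset`; offsets
        # past the end clamp to the last paragraph, mirroring the
        # overflow behaviour asserted in test_indexer above.
        i = bisect.bisect_right(self.starts, offset) - 1
        i = max(0, min(i, len(self.spans) - 1))
        return i + 1  # paragraph numbers are 1-based in the tests

    def get_paragraph_span(self, number):
        return self.spans[number - 1]

    def get_number_of_paragraphs(self):
        return len(self.spans)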
Example #5
    def __init__(self, document_index=DOCUMENT_INDEX, nlpengine=None):
        """
        Initializes engine; reads and indexes everything.
        :param document_index: List of file metadata, as per the object declaration
        in documetadata.py under documentadata.DOCUMENT_INDEX. In practice, the
        docmetadata.DOCUMENT_INDEX will be used, which will contain all of
        the documents analyzed.
        :param nlpengine: the spacy nlpengine. Normally this is not passed in
        because the whole purpose of DocumentCollection is to encapsulate said
        engine. But this is necessary if you contruct different DocumentCollections
        (e.g. for test purposes) because of the size of that object.
        """
        if nlpengine is None:
            self.nlp = spacy.load('en')  # spaCy 1.x shortcut for the English model
        else:
            self.nlp = nlpengine
        self.document_index = document_index

        for doc_obj in self.document_index:
            # read in the text of the next document to analyze
            with open(DocumentCollection.DOC_FOLDER +
                      self.document_index[doc_obj]["file"],
                      'r',
                      encoding='utf8') as next_file:
                text = next_file.read()

                # build the document index (offset-to-section/paragraph map)
                doc_index = outline.DocumentIndex(text)

                # run the spaCy 1.x pipeline stages explicitly: tokenize,
                # then tag, parse, and recognize named entities
                doc = self.nlp.tokenizer(text)
                self.nlp.tagger(doc)
                self.nlp.parser(doc)
                self.nlp.entity(doc)

                self.document_index[doc_obj]["raw"] = text
                self.document_index[doc_obj]["index"] = doc_index
                self.document_index[doc_obj]["nlpdoc"] = doc
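
As the docstring notes, the expensive spaCy engine can be loaded once and shared across several DocumentCollections. A hedged usage sketch; the test_index metadata dict here is hypothetical, shaped only after the "file" key that __init__ reads above.

import spacy

nlp = spacy.load('en')  # load the heavy engine once (spaCy 1.x shortcut)

# Hypothetical one-entry metadata dict; each value needs at least a
# "file" key, resolved relative to DocumentCollection.DOC_FOLDER.
test_index = {"gleanings": {"file": "gleanings.txt"}}

full_collection = DocumentCollection(nlpengine=nlp)
test_collection = DocumentCollection(document_index=test_index, nlpengine=nlp)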