def doc_to_fields(doc_id, radius=100):

    """
    Link a document to every subfield whose search matches its text.

    The `Document_Text` row for `doc_id` is loaded, then each `Subfield` row
    is asked to search the text. For every hit a `Subfield_Document` row is
    created holding the match offset and a snippet of surrounding text.

    Args:
        doc_id (int): Value compared against `Document_Text.document`.
        radius (int): Characters of context kept on each side of the match.
    """

    text_row = Document_Text.get(Document_Text.document==doc_id)

    body = text_row.text
    body_len = len(body)

    for subfield in Subfield.select():

        hit = subfield.search(body)

        # No match for this subfield -> nothing to link.
        if not hit:
            continue

        start = hit.start()

        # Clamp the context window to the bounds of the text.
        lo = max(start - radius, 0)
        hi = min(hit.end() + radius, body_len)

        # NOTE(review): crunch() is defined elsewhere; presumably it
        # normalizes the snippet before storage -- confirm.
        Subfield_Document.create(
            subfield=subfield,
            document=text_row.document,
            offset=start,
            snippet=crunch(body[lo:hi]),
        )
def doc_to_fields(doc_id, radius=100):
    """
    Record which subfields are matched by a document's text.

    Loads the `Document_Text` row for `doc_id`, runs every `Subfield`'s
    `search()` over the text and, for each match, creates a
    `Subfield_Document` row with the match offset and a context snippet.

    Args:
        doc_id (int): Value compared against `Document_Text.document`.
        radius (int): Characters of context kept on each side of the match.
    """

    doc_text = Document_Text.get(Document_Text.document == doc_id)

    full_text = doc_text.text
    text_end = len(full_text)

    for subfield in Subfield.select():

        found = subfield.search(full_text)

        # Skip subfields that do not occur in this document.
        if not found:
            continue

        offset = found.start()

        # Context window around the match, clamped to the text bounds.
        left = max(offset - radius, 0)
        right = min(found.end() + radius, text_end)
        context = full_text[left:right]

        # NOTE(review): crunch() is defined elsewhere; presumably it tidies
        # the snippet before it is stored -- confirm.
        Subfield_Document.create(
            subfield=subfield,
            document=doc_text.document,
            offset=offset,
            snippet=crunch(context),
        )
def test_text_extraction_succeeds(mock_osp):
    """
    ext_text() should extract the text of a document's file and store it in
    a `Document_Text` row linked to that document.
    """

    expected = 'text'

    # Register a file with known content and a document pointing at it.
    doc = Document.create(path=mock_osp.add_file(content=expected))

    ext_text(doc.id)

    # The extracted text should now be retrievable for the document.
    extracted = Document_Text.get(Document_Text.document == doc)
    assert extracted.text == expected
def test_text_extraction_succeeds(mock_osp):

    """
    ext_text() should extract the text of a document's file and write it to
    a `Document_Text` row for that document.
    """

    content = 'text'

    # Create the backing file first, then the document row that references it.
    file_path = mock_osp.add_file(content=content)
    doc_row = Document.create(path=file_path)

    ext_text(doc_row.id)

    # Fetch the row written for this document and check its text.
    text_row = Document_Text.get(Document_Text.document==doc_row)
    assert text_row.text == content