def doc_to_fields(doc_id, radius=100):
    """
    Search for field / department codes in a document.

    Args:
        doc_id (int): The document id.
        radius (int): Characters of context to keep on each side of a match.
    """

    doc_text = Document_Text.get(Document_Text.document == doc_id)

    # Search for each field.
    for subfield in Subfield.select():

        match = subfield.search(doc_text.text)

        # If found, link field -> doc.
        if match:

            # Slice out the snippet.
            i1 = max(match.start() - radius, 0)
            i2 = min(match.end() + radius, len(doc_text.text))
            snippet = doc_text.text[i1:i2]

            Subfield_Document.create(
                subfield=subfield,
                document=doc_text.document,
                offset=match.start(),
                snippet=crunch(snippet),
            )
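
# Usage sketch for doc_to_fields, assuming the peewee models above are
# bound to a populated database. The driving loop and the 50-character
# radius are illustrative, not part of the original pipeline.
for doc in Document.select():
    doc_to_fields(doc.id, radius=50)
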
def syllabus_refinement(in_file, out_file, r, threshold):
    """
    Select the 2*r documents straddling a given score threshold in the
    syllabus / not-syllabus classifier predictions.
    """

    cols = ['path', 'score']
    reader = csv.DictReader(in_file, cols)

    # Gather ordered (path, score) tuples.
    scores = [(row['path'], float(row['score'])) for row in reader]
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    # Get the index of the document at the threshold.
    center = min(range(len(scores)),
                 key=lambda x: abs(scores[x][1] - threshold))

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    # Clamp the slice start so a small `center` can't go negative and wrap
    # around to the end of the list.
    start = max(center - r, 0)

    for path, score in scores[start:center + r]:

        row = (Document_Text
               .select(Document_Text.text, Document.path)
               .join(Document)
               .where(Document.path == path)
               .naive()
               .first())

        writer.writerow({'id': row.path, 'title': row.path, 'text': row.text})
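
# Usage sketch: pull the 2*r documents straddling a 0.5 decision boundary
# into a CSV for manual review. The file names, radius, and threshold are
# illustrative.
with open('predictions.csv') as in_file, open('review.csv', 'w') as out_file:
    syllabus_refinement(in_file, out_file, r=500, threshold=0.5)
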
def test_text_extraction_fails(mock_osp):
    """
    If no text can be extracted, don't write the row.
    """

    # Add an empty file.
    path = mock_osp.add_file(content='')
    document = Document.create(path=path)

    ext_text(document.id)

    # Shouldn't write a row.
    assert Document_Text.select().count() == 0
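
# A minimal sketch of the `mock_osp` fixture these tests assume. The real
# fixture lives elsewhere in the test suite and likely does more (e.g.
# pointing the corpus config at the temp directory); this version covers
# only the add_file() call used above.
import pytest

@pytest.fixture
def mock_osp(tmp_path):
    class MockCorpus:
        def add_file(self, content):
            path = tmp_path / 'syllabus.txt'
            path.write_text(content)
            return str(path)
    return MockCorpus()
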
    @classmethod
    def es_stream_docs(cls):
        """
        Index document texts.

        Yields:
            dict: The next document.
        """

        for row in query_bar(Document_Text.select()):

            yield dict(
                _id=row.document_id,
                body=row.text,
            )
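
# Usage sketch: a consumer would typically hand es_stream_docs() to the
# Elasticsearch bulk helper. The client, the 'osp' index name, and the
# `DocumentText` class owning the method are assumptions -- the original
# class isn't shown here.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch()
bulk(es, ({'_index': 'osp', **doc} for doc in DocumentText.es_stream_docs()))
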
def test_text_extraction_succeeds(mock_osp):
    """
    ext_text() should extract text for a document and write the result into
    the `document_text` table.
    """

    # Add a file, create a document row.
    path = mock_osp.add_file(content='text')
    document = Document.create(path=path)

    ext_text(document.id)

    # Pop out the new row.
    row = Document_Text.get(Document_Text.document == document)
    assert row.text == 'text'
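
# The two extraction tests above could just as well be written as one
# parametrized case -- a sketch, assuming the same `mock_osp` fixture:
import pytest

@pytest.mark.parametrize('content,expected_rows', [('', 0), ('text', 1)])
def test_ext_text_row_count(mock_osp, content, expected_rows):
    path = mock_osp.add_file(content=content)
    document = Document.create(path=path)
    ext_text(document.id)
    assert Document_Text.select().count() == expected_rows
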
def term_counts(out_file, n):
    """
    Write word frequency counts for N docs.
    """

    # CSV writer.
    cols = ['term', 'count']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    # Pull counts.
    counts = Document_Text.term_counts(n)

    for term, count in counts.most_common():
        writer.writerow({'term': term, 'count': count})
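
# Usage sketch: dump term frequencies for a 1,000-document sample. The
# sample size and output path are illustrative.
with open('term_counts.csv', 'w') as out_file:
    term_counts(out_file, n=1000)
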
def ext_text(doc_id):
    """
    Write the document as plain text.

    Args:
        doc_id (int): The document id.
    """

    doc = Document.get(Document.id == doc_id)

    if doc.syllabus.text:

        return Document_Text.create(
            text=doc.syllabus.text,
            document=doc
        )
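
# Usage sketch: ext_text returns the new Document_Text row, or None when
# the syllabus yields no text, so the guard below is illustrative.
row = ext_text(doc_id=1)
if row is not None:
    print(row.text[:80])
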
def random(out_file, n):
    """
    Write a CSV with plaintext for N random docs.
    """

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    query = (Document_Text
             .select(Document_Text.text, Document.path)
             .join(Document)
             .order_by(fn.random())
             .limit(n))

    for row in query_bar(query):

        writer.writerow({'id': row.path, 'title': row.path, 'text': row.text})
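
# Usage sketch: sample 200 random docs into a CSV. Since the query orders
# by fn.random(), cost grows with table size; the count and path are
# illustrative.
with open('random_sample.csv', 'w') as out_file:
    random(out_file, n=200)
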
def truncated(out_file, frag_len):
    """
    Write a CSV with truncated document texts.
    """

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    query = (Document_Text
             .select(Document_Text.text, Document.path)
             .join(Document))

    for row in query_bar(query):

        # Truncate the text.
        fragment = row.text[:frag_len]

        writer.writerow({'id': row.path, 'title': row.path, 'text': fragment})
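
# Usage sketch: write the first 1,000 characters of every document text.
# The fragment length and output path are illustrative.
with open('truncated.csv', 'w') as out_file:
    truncated(out_file, frag_len=1000)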