示例#1
0
def syllabus_refinement(in_file, out_file, r, threshold):
    """
    Select the documents around a given threshold in the syllabus /
    not-syllabus classifier predictions.

    Rows are read from `in_file`, ranked by descending score, and the
    document whose score is closest to `threshold` is taken as the center.
    The window runs from `r` positions before the center up to (but not
    including) `r` positions after it, clipped to the bounds of the ranking.
    One CSV row (id, title, text) is written per document in the window.

    Args:
        in_file: Iterable of CSV lines with two columns, read as `path` and
            `score`. The column names are supplied here, so the first line
            is treated as data, not as a header.
        out_file: Writable file object that receives the output CSV.
        r (int): Half-width of the window around the center document.
        threshold (float): Score the window is centered on.
    """

    cols = ['path', 'score']
    reader = csv.DictReader(in_file, cols)

    # Gather ordered (path, score) tuples. The loop variable is deliberately
    # not named `r`, so it can never clobber the `r` parameter.
    scores = [(rec['path'], float(rec['score'])) for rec in reader]
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    # Nothing to rank: leave a header-only CSV instead of failing in min().
    if not scores:
        return

    # Get the index of the document at the threshold.
    center = min(range(len(scores)),
                 key=lambda x: abs(scores[x][1] - threshold))

    # Clamp the lower bound. A negative start index would be interpreted
    # relative to the end of the list and select the wrong documents.
    start = max(center - r, 0)

    for path, _score in scores[start:center + r]:

        row = (Document_Text.select(Document_Text.text,
                                    Document.path).join(Document).where(
                                        Document.path == path).naive().first())

        # No text row exists for this path, so there is nothing to write.
        if row is None:
            continue

        writer.writerow({'id': row.path, 'title': row.path, 'text': row.text})
def test_text_extraction_fails(mock_osp):
    """
    Extracting text from an empty file must not store a text row.
    """

    # Register a document backed by a zero-length file.
    empty_path = mock_osp.add_file(content='')
    doc = Document.create(path=empty_path)

    ext_text(doc.id)

    # The text table should still be empty.
    stored_rows = Document_Text.select().count()
    assert stored_rows == 0
def test_text_extraction_fails(mock_osp):
    """
    When extraction is run on a file with no content, no row is written.
    """

    # Create a document whose file is empty.
    blank_file = mock_osp.add_file(content='')
    blank_doc = Document.create(path=blank_file)

    ext_text(blank_doc.id)

    # No text rows should exist afterwards.
    text_row_count = Document_Text.select().count()
    assert text_row_count == 0
    def es_stream_docs(cls):
        """
        Stream every document-text row as a dict.

        Yields:
            dict: `_id` holds the row's document id, `body` holds its text.
        """

        text_rows = query_bar(Document_Text.select())

        for text_row in text_rows:
            yield {'_id': text_row.document_id, 'body': text_row.text}
    def es_stream_docs(cls):
        """
        Walk all document-text rows and emit one dict for each.

        Yields:
            dict: The row's document id under `_id` and its text under
            `body`.
        """

        all_texts = Document_Text.select()

        for record in query_bar(all_texts):
            doc = {'_id': record.document_id}
            doc['body'] = record.text
            yield doc
示例#6
0
def random(out_file, n):
    """
    Dump the plaintext of N randomly-ordered documents to a CSV.

    Each output row uses the document path for both `id` and `title`, and
    the stored text for `text`.

    Args:
        out_file: Writable file object that receives the CSV.
        n (int): Maximum number of rows to select.
    """

    # Set up the output CSV and emit its header row.
    fields = ['id', 'title', 'text']
    csv_out = csv.DictWriter(out_file, fields)
    csv_out.writeheader()

    # Text joined to its document, shuffled in the database, capped at n.
    joined = Document_Text.select(Document_Text.text, Document.path)
    joined = joined.join(Document)
    sample = joined.order_by(fn.random()).limit(n)

    for doc in query_bar(sample):
        csv_out.writerow(dict(id=doc.path, title=doc.path, text=doc.text))
示例#7
0
def truncated(out_file, frag_len):
    """
    Dump every document's text, cut to a fixed length, to a CSV.

    Each output row uses the document path for both `id` and `title`; the
    `text` column holds the first `frag_len` characters of the stored text.

    Args:
        out_file: Writable file object that receives the CSV.
        frag_len (int): Slice bound applied to each document's text.
    """

    # Set up the output CSV and emit its header row.
    fields = ['id', 'title', 'text']
    csv_out = csv.DictWriter(out_file, fields)
    csv_out.writeheader()

    # Every text row, joined to its owning document for the path.
    texts = Document_Text.select(Document_Text.text, Document.path)
    texts = texts.join(Document)

    for doc in query_bar(texts):
        csv_out.writerow(
            dict(id=doc.path, title=doc.path, text=doc.text[:frag_len]))