Python Document_Text примеры использования

Язык программирования: Python

Пространство имен/Пакет: osp.corpus.models.text

Класс/Тип: Document_Text

Примеров на hotexamples.com: 16

Найдено 16 примеров использования Document_Text на Python. Это лучшие примеры Python-кода для osp.corpus.models.text.Document_Text, взятые из проектов с открытым исходным кодом. Вы можете оценивать примеры, чтобы помочь нам повысить их качество.

Основные методы

Показать Скрыть

create(4)

es_insert(4)

es_count(2)

es_reset(2)

get(2)

select(2)

es_create(1)

es_delete(1)

term_counts(1)

Пример #1

Показать файл

Файл: corpus_index.py Проект: samzhang111/osp

def insert():

    """
    Index documents by calling `Document_Text.es_insert()`.
    """

    Document_Text.es_insert()

Пример #2

Показать файл

Файл: conftest.py Проект: overview/osp

def corpus_index(requires_es):

    """
    Clear the corpus index by calling `Document_Text.es_reset()`.

    `requires_es` is not used in the body; presumably it is requested only
    as a fixture dependency (confirm against the fixture definitions).
    """

    Document_Text.es_reset()

Пример #3

Показать файл

Файл: test_query.py Проект: overview/osp

def test_matches(corpus_index, mock_hlom, add_doc, add_hlom):

    """
    When OSP documents match the query, write link rows.

    Three "War and Peace" documents and two "Anna Karenina" documents are
    added and indexed. Querying a "War and Peace" record should produce
    exactly three citation rows, one per "War and Peace" document.
    """

    # Documents expected to be linked to the record.
    matching = [
        add_doc('War and Peace, Leo Tolstoy 1'),
        add_doc('War and Peace, Leo Tolstoy 2'),
        add_doc('War and Peace, Leo Tolstoy 3'),
    ]

    # Same author, different title: these must not be linked.
    add_doc('Anna Karenina, Leo Tolstoy 1')
    add_doc('Anna Karenina, Leo Tolstoy 2')

    Document_Text.es_insert()

    record = add_hlom('War and Peace', 'Leo Tolstoy')
    query(record.id)

    # Should write 3 citation links.
    assert HLOM_Citation.select().count() == 3

    # Should match the right documents. Use an explicit existence check
    # rather than relying on the truthiness of the query object.
    for doc in matching:

        assert HLOM_Citation.select().where(
            HLOM_Citation.document == doc,
            HLOM_Citation.record == record,
        ).exists()

Пример #4

Показать файл

Файл: corpus_index.py Проект: samzhang111/osp

def reset():

    """
    Reset the index by calling `Document_Text.es_reset()`.
    """

    Document_Text.es_reset()

Пример #5

Показать файл

Файл: corpus_index.py Проект: samzhang111/osp

def delete():

    """
    Delete the index by calling `Document_Text.es_delete()`.
    """

    Document_Text.es_delete()

Пример #6

Показать файл

Файл: corpus_index.py Проект: samzhang111/osp

def create():

    """
    Create the index by calling `Document_Text.es_create()`.
    """

    Document_Text.es_create()

Пример #7

Показать файл

Файл: test_query.py Проект: overview/osp

def test_no_matches(corpus_index, add_doc, add_hlom):

    """
    A query whose record matches no indexed document must leave the
    citation table empty.
    """

    # Index a single document with a different title.
    add_doc('War and Peace, Leo Tolstoy')
    Document_Text.es_insert()

    # Query for a record that the indexed document does not mention.
    hlom_record = add_hlom('Master and Man', 'Leo Tolstoy')
    query(hlom_record.id)

    # No citation rows should have been written.
    assert HLOM_Citation.select().count() == 0

Пример #8

Показать файл

Файл: ext_semester.py Проект: overview/osp

def ext_semester(doc_id):

    """
    Look for the first "Spring/Fall YY/YYYY"-style marker in a document's
    text and store it when it yields an acceptable date.

    Args:
        doc_id (int): The document id.

    Returns:
        Document_Date_Semester: The saved row, or None when no marker is
        found or the row's date fails the range check.
    """

    # Season name, then whitespace / apostrophes, then a 4- or 2-digit year.
    semester_re = re.compile(r'''
        (?P<semester>fall|autumn|winter|spring|summer)
        [\s\']+
        (?P<year>\d{4}|\d{2})
    ''', re.I | re.X)

    text_row = Document_Text.get(Document_Text.document==doc_id)

    found = semester_re.search(text_row.text)

    if not found:
        return None

    row = Document_Date_Semester(
        document=doc_id,
        offset=found.start(),
        semester=found.group('semester'),
        year=found.group('year')
    )

    # Discard dates from 1980 or earlier, and dates that are not in the past.
    if row.date.year <= 1980 or row.date >= datetime.now():
        return None

    row.save()
    return row

Пример #9

Показать файл

Файл: corpus_index.py Проект: samzhang111/osp

def count():

    """
    Echo the document count reported by `Document_Text.es_count()`.
    """

    click.echo(Document_Text.es_count())

Пример #10

Показать файл

Файл: test_es_doc.py Проект: overview/osp

def test_es_doc(models):

    """
    Document_Text.es_doc should describe the row as an Elasticsearch
    document: the document path as `_id`, the document id as `doc_id`, and
    the text as `body`.
    """

    document = Document.create(path='000/abc')
    text_row = Document_Text.create(document=document, text='text')

    expected_fields = (
        ('_id', '000/abc'),
        ('doc_id', document.id),
        ('body', 'text'),
    )

    for key, value in expected_fields:
        assert text_row.es_doc[key] == value

Пример #11

Показать файл

Файл: test_es_insert.py Проект: overview/osp

def test_es_insert(models, config, corpus_index):

    """
    Document_Text.es_insert() should index all text rows in Elasticsearch.
    """

    # Create 10 documents, each with a text row.
    for i in range(10):
        doc = Document.create(path=str(i))
        Document_Text.create(document=doc, text=str(i))

    Document_Text.es_insert()

    # Should insert 10 docs.
    assert Document_Text.es_count() == 10

    # For each text row:
    for t in Document_Text.select():

        # A document should exist, keyed by the document path.
        doc = config.es.get('osp', t.document.path)

        # Should index doc ID / text. Compare the body against the row's own
        # text rather than the path, which only happens to be equal here.
        assert doc['_source']['doc_id'] == t.document.id
        assert doc['_source']['body'] == t.text

Пример #12

Показать файл

Файл: test_ext_text.py Проект: samzhang111/osp

def test_text_extraction_fails(models, mock_osp):

    """
    An empty file yields no text, so no `Document_Text` row is written.
    """

    # Register a document backed by an empty file.
    empty_path = mock_osp.add_file(content="")
    doc = Document.create(path=empty_path)

    ext_text(doc.id)

    # The text table should still be empty.
    assert Document_Text.select().count() == 0

Пример #13

Показать файл

Файл: test_ext_text.py Проект: samzhang111/osp

def test_text_extraction_succeeds(models, mock_osp):

    """
    ext_text() should extract a document's text and store it in a
    `Document_Text` row linked to that document.
    """

    # Register a document backed by a file with known content.
    file_path = mock_osp.add_file(content="text")
    doc = Document.create(path=file_path)

    ext_text(doc.id)

    # Fetch the row written for this document and check its text.
    text_row = Document_Text.get(Document_Text.document == doc)
    assert text_row.text == "text"

Пример #14

Показать файл

Файл: ext_text.py Проект: overview/osp

def ext_text(doc_id):

    """
    Write the document as plain text.

    Args:
        doc_id (int): The document id.

    Returns:
        Document_Text: The newly created row, or None when the document's
        syllabus text is empty / falsy.
    """

    doc = Document.get(Document.id==doc_id)

    # Read the text once and reuse it. `syllabus.text` may be computed on
    # access (TODO confirm), so avoid evaluating it a second time.
    text = doc.syllabus.text

    if not text:
        return None

    return Document_Text.create(
        text=text,
        document=doc
    )

Пример #15

Показать файл

Файл: test_ext_semester.py Проект: overview/osp

def test_link_with_document(models, mock_osp):

    """
    The semester row produced by ext_semester() should point at the
    document whose id was passed in, not at a row that merely shares the
    text row's id.
    """

    # Two document rows; only the second one gets a text row.
    Document.create(path='path1')
    target = Document.create(path='path2')

    text_row = Document_Text.create(document=target, text='Fall 2012')

    # Precondition: the text row's id differs from its document's id.
    assert text_row.id != text_row.document.id

    semester_row = ext_semester(target.id)
    assert semester_row.document == target

Пример #16

Показать файл

Файл: corpus_csv.py Проект: overview/osp

def term_counts(out_file, n):

    """
    Dump word frequency counts for N docs as CSV.

    Writes a `term,count` header to `out_file`, then one row per term in
    the order returned by `most_common()` on the result of
    `Document_Text.term_counts(n)`.
    """

    fieldnames = ['term', 'count']
    writer = csv.DictWriter(out_file, fieldnames)
    writer.writeheader()

    frequencies = Document_Text.term_counts(n)

    # Stream the rows to the writer, most frequent term first.
    writer.writerows(
        {'term': term, 'count': count}
        for term, count in frequencies.most_common()
    )