def insert():
    """
    Index documents in Elasticsearch.

    Thin wrapper: delegates all of the work to `Document_Text.es_insert()`
    and returns nothing.
    """
    Document_Text.es_insert()
def corpus_index(requires_es):
    """
    Clear the corpus index by calling `Document_Text.es_reset()`.

    Args:
        requires_es: Not used in the body. Presumably a fixture dependency
            that ensures Elasticsearch is available before the reset runs
            (TODO confirm against the fixture definition).
    """
    Document_Text.es_reset()
def test_matches(corpus_index, mock_hlom, add_doc, add_hlom):
    """
    When OSP documents match the query, write link rows.

    Five documents are indexed: three whose text contains the record's
    title and author, and two that share only the author. After running
    `query()` for the record, exactly the three matching documents should
    be linked to it in `HLOM_Citation`.
    """
    # Documents that should be linked to the record.
    matching = [
        add_doc('War and Peace, Leo Tolstoy 1'),
        add_doc('War and Peace, Leo Tolstoy 2'),
        add_doc('War and Peace, Leo Tolstoy 3'),
    ]

    # Same author, different title: these must NOT be linked.
    add_doc('Anna Karenina, Leo Tolstoy 1')
    add_doc('Anna Karenina, Leo Tolstoy 2')

    Document_Text.es_insert()

    record = add_hlom('War and Peace', 'Leo Tolstoy')

    query(record.id)

    # Should write 3 citation links.
    assert HLOM_Citation.select().count() == 3

    # Should match the right documents. Use an explicit existence check
    # rather than relying on the truthiness of a query object.
    for doc in matching:
        assert HLOM_Citation.select().where(
            HLOM_Citation.document == doc,
            HLOM_Citation.record == record,
        ).exists()
def reset():
    """
    Reset the index.

    Thin wrapper: delegates to `Document_Text.es_reset()` and returns
    nothing.
    """
    Document_Text.es_reset()
def delete():
    """
    Delete the index.

    Thin wrapper: delegates to `Document_Text.es_delete()` and returns
    nothing.
    """
    Document_Text.es_delete()
def create():
    """
    Create the index.

    Thin wrapper: delegates to `Document_Text.es_create()` and returns
    nothing.
    """
    Document_Text.es_create()
def test_no_matches(corpus_index, add_doc, add_hlom):
    """
    A record that matches no indexed document should produce no citation
    rows.
    """
    # Index a single document whose title differs from the record's.
    add_doc('War and Peace, Leo Tolstoy')
    Document_Text.es_insert()

    hlom_record = add_hlom('Master and Man', 'Leo Tolstoy')
    query(hlom_record.id)

    # The citation table should still be empty.
    citation_count = HLOM_Citation.select().count()
    assert citation_count == 0
def ext_semester(doc_id):
    """
    Try to find a "Spring/Fall YY/YYYY" pattern in a document's text.

    Only the first occurrence in the text is considered. The candidate row
    is saved only when its `date` falls after 1980 and before the current
    time.

    Args:
        doc_id (int): The document id.

    Returns:
        The saved `Document_Date_Semester` row, or None when no semester
        marker is found or the resulting date is out of range.
    """
    doc_text = Document_Text.get(Document_Text.document == doc_id)

    # Verbose, case-insensitive: a season name, then whitespace and/or
    # apostrophes, then a 4- or 2-digit year.
    semester_re = re.compile(
        r''' (?P<semester>fall|autumn|winter|spring|summer) [\s\']+ (?P<year>\d{4}|\d{2}) ''',
        re.I | re.X,
    )

    found = semester_re.search(doc_text.text)
    if found is None:
        return None

    row = Document_Date_Semester(
        document=doc_id,
        offset=found.start(),
        semester=found.group('semester'),
        year=found.group('year'),
    )

    # `row.date` is provided by the model (not visible here); presumably a
    # datetime derived from the semester and year -- TODO confirm.
    plausible = row.date.year > 1980 and row.date < datetime.now()
    if not plausible:
        return None

    row.save()
    return row
def count():
    """
    Count documents.

    Prints the value returned by `Document_Text.es_count()` using
    `click.echo`; nothing is returned.
    """
    click.echo(Document_Text.es_count())
def test_es_doc(models):
    """
    `Document_Text.es_doc` should expose the Elasticsearch representation
    of a text row: the document path as `_id`, the document id as
    `doc_id`, and the text as `body`.
    """
    document = Document.create(path='000/abc')
    text_row = Document_Text.create(document=document, text='text')

    expected_fields = [
        ('_id', '000/abc'),
        ('doc_id', document.id),
        ('body', 'text'),
    ]

    for field, expected in expected_fields:
        assert text_row.es_doc[field] == expected
def test_es_insert(models, config, corpus_index):
    """
    Document_Text.es_insert() should index all rows in Elasticsearch.

    Each indexed document is fetched by its document path and must carry
    the owning document's id and the text row's text.
    """
    # Index 10 documents; the loop counter doubles as both path and text.
    for i in range(10):
        doc = Document.create(path=str(i))
        Document_Text.create(document=doc, text=str(i))

    Document_Text.es_insert()

    # Should insert 10 docs.
    assert Document_Text.es_count() == 10

    # For each text row:
    for t in Document_Text.select():

        # A document should exist.
        doc = config.es.get('osp', t.document.path)

        # Should index text / doc ID. Compare the body against the row's
        # own text rather than the path, which only happens to be equal.
        assert doc['_source']['doc_id'] == t.document.id
        assert doc['_source']['body'] == t.text
def test_text_extraction_fails(models, mock_osp):
    """
    An empty file yields no extractable text, so `ext_text()` should not
    write a `Document_Text` row.
    """
    # Register a document that points at an empty file.
    empty_path = mock_osp.add_file(content="")
    empty_doc = Document.create(path=empty_path)

    ext_text(empty_doc.id)

    # No text row should have been created.
    row_count = Document_Text.select().count()
    assert row_count == 0
def test_text_extraction_succeeds(models, mock_osp):
    """
    `ext_text()` should extract the text of a document and store it in a
    `Document_Text` row linked to that document.
    """
    # Register a document that points at a file with known content.
    file_path = mock_osp.add_file(content="text")
    source_doc = Document.create(path=file_path)

    ext_text(source_doc.id)

    # Fetch the row written for this document and check its text.
    stored = Document_Text.get(Document_Text.document == source_doc)
    assert stored.text == "text"
def ext_text(doc_id):
    """
    Write the document as plain text.

    Looks up the document by id and, when its syllabus has non-empty text,
    stores that text in a new `Document_Text` row.

    Args:
        doc_id (int): The document id.

    Returns:
        The created `Document_Text` row, or None when the syllabus text is
        empty / falsy.
    """
    document = Document.get(Document.id == doc_id)

    # Nothing to store when no text could be read.
    if not document.syllabus.text:
        return None

    return Document_Text.create(
        text=document.syllabus.text,
        document=document,
    )
def test_link_with_document(models, mock_osp):
    """
    The semester row produced by `ext_semester()` should point at the
    document whose id was passed in, not at the id of the text row.
    """
    # Two documents but only one text row, so the text row's id and its
    # document's id differ.
    Document.create(path='path1')
    target_doc = Document.create(path='path2')

    text_row = Document_Text.create(document=target_doc, text='Fall 2012')

    # Sanity check: the ids really are different.
    assert text_row.id != text_row.document.id

    semester_row = ext_semester(target_doc.id)

    assert semester_row.document == target_doc
def term_counts(out_file, n):
    """
    Write word frequency counts for N docs as CSV.

    Emits a `term,count` header followed by one row per term, in the order
    returned by `most_common()` on the counts object.

    Args:
        out_file: Writable text file object that receives the CSV.
        n: Passed through to `Document_Text.term_counts()`; per the
            original docstring, the number of documents to count over.
    """
    # The header is written before the counts are pulled.
    writer = csv.DictWriter(out_file, ['term', 'count'])
    writer.writeheader()

    # `most_common()` implies a Counter-like result -- TODO confirm.
    ranked = Document_Text.term_counts(n).most_common()

    writer.writerows(
        {'term': term, 'count': freq}
        for term, freq in ranked
    )