def test_es_insert(models, config, corpus_index):
    """
    Document_Text.es_insert() should index all text rows in Elasticsearch.
    """
    # Index 10 documents; the loop index serves as both path and text.
    for i in range(10):
        doc = Document.create(path=str(i))
        Document_Text.create(document=doc, text=str(i))

    Document_Text.es_insert()

    # Should insert 10 docs.
    assert Document_Text.es_count() == 10

    # For each text row:
    for t in Document_Text.select():

        # A document should exist, looked up by the document path.
        doc = config.es.get('osp', t.document.path)

        # Should index the doc ID and the row's own text. The body is
        # compared to the text column (not the path) so the check does not
        # depend on the two happening to hold the same value.
        assert doc['_source']['doc_id'] == t.document.id
        assert doc['_source']['body'] == t.text
def test_link_with_document(models, mock_osp):
    """
    When a semester marker is found, the resulting metadata row should point
    at the document whose id was handed to the job.
    """
    # Two document rows; only the second one receives text.
    Document.create(path='path1')
    target = Document.create(path='path2')

    # A single text row, attached to the second document.
    text_row = Document_Text.create(document=target, text='Fall 2012')

    # The text row id and its document id must differ here.
    assert text_row.id != text_row.document.id

    semester_row = ext_semester(target.id)
    assert semester_row.document == target
def test_es_doc(models):
    """
    Document_Text#es_doc should describe the row as an Elasticsearch document.
    """
    document = Document.create(path='000/abc')
    row = Document_Text.create(document=document, text='text')

    # Expected fields, checked in order: path as id, document id, raw text.
    expected_fields = (
        ('_id', '000/abc'),
        ('doc_id', document.id),
        ('body', 'text'),
    )

    for key, value in expected_fields:
        assert row.es_doc[key] == value
def test_format_counts(models):
    """
    Document_Format.format_counts() should return (format, count) pairs,
    with the most frequent format first.
    """
    d1 = Document.create(path='1')
    d2 = Document.create(path='2')
    d3 = Document.create(path='3')
    d4 = Document.create(path='4')
    d5 = Document.create(path='5')
    d6 = Document.create(path='6')

    # The created format rows are not needed afterwards, so they are not
    # bound to names.

    # 1 doc with 'format1'.
    Document_Format.create(document=d1, format='format1')

    # 2 docs with 'format2'.
    Document_Format.create(document=d2, format='format2')
    Document_Format.create(document=d3, format='format2')

    # 3 docs with 'format3'.
    Document_Format.create(document=d4, format='format3')
    Document_Format.create(document=d5, format='format3')
    Document_Format.create(document=d6, format='format3')

    assert Document_Format.format_counts() == [
        ('format3', 3),
        ('format2', 2),
        ('format1', 1),
    ]
def test_institution_counts(models):
    """
    Document_Institution.institution_counts() should provide syllabus counts
    for each institution id.
    """
    i1 = Institution.create()
    i2 = Institution.create()
    i3 = Institution.create()

    d1 = Document.create(path='d1')
    d2 = Document.create(path='d2')
    d3 = Document.create(path='d3')
    d4 = Document.create(path='d4')
    d5 = Document.create(path='d5')
    d6 = Document.create(path='d6')

    # 1 document for institution 1.
    Document_Institution.create(institution=i1, document=d1)

    # 2 documents for institution 2.
    Document_Institution.create(institution=i2, document=d2)
    Document_Institution.create(institution=i2, document=d3)

    # 3 documents for institution 3.
    Document_Institution.create(institution=i3, document=d4)
    Document_Institution.create(institution=i3, document=d5)
    Document_Institution.create(institution=i3, document=d6)

    # Keys are institution ids. Keying by document ids would only pass
    # while the two id sequences happen to line up.
    assert Document_Institution.institution_counts() == {
        i1.id: 1,
        i2.id: 2,
        i3.id: 3,
    }
def _doc(content='content'):
    """
    Add a file holding `content`, insert a document row for it, run text
    extraction on that row, and return the document.

    `mock_osp` is not a parameter; it is taken from the enclosing scope.
    """
    # Write the file and resolve its relative path through Syllabus.
    relative_path = Syllabus(mock_osp.add_file(content=content)).relative_path

    # Insert the document row, then extract its text.
    row = Document.create(path=relative_path)
    ext_text(row.id)

    return row
def test_text_extraction_fails(models, mock_osp):
    """
    When a file yields no text, no `document_text` row should be written.
    """
    # Register an empty file as a document.
    empty_path = mock_osp.add_file(content="")
    row = Document.create(path=empty_path)

    ext_text(row.id)

    # The text table should still be empty.
    assert Document_Text.select().count() == 0
def test_read_format(models, mock_osp):
    """
    ext_format() should record the file's format in a Document_Format row
    linked to the document.
    """
    # Create a document row for a freshly added default file.
    document = Document.create(path=mock_osp.add_file())

    ext_format(document.id)

    # Fetch the row written for this document.
    format_row = Document_Format.get(Document_Format.document == document)
    assert format_row.format == 'text/plain'
def _ext(ftype):
    """
    Add a file of type `ftype`, run file-metadata date extraction on its
    document, and return the first Document_Date_File_Metadata row for it.

    `mock_osp` is not a parameter; it is taken from the enclosing scope.
    """
    # Create a document backed by a file of the requested type.
    document = Document.create(path=mock_osp.add_file(ftype=ftype))

    # Extract the date.
    ext_file_metadata(document.id)

    # Look up the row linked to this document.
    matching = Document_Date_File_Metadata.select().where(
        Document_Date_File_Metadata.document == document
    )
    return matching.first()
def _ext(url):
    """
    Add a file whose log carries `url`, run archive-URL date extraction on
    its document, and return the first Document_Date_Archive_Url row for it.

    `mock_osp` is not a parameter; it is taken from the enclosing scope.
    """
    # Create a document whose log records the given URL.
    document = Document.create(path=mock_osp.add_file(log={'url': url}))

    # Extract the date.
    ext_archive_url(document.id)

    # Look up the row linked to this document.
    matching = Document_Date_Archive_Url.select().where(
        Document_Date_Archive_Url.document == document
    )
    return matching.first()
def test_text_extraction_succeeds(models, mock_osp):
    """
    ext_text() should extract a document's text and store it in the
    `document_text` table.
    """
    # Register a file with known content as a document.
    document = Document.create(path=mock_osp.add_file(content="text"))

    ext_text(document.id)

    # Fetch the row written for this document.
    text_row = Document_Text.get(Document_Text.document == document)
    assert text_row.text == "text"
def _ext(content):
    """
    Add a file holding `content`, extract its text and then its semester,
    and return the first Document_Date_Semester row for the document.

    `mock_osp` is not a parameter; it is taken from the enclosing scope.
    """
    # Create a document backed by a file with the given content.
    document = Document.create(path=mock_osp.add_file(content=content))

    # Text extraction runs first, then semester extraction.
    ext_text(document.id)
    ext_semester(document.id)

    # Look up the row linked to this document.
    matching = Document_Date_Semester.select().where(
        Document_Date_Semester.document == document
    )
    return matching.first()