def test_insert_documents(models, mock_osp):
    """
    Document.insert_documents() should create one row for every file that
    was added to the mock corpus.
    """
    # 10 segments x 10 files, as (segment, file name) pairs.
    files = [
        (segment, segment + '-' + str(n))
        for segment in segment_range(10)
        for n in range(10)
    ]

    for segment, name in files:
        mock_osp.add_file(segment=segment, name=name)

    # Register the files as document rows.
    Document.insert_documents()

    # One row per file.
    assert Document.select().count() == 100

    # Every file should be matched by exactly one row.
    for segment, name in files:

        # Paths take the form [segment]/[file].
        path = segment + '/' + name

        matches = Document.select().where(Document.path == path)
        assert matches.count() == 1
def insert_documents():
    """
    Register documents in the database.

    Thin entry point that delegates to `Document.insert_documents()`.
    """
    Document.insert_documents()
def match_doc(id):
    """
    Link a document to the first institution whose web address contains
    the document's registered domain.

    Args:
        id (int): A document id.

    Returns:
        None. When a matching institution is found, a
        `Document_Institution` row is created as a side effect.
    """
    doc = Document.get(Document.id == id)

    # Read the domain once and reuse it for both the guard and the query,
    # instead of going through `doc.syllabus` twice.
    domain = doc.syllabus.registered_domain

    # Nothing to match against when the document has no registered domain.
    if not domain:
        return

    # Wrap the domain in wildcards so it matches anywhere in the address.
    pattern = '%' + domain + '%'

    # NOTE(review): `**` is presumably the ORM's case-insensitive LIKE
    # operator -- confirm against the query library in use.
    inst = (
        Institution
        .select()
        .where(Institution.metadata['Institution_Web_Address'] ** pattern)
        .order_by(Institution.id)
        .first()
    )

    if inst:
        Document_Institution.create(
            document=doc.id,
            institution=inst,
        )
def test_es_insert(models, config, corpus_index):
    """
    Document_Text.es_insert() should index every text row in Elasticsearch.
    """
    # Create 10 documents, each with a text row. The text is deliberately
    # different from the path so the `body` assertion below cannot pass by
    # comparing against the wrong field.
    for i in range(10):
        doc = Document.create(path=str(i))
        Document_Text.create(document=doc, text='text' + str(i))

    Document_Text.es_insert()

    # Should insert 10 docs.
    assert Document_Text.es_count() == 10

    # For each text row:
    for t in Document_Text.select():

        # A document should exist, looked up by the document path.
        doc = config.es.get('osp', t.document.path)

        # Should index the doc ID and the extracted text.
        assert doc['_source']['doc_id'] == t.document.id
        assert doc['_source']['body'] == t.text
def ext_archive_url(doc_id):
    """
    Try to extract an Internet Archive timestamp from the URL.

    Args:
        doc_id (int): The document id.

    Returns:
        The new `Document_Date_Archive_Url` row, or None when the URL
        carries no archive timestamp or the parsed date is not earlier
        than the current time.
    """
    doc = Document.get(Document.id == doc_id)

    # Raw string: the backslash escapes are meant for the regex engine,
    # not for the Python string literal (a plain literal with `\.` / `\d`
    # triggers invalid-escape warnings).
    match = re.search(
        r'web\.archive\.org\/web\/(?P<timestamp>\d+)',
        doc.syllabus.url
    )

    if not match:
        return None

    date = datetime.strptime(match.group('timestamp'), date_format)

    # Only keep dates that lie before the current time.
    if date < datetime.now():
        return Document_Date_Archive_Url.create(
            document=doc,
            date=date,
        )

    return None
def queue_file_metadata():
    """
    Enqueue one `ext_file_metadata` job for every document row.
    """
    documents = query_bar(Document.select())

    for document in documents:
        config.rq.enqueue(ext_file_metadata, document.id)
def test_link_with_document(models, mock_osp):
    """
    When a semester marker is found, the new metadata row should point at
    the document whose id was passed to the job.
    """
    # Two document rows; only the second one gets a text row.
    Document.create(path='path1')
    target = Document.create(path='path2')

    text_row = Document_Text.create(document=target, text='Fall 2012')

    # The text row id and the document id must differ, otherwise the
    # final assertion could not tell the two apart.
    assert text_row.id != text_row.document.id

    semester_row = ext_semester(target.id)
    assert semester_row.document == target
def queue_archive_url():
    """
    Enqueue one `ext_archive_url` job for every document row.
    """
    documents = query_bar(Document.select())

    for document in documents:
        config.rq.enqueue(ext_archive_url, document.id)
def queue_semester():
    """
    Enqueue one `ext_semester` job for every document row.
    """
    documents = query_bar(Document.select())

    for document in documents:
        config.rq.enqueue(ext_semester, document.id)
def queue_match_doc():
    """
    Enqueue one `match_doc` job for every document row.
    """
    documents = Document.select()

    for document in documents:
        config.rq.enqueue(match_doc, document.id)
def queue_text():
    """
    Enqueue one `ext_text` job for every document row.
    """
    documents = query_bar(Document.select())

    for document in documents:
        config.rq.enqueue(ext_text, document.id)
def test_es_doc(models):
    """
    Document_Text.es_doc should expose the Elasticsearch document for a
    text row: the document path as `_id`, plus the document id and text.
    """
    document = Document.create(path='000/abc')
    row = Document_Text.create(document=document, text='text')

    # The path is used as the Elasticsearch id.
    assert row.es_doc['_id'] == '000/abc'

    # The payload carries the document id and the text.
    assert row.es_doc['doc_id'] == document.id
    assert row.es_doc['body'] == 'text'
def test_institution_counts(models):
    """
    Document_Institution.institution_counts() should provide syllabus
    counts for each institution id.
    """
    i1 = Institution.create()
    i2 = Institution.create()
    i3 = Institution.create()

    d1 = Document.create(path='d1')
    d2 = Document.create(path='d2')
    d3 = Document.create(path='d3')
    d4 = Document.create(path='d4')
    d5 = Document.create(path='d5')
    d6 = Document.create(path='d6')

    # 1 document for institution 1.
    Document_Institution.create(institution=i1, document=d1)

    # 2 documents for institution 2.
    Document_Institution.create(institution=i2, document=d2)
    Document_Institution.create(institution=i2, document=d3)

    # 3 documents for institution 3.
    Document_Institution.create(institution=i3, document=d4)
    Document_Institution.create(institution=i3, document=d5)
    Document_Institution.create(institution=i3, document=d6)

    # The counts are keyed by institution id, so the expectation must be
    # built from the institutions rather than the documents.
    assert Document_Institution.institution_counts() == {
        i1.id: 1,
        i2.id: 2,
        i3.id: 3,
    }
def test_format_counts(models):
    """
    Document_Format.format_counts() should return (format, count) pairs,
    with the most frequent format first.
    """
    # Six documents with paths '1' .. '6'.
    docs = [Document.create(path=str(n)) for n in range(1, 7)]

    # 1 doc with 'format1', 2 with 'format2', 3 with 'format3'.
    formats = [
        'format1',
        'format2', 'format2',
        'format3', 'format3', 'format3',
    ]

    # The created rows are not needed afterwards, so they are not bound.
    for doc, fmt in zip(docs, formats):
        Document_Format.create(document=doc, format=fmt)

    assert Document_Format.format_counts() == [
        ('format3', 3),
        ('format2', 2),
        ('format1', 1)
    ]
def _doc(content='content'):
    """
    Create a document row for a new mock file with the given content, run
    text extraction on it, and return the document.
    """
    # Write the file and wrap it as a syllabus.
    # NOTE(review): `mock_osp` is not defined in this block -- presumably
    # it comes from an enclosing fixture scope.
    syllabus = Syllabus(mock_osp.add_file(content=content))

    # Register the document under the syllabus' relative path.
    document = Document.create(path=syllabus.relative_path)

    # Extract its text; the returned row is not needed here.
    ext_text(document.id)

    return document
def test_text_extraction_fails(models, mock_osp):
    """
    ext_text() should not write a `Document_Text` row when the file yields
    no text.
    """
    # A document backed by an empty file.
    empty_path = mock_osp.add_file(content="")
    doc = Document.create(path=empty_path)

    ext_text(doc.id)

    # No text row should have been written.
    text_rows = Document_Text.select()
    assert text_rows.count() == 0
def ext_format(doc_id):
    """
    Record the libmagic file type of a document.

    Args:
        doc_id (int): The document id.

    Returns:
        The newly created `Document_Format` row.
    """
    doc = Document.get(Document.id == doc_id)

    file_type = doc.syllabus.libmagic_file_type

    return Document_Format.create(format=file_type, document=doc)
def test_read_format(models, mock_osp):
    """
    ext_format() should write the file format of a document to the
    `document_format` table.
    """
    # A document backed by a default mock file.
    doc = Document.create(path=mock_osp.add_file())

    ext_format(doc.id)

    # Fetch the row that was written for this document.
    format_row = Document_Format.get(Document_Format.document == doc)

    assert format_row.format == 'text/plain'
def _ext(ftype):
    """
    Create a document for a mock file of the given type, run the file
    metadata job on it, and return the first resulting date row (if any).
    """
    # Create the document.
    doc = Document.create(path=mock_osp.add_file(ftype=ftype))

    # Extract the date.
    ext_file_metadata(doc.id)

    # Look up the row written for this document.
    rows = Document_Date_File_Metadata.select().where(
        Document_Date_File_Metadata.document == doc
    )

    return rows.first()
def test_text_extraction_succeeds(models, mock_osp):
    """
    ext_text() should extract the text of a document and write it to the
    `document_text` table.
    """
    # A document backed by a file that contains "text".
    doc = Document.create(path=mock_osp.add_file(content="text"))

    ext_text(doc.id)

    # Fetch the row that was written for this document.
    text_row = Document_Text.get(Document_Text.document == doc)

    assert text_row.text == "text"
def _ext(url):
    """
    Create a document whose log records the given URL, run the archive URL
    job on it, and return the first resulting date row (if any).
    """
    # Create the document.
    doc = Document.create(path=mock_osp.add_file(log={'url': url}))

    # Extract the date.
    ext_archive_url(doc.id)

    # Look up the row written for this document.
    rows = Document_Date_Archive_Url.select().where(
        Document_Date_Archive_Url.document == doc
    )

    return rows.first()
def _ext(content):
    """
    Create a document with the given content, run text extraction and then
    the semester job on it, and return the first resulting date row (if
    any).
    """
    # Create the document.
    doc = Document.create(path=mock_osp.add_file(content=content))

    # Text first, then the date.
    ext_text(doc.id)
    ext_semester(doc.id)

    # Look up the row written for this document.
    rows = Document_Date_Semester.select().where(
        Document_Date_Semester.document == doc
    )

    return rows.first()
def ext_text(doc_id):
    """
    Write the document as plain text.

    Args:
        doc_id (int): The document id.

    Returns:
        The new `Document_Text` row, or None when the document yields no
        text (in which case nothing is written).
    """
    doc = Document.get(Document.id == doc_id)

    # Read the text once and reuse it for both the check and the write,
    # instead of going through `doc.syllabus.text` twice.
    text = doc.syllabus.text

    if text:
        return Document_Text.create(
            text=text,
            document=doc,
        )

    return None
def ext_file_metadata(doc_id):
    """
    Try to extract a created date from PDF and DOCX file metadata.

    Args:
        doc_id (int): The document id.

    Returns:
        The new `Document_Date_File_Metadata` row, or None when the
        syllabus exposes no created date.
    """
    doc = Document.get(Document.id == doc_id)

    created = doc.syllabus.created_date

    # Without a date there is nothing to write.
    if not created:
        return None

    return Document_Date_File_Metadata.create(document=doc, date=created)
def test_insert_new_documents(models, mock_osp):
    """
    Running Document.insert_documents() again after files were added
    should register only the new files.
    """
    # (segment, file name prefix, expected row total after the insert).
    batches = (
        ('000', '000-', 10),
        ('001', '001-', 20),
    )

    for segment, prefix, expected_total in batches:

        # 10 new files in this segment.
        for n in range(10):
            mock_osp.add_file(segment=segment, name=prefix + str(n))

        # Each pass should add exactly 10 docs to the running total.
        Document.insert_documents()
        assert Document.select().count() == expected_total