def test_queue(api_client):
    """
    POSTing a work order to /queue should enqueue a single meta-job which,
    when performed, spools one `ext_text` job per document in the page.
    """

    # Seed 100 document rows with paths '0' .. '99'.
    for n in range(100):
        Document.create(path=str(n))

    payload = {
        'model_import': 'osp.corpus.models.Document',
        'job_import': 'osp.corpus.jobs.ext_text',
        'worker_count': 20,
        'offset': 10,
    }
    api_client.post('/queue', data=payload)

    # The request itself should put exactly one job on the queue.
    assert config.rq.count == 1

    # Pull the meta-job off the queue and execute it.
    meta_job = config.rq.dequeue()
    meta_job.perform()

    # Job i on the queue should be ext_text(id of the i-th paged document).
    for index, doc in enumerate(Document.page_cursor(20, 10)):
        job = config.rq.jobs[index]
        assert job.func == ext_text
        assert job.args == (doc.id,)
def insert_documents():
    """
    Register documents in the database.

    Thin entry point that delegates to `Document.insert_documents()`.
    """
    Document.insert_documents()
def test_insert_documents(mock_osp):
    """
    Document.insert_documents() should create exactly one row for every
    file in the mock corpus, with the path stored as [segment]/[file].
    """

    # Populate 10 segments with 10 files each.
    for segment in segment_range(10):
        for n in range(10):
            file_name = segment + "-" + str(n)
            mock_osp.add_file(segment=segment, name=file_name)

    Document.insert_documents()

    # 10 x 10 files -> 100 rows.
    assert Document.select().count() == 100

    # Every file should be represented by exactly one row.
    for segment in segment_range(10):
        for n in range(10):
            expected_path = segment + "/" + segment + "-" + str(n)
            matching = Document.select().where(Document.path == expected_path)
            assert matching.count() == 1
def test_queue(api_client):
    """
    /queue should accept a work order, enqueue one meta-job for it, and
    that meta-job should enqueue the individual `ext_text` jobs.
    """

    # 100 documents, paths '0' .. '99'.
    for number in range(100):
        Document.create(path=str(number))

    work_order = dict(
        model_import='osp.corpus.models.Document',
        job_import='osp.corpus.jobs.ext_text',
        worker_count=20,
        offset=10,
    )
    api_client.post('/queue', data=work_order)

    # Only the meta-job should be queued at this point.
    assert config.rq.count == 1

    # Run the meta-job.
    queued = config.rq.dequeue()
    queued.perform()

    # The spooled jobs should line up, in order, with the paged documents.
    for position, document in enumerate(Document.page_cursor(20, 10)):
        spooled = config.rq.jobs[position]
        assert spooled.func == ext_text
        assert spooled.args == (document.id,)
def queue_text():
    """
    Enqueue an `ext_text` job on `config.rq` for every document row.
    """
    documents = Document.select()

    # query_bar() wraps the query for iteration (presumably with a
    # progress bar -- confirm against its definition).
    for document in query_bar(documents):
        config.rq.enqueue(ext_text, document.id)
def run_doc_to_inst():
    """
    Match documents -> institutions.

    Calls `doc_to_inst()` for every document row. The run is best-effort:
    a failure on one document is printed and skipped so the remaining
    documents are still processed.
    """
    for doc in query_bar(Document.select()):
        try:
            doc_to_inst(doc.id)
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop a long run, and
        # report the error instead of discarding it silently.
        except Exception as e:
            print(e)
def run_doc_to_fields():
    """
    Match documents -> fields.

    Calls `doc_to_fields()` for every document row. The run is
    best-effort: a failure on one document is printed and skipped so the
    remaining documents are still processed.
    """
    for doc in query_bar(Document.select()):
        try:
            doc_to_fields(doc.id)
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop a long run, and
        # report the error instead of discarding it silently.
        except Exception as e:
            print(e)
def test_syllabus(mock_osp):
    """
    Document#syllabus should return a Syllabus instance whose path is the
    file that the document row points at.
    """
    file_path = mock_osp.add_file('000', name='123')
    document = Document.create(path='000/123')

    assert isinstance(document.syllabus, Syllabus)
    assert document.syllabus.path == file_path
def test_format_counts():
    """
    Document_Format.format_counts() should return (format, count) pairs
    ordered from the most common format to the least common.
    """

    # Six documents, paths '1' .. '6'.
    docs = [Document.create(path=str(n)) for n in range(1, 7)]

    # 1 doc with 'format1', 2 docs with 'format2', 3 docs with 'format3'.
    formats = ['format1'] + ['format2'] * 2 + ['format3'] * 3
    for doc, fmt in zip(docs, formats):
        Document_Format.create(document=doc, format=fmt)

    expected = [('format3', 3), ('format2', 2), ('format1', 1)]
    assert Document_Format.format_counts() == expected
def test_format_counts():
    """
    Document_Format.format_counts() should report each format together
    with its number of documents, highest count first.
    """

    # (document path, format) -- 1 x format1, 2 x format2, 3 x format3.
    fixtures = [
        ("1", "format1"),
        ("2", "format2"),
        ("3", "format2"),
        ("4", "format3"),
        ("5", "format3"),
        ("6", "format3"),
    ]

    # Create all document rows first, then attach a format row to each.
    documents = [Document.create(path=path) for path, _ in fixtures]
    for document, (_, fmt) in zip(documents, fixtures):
        Document_Format.create(document=document, format=fmt)

    counts = Document_Format.format_counts()
    assert counts == [("format3", 3), ("format2", 2), ("format1", 1)]
def link(cls):
    """
    Link documents -> institutions.

    Builds an in-memory index of institutions keyed by the parsed domain
    of their URL. Then, for each document, creates a row linking it to
    the institution whose URL regex produces the longest match against
    the document's syllabus URL. Documents with no matching institution
    are skipped. Any exception raised while handling a document is
    printed and the loop moves on to the next document.
    """

    # domain -> [(regex, inst), ...]
    by_domain = defaultdict(list)

    for institution in ServerSide(Institution.select()):
        key = parse_domain(institution.url)
        by_domain[key].append((seed_to_regex(institution.url), institution))

    for doc in query_bar(Document.select()):
        try:

            # TODO: Get rid of @property.
            url = doc.syllabus.url
            candidates = by_domain[parse_domain(url)]

            # Keep (matched text, inst) for every regex that hits the URL.
            searched = (
                (regex.search(url), inst)
                for regex, inst in candidates
            )
            hits = [(found.group(), inst) for found, inst in searched if found]

            if hits:
                # Longest matched text wins. max() returns the first
                # maximal item, i.e. the same element a stable descending
                # sort would place first.
                _, best = max(hits, key=lambda hit: len(hit[0]))
                cls.create(institution=best, document=doc)

        except Exception as err:
            print(err)
def _doc(*args, **kwargs):
    """
    Write a mock file, insert its document row, run text extraction on
    it, and return the document.

    All arguments are forwarded to `mock_osp.add_file()`.
    """
    # Write the file to the mock corpus.
    file_path = mock_osp.add_file(*args, **kwargs)

    # Register the row under the syllabus' relative path.
    row = Document.create(path=Syllabus(file_path).relative_path)

    # Run text extraction for the new row (result is not needed here).
    ext_text(row.id)

    return row
def test_text_extraction_fails(mock_osp):
    """
    ext_text() should not write a text row when the file yields no text.
    """

    # A file with empty content.
    empty_file = mock_osp.add_file(content='')
    doc = Document.create(path=empty_file)

    ext_text(doc.id)

    # No text row should have been written.
    row_count = Document_Text.select().count()
    assert row_count == 0
def test_read_format(mock_osp):
    """
    ext_format() should record the file's format as a `Document_Format`
    row linked to the document.
    """

    # Default mock file plus its document row.
    file_path = mock_osp.add_file()
    doc = Document.create(path=file_path)

    ext_format(doc.id)

    # Fetch the row written for this document.
    format_row = Document_Format.get(Document_Format.document == doc)
    assert format_row.format == 'text/plain'
def ext_format(doc_id):
    """
    Write the libmagic file format for a document.

    Args:
        doc_id (int): The document id.

    Returns:
        The value returned by `Document_Format.create()` for the new row.
    """
    document = Document.get(Document.id == doc_id)
    file_type = document.syllabus.libmagic_file_type

    return Document_Format.create(format=file_type, document=document)
def test_text_extraction_succeeds(mock_osp):
    """
    ext_text() should extract a document's text and store it in a
    `Document_Text` row linked to the document.
    """

    # A file containing 'text', plus its document row.
    file_path = mock_osp.add_file(content='text')
    doc = Document.create(path=file_path)

    ext_text(doc.id)

    # Fetch the row written for this document.
    text_row = Document_Text.get(Document_Text.document == doc)
    assert text_row.text == 'text'
def test_read_format(mock_osp):
    """
    Running ext_format() on a document should leave a `Document_Format`
    row for it holding the detected format.
    """

    # Create a mock file and register it as a document.
    source = mock_osp.add_file()
    registered = Document.create(path=source)

    ext_format(registered.id)

    # Look up the format row by its document.
    stored = Document_Format.get(Document_Format.document == registered)
    assert stored.format == 'text/plain'
def test_text_extraction_succeeds(mock_osp):
    """
    Running ext_text() on a document with content should leave a
    `Document_Text` row for it holding the extracted text.
    """

    # Create a mock file with content and register it as a document.
    source = mock_osp.add_file(content='text')
    registered = Document.create(path=source)

    ext_text(registered.id)

    # Look up the text row by its document.
    stored = Document_Text.get(Document_Text.document == registered)
    assert stored.text == 'text'
def ext_text(doc_id):
    """
    Write the document as plain text.

    Args:
        doc_id (int): The document id.

    Returns:
        The value returned by `Document_Text.create()` when the syllabus
        yields non-empty text, otherwise None (no row is written).
    """
    doc = Document.get(Document.id == doc_id)

    # Read the text once and reuse it for both the check and the insert.
    # NOTE(review): `syllabus.text` looks like a computed property, so a
    # second access may repeat the extraction -- confirm.
    text = doc.syllabus.text

    if not text:
        return None

    return Document_Text.create(text=text, document=doc)
def doc_to_inst(doc_id):
    """
    Match a document with an institution.

    Looks up the first institution whose `domain` equals the domain of
    the document's syllabus and, if one exists, creates an
    `Institution_Document` row linking the two. Does nothing when no
    institution matches.

    Args:
        doc_id (int): The document id.
    """
    doc = Document.get(Document.id == doc_id)

    same_domain = Institution.select().where(
        Institution.domain == doc.syllabus.domain
    )
    inst = same_domain.first()

    if not inst:
        return

    Institution_Document.create(institution=inst, document=doc)
def test_insert_new_documents(mock_osp):
    """
    Re-running Document.insert_documents() after files are added should
    register only the new files, leaving one row per file overall.
    """

    # Two rounds: 10 files in `000`, then 10 more in `001`. After each
    # round the table should hold one row per file added so far.
    for segment, expected_total in (("000", 10), ("001", 20)):

        for n in range(10):
            mock_osp.add_file(segment=segment, name=segment + "-" + str(n))

        Document.insert_documents()
        assert Document.select().count() == expected_total