def test_insert_documents(mock_osp):
    """
    Document.insert_documents() should create one row for every file that
    was added to the mock corpus.
    """
    # 10 segments x 10 files, each file named [segment]-[index].
    files = [
        (segment, segment + "-" + str(index))
        for segment in segment_range(10)
        for index in range(10)
    ]

    for segment, name in files:
        mock_osp.add_file(segment=segment, name=name)

    # Register the files as document rows.
    Document.insert_documents()

    # One row per file.
    assert Document.select().count() == 100

    # Every file should be findable by its [segment]/[file] path.
    for segment, name in files:
        path = segment + "/" + name
        matching = Document.select().where(Document.path == path)
        assert matching.count() == 1
def queue_text():
    """
    Enqueue one `ext_text` job per document on the worker queue.
    """
    documents = query_bar(Document.select())

    for document in documents:
        # Only the id is handed to the job, not the row itself.
        config.rq.enqueue(ext_text, document.id)
def run_doc_to_fields():
    """
    Match documents -> fields.

    Calls doc_to_fields() with the id of every document. The pass is
    best-effort: when a document fails, the error is printed and the loop
    continues with the next document.
    """
    for doc in query_bar(Document.select()):
        try:
            doc_to_fields(doc.id)
        except Exception as e:
            # Exception, not a bare except: KeyboardInterrupt and SystemExit
            # must still be able to stop the run.
            print(e)
def run_doc_to_inst():
    """
    Match documents -> institutions.

    Calls doc_to_inst() with the id of every document. The pass is
    best-effort: when a document fails, the error is printed and the loop
    continues with the next document.
    """
    for doc in query_bar(Document.select()):
        try:
            doc_to_inst(doc.id)
        except Exception as e:
            # Exception, not a bare except: KeyboardInterrupt and SystemExit
            # must still be able to stop the run.
            print(e)
def link(cls):
    """
    Link documents -> institutions.

    Groups the institutions by the domain of their URL. Then, for each
    document, the syllabus URL is searched with the patterns of the
    institutions that share its domain. When at least one pattern matches,
    a row is created linking the document to the institution whose pattern
    matched the longest piece of the URL.

    An error raised while handling a document is printed and that document
    is skipped.
    """
    # Map domain -> [(regex, inst), ...]
    domain_to_inst = defaultdict(list)

    for inst in ServerSide(Institution.select()):
        domain = parse_domain(inst.url)
        regex = seed_to_regex(inst.url)
        domain_to_inst[domain].append((regex, inst))

    for doc in query_bar(Document.select()):
        try:
            # TODO: Get rid of @property.
            url = doc.syllabus.url
            domain = parse_domain(url)

            # Look up with .get(): indexing a defaultdict would insert an
            # empty list for every document domain that has no institution.
            candidates = domain_to_inst.get(domain, ())

            # Find institutions with matching URLs.
            matches = []
            for pattern, inst in candidates:
                match = pattern.search(url)
                if match:
                    matches.append((match.group(), inst))

            if matches:
                # Link to the institution with the longest match. On a tie
                # max() keeps the earliest candidate, which is also what a
                # stable descending sort would put first.
                _, best_inst = max(matches, key=lambda x: len(x[0]))

                cls.create(
                    institution=best_inst,
                    document=doc,
                )

        except Exception as e:
            print(e)
def test_insert_new_documents(mock_osp):
    """
    Running the insert again after more files are added to the corpus
    should grow the document count by the number of new files only.
    """
    # Two rounds: 10 files in `000`, then 10 more in `001`. The expected
    # total is the row count after each round's insert.
    rounds = (("000", 10), ("001", 20))

    for segment, expected_total in rounds:
        for index in range(10):
            mock_osp.add_file(
                segment=segment,
                name=segment + "-" + str(index),
            )

        # Should add 10 docs on top of whatever is already registered.
        Document.insert_documents()
        assert Document.select().count() == expected_total