def test_match_paths(add_doc, add_institution):
    """
    Documents whose URLs carry a path are matched "greedily": each one
    should end up linked to the institution sharing the longest path.
    """
    institution_urls = [
        'http://yale.edu',
        'http://yale.edu/p1',
        'http://yale.edu/p1/p2',
        'http://yale.edu/p1/p2/p3',
    ]
    document_urls = [
        'http://yale.edu/syllabus.pdf',
        'http://yale.edu/p1/syllabus.pdf',
        'http://yale.edu/p1/p2/syllabus.pdf',
        'http://yale.edu/p1/p2/p3/syllabus.pdf',
    ]

    # All institutions are created before any document.
    institutions = [add_institution(url=url) for url in institution_urls]
    documents = [add_doc(log={'url': url}) for url in document_urls]

    Institution_Document.link()

    # The n-th document pairs with the n-th institution.
    for inst, doc in zip(institutions, documents):
        link_query = Institution_Document.select().where(
            Institution_Document.institution == inst,
            Institution_Document.document == doc,
        )
        assert link_query
def test_link(add_doc, add_institution):
    """
    Institution_Document.link() should pair each document with its
    institution.
    """
    institution_urls = [
        'http://d1.edu',
        'http://d2.edu',
        'http://d3.edu',
    ]
    document_urls = [
        'http://d1.edu/syllabus.pdf',
        'http://d2.edu/syllabus.pdf',
        'http://d3.edu/syllabus.pdf',
    ]

    # Institutions first, then documents.
    institutions = [add_institution(url=url) for url in institution_urls]
    documents = [add_doc(log={'url': url}) for url in document_urls]

    Institution_Document.link()

    for inst, doc in zip(institutions, documents):
        link_query = Institution_Document.select().where(
            Institution_Document.institution == inst,
            Institution_Document.document == doc,
        )
        assert link_query
def test_index_institution_refs(add_citation, add_institution):
    """
    A citation whose document is linked to an institution should be
    indexed with that institution's id, state, and country.
    """
    citation = add_citation()
    institution = add_institution(state='CA', country='US')

    # Attach the citation's document to the institution.
    Institution_Document.create(
        institution=institution,
        document=citation.document,
    )

    Citation_Index.es_insert()

    # Fetch the indexed citation and inspect its stored fields.
    indexed = config.es.get(index='citation', id=citation.id)
    source = indexed['_source']

    assert source['institution_id'] == institution.id
    assert source['state'] == 'CA'
    assert source['country'] == 'US'
def doc_to_inst():
    """
    Link documents -> institutions.

    Thin entry point that delegates to Institution_Document.link().
    """
    Institution_Document.link()
def test_match(add_doc, add_institution):
    """
    A link should be written when the document URL matches an
    institution's domain.
    """
    doc = add_doc(log=dict(url='http://yale.edu/syllabus.pdf'))

    yale = add_institution(name='Yale University', domain='yale.edu')

    # A second institution on a different domain.
    add_institution(name='Harvard University', domain='harvard.edu')

    doc_to_inst(doc.id)

    # Exactly one link row should exist.
    link_count = Institution_Document.select().count()
    assert link_count == 1

    # That row should join the document to Yale.
    yale_link = Institution_Document.select().where(
        Institution_Document.institution == yale,
        Institution_Document.document == doc,
    )
    assert yale_link
def test_match_subdomains(add_doc, add_institution):
    """
    Documents on subdomains are matched "greedily": each one should be
    linked to the institution that shares the most subdomains.
    """
    institution_urls = [
        'http://yale.edu',
        'http://sub1.yale.edu',
        'http://sub2.sub1.yale.edu',
        'http://sub3.sub2.sub1.yale.edu',
    ]
    document_urls = [
        'http://yale.edu/syllabus.pdf',
        'http://sub1.yale.edu/syllabus.pdf',
        'http://sub2.sub1.yale.edu/syllabus.pdf',
        'http://sub3.sub2.sub1.yale.edu/syllabus.pdf',
    ]

    # All institutions are created before any document.
    institutions = [add_institution(url=url) for url in institution_urls]
    documents = [add_doc(log={'url': url}) for url in document_urls]

    Institution_Document.link()

    # The n-th document pairs with the n-th institution.
    for inst, doc in zip(institutions, documents):
        link_query = Institution_Document.select().where(
            Institution_Document.institution == inst,
            Institution_Document.document == doc,
        )
        assert link_query
def test_institution(add_doc, add_institution, add_citation):
    """
    Citation#institution should expose the institution linked to the
    citation's document.
    """
    doc = add_doc()
    inst = add_institution()

    # Join the institution and the document.
    Institution_Document.create(institution=inst, document=doc)

    citation = add_citation(document=doc)

    linked = citation.institution
    assert linked.id == inst.id
def test_unique_pairs(add_doc, add_institution):
    """
    A second link for the same doc -> inst pair should be rejected.
    """
    inst = add_institution()
    doc = add_doc()

    pair = dict(institution=inst, document=doc)

    # The first link goes in.
    Institution_Document.create(**pair)

    # The identical pair should raise.
    with pytest.raises(IntegrityError):
        Institution_Document.create(**pair)
def doc_to_inst(doc_id):
    """
    Link a document to the institution that shares its domain.

    Looks the document up by id, takes the first institution whose domain
    equals the document's syllabus domain, and writes a link row. Nothing
    is written when no institution matches.
    """
    doc = Document.get(Document.id == doc_id)

    domain_matches = Institution.select().where(
        Institution.domain == doc.syllabus.domain
    )
    inst = domain_matches.first()

    # No institution on this domain - nothing to link.
    if not inst:
        return

    Institution_Document.create(institution=inst, document=doc)
def test_merge_included(add_institution, add_citation):
    """
    Institution ids passed via `include` that fall outside the default
    page should be merged into the baseline ranking.
    """
    names = ['Institution 1', 'Institution 2', 'Institution 3']
    i1, i2, i3 = [add_institution(name=name) for name in names]

    # (institution, number of citations to attach)
    for inst, citation_count in [(i1, 3), (i2, 2), (i3, 1)]:
        for _ in range(citation_count):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()
    Institution_Index.es_insert()

    facets = institution_facets(depth=2, include=[i2.id, i3.id])

    assert facets == [
        {'label': 'Institution 1', 'value': i1.id, 'count': 3},
        # Already within the depth - must not appear twice.
        {'label': 'Institution 2', 'value': i2.id, 'count': 2},
        # Outside the depth - added at the end.
        {'label': 'Institution 3', 'value': i3.id, 'count': 1},
    ]
def test_institution_facets(add_institution, add_citation):
    """
    institution_facets() should return label/value/count dicts.
    """
    names = ['Institution 1', 'Institution 2', 'Institution 3']
    i1, i2, i3 = [add_institution(name=name) for name in names]

    # (institution, number of citations to attach)
    for inst, citation_count in [(i1, 3), (i2, 2), (i3, 1)]:
        for _ in range(citation_count):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()
    Institution_Index.es_insert()

    facets = institution_facets()

    assert facets == [
        {'label': 'Institution 1', 'value': i1.id, 'count': 3},
        {'label': 'Institution 2', 'value': i2.id, 'count': 2},
        {'label': 'Institution 3', 'value': i3.id, 'count': 1},
    ]
def test_state_facets(add_institution, add_citation):
    """
    state_facets() should return label/value/count dicts.
    """
    i1, i2, i3 = [add_institution(state=code) for code in ['CA', 'AL', 'MA']]

    # (institution, number of citations to attach)
    for inst, citation_count in [(i1, 3), (i2, 2), (i3, 1)]:
        for _ in range(citation_count):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    facets = state_facets()

    assert facets == [
        {'label': 'California', 'value': i1.state, 'count': 3},
        {'label': 'Alabama', 'value': i2.state, 'count': 2},
        {'label': 'Massachusetts', 'value': i3.state, 'count': 1},
    ]
def test_filter_country(add_text, add_citation, add_institution):
    """
    The country filter should be applied as a keyword value.
    """
    t1, t2, t3 = [add_text() for _ in range(3)]

    i1 = add_institution(country='USA')
    i2 = add_institution(country='CAN')

    # (text, institution, number of citations to create)
    plan = [
        (t1, i1, 3),
        (t2, i1, 2),
        (t3, i2, 1),
    ]

    for text, inst, citation_count in plan:
        for _ in range(citation_count):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    ranks = Citation_Index.compute_ranking({'country': 'USA'})

    # Only the texts cited under the 'USA' institution are expected.
    assert ranks == {
        str(t1.id): 3,
        str(t2.id): 2,
    }
def test_filter_institution(add_text, add_citation, add_institution):
    """
    The ranking should be filterable by institution id.
    """
    t1, t2, t3 = [add_text() for _ in range(3)]

    i1 = add_institution()
    i2 = add_institution()

    # (text, institution, number of citations to create)
    plan = [
        (t1, i1, 3),
        (t2, i1, 2),
        (t3, i2, 1),
    ]

    for text, inst, citation_count in plan:
        for _ in range(citation_count):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    ranks = Citation_Index.compute_ranking({'institution_id': i1.id})

    # Only the texts cited under the first institution are expected.
    assert ranks == {
        str(t1.id): 3,
        str(t2.id): 2,
    }
def test_country_facets(add_institution, add_citation):
    """
    country_facets() should return label/value/count dicts.
    """
    i1, i2, i3 = [
        add_institution(country=code) for code in ['AU', 'CA', 'NZ']
    ]

    # (institution, number of citations to attach)
    for inst, citation_count in [(i1, 3), (i2, 2), (i3, 1)]:
        for _ in range(citation_count):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    facets = country_facets()

    assert facets == [
        {'label': 'Australia', 'value': i1.country, 'count': 3},
        {'label': 'Canada', 'value': i2.country, 'count': 2},
        {'label': 'New Zealand', 'value': i3.country, 'count': 1},
    ]
def test_merge_included_facets(add_institution, add_citation):
    """
    Included facets that are already present in the ranking should not
    be duplicated.
    """
    names = ['Institution 1', 'Institution 2', 'Institution 3']
    i1, i2, i3 = [add_institution(name=name) for name in names]

    # (institution, number of citations to attach)
    for inst, citation_count in [(i1, 3), (i2, 2), (i3, 1)]:
        for _ in range(citation_count):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    counts = Citation_Index.count_facets(
        'institution_id',
        include=[i2.id, i3.id],
    )

    # Institutions 2 and 3 each appear exactly once.
    assert counts == [
        (i1.id, 3),
        (i2.id, 2),
        (i3.id, 1),
    ]
def test_filter_country(add_text, add_citation, add_institution):
    """
    Filtering on country should treat the value as a keyword.
    """
    t1 = add_text()
    t2 = add_text()
    t3 = add_text()

    usa = add_institution(country='USA')
    can = add_institution(country='CAN')

    # (text, institution, number of citations to create)
    for text, inst, citation_count in [(t1, usa, 3), (t2, usa, 2), (t3, can, 1)]:
        for _ in range(citation_count):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    filters = {'country': 'USA'}
    ranks = Citation_Index.compute_ranking(filters)

    expected = {str(t1.id): 3, str(t2.id): 2}
    assert ranks == expected
def test_filter_institution(add_text, add_citation, add_institution):
    """
    Filtering by institution id should restrict the ranking.
    """
    t1 = add_text()
    t2 = add_text()
    t3 = add_text()

    i1 = add_institution()
    i2 = add_institution()

    # (text, institution, number of citations to create)
    for text, inst, citation_count in [(t1, i1, 3), (t2, i1, 2), (t3, i2, 1)]:
        for _ in range(citation_count):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    filters = {'institution_id': i1.id}
    ranks = Citation_Index.compute_ranking(filters)

    expected = {str(t1.id): 3, str(t2.id): 2}
    assert ranks == expected
def test_append_included_facets(add_institution, add_citation):
    """
    "Included" facets whose counts place them below the baseline ranking
    should be appended to the bottom of the list.
    """
    names = [
        'Institution 1',
        'Institution 2',
        'Institution 3',
        'Institution 4',
    ]
    i1, i2, i3, i4 = [add_institution(name=name) for name in names]

    # (institution, number of citations to attach)
    for inst, citation_count in [(i1, 4), (i2, 3), (i3, 2), (i4, 1)]:
        for _ in range(citation_count):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    counts = Citation_Index.count_facets(
        'institution_id',
        include=[i3.id, i4.id],
        depth=2,
    )

    assert counts == [
        (i1.id, 4),
        (i2.id, 3),
        # The two included institutions follow the top two.
        (i3.id, 2),
        (i4.id, 1),
    ]
def doc_to_inst(out_file, n):
    """
    Write institution -> document matches to `out_file` as CSV.

    A header row is written first, followed by one row per match with the
    columns `inst_url` and `doc_url`. The match query is limited to `n`.
    """
    writer = csv.DictWriter(out_file, ['inst_url', 'doc_url'])
    writer.writeheader()

    # One CSV row per link, capped at n.
    for match in Institution_Document.select().limit(n):
        writer.writerow({
            'inst_url': match.institution.url,
            'doc_url': match.document.syllabus.url,
        })
def test_no_match(add_doc, add_institution):
    """
    No row should be written when the document URL matches no
    institution.
    """
    doc = add_doc(log=dict(url='http://yale.edu/syllabus.pdf'))

    # The only institution is on a different domain.
    add_institution(name='Harvard University', domain='harvard.edu')

    doc_to_inst(doc.id)

    # The link table should stay empty.
    link_count = Institution_Document.select().count()
    assert link_count == 0