def test_max_citations(models, add_hlom, add_doc):

    """
    Syllabi that cite more than `max_citations` texts should not
    contribute any edges.
    """

    text_a, text_b, text_c = add_hlom(), add_hlom(), add_hlom()

    small = add_doc('syllabus1')
    large = add_doc('syllabus2')

    # The first syllabus cites 2 texts, the second cites 3.
    citations = (
        (small, (text_a, text_b)),
        (large, (text_a, text_b, text_c)),
    )

    for syllabus, texts in citations:
        for text in texts:
            HLOM_Citation.create(document=syllabus, record=text)

    network = Network()
    network.add_edges(2)

    # Only the 2-citation syllabus should be registered.
    edge = network.graph.edge[text_a.control_number][text_b.control_number]
    assert edge['weight'] == 1
    assert nx.number_of_edges(network.graph) == 1
def test_matches(corpus_index, mock_hlom, add_doc, add_hlom):

    """
    When OSP documents match the query, write link rows.
    """

    # Three syllabi whose text contains the queried title and author.
    matching = [
        add_doc('War and Peace, Leo Tolstoy 1'),
        add_doc('War and Peace, Leo Tolstoy 2'),
        add_doc('War and Peace, Leo Tolstoy 3'),
    ]

    # Two syllabi with a different title, which should not be linked.
    add_doc('Anna Karenina, Leo Tolstoy 1')
    add_doc('Anna Karenina, Leo Tolstoy 2')

    Document_Text.es_insert()

    record = add_hlom('War and Peace', 'Leo Tolstoy')

    query(record.id)

    # Should write 3 citation links.
    assert HLOM_Citation.select().count() == 3

    # Should match the right documents. Ask the database explicitly whether
    # a row exists instead of relying on the truthiness of a query object.
    for doc in matching:
        assert (
            HLOM_Citation
            .select()
            .where(
                HLOM_Citation.document == doc,
                HLOM_Citation.record == record,
            )
            .exists()
        )
def test_unique_pairs(models, add_hlom, add_doc):

    """
    A given text -> syllabus pair may only be linked once.
    """

    syllabus = add_doc()
    text = add_hlom()

    link = dict(document=syllabus, record=text)

    # The first link is accepted.
    HLOM_Citation.create(**link)

    # An identical second link must be rejected.
    with pytest.raises(IntegrityError):
        HLOM_Citation.create(**link)
def test_no_matches(corpus_index, add_doc, add_hlom):

    """
    A query that matches no documents should leave the citation table
    empty.
    """

    add_doc('War and Peace, Leo Tolstoy')
    Document_Text.es_insert()

    unmatched = add_hlom('Master and Man', 'Leo Tolstoy')
    query(unmatched.id)

    # No link rows should have been written.
    link_count = HLOM_Citation.select().count()
    assert link_count == 0
def build(self):

    """
    Add a graph node for each HLOM record referenced by a citation.
    """

    # Distinct records that appear in the citation table.
    cited = HLOM_Citation.select(HLOM_Citation.record)
    cited = cited.distinct(HLOM_Citation.record)

    # Register every record under its control number, carrying the
    # title and author along as node attributes.
    for citation in ServerSide(cited):
        record = citation.record
        self.graph.add_node(
            record.control_number,
            title=record.title(),
            author=record.author(),
        )
def query(id):

    """
    Query a MARC record against the OSP corpus.

    Runs a phrase match of the record's query string against document
    bodies in Elasticsearch and writes one citation row per returned hit.

    :param id: The hlom_record row id.
    """

    row = HLOM_Record.get(HLOM_Record.id == id)

    # Execute the query.
    results = config.es.search('osp', 'document', timeout=30, body={
        'fields': ['doc_id'],
        'size': 100000,
        'filter': {
            'query': {
                'match_phrase': {
                    'body': {
                        'query': row.query,
                        'slop': 50
                    }
                }
            }
        }
    })

    # One link per hit, using the first value of the hit's `doc_id` field.
    citations = [
        {'document': hit['fields']['doc_id'][0], 'record': row.id}
        for hit in results['hits']['hits']
    ]

    # Write the citation links. Guard on the hits that actually came back
    # rather than the reported total, so an empty batch is never inserted.
    if citations:
        HLOM_Citation.insert_many(citations).execute()
def test_state_abbreviations(add_hlom, add_doc):

    """
    HLOM_Citation.index_institutions() should denormalize state
    abbreviations.
    """

    t1 = add_hlom()
    t2 = add_hlom()
    t3 = add_hlom()

    s1 = add_doc('syllabus1')
    s2 = add_doc('syllabus2')
    s3 = add_doc('syllabus3')

    c1 = HLOM_Citation.create(document=s1, record=t1)
    c2 = HLOM_Citation.create(document=s2, record=t2)
    c3 = HLOM_Citation.create(document=s3, record=t3)

    # Create institutions with states.
    AL = Institution.create(metadata={'Institution_State': 'AL'})
    CT = Institution.create(metadata={'Institution_State': 'CT'})
    CA = Institution.create(metadata={'Institution_State': 'CA'})

    # Link documents -> institutions. The link must reference the syllabus
    # documents (s1-s3), not the HLOM records (t1-t3), which are a
    # different model.
    Document_Institution.create(document=s1, institution=AL)
    Document_Institution.create(document=s2, institution=CT)
    Document_Institution.create(document=s3, institution=CA)

    HLOM_Citation.index_institutions()

    # Re-read the citations to pick up the denormalized values.
    c1 = HLOM_Citation.reload(c1)
    c2 = HLOM_Citation.reload(c2)
    c3 = HLOM_Citation.reload(c3)

    assert c1.state == 'AL'
    assert c2.state == 'CT'
    assert c3.state == 'CA'
def test_institution_ids(add_hlom, add_doc):

    """
    HLOM_Citation.index_institutions() should denormalize institution
    ids.
    """

    t1 = add_hlom()
    t2 = add_hlom()
    t3 = add_hlom()

    s1 = add_doc('syllabus1')
    s2 = add_doc('syllabus2')
    s3 = add_doc('syllabus3')

    c1 = HLOM_Citation.create(document=s1, record=t1)
    c2 = HLOM_Citation.create(document=s2, record=t2)
    c3 = HLOM_Citation.create(document=s3, record=t3)

    i1 = Institution.create()
    i2 = Institution.create()
    i3 = Institution.create()

    # Link documents -> institutions. The link must reference the syllabus
    # documents (s1-s3), not the HLOM records (t1-t3), which are a
    # different model.
    Document_Institution.create(document=s1, institution=i1)
    Document_Institution.create(document=s2, institution=i2)
    Document_Institution.create(document=s3, institution=i3)

    HLOM_Citation.index_institutions()

    # Re-read the citations to pick up the denormalized values.
    c1 = HLOM_Citation.reload(c1)
    c2 = HLOM_Citation.reload(c2)
    c3 = HLOM_Citation.reload(c3)

    assert c1.institution == i1
    assert c2.institution == i2
    assert c3.institution == i3
def test_add_edges(models, add_hlom, add_doc):

    """
    Network#add_edges() should build weighted edges out of the rows in
    the citation table.
    """

    texts = [add_hlom() for _ in range(6)]

    syllabi = [
        add_doc('syllabus1'),
        add_doc('syllabus2'),
        add_doc('syllabus3'),
    ]

    # Each syllabus cites a sliding window of 4 texts:
    # texts 1-4 in s1, texts 2-5 in s2, texts 3-6 in s3.
    for offset, syllabus in enumerate(syllabi):
        for text in texts[offset:offset + 4]:
            HLOM_Citation.create(document=syllabus, record=text)

    network = Network()
    network.add_edges()

    # Neighboring texts share 1, 2, 3, 2 and 1 syllabi respectively.
    expected_weights = [1, 2, 3, 2, 1]
    neighbors = zip(texts, texts[1:])

    for (left, right), weight in zip(neighbors, expected_weights):
        edge = network.graph.edge[left.control_number][right.control_number]
        assert edge['weight'] == weight