def doc_to_fields(doc_id, radius=100):
    """
    Scan a document's text for subfield codes and record each hit.

    The Document_Text row for `doc_id` is loaded, every Subfield is searched
    against its text, and one Subfield_Document row is created for each
    subfield whose search returns a match.

    Args:
        doc_id (int): Document id used to look up the Document_Text row.
        radius (int): Number of characters kept on each side of the match
            when slicing out the snippet.
    """
    row = Document_Text.get(Document_Text.document == doc_id)
    text = row.text

    for subfield in Subfield.select():
        hit = subfield.search(text)

        # Nothing found for this subfield, so no link is written.
        if not hit:
            continue

        # Clamp the snippet window to the bounds of the text.
        lo = max(hit.start() - radius, 0)
        hi = min(hit.end() + radius, len(text))

        Subfield_Document.create(
            subfield=subfield,
            document=row.document,
            offset=hit.start(),
            snippet=crunch(text[lo:hi]),
        )
def doc_to_fields(doc_id, radius=100):
    """
    Link a document to every subfield whose code appears in its text.

    Looks up the Document_Text row for `doc_id`, runs each Subfield's search
    over the text, and writes a Subfield_Document row (with the match offset
    and a surrounding snippet) for every subfield that matches.

    Args:
        doc_id (int): Document id used to look up the Document_Text row.
        radius (int): How many characters of context to keep before and
            after the match in the stored snippet.
    """
    doc_row = Document_Text.get(Document_Text.document == doc_id)
    body = doc_row.text

    for subfield in Subfield.select():
        found = subfield.search(body)

        # Skip subfields that do not occur in this document.
        if not found:
            continue

        # Keep the context window inside the text on both ends.
        begin = max(found.start() - radius, 0)
        end = min(found.end() + radius, len(body))
        context = body[begin:end]

        Subfield_Document.create(
            subfield=subfield,
            document=doc_row.document,
            offset=found.start(),
            snippet=crunch(context),
        )
def test_unique_pairs(add_subfield, add_doc):
    """
    A second link between the same subfield and document is rejected.
    """
    subfield = add_subfield()
    document = add_doc()

    pair = dict(subfield=subfield, document=document)

    # The first link for the pair is written normally.
    Subfield_Document.create(offset=1, snippet='abc', **pair)

    # A second link for the same pair must raise, even though the offset
    # and snippet differ.
    with pytest.raises(IntegrityError):
        Subfield_Document.create(offset=2, snippet='def', **pair)
def test_no_matches(add_doc, add_subfield):
    """
    A document that mentions none of the registered fields yields no links.
    """
    # The text mentions Field2, but only Field1 is registered below.
    document = add_doc(content='abc Field2 101 def')
    add_subfield(name='Field1')

    doc_to_fields(document.id)

    # No Subfield_Document rows should have been written.
    link_count = Subfield_Document.select().count()
    assert link_count == 0
def test_matches(add_doc, add_subfield):
    """
    When a document contains a field code, write a doc->field link.
    """
    doc = add_doc(content='abc Field1 101 def Field2 101 ghi')

    sf1 = add_subfield(name='Field1')
    sf2 = add_subfield(name='Field2')

    # Field3 is registered but absent from the text; the row count below
    # would be 3 if it were linked.
    add_subfield(name='Field3')

    doc_to_fields(doc.id)

    # Should write 2 field -> doc links.
    assert Subfield_Document.select().count() == 2

    # Should match the right fields. Ask the database explicitly whether a
    # matching row exists instead of relying on the truthiness of the lazy
    # query object.
    for sf in (sf1, sf2):
        assert Subfield_Document.select().where(
            Subfield_Document.subfield == sf,
            Subfield_Document.document == doc,
        ).exists()
def test_matches(add_doc, add_subfield):
    """
    When a document contains a field code, write a doc->field link.
    """
    doc = add_doc(content='abc Field1 101 def Field2 101 ghi')

    sf1 = add_subfield(name='Field1')
    sf2 = add_subfield(name='Field2')

    # Field3 is registered but absent from the text; the row count below
    # would be 3 if it were linked.
    add_subfield(name='Field3')

    doc_to_fields(doc.id)

    # Should write 2 field -> doc links.
    assert Subfield_Document.select().count() == 2

    # Should match the right fields. Ask the database explicitly whether a
    # matching row exists instead of relying on the truthiness of the lazy
    # query object.
    for sf in (sf1, sf2):
        assert Subfield_Document.select().where(
            Subfield_Document.subfield == sf,
            Subfield_Document.document == doc,
        ).exists()
def test_character_offset(add_doc, add_subfield):
    """
    Record the character offset of the first match.
    """
    # Character positions in the content: 'abc' sits at 0-2, the space at 3,
    # and the first 'Field1' starts at 4. The second 'Field1' must not
    # affect the recorded offset.
    doc = add_doc(content='abc Field1 101 def Field1 201 ghi')
    sf1 = add_subfield(name='Field1')

    doc_to_fields(doc.id)

    # NOTE(review): the expected offset is 3 (the space before 'Field1'),
    # presumably because the subfield pattern also consumes the preceding
    # character -- confirm against Subfield.search.
    # Use .exists() so the check is an explicit database query instead of
    # relying on the truthiness of the lazy query object.
    assert Subfield_Document.select().where(
        Subfield_Document.subfield == sf1,
        Subfield_Document.document == doc,
        Subfield_Document.offset == 3,
    ).exists()
def test_character_offset(add_doc, add_subfield):
    """
    Record the character offset of the first match.
    """
    # Character positions in the content: 'abc' sits at 0-2, the space at 3,
    # and the first 'Field1' starts at 4. The second 'Field1' must not
    # affect the recorded offset.
    doc = add_doc(content='abc Field1 101 def Field1 201 ghi')
    sf1 = add_subfield(name='Field1')

    doc_to_fields(doc.id)

    # NOTE(review): the expected offset is 3 (the space before 'Field1'),
    # presumably because the subfield pattern also consumes the preceding
    # character -- confirm against Subfield.search.
    # Use .exists() so the check is an explicit database query instead of
    # relying on the truthiness of the lazy query object.
    assert Subfield_Document.select().where(
        Subfield_Document.subfield == sf1,
        Subfield_Document.document == doc,
        Subfield_Document.offset == 3,
    ).exists()
def _subfield_document(
    subfield=None,
    document=None,
    snippet='field',
    offset=100,
):
    """
    Create a Subfield_Document row, filling in any endpoint not supplied.

    Args:
        subfield: Subfield to link; when None, one is made via add_subfield().
        document: Document to link; when None, one is made via add_doc().
        snippet (str): Snippet text stored on the link.
        offset (int): Offset stored on the link.

    Returns:
        The row returned by Subfield_Document.create().
    """
    # Compare against None explicitly so that only an omitted argument
    # triggers the default, not any value that merely evaluates as falsy.
    if subfield is None:
        subfield = add_subfield()

    if document is None:
        document = add_doc()

    return Subfield_Document.create(
        subfield=subfield,
        document=document,
        offset=offset,
        snippet=snippet,
    )