def test_basic(self):
    """Parse two texts holding the same three sentences in a different order.

    Both texts go through one shared sequencer. The second text varies the
    case and punctuation of the sentences, yet the expected ids are the
    ones handed out while parsing the first text.
    """
    first_text = "This is a basic text. Two sentences. Maybe three?"
    second_text = "Two sentences. Maybe...three? this is a basic text."
    sequencer = PhraseSequencer(self.corpus)

    first_parse = sentence_parse(first_text, sequencer)
    second_parse = sentence_parse(second_text, sequencer)

    # Each entry pairs an id with the (start, end) offsets where it occurs.
    expected_first = [(0, [(0, 21)]), (1, [(22, 36)]), (2, [(37, 49)])]
    expected_second = [(0, [(30, 51)]), (1, [(0, 14)]), (2, [(15, 29)])]
    self.assertEqual(expected_first, first_parse)
    self.assertEqual(expected_second, second_parse)
def test_duplicated_phrases(self):
    """Check that a sentence repeated within one document shares a single id.

    The third repetition differs only in letter case ("SAME") and is still
    expected under the same id, with one (start, end) span per occurrence.
    """
    doc = 'The same sentence. The same sentence. A different sentence. The SAME sentence.'
    ingester = DocumentIngester(self.corpus)

    expected = [
        (0, [(0, 18), (19, 37), (60, 78)]),
        (1, [(38, 59)]),
    ]
    parsed = sentence_parse(doc, ingester.sequencer)
    self.assertEqual(expected, parsed)

    # No assertion follows; the test only fails here if ingest() raises.
    ingester.ingest([doc])
def test_sentence_parse(self):
    """Run sentence_parse over empty, blank, plain and whitespace-padded texts.

    Every case uses the same sequencer, so the order of the cases matters:
    later cases expect the ids that earlier cases caused to be assigned.
    """
    sequencer = PhraseSequencer(self.corpus)

    # (input text, expected parse) pairs, evaluated strictly in this order.
    cases = (
        # Empty and blank input are expected to yield nothing.
        ('', []),
        (' ', []),
        ('A simple test case. Of two sentences.',
         [(0, [(0, 19)]), (1, [(20, 37)])]),
        # Same sentences surrounded by extra whitespace: same ids, and the
        # expected offsets skip the padding.
        (' \n A simple test case. \t \t \n Of two sentences.\n',
         [(0, [(3, 22)]), (1, [(29, 46)])]),
        # One known sentence twice, with different case and punctuation.
        ('of two sentences. of two sentences?',
         [(1, [(0, 17), (18, 35)])]),
    )

    for text, expected in cases:
        self.assertEqual(expected, sentence_parse(text, sequencer))
def test_empty(self):
    """Parse an empty string and check the phrases table holds no rows."""
    sequencer = PhraseSequencer(self.corpus)
    cursor = connection.cursor()

    self.assertEqual([], sentence_parse('', sequencer))

    cursor.execute('select count(*) from phrases')
    phrase_count = cursor.fetchone()[0]
    self.assertEqual(0, phrase_count)
def test_ingester(self):
    """Record and upload documents in two rounds, checking row counts.

    Round one records two documents and uploads them. Round two uses a
    fresh ingester and sequencer to record a third document, and expects
    the document id and phrase ids to continue from the first round.
    """
    ingester = DocumentIngester(self.corpus)
    sequencer = PhraseSequencer(self.corpus)

    first_doc = 'This document has three sentences. One of which matches. Two of which do not.'
    second_doc = 'This document has only two sentences. One of which matches.'
    for text in (first_doc, second_doc):
        ingester._record_document(text, sentence_parse(text, sequencer), {})

    sequencer.upload_new_phrases()
    ingester._upload_new_documents()

    cursor = connection.cursor()

    def fetch_count(query):
        # Run a count(*) query and return its single scalar value.
        cursor.execute(query)
        return cursor.fetchone()[0]

    self.assertEqual(2, fetch_count("select count(*) from documents"))
    self.assertEqual(5, fetch_count("select count(*) from phrase_occurrences"))

    # make sure we can add on to existing data
    ingester = DocumentIngester(self.corpus)
    sequencer = PhraseSequencer(self.corpus)

    third_doc = 'This document has only two sentences. Only one of which is new.'
    third_parse = sentence_parse(third_doc, sequencer)
    third_doc_id = ingester._record_document(third_doc, third_parse, {})

    self.assertEqual(2, third_doc_id)
    # The first sentence already appeared in second_doc (id 3); only the
    # second sentence is expected to get a new id.
    self.assertEqual([(3, [(0, 37)]), (4, [(38, 63)])], third_parse)

    sequencer.upload_new_phrases()
    ingester._upload_new_documents()

    self.assertEqual(3, fetch_count("select count(*) from documents"))
    self.assertEqual(7, fetch_count("select count(*) from phrase_occurrences"))