def test_gz_trie_match(self):
    """Check gazetteer matches and their offsets in two sample texts.

    Builds a ``Matcher`` from ``self.gz_file`` and, for each text, compares
    the ``(surface_form, start_pos, end_pos)`` triple of every match, in
    order, against the expected list.
    """
    taxonomy_matcher = Matcher(gazetteer=self.gz_file)

    # Offsets look 0-based with an inclusive end ('chao' sits at 19-22 in
    # the first text).
    # NOTE(review): the expected offsets for the matches after the "\n" do
    # not fit the literals as written here (end_pos 49 in a 47-character
    # string, end_pos 37 in a 34-character one). The texts presumably
    # contained runs of spaces that were lost -- confirm against the
    # upstream file before trusting these fixtures.
    cases = [
        (
            'ab Foo bar Foo foo chao foo\nBar foo bar foo foo',
            [
                ('Foo bar Foo', 3, 13),
                ('chao', 19, 22),
                ('foo\nBar foo', 24, 37),
                ('bar foo foo', 39, 49),
            ],
        ),
        (
            'ab Foo Foo foo foo\nBar bar foo foo',
            [
                ('foo\nBar', 15, 21),
                ('bar foo foo', 27, 37),
            ],
        ),
    ]

    for text, expected in cases:
        # subTest reports each text separately instead of stopping at the
        # first failing one.
        with self.subTest(text=text):
            self.assertEqual(
                [
                    (match.surface_form, match.start_pos, match.end_pos)
                    for match in taxonomy_matcher.matching(text)
                ],
                expected,
            )
class ParticalMatcherTestCases(TestCase):
    """Matcher tests run against a small gazetteer stored in a temp file.

    ``setUp`` writes the gazetteer content to a named temporary file and
    ``tearDown`` removes it; each test builds its own ``Matcher`` from the
    file's path.
    """

    def setUp(self):
        """Write the gazetteer content to a named temporary file."""
        # NOTE(review): the entries below were presumably on separate lines
        # ("dutch", "dutch (flemish)", "foo bar"); the literal is kept
        # exactly as it appears here -- confirm against the upstream file.
        gz_content = '''dutch dutch (flemish) foo bar'''
        # delete=False keeps the file on disk once the ``with`` block has
        # closed it, so the tests can hand its path to Matcher; tearDown
        # is responsible for removing it.
        self.gz_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
        with self.gz_file as f:
            f.write(gz_content)

    def tearDown(self):
        """Delete the temporary gazetteer file created in ``setUp``."""
        path = pathlib.Path(self.gz_file.name)
        path.unlink()

    def _match_spans(self, text):
        """Return ``(surface_form, start_pos, end_pos)`` for each match.

        Uses ``self.matcher``, which the calling test must have set.
        """
        return [
            (match.surface_form, match.start_pos, match.end_pos)
            for match in self.matcher.matching(text)
        ]

    def test_partial_trie_match_braket(self):
        """Only the bare 'dutch' entry is reported for 'dutch (Current)'."""
        self.matcher = Matcher(gazetteer=self.gz_file.name)
        # NOTE(review): the "\ " below was presumably a backslash line
        # continuation followed by indentation; the literal is kept exactly
        # as it appears here -- confirm against the upstream file.
        text = "November 1954 Place of Birth : Rotterdam Holland Passport : \ dutch (Current) Domiciled in NZ : 47 years"
        self.assertEqual(
            self._match_spans(text),
            [
                ('dutch', 62, 66),
            ],
        )

    def test_partial_trie_match_doc_end(self):
        """'foo bar' is matched when it sits at the end of the text."""
        self.matcher = Matcher(gazetteer=self.gz_file.name)
        # NOTE(review): in the literal as written 'foo bar' occupies 1-7,
        # not the expected 2-8; the original text presumably had one more
        # leading whitespace character -- confirm against the upstream file.
        text = ''' foo bar '''
        self.assertEqual(
            self._match_spans(text),
            [
                ('foo bar', 2, 8),
            ],
        )
def test_likelihood_from_nt(self):
    """Check that matches carry the expected ``skill_likelihood`` values.

    Builds a ``Matcher`` from ``self.nt_file``, runs it over a text holding
    case and accent variants of two phrases, and asserts the number of
    matches, their surface forms, and their ``skill_likelihood`` values,
    all in order.
    """
    taxonomy_matcher = Matcher(normtable=self.nt_file)
    text = '''try this: UI programming or Ui Programming, and math tools like Mathématiques or mathematiques, or mathématiques.'''

    # Materialised once: the result is measured with len() and then
    # iterated twice below.
    matched_phrases = list(taxonomy_matcher.matching(text))

    self.assertEqual(len(matched_phrases), 5)
    self.assertEqual(
        [matched_phrase.surface_form for matched_phrase in matched_phrases],
        [
            'UI programming',
            'Ui Programming',
            'Mathématiques',
            'mathematiques',
            'mathématiques',
        ],
        'extracted phrases',
    )
    self.assertEqual(
        [
            matched_phrase.skill_likelihood
            for matched_phrase in matched_phrases
        ],
        [0.5, 0.5, 0.4, 0.4, 0.4],
        'skill_likelihood',
    )
def test_nt_trie_match(self):
    """Check matches, offsets and descriptions from the normalisation table.

    Builds a ``Matcher`` from ``self.nt_file`` and compares the
    ``(surface_form, start_pos, end_pos, code_description)`` tuple of every
    match, in order, against the expected list.
    """
    taxonomy_matcher = Matcher(normtable=self.nt_file)
    text = '''A build script is required to do the UI programming or so called user interface programming, the whole process can be managed by applying Oracle agile product lifecycle management with the help of Orale Apache Mahout expert (using Oracle agile PLM and MAHOUT). '''

    # Offsets are 0-based with an inclusive end: 'build script' is at 2-13.
    expected = [
        ('build script', 2, 13, 'Build Script'),
        ('UI programming', 37, 50, 'User Interface Programming'),
        ('user interface programming', 65, 90,
         'User Interface Programming'),
        ('Oracle agile product lifecycle management', 138, 178,
         'Oracle Agile Product Lifecycle Management'),
        ('Apache Mahout', 203, 215, 'Mahout'),
        ('Oracle agile PLM', 231, 246,
         'Oracle Agile Product Lifecycle Management'),
        ('MAHOUT', 252, 257, 'Mahout'),
    ]

    self.assertEqual(
        [
            (
                match.surface_form,
                match.start_pos,
                match.end_pos,
                match.code_description,
            )
            for match in taxonomy_matcher.matching(text)
        ],
        expected,
    )