def test_spacy_rules():
    """Run a spaCy rule set against a test sentence taken from the request body.

    The JSON body is read with ``request.get_json(force=True)`` and passed to
    ``SpacyRuleExtractor`` as the rule definition; it must also carry a
    ``'test_text'`` entry holding the text to run the rules on.
    NOTE(review): presumably registered as a Flask view elsewhere - confirm.

    Returns:
        A ``(body, status)`` tuple. On success ``body`` is the request object
        serialized as JSON with two added keys, ``'test_tokens'`` (index,
        trailing whitespace and text of every token) and ``'results'`` (one
        entry per extraction), and ``status`` is 201. On any exception
        ``body`` is ``{"message": "exception: ..."}`` and ``status`` is 400.
    """
    try:
        obj = request.get_json(force=True)
        rule_extractor = SpacyRuleExtractor(
            etk.default_nlp, obj, "test_extractor")

        # Echo the tokenization back so the caller can see token boundaries.
        tokens = rule_extractor.tokenizer.tokenize_to_spacy_doc(obj['test_text'])
        obj['test_tokens'] = [
            {
                'index': t.i,
                'whitespace': t.whitespace_,
                'text': t.text
            }
            for t in tokens
        ]

        obj['results'] = [
            {
                'confidence': extraction.confidence,
                'start_token': extraction.provenance['start_token'],
                'end_token': extraction.provenance['end_token'],
                'start_char': extraction.provenance['start_char'],
                'end_char': extraction.provenance['end_char'],
                'identifier': extraction.rule_id,
                'text': extraction.value,
                'token_based_match_mapping': extraction.token_based_match_mapping
            }
            for extraction in rule_extractor.extract(obj['test_text'])
        ]

        return json.dumps(obj), 201
    except Exception as e:
        print(e)
        # Python 3 exceptions have no ``.message`` attribute; reading it here
        # would raise AttributeError inside the handler and hide the real
        # error. Formatting the exception itself yields its message text.
        return json.dumps({'message': 'exception: {}'.format(e)}), 400
def test_SpacyRuleExtractor_word_2(self) -> None:
    """Apply the 'test_SpacyRuleExtractor_word_2' rules and check the matches.

    The (rule_id, value) pairs of the extractions must equal the expected list.
    """
    extractor = SpacyRuleExtractor(
        self.nlp, rules["test_SpacyRuleExtractor_word_2"], "test_extractor")
    text = "version 2 of etk, implemented by Runqi Shao, Dongyu Li, Sylvia lin, Amandeep and others."

    found = [(item.rule_id, item.value) for item in extractor.extract(text)]

    self.assertEqual(found, [('rule_0', 'Name: Sylvia lin')])
def test_SpacyRuleExtractor_number_1(self) -> None:
    """Apply the 'test_SpacyRuleExtractor_number_1' rules to a list of numbers.

    The (rule_id, value) pairs of the extractions must equal the expected list.
    """
    extractor = SpacyRuleExtractor(
        self.nlp, rules["test_SpacyRuleExtractor_number_1"], "test_extractor")
    text = "Extract from the following number: 32 12 54435 23 665.3 34 65.42 23 4545"

    found = [(item.rule_id, item.value) for item in extractor.extract(text)]

    self.assertEqual(found, [('rule_0', '665.3'), ('rule_0', '4545')])
def test_SpacyRuleExtractor_linebreak_1(self) -> None:
    """Apply the 'test_SpacyRuleExtractor_linebreak_1' rules to text with newlines.

    The input contains a run of three line breaks; the (rule_id, value) pairs
    of the extractions must equal the expected list.
    """
    extractor = SpacyRuleExtractor(
        self.nlp, rules["test_SpacyRuleExtractor_linebreak_1"], "test_extractor")
    text = "version 2 of etk, implemented by Rq? Shao. DongYu94 Li, \n\n\n Sylvia-lin, Amandeep and others."

    found = [(item.rule_id, item.value) for item in extractor.extract(text)]

    self.assertEqual(found, [('rule_0', 'Length 3 linebreak: \n\n\n ')])
def test_SpacyRuleExtractor_shape_1(self) -> None:
    """Apply the 'test_SpacyRuleExtractor_shape_1' rules and check the matches.

    The (rule_id, value) pairs of the extractions must equal the expected list.
    """
    extractor = SpacyRuleExtractor(
        self.nlp, rules["test_SpacyRuleExtractor_shape_1"], "test_extractor")
    text = "version 2 of etk, implemented by RqS, DongYu94 Li, Sylvia lin, Amandeep and others."

    found = [(item.rule_id, item.value) for item in extractor.extract(text)]

    self.assertEqual(found, [('rule_0', 'RqS'), ('rule_0', 'DongYu94')])
def test_SpacyRuleExtractor_punc_1(self) -> None:
    """Apply the 'test_SpacyRuleExtractor_punc_1' rules to punctuated names.

    The (rule_id, value) pairs of the extractions must equal the expected list.
    """
    extractor = SpacyRuleExtractor(
        self.nlp, rules["test_SpacyRuleExtractor_punc_1"], "test_extractor")
    text = "version 2 of etk, implemented by Rq? Shao. DongYu94 Li, Sylvia-lin, Amandeep and others."

    found = [(item.rule_id, item.value) for item in extractor.extract(text)]

    wanted = [
        ('rule_0', 'Name: Rq, Shao'),
        ('rule_0', 'Name: Sylvia, lin'),
    ]
    self.assertEqual(found, wanted)
def test_SpacyRuleExtractor_word_5(self) -> None:
    """Apply the 'test_SpacyRuleExtractor_word_5' rules and check the output.

    The (rule_id, value) pairs of the extractions must equal the expected
    list of formatted first-name / last-name / full-name strings.
    """
    extractor = SpacyRuleExtractor(
        self.nlp, rules["test_SpacyRuleExtractor_word_5"], "test_extractor")
    text = "version 2 of etk, implemented by Runqi Shao, DongYu94 Li, Sylvia lin, Amandeep and others."

    found = [(item.rule_id, item.value) for item in extractor.extract(text)]

    wanted = [
        ('rule_0', 'First Name: Runqi, Last Name: Shao. Full name: Runqi Shao'),
        ('rule_0', 'First Name: DongYu94, Last Name: Li. Full name: DongYu94 Li'),
    ]
    self.assertEqual(found, wanted)
def test_SpacyRuleExtractor(self) -> None:
    """Run the sample spaCy rules over the title of the ground-truth news page.

    Reads ``news.html`` and ``sample_spacy_rule.json`` from
    ``etk/unit_tests/ground_truth``, extracts the page title with
    ``HTMLMetadataExtractor``, applies the rules to that title and asserts
    that the first extraction's value is ``'Trump'``.
    """
    hme = HTMLMetadataExtractor()
    with open('etk/unit_tests/ground_truth/news.html', 'r') as f:
        sample_html = f.read()

    # Load the rules inside a context manager so the file handle is closed
    # right away instead of being left open for the garbage collector.
    with open('etk/unit_tests/ground_truth/sample_spacy_rule.json') as rules_file:
        sample_rules = json.load(rules_file)

    title_extraction = hme.extract(sample_html, extract_title=True)[0].value

    sample_rule_extractor = SpacyRuleExtractor(
        spacy.load("en_core_web_sm"), sample_rules, "dummy")
    extractions = sample_rule_extractor.extract(title_extraction)

    expected_extraction = 'Trump'
    self.assertEqual(extractions[0].value, expected_extraction)
def test_SpacyRuleExtractor(self) -> None:
    """Check an inline two-rule definition against a sample sentence.

    ``rule_3`` is a two-token word pattern whose second token is not
    required; ``rule_4`` is a single number token with minimum "0" and
    maximum "5". The (rule_id, value) pairs of the extractions must equal
    the expected list.
    """
    # First token of rule_3: required.
    first_name_token = {
        "capitalization": ["title"],
        "contain_digit": "true",
        "is_in_output": "true",
        "is_in_vocabulary": "false",
        "is_out_of_vocabulary": "false",
        "is_required": "true",
        "length": [],
        "match_all_forms": "true",
        "maximum": "",
        "minimum": "",
        "numbers": [],
        "part_of_speech": [],
        "prefix": "",
        "shapes": [],
        "suffix": "",
        "token": [],
        "type": "word"
    }
    # Second token of rule_3: optional.
    last_name_token = {
        "capitalization": ["title"],
        "contain_digit": "false",
        "is_in_output": "true",
        "is_in_vocabulary": "false",
        "is_out_of_vocabulary": "false",
        "is_required": "false",
        "length": [],
        "match_all_forms": "true",
        "maximum": "",
        "minimum": "",
        "numbers": [],
        "part_of_speech": [],
        "prefix": "",
        "shapes": [],
        "suffix": "",
        "token": [],
        "type": "word"
    }
    # Only token of rule_4.
    number_token = {
        "capitalization": [],
        "contain_digit": "true",
        "is_in_output": "true",
        "is_in_vocabulary": "false",
        "is_out_of_vocabulary": "false",
        "is_required": "true",
        "length": [],
        "match_all_forms": "true",
        "maximum": "5",
        "minimum": "0",
        "numbers": [],
        "part_of_speech": [],
        "prefix": "",
        "shapes": [],
        "suffix": "",
        "token": [],
        "type": "number"
    }

    name_rule = {
        "dependencies": [],
        "description": "",
        "identifier": "rule_3",
        "is_active": "true",
        "output_format": "firstName:{1}, lastName:{2}",
        "pattern": [first_name_token, last_name_token],
        "polarity": "true"
    }
    number_rule = {
        "dependencies": [],
        "description": "",
        "identifier": "rule_4",
        "is_active": "true",
        "output_format": "number:{1}",
        "pattern": [number_token],
        "polarity": "true"
    }
    rule_spec = {
        "field_name": "test",
        "rules": [name_rule, number_rule]
    }

    extractor = SpacyRuleExtractor(
        spacy.load("en_core_web_sm"), rule_spec, "test_extractor")
    text = "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others."

    found = [(item.rule_id, item.value) for item in extractor.extract(text)]

    wanted = [
        ('rule_4', 'number:2'),
        ('rule_3', 'firstName:Runqi12, lastName:Shao'),
        ('rule_3', 'firstName:Dongyu, lastName:Li'),
        ('rule_3', 'firstName:Sylvia, lastName:{2}'),
        ('rule_3', 'firstName:Amandeep, lastName:{2}'),
    ]
    self.assertEqual(found, wanted)