def test_spacy_rules():
    """
    Run a posted spaCy rule set against the sample text it carries.

    The JSON body of the current request is handed to ``SpacyRuleExtractor``
    as the rule definition; its ``test_text`` entry is tokenized and run
    through the extractor. The same object is returned with two keys added:

    * ``test_tokens`` - one entry per token (index, trailing whitespace, text)
    * ``results`` - one entry per extraction (confidence, token/char offsets,
      rule identifier, extracted text and the token-based match mapping)

    Returns:
        A ``(json_string, status_code)`` pair: status 201 with the enriched
        object on success, status 400 with ``{'message': 'exception: ...'}``
        when anything raises.
    """
    try:
        obj = request.get_json(force=True)
        rule_extractor = SpacyRuleExtractor(
            etk.default_nlp, obj, "test_extractor")

        tokens = rule_extractor.tokenizer.tokenize_to_spacy_doc(obj['test_text'])
        obj['test_tokens'] = [
            {
                'index': t.i,
                'whitespace': t.whitespace_,
                'text': t.text
            }
            for t in tokens
        ]

        obj['results'] = [
            {
                'confidence': extraction.confidence,
                'start_token': extraction.provenance['start_token'],
                'end_token': extraction.provenance['end_token'],
                'start_char': extraction.provenance['start_char'],
                'end_char': extraction.provenance['end_char'],
                'identifier': extraction.rule_id,
                'text': extraction.value,
                'token_based_match_mapping': extraction.token_based_match_mapping
            }
            for extraction in rule_extractor.extract(obj['test_text'])
        ]

        return json.dumps(obj), 201
    except Exception as e:
        # Boundary handler: any failure is reported to the client as a 400.
        print(e)
        # Python 3 exceptions have no `.message` attribute; reading it here
        # would raise AttributeError and mask the original error. Formatting
        # the exception object itself yields its message text.
        return json.dumps({'message': 'exception: {}'.format(e)}), 400
def test_SpacyRuleExtractor_word_2(self) -> None:
    """Run the ``test_SpacyRuleExtractor_word_2`` rule set and compare the (rule_id, value) pairs."""
    rule_set = rules["test_SpacyRuleExtractor_word_2"]
    extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")

    text = "version 2 of etk, implemented by Runqi Shao, Dongyu Li, Sylvia lin, Amandeep and others."
    found = [(hit.rule_id, hit.value) for hit in extractor.extract(text)]

    # Only one extraction is expected for this sentence.
    self.assertEqual(found, [('rule_0', 'Name: Sylvia lin')])
def test_SpacyRuleExtractor_number_1(self) -> None:
    """Run the ``test_SpacyRuleExtractor_number_1`` rule set over a list of numbers and compare the (rule_id, value) pairs."""
    rule_set = rules["test_SpacyRuleExtractor_number_1"]
    extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")

    text = "Extract from the following number: 32 12 54435 23 665.3 34 65.42 23 4545"
    found = [(hit.rule_id, hit.value) for hit in extractor.extract(text)]

    # Only two of the numbers in the sentence are expected back.
    wanted = [
        ('rule_0', '665.3'),
        ('rule_0', '4545'),
    ]
    self.assertEqual(found, wanted)
def test_SpacyRuleExtractor_linebreak_1(self) -> None:
    """Run the ``test_SpacyRuleExtractor_linebreak_1`` rule set on text containing a run of three newlines."""
    rule_set = rules["test_SpacyRuleExtractor_linebreak_1"]
    extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")

    text = "version 2 of etk, implemented by Rq? Shao. DongYu94 Li, \n\n\n Sylvia-lin, Amandeep and others."
    found = [(hit.rule_id, hit.value) for hit in extractor.extract(text)]

    # The expected value keeps the newline run and its trailing space verbatim.
    self.assertEqual(found, [('rule_0', 'Length 3 linebreak: \n\n\n ')])
def test_SpacyRuleExtractor_shape_1(self) -> None:
    """Run the ``test_SpacyRuleExtractor_shape_1`` rule set and compare the (rule_id, value) pairs."""
    rule_set = rules["test_SpacyRuleExtractor_shape_1"]
    extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")

    text = "version 2 of etk, implemented by RqS, DongYu94 Li, Sylvia lin, Amandeep and others."
    found = [(hit.rule_id, hit.value) for hit in extractor.extract(text)]

    wanted = [
        ('rule_0', 'RqS'),
        ('rule_0', 'DongYu94'),
    ]
    self.assertEqual(found, wanted)
def test_SpacyRuleExtractor_punc_1(self) -> None:
    """Run the ``test_SpacyRuleExtractor_punc_1`` rule set on text with punctuation between name parts."""
    rule_set = rules["test_SpacyRuleExtractor_punc_1"]
    extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")

    text = "version 2 of etk, implemented by Rq? Shao. DongYu94 Li, Sylvia-lin, Amandeep and others."
    found = [(hit.rule_id, hit.value) for hit in extractor.extract(text)]

    wanted = [
        ('rule_0', 'Name: Rq, Shao'),
        ('rule_0', 'Name: Sylvia, lin'),
    ]
    self.assertEqual(found, wanted)
def test_SpacyRuleExtractor_word_5(self) -> None:
    """Run the ``test_SpacyRuleExtractor_word_5`` rule set and compare the formatted name extractions."""
    rule_set = rules["test_SpacyRuleExtractor_word_5"]
    extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")

    text = "version 2 of etk, implemented by Runqi Shao, DongYu94 Li, Sylvia lin, Amandeep and others."
    found = [(hit.rule_id, hit.value) for hit in extractor.extract(text)]

    wanted = [
        ('rule_0', 'First Name: Runqi, Last Name: Shao. Full name: Runqi Shao'),
        ('rule_0', 'First Name: DongYu94, Last Name: Li. Full name: DongYu94 Li'),
    ]
    self.assertEqual(found, wanted)
def test_SpacyRuleExtractor(self) -> None:
    """
    Extract the title from the ground-truth news page, run the sample spaCy
    rules over that title, and check that the first extraction is 'Trump'.
    """
    hme = HTMLMetadataExtractor()

    with open('etk/unit_tests/ground_truth/news.html', 'r') as f:
        sample_html = f.read()

    # Load the rules inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open('etk/unit_tests/ground_truth/sample_spacy_rule.json') as rule_file:
        sample_rules = json.load(rule_file)

    title_extraction = hme.extract(sample_html, extract_title=True)[0].value

    sample_rule_extractor = SpacyRuleExtractor(
        spacy.load("en_core_web_sm"), sample_rules, "dummy")
    extractions = sample_rule_extractor.extract(title_extraction)

    expected_extraction = 'Trump'
    self.assertEqual(extractions[0].value, expected_extraction)
def __init__(self, etk):
    """Initialise the base module, then build a rule extractor from the bundled sample rules."""
    ETKModule.__init__(self, etk)

    # NOTE(review): `self.etk` is presumably set by ETKModule.__init__ - confirm.
    rules_path = "./extraction_modules/resources/sample_rules.json"
    loaded_rules = self.etk.load_spacy_rule(rules_path)
    nlp = self.etk.default_nlp

    self.sample_rule_extractor = SpacyRuleExtractor(nlp, loaded_rules, "test_extractor")
def _extract_relative_dates(self, text: str) -> List[Extraction]:
    """
    Extract relative dates using spaCy rules

    Each match of the relative-date rule set is split into a direction, a
    measure and a unit (which of these are present, and in what order, depends
    on the rule that matched). They are turned into a ``relativedelta`` that is
    added to the base date, and the result is post-processed and wrapped.

    Args:
        text: str - the text to extract the relative date strings from

    Returns: List of Extraction(s); empty when ``text`` is empty or no ETK
        instance is attached.
    """
    if not text or not self._etk:
        return list()

    # Base date: the configured one when set, otherwise "now".
    base = self._settings[RELATIVE_BASE] or datetime.datetime.now()
    if not self._settings[RETURN_AS_TIMEZONE_AWARE]:
        base = base.replace(tzinfo=None)
    elif not base.tzinfo:
        base = base.astimezone(self._default_tz)

    res = SpacyRuleExtractor(self._etk.default_nlp, spacy_rules, 'relative_date_extractor').extract(text)
    ans = list()
    for relative_date in res:
        rule_id = relative_date.rule_id
        words = relative_date.value.split()

        if rule_id == 'direction_number_unit':
            direction, measure, unit = words
            # Spelled-out number -> digit string.
            measure = num_to_digit[measure.lower()]
        elif rule_id == 'number_unit_direction':
            measure, unit, direction = words
            measure = num_to_digit[measure.lower()]
        elif rule_id == 'direction_digit_unit':
            direction, measure, unit = words
        elif rule_id == 'digit_unit_direction':
            measure, unit, direction = words
        elif rule_id == 'direction_unit':
            direction, unit = words
            measure = '1'
        elif rule_id == 'the_day':
            unit = 'days'
            key_ = words[-1].lower()
            if key_ == 'today':
                direction = 'ago'
                measure = '0'
            else:
                direction = 'ago' if key_ == 'yesterday' else 'later'
                # A single word shifts by one day, a longer phrase by two
                # (presumably "day before yesterday"-style phrases - confirm
                # against the rule definitions).
                measure = '1' if len(words) == 1 else '2'
        else:
            # Unknown rule id: nothing to compute.
            continue

        # The unit becomes a relativedelta keyword, so it must be lower-case
        # and plural. Measure and direction are already lower-cased before
        # use; without the same treatment a capitalised unit such as "Days"
        # would be passed through as an invalid keyword argument.
        unit = unit.lower()
        if not unit.endswith('s'):
            unit += 's'

        # Map the direction word to its prefix; unknown words fall back to '+'.
        direction_key = direction.lower()
        sign = directions[direction_key] if direction_key in directions else '+'

        relative_delta = relativedelta(**{unit: int(sign + measure)})
        date = self._post_process_date(base + relative_delta)
        if date:
            extraction_date = self._wrap_extraction(date,
                                                    relative_date.value,
                                                    relative_date.provenance['start_char'],
                                                    relative_date.provenance['end_char'])
            if extraction_date:
                ans.append(extraction_date)
    return ans
def __init__(self, etk):
    """Initialise the base module, then build a rule extractor from ``sample_rules.json``."""
    ETKModule.__init__(self, etk)

    # NOTE(review): `self.etk` is presumably set by ETKModule.__init__ - confirm.
    nlp = self.etk.default_nlp
    loaded_rules = self.etk.load_spacy_rule("sample_rules.json")

    self.rule_extractor = SpacyRuleExtractor(nlp, loaded_rules, "test_extractor")
def test_SpacyRuleExtractor(self) -> None:
    """
    Build a two-rule set inline (a name rule and a number rule), run it over a
    sample sentence and compare the resulting (rule_id, value) pairs.
    """

    def token_spec(**overrides):
        # Returns a fresh token pattern (with fresh lists) on every call.
        # update() only replaces values, so the key order below is kept.
        spec = {
            "capitalization": [],
            "contain_digit": "true",
            "is_in_output": "true",
            "is_in_vocabulary": "false",
            "is_out_of_vocabulary": "false",
            "is_required": "true",
            "length": [],
            "match_all_forms": "true",
            "maximum": "",
            "minimum": "",
            "numbers": [],
            "part_of_speech": [],
            "prefix": "",
            "shapes": [],
            "suffix": "",
            "token": [],
            "type": "word",
        }
        spec.update(overrides)
        return spec

    def rule_spec(identifier, output_format, pattern):
        # Rule envelope shared by both rules; only these three fields differ.
        return {
            "dependencies": [],
            "description": "",
            "identifier": identifier,
            "is_active": "true",
            "output_format": output_format,
            "pattern": pattern,
            "polarity": "true",
        }

    # Title-cased word containing a digit, optionally followed by a
    # title-cased word without one.
    name_rule = rule_spec("rule_3", "firstName:{1}, lastName:{2}", [
        token_spec(capitalization=["title"]),
        token_spec(capitalization=["title"], contain_digit="false", is_required="false"),
    ])
    # A number token with minimum "0" and maximum "5".
    number_rule = rule_spec("rule_4", "number:{1}", [
        token_spec(maximum="5", minimum="0", type="number"),
    ])
    sample_rules = {"field_name": "test", "rules": [name_rule, number_rule]}

    nlp = spacy.load("en_core_web_sm")
    extractor = SpacyRuleExtractor(nlp, sample_rules, "test_extractor")

    text = "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others."
    found = [(hit.rule_id, hit.value) for hit in extractor.extract(text)]

    wanted = [
        ('rule_4', 'number:2'),
        ('rule_3', 'firstName:Runqi12, lastName:Shao'),
        ('rule_3', 'firstName:Dongyu, lastName:Li'),
        ('rule_3', 'firstName:Sylvia, lastName:{2}'),
        ('rule_3', 'firstName:Amandeep, lastName:{2}'),
    ]
    self.assertEqual(found, wanted)