예제 #1
0
def test_spacy_rules():
    """Run a posted spaCy rule set against its own ``test_text``.

    The JSON request body is used twice: it is handed to
    ``SpacyRuleExtractor`` as the rule definition, and it is returned as the
    response payload after ``test_tokens`` and ``results`` have been added.
    The body must contain a ``test_text`` entry.

    Returns:
        A ``(body, status)`` tuple. On success ``body`` is the augmented
        payload serialized as JSON and ``status`` is 201. On any exception
        ``body`` is a JSON object with a ``message`` describing the error and
        ``status`` is 400.
    """
    try:
        obj = request.get_json(force=True)

        rule_extractor = SpacyRuleExtractor(
            etk.default_nlp,
            obj, "test_extractor")
        tokens = rule_extractor.tokenizer.tokenize_to_spacy_doc(obj['test_text'])

        # Echo the tokenization back so the caller can see how the text was split.
        obj['test_tokens'] = [
            {
                'index': t.i,
                'whitespace': t.whitespace_,
                'text': t.text
            }
            for t in tokens
        ]

        obj['results'] = [
            {
                'confidence': extraction.confidence,
                'start_token': extraction.provenance['start_token'],
                'end_token': extraction.provenance['end_token'],
                'start_char': extraction.provenance['start_char'],
                'end_char': extraction.provenance['end_char'],
                'identifier': extraction.rule_id,
                'text': extraction.value,
                'token_based_match_mapping': extraction.token_based_match_mapping
            }
            for extraction in rule_extractor.extract(obj['test_text'])
        ]

        return json.dumps(obj), 201

    except Exception as e:
        print(e)
        # Python 3 exceptions have no ``.message`` attribute; reading it here
        # would raise AttributeError and hide the real error from the client.
        return json.dumps({'message': 'exception: {}'.format(e)}), 400
예제 #2
0
 def test_SpacyRuleExtractor_word_2(self) -> None:
     # Apply the "test_SpacyRuleExtractor_word_2" rule set to a sample
     # sentence and compare the (rule_id, value) pairs with the expectation.
     rule_set = rules["test_SpacyRuleExtractor_word_2"]
     extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")
     sentence = (
         "version 2 of etk, implemented by Runqi Shao, Dongyu Li, Sylvia lin, Amandeep and others."
     )
     observed = [(hit.rule_id, hit.value) for hit in extractor.extract(sentence)]
     self.assertEqual(observed, [('rule_0', 'Name: Sylvia lin')])
예제 #3
0
    def test_SpacyRuleExtractor_number_1(self) -> None:
        # Apply the "test_SpacyRuleExtractor_number_1" rule set to a string of
        # numbers and compare the (rule_id, value) pairs with the expectation.
        rule_set = rules["test_SpacyRuleExtractor_number_1"]
        extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")

        sentence = (
            "Extract from the following number: 32 12 54435 23 665.3 34 65.42 23 4545"
        )
        observed = [(hit.rule_id, hit.value) for hit in extractor.extract(sentence)]

        self.assertEqual(observed, [('rule_0', '665.3'), ('rule_0', '4545')])
예제 #4
0
    def test_SpacyRuleExtractor_linebreak_1(self) -> None:
        # Apply the "test_SpacyRuleExtractor_linebreak_1" rule set to a
        # sentence containing a run of three newlines and compare the
        # (rule_id, value) pairs with the expectation.
        rule_set = rules["test_SpacyRuleExtractor_linebreak_1"]
        extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")

        sentence = (
            "version 2 of etk, implemented by Rq? Shao. DongYu94 Li, \n\n\n Sylvia-lin, Amandeep and others."
        )
        observed = [(hit.rule_id, hit.value) for hit in extractor.extract(sentence)]

        self.assertEqual(observed, [('rule_0', 'Length 3 linebreak: \n\n\n ')])
예제 #5
0
    def test_SpacyRuleExtractor_shape_1(self) -> None:
        # Apply the "test_SpacyRuleExtractor_shape_1" rule set to a sample
        # sentence and compare the (rule_id, value) pairs with the expectation.
        rule_set = rules["test_SpacyRuleExtractor_shape_1"]
        extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")

        sentence = (
            "version 2 of etk, implemented by RqS, DongYu94 Li, Sylvia lin, Amandeep and others."
        )
        observed = [(hit.rule_id, hit.value) for hit in extractor.extract(sentence)]

        self.assertEqual(observed, [('rule_0', 'RqS'), ('rule_0', 'DongYu94')])
예제 #6
0
    def test_SpacyRuleExtractor_punc_1(self) -> None:
        # Apply the "test_SpacyRuleExtractor_punc_1" rule set to a sentence
        # with punctuation between name parts and compare the
        # (rule_id, value) pairs with the expectation.
        rule_set = rules["test_SpacyRuleExtractor_punc_1"]
        extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")

        sentence = (
            "version 2 of etk, implemented by Rq? Shao. DongYu94 Li, Sylvia-lin, Amandeep and others."
        )
        observed = [(hit.rule_id, hit.value) for hit in extractor.extract(sentence)]

        wanted = [
            ('rule_0', 'Name: Rq, Shao'),
            ('rule_0', 'Name: Sylvia, lin'),
        ]
        self.assertEqual(observed, wanted)
예제 #7
0
    def test_SpacyRuleExtractor_word_5(self) -> None:
        # Apply the "test_SpacyRuleExtractor_word_5" rule set to a sample
        # sentence and compare the (rule_id, value) pairs with the expectation.
        rule_set = rules["test_SpacyRuleExtractor_word_5"]
        extractor = SpacyRuleExtractor(self.nlp, rule_set, "test_extractor")

        sentence = (
            "version 2 of etk, implemented by Runqi Shao, DongYu94 Li, Sylvia lin, Amandeep and others."
        )
        observed = [(hit.rule_id, hit.value) for hit in extractor.extract(sentence)]

        wanted = [
            ('rule_0',
             'First Name: Runqi, Last Name: Shao. Full name: Runqi Shao'),
            ('rule_0',
             'First Name: DongYu94, Last Name: Li. Full name: DongYu94 Li'),
        ]
        self.assertEqual(observed, wanted)
    def test_SpacyRuleExtractor(self) -> None:
        # Extract the title from the ground-truth news page, run the sample
        # spaCy rules over that title and check the first extracted value.
        hme = HTMLMetadataExtractor()
        with open('etk/unit_tests/ground_truth/news.html', 'r') as f:
            sample_html = f.read()

        # Use a context manager so the rule file is closed deterministically
        # instead of being left to the garbage collector.
        with open('etk/unit_tests/ground_truth/sample_spacy_rule.json') as rule_file:
            sample_rules = json.load(rule_file)

        title_extraction = hme.extract(sample_html,
                                       extract_title=True)[0].value

        sample_rule_extractor = SpacyRuleExtractor(
            spacy.load("en_core_web_sm"), sample_rules, "dummy")
        extractions = sample_rule_extractor.extract(title_extraction)
        expected_extraction = 'Trump'
        self.assertEqual(extractions[0].value, expected_extraction)
예제 #9
0
파일: em_spacy.py 프로젝트: xkgoodbest/etk
    def __init__(self, etk):
        # ``self.etk`` is read below, so the base initialiser runs first
        # (presumably it is what stores ``etk`` on the instance - confirm in
        # ETKModule).
        ETKModule.__init__(self, etk)

        rules_path = "./extraction_modules/resources/sample_rules.json"
        loaded_rules = self.etk.load_spacy_rule(rules_path)

        # One extractor is built up front and kept on the instance.
        self.sample_rule_extractor = SpacyRuleExtractor(
            self.etk.default_nlp,
            loaded_rules,
            "test_extractor",
        )
예제 #10
0
    def _extract_relative_dates(self, text: str) -> List[Extraction]:
        """

        Extract relative dates using spaCy rules

        Every match of the ``relative_date_extractor`` rules is broken into a
        direction, a measure and a unit according to its rule id, turned into
        a ``relativedelta`` and added to the base date. Matches with an
        unrecognised rule id are skipped.

        Args:
            text: str - the text to extract the relative date strings from

        Returns: List of Extraction(s); empty when ``text`` is empty or no
            ETK instance is set on this extractor

        """
        if not text or not self._etk:
            return list()

        # Base date: the configured RELATIVE_BASE if set, otherwise "now".
        base = self._settings[RELATIVE_BASE] if self._settings[RELATIVE_BASE] else datetime.datetime.now()
        if not self._settings[RETURN_AS_TIMEZONE_AWARE]:
            # Caller wants naive results: drop any timezone information.
            base = base.replace(tzinfo=None)
        elif not base.tzinfo:
            # Caller wants aware results but the base is naive.
            base = base.astimezone(self._default_tz)

        res = SpacyRuleExtractor(self._etk.default_nlp, spacy_rules, 'relative_date_extractor').extract(text)
        ans = list()
        for relative_date in res:
            # The rule id names the token order of the matched phrase.
            if relative_date.rule_id == 'direction_number_unit':
                direction, measure, unit = relative_date.value.split()
                measure = num_to_digit[measure.lower()]
            elif relative_date.rule_id == 'number_unit_direction':
                measure, unit, direction = relative_date.value.split()
                measure = num_to_digit[measure.lower()]
            elif relative_date.rule_id == 'direction_digit_unit':
                direction, measure, unit = relative_date.value.split()
            elif relative_date.rule_id == 'digit_unit_direction':
                measure, unit, direction = relative_date.value.split()
            elif relative_date.rule_id == 'direction_unit':
                # No explicit count in the phrase: treat it as one unit.
                direction, unit = relative_date.value.split()
                measure = '1'
            elif relative_date.rule_id == 'the_day':
                unit = 'days'
                key_ = relative_date.value.split()[-1].lower()
                if key_ == 'today':
                    direction = 'ago'
                    measure = '0'
                else:
                    direction = 'ago' if key_ == 'yesterday' else 'later'
                    # A single word is one day away; a longer phrase is two.
                    measure = '1' if len(relative_date.value.split()) == 1 else '2'
            else:
                continue

            # relativedelta only accepts lowercase keyword names, so normalise
            # the case (as is already done for direction and measure) before
            # forcing the plural form; otherwise "Days" raises TypeError.
            unit = unit.lower()
            unit = unit if unit[-1] == 's' else unit+'s'
            # Unknown direction words fall back to '+'.
            direction = directions[direction.lower()] if direction.lower() in directions else '+'
            delta_args = {unit: int(direction+measure)}
            relative_delta = relativedelta(**delta_args)
            date = self._post_process_date(base+relative_delta)
            if date:
                extraction_date = self._wrap_extraction(date,
                                                       relative_date.value,
                                                       relative_date.provenance['start_char'],
                                                       relative_date.provenance['end_char'])
                if extraction_date:
                    ans.append(extraction_date)
        return ans
예제 #11
0
 def __init__(self, etk):
     # ``self.etk`` is read below, so the base initialiser runs first
     # (presumably it is what stores ``etk`` on the instance - confirm in
     # ETKModule).
     ETKModule.__init__(self, etk)

     nlp = self.etk.default_nlp
     loaded_rules = self.etk.load_spacy_rule("sample_rules.json")

     # One extractor is built up front and kept on the instance.
     self.rule_extractor = SpacyRuleExtractor(nlp, loaded_rules, "test_extractor")
예제 #12
0
    def test_SpacyRuleExtractor(self) -> None:
        # Build an inline rule set with two rules ("rule_3" for names and
        # "rule_4" for numbers), run it over a sample sentence and compare the
        # (rule_id, value) pairs with the expectation.

        def token_spec(**overrides):
            # A fresh dict (and fresh lists) per call, so tokens share nothing.
            # Overrides only replace existing keys, which keeps the key order.
            spec = {
                "capitalization": [],
                "contain_digit": "false",
                "is_in_output": "true",
                "is_in_vocabulary": "false",
                "is_out_of_vocabulary": "false",
                "is_required": "true",
                "length": [],
                "match_all_forms": "true",
                "maximum": "",
                "minimum": "",
                "numbers": [],
                "part_of_speech": [],
                "prefix": "",
                "shapes": [],
                "suffix": "",
                "token": [],
                "type": "word",
            }
            spec.update(overrides)
            return spec

        def rule_spec(identifier, output_format, pattern):
            # Everything except identifier, format and pattern is the same
            # for both rules.
            return {
                "dependencies": [],
                "description": "",
                "identifier": identifier,
                "is_active": "true",
                "output_format": output_format,
                "pattern": pattern,
                "polarity": "true",
            }

        name_rule = rule_spec(
            "rule_3",
            "firstName:{1}, lastName:{2}",
            [
                token_spec(capitalization=["title"], contain_digit="true"),
                token_spec(capitalization=["title"], is_required="false"),
            ],
        )
        number_rule = rule_spec(
            "rule_4",
            "number:{1}",
            [
                token_spec(
                    contain_digit="true",
                    maximum="5",
                    minimum="0",
                    type="number",
                ),
            ],
        )
        rule_set = {"field_name": "test", "rules": [name_rule, number_rule]}

        extractor = SpacyRuleExtractor(
            spacy.load("en_core_web_sm"), rule_set, "test_extractor")
        sentence = (
            "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others."
        )
        observed = [(hit.rule_id, hit.value) for hit in extractor.extract(sentence)]

        wanted = [
            ('rule_4', 'number:2'),
            ('rule_3', 'firstName:Runqi12, lastName:Shao'),
            ('rule_3', 'firstName:Dongyu, lastName:Li'),
            ('rule_3', 'firstName:Sylvia, lastName:{2}'),
            ('rule_3', 'firstName:Amandeep, lastName:{2}'),
        ]
        self.assertEqual(observed, wanted)