예제 #1
0
class TestExtractionsUsingRegex(unittest.TestCase):
    """Tests for the spaCy-based date and age extractors against the
    ground-truth JSON-lines files (``ground_truth/age.jl`` and
    ``ground_truth/date.jl``)."""

    def setUp(self):
        """Create a spaCy-enabled Core and load the ground-truth records
        into ``self.doc`` keyed by extractor name ('age', 'date')."""
        self.c = Core(load_spacy=True)
        self.doc = dict()

        file_path_age = os.path.join(os.path.dirname(__file__),
                                     "ground_truth/age.jl")
        # Context manager closes the file even if a line fails to parse;
        # skip blank lines (e.g. a trailing newline) because json.loads('')
        # raises and would abort setUp.
        with open(file_path_age, 'r') as f:
            self.doc['age'] = [json.loads(t)
                               for t in f.read().split('\n') if t.strip()]

        file_path_date = os.path.join(os.path.dirname(__file__),
                                      "ground_truth/date.jl")
        with open(file_path_date, 'r') as f:
            self.doc['date'] = [json.loads(t) for t in f if t.strip()]

    def test_extraction_from_date_spacy(self):
        """Extracted date values must match the ground truth exactly,
        including order."""
        for t in self.doc['date']:
            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['content']))
            extracted_dates = spacy_date_extractor.extract(
                self.c.nlp, self.c.matchers['date'], crf_tokens)

            extracted_dates = [date['value'] for date in extracted_dates]
            correct_dates = t['extracted']

            # assertEqual: assertEquals is a deprecated alias.
            self.assertEqual(extracted_dates, correct_dates)

    def test_extraction_from_age_spacy(self):
        """Every extracted age value must appear in the record's
        ground-truth ``correct`` list."""
        for t in self.doc['age']:
            extracted_ages = spacy_age_extractor.extract(
                t['content'], self.c.nlp, self.c.matchers['age'])
            extracted_ages = [age['value'] for age in extracted_ages]
            for extracted_age in extracted_ages:
                # The original called assertTrue(extracted_age, correct_age)
                # inside an equality guard -- assertTrue's second argument is
                # a *message*, so no comparison was actually performed.
                # Assert membership, which is the evident intent.
                self.assertIn(extracted_age, t['correct'])
예제 #2
0
class TestExtractionsUsingSpacy(unittest.TestCase):
    """End-to-end tests for the spaCy extractors (posting_date, age,
    social_media, address), exercised both by calling
    ``Core.extract_using_spacy`` directly and by running the full
    extraction config through ``Core.process``."""

    def setUp(self):
        """Build a Core whose extraction config enables extract_using_spacy
        for each tested field, then load all ground-truth .jl files into
        ``self.ground_truth`` keyed by extractor name."""
        e_config = {
            'data_extraction': [{
                'input_path': 'text.`parent`',
                'fields': {
                    "posting_date": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    },
                    "age": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    },
                    "social_media": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    },
                    "address": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    }
                }
            }]
        }

        self.c = Core(extraction_config=e_config, load_spacy=True)
        self.ground_truth = dict()

        ground_truth_files = {
            "age":
            os.path.join(os.path.dirname(__file__), "ground_truth/age.jl"),
            "date":
            os.path.join(os.path.dirname(__file__), "ground_truth/date.jl"),
            "social_media":
            os.path.join(os.path.dirname(__file__),
                         "ground_truth/social_media.jl"),
            "address":
            os.path.join(os.path.dirname(__file__), "ground_truth/address.jl")
        }

        for extractor, file_name in ground_truth_files.items():
            with open(file_name, 'r') as f:
                # Skip blank lines (e.g. a trailing newline): json.loads('')
                # raises and would abort setUp.
                self.ground_truth[extractor] = [
                    json.loads(line) for line in f if line.strip()
                ]

    def test_spacy_extractions(self):
        """Run every extractor directly, then again via the extraction
        config, and compare against the ground truth."""

        # Date extractor (direct call)
        for t in self.ground_truth['date']:
            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['text']))

            extraction_config = {'field_name': 'posting_date'}
            d = {'simple_tokens': crf_tokens}

            extracted_dates = self.c.extract_using_spacy(d, extraction_config)
            extracted_dates = [date['value'] for date in extracted_dates]

            # assertEqual: assertEquals is a deprecated alias (same below).
            self.assertEqual(extracted_dates, t['extracted'])

        # Age extractor (direct call) -- order-insensitive comparison.
        for t in self.ground_truth['age']:
            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['text']))

            extraction_config = {'field_name': 'age'}
            d = {'simple_tokens': crf_tokens}

            extracted_ages = self.c.extract_using_spacy(d, extraction_config)
            extracted_ages = [match['value'] for match in extracted_ages]

            # assertEqual on sorted lists also covers the both-empty case,
            # so no separate assertFalse guard is needed.
            self.assertEqual(sorted(extracted_ages), sorted(t['correct']))

        # Social media extractor (direct call)
        for t in self.ground_truth['social_media']:
            # Ground-truth handles are compared case-insensitively;
            # lowercasing in place is idempotent, so the second pass in the
            # config-driven section below is harmless.
            for social_media in t['correct']:
                t['correct'][social_media] = [
                    h.lower() for h in t['correct'][social_media]
                ]

            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['text']))

            extraction_config = {'field_name': 'social_media'}
            d = {'simple_tokens': crf_tokens}

            extracted_social_media_handles = self.c.extract_using_spacy(
                d, extraction_config)

            # Group extracted handle values by their social network.
            extracted_handles = dict()
            for match in extracted_social_media_handles:
                social_network = match['metadata']['social_network']
                extracted_handles.setdefault(social_network, []).append(
                    match['value'])

            self.assertEqual(extracted_handles, t['correct'])

        # Address extractor (direct call)
        for t in self.ground_truth['address']:
            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['text']))

            extraction_config = {'field_name': 'address'}
            d = {'simple_tokens': crf_tokens}

            extracted_addresses = self.c.extract_using_spacy(
                d, extraction_config)
            extracted_addresses = [
                address['value'] for address in extracted_addresses
            ]

            self.assertEqual(extracted_addresses, t['extracted'])

        # --- Extract using config (full Core.process pipeline) ---

        # Date extractor
        for t in self.ground_truth['date']:
            r = self.c.process(t)
            if 'data_extraction' in r:
                extracted_dates = [
                    x['value'] for x in r['data_extraction']['posting_date']
                    ['extract_using_spacy']['results']
                ]
            else:
                # process() omits 'data_extraction' when nothing was found.
                extracted_dates = []

            self.assertEqual(extracted_dates, t['extracted'])

        # Age extractor
        for t in self.ground_truth['age']:
            r = self.c.process(t)
            if 'data_extraction' in r:
                extracted_ages = [
                    x['value'] for x in r['data_extraction']['age']
                    ['extract_using_spacy']['results']
                ]
            else:
                extracted_ages = []

            self.assertEqual(sorted(extracted_ages), sorted(t['correct']))

        # Social media extractor
        for t in self.ground_truth['social_media']:
            for social_media in t['correct']:
                t['correct'][social_media] = [
                    h.lower() for h in t['correct'][social_media]
                ]

            r = self.c.process(t)
            if 'data_extraction' in r:
                extracted_social_media_handles = list(
                    r['data_extraction']['social_media']
                    ['extract_using_spacy']['results'])
            else:
                extracted_social_media_handles = []

            extracted_handles = dict()
            for match in extracted_social_media_handles:
                social_network = match['metadata']['social_network']
                extracted_handles.setdefault(social_network, []).append(
                    match['value'])

            self.assertEqual(extracted_handles, t['correct'])

        # Address extractor
        for t in self.ground_truth['address']:
            r = self.c.process(t)
            if 'data_extraction' in r:
                extracted_addresses = [
                    x['value'] for x in r['data_extraction']['address']
                    ['extract_using_spacy']['results']
                ]
            else:
                extracted_addresses = []

            self.assertEqual(extracted_addresses, t['extracted'])
예제 #3
0
class TestExtractionsUsingSpacy(unittest.TestCase):
    """Tests for ``Core.extract_using_custom_spacy`` name-extraction rules.

    Each test builds a rule set (a pattern of word/punctuation/shape token
    specs) and runs it over the corresponding ground-truth JSON-lines file,
    comparing the extracted name values with the expected ``extracted``
    list of each record."""

    def setUp(self):
        """Create a Core and load all name ground-truth .jl files into
        ``self.ground_truth`` keyed by rule name."""
        self.c = Core()
        self.ground_truth = dict()

        ground_truth_files = {"name": os.path.join(os.path.dirname(__file__), "ground_truth/name_my_name_1.jl"),
                              "name_i_am_2": os.path.join(os.path.dirname(__file__), "ground_truth/name_i_am_2.jl"),
                              "name_name_3": os.path.join(os.path.dirname(__file__), "ground_truth/name_name_3.jl"),
                              "name_it_is_4": os.path.join(os.path.dirname(__file__), "ground_truth/name_it_is_4.jl"),
                              "name_this_is_5": os.path.join(os.path.dirname(__file__), "ground_truth/name_this_is_5.jl"),
                              "name_im_6": os.path.join(os.path.dirname(__file__), "ground_truth/name_im_6.jl"),
                              "name_its_7": os.path.join(os.path.dirname(__file__), "ground_truth/name_its_7.jl"),
                              "name_teleph_number_split_8": os.path.join(os.path.dirname(__file__), "ground_truth/name_teleph_number_split_8.jl"),
                              "name_teleph_number_9": os.path.join(os.path.dirname(__file__), "ground_truth/name_teleph_number_9.jl")
                              }

        for extractor, file_name in ground_truth_files.items():
            with open(file_name, 'r') as f:
                # Each record looks like
                # {"text": "... it's Jessica", "extracted": ["Jessica"]}.
                # Skip blank lines (trailing newline) -- json.loads('') raises.
                self.ground_truth[extractor] = [
                    json.loads(line) for line in f if line.strip()
                ]

    def generic_token(self, type="word", token=None, shape=None,
                      capitalization=None, part_of_speech=None, length=None,
                      prefix="", suffix="", is_followed_by_space="",
                      is_required="true", is_in_output="true",
                      is_out_of_vocabulary="", is_in_vocabulary="",
                      contain_digit=""):
        """Build one token-spec dict for a custom spaCy field rule.

        List-valued parameters default to a fresh empty list.  (The
        original used mutable default arguments -- a shared-state hazard --
        and misspelled ``self`` as ``slef``.)  ``type`` keeps its name for
        caller compatibility even though it shadows the builtin.
        """
        return {
            "type": type,
            "token": [] if token is None else token,
            "shapes": [] if shape is None else shape,
            "capitalization": [] if capitalization is None else capitalization,
            "part_of_speech": [] if part_of_speech is None else part_of_speech,
            "length": [] if length is None else length,
            "prefix": prefix,
            "suffix": suffix,
            "is_followed_by_space": is_followed_by_space,
            "is_required": is_required,
            "is_in_output": is_in_output,
            "is_out_of_vocabulary": is_out_of_vocabulary,
            "is_in_vocabulary": is_in_vocabulary,
            "contain_digit": contain_digit
        }

    def word_token(self, token=None, capitalization=None, part_of_speech=None,
                   length=None, prefix="", suffix="", is_followed_by_space="",
                   is_required="true", is_in_output="false",
                   is_out_of_vocabulary="", is_in_vocabulary="",
                   contain_digit=""):
        """Token spec of type 'word' (None list args become [])."""
        return self.generic_token(type="word", token=token,
                                  capitalization=capitalization,
                                  part_of_speech=part_of_speech, length=length,
                                  prefix=prefix, suffix=suffix,
                                  is_followed_by_space=is_followed_by_space,
                                  is_required=is_required,
                                  is_in_output=is_in_output,
                                  is_out_of_vocabulary=is_out_of_vocabulary,
                                  is_in_vocabulary=is_in_vocabulary,
                                  contain_digit=contain_digit)

    def punctuation_token(self, token=None, capitalization=None,
                          part_of_speech=None, length=None, prefix="",
                          suffix="", is_followed_by_space="",
                          is_required="true", is_in_output="false",
                          is_out_of_vocabulary="", is_in_vocabulary="",
                          contain_digit=""):
        """Token spec of type 'punctuation' (None list args become [])."""
        return self.generic_token(type="punctuation", token=token,
                                  capitalization=capitalization,
                                  part_of_speech=part_of_speech, length=length,
                                  prefix=prefix, suffix=suffix,
                                  is_followed_by_space=is_followed_by_space,
                                  is_required=is_required,
                                  is_in_output=is_in_output,
                                  is_out_of_vocabulary=is_out_of_vocabulary,
                                  is_in_vocabulary=is_in_vocabulary,
                                  contain_digit=contain_digit)

    def shape_token(self, shape=None, capitalization=None, part_of_speech=None,
                    length=None, prefix="", suffix="", is_followed_by_space="",
                    is_required="true", is_in_output="false",
                    is_out_of_vocabulary="", is_in_vocabulary="",
                    contain_digit=""):
        """Token spec of type 'shape' (None list args become [])."""
        return self.generic_token(type="shape", shape=shape,
                                  capitalization=capitalization,
                                  part_of_speech=part_of_speech, length=length,
                                  prefix=prefix, suffix=suffix,
                                  is_followed_by_space=is_followed_by_space,
                                  is_required=is_required,
                                  is_in_output=is_in_output,
                                  is_out_of_vocabulary=is_out_of_vocabulary,
                                  is_in_vocabulary=is_in_vocabulary,
                                  contain_digit=contain_digit)

    def _assert_rule_extractions(self, field_rules, ground_truth_key):
        """Shared check loop: every test below had an identical copy of this.

        Tokenize each ground-truth text (original case preserved), run the
        custom spaCy rules, and compare the extracted values with the
        record's expected ``extracted`` list.
        """
        for t in self.ground_truth[ground_truth_key]:
            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['text'], lowercase=False))

            extraction_config = {'field_name': 'my_name_is'}
            d = {'simple_tokens_original_case': crf_tokens}

            extracted_names = self.c.extract_using_custom_spacy(
                d, extraction_config, field_rules=field_rules)
            extracted_names = [name['value'] for name in extracted_names]

            # assertEqual: assertEquals is a deprecated alias.
            self.assertEqual(extracted_names, t['extracted'])

    # 1. "my name/names is <Name>"
    def test_rule_my_name(self):
        field_rules = {
            "rules": [
                {
                    "identifier": "name_rule_01",
                    "description": "a description",
                    # NOTE(review): is_active is "false" here, unlike every
                    # other rule -- preserved as-is; confirm intentional.
                    "is_active": "false",
                    "polarity": [],
                    "pattern": [
                        self.word_token(token=["my"]),
                        self.word_token(token=["name", "names"]),
                        self.word_token(token=["is"], is_required="false"),
                        self.word_token(capitalization=["title", "upper"],
                                        is_in_output="true")
                    ]
                }
            ]
        }
        self._assert_rule_extractions(field_rules, 'name')

    # 2. "i am <Name>"
    def test_rule_i_am(self):
        field_rules = {
            "rules": [
                {
                    "identifier": "name_rule_02",
                    "description": "a description",
                    "is_active": "true",
                    "polarity": [],
                    "pattern": [
                        self.word_token(token=["i"]),
                        self.word_token(token=["am"]),
                        self.word_token(capitalization=["title", "upper"],
                                        is_in_output="true")
                    ]
                }
            ]
        }
        self._assert_rule_extractions(field_rules, 'name_i_am_2')

    # 3. "name : <Name>" / "Name: <Name>"
    def test_rule_name_(self):
        field_rules = {
            "rules": [
                {
                    "identifier": "name_rule_03",
                    "description": "a description",
                    "is_active": "true",
                    "polarity": [],
                    "pattern": [
                        self.word_token(token=["name"]),
                        self.punctuation_token(token=[":"]),
                        self.word_token(token=[], is_in_output="true"),
                    ]
                }
            ]
        }
        self._assert_rule_extractions(field_rules, 'name_name_3')

    # 4. "it is <Name>"
    def test_rule_it_is(self):
        field_rules = {
            "rules": [
                {
                    "identifier": "name_rule_04",
                    "description": "a description",
                    "is_active": "true",
                    "polarity": [],
                    "pattern": [
                        self.word_token(token=["it"]),
                        self.word_token(token=["is"]),
                        self.word_token(part_of_speech=["proper noun"],
                                        is_in_output="true")
                    ]
                }
            ]
        }
        self._assert_rule_extractions(field_rules, 'name_it_is_4')

    # 5. "this is <Name>" / "This is <Name>"
    def test_rule_this_is(self):
        field_rules = {
            "rules": [
                {
                    "identifier": "name_rule_05",
                    "description": "a description",
                    "is_active": "true",
                    "polarity": [],
                    "pattern": [
                        self.word_token(token=["this"]),
                        self.word_token(token=["is"]),
                        self.word_token(part_of_speech=["proper noun"],
                                        capitalization=["title", "mixed",
                                                        "upper"],
                                        is_in_output="true")
                    ]
                }
            ]
        }
        self._assert_rule_extractions(field_rules, 'name_this_is_5')

    # 6. "I'm <Name>"
    def test_rule_Im(self):
        field_rules = {
            "rules": [
                {
                    "identifier": "name_rule_06",
                    "description": "a description",
                    "is_active": "true",
                    "polarity": [],
                    "pattern": [
                        self.word_token(token=["i"]),
                        self.punctuation_token(token=["'"]),
                        self.word_token(token=["m"]),
                        self.word_token(part_of_speech=["proper noun"],
                                        capitalization=["title", "mixed",
                                                        "upper"],
                                        is_in_output="true")
                    ]
                }
            ]
        }
        self._assert_rule_extractions(field_rules, 'name_im_6')

    # 7. "it's <Name>"
    def test_rule_its(self):
        field_rules = {
            "rules": [
                {
                    "identifier": "name_rule_07",
                    "description": "a description",
                    "is_active": "true",
                    "polarity": [],
                    "pattern": [
                        self.word_token(token=["it"]),
                        self.punctuation_token(token=["'"]),
                        self.word_token(token=["s"]),
                        self.word_token(part_of_speech=["proper noun"],
                                        capitalization=["title", "mixed",
                                                        "upper"],
                                        is_in_output="true")
                    ]
                }
            ]
        }
        self._assert_rule_extractions(field_rules, 'name_its_7')

    # 8. "<Name> (702" -- name followed by split phone number
    def test_rule_teleph_number_split(self):
        field_rules = {
            "rules": [
                {
                    "identifier": "name_rule_08",
                    "description": "a description",
                    "is_active": "true",
                    "polarity": [],
                    "pattern": [
                        self.word_token(capitalization=["title"],
                                        is_in_output="true"),
                        self.punctuation_token(token=["(", "["]),
                        self.shape_token(shape=["ddd"])
                    ]
                }
            ]
        }
        self._assert_rule_extractions(field_rules,
                                      'name_teleph_number_split_8')

    # 9. "<Name> 7135975313" -- name followed by ten-digit phone number
    def test_rule_teleph_number(self):
        field_rules = {
            "rules": [
                {
                    "identifier": "name_rule_09",
                    "description": "a description",
                    "is_active": "true",
                    "polarity": [],
                    "pattern": [
                        self.word_token(capitalization=["title", "upper",
                                                        "mixed"],
                                        is_in_output="true"),
                        self.shape_token(shape=["dddddddddd"])
                    ]
                }
            ]
        }
        self._assert_rule_extractions(field_rules, 'name_teleph_number_9')
예제 #4
0
class TestExtractionsUsingSpacy(unittest.TestCase):
    def setUp(self):

        e_config = {
            'data_extraction': [{
                'input_path': 'text.`parent`',
                'fields': {
                    "posting_date": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    },
                    "age": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    },
                    "social_media": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    },
                    "address": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    },
                    "email": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    }
                }
            }]
        }

        self.c = Core(extraction_config=e_config, load_spacy=True)
        self.ground_truth = dict()

        ground_truth_files = {
            "age":
            os.path.join(os.path.dirname(__file__), "ground_truth/age.jl"),
            "date":
            os.path.join(os.path.dirname(__file__), "ground_truth/date.jl"),
            "social_media":
            os.path.join(os.path.dirname(__file__),
                         "ground_truth/social_media.jl"),
            "address":
            os.path.join(os.path.dirname(__file__), "ground_truth/address.jl"),
            "email":
            os.path.join(os.path.dirname(__file__), "ground_truth/email.jl")
        }

        for extractor, file_name in ground_truth_files.items():
            with open(file_name, 'r') as f:
                test_data = f.read().split('\n')
                self.ground_truth[extractor] = list()
                for test_case in test_data:
                    self.ground_truth[extractor].append(json.loads(test_case))

    @staticmethod
    def create_list_from_kg(extractions):
        results = list()
        for e in extractions:
            ps = e['provenance']
            if not isinstance(ps, list):
                ps = [ps]
            for p in ps:
                results.append(p['extracted_value'])
        return results

    @staticmethod
    def create_list_from_social_media(extractions):
        results = dict()
        for e in extractions:
            ps = e['provenance']
            if not isinstance(ps, list):
                ps = [ps]
            for p in ps:
                x = p['qualifiers']['social_network']
                results[x] = [p['extracted_value']]
        return results

    def test_spacy_extractions(self):

        # Date extractor
        for t in self.ground_truth['date']:
            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['text']))

            extraction_config = {'field_name': 'posting_date'}
            d = {'simple_tokens': crf_tokens}

            extracted_dates = self.c.extract_using_spacy(d, extraction_config)

            extracted_dates = [date['value'] for date in extracted_dates]

            correct_dates = t['extracted']

            self.assertEquals(extracted_dates, correct_dates)

        # Age extractor
        for t in self.ground_truth['age']:

            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['text']))

            extraction_config = {'field_name': 'age'}
            d = {'simple_tokens': crf_tokens}

            extracted_ages = self.c.extract_using_spacy(d, extraction_config)

            extracted_ages = [match['value'] for match in extracted_ages]

            if len(extracted_ages) == 0 and len(t['correct']) == 0:
                self.assertFalse(extracted_ages)

            self.assertEquals(sorted(extracted_ages), sorted(t['correct']))

        # Social media extractor
        for t in self.ground_truth['social_media']:

            for social_media in t['correct']:
                t['correct'][social_media] = [
                    h.lower() for h in t['correct'][social_media]
                ]

            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['text']))

            extraction_config = {'field_name': 'social_media'}
            d = {'simple_tokens': crf_tokens}

            extracted_social_media_handles = self.c.extract_using_spacy(
                d, extraction_config)

            extracted_handles = dict()

            for match in extracted_social_media_handles:
                social_network = match['metadata']['social_network']
                if social_network not in extracted_handles:
                    extracted_handles[social_network] = [match['value']]
                else:
                    extracted_handles[social_network].append(match['value'])

            if len(extracted_social_media_handles) == 0 and len(
                    t['correct']) == 0:
                self.assertFalse(extracted_social_media_handles)

            self.assertEquals(extracted_handles, t['correct'])

        # Address extractor
        for t in self.ground_truth['address']:
            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['text']))

            extraction_config = {'field_name': 'address'}
            d = {'simple_tokens': crf_tokens}

            extracted_addresses = self.c.extract_using_spacy(
                d, extraction_config)

            extracted_addresses = [
                address['value'] for address in extracted_addresses
            ]

            correct_addresses = t['extracted']

            self.assertEquals(extracted_addresses, correct_addresses)

        # Extract using config

        # Date extractor
        for t in self.ground_truth['date']:
            r = self.c.process(t)
            if 'knowledge_graph' in r:
                extracted_dates = self.create_list_from_kg(
                    r["knowledge_graph"]['posting_date'])
            else:
                extracted_dates = []

            correct_dates = t['extracted']

            self.assertEquals(extracted_dates, correct_dates)

        # Age extractor
        for t in self.ground_truth['age']:
            r = self.c.process(t)
            if 'knowledge_graph' in r:
                extracted_ages = self.create_list_from_kg(
                    r["knowledge_graph"]['age'])
            else:
                extracted_ages = []

            self.assertEquals(sorted(extracted_ages), sorted(t['correct']))

        # Email extractor
        for t in self.ground_truth['email']:
            r = self.c.process(t)
            if 'knowledge_graph' in r:
                extracted_ages = self.create_list_from_kg(
                    r["knowledge_graph"]['email'])
            else:
                extracted_ages = []
            self.assertEquals(sorted(extracted_ages), sorted(t['correct']))

        # Social media extractor
        for t in self.ground_truth['social_media']:
            for social_media in t['correct']:
                t['correct'][social_media] = [
                    h.lower() for h in t['correct'][social_media]
                ]
            extracted_social_media_handles = self.c.process(t)
            if 'knowledge_graph' in extracted_social_media_handles:
                extracted_social_media_handles = self.create_list_from_social_media(
                    extracted_social_media_handles["knowledge_graph"]
                    ['social_media'])
            else:
                extracted_social_media_handles = {}

            if len(extracted_social_media_handles) == 0 and len(
                    t['correct']) == 0:
                self.assertFalse(extracted_social_media_handles)
            self.assertEquals(extracted_social_media_handles, t['correct'])

        # Address extractor
        for t in self.ground_truth['address']:
            r = self.c.process(t)
            if 'knowledge_graph' in r:
                extracted_addresses = self.create_list_from_kg(
                    r["knowledge_graph"]['address'])
            else:
                extracted_addresses = []

            correct_addresses = t['extracted']
            self.assertEquals(extracted_addresses.sort(),
                              correct_addresses.sort())

    def test_spacy_date(self):
        doc = {
            "url": "http://date.test.com",
            "doc_id": "12344",
            "content_extraction": {
                "useful_text": {
                    "text": u"Alert: Tue, 2006-02-07"
                }
            }
        }
        e_config = {
            "document_id":
            "doc_id",
            'data_extraction': [{
                "fields": {
                    "event_date": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {
                                    "post_filter": "parse_date"
                                }
                            }
                        }
                    }
                },
                "input_path": ["content_extraction.useful_text.text.`parent`"]
            }]
        }
        core = Core(extraction_config=e_config)
        r = core.process(doc)
        kg = r['knowledge_graph']
        self.assertTrue('event_date' in kg)
        self.assertEqual(kg['event_date'][0]['value'], '2006-02-07T00:00:00')
# 예제 #5 (Example #5)
# 0
    def test_extraction_input_path(self):
        """Infer a custom-spacy rule from positive examples and check it
        matches one of the two acceptable variants (they differ only in
        whether the punctuation token is required)."""
        core = Core()
        doc = dict()
        # Tokenize with original casing preserved.
        doc['simple_tokens_original_case'] = core.extract_tokens_from_crf(
            core.extract_crftokens(self.obj['test_text'], lowercase=False))

        # Unescape each example token (Py2 'string_escape' codec) and drop
        # empty entries.
        positive_examples = []
        for example in self.obj['positive_examples']:
            positive_examples.append([
                tok.decode('string_escape').decode("utf-8")
                for tok in example if tok
            ])

        inferred_rule = core.infer_rule_using_custom_spacy(
            doc, positive_examples)

        # Both acceptable rules share the same single-token pattern except
        # for the "is_required" flag, so build one base and derive the other.
        optional_pattern = {
            "shapes": [],
            "prefix": "",
            "is_in_output": "true",
            "capitalization": [],
            "part_of_speech": [],
            "maximum": "",
            "match_all_forms": "true",
            "length": [],
            "minimum": "",
            "numbers": [],
            "contain_digit": "true",
            "is_in_vocabulary": "",
            "is_out_of_vocabulary": "",
            "is_required": "false",
            "type": "punctuation",
            "token": [";"],
            "suffix": ""
        }
        required_pattern = dict(optional_pattern, is_required="true")

        expected_rules = [
            {
                "polarity": "true",
                "description": "",
                "pattern": [pattern],
                "output_format": "",
                "is_active": "true",
                "dependencies": [],
                "identifier": "infer_rule"
            }
            for pattern in (optional_pattern, required_pattern)
        ]
        self.assertIn(inferred_rule, expected_rules)
# 예제 #6 (Example #6)
# 0
class TestCustomSpacyNameExtraction(unittest.TestCase):
    """Tests for Core.extract_using_custom_spacy.

    Each numbered case pairs one name-extraction rule with an input text
    and the names the rule is expected to extract from it.
    """

    def setUp(self):
        self.c = Core()

        # "my name/names [is] <ProperNoun>"
        rule_01 = {
            "identifier": "name_rule_01",
            "description": "my name/names is",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["my"]),
                word_token(token=["name", "names"]),
                word_token(token=["is"], is_required="false"),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }

        # "i am <Name>"
        rule_02 = {
            "identifier": "name_rule_02",
            "description": "i am",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["i"]),
                word_token(token=["am"]),
                word_token(capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }

        # "name : <anything>"
        rule_03 = {
            "identifier": "name_rule_03",
            "description": "name : Sara",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["name"]),
                punctuation_token(token=[":"]),
                word_token(token=[], is_in_output="true"),
            ]
        }

        # "it is <ProperNoun>"
        rule_04 = {
            "identifier": "name_rule_04",
            "description": "it is Jessicala",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["it"]),
                word_token(token=["is"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }

        # "this is <ProperNoun>"
        rule_05 = {
            "identifier": "name_rule_05",
            "description": "this is",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["this"]),
                word_token(token=["is"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }

        # "i'm <ProperNoun>" — the contraction tokenizes as i / ' / m.
        rule_06 = {
            "identifier": "name_rule_06",
            "description": "i'm",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["i"]),
                punctuation_token(token=["'"]),
                word_token(token=["m"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }

        # "it's <ProperNoun>" — tokenizes as it / ' / s.
        rule_07 = {
            "identifier": "name_rule_07",
            "description": "it's",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["it"]),
                punctuation_token(token=["'"]),
                word_token(token=["s"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }

        # "<Name> (ddd" or "<Name> [ddd" — name before a bracketed area code.
        rule_08 = {
            "identifier": "name_rule_08",
            "description": "name followed by telephone number[123]",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(capitalization=["title"], is_in_output="true"),
                punctuation_token(token=["(", "["]),
                shape_token(shape=["ddd"])
            ]
        }

        # "<Name> dddddddddd" — name before a bare ten-digit number.
        rule_09 = {
            "identifier": "name_rule_09",
            "description": "name followed by telephone number 7135975313",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(capitalization=["title", "upper"],
                           is_in_output="true"),
                shape_token(shape=["dddddddddd"])
            ]
        }

        text_01 = u"Hi Gentlemen, My name is Ashley . my name Monica I am the one and, My names is Alanda"
        text_02 = u"I'm Ashley I'm bored i am Alison, I am Gimly"
        text_03 = u"Name : Sara . I am the one and, Name: JILL , Name:Jessie"
        text_04 = u"Hello guy's, it's Jessica here from the #@%%% Spa. I cant say the name on here, and it is Jessica, " \
                  u"and it is cold"
        text_05 = u"this is Legolas I'm bored This is Danaerys  This is AshleyC"
        text_06 = text_02
        text_07 = text_04
        text_08 = u"Ashley (702)628-9035 XOXO . Aslll (702) 628-9035 XOXO Alppp 7026289035"
        text_09 = text_08

        # key -> (input text, rule to apply, expected extracted names).
        # Replaces nine copy-pasted assignment blocks from the original;
        # the expected match count always equals len(expected names).
        cases = {
            '1': (text_01, rule_01, ['Ashley', 'Alanda', 'Monica']),
            '2': (text_02, rule_02, ['Alison', 'Gimly']),
            '3': (text_03, rule_03, ['Sara', 'JILL', 'Jessie']),
            '4': (text_04, rule_04, ['Jessica']),
            '5': (text_05, rule_05, ['Legolas', 'Danaerys']),
            '6': (text_06, rule_06, ['Ashley']),
            '7': (text_07, rule_07, ['Jessica']),
            '8': (text_08, rule_08, ['Ashley', 'Aslll']),
            '9': (text_09, rule_09, ['Alppp']),
        }

        self.data = dict()
        self.expected_data = dict()
        for key, (text, rule, expected) in cases.items():
            self.data[key] = {'text': text, 'rules': {"rules": [rule]}}
            self.expected_data[key] = {
                'length': len(expected),
                'results': expected
            }

    def test_rules(self):
        """Run every rule against its text; check match count and values."""
        for key in self.data.keys():
            d = dict()
            d['text'] = self.data[key]['text']
            # Keep original casing so capitalization constraints can fire.
            d['simple_tokens_original_case'] = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(d['text'], lowercase=False))
            config = dict()
            config['field_name'] = 'name'
            results = self.c.extract_using_custom_spacy(
                d, config, field_rules=self.data[key]['rules'])
            # assertEqual/assertIn report the mismatching values on failure,
            # unlike the original bare assertTrue wrappers.
            self.assertEqual(len(results), self.expected_data[key]['length'])
            for r in results:
                self.assertIn(r['value'],
                              self.expected_data[key]['results'])