class TestExtractionsUsingRegex(unittest.TestCase):
    """Tests the stand-alone spacy date/age extractors against .jl ground truth.

    Each ground-truth file holds one JSON object per line with the input
    text under 'content' and the expected values under 'extracted' (dates)
    or 'correct' (ages).
    """

    def setUp(self):
        self.c = Core(load_spacy=True)
        self.doc = dict()
        base_dir = os.path.dirname(__file__)
        ground_truth_files = {
            'age': os.path.join(base_dir, "ground_truth/age.jl"),
            'date': os.path.join(base_dir, "ground_truth/date.jl"),
        }
        for key, file_path in ground_truth_files.items():
            self.doc[key] = []
            # 'with' guarantees the handle is closed even if json.loads raises
            # (the original used bare open()/close()).
            with open(file_path, 'r') as f:
                for line in f:
                    if line.strip():  # tolerate a trailing blank line
                        self.doc[key].append(json.loads(line))

    def test_extraction_from_date_spacy(self):
        """Extracted date values must equal the ground truth, in order."""
        for t in self.doc['date']:
            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['content']))
            extracted_dates = spacy_date_extractor.extract(
                self.c.nlp, self.c.matchers['date'], crf_tokens)
            extracted_dates = [date['value'] for date in extracted_dates]
            correct_dates = t['extracted']
            # assertEqual: assertEquals is a deprecated alias.
            self.assertEqual(extracted_dates, correct_dates)

    def test_extraction_from_age_spacy(self):
        """Every extracted age must be one of the ground-truth ages."""
        for t in self.doc['age']:
            extracted_ages = spacy_age_extractor.extract(
                t['content'], self.c.nlp, self.c.matchers['age'])
            extracted_ages = [age['value'] for age in extracted_ages]
            for extracted_age in extracted_ages:
                # BUG FIX: the original did
                #   if extracted_age == correct_age:
                #       self.assertTrue(extracted_age, correct_age)
                # which can never fail (assertTrue's second argument is just
                # the failure message).  Assert the real contract instead.
                self.assertIn(extracted_age, t['correct'])
class TestExtractionsUsingSpacy(unittest.TestCase):
    """Exercises Core.extract_using_spacy both called directly on tokenized
    text and driven through an extraction config via Core.process, for the
    posting_date, age, social_media and address fields.

    NOTE(review): this class name is reused by later classes in this file;
    unittest only collects the last definition, so earlier ones are shadowed.
    Consider giving each class a unique name.
    """

    # Fields that get an empty-config extract_using_spacy extractor.
    _SPACY_FIELDS = ("posting_date", "age", "social_media", "address")

    def setUp(self):
        # One extract_using_spacy entry per field; the comprehension builds a
        # fresh dict per field so nothing is aliased between fields.
        e_config = {
            'data_extraction': [{
                'input_path': 'text.`parent`',
                'fields': {
                    field: {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    }
                    for field in self._SPACY_FIELDS
                }
            }]
        }
        self.c = Core(extraction_config=e_config, load_spacy=True)

        self.ground_truth = dict()
        base_dir = os.path.dirname(__file__)
        ground_truth_files = {
            "age": os.path.join(base_dir, "ground_truth/age.jl"),
            "date": os.path.join(base_dir, "ground_truth/date.jl"),
            "social_media": os.path.join(base_dir,
                                         "ground_truth/social_media.jl"),
            "address": os.path.join(base_dir, "ground_truth/address.jl"),
        }
        # Each .jl file holds one JSON test case per line.
        for extractor, file_name in ground_truth_files.items():
            with open(file_name, 'r') as f:
                test_data = f.read().split('\n')
            self.ground_truth[extractor] = list()
            for test_case in test_data:
                self.ground_truth[extractor].append(json.loads(test_case))

    def _tokens(self, text):
        """Tokenize *text* the same way the extraction pipeline does."""
        return self.c.extract_tokens_from_crf(self.c.extract_crftokens(text))

    @staticmethod
    def _group_by_network(matches):
        """Group extracted handle values into {social_network: [values]}."""
        grouped = dict()
        for match in matches:
            grouped.setdefault(match['metadata']['social_network'],
                               []).append(match['value'])
        return grouped

    def test_spacy_extractions(self):
        # ---- direct extractor calls ------------------------------------
        # Date extractor: order-sensitive comparison against 'extracted'.
        for t in self.ground_truth['date']:
            d = {'simple_tokens': self._tokens(t['text'])}
            extracted_dates = self.c.extract_using_spacy(
                d, {'field_name': 'posting_date'})
            extracted_dates = [date['value'] for date in extracted_dates]
            self.assertEqual(extracted_dates, t['extracted'])

        # Age extractor: order-insensitive comparison against 'correct'.
        for t in self.ground_truth['age']:
            d = {'simple_tokens': self._tokens(t['text'])}
            extracted_ages = self.c.extract_using_spacy(
                d, {'field_name': 'age'})
            extracted_ages = [match['value'] for match in extracted_ages]
            if len(extracted_ages) == 0 and len(t['correct']) == 0:
                self.assertFalse(extracted_ages)
            self.assertEqual(sorted(extracted_ages), sorted(t['correct']))

        # Social media extractor: handles are compared lower-cased, grouped
        # by social network.
        for t in self.ground_truth['social_media']:
            for social_media in t['correct']:
                t['correct'][social_media] = [
                    h.lower() for h in t['correct'][social_media]
                ]
            d = {'simple_tokens': self._tokens(t['text'])}
            extracted_social_media_handles = self.c.extract_using_spacy(
                d, {'field_name': 'social_media'})
            extracted_handles = self._group_by_network(
                extracted_social_media_handles)
            if len(extracted_social_media_handles) == 0 and len(
                    t['correct']) == 0:
                self.assertFalse(extracted_social_media_handles)
            self.assertEqual(extracted_handles, t['correct'])

        # Address extractor: order-sensitive comparison.
        for t in self.ground_truth['address']:
            d = {'simple_tokens': self._tokens(t['text'])}
            extracted_addresses = self.c.extract_using_spacy(
                d, {'field_name': 'address'})
            extracted_addresses = [
                address['value'] for address in extracted_addresses
            ]
            self.assertEqual(extracted_addresses, t['extracted'])

        # ---- config-driven extraction through Core.process -------------
        # Date extractor.
        for t in self.ground_truth['date']:
            r = self.c.process(t)
            if 'data_extraction' in r:
                extracted_dates = [
                    x['value'] for x in r['data_extraction']['posting_date']
                    ['extract_using_spacy']['results']
                ]
            else:
                extracted_dates = []
            self.assertEqual(extracted_dates, t['extracted'])

        # Age extractor.
        for t in self.ground_truth['age']:
            r = self.c.process(t)
            if 'data_extraction' in r:
                extracted_ages = [
                    x['value'] for x in r['data_extraction']['age']
                    ['extract_using_spacy']['results']
                ]
            else:
                extracted_ages = []
            self.assertEqual(sorted(extracted_ages), sorted(t['correct']))

        # Social media extractor (lower-casing is idempotent, so re-running
        # it on t['correct'] here is harmless).
        for t in self.ground_truth['social_media']:
            for social_media in t['correct']:
                t['correct'][social_media] = [
                    h.lower() for h in t['correct'][social_media]
                ]
            r = self.c.process(t)
            if 'data_extraction' in r:
                extracted_social_media_handles = list(
                    r['data_extraction']['social_media']
                    ['extract_using_spacy']['results'])
            else:
                extracted_social_media_handles = []
            extracted_handles = self._group_by_network(
                extracted_social_media_handles)
            if len(extracted_social_media_handles) == 0 and len(
                    t['correct']) == 0:
                self.assertFalse(extracted_social_media_handles)
            self.assertEqual(extracted_handles, t['correct'])

        # Address extractor.
        for t in self.ground_truth['address']:
            r = self.c.process(t)
            if 'data_extraction' in r:
                extracted_addresses = [
                    x['value'] for x in r['data_extraction']['address']
                    ['extract_using_spacy']['results']
                ]
            else:
                extracted_addresses = []
            self.assertEqual(extracted_addresses, t['extracted'])
class TestExtractionsUsingSpacy(unittest.TestCase):
    """Tests Core.extract_using_custom_spacy with hand-written name rules
    (my name is / i am / name: / it is / this is / i'm / it's / name+phone).

    NOTE(review): this class name duplicates other classes in this file;
    unittest keeps only the last definition, shadowing the others.
    """

    def setUp(self):
        self.c = Core()
        self.ground_truth = dict()
        base_dir = os.path.dirname(__file__)
        ground_truth_files = {
            "name":
                os.path.join(base_dir, "ground_truth/name_my_name_1.jl"),
            "name_i_am_2":
                os.path.join(base_dir, "ground_truth/name_i_am_2.jl"),
            "name_name_3":
                os.path.join(base_dir, "ground_truth/name_name_3.jl"),
            "name_it_is_4":
                os.path.join(base_dir, "ground_truth/name_it_is_4.jl"),
            "name_this_is_5":
                os.path.join(base_dir, "ground_truth/name_this_is_5.jl"),
            "name_im_6":
                os.path.join(base_dir, "ground_truth/name_im_6.jl"),
            "name_its_7":
                os.path.join(base_dir, "ground_truth/name_its_7.jl"),
            "name_teleph_number_split_8":
                os.path.join(base_dir,
                             "ground_truth/name_teleph_number_split_8.jl"),
            "name_teleph_number_9":
                os.path.join(base_dir,
                             "ground_truth/name_teleph_number_9.jl"),
        }
        # Each .jl file holds one JSON test case per line:
        # {"text": ..., "extracted": [...]}.
        for extractor, file_name in ground_truth_files.items():
            with open(file_name, 'r') as f:
                test_data = f.read().split('\n')
            self.ground_truth[extractor] = list()
            for test_case in test_data:
                self.ground_truth[extractor].append(json.loads(test_case))

    def generic_token(self, type="word", token=[], shape=[],
                      capitalization=[], part_of_speech=[], length=[],
                      prefix="", suffix="", is_followed_by_space="",
                      is_required="true", is_in_output="true",
                      is_out_of_vocabulary="", is_in_vocabulary="",
                      contain_digit=""):
        """Build one token spec for a custom-spacy field rule.

        BUG FIX: the first parameter was misspelled ``slef``; it still worked
        because all callers bind it positionally, but it broke introspection.
        The mutable ``[]`` defaults are kept deliberately: they are never
        mutated, only embedded in the returned dict.
        """
        return {
            "type": type,
            "token": token,
            "shapes": shape,
            "capitalization": capitalization,
            "part_of_speech": part_of_speech,
            "length": length,
            "prefix": prefix,
            "suffix": suffix,
            "is_followed_by_space": is_followed_by_space,
            "is_required": is_required,
            "is_in_output": is_in_output,
            "is_out_of_vocabulary": is_out_of_vocabulary,
            "is_in_vocabulary": is_in_vocabulary,
            "contain_digit": contain_digit
        }

    def word_token(self, token=[], capitalization=[], part_of_speech=[],
                   length=[], prefix="", suffix="", is_followed_by_space="",
                   is_required="true", is_in_output="false",
                   is_out_of_vocabulary="", is_in_vocabulary="",
                   contain_digit=""):
        """generic_token specialized to type='word'."""
        return self.generic_token(
            type="word", token=token, capitalization=capitalization,
            part_of_speech=part_of_speech, length=length, prefix=prefix,
            suffix=suffix, is_followed_by_space=is_followed_by_space,
            is_required=is_required, is_in_output=is_in_output,
            is_out_of_vocabulary=is_out_of_vocabulary,
            is_in_vocabulary=is_in_vocabulary, contain_digit=contain_digit)

    def punctuation_token(self, token=[], capitalization=[],
                          part_of_speech=[], length=[], prefix="", suffix="",
                          is_followed_by_space="", is_required="true",
                          is_in_output="false", is_out_of_vocabulary="",
                          is_in_vocabulary="", contain_digit=""):
        """generic_token specialized to type='punctuation'."""
        return self.generic_token(
            type="punctuation", token=token, capitalization=capitalization,
            part_of_speech=part_of_speech, length=length, prefix=prefix,
            suffix=suffix, is_followed_by_space=is_followed_by_space,
            is_required=is_required, is_in_output=is_in_output,
            is_out_of_vocabulary=is_out_of_vocabulary,
            is_in_vocabulary=is_in_vocabulary, contain_digit=contain_digit)

    def shape_token(self, shape=[], capitalization=[], part_of_speech=[],
                    length=[], prefix="", suffix="", is_followed_by_space="",
                    is_required="true", is_in_output="false",
                    is_out_of_vocabulary="", is_in_vocabulary="",
                    contain_digit=""):
        """generic_token specialized to type='shape' (e.g. 'ddd')."""
        return self.generic_token(
            type="shape", shape=shape, capitalization=capitalization,
            part_of_speech=part_of_speech, length=length, prefix=prefix,
            suffix=suffix, is_followed_by_space=is_followed_by_space,
            is_required=is_required, is_in_output=is_in_output,
            is_out_of_vocabulary=is_out_of_vocabulary,
            is_in_vocabulary=is_in_vocabulary, contain_digit=contain_digit)

    def _assert_rules_extract(self, ground_truth_key, field_rules):
        """Run *field_rules* over every ground-truth record under
        *ground_truth_key* and compare the extracted values in order.

        This factors out the identical loop the nine rule tests shared.
        """
        for t in self.ground_truth[ground_truth_key]:
            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['text'], lowercase=False))
            extraction_config = {'field_name': 'my_name_is'}
            d = {'simple_tokens_original_case': crf_tokens}
            extracted_names = self.c.extract_using_custom_spacy(
                d, extraction_config, field_rules=field_rules)
            extracted_names = [name['value'] for name in extracted_names]
            # assertEqual: assertEquals is a deprecated alias.
            self.assertEqual(extracted_names, t['extracted'])

    # 1. my name / names is
    def test_rule_my_name(self):
        field_rules = {
            "rules": [{
                "identifier": "name_rule_01",
                "description": "a description",
                # NOTE(review): this rule is marked inactive while the other
                # eight are active -- confirm that is intentional.
                "is_active": "false",
                "polarity": [],
                "pattern": [
                    self.word_token(token=["my"]),
                    self.word_token(token=["name", "names"]),
                    self.word_token(token=["is"], is_required="false"),
                    self.word_token(capitalization=["title", "upper"],
                                    is_in_output="true")
                ]
            }]
        }
        self._assert_rules_extract('name', field_rules)

    # 2. i am
    def test_rule_i_am(self):
        field_rules = {
            "rules": [{
                "identifier": "name_rule_02",
                "description": "a description",
                "is_active": "true",
                "polarity": [],
                "pattern": [
                    self.word_token(token=["i"]),
                    self.word_token(token=["am"]),
                    self.word_token(capitalization=["title", "upper"],
                                    is_in_output="true")
                ]
            }]
        }
        self._assert_rules_extract('name_i_am_2', field_rules)

    # 3. name : / Name:
    def test_rule_name_(self):
        field_rules = {
            "rules": [{
                "identifier": "name_rule_03",
                "description": "a description",
                "is_active": "true",
                "polarity": [],
                "pattern": [
                    self.word_token(token=["name"]),
                    self.punctuation_token(token=[":"]),
                    self.word_token(token=[], is_in_output="true"),
                ]
            }]
        }
        self._assert_rules_extract('name_name_3', field_rules)

    # 4. it is
    def test_rule_it_is(self):
        field_rules = {
            "rules": [{
                "identifier": "name_rule_04",
                "description": "a description",
                "is_active": "true",
                "polarity": [],
                "pattern": [
                    self.word_token(token=["it"]),
                    self.word_token(token=["is"]),
                    # matches on POS rather than capitalization
                    self.word_token(part_of_speech=["proper noun"],
                                    is_in_output="true")
                ]
            }]
        }
        self._assert_rules_extract('name_it_is_4', field_rules)

    # 5. this is / This is
    def test_rule_this_is(self):
        field_rules = {
            "rules": [{
                "identifier": "name_rule_05",
                "description": "a description",
                "is_active": "true",
                "polarity": [],
                "pattern": [
                    self.word_token(token=["this"]),
                    self.word_token(token=["is"]),
                    self.word_token(part_of_speech=["proper noun"],
                                    capitalization=["title", "mixed",
                                                    "upper"],
                                    is_in_output="true")
                ]
            }]
        }
        self._assert_rules_extract('name_this_is_5', field_rules)

    # 6. I'm
    def test_rule_Im(self):
        field_rules = {
            "rules": [{
                "identifier": "name_rule_06",
                "description": "a description",
                "is_active": "true",
                "polarity": [],
                "pattern": [
                    self.word_token(token=["i"]),
                    self.punctuation_token(token=["'"]),
                    self.word_token(token=["m"]),
                    self.word_token(part_of_speech=["proper noun"],
                                    capitalization=["title", "mixed",
                                                    "upper"],
                                    is_in_output="true")
                ]
            }]
        }
        self._assert_rules_extract('name_im_6', field_rules)

    # 7. it's
    def test_rule_its(self):
        field_rules = {
            "rules": [{
                "identifier": "name_rule_07",
                "description": "a description",
                "is_active": "true",
                "polarity": [],
                "pattern": [
                    self.word_token(token=["it"]),
                    self.punctuation_token(token=["'"]),
                    self.word_token(token=["s"]),
                    self.word_token(part_of_speech=["proper noun"],
                                    capitalization=["title", "mixed",
                                                    "upper"],
                                    is_in_output="true")
                ]
            }]
        }
        self._assert_rules_extract('name_its_7', field_rules)

    # 8. Ashley (702) -- title-cased name directly before a bracketed area code
    def test_rule_teleph_number_split(self):
        field_rules = {
            "rules": [{
                "identifier": "name_rule_08",
                "description": "a description",
                "is_active": "true",
                "polarity": [],
                "pattern": [
                    self.word_token(capitalization=["title"],
                                    is_in_output="true"),
                    self.punctuation_token(token=["(", "["]),
                    self.shape_token(shape=["ddd"])
                ]
            }]
        }
        self._assert_rules_extract('name_teleph_number_split_8', field_rules)

    # 9. Jessica 7135975313 -- name directly before a 10-digit number
    def test_rule_teleph_number(self):
        field_rules = {
            "rules": [{
                "identifier": "name_rule_09",
                "description": "a description",
                "is_active": "true",
                "polarity": [],
                "pattern": [
                    self.word_token(capitalization=["title", "upper",
                                                    "mixed"],
                                    is_in_output="true"),
                    self.shape_token(shape=["dddddddddd"])
                ]
            }]
        }
        self._assert_rules_extract('name_teleph_number_9', field_rules)
class TestExtractionsUsingSpacy(unittest.TestCase):
    """Exercises Core.extract_using_spacy directly and through Core.process,
    reading the config-driven results out of the knowledge graph, for the
    posting_date, age, social_media, address and email fields.

    NOTE(review): earlier classes in this file share this name; unittest only
    collects the last definition, so this is the one that actually runs.
    """

    # Fields that get an empty-config extract_using_spacy extractor.
    _SPACY_FIELDS = ("posting_date", "age", "social_media", "address",
                     "email")

    def setUp(self):
        # One extract_using_spacy entry per field; the comprehension builds a
        # fresh dict per field so nothing is aliased between fields.
        e_config = {
            'data_extraction': [{
                'input_path': 'text.`parent`',
                'fields': {
                    field: {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    }
                    for field in self._SPACY_FIELDS
                }
            }]
        }
        self.c = Core(extraction_config=e_config, load_spacy=True)

        self.ground_truth = dict()
        base_dir = os.path.dirname(__file__)
        ground_truth_files = {
            "age": os.path.join(base_dir, "ground_truth/age.jl"),
            "date": os.path.join(base_dir, "ground_truth/date.jl"),
            "social_media": os.path.join(base_dir,
                                         "ground_truth/social_media.jl"),
            "address": os.path.join(base_dir, "ground_truth/address.jl"),
            "email": os.path.join(base_dir, "ground_truth/email.jl"),
        }
        # Each .jl file holds one JSON test case per line.
        for extractor, file_name in ground_truth_files.items():
            with open(file_name, 'r') as f:
                test_data = f.read().split('\n')
            self.ground_truth[extractor] = list()
            for test_case in test_data:
                self.ground_truth[extractor].append(json.loads(test_case))

    @staticmethod
    def create_list_from_kg(extractions):
        """Flatten knowledge-graph entries into the list of provenance
        'extracted_value's (provenance may be a single dict or a list)."""
        results = list()
        for e in extractions:
            ps = e['provenance']
            if not isinstance(ps, list):
                ps = [ps]
            for p in ps:
                results.append(p['extracted_value'])
        return results

    @staticmethod
    def create_list_from_social_media(extractions):
        """Map social_network -> [extracted_value] from KG provenance.

        NOTE(review): a later provenance with the same network overwrites the
        earlier one (assignment, not append) -- presumably each network
        appears once per document; verify before relying on multi-handle
        documents.
        """
        results = dict()
        for e in extractions:
            ps = e['provenance']
            if not isinstance(ps, list):
                ps = [ps]
            for p in ps:
                x = p['qualifiers']['social_network']
                results[x] = [p['extracted_value']]
        return results

    def _tokens(self, text):
        """Tokenize *text* the same way the extraction pipeline does."""
        return self.c.extract_tokens_from_crf(self.c.extract_crftokens(text))

    @staticmethod
    def _group_by_network(matches):
        """Group directly-extracted handles into {social_network: [values]}."""
        grouped = dict()
        for match in matches:
            grouped.setdefault(match['metadata']['social_network'],
                               []).append(match['value'])
        return grouped

    def test_spacy_extractions(self):
        # ---- direct extractor calls ------------------------------------
        # Date extractor: order-sensitive comparison against 'extracted'.
        for t in self.ground_truth['date']:
            d = {'simple_tokens': self._tokens(t['text'])}
            extracted_dates = self.c.extract_using_spacy(
                d, {'field_name': 'posting_date'})
            extracted_dates = [date['value'] for date in extracted_dates]
            self.assertEqual(extracted_dates, t['extracted'])

        # Age extractor: order-insensitive comparison against 'correct'.
        for t in self.ground_truth['age']:
            d = {'simple_tokens': self._tokens(t['text'])}
            extracted_ages = self.c.extract_using_spacy(
                d, {'field_name': 'age'})
            extracted_ages = [match['value'] for match in extracted_ages]
            if len(extracted_ages) == 0 and len(t['correct']) == 0:
                self.assertFalse(extracted_ages)
            self.assertEqual(sorted(extracted_ages), sorted(t['correct']))

        # Social media extractor: handles compared lower-cased, per network.
        for t in self.ground_truth['social_media']:
            for social_media in t['correct']:
                t['correct'][social_media] = [
                    h.lower() for h in t['correct'][social_media]
                ]
            d = {'simple_tokens': self._tokens(t['text'])}
            extracted_social_media_handles = self.c.extract_using_spacy(
                d, {'field_name': 'social_media'})
            extracted_handles = self._group_by_network(
                extracted_social_media_handles)
            if len(extracted_social_media_handles) == 0 and len(
                    t['correct']) == 0:
                self.assertFalse(extracted_social_media_handles)
            self.assertEqual(extracted_handles, t['correct'])

        # Address extractor: order-sensitive comparison.
        for t in self.ground_truth['address']:
            d = {'simple_tokens': self._tokens(t['text'])}
            extracted_addresses = self.c.extract_using_spacy(
                d, {'field_name': 'address'})
            extracted_addresses = [
                address['value'] for address in extracted_addresses
            ]
            self.assertEqual(extracted_addresses, t['extracted'])

        # ---- config-driven extraction read from the knowledge graph ----
        # Date extractor.
        for t in self.ground_truth['date']:
            r = self.c.process(t)
            if 'knowledge_graph' in r:
                extracted_dates = self.create_list_from_kg(
                    r["knowledge_graph"]['posting_date'])
            else:
                extracted_dates = []
            self.assertEqual(extracted_dates, t['extracted'])

        # Age extractor.
        for t in self.ground_truth['age']:
            r = self.c.process(t)
            if 'knowledge_graph' in r:
                extracted_ages = self.create_list_from_kg(
                    r["knowledge_graph"]['age'])
            else:
                extracted_ages = []
            self.assertEqual(sorted(extracted_ages), sorted(t['correct']))

        # Email extractor (the original reused the name 'extracted_ages').
        for t in self.ground_truth['email']:
            r = self.c.process(t)
            if 'knowledge_graph' in r:
                extracted_emails = self.create_list_from_kg(
                    r["knowledge_graph"]['email'])
            else:
                extracted_emails = []
            self.assertEqual(sorted(extracted_emails), sorted(t['correct']))

        # Social media extractor (lower-casing is idempotent, so re-running
        # it on t['correct'] here is harmless).
        for t in self.ground_truth['social_media']:
            for social_media in t['correct']:
                t['correct'][social_media] = [
                    h.lower() for h in t['correct'][social_media]
                ]
            r = self.c.process(t)
            if 'knowledge_graph' in r:
                extracted_social_media_handles = \
                    self.create_list_from_social_media(
                        r["knowledge_graph"]['social_media'])
            else:
                extracted_social_media_handles = {}
            if len(extracted_social_media_handles) == 0 and len(
                    t['correct']) == 0:
                self.assertFalse(extracted_social_media_handles)
            self.assertEqual(extracted_social_media_handles, t['correct'])

        # Address extractor.
        for t in self.ground_truth['address']:
            r = self.c.process(t)
            if 'knowledge_graph' in r:
                extracted_addresses = self.create_list_from_kg(
                    r["knowledge_graph"]['address'])
            else:
                extracted_addresses = []
            correct_addresses = t['extracted']
            # BUG FIX: the original asserted
            #   assertEquals(extracted_addresses.sort(),
            #                correct_addresses.sort())
            # but list.sort() returns None, so it compared None == None and
            # could never fail.  Compare the sorted copies instead.
            self.assertEqual(sorted(extracted_addresses),
                             sorted(correct_addresses))

    def test_spacy_date(self):
        """A date in free text reaches the KG ISO-formatted via parse_date."""
        doc = {
            "url": "http://date.test.com",
            "doc_id": "12344",
            "content_extraction": {
                "useful_text": {
                    "text": u"Alert: Tue, 2006-02-07"
                }
            }
        }
        e_config = {
            "document_id": "doc_id",
            'data_extraction': [{
                "fields": {
                    "event_date": {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {
                                    "post_filter": "parse_date"
                                }
                            }
                        }
                    }
                },
                "input_path": ["content_extraction.useful_text.text.`parent`"]
            }]
        }
        core = Core(extraction_config=e_config)
        r = core.process(doc)
        kg = r['knowledge_graph']
        self.assertIn('event_date', kg)
        self.assertEqual(kg['event_date'][0]['value'], '2006-02-07T00:00:00')
def test_extraction_input_path(self):
    # Infers a custom-spacy rule from positive examples and checks it is one
    # of the two acceptable inferred rules (they differ only in whether the
    # ';' token is required).
    c = Core()
    d = dict()
    # Tokenize the fixture text, preserving the original case.
    # NOTE(review): self.obj must be set up elsewhere in this class with
    # 'test_text' and 'positive_examples' keys -- not visible in this chunk.
    d['simple_tokens_original_case'] = c.extract_tokens_from_crf(
        c.extract_crftokens(self.obj['test_text'], lowercase=False))
    # Unescape each non-empty example token and decode it to unicode.
    # NOTE(review): the 'string_escape' codec is Python 2 only; this line
    # will raise LookupError on Python 3 -- confirm the intended runtime.
    p_filtered = [[
        x.decode('string_escape').decode("utf-8") for x in pp if x
    ] for pp in self.obj['positive_examples']]
    infered_rule = c.infer_rule_using_custom_spacy(d, p_filtered)
    # Both acceptable results describe a single ';' punctuation token with
    # contain_digit "true"; only "is_required" differs between them.
    expected_result = [{
        "polarity": "true",
        "description": "",
        "pattern": [{
            "shapes": [],
            "prefix": "",
            "is_in_output": "true",
            "capitalization": [],
            "part_of_speech": [],
            "maximum": "",
            "match_all_forms": "true",
            "length": [],
            "minimum": "",
            "numbers": [],
            "contain_digit": "true",
            "is_in_vocabulary": "",
            "is_out_of_vocabulary": "",
            "is_required": "false",
            "type": "punctuation",
            "token": [";"],
            "suffix": ""
        }],
        "output_format": "",
        "is_active": "true",
        "dependencies": [],
        "identifier": "infer_rule"
    }, {
        "polarity": "true",
        "description": "",
        "pattern": [{
            "shapes": [],
            "prefix": "",
            "is_in_output": "true",
            "capitalization": [],
            "part_of_speech": [],
            "maximum": "",
            "match_all_forms": "true",
            "length": [],
            "minimum": "",
            "numbers": [],
            "contain_digit": "true",
            "is_in_vocabulary": "",
            "is_out_of_vocabulary": "",
            "is_required": "true",
            "type": "punctuation",
            "token": [";"],
            "suffix": ""
        }],
        "output_format": "",
        "is_active": "true",
        "dependencies": [],
        "identifier": "infer_rule"
    }]
    self.assertIn(infered_rule, expected_result)
class TestCustomSpacyNameExtraction(unittest.TestCase):
    """Tests extract_using_custom_spacy name rules end-to-end: each numbered
    case pairs one rule with one text and the expected extracted names.

    The module-level helpers word_token/punctuation_token/shape_token
    (defined elsewhere in this file) build the rule token dicts.
    """

    def setUp(self):
        self.c = Core()

        # ---- the nine rules, one per extraction pattern ----------------
        rule_01 = {
            "identifier": "name_rule_01",
            "description": "my name/names is",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["my"]),
                word_token(token=["name", "names"]),
                word_token(token=["is"], is_required="false"),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_02 = {
            "identifier": "name_rule_02",
            "description": "i am",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["i"]),
                word_token(token=["am"]),
                word_token(capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_03 = {
            "identifier": "name_rule_03",
            "description": "name : Sara",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["name"]),
                punctuation_token(token=[":"]),
                word_token(token=[], is_in_output="true"),
            ]
        }
        rule_04 = {
            "identifier": "name_rule_04",
            "description": "it is Jessicala",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["it"]),
                word_token(token=["is"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_05 = {
            "identifier": "name_rule_05",
            "description": "this is",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["this"]),
                word_token(token=["is"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_06 = {
            "identifier": "name_rule_06",
            "description": "i'm",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["i"]),
                punctuation_token(token=["'"]),
                word_token(token=["m"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_07 = {
            "identifier": "name_rule_07",
            "description": "it's",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["it"]),
                punctuation_token(token=["'"]),
                word_token(token=["s"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_08 = {
            "identifier": "name_rule_08",
            "description": "name followed by telephone number[123]",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(capitalization=["title"], is_in_output="true"),
                punctuation_token(token=["(", "["]),
                shape_token(shape=["ddd"])
            ]
        }
        rule_09 = {
            "identifier": "name_rule_09",
            "description": "name followed by telephone number 7135975313",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(capitalization=["title", "upper"],
                           is_in_output="true"),
                shape_token(shape=["dddddddddd"])
            ]
        }

        # ---- the input texts -------------------------------------------
        text_01 = u"Hi Gentlemen, My name is Ashley . my name Monica I am the one and, My names is Alanda"
        text_02 = u"I'm Ashley I'm bored i am Alison, I am Gimly"
        text_03 = u"Name : Sara . I am the one and, Name: JILL , Name:Jessie"
        text_04 = u"Hello guy's, it's Jessica here from the #@%%% Spa. I cant say the name on here, and it is Jessica, " \
                  u"and it is cold"
        text_05 = u"this is Legolas I'm bored This is Danaerys This is AshleyC"
        text_06 = text_02
        text_07 = text_04
        text_08 = u"Ashley (702)628-9035 XOXO . Aslll (702) 628-9035 XOXO Alppp 7026289035"
        text_09 = text_08

        # ---- wire cases together ---------------------------------------
        # One (text, rule, expected-names) triple per case; the expected
        # result count always equals len(expected), so it is derived rather
        # than maintained by hand (the original repeated ~60 assignments).
        cases = [
            (text_01, rule_01, ['Ashley', 'Alanda', 'Monica']),
            (text_02, rule_02, ['Alison', 'Gimly']),
            (text_03, rule_03, ['Sara', 'JILL', 'Jessie']),
            (text_04, rule_04, ['Jessica']),
            (text_05, rule_05, ['Legolas', 'Danaerys']),
            (text_06, rule_06, ['Ashley']),
            (text_07, rule_07, ['Jessica']),
            (text_08, rule_08, ['Ashley', 'Aslll']),
            (text_09, rule_09, ['Alppp']),
        ]
        self.data = dict()
        self.expected_data = dict()
        for i, (text, rule, expected) in enumerate(cases, start=1):
            key = str(i)
            self.data[key] = {'text': text, 'rules': {"rules": [rule]}}
            self.expected_data[key] = {
                'length': len(expected),
                'results': expected
            }

    def test_rules(self):
        """Each rule extracts exactly the expected names from its text."""
        for key in self.data:  # iterate the dict directly, not .keys()
            d = dict()
            d['text'] = self.data[key]['text']
            d['simple_tokens_original_case'] = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(d['text'], lowercase=False))
            config = dict()
            config['field_name'] = 'name'
            results = self.c.extract_using_custom_spacy(
                d, config, field_rules=self.data[key]['rules'])
            self.assertTrue(len(results) == self.expected_data[key]['length'])
            for r in results:
                self.assertTrue(
                    r['value'] in self.expected_data[key]['results'])