def test_conflicts_equal_length_take_same_language(self):
    """Equal-length conflicting matches: an alias in the wrong language must
    lose to a same-language alias even when both candidates overlap."""
    entity_plain = entity_config(1, 'Some Entity', aliases=['Something'])
    entity_french = entity_config(
        2, 'Some Entity1',
        aliases=[entity_alias('Some Entity One', language='fr')])
    entity_other = entity_config(3, 'Some Entity2', aliases=['Something Two'])

    text = ('"Some Entity One" should not be found in this text because it is not in German language.'
            'Shorter match - "Someeee Entityyy" should be taken instead.')

    lexnlp_tests.test_extraction_func(
        ((entity_plain[1], 'Some Entity'),),
        find_dict_entities,
        text,
        all_possible_entities=[entity_plain, entity_french, entity_other],
        text_languages=['de'],
        actual_data_converter=lambda found: [
            (get_entity_name(m.entity[0]), m.entity[1][0]) for m in found],
        debug_print=True)
def test_am_pm_abbreviations():
    """'AM'/'PM' abbreviations adjacent to a time of day must be ignored,
    while a standalone parenthesized '(AM)' is still detected."""
    america = entity_config(1, 'America',
                            aliases=[entity_alias('AM', is_abbreviation=True)],
                            name_is_alias=False)
    postmodernism = entity_config(2, 'Postmodernism',
                                  aliases=[entity_alias('PM', is_abbreviation=True)],
                                  name_is_alias=False)
    configs = [america, postmodernism]

    def to_pairs(found):
        # Map raw matches to (entity name, matched alias text) pairs.
        return [(get_entity_name(m[0]), m[1][0]) for m in found]

    # Both "AM" and "PM" follow a time -> nothing should be detected.
    lexnlp_tests.test_extraction_func(
        [], find_dict_entities, 'It is 11:00 AM or 11:00 PM now.',
        all_possible_entities=configs,
        actual_data_converter=to_pairs, debug_print=True)

    # "(AM)" stands alone -> America should be detected exactly once.
    lexnlp_tests.test_extraction_func(
        [(america[1], 'AM')], find_dict_entities,
        'It is 11:00am now in (AM). Hello!',
        all_possible_entities=configs,
        actual_data_converter=to_pairs, debug_print=True)

    # Lowercase "am" glued to the time -> nothing detected.
    lexnlp_tests.test_extraction_func(
        [], find_dict_entities, 'It is 11:00am now.',
        all_possible_entities=configs,
        actual_data_converter=to_pairs, debug_print=True)
def test_plural_case_matching():
    """Stemming should map plural word forms onto their singular entity names
    (irregular plurals like 'men' are a known stemmer limitation)."""
    table = entity_config(1, 'Table',
                          aliases=[entity_alias('tbl.', is_abbreviation=True)],
                          name_is_alias=True)
    man = entity_config(2, 'man', name_is_alias=True)
    masloboyka = entity_config(3, 'masloboyka', name_is_alias=True)

    text = ('We should detect the singular number of word "tables" here - the stemmer takes care of plural case. '
            'Unfortunately our stemmer is not able to convert word "men" to singular number yet :(. '
            'But it works for word "masloboykas" - a non existing word in English in plural case.')

    # 'man' is intentionally absent: "men" is not reduced to "man" by the stemmer.
    lexnlp_tests.test_extraction_func(
        ((table[1], 'Table'), (masloboyka[1], 'masloboyka'),),
        find_dict_entities,
        text,
        all_possible_entities=[table, man, masloboyka],
        use_stemmer=True,
        actual_data_converter=lambda found: [
            (get_entity_name(m[0]), m[1][0]) for m in found],
        debug_print=True)
def test_am_pm_none(self):
    """AM/PM abbreviations next to a time are ignored; a standalone '(AM)'
    still yields exactly one 'America' match."""
    america = entity_config(1, 'America',
                            aliases=[entity_alias('AM', is_abbreviation=True)],
                            name_is_alias=False)
    postmodernism = entity_config(2, 'Postmodernism',
                                  aliases=[entity_alias('PM', is_abbreviation=True)],
                                  name_is_alias=False)
    configs = [america, postmodernism]

    # Times followed by AM/PM -> no entities at all.
    found = list(find_dict_entities('It is 11:00 AM or 11:00 PM now.',
                                    all_possible_entities=configs))
    self.assertEqual(0, len(found))

    # Parenthesized standalone abbreviation -> America matched once.
    found = list(find_dict_entities('It is 11:00am now in (AM). Hello!',
                                    all_possible_entities=configs))
    self.assertEqual(1, len(found))
    self.assertEqual('America', found[0].entity[0][1])

    # Lowercase time suffix only -> nothing detected.
    found = list(find_dict_entities('It is 11:00am now.',
                                    all_possible_entities=configs))
    self.assertEqual(0, len(found))
def test_court_config_setup():
    """
    Test setup of CourtConfig object.
    :return:
    """
    # With an explicit extra alias: the name itself is also added as an alias.
    cc = entity_config(0, 'Test Court', 0, ['Alias'])
    assert_equals(
        str(cc),
        "(0, 'Test Court', 0, [('Test Court', None, False, None), ('Alias', None, False, None)])")

    # Without extra aliases: only the name-derived alias remains.
    cc = entity_config(0, 'Test Court', 0)
    assert_equals(
        str(cc),
        "(0, 'Test Court', 0, [('Test Court', None, False, None)])")
def test_conflicts_take_longest_match():
    """Among overlapping candidate matches the longest alias must win."""
    entity_short = entity_config(1, 'Some Entity', aliases=['Something'])
    entity_one = entity_config(2, 'Some Entity One', aliases=['Something One'])
    entity_two = entity_config(3, 'Some Entity Two', aliases=['Something Two'])

    text = '"Some Entity One" should be found in this text and "Someee Entityyy" should be ignored.'

    lexnlp_tests.test_extraction_func(
        ((entity_one[1], 'Some Entity One'),),
        find_dict_entities,
        text,
        all_possible_entities=[entity_short, entity_one, entity_two],
        actual_data_converter=lambda found: [
            (get_entity_name(m[0]), m[1][0]) for m in found],
        debug_print=True)
def test_equal_aliases_in_dif_languages(self):
    """Identical alias texts on different entities must both be returned;
    alias texts are not required to be unique across entities."""
    mississippi = entity_config(1, 'Mississippi', aliases=[
        entity_alias('MS', is_abbreviation=True, language='en'),
        entity_alias('Mississippi', language='de'),
        entity_alias('Mississippi', language='en'),
    ])
    montserrat = entity_config(2, 'Montserrat', aliases=[
        entity_alias('MS', is_abbreviation=True, language='en'),
        entity_alias('Montserrat', language='de'),
        entity_alias('Montserrat', language='en'),
    ])
    canada = entity_config(3, 'Canada', aliases=[
        entity_alias('CAN', is_abbreviation=True, language='en'),
        entity_alias('Kanada', language='de'),
        entity_alias('Canada', language='en'),
    ])

    text = ('"MS" here can mean either "MMMississippi" or "MMMontserrat" because they have equal aliases in English. '
            'This test is here because in one version of the code alias texts were required to be unique. '
            '"CCCanada" (can) should not be detected because word "can" is in lowercase here.')

    lexnlp_tests.test_extraction_func(
        ((mississippi[1], 'MS'), (montserrat[1], 'MS')),
        find_dict_entities,
        text,
        all_possible_entities=[mississippi, montserrat, canada],
        text_languages=['en'],
        actual_data_converter=lambda found: [
            (get_entity_name(m.entity[0]), m.entity[1][0]) for m in found],
        debug_print=True)
def test_courts_longest_match():
    """
    Tests the case when there are courts having names/aliases being one a substring of another.
    In such case the court having the longest alias should be returned for each conflicting
    match, but when the court with the shorter alias has another independent match in the same
    conflict, both courts should be returned.
    :return:
    """
    courts_config_fn = os.path.join(
        os.path.dirname(lexnlp_tests.this_test_data_path()), 'us_courts.csv')

    courts_config_list = []
    with open(courts_config_fn, 'r', encoding='utf8') as f:
        for row in csv.DictReader(f):
            # Encode type and name into one string so the converter below
            # can split them back apart for comparison.
            aliases = row['Alias'].split(';') if row['Alias'] else []
            cc = entity_config(row['Court ID'],
                               row['Court Type'] + '|' + row['Court Name'],
                               0,
                               aliases,
                               name_is_alias=False)
            add_alias_to_entity(cc, row['Court Name'])
            courts_config_list.append(cc)

    lexnlp_tests.test_extraction_func_on_test_data(
        get_courts,
        court_config_list=courts_config_list,
        actual_data_converter=lambda found: [
            tuple(c[0][1].split('|')) for c in found],
        debug_print=True)
def test_abbreviations_simple():
    """Uppercase abbreviations are detected even with a trailing \"'s\";
    lowercase words matching an abbreviation alias are not."""
    it_entity = entity_config(1, 'ITAbbrev',
                              aliases=[entity_alias('IT', is_abbreviation=True)])
    is_entity = entity_config(2, 'ISAbbrev',
                              aliases=[entity_alias('IS', is_abbreviation=True)])

    text = ('"IT\'s" entity should be detected even with "\'s" because tokenizer takes care of this kind of things. '
            '"ISS" entity should not be detected - bacause "is" word'
            ' is in lowercase here and probably does not mean an abbreviation.')

    # NOTE(review): 'ge' is not the ISO 639-1 code for German ('de') — confirm intent.
    lexnlp_tests.test_extraction_func(
        ((it_entity[1], 'IT'),),
        find_dict_entities,
        text,
        all_possible_entities=[it_entity, is_entity],
        text_languages=['ge'],
        actual_data_converter=lambda found: [
            (get_entity_name(m[0]), m[1][0]) for m in found],
        debug_print=True)
def test_courts():
    """
    Test court extraction.
    :return:
    """
    import pandas

    # Load the master court dictionary from the published legal dictionary repo.
    url = ("https://raw.githubusercontent.com/LexPredict/lexpredict-legal-dictionary/1.0.2/en/legal/us_courts"
           ".csv")
    court_df = pandas.read_csv(url)

    # Build an entity config per court row; missing aliases become an empty list.
    court_config_list = []
    for _, row in court_df.iterrows():
        aliases = row["Alias"].split(";") if not pandas.isnull(row["Alias"]) else []
        court_config_list.append(
            entity_config(row["Court ID"], row["Court Name"], 0, aliases))

    lexnlp_tests.test_extraction_func_on_test_data(
        get_courts,
        court_config_list=court_config_list,
        actual_data_converter=lambda found: [cc[0][1] for cc in found])
def load_entities_dict():
    """Load geo entity configs from geoentities.csv and attach their aliases
    from geoaliases.csv; return the resulting entity configs."""
    entities_fn = os.path.join(
        os.path.dirname(lexnlp_tests.this_test_data_path()), 'geoentities.csv')
    aliases_fn = os.path.join(
        os.path.dirname(lexnlp_tests.this_test_data_path()), 'geoaliases.csv')

    entities = {}
    with open(entities_fn, 'r', encoding='utf8') as f:
        for row in csv.DictReader(f):
            priority = int(row['priority']) if row['priority'] else 0
            entities[row['id']] = entity_config(
                row['id'], row['name'], priority, name_is_alias=True)

    with open(aliases_fn, 'r', encoding='utf8') as f:
        for row in csv.DictReader(f):
            target = entities.get(row['entity_id'])
            if target:
                # ISO-coded and explicit abbreviation aliases are abbreviations.
                is_abbrev = (row['type'].startswith('iso')
                             or row['type'] == 'abbreviation')
                add_aliases_to_entity(target, row['alias'], row['locale'], is_abbrev)

    return entities.values()
def cache_court_config():
    """Build entity configs for all courts in the DB and store them in the cache."""
    configs = []
    for court in Court.objects.all():
        aliases = court.alias.split(';') if court.alias else []
        configs.append(dict_entities.entity_config(
            entity_id=court.id,
            name=court.name,
            priority=0,
            aliases=aliases))
    DbCache.put_to_db(CACHE_KEY_COURT_CONFIG, configs)
def test_find_dict_entities_empty_text():
    """Empty input text must yield no matches."""
    america = entity_config(1, 'America',
                            aliases=[entity_alias('AM', is_abbreviation=True)],
                            name_is_alias=False)
    assert_false(list(find_dict_entities('', [america])))
def test_common_search_all_languages():
    """A plain entity name is detected when no language filter is supplied."""
    entity = entity_config(1, 'Some Entity', aliases=['Something'])
    lexnlp_tests.test_extraction_func(
        ((entity[1], 'Some Entity'),),
        find_dict_entities,
        'Some Entity should be found in this text.',
        all_possible_entities=[entity],
        actual_data_converter=lambda found: [
            (get_entity_name(m[0]), m[1][0]) for m in found],
        debug_print=True)
def test_court_config_setup():
    """
    Test setup of CourtConfig object.
    :return:
    """
    # Alias tuples here carry a fifth, normalized-text element.
    cc = entity_config(0, 'Test Court', 0, ['Alias'])
    expected = ("(0, 'Test Court', 0, [('Test Court', None, False, None, ' test court '), "
                "('Alias', None, False, None, ' alias ')])")
    assert_equals(expected, str(cc))
def parse_courts_legacy_function(self, text: str):
    """Load the published US courts dictionary and run legacy court
    extraction over *text*, returning the extractor's result."""
    url = ("https://raw.githubusercontent.com/LexPredict/lexpredict-legal-dictionary/1.0.2/en/legal/us_courts"
           ".csv")
    court_df = pandas.read_csv(url)

    # One entity config per court row; missing aliases become an empty list.
    court_config_list = []
    for _, row in court_df.iterrows():
        aliases = row["Alias"].split(";") if not pandas.isnull(row["Alias"]) else []
        court_config_list.append(
            entity_config(row["Court ID"], row["Court Name"], 0, aliases))

    return get_courts(text, court_config_list)
def cache_geo_config():
    """Assemble geo entity configs (with their aliases) from the DB and cache them."""
    geo_config = {}
    for name, pk, priority in GeoEntity.objects.values_list('name', 'pk', 'priority'):
        geo_config[pk] = dict_entities.entity_config(
            pk, name, priority or 0, name_is_alias=True)

    alias_rows = GeoAlias.objects.values_list('pk', 'alias', 'type', 'entity', 'locale')
    for alias_id, alias_text, alias_type, entity_id, alias_lang in alias_rows:
        target = geo_config[entity_id]
        if target:
            # ISO codes and explicit abbreviations both count as abbreviations.
            is_abbrev = (alias_type.startswith('iso')
                         or alias_type.startswith('abbrev'))
            dict_entities.add_aliases_to_entity(target,
                                                aliases_csv=alias_text,
                                                language=alias_lang,
                                                is_abbreviation=is_abbrev,
                                                alias_id=alias_id)

    DbCache.put_to_db(CACHE_KEY_GEO_CONFIG, list(geo_config.values()))
def load_entities_dict_by_path(entities_fn: str, aliases_fn: str):
    """Load entity configs from *entities_fn* and attach aliases from
    *aliases_fn*; return the resulting entity configs."""
    import csv

    entities = {}
    with open(entities_fn, 'r', encoding='utf8') as f:
        for row in csv.DictReader(f):
            priority = int(row['priority']) if row['priority'] else 0
            entities[row['id']] = entity_config(
                row['id'], row['name'], priority, name_is_alias=True)

    with open(aliases_fn, 'r', encoding='utf8') as f:
        for row in csv.DictReader(f):
            target = entities.get(row['entity_id'])
            if target:
                # ISO-coded and explicit abbreviation aliases are abbreviations.
                is_abbrev = (row['type'].startswith('iso')
                             or row['type'] == 'abbreviation')
                add_aliases_to_entity(target, row['alias'], row['locale'], is_abbrev)

    return entities.values()
def test_court_config_setup_wo_alias():
    """Entity config without extra aliases keeps only the name-derived alias."""
    cc = entity_config(0, 'Test Court', 0)
    expected = "(0, 'Test Court', 0, [('Test Court', None, False, None, ' test court ')])"
    assert_equals(expected, str(cc))