def __init__(self, etk):
    """Initialise the module and build one decoder per code table.

    Each decoder is a ``DecodingValueExtractor`` constructed from a code
    table held on ``self`` and a human-readable extractor name.

    Args:
        etk: ETK instance, forwarded unchanged to ``ETKModule.__init__``.
    """
    ETKModule.__init__(self, etk)

    self.actor_type_decoder = DecodingValueExtractor(
        self.actor_codes,
        'Actor Code Decoder',
    )
    self.known_group_decoder = DecodingValueExtractor(
        self.known_group_codes,
        'Known Groups Decoder',
    )
    # NOTE(review): this decoder is labelled 'Ethnic Groups Decoder' but is
    # built from ``self.known_group_codes``, the same table used by
    # ``known_group_decoder`` above. Looks like a copy-paste slip; confirm
    # whether a dedicated ethnic-group code table should be passed here.
    self.ethnic_group_decoder = DecodingValueExtractor(
        self.known_group_codes,
        'Ethnic Groups Decoder',
    )
    self.religion_decoder = DecodingValueExtractor(
        self.religion_codes,
        'Religion Decoder',
    )
    self.country_decoder = DecodingValueExtractor(
        self.country_codes,
        'Country Decoder',
    )
def __init__(self, etk):
    """Initialise the module with a date extractor and a CauseEx decoder.

    Args:
        etk: ETK instance, forwarded unchanged to ``ETKModule.__init__``.
    """
    ETKModule.__init__(self, etk)

    # ``self.etk`` is presumably populated by ``ETKModule.__init__`` above;
    # verify against the base class.
    self.date_extractor = DateExtractor(self.etk, 'gtd_date_parser')

    # Decoder driven by the ``event_to_clauseex_class_mapping`` table;
    # ``default_action`` is set to "delete".
    self.causeex_decoder = DecodingValueExtractor(
        event_to_clauseex_class_mapping,
        'CauseEx Type',
        default_action="delete",
    )
def __init__(self, etk):
    """Initialise the module, its document selector and its decoders.

    Every decoder is a ``DecodingValueExtractor`` built from a code table
    held on ``self`` plus a human-readable extractor name.

    Args:
        etk: ETK instance, forwarded unchanged to ``ETKModule.__init__``.
    """
    ETKModule.__init__(self, etk)
    self.doc_selector = DefaultDocumentSelector()

    # Decoders constructed with the extractor's default options.
    self.incomp_decoder = DecodingValueExtractor(
        self.incomp_type,
        'Incomp Decoder',
    )
    self.int_decoder = DecodingValueExtractor(
        self.int_event_type,
        'Int Decoder',
    )
    self.int_fatalities_decoder = DecodingValueExtractor(
        self.int_fatalities,
        'Int Fatalities Decoder',
    )
    # NOTE(review): unlike the upper-bound decoder below, this one does not
    # pass ``default_action`` -- confirm the asymmetry is intended.
    self.int_fatalities_size_lower_decoder = DecodingValueExtractor(
        self.int_fatalities_size_lower,
        'Int Fatalities Lower Bound Size Decoder',
    )

    # Decoders constructed with ``default_action`` set to "delete".
    self.int_fatalities_size_upper_decoder = DecodingValueExtractor(
        self.int_fatalities_size_upper,
        'Int Fatalities Upper Bound Size Decoder',
        default_action="delete",
    )
    self.int_causeex_decoder = DecodingValueExtractor(
        self.int_causeex_type,
        'Int CauseEx Type',
        default_action="delete",
    )
def __init__(self, etk):
    """Initialise the module and its weapon-type decoder.

    Args:
        etk: ETK instance, forwarded unchanged to ``ETKModule.__init__``.
    """
    ETKModule.__init__(self, etk)

    # Decoder driven by the ``weapons_to_clauseex_class_mapping`` table;
    # ``default_action`` is set to 'delete'.
    self.weapon_decoder = DecodingValueExtractor(
        weapons_to_clauseex_class_mapping,
        'Causeex Weapon Type',
        default_action='delete',
    )
def test_dictionary_extractor(self) -> None:
    """Check DecodingValueExtractor key matching under different options.

    The same decoding table and input values are run through four
    extractors (default options, ``case_sensitive=True``,
    ``strip_key=False`` and ``strip_value=True``). For each extractor the
    decoded value of every input that produced an extraction is collected
    and compared with the expected list.
    """
    # Keys and values deliberately mix letter case and surrounding
    # whitespace so each option has something to act on.
    decoding_dict = {
        'CA': 'California',
        'ny': 'New York',
        'AZ': ' Arizona',
        ' TX ': 'Texas',
        ' fl': 'Florida',
    }

    values = [
        'ca', 'CA', ' CA', ' ca',
        'NY', ' ny',
        'Az', 'AZ', 'az ',
        'tx', 'tx ', 'TX',
        'fl', 'FL', 'fl ',
    ]

    de_default = DecodingValueExtractor(decoding_dict, 'default_decoding')  # strip_key and not case_sensitive
    de_case_sensitive = DecodingValueExtractor(decoding_dict, 'default_decoding', case_sensitive=True)
    de_not_strip_key = DecodingValueExtractor(decoding_dict, 'default_decoding', strip_key=False)
    de_strip_value = DecodingValueExtractor(decoding_dict, 'default_decoding', strip_value=True)

    def decode_all(extractor):
        """Return the first decoded value of every input that yields one."""
        decoded = []
        for raw in values:
            # Extract once per input; the result serves both as the
            # "did it match" filter and as the source of the value.
            extractions = extractor.extract(raw)
            if extractions:
                decoded.append(extractions[0].value)
        return decoded

    results = [
        decode_all(extractor)
        for extractor in (
            de_default,
            de_case_sensitive,
            de_not_strip_key,
            de_strip_value,
        )
    ]

    expected = [
        # Default options.
        [
            'California', 'California', 'California', 'California',
            'New York', 'New York',
            ' Arizona', ' Arizona', ' Arizona',
            'Texas', 'Texas', 'Texas',
            'Florida', 'Florida', 'Florida'
        ],
        # case_sensitive=True.
        [
            'California', 'California',
            'New York',
            ' Arizona',
            'Texas',
            'Florida', 'Florida'
        ],
        # strip_key=False.
        [
            'California', 'California',
            'New York',
            ' Arizona', ' Arizona'
        ],
        # strip_value=True.
        [
            'California', 'California', 'California', 'California',
            'New York', 'New York',
            'Arizona', 'Arizona', 'Arizona',
            'Texas', 'Texas', 'Texas',
            'Florida', 'Florida', 'Florida'
        ],
    ]

    # NOTE(review): the [:-1] slices leave the strip_value case (last entry
    # of both lists) out of the comparison, so that option is currently not
    # asserted. Confirm whether this exclusion is deliberate before
    # widening the check.
    self.assertEqual(results[:-1], expected[:-1])
def __init__(self, etk):
    """Initialise the module's extractors, CSV processor and decoder.

    Builds a date extractor, three glossary extractors (countries,
    states, cities), a CSV processor and a decoder for the interaction
    code table defined below.

    Args:
        etk: ETK instance, forwarded to ``ETKModule.__init__`` and passed
            to ``CsvProcessor``.
    """
    ETKModule.__init__(self, etk)

    self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')

    def glossary_extractor(glossary_path, extractor_name):
        """Load a JSON glossary and wrap it in a GlossaryExtractor."""
        glossary = self.etk.load_glossary(glossary_path, read_json=True)
        return GlossaryExtractor(
            glossary,
            extractor_name,
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=3,
        )

    # All three glossary extractors share the same tokenizer,
    # case-insensitive matching and n-gram setting.
    self.country_extractor = glossary_extractor(
        "${GLOSSARY_PATH}/countries.json.gz", "country_extractor")
    self.states_extractor = glossary_extractor(
        "${GLOSSARY_PATH}/states_usa_canada.json.gz", "states_extractor")
    self.cities_extractor = glossary_extractor(
        "${GLOSSARY_PATH}/cities.json.gz", "cities_extractor")

    self.csv_processor = CsvProcessor(etk=etk, heading_row=1)

    # Two-character interaction code -> description of the actor pairing.
    self.interaction_decoding_dict = {
        "10": "Sole Military Action",
        "11": "Military Versus Military",
        "12": "Military Versus Rebels",
        "13": "Military Versus Political Militia",
        "14": "Military Versus Communal Militia",
        "15": "Military Versus Rioters",
        "16": "Military Versus Protesters",
        "17": "Military Versus Civilians",
        "18": "Military Versus Other",
        "20": "Sole Rebel Action",
        "22": "Rebels Versus Rebels",
        "23": "Rebels Versus Political Militia",
        "24": "Rebels Versus Communal Militia",
        "25": "Rebels Versus Rioters",
        "26": "Rebels Versus Protesters",
        "27": "Rebels Versus Civilians",
        "28": "Rebels Versus Other",
        "30": "Sole Political Militia Action",
        "33": "Political Militia Versus Political Militia",
        "34": "Political Militia Versus Communal Militia",
        "35": "Political Militia Versus Rioters",
        "36": "Political Militia Versus Protesters",
        "37": "Political Militia Versus Civilians",
        "38": "Political Militia Versus Other",
        "40": "Sole Communal Militia Action",
        "44": "Communal Militia Versus Communal Militia",
        "45": "Communal Militia Versus Rioters",
        "46": "Communal Militia Versus Protesters",
        "47": "Communal Militia Versus Civilians",
        "48": "Communal Militia Versus Other",
        "50": "Sole Rioter Action",
        "55": "Rioters Versus Rioters",
        "56": "Rioters Versus Protesters",
        "57": "Rioters Versus Civilians",
        "58": "Rioters Versus Other",
        "60": "Sole Protester Action",
        "66": "Protesters Versus Protesters",
        "68": "Protesters Versus Other",
        "78": "Other Actor Versus Civilians",
        "80": "Sole Other Action",
    }

    self.interaction_decoder = DecodingValueExtractor(
        self.interaction_decoding_dict,
        'default_decoding',
        case_sensitive=True,
    )