def __init__(self, etk):
    """Build a spaCy-rule-based extractor from the bundled sample rules."""
    ETKModule.__init__(self, etk)
    # Rules live with the other extraction-module resources.
    rules = self.etk.load_spacy_rule(
        "./extraction_modules/resources/sample_rules.json")
    self.sample_rule_extractor = SpacyRuleExtractor(
        self.etk.default_nlp, rules, "test_extractor")
def __init__(self, etk):
    """Set up date parsing and CauseEx-type decoding for GTD events."""
    ETKModule.__init__(self, etk)
    self.date_extractor = DateExtractor(self.etk, 'gtd_date_parser')
    # Unmapped event codes are dropped (default_action="delete").
    self.causeex_decoder = DecodingValueExtractor(
        event_to_clauseex_class_mapping,
        'CauseEx Type',
        default_action="delete")
def __init__(self, etk):
    """Create a case-insensitive, unigram glossary extractor for names."""
    ETKModule.__init__(self, etk)
    glossary = self.etk.load_glossary("./names.txt")
    self.name_extractor = GlossaryExtractor(
        glossary,
        "name_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=1)
def __init__(self, etk):
    """Load the GDELT ontology mapping and prepare extractors.

    Also builds a header-name -> "COL<i>" translation table, because the
    input files carry no header row.
    """
    ETKModule.__init__(self, etk)
    # Use a context manager so the mapping file handle is closed
    # deterministically (the original json.load(open(...)) leaked it).
    with open("ODP-Mappings-V3.1.json") as mapping_file:
        self.mapping = GdeltMapping(json.load(mapping_file))
    # As our input files have no header, create a translation table to
    # go from names to indices.
    for i, field in enumerate(self.header_fields):
        self.header_translation_table[field] = "COL" + str(i)
    # Extractors
    self.date_extractor = DateExtractor(self.etk, "Date Extractor")
def __init__(self, etk):
    """Register the battery of cyber-indicator extractors."""
    ETKModule.__init__(self, etk)
    # One extractor per indicator type, in the order they are applied.
    # URLExtractor keeps its original positional constructor flag.
    self.e_list = [
        BitcoinAddressExtractor(),
        CVEExtractor(),
        CryptographicHashExtractor(),
        HostnameExtractor(),
        IPAddressExtractor(),
        URLExtractor(True),
    ]
def __init__(self, etk):
    """Create the decoders that translate coded actor fields to labels."""
    ETKModule.__init__(self, etk)
    self.actor_type_decoder = DecodingValueExtractor(
        self.actor_codes, 'Actor Code Decoder')
    self.known_group_decoder = DecodingValueExtractor(
        self.known_group_codes, 'Known Groups Decoder')
    # NOTE(review): the ethnic-group decoder reuses known_group_codes
    # rather than a dedicated ethnic-group table — confirm intentional.
    self.ethnic_group_decoder = DecodingValueExtractor(
        self.known_group_codes, 'Ethnic Groups Decoder')
    self.religion_decoder = DecodingValueExtractor(
        self.religion_codes, 'Religion Decoder')
    self.country_decoder = DecodingValueExtractor(
        self.country_codes, 'Country Decoder')
def __init__(self, etk):
    """Load IFP definitions and set up NLP resources for IFP matching.

    Reads 'new_ifps.jl' (JSON lines) into a {id: name} dict, then loads
    the large spaCy English model used for similarity ranking.
    """
    ETKModule.__init__(self, etk)
    self.date_extractor = DateExtractor(self.etk, 'ifp_date_parser')
    # Stream the JSON-lines file inside a context manager: the original
    # open(...).readlines() leaked the file handle and read it all at once.
    self.new_ifps = dict()
    with open('new_ifps.jl') as ifp_file:
        for line in ifp_file:
            j = json.loads(line)
            self.new_ifps[j['ifp']['id']] = j['ifp']['name']
    self.parsed_ifps = dict()
    # Minimum similarity score for a match — TODO confirm tuning source.
    self.threshold = 0.86
    self.nlp = spacy.load('en_core_web_lg')
    self.preprocess_ifps()
    self.ranking_criteria = 'SENTENCE'
def __init__(self, etk):
    """Configure the document selector and fatality/interaction decoders."""
    ETKModule.__init__(self, etk)
    self.doc_selector = DefaultDocumentSelector()
    self.incomp_decoder = DecodingValueExtractor(
        self.incomp_type, 'Incomp Decoder')
    self.int_decoder = DecodingValueExtractor(
        self.int_event_type, 'Int Decoder')
    self.int_fatalities_decoder = DecodingValueExtractor(
        self.int_fatalities, 'Int Fatalities Decoder')
    self.int_fatalities_size_lower_decoder = DecodingValueExtractor(
        self.int_fatalities_size_lower,
        'Int Fatalities Lower Bound Size Decoder')
    # The upper-bound and CauseEx decoders drop unmapped codes.
    self.int_fatalities_size_upper_decoder = DecodingValueExtractor(
        self.int_fatalities_size_upper,
        'Int Fatalities Upper Bound Size Decoder',
        default_action="delete")
    self.int_causeex_decoder = DecodingValueExtractor(
        self.int_causeex_type,
        'Int CauseEx Type',
        default_action="delete")
def __init__(self, etk):
    """Build glossary extractors for person names and student names."""
    ETKModule.__init__(self, etk)
    resources = "./extraction_modules/resources"
    self.name_extractor = GlossaryExtractor(
        self.etk.load_glossary(resources + "/names.txt"),
        "name_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=1)
    self.student_extractor = GlossaryExtractor(
        self.etk.load_glossary(resources + "/student.txt"),
        "student_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=1)
def __init__(self, etk):
    """Prepare table extraction and a city-name glossary extractor.

    Loads the cities dataset from the glossary path and exposes its keys
    as the glossary for a trigram, case-insensitive extractor.
    """
    ETKModule.__init__(self, etk)
    self.my_table_extractor = TableExtractor()
    self.etk.parser = jex.parse
    # Context manager + json.load replaces the original manual
    # open/read/close sequence, which leaked the handle on parse errors.
    file_name = '${GLOSSARY_PATH}/cities_ppl_25000.json'
    with open(file_name, 'r') as f:
        self.city_dataset = json.load(f)
    self.city_list = list(self.city_dataset.keys())
    self.my_glossary_extractor = GlossaryExtractor(
        glossary=self.city_list,
        extractor_name='tutorial_glossary',
        tokenizer=etk.default_tokenizer,
        ngrams=3,
        case_sensitive=False)
def __init__(self, etk):
    """Set up HTML, date, country, and city extractors for the demo."""
    ETKModule.__init__(self, etk)
    self.metadata_extractor = HTMLMetadataExtractor()
    self.content_extractor = HTMLContentExtractor()
    self.date_extractor = DateExtractor(self.etk, 'demo_date_parser')
    # Both glossaries are matched case-insensitively with up to trigrams.
    self.country_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/countries.txt"),
        "country_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.cities_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/cities.txt"),
        "cities_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
def __init__(self, etk):
    """Initialize the module; no extractors are configured here."""
    ETKModule.__init__(self, etk)
def __init__(self, etk):
    """Create the weapon-code -> CauseEx class decoder."""
    ETKModule.__init__(self, etk)
    # Codes absent from the mapping are dropped (default_action='delete').
    self.weapon_decoder = DecodingValueExtractor(
        weapons_to_clauseex_class_mapping,
        'Causeex Weapon Type',
        default_action='delete')
def __init__(self, etk):
    """Build a spaCy-rule extractor from 'sample_rules.json'."""
    ETKModule.__init__(self, etk)
    rules = self.etk.load_spacy_rule("sample_rules.json")
    self.rule_extractor = SpacyRuleExtractor(
        self.etk.default_nlp, rules, "test_extractor")
def __init__(self, etk):
    """Load the Inferlink rule set and wrap it in an extractor."""
    ETKModule.__init__(self, etk)
    rules = InferlinkRuleSet.load_rules_file(
        '../html_basic/sample_inferlink_rules.json')
    self.inferlink_extractor = InferlinkExtractor(InferlinkRuleSet(rules))
def __init__(self, etk):
    """Initialize the module and record the URI prefix for generated IDs."""
    ETKModule.__init__(self, etk)
    self.uri_prefix = "http://spaceaware.isi.edu/data"
def __init__(self, etk):
    """Create the HTML metadata and content extractors."""
    ETKModule.__init__(self, etk)
    self.metadata_extractor = HTMLMetadataExtractor()
    self.content_extractor = HTMLContentExtractor()
def __init__(self, etk: ETK):
    """Initialize the module with a named sentence-splitting extractor."""
    ETKModule.__init__(self, etk)
    self.sentence_extractor = SentenceExtractor(
        name="My sentence splitter")
def __init__(self, etk):
    """Configure date, location, and CSV extraction for ACLED event data.

    Sets up glossary extractors for countries, US/Canada states, and
    cities, plus a decoder mapping ACLED "interaction" codes to
    human-readable descriptions.
    """
    ETKModule.__init__(self, etk)
    self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')
    # All three glossaries are gzipped JSON, matched case-insensitively
    # with up to trigrams.
    self.country_extractor = GlossaryExtractor(self.etk.load_glossary(
        "${GLOSSARY_PATH}/countries.json.gz", read_json=True),
        "country_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False, ngrams=3)
    self.states_extractor = GlossaryExtractor(self.etk.load_glossary(
        "${GLOSSARY_PATH}/states_usa_canada.json.gz", read_json=True),
        "states_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False, ngrams=3)
    self.cities_extractor = GlossaryExtractor(self.etk.load_glossary(
        "${GLOSSARY_PATH}/cities.json.gz", read_json=True),
        "cities_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False, ngrams=3)
    # Input CSVs are expected to carry their header on row 1.
    self.csv_processor = CsvProcessor(etk=etk, heading_row=1)
    # ACLED two-digit "interaction" codes -> readable actor pairings.
    self.interaction_decoding_dict = {
        "10": "Sole Military Action",
        "11": "Military Versus Military",
        "12": "Military Versus Rebels",
        "13": "Military Versus Political Militia",
        "14": "Military Versus Communal Militia",
        "15": "Military Versus Rioters",
        "16": "Military Versus Protesters",
        "17": "Military Versus Civilians",
        "18": "Military Versus Other",
        "20": "Sole Rebel Action",
        "22": "Rebels Versus Rebels",
        "23": "Rebels Versus Political Militia",
        "24": "Rebels Versus Communal Militia",
        "25": "Rebels Versus Rioters",
        "26": "Rebels Versus Protesters",
        "27": "Rebels Versus Civilians",
        "28": "Rebels Versus Other",
        "30": "Sole Political Militia Action",
        "33": "Political Militia Versus Political Militia",
        "34": "Political Militia Versus Communal Militia",
        "35": "Political Militia Versus Rioters",
        "36": "Political Militia Versus Protesters",
        "37": "Political Militia Versus Civilians",
        "38": "Political Militia Versus Other",
        "40": "Sole Communal Militia Action",
        "44": "Communal Militia Versus Communal Militia",
        "45": "Communal Militia Versus Rioters",
        "46": "Communal Militia Versus Protesters",
        "47": "Communal Militia Versus Civilians",
        "48": "Communal Militia Versus Other",
        "50": "Sole Rioter Action",
        "55": "Rioters Versus Rioters",
        "56": "Rioters Versus Protesters",
        "57": "Rioters Versus Civilians",
        "58": "Rioters Versus Other",
        "60": "Sole Protester Action",
        "66": "Protesters Versus Protesters",
        "68": "Protesters Versus Other",
        "78": "Other Actor Versus Civilians",
        "80": "Sole Other Action"
    }
    self.interaction_decoder = DecodingValueExtractor(
        self.interaction_decoding_dict,
        'default_decoding',
        case_sensitive=True)
def __init__(self, etk):
    """Initialize the module with a table extractor."""
    ETKModule.__init__(self, etk)
    self.table_extractor = TableExtractor()
def __init__(self, etk):
    """Initialize the module with the ACLED date parser."""
    ETKModule.__init__(self, etk)
    self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')
def __init__(self, etk):
    """Initialize the module with an Excel spreadsheet extractor."""
    ETKModule.__init__(self, etk)
    self.ee = ExcelExtractor()