def __init__(self, path_dir_database, path_wiki_pages):
    """Prepare the wiki database directory, its lookup tables and page count.

    Args:
        path_dir_database: parent directory; the tables are placed in its
            'wiki_database' sub-directory.
        path_wiki_pages: directory holding the raw wiki page files.
    """
    # === save input(s) === #
    self.path_wiki_pages = path_wiki_pages
    self.path_dir_database = os.path.join(path_dir_database, 'wiki_database')

    # === process === #
    print('WikiDatabase')
    mkdir_if_not_exist(self.path_dir_database)
    self.settings = Settings(self.path_dir_database)

    # (attribute, database name, key type, value type), in creation order
    table_specs = (
        ('title_2_id_db', 'title_2_id', 'string', 'int'),
        ('id_2_title_db', 'id_2_title', 'int', 'string'),
        ('id_2_text_db', 'id_2_text', 'int', 'string'),
        ('id_2_lines_db', 'id_2_lines', 'int', 'list_str'),
    )
    for attribute_name, table_name, key_type, value_type in table_specs:
        table = Database(path_database_dir=self.path_dir_database,
                         database_name=table_name,
                         database_method='lsm',
                         input_type=key_type,
                         output_type=value_type,
                         checks_flag=True)
        setattr(self, attribute_name, table)

    # === create database === #
    # flag_function_call only runs create_database when the settings do not
    # already mark it as finished.
    self.flag_function_call(function_name='create_database', arg_list=[])
    self.nr_wikipedia_pages = self.settings.get_item(key='nr_wikipedia_pages')
    print('***finished***')
def __init__(self, path_dir_database, claim_database, method_tokenization, n_gram, delimiter_option=False):
    """Set up the n-gram claim database and open the table chosen by the options.

    Args:
        path_dir_database: parent directory; tables are placed in the
            sub-directory 'claim_database_n_gram_<claim_data_set>'.
        claim_database: object exposing `claim_data_set` (used here) and the
            claims that create_database tokenises.
        method_tokenization: tokenisation method of the table to open.
        n_gram: n-gram size of the table to open.
        delimiter_option: delimiter setting of the table to open.
    """
    # === save input(s) === #
    self.claim_database = claim_database
    self.method_tokenization = method_tokenization
    self.n_gram = n_gram
    self.delimiter_option = delimiter_option
    directory_name = 'claim_database_n_gram_' + self.claim_database.claim_data_set
    self.path_dir_database = os.path.join(path_dir_database, directory_name)

    # === process === #
    print('ClaimDatabaseNgrams')
    mkdir_if_not_exist(self.path_dir_database)
    self.settings = Settings(self.path_dir_database)
    self.nlp = spacy.load('en', disable=["parser", "ner"])

    # === create database === #
    # Option grids handed to create_database, in its positional order.
    build_arguments = [
        ['tokenize', 'tag'],  # tokenisation methods ('lemma' and 'lower' were listed as alternatives)
        [1],                  # n-gram sizes
        [True, False],        # delimiter options
        ['claim'],            # document types
    ]
    self.flag_function_call(function_name='create_database', arg_list=build_arguments)

    # Open the single table that matches this instance's own options.
    selected_table_name = get_database_name_from_options(doc_type='claim',
                                                         method_tokenization=self.method_tokenization,
                                                         n_gram=self.n_gram,
                                                         delimiter_option=self.delimiter_option)
    self.id_2_claim_db = Database(path_database_dir=self.path_dir_database,
                                  database_name=selected_table_name,
                                  database_method='lsm',
                                  input_type='int',
                                  output_type='list_str',
                                  checks_flag=True)
    print('***finished***')
class ClaimDatabaseNgrams:
    """Tokenised (n-gram) tables built from the claims of a claim database.

    On construction the tables are created once (guarded by a flag kept in
    the settings) and the table matching the requested options is opened as
    `id_2_claim_db`.
    """

    def __init__(self, path_dir_database, claim_database, method_tokenization, n_gram, delimiter_option=False):
        """Create the tables if needed and open the one selected by the options.

        Args:
            path_dir_database: parent directory; tables are placed in the
                sub-directory 'claim_database_n_gram_<claim_data_set>'.
            claim_database: object exposing `claim_data_set`, `nr_claims`
                and `get_item(input_type=, input_value=, output_type=)`.
            method_tokenization: tokenisation method of the table to open.
            n_gram: n-gram size of the table to open.
            delimiter_option: delimiter setting of the table to open.
        """
        # === save input(s) === #
        self.claim_database = claim_database
        self.method_tokenization = method_tokenization
        self.n_gram = n_gram
        self.delimiter_option = delimiter_option
        self.path_dir_database = os.path.join(path_dir_database,
                                              'claim_database_n_gram_' + self.claim_database.claim_data_set)

        # === process === #
        print('ClaimDatabaseNgrams')
        mkdir_if_not_exist(self.path_dir_database)
        self.settings = Settings(self.path_dir_database)
        self.nlp = spacy.load('en', disable=["parser", "ner"])

        method_tokenization_list = ['tokenize', 'tag']  # other listed options: 'lemma', 'lower'
        n_gram_list = [1]
        delimiter_options_list = [True, False]
        doc_type_list = ['claim']

        # === create database === #
        self.flag_function_call(function_name='create_database',
                                arg_list=[method_tokenization_list, n_gram_list,
                                          delimiter_options_list, doc_type_list])

        # NOTE(review): create_database only builds unigram tables with
        # delimiter_option=True, while the default here is False. Whether both
        # map to the same table depends on get_database_name_from_options -
        # TODO confirm.
        self.id_2_claim_db = Database(path_database_dir=self.path_dir_database,
                                      database_name=get_database_name_from_options(
                                          doc_type='claim',
                                          method_tokenization=self.method_tokenization,
                                          n_gram=self.n_gram,
                                          delimiter_option=self.delimiter_option),
                                      database_method='lsm',
                                      input_type='int',
                                      output_type='list_str',
                                      checks_flag=True)
        print('***finished***')

    def get_item(self, input_type, input_value, output_type):
        """Look up a value in the opened table.

        Args:
            input_type: only 'id' is supported.
            input_value: the key to look up.
            output_type: only 'claim' is supported.

        Returns:
            Whatever `id_2_claim_db.get_item(input_value)` returns.

        Raises:
            ValueError: if `input_type` or `output_type` is not supported.
        """
        if input_type != 'id':
            raise ValueError('input_type not in options', input_type)
        if output_type != 'claim':
            raise ValueError('output_type not in options', output_type)
        return self.id_2_claim_db.get_item(input_value)

    def create_database(self, method_tokenization_list, n_gram_list, delimiter_options_list, doc_type_list):
        """Tokenise every claim and store it in one table per option combination.

        Args:
            method_tokenization_list: tokenisation methods to build tables for.
            n_gram_list: n-gram sizes to build tables for.
            delimiter_options_list: delimiter options; only used for n_gram != 1.
            doc_type_list: document types; only 'claim' is supported.

        Raises:
            ValueError: if a document type other than 'claim' is requested.
        """
        database_dict = {}
        batch_size = 500000

        # Unigrams are only built with delimiter_option=True; every other
        # n-gram size is built once per entry of delimiter_options_list.
        experiment_settings_list = []
        for method_tokenization in method_tokenization_list:
            for n_gram in n_gram_list:
                if n_gram == 1:
                    experiment_settings_list.append([method_tokenization, n_gram, True])
                else:
                    for delimiter_option in delimiter_options_list:
                        experiment_settings_list.append([method_tokenization, n_gram, delimiter_option])

        # === create databases === #
        for doc_type in doc_type_list:
            for method_tokenization, n_gram, delimiter_option in experiment_settings_list:
                database_dict = create_path_dictionary(
                    [doc_type, method_tokenization, n_gram, delimiter_option], database_dict)
                database_dict[doc_type][method_tokenization][n_gram][delimiter_option] = Database(
                    path_database_dir=self.path_dir_database,
                    database_name=get_database_name_from_options(doc_type=doc_type,
                                                                 method_tokenization=method_tokenization,
                                                                 n_gram=n_gram,
                                                                 delimiter_option=delimiter_option),
                    database_method='lsm',
                    input_type='int',
                    output_type='list_str',
                    checks_flag=True)

        # Fix: the claims come from the instance attribute; the original code
        # read an undefined bare name `claim_database` here.
        claim_database = self.claim_database
        nr_claims = claim_database.nr_claims

        for doc_type in doc_type_list:
            if doc_type not in ['claim']:
                raise ValueError('doc_type not in options', doc_type)

            text_list = []
            id_list = []
            for doc_nr in tqdm(range(nr_claims), desc='n_gram_claim_database_' + doc_type):
                text_list.append(claim_database.get_item(input_type='id',
                                                         input_value=doc_nr,
                                                         output_type=doc_type))
                id_list.append(doc_nr)

                # Flush on every batch boundary (this includes doc_nr == 0)
                # and once more after the final claim.
                if doc_nr % batch_size == 0 or doc_nr == nr_claims - 1:
                    docs = tqdm(self.nlp.pipe(iter_phrases(text_list)),
                                desc='pipeline', total=len(text_list))
                    # Docs are paired with ids by position, as the original
                    # index counter did.
                    for doc_nr_batch, doc in zip(id_list, docs):
                        text_class = Text(doc)
                        for method_tokenization, n_gram, delimiter_option in experiment_settings_list:
                            tokenized_text = text_class.process(method_tokenization=method_tokenization,
                                                                n_gram=n_gram,
                                                                delimiter_flag=delimiter_option)
                            database_dict[doc_type][method_tokenization][n_gram][
                                delimiter_option].store_item(key=doc_nr_batch, value=tokenized_text)
                    text_list = []
                    id_list = []

    # === recurrent functions == #
    def flag_function_call(self, function_name, arg_list):
        """Run the method `function_name` once, tracked by a flag in the settings.

        Args:
            function_name: name of the method on this instance to run.
            arg_list: positional arguments passed to that method.

        Returns:
            True if the flag already reads 'finished_correctly' (nothing is
            run); None after running the method.

        Raises:
            ValueError: if the flag is neither 'finished_correctly' nor
                'not_started_yet'.
        """
        check_flag = self.settings.check_function_flag(function_name, 'check')
        if check_flag == 'finished_correctly':
            return True
        elif check_flag == 'not_started_yet':
            self.settings.check_function_flag(function_name, 'start')
            getattr(self, function_name)(*arg_list)
            self.settings.check_function_flag(function_name, 'finish')
        else:
            raise ValueError('check_flag not in options', check_flag)
class WikiDatabaseSqlite:
    """Lookup tables between wiki page titles, ids, texts and lines."""

    def __init__(self, path_dir_database, path_wiki_pages):
        """Prepare the database directory, open the tables and build them if needed.

        Args:
            path_dir_database: parent directory; the tables are placed in its
                'wiki_database' sub-directory.
            path_wiki_pages: directory holding the 'wiki-NNN.jsonl' files.
        """
        # === save input(s) === #
        self.path_wiki_pages = path_wiki_pages
        self.path_dir_database = os.path.join(path_dir_database, 'wiki_database')

        # === process === #
        print('WikiDatabase')
        mkdir_if_not_exist(self.path_dir_database)
        self.settings = Settings(self.path_dir_database)

        directory = self.path_dir_database
        self.title_2_id_db = Database(path_database_dir=directory, database_name='title_2_id',
                                      database_method='lsm', input_type='string',
                                      output_type='int', checks_flag=True)
        self.id_2_title_db = Database(path_database_dir=directory, database_name='id_2_title',
                                      database_method='lsm', input_type='int',
                                      output_type='string', checks_flag=True)
        self.id_2_text_db = Database(path_database_dir=directory, database_name='id_2_text',
                                     database_method='lsm', input_type='int',
                                     output_type='string', checks_flag=True)
        self.id_2_lines_db = Database(path_database_dir=directory, database_name='id_2_lines',
                                      database_method='lsm', input_type='int',
                                      output_type='list_str', checks_flag=True)

        # === create database === #
        self.flag_function_call(function_name='create_database', arg_list=[])
        self.nr_wikipedia_pages = self.settings.get_item(key='nr_wikipedia_pages')
        print('***finished***')

    def get_item(self, input_type, input_value, output_type):
        """Look up a value in one of the tables.

        Args:
            input_type: 'id' or 'title'.
            input_value: the key to look up.
            output_type: 'title', 'text' or 'lines' for input_type 'id';
                'id' for input_type 'title'.

        Returns:
            The result of `get_item(input_value)` on the selected table.

        Raises:
            ValueError: if the input/output type combination is not supported.
        """
        if input_type == 'id':
            if output_type == 'title':
                return self.id_2_title_db.get_item(input_value)
            if output_type == 'text':
                return self.id_2_text_db.get_item(input_value)
            if output_type == 'lines':
                return self.id_2_lines_db.get_item(input_value)
            raise ValueError('output_type not in options', output_type)

        if input_type == 'title':
            if output_type == 'id':
                return self.title_2_id_db.get_item(input_value)
            raise ValueError('output_type not in options', output_type)

        raise ValueError('input_type not in options', input_type)

    def create_database(self):
        """Read every wiki dump file and fill the four lookup tables.

        Pages whose normalised title is empty are skipped and receive no id.
        The number of files and the number of stored pages are written to the
        settings under 'nr_wikipedia_files' and 'nr_wikipedia_pages'.
        """
        nr_wikipedia_files = num_files_in_directory(self.path_wiki_pages)
        self.settings.add_item(key='nr_wikipedia_files', value=nr_wikipedia_files)

        next_id = 0
        # Dump files are numbered starting at 1.
        file_numbers = range(1, nr_wikipedia_files + 1)
        for file_nr in tqdm(file_numbers, desc='wiki_page_nr'):
            file_name = 'wiki-%.3d.jsonl' % (file_nr)
            pages = load_jsonl(os.path.join(self.path_wiki_pages, file_name))

            for page in pages:
                title = normalise_text(page['id'])
                if title == '':
                    continue

                text = normalise_text(page['text'])
                self.title_2_id_db.store_item(key=title, value=next_id)
                self.id_2_title_db.store_item(key=next_id, value=title)
                self.id_2_text_db.store_item(key=next_id, value=text)
                self.id_2_lines_db.store_item(key=next_id, value=get_list_lines(page))
                next_id += 1

        self.settings.add_item(key='nr_wikipedia_pages', value=next_id)

    # === recurrent functions == #
    def flag_function_call(self, function_name, arg_list):
        """Run the method `function_name` once, tracked by a flag in the settings.

        Returns True when the flag already reads 'finished_correctly' (nothing
        is run) and None after running the method; any flag value other than
        'finished_correctly' or 'not_started_yet' raises ValueError.
        """
        state = self.settings.check_function_flag(function_name, 'check')
        if state == 'finished_correctly':
            return True
        if state != 'not_started_yet':
            raise ValueError('check_flag not in options', state)

        self.settings.check_function_flag(function_name, 'start')
        values = getattr(self, function_name)(*arg_list)
        self.settings.check_function_flag(function_name, 'finish')
def __init__(self, path_dir_database, path_raw_data_dir, claim_data_set, wiki_database=None):
    """Prepare the claim database directory, its tables and the claim count.

    Args:
        path_dir_database: parent directory; tables are placed in the
            sub-directory 'claim_database_<claim_data_set>'.
        path_raw_data_dir: directory containing '<claim_data_set>.jsonl'.
        claim_data_set: name of the claim data set.
        wiki_database: passed on to create_database when the tables still
            have to be built.
    """
    # === save input(s) === #
    self.path_raw_data_dir = path_raw_data_dir
    self.claim_data_set = claim_data_set
    self.path_dir_database = os.path.join(path_dir_database,
                                          'claim_database_' + self.claim_data_set)

    # === variables === #
    self.path_raw_claims = os.path.join(path_raw_data_dir, self.claim_data_set + '.jsonl')

    # String <-> integer encodings used for the stored flags and labels.
    self.verifiable_2_int = {'NOT VERIFIABLE': 0, 'VERIFIABLE': 1}
    self.int_2_verifiable = {number: name for name, number in self.verifiable_2_int.items()}
    self.label_2_int = {'REFUTES': 0, 'SUPPORTS': 1, 'NOT ENOUGH INFO': 2}
    self.int_2_label = {number: name for name, number in self.label_2_int.items()}

    # === process === #
    print('ClaimDatabase')
    mkdir_if_not_exist(self.path_dir_database)
    self.settings = Settings(path_settings_dir=self.path_dir_database)

    # (attribute, database name, value type); every table is keyed by int
    table_specs = (
        ('id_2_id_number_db', 'id_2_id_number', 'int'),
        ('id_2_verifiable_db', 'id_2_verifiable', 'int'),
        ('id_2_label_db', 'id_2_label', 'int'),
        ('id_2_claim_db', 'id_2_claim', 'string'),
        ('id_2_evidence_db', 'id_2_evidence', 'list_str'),
    )
    for attribute_name, table_name, value_type in table_specs:
        table = Database(path_database_dir=self.path_dir_database,
                         database_name=table_name,
                         database_method='lsm',
                         input_type='int',
                         output_type=value_type,
                         checks_flag=True)
        setattr(self, attribute_name, table)

    # === create database === #
    self.flag_function_call(function_name='create_database', arg_list=[wiki_database])
    self.nr_claims = self.settings.get_item(key='nr_claims')
    print('***finished***')
class ClaimDatabase:
    """Tables holding the claims of one data set, keyed by claim position.

    For every claim the raw id number, verifiable flag, label, claim text
    and evidence are stored. Flags and labels are stored as integers and can
    be read back either as integers or as their original strings.
    """

    def __init__(self, path_dir_database, path_raw_data_dir, claim_data_set, wiki_database=None):
        """Prepare the database directory, open the tables and build them if needed.

        Args:
            path_dir_database: parent directory; tables are placed in the
                sub-directory 'claim_database_<claim_data_set>'.
            path_raw_data_dir: directory containing '<claim_data_set>.jsonl'.
            claim_data_set: name of the claim data set.
            wiki_database: object with `get_item(input_type='title', ...,
                output_type='id')`; used by create_database to turn evidence
                page titles into ids.
        """
        # === save input(s) === #
        self.path_raw_data_dir = path_raw_data_dir
        self.claim_data_set = claim_data_set
        self.path_dir_database = os.path.join(
            path_dir_database, 'claim_database_' + self.claim_data_set)

        # === variables === #
        self.path_raw_claims = os.path.join(path_raw_data_dir,
                                            self.claim_data_set + '.jsonl')
        self.verifiable_2_int = {'NOT VERIFIABLE': 0, 'VERIFIABLE': 1}
        self.int_2_verifiable = {0: 'NOT VERIFIABLE', 1: 'VERIFIABLE'}
        self.label_2_int = {'REFUTES': 0, 'SUPPORTS': 1, 'NOT ENOUGH INFO': 2}
        self.int_2_label = {0: 'REFUTES', 1: 'SUPPORTS', 2: 'NOT ENOUGH INFO'}

        # === process === #
        print('ClaimDatabase')
        mkdir_if_not_exist(self.path_dir_database)
        self.settings = Settings(path_settings_dir=self.path_dir_database)

        self.id_2_id_number_db = Database(
            path_database_dir=self.path_dir_database,
            database_name='id_2_id_number',
            database_method='lsm',
            input_type='int',
            output_type='int',
            checks_flag=True)
        self.id_2_verifiable_db = Database(
            path_database_dir=self.path_dir_database,
            database_name='id_2_verifiable',
            database_method='lsm',
            input_type='int',
            output_type='int',
            checks_flag=True)
        self.id_2_label_db = Database(
            path_database_dir=self.path_dir_database,
            database_name='id_2_label',
            database_method='lsm',
            input_type='int',
            output_type='int',
            checks_flag=True)
        self.id_2_claim_db = Database(
            path_database_dir=self.path_dir_database,
            database_name='id_2_claim',
            database_method='lsm',
            input_type='int',
            output_type='string',
            checks_flag=True)
        self.id_2_evidence_db = Database(
            path_database_dir=self.path_dir_database,
            database_name='id_2_evidence',
            database_method='lsm',
            input_type='int',
            output_type='list_str',
            checks_flag=True)

        # === create database === #
        self.flag_function_call(function_name='create_database',
                                arg_list=[wiki_database])
        self.nr_claims = self.settings.get_item(key='nr_claims')
        print('***finished***')

    def get_item(self, input_type, input_value, output_type):
        """Look up one field of a claim.

        Args:
            input_type: only 'id' is supported.
            input_value: the key to look up.
            output_type: one of
                'id_number'      - id number as given in the raw data,
                'verifiable_int' - verifiable flag as int (0 or 1),
                'verifiable_str' - 'NOT VERIFIABLE' or 'VERIFIABLE',
                'label_int'      - label as int (0, 1 or 2),
                'label_str'      - 'REFUTES', 'SUPPORTS' or 'NOT ENOUGH INFO',
                'claim'          - stored claim text,
                'evidence'       - stored evidence,
                'evidence_class' - stored evidence wrapped in `Evidence`.

        Raises:
            ValueError: if `input_type` or `output_type` is not supported.
        """
        if input_type != 'id':
            raise ValueError('input_type not in options', input_type)

        if output_type == 'id_number':
            return self.id_2_id_number_db.get_item(input_value)
        elif output_type == 'verifiable_int':
            return self.id_2_verifiable_db.get_item(input_value)
        elif output_type == 'verifiable_str':
            return self.int_2_verifiable[self.id_2_verifiable_db.get_item(input_value)]
        elif output_type == 'label_int':
            return self.id_2_label_db.get_item(input_value)
        elif output_type == 'label_str':
            return self.int_2_label[self.id_2_label_db.get_item(input_value)]
        elif output_type == 'claim':
            return self.id_2_claim_db.get_item(input_value)
        elif output_type == 'evidence':
            return self.id_2_evidence_db.get_item(input_value)
        elif output_type == 'evidence_class':
            return Evidence(self.id_2_evidence_db.get_item(input_value))
        else:
            raise ValueError('output_type not in options', output_type)

    def create_database(self, wiki_database):
        """Load the raw claims file and fill all claim tables.

        Every claim is stored under its position in the raw file. The
        'verifiable' and 'claim' fields are normalised, and each evidence
        entry whose third element (a page title) is not None has that
        element replaced by the id returned by `wiki_database`.

        Args:
            wiki_database: object with `get_item(input_type='title',
                input_value=..., output_type='id')`.

        Raises:
            ValueError: if an evidence entry carries a title but
                `wiki_database` is None.
        """
        list_claim_dicts = load_jsonl(self.path_raw_claims)
        nr_claims = len(list_claim_dicts)

        for claim_nr, dict_claim in enumerate(tqdm(list_claim_dicts, desc='claims')):
            dict_claim['verifiable'] = unicodedata.normalize(
                'NFD', normalise_text(dict_claim['verifiable']))
            dict_claim['claim'] = unicodedata.normalize(
                'NFD', normalise_text(dict_claim['claim']))

            for interpreter_evidence in dict_claim['evidence']:
                for proof in interpreter_evidence:
                    if proof[2] is None:
                        continue
                    if wiki_database is None:
                        # Fail with a clear message instead of an
                        # AttributeError on None.
                        raise ValueError(
                            'wiki_database is required to resolve evidence titles',
                            proof[2])
                    title = unicodedata.normalize('NFD', normalise_text(proof[2]))
                    # Replace the title in place by the wiki page id.
                    proof[2] = wiki_database.get_item(input_type='title',
                                                      input_value=title,
                                                      output_type='id')

            self.id_2_id_number_db.store_item(key=claim_nr, value=dict_claim['id'])
            self.id_2_verifiable_db.store_item(
                key=claim_nr, value=self.verifiable_2_int[dict_claim['verifiable']])
            self.id_2_label_db.store_item(
                key=claim_nr, value=self.label_2_int[dict_claim['label']])
            self.id_2_claim_db.store_item(key=claim_nr, value=dict_claim['claim'])
            self.id_2_evidence_db.store_item(key=claim_nr, value=dict_claim['evidence'])

        self.settings.add_item(key='nr_claims', value=nr_claims)

    def flag_function_call(self, function_name, arg_list):
        """Run the method `function_name` once, tracked by a flag in the settings.

        Args:
            function_name: name of the method on this instance to run.
            arg_list: positional arguments passed to that method.

        Returns:
            True if the flag already reads 'finished_correctly' (nothing is
            run); None after running the method.

        Raises:
            ValueError: if the flag is neither 'finished_correctly' nor
                'not_started_yet'.
        """
        check_flag = self.settings.check_function_flag(function_name, 'check')
        if check_flag == 'finished_correctly':
            return True
        elif check_flag == 'not_started_yet':
            self.settings.check_function_flag(function_name, 'start')
            getattr(self, function_name)(*arg_list)
            self.settings.check_function_flag(function_name, 'finish')
        else:
            raise ValueError('check_flag not in options', check_flag)