Пример #1
0
    def __init__(self, path_dir_database, path_wiki_pages):
        """Open the wiki stores and build them when they are not built yet.

        Args:
            path_dir_database: parent directory; the stores are kept in its
                'wiki_database' sub-directory, which is created if missing.
            path_wiki_pages: location of the raw wiki page files; stored on
                the instance as ``path_wiki_pages``.
        """
        # --- remember the inputs --- #
        self.path_dir_database = os.path.join(path_dir_database,
                                              'wiki_database')
        self.path_wiki_pages = path_wiki_pages

        # --- set up the storage directory and its settings --- #
        print('WikiDatabase')
        mkdir_if_not_exist(self.path_dir_database)
        self.settings = Settings(self.path_dir_database)

        # (attribute, database name, key type, value type), opened in this order
        store_specs = (
            ('title_2_id_db', 'title_2_id', 'string', 'int'),
            ('id_2_title_db', 'id_2_title', 'int', 'string'),
            ('id_2_text_db', 'id_2_text', 'int', 'string'),
            ('id_2_lines_db', 'id_2_lines', 'int', 'list_str'),
        )
        for attribute, store_name, key_type, value_type in store_specs:
            store = Database(path_database_dir=self.path_dir_database,
                             database_name=store_name,
                             database_method='lsm',
                             input_type=key_type,
                             output_type=value_type,
                             checks_flag=True)
            setattr(self, attribute, store)

        # --- run 'create_database' through the flag wrapper --- #
        self.flag_function_call(function_name='create_database', arg_list=[])

        # the page count is read back from the settings store
        self.nr_wikipedia_pages = self.settings.get_item(
            key='nr_wikipedia_pages')

        print('***finished***')
Пример #2
0
    def __init__(self, path_dir_database, claim_database, method_tokenization, n_gram, delimiter_option=False):
        """Open the tokenised claim store selected by the given options.

        Args:
            path_dir_database: parent directory; the stores are kept in the
                sub-directory 'claim_database_n_gram_<claim_data_set>'.
            claim_database: object providing ``claim_data_set``; stored on
                the instance as ``claim_database``.
            method_tokenization: tokenisation option of the store to open.
            n_gram: n-gram option of the store to open.
            delimiter_option: delimiter option of the store to open.
        """
        # --- remember the inputs --- #
        self.claim_database = claim_database
        self.method_tokenization = method_tokenization
        self.n_gram = n_gram
        self.delimiter_option = delimiter_option

        sub_directory = 'claim_database_n_gram_' + self.claim_database.claim_data_set
        self.path_dir_database = os.path.join(path_dir_database, sub_directory)

        # --- set up the storage directory, settings and spaCy pipeline --- #
        print('ClaimDatabaseNgrams')
        mkdir_if_not_exist(self.path_dir_database)
        self.settings = Settings(self.path_dir_database)
        self.nlp = spacy.load('en', disable=["parser", "ner"])

        # --- run 'create_database' through the flag wrapper --- #
        # arguments, in order: tokenisation methods, n-gram sizes,
        # delimiter options, document types
        build_arguments = [
            ['tokenize', 'tag'],
            [1],
            [True, False],
            ['claim'],
        ]
        self.flag_function_call(function_name='create_database',
                                arg_list=build_arguments)

        # --- open the one store this instance serves --- #
        selected_store_name = get_database_name_from_options(
            doc_type='claim',
            method_tokenization=self.method_tokenization,
            n_gram=self.n_gram,
            delimiter_option=self.delimiter_option)
        self.id_2_claim_db = Database(path_database_dir=self.path_dir_database,
                                      database_name=selected_store_name,
                                      database_method='lsm',
                                      input_type='int',
                                      output_type='list_str',
                                      checks_flag=True)

        print('***finished***')
Пример #3
0
class ClaimDatabaseNgrams:
    """Tokenised views of the claims provided by a claim database.

    On construction the class runs ``create_database`` through
    ``flag_function_call`` and then opens the single store that matches the
    requested (method_tokenization, n_gram, delimiter_option) combination.
    """

    def __init__(self, path_dir_database, claim_database, method_tokenization, n_gram, delimiter_option=False):
        """Open the tokenised claim store selected by the given options.

        Args:
            path_dir_database: parent directory; the stores are kept in the
                sub-directory 'claim_database_n_gram_<claim_data_set>'.
            claim_database: source of the claims; must provide
                ``claim_data_set``, ``nr_claims`` and ``get_item``.
            method_tokenization: tokenisation option of the store to open.
            n_gram: n-gram option of the store to open.
            delimiter_option: delimiter option of the store to open.
        """
        # === save input(s) ===#
        self.claim_database = claim_database
        self.method_tokenization = method_tokenization
        self.n_gram = n_gram
        self.delimiter_option = delimiter_option
        self.path_dir_database = os.path.join(path_dir_database,
                                              'claim_database_n_gram_' + self.claim_database.claim_data_set)

        # === process === #
        print('ClaimDatabaseNgrams')

        mkdir_if_not_exist(self.path_dir_database)

        self.settings = Settings(self.path_dir_database)

        self.nlp = spacy.load('en', disable=["parser", "ner"])

        method_tokenization_list = ['tokenize', 'tag']  # ['tokenize', 'lemma', 'tag', 'lower']
        n_gram_list = [1]
        delimiter_options_list = [True, False]
        doc_type_list = ['claim']

        # === create database === #
        self.flag_function_call(function_name='create_database', arg_list=[method_tokenization_list,
                                                                           n_gram_list,
                                                                           delimiter_options_list,
                                                                           doc_type_list])

        # NOTE(review): create_database only fills the delimiter_option=True
        # store when n_gram == 1, while delimiter_option defaults to False
        # here. Confirm that get_database_name_from_options maps both to the
        # same store name for unigrams.
        self.id_2_claim_db = Database(path_database_dir=self.path_dir_database,
                                      database_name=get_database_name_from_options(doc_type='claim',
                                                                                   method_tokenization=self.method_tokenization,
                                                                                   n_gram=self.n_gram,
                                                                                   delimiter_option=self.delimiter_option),
                                      database_method='lsm',
                                      input_type='int',
                                      output_type='list_str',
                                      checks_flag=True)

        print('***finished***')

    def get_item(self, input_type, input_value, output_type):
        """Look up one entry of the opened claim store.

        Args:
            input_type: kind of key; only 'id' is supported.
            input_value: the key passed to the store.
            output_type: kind of value; only 'claim' is supported.

        Returns:
            Whatever ``self.id_2_claim_db.get_item(input_value)`` returns.

        Raises:
            ValueError: if ``input_type`` or ``output_type`` is unsupported.
        """
        if input_type == 'id':
            if output_type == 'claim':
                return self.id_2_claim_db.get_item(input_value)
            else:
                raise ValueError('output_type not in options', output_type)
        else:
            raise ValueError('input_type not in options', input_type)

    def create_database(self, method_tokenization_list, n_gram_list, delimiter_options_list, doc_type_list):
        """Fill one store per (doc type, tokenisation, n-gram, delimiter) setting.

        Args:
            method_tokenization_list: tokenisation methods to generate.
            n_gram_list: n-gram sizes to generate.
            delimiter_options_list: delimiter options to generate; only used
                for n-gram sizes other than 1.
            doc_type_list: document types to process; only 'claim' is
                supported.

        Raises:
            ValueError: if ``doc_type_list`` contains an unsupported type.
        """
        database_dict = {}

        batch_size = 500000

        # For unigrams only delimiter_option=True is generated; every other
        # n-gram size gets one setting per entry of delimiter_options_list.
        experiment_settings_list = []
        for method_tokenization in method_tokenization_list:
            for n_gram in n_gram_list:
                if n_gram == 1:
                    experiment_settings_list.append([method_tokenization, n_gram, True])
                else:
                    for delimiter_option in delimiter_options_list:
                        experiment_settings_list.append([method_tokenization, n_gram, delimiter_option])

        # === create databases === #
        for doc_type in doc_type_list:
            for method_tokenization, n_gram, delimiter_option in experiment_settings_list:
                database_dict = create_path_dictionary(
                    [doc_type, method_tokenization, n_gram, delimiter_option],
                    database_dict)
                database_dict[doc_type][method_tokenization][n_gram][delimiter_option] = Database(
                    path_database_dir=self.path_dir_database,
                    database_name=get_database_name_from_options(doc_type=doc_type,
                                                                 method_tokenization=method_tokenization,
                                                                 n_gram=n_gram,
                                                                 delimiter_option=delimiter_option),
                    database_method='lsm',
                    input_type='int',
                    output_type='list_str',
                    checks_flag=True)

        # The claims come from the database handed to __init__. The original
        # code read a bare `claim_database` name here, which is not defined
        # in this method's scope.
        claim_db = self.claim_database

        for doc_type in doc_type_list:
            if doc_type not in ['claim']:
                raise ValueError('doc_type not in options', doc_type)

            nr_claims = claim_db.nr_claims
            text_list = []
            id_list = []

            for doc_nr in tqdm(range(nr_claims), desc='n_gram_claim_database_' + doc_type):
                text = claim_db.get_item(input_type='id', input_value=doc_nr, output_type=doc_type)
                text_list.append(text)
                id_list.append(doc_nr)

                # Flush on every multiple of batch_size (this includes
                # doc_nr 0) and on the very last claim.
                if doc_nr % batch_size == 0 or doc_nr == nr_claims - 1:
                    pipeline = tqdm(self.nlp.pipe(iter_phrases(text_list)), desc='pipeline', total=len(text_list))
                    for idx, doc in enumerate(pipeline):
                        text_class = Text(doc)

                        # id_list is parallel to text_list, so position idx
                        # gives the claim number of this document
                        doc_nr_batch = id_list[idx]

                        for method_tokenization, n_gram, delimiter_option in experiment_settings_list:
                            tokenized_text = text_class.process(method_tokenization=method_tokenization,
                                                                n_gram=n_gram,
                                                                delimiter_flag=delimiter_option)

                            database_dict[doc_type][method_tokenization][n_gram][
                                delimiter_option].store_item(key=doc_nr_batch,
                                                             value=tokenized_text)

                    text_list = []
                    id_list = []

    # === recurrent functions == #
    def flag_function_call(self, function_name, arg_list):
        """Run the method ``function_name`` unless it is flagged as finished.

        Args:
            function_name: name of the method on this instance to call.
            arg_list: positional arguments passed to that method.

        Returns:
            True when the flag is 'finished_correctly'; None after the
            method has been run.

        Raises:
            ValueError: if the flag is neither 'finished_correctly' nor
                'not_started_yet'.
        """
        check_flag = self.settings.check_function_flag(function_name, 'check')

        if check_flag == 'finished_correctly':
            return True

        elif check_flag == 'not_started_yet':
            self.settings.check_function_flag(function_name, 'start')

            getattr(self, function_name)(*arg_list)

            self.settings.check_function_flag(function_name, 'finish')
        else:
            raise ValueError('check_flag not in options', check_flag)
Пример #4
0
class WikiDatabaseSqlite:
    """Stores mapping wiki page titles, ids, texts and lines to each other."""

    def __init__(self, path_dir_database, path_wiki_pages):
        """Open the wiki stores and build them when they are not built yet.

        Args:
            path_dir_database: parent directory; the stores are kept in its
                'wiki_database' sub-directory, which is created if missing.
            path_wiki_pages: directory holding the 'wiki-NNN.jsonl' files.
        """
        self.path_dir_database = os.path.join(path_dir_database,
                                              'wiki_database')
        self.path_wiki_pages = path_wiki_pages

        print('WikiDatabase')
        mkdir_if_not_exist(self.path_dir_database)
        self.settings = Settings(self.path_dir_database)

        def open_store(store_name, key_type, value_type):
            # every store shares the directory, method and checks flag
            return Database(path_database_dir=self.path_dir_database,
                            database_name=store_name,
                            database_method='lsm',
                            input_type=key_type,
                            output_type=value_type,
                            checks_flag=True)

        self.title_2_id_db = open_store('title_2_id', 'string', 'int')
        self.id_2_title_db = open_store('id_2_title', 'int', 'string')
        self.id_2_text_db = open_store('id_2_text', 'int', 'string')
        self.id_2_lines_db = open_store('id_2_lines', 'int', 'list_str')

        # build the stores unless the settings flag says it is already done
        self.flag_function_call(function_name='create_database', arg_list=[])

        self.nr_wikipedia_pages = self.settings.get_item(
            key='nr_wikipedia_pages')

        print('***finished***')

    def get_item(self, input_type, input_value, output_type):
        """Look up a value in the store selected by the two type arguments.

        Args:
            input_type: 'id' or 'title'.
            input_value: the key passed to the selected store.
            output_type: 'title', 'text' or 'lines' for an 'id' key;
                'id' for a 'title' key.

        Returns:
            The result of ``get_item(input_value)`` on the selected store.

        Raises:
            ValueError: if the combination of types is not supported.
        """
        if input_type == 'id':
            by_id = (('title', 'id_2_title_db'),
                     ('text', 'id_2_text_db'),
                     ('lines', 'id_2_lines_db'))
            for option, attribute in by_id:
                if output_type == option:
                    return getattr(self, attribute).get_item(input_value)
            raise ValueError('output_type not in options', output_type)

        if input_type == 'title':
            if output_type == 'id':
                return self.title_2_id_db.get_item(input_value)
            raise ValueError('output_type not in options', output_type)

        raise ValueError('input_type not in options', input_type)

    def create_database(self):
        """Read every wiki dump file and fill the four stores.

        Pages whose normalised title is empty are skipped; all other pages
        receive consecutive ids starting at 0. The number of files and the
        number of stored pages are written to the settings.
        """
        file_count = num_files_in_directory(self.path_wiki_pages)
        self.settings.add_item(key='nr_wikipedia_files', value=file_count)

        next_id = 0
        file_numbers = range(1, file_count + 1)

        for file_nr in tqdm(file_numbers, desc='wiki_page_nr'):
            # dump files are numbered from 1 and zero-padded to three digits
            file_name = 'wiki-%.3d.jsonl' % (file_nr)
            pages = load_jsonl(os.path.join(self.path_wiki_pages, file_name))

            for page in pages:
                title = normalise_text(page['id'])
                if title == '':
                    continue

                text = normalise_text(page['text'])
                self.title_2_id_db.store_item(key=title, value=next_id)
                self.id_2_title_db.store_item(key=next_id, value=title)
                self.id_2_text_db.store_item(key=next_id, value=text)
                self.id_2_lines_db.store_item(key=next_id,
                                              value=get_list_lines(page))
                next_id += 1

        self.settings.add_item(key='nr_wikipedia_pages', value=next_id)

    # === recurrent functions == #
    def flag_function_call(self, function_name, arg_list):
        """Run the method ``function_name`` unless it is flagged as finished.

        Returns True when the flag is 'finished_correctly', None after the
        method has been run, and raises ValueError for any other flag.
        """
        status = self.settings.check_function_flag(function_name, 'check')

        if status == 'finished_correctly':
            return True
        if status != 'not_started_yet':
            raise ValueError('check_flag not in options', status)

        self.settings.check_function_flag(function_name, 'start')
        getattr(self, function_name)(*arg_list)
        self.settings.check_function_flag(function_name, 'finish')
Пример #5
0
    def __init__(self,
                 path_dir_database,
                 path_raw_data_dir,
                 claim_data_set,
                 wiki_database=None):
        """Open the claim stores and build them when they are not built yet.

        Args:
            path_dir_database: parent directory; the stores are kept in the
                sub-directory 'claim_database_<claim_data_set>'.
            path_raw_data_dir: directory holding '<claim_data_set>.jsonl'.
            claim_data_set: name of the claim data set.
            wiki_database: passed on to 'create_database' as its only
                argument.
        """
        # --- remember the inputs and derive the paths --- #
        self.path_raw_data_dir = path_raw_data_dir
        self.claim_data_set = claim_data_set
        self.path_dir_database = os.path.join(
            path_dir_database, 'claim_database_' + self.claim_data_set)
        self.path_raw_claims = os.path.join(path_raw_data_dir,
                                            self.claim_data_set + '.jsonl')

        # --- string <-> integer encodings --- #
        self.verifiable_2_int = {'NOT VERIFIABLE': 0, 'VERIFIABLE': 1}
        self.int_2_verifiable = {0: 'NOT VERIFIABLE', 1: 'VERIFIABLE'}

        self.label_2_int = {'REFUTES': 0, 'SUPPORTS': 1, 'NOT ENOUGH INFO': 2}
        self.int_2_label = {0: 'REFUTES', 1: 'SUPPORTS', 2: 'NOT ENOUGH INFO'}

        # --- set up the storage directory and its settings --- #
        print('ClaimDatabase')
        mkdir_if_not_exist(self.path_dir_database)
        self.settings = Settings(path_settings_dir=self.path_dir_database)

        # (attribute, database name, value type); every store is keyed by int
        store_specs = (
            ('id_2_id_number_db', 'id_2_id_number', 'int'),
            ('id_2_verifiable_db', 'id_2_verifiable', 'int'),
            ('id_2_label_db', 'id_2_label', 'int'),
            ('id_2_claim_db', 'id_2_claim', 'string'),
            ('id_2_evidence_db', 'id_2_evidence', 'list_str'),
        )
        for attribute, store_name, value_type in store_specs:
            store = Database(path_database_dir=self.path_dir_database,
                             database_name=store_name,
                             database_method='lsm',
                             input_type='int',
                             output_type=value_type,
                             checks_flag=True)
            setattr(self, attribute, store)

        # --- run 'create_database' through the flag wrapper --- #
        self.flag_function_call(function_name='create_database',
                                arg_list=[wiki_database])

        # the claim count is read back from the settings store
        self.nr_claims = self.settings.get_item(key='nr_claims')

        print('***finished***')
Пример #6
0
class ClaimDatabase:
    """Stores for the claims of one data set, keyed by a running claim number.

    The claims are read from '<claim_data_set>.jsonl'. For each claim the raw
    id, verifiable flag, label, claim text and evidence are stored; evidence
    page titles are replaced by ids looked up in the wiki database.
    """

    def __init__(self,
                 path_dir_database,
                 path_raw_data_dir,
                 claim_data_set,
                 wiki_database=None):
        """Open the claim stores and build them when they are not built yet.

        Args:
            path_dir_database: parent directory; the stores are kept in the
                sub-directory 'claim_database_<claim_data_set>'.
            path_raw_data_dir: directory holding '<claim_data_set>.jsonl'.
            claim_data_set: name of the claim data set.
            wiki_database: passed on to ``create_database``. It is only
                used when the stores still have to be built; see
                ``create_database`` for what it must provide.
        """
        # === save input(s) ===#
        self.path_raw_data_dir = path_raw_data_dir
        self.claim_data_set = claim_data_set
        self.path_dir_database = os.path.join(
            path_dir_database, 'claim_database_' + self.claim_data_set)

        # === variables === #
        self.path_raw_claims = os.path.join(path_raw_data_dir,
                                            self.claim_data_set + '.jsonl')

        # string <-> integer encodings used by the stores and get_item
        self.verifiable_2_int = {'NOT VERIFIABLE': 0, 'VERIFIABLE': 1}
        self.int_2_verifiable = {0: 'NOT VERIFIABLE', 1: 'VERIFIABLE'}

        self.label_2_int = {'REFUTES': 0, 'SUPPORTS': 1, 'NOT ENOUGH INFO': 2}
        self.int_2_label = {0: 'REFUTES', 1: 'SUPPORTS', 2: 'NOT ENOUGH INFO'}

        # === process === #
        print('ClaimDatabase')

        mkdir_if_not_exist(self.path_dir_database)

        self.settings = Settings(path_settings_dir=self.path_dir_database)

        self.id_2_id_number_db = Database(
            path_database_dir=self.path_dir_database,
            database_name='id_2_id_number',
            database_method='lsm',
            input_type='int',
            output_type='int',
            checks_flag=True)
        self.id_2_verifiable_db = Database(
            path_database_dir=self.path_dir_database,
            database_name='id_2_verifiable',
            database_method='lsm',
            input_type='int',
            output_type='int',
            checks_flag=True)
        self.id_2_label_db = Database(path_database_dir=self.path_dir_database,
                                      database_name='id_2_label',
                                      database_method='lsm',
                                      input_type='int',
                                      output_type='int',
                                      checks_flag=True)
        self.id_2_claim_db = Database(path_database_dir=self.path_dir_database,
                                      database_name='id_2_claim',
                                      database_method='lsm',
                                      input_type='int',
                                      output_type='string',
                                      checks_flag=True)
        self.id_2_evidence_db = Database(
            path_database_dir=self.path_dir_database,
            database_name='id_2_evidence',
            database_method='lsm',
            input_type='int',
            output_type='list_str',
            checks_flag=True)

        # === create database === #
        self.flag_function_call(function_name='create_database',
                                arg_list=[wiki_database])

        self.nr_claims = self.settings.get_item(key='nr_claims')

        print('***finished***')

    def get_item(self, input_type, input_value, output_type):
        """Look up one property of a claim.

        Args:
            input_type: kind of key; only 'id' is supported.
            input_value: the key passed to the selected store.
            output_type: one of 'id_number', 'verifiable_int',
                'verifiable_str', 'label_int', 'label_str', 'claim',
                'evidence' or 'evidence_class'.

        Returns:
            The stored value; the '*_str' variants are decoded through
            ``int_2_verifiable`` / ``int_2_label`` and 'evidence_class'
            wraps the stored evidence in ``Evidence``.

        Raises:
            ValueError: if ``input_type`` or ``output_type`` is unsupported.
        """
        if input_type == 'id':
            if output_type == 'id_number':
                # return the id number as specified in the raw data
                return self.id_2_id_number_db.get_item(input_value)
            elif output_type == 'verifiable_int':
                # return the verifiable flag in integer (int) format (0 or 1)
                return self.id_2_verifiable_db.get_item(input_value)
            elif output_type == 'verifiable_str':
                # return the verifiable flag in string(str) format ('NOT VERIFIABLE', 'VERIFIABLE')
                return self.int_2_verifiable[self.id_2_verifiable_db.get_item(
                    input_value)]
            elif output_type == 'label_int':
                # return the label in integer (int) format (0, 1, 2)
                return self.id_2_label_db.get_item(input_value)
            elif output_type == 'label_str':
                # return the label in string (str) format ('REFUTES', 'SUPPORTS', 'NOT ENOUGH INFO')
                return self.int_2_label[self.id_2_label_db.get_item(
                    input_value)]
            elif output_type == 'claim':
                return self.id_2_claim_db.get_item(input_value)
            elif output_type == 'evidence':
                return self.id_2_evidence_db.get_item(input_value)
            elif output_type == 'evidence_class':
                return Evidence(self.id_2_evidence_db.get_item(input_value))
            else:
                raise ValueError('output_type not in options', output_type)
        else:
            raise ValueError('input_type not in options', input_type)

    def create_database(self, wiki_database):
        """Read the raw claims file and fill the five stores.

        Args:
            wiki_database: object whose
                ``get_item(input_type='title', input_value=..., output_type='id')``
                resolves an evidence page title to an id. It is called for
                every evidence entry with a non-None title, so passing None
                fails as soon as such an entry is met.
        """
        list_claim_dicts = load_jsonl(self.path_raw_claims)
        nr_claims = len(list_claim_dicts)

        # the running claim number is the position in the raw file
        for claim_nr in tqdm(range(nr_claims), desc='claims'):
            dict_claim = list_claim_dicts[claim_nr]

            dict_claim['verifiable'] = unicodedata.normalize(
                'NFD', normalise_text(dict_claim['verifiable']))
            dict_claim['claim'] = unicodedata.normalize(
                'NFD', normalise_text(dict_claim['claim']))

            # Evidence is nested two levels deep; index 2 of each innermost
            # entry is the page title (or None) and is replaced in place by
            # the id from the wiki database.
            for interpreter_evidence in dict_claim['evidence']:
                for proof in interpreter_evidence:
                    if proof[2] is not None:
                        title = unicodedata.normalize(
                            'NFD', normalise_text(proof[2]))
                        proof[2] = wiki_database.get_item(input_type='title',
                                                          input_value=title,
                                                          output_type='id')

            self.id_2_id_number_db.store_item(key=claim_nr,
                                              value=dict_claim['id'])

            self.id_2_verifiable_db.store_item(
                key=claim_nr,
                value=self.verifiable_2_int[dict_claim['verifiable']])

            self.id_2_label_db.store_item(
                key=claim_nr, value=self.label_2_int[dict_claim['label']])

            self.id_2_claim_db.store_item(key=claim_nr,
                                          value=dict_claim['claim'])

            self.id_2_evidence_db.store_item(key=claim_nr,
                                             value=dict_claim['evidence'])

        self.settings.add_item(key='nr_claims', value=nr_claims)

    def flag_function_call(self, function_name, arg_list):
        """Run the method ``function_name`` unless it is flagged as finished.

        Args:
            function_name: name of the method on this instance to call.
            arg_list: positional arguments passed to that method.

        Returns:
            True when the flag is 'finished_correctly'; None after the
            method has been run.

        Raises:
            ValueError: if the flag is neither 'finished_correctly' nor
                'not_started_yet'.
        """
        check_flag = self.settings.check_function_flag(function_name, 'check')

        if check_flag == 'finished_correctly':
            return True

        elif check_flag == 'not_started_yet':
            self.settings.check_function_flag(function_name, 'start')

            getattr(self, function_name)(*arg_list)

            self.settings.check_function_flag(function_name, 'finish')
        else:
            raise ValueError('check_flag not in options', check_flag)