Example #1
    def __extra_examples_from_org_file__(self, org_file, des_filename):
        if file_tool.check_file(des_filename):
            example_dicts = file_tool.load_data_pickle(des_filename)
            return example_dicts
        example_dicts = []
        rows = file_tool.read_tsv(org_file)
        for i, row in enumerate(rows):
            if i == 0:
                continue

            if len(row) != 6:
                raise RuntimeError(
                    'unexpected column count: {}'.format(len(row)))

            example_temp = {
                'qes_id1': int(row[1]),
                'qes1': str(row[3]).strip(),
                'qes_id2': int(row[2]),
                'qes2': str(row[4]).strip(),
                'label': int(row[5]),
                'id': int(row[0]),
            }
            example_dicts.append(example_temp)

        file_tool.save_data_pickle(example_dicts, des_filename)
        return example_dicts
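
All of these loaders lean on the same `file_tool` caching helpers, whose module is not shown. Below is a minimal sketch of plausible implementations inferred from the call sites; the signatures are assumptions, not the project's actual code.

# Minimal sketch of the assumed file_tool helpers (not the real module).
import csv
import os
import pickle

def check_file(path):
    # True if a cached pickle already exists at this path.
    return os.path.isfile(path)

def load_data_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_data_pickle(data, path):
    with open(path, 'wb') as f:
        pickle.dump(data, f)

def read_tsv(path):
    # Each row becomes a list of column strings.
    with open(path, 'r', encoding='utf-8') as f:
        return list(csv.reader(f, delimiter='\t'))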
Example #2
    def train_model(self):
        train_loader_tuple_list = self.data_loader_dict[
            'train_loader_tuple_list']
        avg_result = np.array([0, 0], dtype=np.float64)
        record_list = []
        for tuple_index, train_loader_tuple in enumerate(
                train_loader_tuple_list, 1):
            # if tuple_index > 2:
            #     break
            # Re-create the framework at the start of each fold's training.
            self.create_framework()
            self.logger.info('{} was created!'.format(self.framework.name))
            train_loader, valid_loader = train_loader_tuple
            self.logger.info('train_loader:{}  valid_loader:{}'.format(
                len(train_loader), len(valid_loader)))
            self.logger.info('begin train {}-th fold'.format(tuple_index))
            result = self.__train_fold__(train_loader=train_loader,
                                         valid_loader=valid_loader)
            self.trial_step = self.arg_dict['epoch'] * tuple_index
            avg_result += np.array(result[0:2], dtype=np.float64)
            record_list.append(result[3])

        record_file = file_tool.connect_path(
            self.framework.arg_dict['model_path'], 'record_list.pkl')
        file_tool.save_data_pickle(record_list, record_file)
        avg_result = (avg_result / len(train_loader_tuple_list)).tolist()
        avg_result.append('finish')
        self.logger.info('avg_acc:{}'.format(avg_result[0]))
        return avg_result
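
As a standalone illustration of the per-fold averaging above, here is the same reduction on made-up numbers; the (accuracy, loss) layout of each result tuple is an assumption based on how `result[0:2]` and `avg_result[0]` are used.

# Hedged illustration of averaging fold results; numbers are invented.
import numpy as np

fold_results = [(0.81, 0.52), (0.84, 0.49), (0.79, 0.55)]
avg_result = np.zeros(2, dtype=np.float64)
for result in fold_results:
    avg_result += np.array(result[0:2], dtype=np.float64)
avg_result = (avg_result / len(fold_results)).tolist()
print('avg_acc:{}'.format(avg_result[0]))  # ~0.813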
Example #3
    def __extra_sentences_from_org_file__(self, org_file, des_filename):
        if file_tool.check_file(des_filename):
            sentence_dict = file_tool.load_data_pickle(des_filename)
            return sentence_dict
        sentence_dict = {}
        rows = file_tool.load_data(org_file, mode='r')
        for i, row in enumerate(rows):
            result = row.split("\t")
            if i == 0:
                continue
            if len(result) != 6:
                raise RuntimeError(
                    'unexpected column count: {}'.format(len(result)))

            if not general_tool.is_number(result[1]):
                raise RuntimeError(
                    'sentence id is not numeric: {}'.format(result[1]))

            if not general_tool.is_number(result[2]):
                raise RuntimeError(
                    'sentence id is not numeric: {}'.format(result[2]))

            if str(result[3]).strip() == '':
                print('empty sentence id:{}'.format(str(result[1]).strip()))
            else:
                sentence_dict[str(result[1]).strip()] = str(result[3]).strip()

            if str(result[4]).strip() == '':
                print('empty sentence id:{}'.format(str(result[2]).strip()))
            else:
                sentence_dict[str(result[2]).strip()] = str(result[4]).strip()

        file_tool.save_data_pickle(sentence_dict, des_filename)
        return sentence_dict
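
`general_tool.is_number` is referenced but never defined in these snippets; a plausible implementation, offered only as an assumption:

# Assumed implementation of general_tool.is_number (not the real code).
def is_number(s):
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        return False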
Example #4
def get_qqp_obj(force=False):

    global single_qqp_obj
    if force or (single_qqp_obj is None):
        single_qqp_obj_file = file_tool.connect_path("corpus/qqp",
                                                     'qqp_obj.pkl')
        if file_tool.check_file(single_qqp_obj_file):
            single_qqp_obj = file_tool.load_data_pickle(single_qqp_obj_file)
        else:
            single_qqp_obj = Qqp()
            file_tool.save_data_pickle(single_qqp_obj, single_qqp_obj_file)

    return single_qqp_obj
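
A usage sketch of the lazy singleton above. Note that `force=True` only bypasses the in-memory cache: if the pickle file exists it is still loaded rather than rebuilt. The module-level variable must be initialised before the first call.

# Hypothetical call pattern; Qqp and file_tool come from the project.
single_qqp_obj = None  # module-level cache assumed by get_qqp_obj

qqp = get_qqp_obj()                 # first call: unpickle or build a Qqp()
same = get_qqp_obj()                # later calls reuse the cached instance
reloaded = get_qqp_obj(force=True)  # re-runs the load-or-build branch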
Example #5
    def show_pared_info(self):
        print('the count of dep type:{}'.format(
            self.parse_info.dependency_count))
        print(
            'the max len of sentence_tokens:{}, corresponding sent id:{}'.format(
                self.parse_info.max_sent_len, self.parse_info.max_sent_id))
        print('the average len of sentence_tokens:{}'.format(
            self.parse_info.avg_sent_len))
        sent_len_table = self.parse_info.sent_len_table
        file_tool.save_data_pickle(
            sent_len_table,
            file_tool.connect_path(self.data_path, "sent_len_table.pkl"))
        plt.bar(range(1, len(sent_len_table) + 1), sent_len_table)
        plt.title("sentence tokens length distribution")
        plt.show()
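
The plot assumes `sent_len_table` is a histogram whose entry i-1 counts sentences of i tokens. A minimal, self-contained way to build such a table (an illustration, not the project's code):

# Build a token-length histogram like the assumed sent_len_table.
from collections import Counter

sentences = ['a b c', 'a b', 'a b c']
counts = Counter(len(s.split()) for s in sentences)
sent_len_table = [counts.get(i, 0) for i in range(1, max(counts) + 1)]
print(sent_len_table)  # [0, 1, 2]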
Example #6
    def parse_sentences(self):
        parsed_sentence_org_file = 'corpus/mrpc/parsed_sentences.txt'
        parsed_sentence_dict_file = 'corpus/mrpc/parsed_sentence_dict.pkl'
        if file_tool.check_file(parsed_sentence_dict_file):
            parsed_sentence_dict = file_tool.load_data_pickle(
                parsed_sentence_dict_file)
        else:
            parsed_sentence_dict = parser_tool.extra_parsed_sentence_dict_from_org_file(
                parsed_sentence_org_file)
            file_tool.save_data_pickle(parsed_sentence_dict,
                                       parsed_sentence_dict_file)

        if len(parsed_sentence_dict) != len(self.sentence_dict):
            # raise ValueError("parsed_sentence_dict not match sentence_dict")
            pass

        if not general_tool.compare_two_dict_keys(self.sentence_dict.copy(),
                                                  parsed_sentence_dict.copy()):
            raise ValueError("parsed_sentence_dict not march sentence_dict")

        # for sent_id, info in parsed_sentence_dict.items():
        #     if info['original'] != self.sentence_dict[sent_id].original:
        #         raise ValueError("parsed_sentence_dict not match sentence_dict")

        for sent_id, parse_info in parsed_sentence_dict.items():
            sent_id = str(sent_id)
            self.sentence_dict[sent_id].parse_info = parse_info

        self.parse_info = parser_tool.process_parsing_sentence_dict(
            parsed_sentence_dict, modify_dep_name=True)
        numeral_sentence_dict = self.parse_info.numeral_sentence_dict

        if not general_tool.compare_two_dict_keys(
                self.sentence_dict.copy(), numeral_sentence_dict.copy()):
            raise ValueError("numeral_sentence_dict not march sentence_dict")

        for sent_id in self.sentence_dict.keys():
            self.sentence_dict[sent_id].syntax_info = numeral_sentence_dict[
                sent_id]

        # print('the count of dep type:{}'.format(self.parse_info.dependency_count))
        # print('the max len of sentence_tokens:{}'.format(self.parse_info.max_sent_len))

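
`general_tool.compare_two_dict_keys` serves as the authoritative consistency check here. Its implementation is not shown; a plausible sketch, assuming it returns True when the key sets agree:

# Assumed behavior of general_tool.compare_two_dict_keys.
def compare_two_dict_keys(dict1, dict2):
    return set(dict1.keys()) == set(dict2.keys())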
Example #7
    def parse_sentences(self):
        parsed_sentence_org_file = file_tool.connect_path(
            self.data_path, 'parsed_sentences.txt')
        parsed_sentence_dict_file = file_tool.connect_path(
            self.data_path, 'parsed_sentence_dict.pkl')
        if file_tool.check_file(parsed_sentence_dict_file):
            parsed_sentence_dict = file_tool.load_data_pickle(
                parsed_sentence_dict_file)
        else:
            parsed_sentence_dict = parser_tool.extra_parsed_sentence_dict_from_org_file(
                parsed_sentence_org_file)
            file_tool.save_data_pickle(parsed_sentence_dict,
                                       parsed_sentence_dict_file)

        if len(parsed_sentence_dict) != len(self.sentence_dict):
            raise ValueError("parsed_sentence_dict not march sentence_dict")

        if not general_tool.compare_two_dict_keys(self.sentence_dict.copy(),
                                                  parsed_sentence_dict.copy()):
            raise ValueError("parsed_sentence_dict not march sentence_dict")

        for sent_id, info in parsed_sentence_dict.items():
            if info['original'] != self.sentence_dict[sent_id].original:
                raise ValueError(
                    "parsed_sentence_dict not march sentence_dict")

        for sent_id, parse_info in parsed_sentence_dict.items():
            sent_id = str(sent_id)
            self.sentence_dict[sent_id].parse_info = parse_info

        self.parse_info = parser_tool.process_parsing_sentence_dict(
            parsed_sentence_dict, modify_dep_name=True)
        numeral_sentence_dict = self.parse_info.numeral_sentence_dict
        self.max_sent_len = self.parse_info.max_sent_len

        if not general_tool.compare_two_dict_keys(
                self.sentence_dict.copy(), numeral_sentence_dict.copy()):
            raise ValueError("numeral_sentence_dict not march sentence_dict")

        for sent_id in self.sentence_dict.keys():
            self.sentence_dict[sent_id].syntax_info = numeral_sentence_dict[
                sent_id]
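
This stricter variant also cross-checks the original sentence text. A minimal standalone illustration of that check; the `Sent` class and the sample strings are stand-ins, and only the `info['original']` access is taken from the snippet:

# Hedged illustration of the per-sentence text consistency check.
class Sent:
    def __init__(self, original):
        self.original = original

sentence_dict = {'17': Sent('The quick brown fox .')}
parsed_sentence_dict = {'17': {'original': 'The quick brown fox .'}}

for sent_id, info in parsed_sentence_dict.items():
    if info['original'] != sentence_dict[sent_id].original:
        raise ValueError('parsed_sentence_dict not match sentence_dict')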
Example #8
    def __extra_examples_from_org_file__(self, org_file, des_filename):
        if file_tool.check_file(des_filename):
            example_dicts = file_tool.load_data_pickle(des_filename)
            return example_dicts
        example_dicts = []
        rows = file_tool.load_data(org_file, mode='r')
        examples_id = 0
        for i, row in enumerate(rows):
            result = row.split("\t")
            if i == 0:
                continue
            if len(result) != 5:
                raise RuntimeError(
                    'unexpected column count: {}'.format(len(result)))
            example_temp = {
                'sent_id1': int(result[1]),
                'sent_id2': int(result[2]),
                'label': int(result[0]),
                'id': examples_id
            }
            example_dicts.append(example_temp)
            examples_id += 1
        file_tool.save_data_pickle(example_dicts, des_filename)
        return example_dicts
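
For reference, one made-up input row and the example dict it maps to under the parser above; the column order (label, sent_id1, sent_id2, then two text columns this loader ignores) is inferred from the indexing.

# Made-up row in the assumed 5-column format (label, id1, id2, text1, text2).
row = '1\t702876\t702977\tAmrozi accused his brother .\tReferring to him .\n'
result = row.split('\t')
assert len(result) == 5
example = {
    'sent_id1': int(result[1]),
    'sent_id2': int(result[2]),
    'label': int(result[0]),
    'id': 0,
}
print(example)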
Example #9
    def __extra_sentences_from_org_file__(self, org_file, des_filename):
        if file_tool.check_file(des_filename):
            sentence_dict = file_tool.load_data_pickle(des_filename)
            return sentence_dict
        sentence_dict = {}
        rows = file_tool.load_data(org_file, mode='r')
        for i, row in enumerate(rows):
            result = row.split("\t")
            if i == 0:
                continue
            if len(result) != 7:
                raise RuntimeError(
                    'unexpected column count: {}'.format(len(result)))

            if not general_tool.is_number(result[0]):
                raise RuntimeError(
                    'sentence id is not numeric: {}'.format(result[0]))

            if str(result[0]) in sentence_dict:
                raise RuntimeError(
                    'duplicate sentence id: {}'.format(result[0]))

            sentence_dict[str(result[0])] = str(result[1])

        file_tool.save_data_pickle(sentence_dict, des_filename)

        return sentence_dict
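
And one made-up row in the 7-column format this loader expects: the first column is a numeric sentence id, the second the sentence text, and the remaining five columns are ignored here.

# Illustrative 7-column row; content is invented.
row = '12\tHow do I read a TSV file ?\tc3\tc4\tc5\tc6\tc7\n'
result = row.split('\t')
assert len(result) == 7
sentence_dict = {str(result[0]): str(result[1])}
print(sentence_dict)  # {'12': 'How do I read a TSV file ?'}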