示例#1
0
    def deal_tagdata(self,
                     tagdata_filepaths: list,
                     rate: float = config.WR_RATE):
        """Turn tagged-data files into shuffled train/test vector lists.

        Each file is looked up under ``config.TAG_DIC + '/sr/seq/'``. Its
        first line is skipped and the remaining lines are grouped into
        paragraphs, a paragraph ending at every separator line
        ``',,,;;;;;;;;;;'``. The paragraphs are then passed through
        ``split_tagdata``, ``sentence2regwords`` / ``labels2reglabels``,
        ``shuffle.shuffle_both`` and ``word2vec`` / ``label2vec`` before being
        cut into a leading (train) part and a trailing (test) part.

        Args:
            tagdata_filepaths: File names relative to the ``sr/seq``
                directory. Entries equal to ``'.DS_Store'`` are ignored. The
                list itself is not modified.
            rate: Fraction (0..1 inclusive) of the samples that goes into the
                training part.

        Returns:
            A ``(train_x, train_y, test_x, test_y)`` tuple.

        Raises:
            FileNotFoundError: If one of the listed files does not exist.
            SystemExit: With status 1 when ``rate`` is outside ``[0, 1]``.
        """
        # Function-scope import: ``sys.exit`` is used instead of the
        # site-provided ``exit`` helper, which only exists for interactive use.
        import sys

        logging.info("begin deal word tag data")
        if rate < 0 or rate > 1:
            logging.error("rate is not between 0 and 1")
            sys.exit(1)

        # Filter into a fresh list so the caller's argument is left untouched.
        filepaths = [name for name in tagdata_filepaths if name != '.DS_Store']

        paragraphs = []
        for tagdata_filepath in filepaths:
            full_path = config.TAG_DIC + '/sr/seq/' + tagdata_filepath
            if not os.path.exists(full_path):
                logging.warning(
                    "tag data file {} is not exist".format(tagdata_filepath))
                raise FileNotFoundError(
                    '{} 标注数据文件不存在'.format(tagdata_filepath))

            current = []
            for index, line in enumerate(bigfile.get_lines(full_path)):
                if index == 0:
                    # The first line of every file is skipped
                    # (presumably a header row - TODO confirm).
                    continue
                if line != ',,,;;;;;;;;;;\n':
                    current.append(line.replace('\n', ''))
                else:
                    paragraphs.append(current)
                    current = []
            # NOTE(review): lines following the last separator are discarded,
            # as before - confirm every file ends with a separator line.

        words_list, labels_list = self.split_tagdata(paragraphs)
        # Drop intermediate results as soon as they are no longer needed so
        # they can be reclaimed before the next stage allocates its output.
        del paragraphs

        regwords_list = self.sentence2regwords(words_list)
        reglabels_list = self.labels2reglabels(labels_list)
        del words_list, labels_list

        regwords_list, reglabels_list = shuffle.shuffle_both(
            regwords_list, reglabels_list)

        wordvecs_list = self.word2vec(regwords_list)
        labelvecs_list = self.label2vec(reglabels_list)
        del regwords_list, reglabels_list

        # Split into train / test parts at a single, shared index.
        split_at = int(len(labelvecs_list) * rate)

        train_x = wordvecs_list[:split_at]
        train_y = labelvecs_list[:split_at]
        test_x = wordvecs_list[split_at:]
        test_y = labelvecs_list[split_at:]

        logging.info("deal word tag data end")
        return train_x, train_y, test_x, test_y
示例#2
0
    def get_testdata(self):
        """Load the test data, shuffle it and return it.

        Returns:
            The ``(wtest_x, wtest_y)`` pair obtained from
            ``self.load_testdata()`` after being passed through
            ``shuffle.shuffle_both``.

        Raises:
            SystemExit: With status 1 when the resulting ``wtest_x`` is empty.
        """
        # Function-scope import: ``sys.exit`` is used instead of the
        # site-provided ``exit`` helper, which only exists for interactive use.
        import sys

        wtest_x, wtest_y = self.load_testdata()

        # Shuffle inputs and labels together.
        wtest_x, wtest_y = shuffle.shuffle_both(wtest_x, wtest_y)

        if len(wtest_x) == 0:
            # A length can never be negative; report the real condition.
            logging.error("test data is empty")
            sys.exit(1)
        return wtest_x, wtest_y
示例#3
0
    def get_traindata(self):
        """Load the training data, shuffle it and return it.

        Returns:
            The ``(strain_x, strain_y)`` pair obtained from
            ``self.load_traindata()`` after being passed through
            ``shuffle.shuffle_both``.

        Raises:
            SystemExit: With status 1 when the resulting ``strain_x`` is empty.
        """
        # Function-scope import: ``sys.exit`` is used instead of the
        # site-provided ``exit`` helper, which only exists for interactive use.
        import sys

        strain_x, strain_y = self.load_traindata()

        # Shuffle inputs and labels together.
        strain_x, strain_y = shuffle.shuffle_both(strain_x, strain_y)

        if len(strain_x) == 0:
            # A length can never be negative; report the real condition.
            logging.error("train data is empty")
            sys.exit(1)
        return strain_x, strain_y