def deal_tagdata(self, tagdata_filepaths: list, rate: float = config.WR_RATE):
    """Load tagged sequence files and split them into train/test partitions.

    Every file is read from ``config.TAG_DIC + '/sr/seq/'``. The first line
    of each file is skipped (presumably a header -- confirm against the data
    format), and the remaining lines are grouped into paragraphs, each closed
    by the separator line ``,,,;;;;;;;;;;``. Lines that follow the last
    separator of a file are discarded.

    The paragraphs are then passed through ``split_tagdata``,
    ``sentence2regwords`` / ``labels2reglabels``, a joint shuffle via
    ``shuffle.shuffle_both`` and finally ``word2vec`` / ``label2vec``. The
    resulting sequences are cut at ``int(total * rate)``.

    Args:
        tagdata_filepaths: File names relative to the ``sr/seq`` directory.
            Entries equal to ``'.DS_Store'`` are ignored. The caller's list
            is left untouched.
        rate: Fraction of the samples that goes into the training partition.
            Must lie within ``[0, 1]``.

    Returns:
        The tuple ``(train_x, train_y, test_x, test_y)``.

    Raises:
        FileNotFoundError: If one of the listed files does not exist.
        SystemExit: With status 1 if ``rate`` is outside ``[0, 1]``.
    """
    # Local import: the bare ``exit`` helper is injected by the ``site``
    # module for interactive use and is not guaranteed to be defined.
    import sys

    logging.info("begin deal word tag data")

    # The chained comparison also rejects NaN, which would otherwise only
    # fail much later inside ``int(total_size * rate)``.
    if not 0 <= rate <= 1:
        logging.error("rate is not between 0 and 1")
        sys.exit(1)

    # Filter into a new list instead of calling ``remove`` on the argument,
    # so the caller's list is not modified as a side effect.
    filenames = [name for name in tagdata_filepaths if name != '.DS_Store']

    separator = ',,,;;;;;;;;;;'
    seq_dir = config.TAG_DIC + '/sr/seq/'

    datas = []
    for tagdata_filepath in filenames:
        full_path = seq_dir + tagdata_filepath
        if not os.path.exists(full_path):
            logging.warning(
                "tag data file {} is not exist".format(tagdata_filepath))
            raise FileNotFoundError(
                '{} 标注数据文件不存在'.format(tagdata_filepath))

        para = []
        for index, line in enumerate(bigfile.get_lines(full_path)):
            if index == 0:
                continue  # skip the first line of every file
            content = line.replace('\n', '')
            # Compare after dropping the newline so that a final separator
            # without a trailing newline still closes its paragraph.
            if content == separator:
                datas.append(para)
                para = []
            else:
                para.append(content)

    words_list, labels_list = self.split_tagdata(datas)
    del datas  # drop intermediates early to keep peak memory down

    regwords_list = self.sentence2regwords(words_list)
    reglabels_list = self.labels2reglabels(labels_list)
    del words_list, labels_list

    regwords_list, reglabels_list = shuffle.shuffle_both(
        regwords_list, reglabels_list)

    wordvecs_list = self.word2vec(regwords_list)
    labelvecs_list = self.label2vec(reglabels_list)
    del regwords_list, reglabels_list

    # Split the vectorised data into the train and test partitions.
    split_at = int(len(labelvecs_list) * rate)
    train_x, test_x = wordvecs_list[:split_at], wordvecs_list[split_at:]
    train_y, test_y = labelvecs_list[:split_at], labelvecs_list[split_at:]
    del wordvecs_list, labelvecs_list

    logging.info("deal word tag data end")
    return train_x, train_y, test_x, test_y
def get_testdata(self):
    """Return the test set from ``load_testdata`` after a joint shuffle.

    Returns:
        The pair ``(wtest_x, wtest_y)`` as returned by
        ``shuffle.shuffle_both``.

    Raises:
        SystemExit: With status 1 when the shuffled test inputs are empty.
    """
    # Local import: the bare ``exit`` helper is injected by the ``site``
    # module for interactive use and is not guaranteed to be defined.
    import sys

    wtest_x, wtest_y = self.load_testdata()
    # Shuffle inputs and labels together.
    wtest_x, wtest_y = shuffle.shuffle_both(wtest_x, wtest_y)

    # ``len`` is kept (rather than a truthiness test) so that containers
    # without an unambiguous truth value keep working.
    if len(wtest_x) == 0:
        logging.error("test data is empty")
        sys.exit(1)
    return wtest_x, wtest_y
def get_traindata(self):
    """Return the training set from ``load_traindata`` after a joint shuffle.

    Returns:
        The pair ``(strain_x, strain_y)`` as returned by
        ``shuffle.shuffle_both``.

    Raises:
        SystemExit: With status 1 when the shuffled training inputs are
            empty.
    """
    # Local import: the bare ``exit`` helper is injected by the ``site``
    # module for interactive use and is not guaranteed to be defined.
    import sys

    strain_x, strain_y = self.load_traindata()
    # Shuffle inputs and labels together.
    strain_x, strain_y = shuffle.shuffle_both(strain_x, strain_y)

    # ``len`` is kept (rather than a truthiness test) so that containers
    # without an unambiguous truth value keep working.
    if len(strain_x) == 0:
        logging.error("train data is empty")
        sys.exit(1)
    return strain_x, strain_y