def _get_data(self):
    datas = list()
    tagdata_filenames = os.listdir(config.TAG_WR_DIC)
    # note: the original checked for ".D_Store"; the macOS junk file is
    # actually named ".DS_Store", so the filter is fixed here
    tagdata_filepaths = [
        config.TAG_WR_DIC + "/" + tagdata_filename
        for tagdata_filename in tagdata_filenames
        if tagdata_filename != ".DS_Store"
    ]
    for tagdata_filepath in tagdata_filepaths:
        if os.path.exists(tagdata_filepath):
            para = []
            num = 0
            for line in bigfile.get_lines(tagdata_filepath):
                if num == 0:
                    # skip the header line of each tag data file
                    num = 1
                    continue
                if line != ',,,;;;;;;;;;;\n':
                    para.append(line.replace('\n', ''))
                else:
                    # the separator row marks the end of a paragraph
                    datas.append(para)
                    para = []
        else:
            logging.warning(
                "tag data file {} does not exist".format(tagdata_filepath))
            raise FileNotFoundError(
                "tag data file {} does not exist".format(tagdata_filepath))
    return datas
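# `bigfile.get_lines` (imported as tool.bigfile elsewhere in this project) is
# used throughout this section but not shown. A minimal sketch of such a
# helper, assuming it lazily streams a large text file line by line; the real
# implementation may differ:
def get_lines(filepath: str, encoding: str = "utf-8"):
    """Yield lines from a large file one at a time, trailing newline
    included, so callers can compare lines against full separator rows."""
    with open(filepath, "r", encoding=encoding) as f:
        for line in f:
            yield line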
def deal_tagdata(self, tagdata_filepaths: list,
                 rate: float = config.WR_RATE):
    logging.info("begin dealing with word tag data")
    if rate < 0 or rate > 1:
        logging.error("rate is not between 0 and 1")
        exit(1)
    datas = list()
    if '.DS_Store' in tagdata_filepaths:
        tagdata_filepaths.remove('.DS_Store')
    for tagdata_filepath in tagdata_filepaths:
        if os.path.exists(config.TAG_DIC + '/sr/seq/' + tagdata_filepath):
            para = []
            num = 0
            for line in bigfile.get_lines(config.TAG_DIC + '/sr/seq/' +
                                          tagdata_filepath):
                if num == 0:
                    # skip the header line
                    num = 1
                    continue
                if line != ',,,;;;;;;;;;;\n':
                    para.append(line.replace('\n', ''))
                else:
                    datas.append(para)
                    para = []
        else:
            logging.warning(
                "tag data file {} does not exist".format(tagdata_filepath))
            raise FileNotFoundError(
                "tag data file {} does not exist".format(tagdata_filepath))
    words_list, labels_list = self.split_tagdata(datas)
    datas = None
    regwords_list = self.sentence2regwords(words_list)
    reglabels_list = self.labels2reglabels(labels_list)
    words_list = None
    labels_list = None
    regwords_list, reglabels_list = shuffle.shuffle_both(
        regwords_list, reglabels_list)
    wordvecs_list = self.word2vec(regwords_list)
    labelvecs_list = self.label2vec(reglabels_list)
    regwords_list = None
    reglabels_list = None
    # split the vectorized data into training and test sets
    total_size = len(labelvecs_list)
    train_x = wordvecs_list[:int(total_size * rate)]
    train_y = labelvecs_list[:int(total_size * rate)]
    test_x = wordvecs_list[int(total_size * rate):]
    test_y = labelvecs_list[int(total_size * rate):]
    wordvecs_list = None
    labelvecs_list = None
    logging.info("dealing with word tag data ended")
    return train_x, train_y, test_x, test_y
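# `shuffle.shuffle_both` is not defined in this section; from its call site it
# must shuffle the word list and the label list with one shared permutation so
# each sample stays aligned with its label. A minimal sketch under that
# assumption (the project's real tool may differ):
import random


def shuffle_both(xs: list, ys: list):
    """Shuffle two parallel lists with the same permutation."""
    assert len(xs) == len(ys), "parallel lists must be the same length"
    paired = list(zip(xs, ys))
    random.shuffle(paired)
    if not paired:
        return [], []
    xs_shuffled, ys_shuffled = map(list, zip(*paired))
    return xs_shuffled, ys_shuffled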
def othersplit2word(self, filepath: str):
    logging.info("other corpus split to word")
    if os.path.exists(filepath):
        with open(config.PREDATA_DIC + "/" + filepath.split("/")[-1],
                  "w", encoding="utf-8") as write_file:
            for line in bigfile.get_lines(filepath):
                if line:
                    try:
                        write_file.write(" ".join(jieba_holder.lcut(line)))
                    except KeyboardInterrupt:
                        # KeyboardInterrupt is not a subclass of Exception,
                        # so the original isinstance check inside
                        # `except Exception` could never fire; catch it
                        # explicitly instead
                        exit(1)
                    except Exception:
                        logging.warning("error line:{}".format(line))
    else:
        logging.error("file {} does not exist".format(filepath))
def wikisplit2word(self):
    logging.info("wiki corpus split to word")
    if os.path.exists(config.CORPUS_DIC + "/wiki_chs"):
        with open(config.PREDATA_DIC + "/totalpart.txt",
                  "w", encoding="utf-8") as write_file:
            for line in bigfile.get_lines(config.CORPUS_DIC + "/wiki_chs"):
                if line:
                    try:
                        write_file.write(" ".join(jieba.lcut(line)))
                    except KeyboardInterrupt:
                        # as above: KeyboardInterrupt must be caught
                        # explicitly, not via `except Exception`
                        exit(1)
                    except Exception:
                        logging.warning("error line:{}".format(line))
    else:
        logging.error(
            "file {} does not exist".format(config.CORPUS_DIC + "/wiki_chs"))
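# Both splitters rely on jieba to segment Chinese text before writing the
# space-joined result, which is the line format word2vec-style training
# expects. A quick standalone illustration (the sample sentence is arbitrary,
# and the exact segmentation may vary with the jieba version and dictionary):
import jieba

tokens = jieba.lcut("自然语言处理很有趣")
print(" ".join(tokens))  # e.g. 自然语言 处理 很 有趣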
def deal_tagdata(self, tagdata_filepaths: list,
                 rate: float = config.WR_RATE):
    logging.info("begin dealing with word tag data")
    if rate < 0 or rate > 1:
        logging.error("rate is not between 0 and 1")
        exit(1)
    datas = list()
    for tagdata_filepath in tagdata_filepaths:
        if os.path.exists(tagdata_filepath):
            for line in bigfile.get_lines(tagdata_filepath):
                datas.append(line)
        else:
            logging.warning(
                "tag data file {} does not exist".format(tagdata_filepath))
    words_list, labels_list = self._split_tagdata(datas)
    datas = None
    regwords_list = self.words2regwords(words_list)
    reglabels_list = self.labels2reglabels(labels_list)
    words_list = None
    labels_list = None
    # regwords_list, reglabels_list = shuffle.shuffle_both(regwords_list, reglabels_list)
    wordvecs_list = self.word2vec(regwords_list)
    labelvecs_list = self.label2vec(reglabels_list)
    regwords_list = None
    reglabels_list = None
    # split the vectorized data into training and test sets
    total_size = len(wordvecs_list)
    train_x = wordvecs_list[:int(total_size * rate)]
    train_y = labelvecs_list[:int(total_size * rate)]
    test_x = wordvecs_list[int(total_size * rate):]
    test_y = labelvecs_list[int(total_size * rate):]
    wordvecs_list = None
    labelvecs_list = None
    logging.info("dealing with word tag data ended")
    return train_x, train_y, test_x, test_y
def deal_tagdata(self, tagdata_filepaths: list,
                 rate: float = config.SR_RATE):
    logging.info("begin dealing with sentence tag data")
    if rate < 0 or rate > 1:
        logging.error("rate is not between 0 and 1")
        exit(1)
    datas = list()
    for tagdata_filepath in tagdata_filepaths:
        if os.path.exists(tagdata_filepath):
            for line in bigfile.get_lines(tagdata_filepath):
                datas.append(line)
        else:
            logging.warning(
                "tag data file {} does not exist".format(tagdata_filepath))
    # random.shuffle(datas)
    sentences, labels = self._split_tagdata(datas)
    datas = None
    regwords_list = self.sentence2regwords(sentences)
    sentences = None
    sentencevecs = self.words2vec(regwords_list)
    labelvecs = self.label2vec(labels)
    regwords_list = None
    labels = None
    # split the vectorized data into training and test sets
    total_size = len(sentencevecs)
    train_x = sentencevecs[:int(total_size * rate)]
    train_y = labelvecs[:int(total_size * rate)]
    test_x = sentencevecs[int(total_size * rate):]
    test_y = labelvecs[int(total_size * rate):]
    sentencevecs = None
    labelvecs = None
    logging.info("dealing with sentence tag data ended")
    return train_x, train_y, test_x, test_y
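# All three deal_tagdata variants split the vectorized data with the same
# slicing pattern: the first `rate` fraction becomes the training set and the
# remainder the test set. A tiny standalone illustration with made-up data:
vecs = list(range(10))
labels = [v % 2 for v in vecs]
rate = 0.8

split = int(len(vecs) * rate)
train_x, test_x = vecs[:split], vecs[split:]
train_y, test_y = labels[:split], labels[split:]
print(len(train_x), len(test_x))  # 8 2
# Note: in the variants where shuffling is commented out, the test set is
# simply the tail of the file order rather than a random sample.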
            # print(word['item'])
            # word_list.append(word['item'])
            word_list.extend(jieba.lcut(sentence, HMM=True))
    word_list = list(set(word_list))
    return word_list


if __name__ == '__main__':
    datas = []
    file_name_list = os.listdir(config.TAG_DIC + '/sr/seq/')
    # note: the original checked for '.DS_store'; os.listdir reports the
    # file as '.DS_Store', so the capitalization is fixed here
    if '.DS_Store' in file_name_list:
        file_name_list.remove('.DS_Store')
    for file_name in file_name_list:
        if os.path.exists(config.TAG_DIC + '/sr/seq/' + file_name):
            num = 0
            for line in bigfile.get_lines(config.TAG_DIC + '/sr/seq/' + file_name):
                if num == 0:
                    # skip the header line
                    num = 1
                    continue
                if line != ',,,;;;;;;;;;;\n':
                    datas.append(line.replace('\n', ''))
        else:
            raise FileNotFoundError(
                "tag data file {} does not exist".format(file_name))
    words = split_sentence(datas)
    with open('news_word.txt', 'a', encoding='UTF-8-sig') as wf1:
        for word in words:
            wf1.write(word + ' ')
    print("write over")
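# The parsing loops above imply a shared tag-file layout: one header line
# (skipped via the num == 0 check), data rows, and a literal separator row
# ',,,;;;;;;;;;;' closing each paragraph. Schematically (row contents are
# placeholders, not taken from the real files):
#
#   <header line>      skipped
#   <data row>
#   <data row>
#   ,,,;;;;;;;;;;      separator: ends one paragraph
#   <data row>
#   ...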
__all__ = ["wordpre"]

import os

import config
import tool.bigfile as bigfile
import jieba

if __name__ == '__main__':
    tagdata_filepaths = os.listdir(config.PRE_DIC)
    datas = []
    para = []
    if '.DS_Store' in tagdata_filepaths:
        tagdata_filepaths.remove('.DS_Store')
    for tagdata_filepath in tagdata_filepaths:
        if os.path.exists(config.PRE_DIC + '/' + tagdata_filepath):
            para = []
            num = 0
            for line in bigfile.get_lines(config.PRE_DIC + '/' + tagdata_filepath):
                if num == 0:
                    # skip the header line
                    num = 1
                    continue
                if line != ',,,;;;;;;;;;;,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n' \
                        and line != ',,,;;;;;;;;;;,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n':
                    # keep only the first field of each row
                    spline = line.replace('\n', '').split(',,,,')[0]
                    # para.append(line.replace('\n', ''))
                    para.append(spline)
                else:
                    pass
            print(para)
            for i in para:
                datas.extend(jieba.lcut(i))
    # deduplicate the collected words (order is not preserved)
    datas = list(set(datas))
    print(datas)
    with open('news_word.txt', 'w') as wf:
        # the original file is truncated here; writing the deduplicated
        # vocabulary space-separated mirrors the sibling script above
        for word in datas:
            wf.write(word + ' ')
        count_len += 1
    print(cal / len(investor_labels_list))
    print(test / test1)
    print(test)
    print(test1)


if __name__ == '__main__':
    datas = list()
    if os.path.exists('preditor.csv'):
        para = []
        num = 0
        flag = 0
        for line in bigfile.get_lines('preditor.csv'):
            if num == 0:
                # skip the header line
                num = 1
                continue
            if line != ',,,,,,;;;;;;;;;;\n':
                para.append(line.replace('\n', ''))
            else:
                # flag += 1
                datas.append(para)
                para = []
                # note: with the increment above commented out,
                # this break never triggers
                if flag == 2:
                    break
    investor_labels_list, \
        investee_labels_list, \
        mount_labels_list, \
        pre_investor_labels_list, \