예제 #1
0
 def _get_data(self):
     """Load every tag-data file found in ``config.TAG_WR_DIC`` as paragraphs.

     The first line of each file is skipped. The remaining lines are
     collected into a paragraph until a line consisting solely of
     ``,,,;;;;;;;;;;`` is met; that line closes the paragraph and is not
     stored. Lines that follow the last separator of a file are not
     returned.

     Returns:
         list: one list of newline-stripped lines per closed paragraph,
         across all files.

     Raises:
         FileNotFoundError: if a listed file does not exist when it is
             about to be read.
     """
     separator = ',,,;;;;;;;;;;\n'
     datas = []
     for tagdata_filename in os.listdir(config.TAG_WR_DIC):
         # macOS drops a ".DS_Store" metadata file into directories; it is
         # not tag data. (The previous check was misspelled ".D_Store" and
         # therefore never matched.)
         if tagdata_filename == ".DS_Store":
             continue
         tagdata_filepath = config.TAG_WR_DIC + "/" + tagdata_filename
         if not os.path.exists(tagdata_filepath):
             logging.warning(
                 "tag data file {} is not exist".format(tagdata_filepath))
             raise FileNotFoundError(
                 '{} 标注数据文件不存在'.format(tagdata_filepath))
         para = []
         for index, line in enumerate(bigfile.get_lines(tagdata_filepath)):
             if index == 0:
                 # The first line of every file is skipped.
                 continue
             if line != separator:
                 para.append(line.replace('\n', ''))
             else:
                 datas.append(para)
                 para = []
     return datas
예제 #2
0
    def deal_tagdata(self,
                     tagdata_filepaths: list,
                     rate: float = config.WR_RATE):
        """Read tag files, vectorise them and split into train / test sets.

        Each entry of ``tagdata_filepaths`` is a file name that is looked up
        under ``config.TAG_DIC + '/sr/seq/'``. The first line of every file
        is skipped; the remaining lines are grouped into paragraphs that are
        closed by a line consisting solely of ``,,,;;;;;;;;;;``. Lines after
        the last separator of a file are not included.

        The paragraphs are then passed through ``split_tagdata``,
        ``sentence2regwords`` / ``labels2reglabels``, shuffled together with
        ``shuffle.shuffle_both`` and converted by ``word2vec`` /
        ``label2vec``.

        Args:
            tagdata_filepaths: file names to load. An entry equal to
                ``'.DS_Store'`` is ignored. The list is not modified.
            rate: fraction of the samples placed in the training split;
                must lie in ``[0, 1]``.

        Returns:
            tuple: ``(train_x, train_y, test_x, test_y)`` where the training
            slices hold the first ``int(total * rate)`` samples and the test
            slices hold the rest.

        Raises:
            SystemExit: via ``exit(1)`` when ``rate`` is outside ``[0, 1]``.
            FileNotFoundError: if a listed file does not exist.
        """
        logging.info("begin deal word tag data")
        if rate < 0 or rate > 1:
            logging.error("rate is not between 0 and 1")
            exit(1)

        separator = ',,,;;;;;;;;;;\n'
        seq_dir = config.TAG_DIC + '/sr/seq/'
        datas = []
        for tagdata_filepath in tagdata_filepaths:
            # Skip the macOS metadata entry without mutating the caller's
            # list (the previous implementation called .remove() on it).
            if tagdata_filepath == '.DS_Store':
                continue
            full_path = seq_dir + tagdata_filepath
            if not os.path.exists(full_path):
                logging.warning(
                    "tag data file {} is not exist".format(tagdata_filepath))
                raise FileNotFoundError(
                    '{} 标注数据文件不存在'.format(tagdata_filepath))
            para = []
            for index, line in enumerate(bigfile.get_lines(full_path)):
                if index == 0:
                    # The first line of every file is skipped.
                    continue
                if line != separator:
                    para.append(line.replace('\n', ''))
                else:
                    datas.append(para)
                    para = []

        words_list, labels_list = self.split_tagdata(datas)
        # Drop references to intermediates as soon as they are consumed so
        # they can be garbage-collected.
        datas = None

        regwords_list = self.sentence2regwords(words_list)
        reglabels_list = self.labels2reglabels(labels_list)
        words_list = None
        labels_list = None

        regwords_list, reglabels_list = shuffle.shuffle_both(
            regwords_list, reglabels_list)

        wordvecs_list = self.word2vec(regwords_list)
        labelvecs_list = self.label2vec(reglabels_list)
        regwords_list = None
        reglabels_list = None

        # Split the vectorised samples into training and test partitions.
        total_size = len(labelvecs_list)
        split_at = int(total_size * rate)

        train_x = wordvecs_list[:split_at]
        train_y = labelvecs_list[:split_at]
        test_x = wordvecs_list[split_at:]
        test_y = labelvecs_list[split_at:]
        wordvecs_list = None
        labelvecs_list = None

        logging.info("deal word tag data end")
        return train_x, train_y, test_x, test_y
예제 #3
0
 def othersplit2word(self, filepath: str):
     """Segment a corpus file into words and write the result to disk.

     Every non-empty line of ``filepath`` is cut with
     ``jieba_holder.lcut`` and the resulting tokens are written, joined by
     single spaces, to a file of the same base name under
     ``config.PREDATA_DIC``. A line whose processing raises an
     ``Exception`` is logged and skipped.

     If ``filepath`` does not exist an error is logged and nothing is
     written.

     Args:
         filepath: path of the corpus file; the part after the last ``/``
             is used as the output file name.
     """
     logging.info("other corpus split to word")
     if not os.path.exists(filepath):
         logging.error("file {} is not exist".format(filepath))
         return

     output_path = config.PREDATA_DIC + "/" + filepath.split("/")[-1]
     with open(output_path, "w", encoding="utf-8") as write_file:
         for line in bigfile.get_lines(filepath):
             if not line:
                 continue
             try:
                 write_file.write(" ".join(jieba_holder.lcut(line)))
             except Exception:
                 # KeyboardInterrupt derives from BaseException, so it is
                 # never caught here and propagates on its own; the old
                 # isinstance check for it was unreachable.
                 logging.warning("error line:{}".format(line))
예제 #4
0
 def wikisplit2word(self):
     """Segment the wiki corpus into words and write ``totalpart.txt``.

     Reads ``config.CORPUS_DIC + "/wiki_chs"`` line by line, cuts every
     non-empty line with ``jieba.lcut`` and writes the tokens, joined by
     single spaces, to ``config.PREDATA_DIC + "/totalpart.txt"``. A line
     whose processing raises an ``Exception`` is logged and skipped.

     If the corpus file does not exist an error is logged and nothing is
     written.
     """
     logging.info("wiki corpus split to word")
     corpus_path = config.CORPUS_DIC + "/wiki_chs"
     if not os.path.exists(corpus_path):
         logging.error("file {} is not exist".format(corpus_path))
         return

     with open(config.PREDATA_DIC + "/totalpart.txt", "w", encoding="utf-8") as write_file:
         for line in bigfile.get_lines(corpus_path):
             if not line:
                 continue
             try:
                 write_file.write(" ".join(jieba.lcut(line)))
             except Exception:
                 # KeyboardInterrupt derives from BaseException, so it is
                 # never caught here and propagates on its own; the old
                 # isinstance check for it was unreachable.
                 logging.warning("error line:{}".format(line))
예제 #5
0
    def deal_tagdata(self,
                     tagdata_filepaths: list,
                     rate: float = config.WR_RATE):
        """Load word tag data, vectorise it and split it into train / test.

        All lines of every existing file in ``tagdata_filepaths`` are
        gathered in order; a path that does not exist is logged with a
        warning and skipped. The lines go through ``_split_tagdata``,
        ``words2regwords`` / ``labels2reglabels`` and finally ``word2vec`` /
        ``label2vec``. No shuffling is applied.

        Args:
            tagdata_filepaths: paths of the tag-data files to read.
            rate: fraction of the samples placed in the training split;
                must lie in ``[0, 1]``, otherwise the process exits with
                status 1.

        Returns:
            tuple: ``(train_x, train_y, test_x, test_y)``; the training
            slices hold the first ``int(total * rate)`` samples, the test
            slices the remainder.
        """
        logging.info("begin deal word tag data")
        if rate < 0 or rate > 1:
            logging.error("rate is not between 0 and 1")
            exit(1)

        raw_lines = []
        for path in tagdata_filepaths:
            if not os.path.exists(path):
                logging.warning(
                    "tag data file {} is not exist".format(path))
                continue
            raw_lines.extend(bigfile.get_lines(path))

        words_list, labels_list = self._split_tagdata(raw_lines)
        raw_lines = None  # release the raw lines once they are split

        regwords_list = self.words2regwords(words_list)
        reglabels_list = self.labels2reglabels(labels_list)
        words_list = labels_list = None

        wordvecs_list = self.word2vec(regwords_list)
        labelvecs_list = self.label2vec(reglabels_list)
        regwords_list = reglabels_list = None

        # Partition the vectorised samples: head for training, tail for test.
        boundary = int(len(wordvecs_list) * rate)

        train_x = wordvecs_list[:boundary]
        train_y = labelvecs_list[:boundary]
        test_x = wordvecs_list[boundary:]
        test_y = labelvecs_list[boundary:]
        wordvecs_list = labelvecs_list = None

        logging.info("deal word tag data end")
        return train_x, train_y, test_x, test_y
예제 #6
0
    def deal_tagdata(self, tagdata_filepaths: list, rate: float = config.SR_RATE):
        """Load sentence tag data, vectorise it and split it into train / test.

        All lines of every existing file in ``tagdata_filepaths`` are
        gathered in order; a path that does not exist is logged with a
        warning and skipped. The lines are split by ``_split_tagdata`` into
        sentences and labels, the sentences go through
        ``sentence2regwords`` and ``words2vec``, and the labels through
        ``label2vec``. No shuffling is applied.

        Args:
            tagdata_filepaths: paths of the tag-data files to read.
            rate: fraction of the samples placed in the training split;
                must lie in ``[0, 1]``, otherwise the process exits with
                status 1.

        Returns:
            tuple: ``(train_x, train_y, test_x, test_y)``; the training
            slices hold the first ``int(total * rate)`` samples, the test
            slices the remainder.
        """
        logging.info("begin deal sentence tag data")
        if rate < 0 or rate > 1:
            logging.error("rate is not between 0 and 1")
            exit(1)

        raw_lines = []
        for path in tagdata_filepaths:
            if not os.path.exists(path):
                logging.warning("tag data file {} is not exist".format(path))
                continue
            raw_lines.extend(bigfile.get_lines(path))

        sentences, labels = self._split_tagdata(raw_lines)
        raw_lines = None  # release the raw lines once they are split

        regwords_list = self.sentence2regwords(sentences)
        sentences = None

        sentencevecs = self.words2vec(regwords_list)
        labelvecs = self.label2vec(labels)
        regwords_list = labels = None

        # Partition the vectorised samples: head for training, tail for test.
        boundary = int(len(sentencevecs) * rate)

        train_x = sentencevecs[:boundary]
        train_y = labelvecs[:boundary]
        test_x = sentencevecs[boundary:]
        test_y = labelvecs[boundary:]
        sentencevecs = labelvecs = None

        logging.info("deal sentence tag data end")
        return train_x, train_y, test_x, test_y
예제 #7
0
        #         print(word['item'])
        #         word_list.append(word['item'])
        word_list.extend(jieba.lcut(sentence, HMM=True))
    word_list = list(set(word_list))
    return word_list


if __name__ == '__main__':
    # Script entry point: collect every data line from the files under
    # config.TAG_DIC + '/sr/seq/', pass them to split_sentence() and append
    # the resulting words, space separated, to news_word.txt.
    seq_dir = config.TAG_DIC + '/sr/seq/'
    datas = []
    for file_name in os.listdir(seq_dir):
        # macOS metadata file, not tag data. The previous check used the
        # wrong case ('.DS_store') and therefore never matched the name
        # returned by os.listdir().
        if file_name == '.DS_Store':
            continue
        file_path = seq_dir + file_name
        if not os.path.exists(file_path):
            raise FileNotFoundError('{} 标注数据文件不存在'.format(file_name))
        for index, line in enumerate(bigfile.get_lines(file_path)):
            if index == 0:
                # The first line of every file is skipped.
                continue
            # Separator lines carry no text and are dropped.
            if line != ',,,;;;;;;;;;;\n':
                datas.append(line.replace('\n', ''))
    words = split_sentence(datas)
    with open('news_word.txt', 'a', encoding='UTF-8-sig') as wf1:
        for word in words:
            wf1.write(word + ' ')
    print("write over")
예제 #8
0
__all__ = ["wordpre"]
import os
import config
import tool.bigfile as bigfile
import jieba
if __name__ == '__main__':
    tagdata_filepaths = os.listdir(config.PRE_DIC)
    datas = []
    para = []
    if '.DS_Store' in tagdata_filepaths:
        tagdata_filepaths.remove('.DS_Store')
    for tagdata_filepath in tagdata_filepaths:
        if os.path.exists(config.PRE_DIC + '/' + tagdata_filepath):
            para = []
            num = 0
            for line in bigfile.get_lines(config.PRE_DIC + '/' + tagdata_filepath):
                if num == 0:
                    num = 1
                    continue
                if line != ',,,;;;;;;;;;;,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n' and line != ',,,;;;;;;;;;;,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n':
                    spline = line.replace('\n', '').split(',,,,')[0]
                    # para.append(line.replace('\n', ''))
                    para.append(spline)
                else:
                    pass
    print(para)
    for i in para:
        datas.extend(jieba.lcut(i))
    datas = list(set(datas))
    print(datas)
    with open('news_word.txt', 'w') as wf:
예제 #9
0
            count_len += 1

    print(cal / len(investor_labels_list))
    print(test / test1)
    print(test)
    print(test1)


if __name__ == '__main__':
    datas = list()

    if os.path.exists('preditor.csv'):
        para = []
        num = 0
        flag = 0
        for line in bigfile.get_lines('preditor.csv'):
            if num == 0:
                num = 1
                continue
            if line != ',,,,,,;;;;;;;;;;\n':
                para.append(line.replace('\n', ''))
            else:
                # flag += 1
                datas.append(para)
                para = []
            if flag == 2:
                break
    investor_labels_list, \
    investee_labels_list, \
    mount_labels_list, \
    pre_investor_labels_list, \