Example No. 1
def wikisplit2word():
    """Segment the Chinese Wikipedia dump with jieba and append it to totalpart.txt."""
    if os.path.exists(config.CORPUS_DIC + '/wiki_chs'):
        with open(config.PREDATA_DIC + '/totalpart.txt', 'a',
                  encoding='utf-8') as write_file:
            print('Starting word segmentation')
            for line in __bigfile.get_lines(config.CORPUS_DIC + '/wiki_chs'):
                if line:
                    write_file.write(' '.join(jieba.lcut(line)))
            print('Word segmentation finished')
    else:
        raise FileNotFoundError(
            '{} does not exist'.format(config.CORPUS_DIC + '/wiki_chs'))
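A minimal usage sketch for the function above, assuming it lives in an importable preprocessing module (the module name `preprocess` is an assumption, not part of the original source):

# Hypothetical usage; `preprocess` is an assumed module name.
import preprocess

try:
    preprocess.wikisplit2word()  # appends segmented text to PREDATA_DIC/totalpart.txt
except FileNotFoundError as err:
    print(err)  # CORPUS_DIC/wiki_chs is missing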
Example No. 2
def othersplit2word(filepath: str):
    """Segment an arbitrary corpus file with jieba and append it under PREDATA_DIC."""
    if os.path.exists(filepath):
        with open(config.PREDATA_DIC + '/' + filepath.split('/')[-1],
                  'a',
                  encoding='utf-8') as write_file:
            print('Starting word segmentation')
            for line in __bigfile.get_lines(filepath):
                if line:
                    write_file.write(' '.join(jieba.lcut(line)))
            print('Word segmentation finished')
    else:
        raise FileNotFoundError('{} does not exist'.format(filepath))
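The same pattern applied to an arbitrary corpus file; a hedged sketch where the input path is purely illustrative:

# Hypothetical usage; the path below is an example, not from the source.
import preprocess

preprocess.othersplit2word('/data/corpus/extra_text.txt')
# The segmented output is appended to PREDATA_DIC/extra_text.txt.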
Example No. 3
def deal_tagdata(tagdata_filepaths: list, rate: float = config.SR_RATE):
    """Vectorize tagged data files and save train/test splits as .npy files."""
    datas = []
    for tagdata_filepath in tagdata_filepaths:
        if os.path.exists(tagdata_filepath):
            for line in __bigfile.get_lines(tagdata_filepath):
                datas.append(line)
        else:
            raise FileNotFoundError(
                'tagged data file {} does not exist'.format(tagdata_filepath))

    random.shuffle(datas)  # shuffle the data

    sentences, labels = __split_tagdata(datas)

    datas.clear()

    words_list = __tagsentence2regwords(sentences)

    sentences.clear()

    sentencevec_list, labelvec_list = __data2vec(words_list, labels)

    words_list.clear()
    labels.clear()

    # save the splits to disk
    total_size = len(labelvec_list)
    split_index = int(total_size * rate)

    train_x = sentencevec_list[:split_index]
    train_y = labelvec_list[:split_index]
    test_x = sentencevec_list[split_index:]
    test_y = labelvec_list[split_index:]

    sentencevec_list.clear()
    labelvec_list.clear()

    if rate == 1.0:
        # special case: everything goes into the training set
        if len(train_x) > 0:
            np.save(config.PREDATA_DIC + '/strain_x.npy', np.array(train_x))
            np.save(config.PREDATA_DIC + '/strain_y.npy', np.array(train_y))
        else:
            raise ValueError('rate is 1.0 but the data is empty')

    elif rate == 0.0:
        # special case: everything goes into the test set
        if len(test_x) > 0:
            np.save(config.PREDATA_DIC + '/stest_x.npy', np.array(test_x))
            np.save(config.PREDATA_DIC + '/stest_y.npy', np.array(test_y))
        else:
            raise ValueError('rate is 0.0 but the data is empty')

    elif 0.0 < rate < 1.0:
        train_size = len(train_x)
        test_size = len(test_x)

        if train_size <= 0 or test_size <= 0:
            raise ValueError('train or test split is empty')

        # normal case: save both train and test splits
        np.save(config.PREDATA_DIC + '/strain_x.npy', np.array(train_x))
        np.save(config.PREDATA_DIC + '/strain_y.npy', np.array(train_y))
        np.save(config.PREDATA_DIC + '/stest_x.npy', np.array(test_x))
        np.save(config.PREDATA_DIC + '/stest_y.npy', np.array(test_y))

    else:
        raise ValueError(
            'rate out of range; rate must be between 0.0 and 1.0, got {}'.format(rate))
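A usage sketch for the split logic above, again assuming a hypothetical `preprocess` module: with rate=0.8 the first 80% of the shuffled data is saved as strain_x.npy/strain_y.npy and the remaining 20% as stest_x.npy/stest_y.npy, while rate=1.0 or rate=0.0 saves only the training or only the test pair.

# Hypothetical usage; the file names and the 0.8 rate are illustrative.
import preprocess

preprocess.deal_tagdata(
    ['/data/tags/part1.txt', '/data/tags/part2.txt'],
    rate=0.8,
)
# PREDATA_DIC now holds strain_x.npy, strain_y.npy, stest_x.npy, stest_y.npy.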
Example No. 4
def __get_sentences_generator(filepath: str):
    """Yield the sentence list of one resume at a time, so the file is never fully loaded."""
    for resume in __bigfile.get_lines(filepath):
        yield __splitsentence.resume2sentences(resume)
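Because the leading double underscore marks the generator as module-private by convention, it would be consumed inside the same module; a sketch with an assumed file path:

# Hypothetical in-module usage: resumes are processed one at a time,
# so the whole file never has to sit in memory.
for sentences in __get_sentences_generator('/data/resumes.txt'):
    for sentence in sentences:
        print(sentence)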
Example No. 5
def __get_inputs_generator(filepath: str):
    """Yield model-ready sentence vectors for one resume at a time."""
    for resume in __bigfile.get_lines(filepath):
        sentences = __splitsentence.resume2sentences(resume)
        words_list = srpre.sentence2regwords(sentences)
        yield srpre.sentence2vec(words_list)
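A sketch of feeding the vectorized output to a model from inside the same module; `model` and its `predict` call are assumptions, not part of the original source:

# Hypothetical in-module usage: one resume's sentence vectors per iteration.
for input_vecs in __get_inputs_generator('/data/resumes.txt'):
    predictions = model.predict(np.array(input_vecs))  # `model` is assumed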