Example #1
def generate_training_data(data_train_file, output_file, word2idx):
    ''' Generate tokenized data.
    Args:
        data_train_file: training-set file
        output_file: file to write the tokenized data to
        word2idx: word-to-index mapping used for tokenization
    '''
    data_train = pd.read_pickle(data_train_file)
    x_train_txt0 = data_train.txt_split.apply(split_word)
    X_train_txt, _ = make_deepLearn_data(x_train_txt0, word2idx)

    x_train_title0 = data_train.title_split.apply(split_word)
    X_train_title, _ = make_deepLearn_data(x_train_title0, word2idx)

    x_entity = data_train.entity.apply(split_word_entity).apply(
        generate_token, args=(word2idx, ))

    y_train = data_train.negative.values

    train_data = dict(
        zip(['txt', 'title', 'entity', 'y_train'],
            [X_train_txt, X_train_title, x_entity.values, y_train]))
    with open(output_file, 'wb') as f:
        pickle.dump(train_data, f)

    shape_dic = {
        'txt_shape': X_train_txt.shape[1],
        'title_shape': X_train_title.shape[1],
    }
    return shape_dic
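
A hedged usage sketch (the file names here are hypothetical, and word2idx is assumed to come from a vocabulary-building step such as the one in Example #3):

# Hypothetical call: tokenize the training set and keep the padded shapes
# so the test set can later be padded to the same lengths.
shape_dic = generate_training_data('data_train.pkl', 'train_tokens.pkl', word2idx)
print(shape_dic)  # {'txt_shape': ..., 'title_shape': ...}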
Example #2
def generate_test_data(data_test_file, output_file, shape_dic, word2idx):
    ''' Generate tokenized data.
    Args:
        data_test_file: test-set file
        output_file: file to write the tokenized data to
        shape_dic: txt/title shapes returned by generate_training_data
        word2idx: word-to-index mapping used for tokenization
    '''
    data_test = pd.read_pickle(data_test_file)
    x_test_txt0 = data_test.txt_split.apply(split_word)
    X_test_txt, _ = make_deepLearn_data(x_test_txt0, word2idx)

    x_test_title0 = data_test.title_split.apply(split_word)
    X_test_title, _ = make_deepLearn_data(x_test_title0, word2idx)

    x_entity = data_test.entity.apply(split_word_entity).apply(
        generate_token, args=(word2idx, ))

    # Keep the test-set padding length consistent with the training set
    if shape_dic['txt_shape'] > X_test_txt.shape[1]:
        X_test_txt = pad_sequences(X_test_txt,
                                   shape_dic['txt_shape'],
                                   padding='post')
    else:
        X_test_txt = X_test_txt[:, :shape_dic['txt_shape']]

    if shape_dic['title_shape'] > X_test_title.shape[1]:
        X_test_title = pad_sequences(X_test_title,
                                     shape_dic['title_shape'],
                                     padding='post')
    else:
        X_test_title = X_test_title[:, :shape_dic['title_shape']]

    # output file
    test_data = dict(
        zip(['txt', 'title', 'entity'],
            [X_test_txt, X_test_title, x_entity.values]))
    with open(output_file, 'wb') as f:
        pickle.dump(test_data, f)
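
With a shape_dic in hand from Example #1, the test set can be tokenized to matching dimensions; the file names below are hypothetical:

# Hypothetical call: reuse the training shapes so both sets pad to the same length.
generate_test_data('data_test.pkl', 'test_tokens.pkl', shape_dic, word2idx)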
Example #3
    # Read in all sentences, from both the test set and the training set
    with open('all_word_seg.txt', 'r', encoding='UTF-8') as f:
        sentences = f.readlines()
        sentences = [item[:-1].split(' ') for item in sentences]

    # Drop low-frequency entities
    entities_all_count = generate_count(entities_all, 'all_word_seg.txt')
    entities_all_count_keys = clear_entity(entities_all_count, limit_count=3)

    # Start indices at 1, reserving index 0 for UNK
    word2idx = dict(
        zip(entities_all_count_keys,
            range(1, len(entities_all_count_keys) + 1)))

    # Tokenize the sentences into a token matrix
    sentences_tokens, _ = make_deepLearn_data(sentences, word2idx)
    # Build the co-occurrence matrix
    cooccurrence_matrix = generate_co_occurrence(
        sentences_tokens,
        len(word2idx.keys()),
        window_size=5,
    )
    # Train GloVe and write out the result
    glove_model = GloVe(n=dim_n, max_iter=itter_n)
    embedMatrix = glove_model.fit(cooccurrence_matrix)

    print('load word2vec model...')
    model = KeyedVectors.load_word2vec_format('train_vec_byTencent_word.bin',
                                              binary=True)

    print('build embedding matrix...')
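
The listing stops right after the 'build embedding matrix...' message. A minimal sketch of the usual continuation, assuming the pretrained Tencent vectors are keyed by the same words as word2idx; the name w2v_matrix and the zero row for index 0 (UNK) are assumptions, not part of the original:

import numpy as np

# Hypothetical continuation: map each token index to its pretrained vector,
# leaving row 0 as zeros for UNK / out-of-vocabulary words.
w2v_matrix = np.zeros((len(word2idx) + 1, model.vector_size))
for word, idx in word2idx.items():
    if word in model:
        w2v_matrix[idx] = model[word]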