Example #1
def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    """
    通过gensim工具完成word2vec的训练,输入格式采用sentences,使用skip-gram,embedding维度256
    your code
    w2v = (one line)
    """
    # print(type(sentences))
    # print(sentences)
    sentence = LineSentence(sentence_path)
    w2v = Word2Vec(sentence, size = 256, sg = 1, min_count = 4, workers = 4)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
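A minimal sketch of how this build() might be invoked; all paths below are hypothetical placeholders for your own segmented data files:

build('data/train_x_seg.txt',      # hypothetical segmented training input
      'data/test_y_seg.txt',       # hypothetical segmented reference output
      'data/test_seg.txt',         # hypothetical segmented test set
      out_path='data/word2vec.pkl',
      sentence_path='data/sentences.txt',
      w2v_bin_path='data/w2v.bin',
      min_count=1)

Example #2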
def build(train_texts_path,
          train_questions_path,
          train_answers_path,
          test_texts_path,
          test_answers_path,
          out_path=None,
          sentence_path='',
          w2v_bin_path="w2v.bin",
          min_count=1):
    sentences = extract_sentence(train_texts_path, train_questions_path,
                                 train_answers_path, test_texts_path,
                                 test_answers_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    """
        通过gensim工具完成word2vec的训练,输入格式采用sentences,使用skip-gram,embedding维度256
    """
    w2v = Word2Vec(sg=1,
                   sentences=LineSentence(sentence_path),
                   size=256,
                   window=5,
                   min_count=min_count,
                   iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
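Note: all of these examples target the gensim 3.x API. gensim 4.0 renamed several of the names used here; a sketch of the equivalent 4.x calls (file paths are hypothetical):

from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import LineSentence

w2v = Word2Vec(sentences=LineSentence('sentences.txt'),
               sg=1,
               vector_size=256,  # was `size` in gensim 3.x
               window=5,
               min_count=1,
               epochs=40)        # was `iter` in gensim 3.x
w2v.wv.save_word2vec_format('w2v.bin', binary=True)

model = KeyedVectors.load_word2vec_format('w2v.bin', binary=True)
word_dict = {word: model[word] for word in model.key_to_index}  # was `model.vocab`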
Example #3
File: build_w2v.py  Project: LGP1010/NLP
def build(train_x_seg_path,
          test_y_seg_path,
          test_seg_path,
          out_path=None,
          sentence_path='',
          w2v_bin_path="w2v.bin",
          min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path,
                                 test_seg_path)
    save_sentence(sentences, sentence_path)
    print(len(sentences))
    print('train w2v model...')
    # train model
    """
    通过gensim工具完成word2vec的训练,输入格式采用sentences,使用skip-gram,embedding维度256
    your code
    w2v = (one line)
    """
    w2v = Word2Vec(LineSentence(sentence_path), size=256, sg=1,
                   min_count=1)  # LineSentence把txt文件转为所需要的格式(已经分词,以空格隔开)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)  # 模型保存为bin格式
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')  # how close are these two word vectors
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path,
                                              binary=True)  # load the trained word vectors
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
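Beyond a single similarity score, the trained vectors can be spot-checked with nearest-neighbor queries; a small sketch using the model trained above:

# Top-5 nearest neighbors of '技师' by cosine similarity (gensim 3.x API)
for word, score in w2v.wv.most_similar('技师', topn=5):
    print(word, score)

Example #4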
def build(train_x_seg_path,
          test_y_seg_path,
          test_seg_path,
          out_path=None,
          sentence_path='',
          w2v_bin_path="w2v.bin",
          min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path,
                                 test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1,
                   sentences=LineSentence(sentence_path),
                   size=256,
                   window=5,
                   min_count=min_count,
                   iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
Example #5
def build(train_x_seg_path,
          test_y_seg_path,
          test_seg_path,
          out_path=None,
          sentence_path='',
          w2v_bin_path="w2v.bin",
          min_count=100):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path,
                                 test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # Train the model; LineSentence is the helper that reads the sentence file.
    w2v = Word2Vec(sg=1,
                   sentences=LineSentence(sentence_path),
                   size=256,
                   window=5,
                   min_count=min_count,
                   iter=5)
    # Save the model.
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # Sanity-check the word-vector model.
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # Load our word-vector model; it was saved in binary format, and the loader lives in KeyedVectors.
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
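This example keeps only words that occur at least 100 times (min_count=100), which shrinks the vocabulary considerably. A quick sketch for checking the effect of the cutoff, using the gensim 3.x attribute names as above:

# Compare vocabulary sizes at different frequency cutoffs
# (build_vocab only scans the corpus; no training happens here).
for mc in (1, 5, 100):
    m = Word2Vec(size=256, sg=1, min_count=mc)
    m.build_vocab(LineSentence(sentence_path))
    print('min_count=%d -> vocabulary size %d' % (mc, len(m.wv.vocab)))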
Example #6
def build(train_x_seg_path,
          test_y_seg_path,
          test_seg_path,
          out_path=None,
          sentence_path='',
          w2v_bin_path="w2v.bin",
          min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path,
                                 test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # Train word2vec with gensim: the input is the segmented sentences,
    # using skip-gram (sg=1) with an embedding dimension of 256.
    w2v = Word2Vec(sentences=LineSentence(sentence_path),
                   sg=1,                  # skip-gram
                   size=256,              # embedding dimension
                   window=5,              # context window
                   min_count=min_count,   # frequency cutoff
                   negative=3,            # number of negative samples
                   sample=0.001,          # downsampling threshold for frequent words
                   hs=1,                  # hierarchical softmax
                   workers=4)             # training threads

    # LineSentence turns one txt file into the required format;
    # PathLineSentence does the same for every text file in a directory.
    # Alternative: save/restore the full model, or just the vectors:
    # w2v.save('word2vec.model')
    # loaded_model = Word2Vec.load('word2vec.model')
    # wv = w2v.wv
    # del w2v
    # wv.save('word_vector')

    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
Example #7
def build(train_x_seg_path,
          test_y_seg_path,
          test_seg_path,
          out_path=None,
          sentence_path='',
          w2v_bin_path="w2v.bin",
          min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path,
                                 test_seg_path)

    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    """
    通过gensim工具完成word2vec的训练,输入格式采用sentences,使用skip-gram,embedding维度256
    your code
    w2v = (one line)
    """

    # 训练skip-gram模型
    # min_count,频数阈值,大于等于1的保留
    # size,神经网络 NN 层单元数,它也对应了训练算法的自由程度
    # workers=4,default = 1 worker = no parallelization 只有在机器已安装 Cython 情况下才会起到作用。如没有 Cython,则只能单核运行。

    w2v = Word2Vec(sentences, size=256, window=5, min_count=1, workers=4, sg=1)

    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok.__________------" % w2v_bin_path)

    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)

    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
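One caveat with passing `sentences` directly, as this example does: gensim's Word2Vec expects an iterable of token lists, so this only works if extract_sentence returns pre-tokenized lists. A list of raw strings would make gensim treat each character as a token, which is why the other examples go through LineSentence over the saved sentence file. A minimal sketch of the expected shape:

# Word2Vec wants one list of tokens per sentence:
good = [['技师', '说', '你好'], ['车主', '说', '在', '的']]
bad = ['技师 说 你好', '车主 说 在 的']  # each character would become a "word"
w2v = Word2Vec(good, size=256, sg=1, min_count=1)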
Example #8
def build(train_x_seg_path,
          test_y_seg_path,
          test_seg_path,
          out_path=None,
          sentence_path='',
          w2v_bin_path="w2v.bin",
          min_count=1):
    # Read the three source files and merge their sentences,
    # splitting into words on col_sep.
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path,
                                 test_seg_path)
    print(sentences[:5])
    save_sentence(sentences, sentence_path)

    print('train w2v model...')
    # train model
    """
    通过gensim工具完成word2vec的训练,输入格式采用sentences,使用skip-gram,embedding维度256
    your code
    w2v = (one line)
    """
    # 如果模型还未训练过,则开始训练,否则的话跳过训练,直接加载模型
    if not os.path.exists(w2v_bin_path):
        model = Word2Vec(sentences, size=256, window=3, min_count=1, workers=4)
        model.wv.save_word2vec_format(w2v_bin_path, binary=True)
        print("save %s ok." % w2v_bin_path)

    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)

    # test
    print(model['技师'])
    sim = model.similarity('技师', '车主')  # model is already a KeyedVectors; no .wv needed
    print('技师 vs 车主 similarity score:', sim)
    # prints 0.7745

    # Store the word-vector dict.
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
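dump_pkl and load_pkl are project helpers rather than gensim API; a minimal sketch of what they presumably wrap (an assumption, based purely on how they are called in these examples):

import os
import pickle

def dump_pkl(obj, path, overwrite=True):
    # Hypothetical reconstruction of the helper used above.
    if path and (overwrite or not os.path.exists(path)):
        with open(path, 'wb') as f:
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

Example #9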
def train():
    expName = 'ptr128_image_{0}_text_{1}'.format(str(return_image),
                                                 str(return_text))
    model = ptrnet_model(input_dim=input_dim, hiddenStates=128, parallel=True)
    model.load_weights('weights_ptr128_image_True_text_True.h5')

    train_losses = []
    test_losses = []
    print(input_dim)
    min_loss = 1.68
    for ep in range(nb_epoch):
        ep_start = time()
        print "Epoch:", ep + 1
        _loss = 0.0
        for b, (x, y) in enumerate(generate_batches(batch_size)):
            h = model.fit(x, y, batch_size=4096 * 4, verbose=0, nb_epoch=1)
            _loss += h.history['loss'][0]
        print "\n", _loss / b

        test_loss = 0.0
        for b, (x, y) in enumerate(generate_batches(batch_size,
                                                    training=False)):
            test_loss += model.evaluate(x, y, verbose=0, batch_size=4096 * 4)
        test_loss /= b
        test_losses.append(test_loss)

        print('test loss:', test_loss)

        if test_loss < min_loss:
            print('Loss improved from {0} to {1}'.format(min_loss, test_loss))
            min_loss = test_loss
            print('Saving model_%s' % expName)
            model.save('model_%s.h5' % expName)
            model.save_weights('weights_%s.h5' % expName)
        train_losses.append(_loss)

        dump_pkl([train_losses, test_losses], 'stories_losses')
        print(time() - ep_start, "seconds for epoch", ep + 1)
        print("=" * 100)
Example #10
def build(train_x_seg_path, test_y_seg_path, test_seg_path, jiebainput_path,
          jiebaoutput_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    """
    通过gensim工具完成word2vec的训练,输入格式采用sentences,使用skip-gram,embedding维度256
    your code
    w2v = (one line)
    """

    fileTrainSeg=[]
    fileTrainSeg=write_jiebatxt(jiebainput_path)
    # 保存分词结果到文件中
    with open(jiebaoutput_path,'w',encoding='utf-8') as fW:
        for i in range(len(fileTrainSeg)):
            fW.write(fileTrainSeg[i][0])
            fW.write('\n')

    # 训练skip-gram模型
    w2v = Word2Vec(LineSentence(jiebaoutput_path), size=50, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())


    w2v.wv.save_word2vec_format(jiebaoutput_path, binary=True)
    print("save %s ok." % jiebaoutput_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
if False:  # flip to True to regenerate the n-grams instead of loading the cached pickle
    (x_train, y_train), (x_test, y_test) = load_data(path="imdb.npz",
                                                     skip_top=0,
                                                     maxlen=None,
                                                     seed=113,
                                                     start_char=1,
                                                     oov_char=0,
                                                     index_from=2)
    n = 10
    n_grams = []
    for x in x_train:
        l = len(x)
        for i in range(0, l - n + 1, 1):
            n_grams.append(x[i:i + n])
    n_grams = np.array(n_grams)
    dump_pkl(n_grams, 'sentences_10_grams')

else:
    sentences = load_pkl('sentences_10_grams')
    L = len(sentences)
    maxWordIndex = np.max(sentences)

print(L)
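The if-branch above slides a window of width n over each review, so a sequence of length l yields l - n + 1 overlapping n-grams. A tiny worked example:

# For a sequence of length 5 and n = 3 there are 5 - 3 + 1 = 3 windows.
x = [1, 2, 3, 4, 5]
n = 3
windows = [x[i:i + n] for i in range(len(x) - n + 1)]
print(windows)  # [[1, 2, 3], [2, 3, 4], [3, 4, 5]]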


def create_data(sentences, K=120):

    x = np.zeros((len(sentences), 10, 1))
    y = np.tile(np.eye(10), (len(sentences), 1, 1))

    x = sentences