예제 #1
0
def build_average_dv(docs,
                     doc_num,
                     model,
                     save=True,
                     save_file="doc_vector_ave.bin"):
    """Build a (doc_num, num_features) document-vector matrix in parallel.

    Every document is submitted to a worker pool as one
    ``single_average_dv(words, word_vectors, index, doc_num)`` task. The
    workers receive the shared output buffer through ``initprocess``; this
    function only allocates that buffer, dispatches the tasks and reshapes
    the buffer once all tasks have finished.

    Args:
        docs: Iterable of documents; each item is forwarded unchanged to
            the worker as ``words``.
        doc_num: Number of rows allocated in the result. Presumably this
            must equal the number of items in ``docs`` (the position of
            each document is passed to the worker as its row index) --
            confirm against ``single_average_dv``.
        model: Object whose ``syn0.shape[1]`` gives the vector width and
            which ``util.get_word_vec_dict`` can turn into a mapping.
        save: When true, the matrix is written with ``numpy.save``.
        save_file: Target passed to ``numpy.save``. NumPy appends ``.npy``
            when the name does not already end with it.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(doc_num, num_features)``
        that is a view over the shared buffer (zero-initialised at
        allocation).

    Warns:
        RuntimeWarning: If one or more worker tasks raised. The matrix is
            still returned so the best-effort behaviour is preserved, but
            the failure is no longer silent.
    """
    import warnings

    num_features = model.syn0.shape[1]

    manager = Manager()
    try:
        global_doc_vector = mp.Array('d', doc_num * num_features, lock=False)
        global_word_set = manager.dict(util.get_word_vec_dict(model))

        pool = mp.Pool(initializer=initprocess, initargs=[global_doc_vector])
        try:
            # Keep the AsyncResult handles: apply_async stores a worker's
            # exception in the handle and never raises it on its own.
            pending = [
                pool.apply_async(single_average_dv, [
                    words,
                    global_word_set,
                    index,
                    doc_num,
                ]) for index, words in enumerate(docs)
            ]
            pool.close()
            pool.join()

            failure_count = 0
            first_failure = None
            for index, result in enumerate(pending):
                try:
                    result.get()
                except Exception as exc:  # reported below, not re-raised
                    failure_count += 1
                    if first_failure is None:
                        first_failure = (index, exc)
            if failure_count:
                warnings.warn(
                    "single_average_dv failed for %d of %d documents; their "
                    "rows may be incomplete (first failure at index %d: %r)"
                    % (failure_count, len(pending), first_failure[0],
                       first_failure[1]), RuntimeWarning)
        finally:
            # No-op after a clean close/join; reclaims the workers when
            # submitting the tasks was interrupted by an exception.
            pool.terminate()
    finally:
        # Manager() starts a server process; stop it explicitly.
        manager.shutdown()

    doc_vector = np.frombuffer(global_doc_vector).reshape(
        (doc_num, num_features))
    if save:
        np.save(save_file, doc_vector)
    return doc_vector
def build_av_tf_idf_dv(docs, doc_num, model, save=True, save_file="doc_vector_tfidf.bin"):
    """Build a (doc_num, num_features) document-vector matrix using tf-idf rows.

    A ``CountVectorizer`` and ``TfidfTransformer`` are fitted on
    ``util.word2sentence(docs)``. Every document is then submitted to a
    worker pool as one ``single_av_tf_idf_dv(words, word_vectors, index,
    doc_num, vocabulary, tfidf_row)`` task, where ``tfidf_row`` is row
    ``index`` of the fitted tf-idf matrix. The workers receive the shared
    output buffer through ``initprocess``.

    Args:
        docs: Iterable of documents. It is materialised into a list because
            it is traversed twice (fitting, then task submission).
        doc_num: Number of rows allocated in the result. Presumably this
            must equal ``len(docs)`` (the position of each document is
            passed to the worker as its row index) -- confirm against
            ``single_av_tf_idf_dv``.
        model: Object whose ``syn0.shape[1]`` gives the vector width and
            which ``util.get_word_vec_dict`` can turn into a mapping.
        save: When true, the matrix is written with ``numpy.save``.
        save_file: Target passed to ``numpy.save``. NumPy appends ``.npy``
            when the name does not already end with it.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(doc_num, num_features)``
        that is a view over the shared buffer (zero-initialised at
        allocation).

    Warns:
        RuntimeWarning: If one or more worker tasks raised. The matrix is
            still returned so the best-effort behaviour is preserved, but
            the failure is no longer silent.
    """
    import warnings

    docs = list(docs)
    vectorizer = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    count_fv = vectorizer.fit_transform(util.word2sentence(docs))
    tfidf_fv = tfidf_transformer.fit_transform(count_fv)

    num_features = model.syn0.shape[1]

    manager = Manager()
    try:
        global_word_set = manager.dict(util.get_word_vec_dict(model))
        global_vocabulary = manager.dict(vectorizer.vocabulary_)
        global_doc_vector = mp.Array('d', doc_num*num_features, lock=False)

        pool = mp.Pool(initializer=initprocess, initargs=[global_doc_vector])
        try:
            # Keep the AsyncResult handles: apply_async stores a worker's
            # exception in the handle and never raises it on its own.
            pending = [
                pool.apply_async(single_av_tf_idf_dv, [words, global_word_set, index, doc_num, global_vocabulary, tfidf_fv[index]])
                for index, words in enumerate(docs)
            ]
            pool.close()
            pool.join()

            failure_count = 0
            first_failure = None
            for index, result in enumerate(pending):
                try:
                    result.get()
                except Exception as exc:  # reported below, not re-raised
                    failure_count += 1
                    if first_failure is None:
                        first_failure = (index, exc)
            if failure_count:
                warnings.warn(
                    "single_av_tf_idf_dv failed for %d of %d documents; their "
                    "rows may be incomplete (first failure at index %d: %r)"
                    % (failure_count, len(pending), first_failure[0], first_failure[1]),
                    RuntimeWarning)
        finally:
            # No-op after a clean close/join; reclaims the workers when
            # submitting the tasks was interrupted by an exception.
            pool.terminate()
    finally:
        # Manager() starts a server process; stop it explicitly.
        manager.shutdown()

    doc_vector = np.frombuffer(global_doc_vector).reshape((doc_num, num_features))
    if save:
        np.save(save_file, doc_vector)
    return doc_vector
def build_average_dv(docs, doc_num, model, save=True, save_file="doc_vector_ave.bin"):
    """Compute document vectors in worker processes and return them as a matrix.

    One ``single_average_dv(words, word_vectors, index, doc_num)`` task is
    queued per document. The output lives in a shared ``'d'`` array that is
    handed to each worker via ``initprocess``; after all tasks complete it
    is exposed as a ``(doc_num, num_features)`` NumPy array.

    Args:
        docs: Iterable of documents; each item is forwarded unchanged to
            the worker as ``words``.
        doc_num: Number of rows allocated in the result. Presumably equal
            to the number of items in ``docs``, since each document's
            position is used as its row index -- confirm against
            ``single_average_dv``.
        model: Object providing ``syn0`` (``syn0.shape[1]`` is the vector
            width) and accepted by ``util.get_word_vec_dict``.
        save: When true, the matrix is written with ``numpy.save``.
        save_file: Target passed to ``numpy.save``; ``.npy`` is appended by
            NumPy when the name lacks that extension.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(doc_num, num_features)``
        viewing the shared buffer (zero-initialised at allocation).

    Warns:
        RuntimeWarning: If any worker task raised. The matrix is returned
            regardless, so existing best-effort behaviour is kept while
            the failure becomes visible.
    """
    import warnings

    num_features = model.syn0.shape[1]

    manager = Manager()
    try:
        global_doc_vector = mp.Array('d', doc_num*num_features, lock=False)
        global_word_set = manager.dict(util.get_word_vec_dict(model))

        pool = mp.Pool(initializer=initprocess, initargs=[global_doc_vector])
        try:
            # The handles must be kept: a worker exception is only stored
            # in its AsyncResult and is otherwise dropped without a trace.
            pending = [
                pool.apply_async(single_average_dv, [words, global_word_set, index, doc_num, ])
                for index, words in enumerate(docs)
            ]
            pool.close()
            pool.join()

            failure_count = 0
            first_failure = None
            for index, result in enumerate(pending):
                try:
                    result.get()
                except Exception as exc:  # reported below, not re-raised
                    failure_count += 1
                    if first_failure is None:
                        first_failure = (index, exc)
            if failure_count:
                warnings.warn(
                    "single_average_dv failed for %d of %d documents; their "
                    "rows may be incomplete (first failure at index %d: %r)"
                    % (failure_count, len(pending), first_failure[0], first_failure[1]),
                    RuntimeWarning)
        finally:
            # Harmless after close/join; frees the workers if queuing the
            # tasks raised part-way through.
            pool.terminate()
    finally:
        # Stop the server process that Manager() started.
        manager.shutdown()

    doc_vector = np.frombuffer(global_doc_vector).reshape((doc_num, num_features))
    if save:
        np.save(save_file, doc_vector)
    return doc_vector
예제 #4
0
def build_av_tf_idf_dv(docs,
                       doc_num,
                       model,
                       save=True,
                       save_file="doc_vector_tfidf.bin"):
    """Compute tf-idf based document vectors in worker processes.

    The documents are converted with ``util.word2sentence`` and used to fit
    a ``CountVectorizer`` followed by a ``TfidfTransformer``. One
    ``single_av_tf_idf_dv(words, word_vectors, index, doc_num, vocabulary,
    tfidf_row)`` task is then queued per document, with ``tfidf_row`` being
    row ``index`` of the fitted tf-idf matrix. The output lives in a shared
    ``'d'`` array handed to each worker via ``initprocess``.

    Args:
        docs: Iterable of documents. Materialised into a list first because
            it is walked twice (fitting, then task submission).
        doc_num: Number of rows allocated in the result. Presumably equal
            to ``len(docs)``, since each document's position is used as its
            row index -- confirm against ``single_av_tf_idf_dv``.
        model: Object providing ``syn0`` (``syn0.shape[1]`` is the vector
            width) and accepted by ``util.get_word_vec_dict``.
        save: When true, the matrix is written with ``numpy.save``.
        save_file: Target passed to ``numpy.save``; ``.npy`` is appended by
            NumPy when the name lacks that extension.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(doc_num, num_features)``
        viewing the shared buffer (zero-initialised at allocation).

    Warns:
        RuntimeWarning: If any worker task raised. The matrix is returned
            regardless, so existing best-effort behaviour is kept while
            the failure becomes visible.
    """
    import warnings

    docs = list(docs)
    vectorizer = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    count_fv = vectorizer.fit_transform(util.word2sentence(docs))
    tfidf_fv = tfidf_transformer.fit_transform(count_fv)

    num_features = model.syn0.shape[1]

    manager = Manager()
    try:
        global_word_set = manager.dict(util.get_word_vec_dict(model))
        global_vocabulary = manager.dict(vectorizer.vocabulary_)
        global_doc_vector = mp.Array('d', doc_num * num_features, lock=False)

        pool = mp.Pool(initializer=initprocess, initargs=[global_doc_vector])
        try:
            # The handles must be kept: a worker exception is only stored
            # in its AsyncResult and is otherwise dropped without a trace.
            pending = [
                pool.apply_async(single_av_tf_idf_dv, [
                    words, global_word_set, index, doc_num, global_vocabulary,
                    tfidf_fv[index]
                ]) for index, words in enumerate(docs)
            ]
            pool.close()
            pool.join()

            failure_count = 0
            first_failure = None
            for index, result in enumerate(pending):
                try:
                    result.get()
                except Exception as exc:  # reported below, not re-raised
                    failure_count += 1
                    if first_failure is None:
                        first_failure = (index, exc)
            if failure_count:
                warnings.warn(
                    "single_av_tf_idf_dv failed for %d of %d documents; "
                    "their rows may be incomplete (first failure at index "
                    "%d: %r)" % (failure_count, len(pending),
                                 first_failure[0], first_failure[1]),
                    RuntimeWarning)
        finally:
            # Harmless after close/join; frees the workers if queuing the
            # tasks raised part-way through.
            pool.terminate()
    finally:
        # Stop the server process that Manager() started.
        manager.shutdown()

    doc_vector = np.frombuffer(global_doc_vector).reshape(
        (doc_num, num_features))
    if save:
        np.save(save_file, doc_vector)
    return doc_vector