def main():
    """Project the first 1000 word-embedding vectors to 2-D with t-SNE and plot them."""
    model = word2vec_model.word2vec_model()
    vectors, vocab = load_embeddings(model)

    np.set_printoptions(suppress=True)
    # Fit t-SNE on only the first 1000 vectors to keep the plot (and runtime) manageable.
    projection = TSNE(n_components=2, random_state=0).fit_transform(vectors[:1000, :])

    plt.figure(figsize=(18, 18))
    plt.scatter(projection[:, 0], projection[:, 1])
    # Label every plotted point with its vocabulary word.
    for word, px, py in zip(vocab, projection[:, 0], projection[:, 1]):
        plt.annotate(word, xy=(px, py), xytext=(0, 0), textcoords='offset points')
    plt.show()
# ==== Example #2 ====
    query_wordcount[q] = ProcDoc.word_count(q_content, {})

query_unigram = ProcDoc.unigram(dict(query_wordcount))
query_model = query_unigram
Pickle.dump(query_model, open("model/query_model.pkl", "wb"), True)

# remove template file
for rm_file in remove_list:
    if os.path.isfile("model/" + rm_file):
        os.remove("model/" + rm_file)

# Embedded Query Expansion
m_list = np.linspace(4, 4, num=1)
m = 1
interpolated_aplpha_list = np.linspace(0, 1.0, num=11)
word2vec = word2vec_model.word2vec_model(word_emb_path)

embd = EmbeddedBased(query_wordcount, collection, word2vec)
evaluate_model = EvaluateModel(relevance_path)
EQE1 = []
EQE2 = []
print "Embedded..."
# Embedding-based system (hyperparameter)
tmp_eqe1 = embd.embedded_query_expansion_ci(0.4, 4)
tmp_eqe2 = embd.embedded_query_expansion_qi(0.4, 4)
tmp_eqe1 = ProcDoc.modeling(tmp_eqe1, background_model, query_lambda)
tmp_eqe2 = ProcDoc.modeling(tmp_eqe2, background_model, query_lambda)
EQE1.append([ProcDoc.dict2np(tmp_eqe1), tmp_eqe1])
EQE2.append([ProcDoc.dict2np(tmp_eqe2), tmp_eqe2])

Pickle.dump(EQE1, open("model/eqe1_10.pkl", "wb"), True)
# ==== Example #3 ====
    newEmbeddingList = []
    for idx, name in enumerate(objList):
        cur_idx = nameList.index(name)
        print EmbeddingList[cur_idx].shape
        newEmbeddingList.append(EmbeddingList[cur_idx])
    return newEmbeddingList


# Load the cached document / training-query / test-query id lists
# produced by an earlier preprocessing step.
with open(model_path + "doc_list.pkl", "rb") as f:
    doc_list = Pickle.load(f)
with open(model_path + "query_list.pkl", "rb") as f:
    qry_list = Pickle.load(f)
with open(model_path + "test_query_list.pkl", "rb") as f:
    tstQry_list = Pickle.load(f)

# Word-embedding model: lookup table plus vocabulary size.
wordModel = word2vec_model.word2vec_model()
wordVec = wordModel.getWord2Vec()
vocab_length = wordModel.vocabulary_length
print vocab_length

# document
# Read and normalize the raw document collection; the embedding/export steps
# below were disabled by the author and left for reference.
doc = ProcDoc.read_file(document_path)
doc = ProcDoc.doc_preprocess(doc)
#[docTmpList, docEmbList] = content2Emb(doc, wordVec, 100)
#doc_emb = rePermute(docTmpList, docEmbList, doc_list)
#doc_emb = content2List(doc, doc_list)
#doc_emb = np.asarray(doc_emb)
#print doc_emb.shape
#np.save(model_path + "doc_id_fix_pad.npy", doc_emb)

# train query
# ==== Example #4 ====
    query_wordcount[q] = ProcDoc.word_count(q_content, {})

# Build a unigram language model per query from the word counts accumulated
# in the (cut-off) loop above this chunk, then persist it.
query_unigram = ProcDoc.unigram(dict(query_wordcount))
query_model = query_unigram
# Fix: use a context manager so the pickle file handle is closed instead of leaked.
with open("model/query_model.pkl", "wb") as query_model_file:
    Pickle.dump(query_model, query_model_file, True)
'''
# remove template file
for rm_file in remove_list:
    if os.path.isfile("model/" + rm_file):
        os.remove("model/" + rm_file)
'''
# Embedded Query Expansion
# NOTE(review): num=1 collapses linspace(1, 80) to the single value 1.0,
# so the sweep below runs exactly once with m = 1 -- likely unintended;
# confirm the intended grid before relying on these results.
m_list = np.linspace(1, 80, num=1)
m = 1
# NOTE(review): "aplpha" is a typo for "alpha"; name kept as-is.
interpolated_aplpha_list = np.linspace(0, 1.0, num=11)
word2vec = word2vec_model.word2vec_model()
evaluate_model = EvaluateModel(
    "../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt")
EQE1 = []
EQE2 = []
for m in m_list:
    # Expand every query with embedding similarity, then smooth against
    # the background model before converting to numpy form.
    [tmp_eqe1,
     tmp_eqe2] = Embedded_based.EmbeddedQuery(query_wordcount, collection,
                                              word2vec, 1, int(m))
    tmp_eqe1 = ProcDoc.modeling(tmp_eqe1, background_model, query_lambda)
    tmp_eqe2 = ProcDoc.modeling(tmp_eqe2, background_model, query_lambda)
    EQE1.append(ProcDoc.dict2np(tmp_eqe1))
    EQE2.append(ProcDoc.dict2np(tmp_eqe2))

# Fix: close both output handles deterministically.
with open("model/eqe1_10.pkl", "wb") as eqe1_file:
    Pickle.dump(EQE1, eqe1_file, True)
with open("model/eqe2_10.pkl", "wb") as eqe2_file:
    Pickle.dump(EQE2, eqe2_file, True)