Example #1
    # write out the segmented text; the with-block closes f2 automatically
    with open(r'D:/in_the_name_of_people_segment.txt', 'wb+') as f2:
        f2.write(result)

f.close()  # close the input file opened earlier (outside this excerpt)

import logging
import os
from time import time  # time() is used below to measure the elapsed run time

from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

sentences = word2vec.LineSentence(r'D:/in_the_name_of_people_segment.txt')

model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=3, size=20)

req_count = 5
for key in model.wv.similar_by_word('沙瑞金', topn=20):  # topn=20: retrieve the 20 most similar words
    if len(key[0]) == 3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break

end_time = time()
run_time = end_time - begin_time
print(run_time)

# the elapsed time is reported in seconds
train_data_features = vectorizer.fit_transform(questionList)
train_data_features = train_data_features.toarray()

# Get a list of all words from the feature list
vocab = vectorizer.get_feature_names()
# Sum the counts for each vocab word
dist = np.sum(train_data_features, axis=0)

num_features = 300
min_word_count = 30
num_workers = 4
context = 10
downsampling = 1e-3

model = word2vec.Word2Vec(questionList,
                          workers=num_workers,
                          size=num_features,
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)

# No more training, makes the model more memory friendly
model.init_sims(replace=True)


def makeFeatureVec(words, model, num_features):
	#preallocation of numpy array for speed purposes
	featureVec = np.zeros((num_features,), dtype="float32")
	nwords = 0
	#Convert model vocabulary to set for speed
	index2word_set = set(model.wv.index2word)
	for word in words:
		if word in index2word_set:
			nwords = nwords+1
Example #3
def BuildSemanticModel(semantic_model_input_file,
                       pretrained_input_file,
                       use_pretrained_vectors=True,
                       high_sd_cutoff=3,
                       low_n_cutoff=1):
    """
    Given an input file produced by the ALIGN Phase 1 functions,
    build a semantic model from all transcripts in all conversations
    in target corpus after removing high- and low-frequency words.
    High-frequency words are determined by a user-defined number of
    SDs over the mean (by default, `high_sd_cutoff=3`). Low-frequency
    words must appear over a specified number of raw occurrences
    (by default, `low_n_cutoff=1`).

    Frequency cutoffs can be removed by `high_sd_cutoff=None` and/or
    `low_n_cutoff=0`.
    """

    # build vocabulary list from transcripts
    data1 = pd.read_csv(semantic_model_input_file, sep='\t', encoding='utf-8')

    # get frequency count of all included words
    all_sentences = [
        re.sub(r'[^\w\s]+', '', str(row)).split(' ')
        for row in list(data1['lemma'])
    ]
    all_words = list([a for b in all_sentences for a in b])
    frequency = defaultdict(int)
    for word in all_words:
        frequency[word] += 1

    # keep only words that occur more often than our cutoff (raw occurrences)
    frequency = {
        word: freq
        for word, freq in frequency.items() if freq > low_n_cutoff
    }

    # if desired, remove high-frequency words (over user-defined SDs above mean)
    if high_sd_cutoff is None:
        contentWords = [word for word in frequency.keys()]
    else:
        getOut = np.mean(frequency.values()) + (np.std(frequency.values()) *
                                                (high_sd_cutoff))
        contentWords = {
            word: freq
            for word, freq in frequency.iteritems() if freq < getOut
        }.keys()

    # decide whether to build semantic model from scratch or load in pretrained vectors
    if not use_pretrained_vectors:
        keepSentences = [[word for word in row if word in contentWords]
                         for row in all_sentences]
        semantic_model = word2vec.Word2Vec(all_sentences,
                                           min_count=low_n_cutoff)
    else:
        if pretrained_input_file is None:
            raise ValueError(
                'Error! Specify path to pretrained vector file using the `pretrained_input_file` argument.'
            )
        else:
            semantic_model = gensim.models.KeyedVectors.load_word2vec_format(
                pretrained_input_file, binary=True)

    # return all the content words and the trained word vectors
    return contentWords, semantic_model.wv
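
A minimal usage sketch (the file path and argument values below are illustrative, not taken from the original source): train the vectors from scratch on a Phase 1 output file instead of loading pretrained ones.

content_words, vectors = BuildSemanticModel(
    semantic_model_input_file='align_phase1_output.txt',  # hypothetical path
    pretrained_input_file=None,
    use_pretrained_vectors=False,  # build the model from the transcripts themselves
    high_sd_cutoff=3,
    low_n_cutoff=1)
print(len(content_words), 'content words retained')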
Example #4
is_test_train = np.zeros(N, dtype='int')
is_test_test = np.ones(M, dtype='int')
data_train['is_test'] = pd.Series(is_test_train, index=data_train.index)
data_test['is_test'] = pd.Series(is_test_test, index=data_test.index)
data_train_test = pd.concat([
    data_train[['question1', 'question2', 'is_test']],
    data_test[['question1', 'question2', 'is_test']]
],
                            axis=0)
corpus = hr.build_corpus(data_train_test)
print "Corpus creado"

# Design decision: word2vec is trained on the cleaned data set
model = word2vec.Word2Vec(corpus,
                          size=100,
                          window=20,
                          min_count=200,
                          workers=4)
model.save(path + 'mymodel')
print "Model word2vec creado y guardado"

del corpus  # hint to free up RAM

df_RF_train = hr.clean_dataframe_after_building_model(data_train)
df_RF_test = hr.clean_dataframe_after_building_model(data_test)

del model

df_RF_train.to_csv(path + 'df_RF_train.csv', index=False)
df_RF_test.to_csv(path + 'df_RF_test.csv', index=False)
Example #5
sentences1 = ['this is a sentence', 'this is the second sentence']
# the correct input format: a list of tokenized sentences
sentences2 = [['this', 'is', 'a', 'sentence'],
              ['this', 'is', 'the', 'second', 'sentence']]
# it can also be one long list of every token in the document, as long as it stays two-dimensional
# no deduplication is needed, since Word2Vec takes word frequency into account during training
sentences3 = [[
    'this', 'is', 'a', 'sentence', 'this', 'is', 'the', 'second', 'sentence'
]]

# build the model
model = word2vec.Word2Vec(sentences2,
                          sg=1,
                          size=20,
                          window=1,
                          min_count=1,
                          negative=3,
                          sample=0.001,
                          hs=1,
                          workers=4)
'''
Word2Vec parameter notes
    1. sentences: can be a list; for large corpora it is better to build it with BrownCorpus, Text8Corpus or LineSentence.
    2. sg: training algorithm; the default 0 uses CBOW, while sg=1 uses skip-gram.
    3. size: dimensionality of the output word vectors, default 100. Larger sizes need more training data but give better results; a few tens to a few hundred is recommended.
    4. window: training window size; 8 means each word considers the 8 words before and after it (the code also randomly shrinks the window), default 5.
    5. alpha: the learning rate.
    6. seed: seed for the random number generator, used when initialising the word vectors.
    7. min_count: truncates the vocabulary; words appearing fewer than min_count times are dropped, default 5.
    8. max_vocab_size: RAM limit while building the vocabulary; if the number of distinct words exceeds it, the least frequent ones are pruned. Roughly 1 GB of RAM per ten million words. None means no limit.
    9. sample: downsampling threshold; the more frequent a word is in the training data, the more it is downsampled. Default 1e-3, useful range (0, 1e-5).
'''
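
Note: the snippets on this page use the gensim 3.x constructor. In gensim 4.x several arguments were renamed (size became vector_size, iter became epochs). A minimal sketch of the same call as above, assuming gensim 4.x is installed:

from gensim.models import Word2Vec

# gensim 4.x equivalent of the model built above (only the parameter names change)
model_v4 = Word2Vec(sentences2,
                    sg=1,
                    vector_size=20,  # formerly `size`
                    window=1,
                    min_count=1,
                    negative=3,
                    sample=0.001,
                    hs=1,
                    workers=4)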
Example #6
# split into a list of lists
# so the data can be fed to Word2Vec in the expected format

product_names_word2vec = list()
for i in product_names_cleansing:
    a = i.split(" ")
    product_names_word2vec.append(a)

embedding_size = 300  # dimensionality of the word vectors
min_count = 1  # only use words that appear at least this many times
max_sentence_length = 38  # maximum sentence length

# Word2Vec Embedding
w2v_model = word2vec.Word2Vec(product_names_word2vec,
                              size=embedding_size,
                              min_count=min_count)
print(w2v_model)

# similar words
# w2v_model.wv.most_similar("BANANA")

# word2vec vector values
w2v_weight = w2v_model.wv.vectors
w2v_weight.shape

# build {index: word} dictionary, with extra entries for padding and out-of-vocabulary words
index2word = {i + 2: w for i, w in enumerate(w2v_model.wv.index2word)}
index2word[0] = 'PAD'
index2word[1] = 'UNK'
word2index = {w: i for i, w in index2word.items()}
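
A small follow-up sketch (not part of the original source) showing what the PAD/UNK dictionaries above are typically used for: converting each tokenized product name into a fixed-length sequence of indices, with max_sentence_length taken from the setting above.

def encode(tokens, word2index, max_len):
    # map known words to their index, unknown words to 'UNK' (index 1)
    ids = [word2index.get(w, word2index['UNK']) for w in tokens[:max_len]]
    # pad the tail with 'PAD' (index 0) up to max_len
    return ids + [word2index['PAD']] * (max_len - len(ids))

encoded_names = [encode(tokens, word2index, max_sentence_length)
                 for tokens in product_names_word2vec]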
Example #7
'''
Building Word2Vector Model for Word Embeddings
'''
print("Building Word2Vec model..")

# Initialize Model Building timer
start = time.time()

# Check if a Word2Vec Model name is specified
if Word2VecModelName:
    # Load a locally saved model
    v2wmodel = Word2Vec.load(Word2VecModelName)
else:
    # Building the Word2Vec Model with the specified parameters
    v2wmodel = word2vec.Word2Vec(training_sentences,
                                 size=vector_dimensions,
                                 window=window_size,
                                 min_count=min_word_count,
                                 workers=number_of_workers)

    # Save the newly built Word2Vec model
    v2wmodel.save("Word2VecModel")

# End and display time to build Word2Vec Model
print("Model built in : ", time.time()-start,"s.\n")



'''
Embedding of Train Vectors
'''
print("Creating Embedded Train Vectors..")
start = time.time()
Example #8
tradeEnglish = pd.Series(data.loc[:, "Trade_English"])

# %% prepare for word2vec

tradeEnglish2 = tradeEnglish.apply(lambda x: [" ".join(x).strip()])

tradeEnglish3 = []

for item in tradeEnglish2:
    tradeEnglish3.append("".join(item))

pd.Series(tradeEnglish3).to_csv(dataPath + "sentences.csv", header=0, index=0)

# %% w2v
import logging

logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s",
                    level=logging.INFO)
from gensim.models import word2vec

sentences = word2vec.LineSentence(dataPath + "sentences.csv")

model = word2vec.Word2Vec(sentences,
                          min_count=10,
                          workers=4,
                          size=300,
                          window=5,
                          iter=50)

model.save(dataPath + "word2vec.model")
Example #9
from gensim.models import word2vec
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
sentences = word2vec.Text8Corpus('./test.txt')

model = word2vec.Word2Vec(sentences, size=200, min_count=20, window=15)
model.save("./test.model")
Example #10
# Read the training data and convert it into corpus form first
from gensim.models import word2vec
from gensim.models import KeyedVectors, Word2Vec

sentences = []
with open("poem_for_embedding.txt") as f:
    for line in f.readlines():
        sentences.append(line.replace("\n", "").split(" "))

dim = 128
window = 5
min_count = 5
model = word2vec.Word2Vec(sentences,
                          size=dim,
                          window=window,
                          min_count=min_count,
                          workers=4)
model.save(f"vocab/w2v_{dim}.txt")
print(model.wv.most_similar("我", topn=10))
print(model.wv["我"])
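
Despite the .txt suffix, model.save above writes gensim's native format, so the file is reloaded with Word2Vec.load rather than a plain-text reader; a quick sketch:

loaded_model = Word2Vec.load(f"vocab/w2v_{dim}.txt")
print(loaded_model.wv.most_similar("我", topn=5))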
Example #11
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from gensim.models import word2vec
import logging

# main program
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
sentences = word2vec.Text8Corpus(r"C:\Users\CJ17\Desktop\text8")
model = word2vec.Word2Vec(sentences,
                          sg=1,
                          size=200,
                          window=10,
                          min_count=5,
                          negative=10)
model.save(r"C:\Users\CJ17\Desktop\text8output\200dimension\text8.model")
model.wv.save_word2vec_format(
    r"C:\Users\CJ17\Desktop\text8output\200dimension\text8.model.vector")

y1 = model.wv.similarity("woman", "man")
print(u"woman and man :", y1)
'''
model.save("text8.model")  # add the desired path
# corresponding way to load it:
# model_2 = word2vec.Word2Vec.load("text8.model")

model.wv.save_word2vec_format("text8.model.bin", binary=True)
# corresponding way to load it:
# model_3 = KeyedVectors.load_word2vec_format("text8.model.bin", binary=True)
'''
Example #12
#-*-coding:utf-8-*-
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence

import logging

import sys
reload(sys)
sys.setdefaultencoding('utf8')

if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                        level=logging.INFO)
    txt = word2vec.Text8Corpus(u'train.data.model')
    model = word2vec.Word2Vec(txt, size=100, window=50, min_count=1)
    model.save('word2vec.model')
Example #13
cleantext = " ".join(tokens)

nlp = spacy.load('en_core_web_sm')  # make sure to use larger model!

doc = nlp(cleantext)
list_of_lists = []
for sentence in doc.sents:
    inner_list = []
    for token in sentence:
        inner_list.append(token.text)
    list_of_lists.append(inner_list)

model = word2vec.Word2Vec(list_of_lists,
                          size=200,
                          window=5,
                          min_count=4,
                          workers=4)


def tsne_plot(doc, myperplexity, title):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    # model.wv['trump'] accesses the word vector for the word 'trump'.

    # model.wv.vocab holds the vocabulary words
    for word in model.wv.vocab:

        # model.wv[word] is the vector for the word
Example #14
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(word_list(raw_sentence))

token_count = sum([len(sentence) for sentence in sentences])
print("\nToken count = {:,}".format(token_count))

# Build Word2Vec model
num_features = 300
min_word_count = 3
num_workers = multiprocessing.cpu_count()
context_size = 7
thrones2vec = w2v.Word2Vec(sg=1,
                           size=num_features,
                           min_count=min_word_count,
                           window=context_size,
                           workers=num_workers)
thrones2vec.build_vocab(sentences)

# # Train the model
# thrones2vec.train(sentences)
# if not os.path.exists(model_path):
#     os.mkdir(model_path)
# thrones2vec.save(model_name)

thrones2vec = w2v.Word2Vec.load(model_name)
man_sim = thrones2vec.wv.most_similar(positive=['man', 'woman'],
                                      negative=['girl'],
                                      topn=1)
print(man_sim)
Example #15
def my_word2vec(cut_filename):
    mysentence = word2vec.Text8Corpus(cut_filename)
    # model = word2vec.Word2Vec(mysentence, size=300, min_count=1, window=5, hs=5)
    model = word2vec.Word2Vec(mysentence, size=100, min_count=1, window=5, hs=5)
    model.save('./model/zh_wiki_global.model')
    return model
Example #16
def learn_embeddings(walks, output, dimensions=100, window_size=5, min_count=0, sg=1,
                     iterations=3, alpha=.1, min_alpha=.01, workers=4):
    model = word2vec.Word2Vec(sentences=walks, size=dimensions,
                              window=window_size, min_count=min_count, sg=sg, workers=workers,
                              iter=iterations, alpha=alpha, min_alpha=min_alpha)
    model.wv.save_word2vec_format(output)
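
A minimal usage sketch (the walks below are toy data, not from the original source): node2vec-style random walks are just lists of node tokens, so they can be passed straight to learn_embeddings.

toy_walks = [['a', 'b', 'c', 'a'],
             ['b', 'c', 'd', 'b'],
             ['c', 'd', 'a', 'c']]
learn_embeddings(toy_walks, output='toy_embeddings.txt',
                 dimensions=16, window_size=2, iterations=5)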
Example #17
twitter = Twitter()
results = []
lines = text.split("\r\n")

for line in lines:
    # morphological analysis --- (*3)
    # use the base (dictionary) form of each word
    malist = twitter.pos(line, norm=True, stem=True)
    r = []
    for word in malist:
        # exclude endings, particles, punctuation, etc.
        if not word[1] in ["Josa", "Eomi", "Punctuation"]:
            r.append(word[0])
    rl = (" ".join(r)).strip()
    results.append(rl)
    print(rl)
    
# write the results to a file --- (*4)
wakati_file = 'yesterday.txt'  # tokenized corpus file (kept separate from the model file saved below)
with open(wakati_file, 'w', encoding='utf-8') as fp:
    fp.write("\n".join(results))
    
# build the Word2Vec model --- (*5)
# read the text file with LineSentence
data = word2vec.LineSentence(wakati_file)
model = word2vec.Word2Vec(data, 
    size=200, window=10, hs=1, min_count=2, sg=1)

# save the model
model.save("yesterday.model")
print("ok")
Example #18
def train_word2vec(inputFile, modelFile):
    sentences = word2vec.LineSentence(inputFile)
    model = word2vec.Word2Vec(sentences, size=300, min_count=1, sg=1)
    model.save(modelFile)
Example #19
from gensim.models import word2vec
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
sentences = word2vec.Text8Corpus('text8')
model = word2vec.Word2Vec(sentences, size=200)
Example #20
    ###############################################################
    mpg = MetaPathGenerator()
    mpg.read_data("gene")
    ###############################################################

    ## paper relationship representation vectors
    ###############################################################
    all_embs = []
    rw_num = 10
    cp = set()
    for k in range(rw_num):
        mpg.generate_WMRW("gene/RW.txt", 5, 20)
        sentences = word2vec.Text8Corpus(r'gene/RW.txt')
        model = word2vec.Word2Vec(sentences,
                                  size=100,
                                  negative=25,
                                  min_count=1,
                                  window=10)
        embs = []
        for i, pid in enumerate(pubs):
            if pid in model.wv:
                embs.append(model.wv[pid])
            else:
                cp.add(i)
                embs.append(np.zeros(100))
        all_embs.append(embs)
    all_embs = np.array(all_embs)
    print(cp)
    ###############################################################

    ## paper semantic representation vectors
Example #21
def word_2_vec(path):
    sentences = word2vec.LineSentence(path)
    model = word2vec.Word2Vec(sentences, size=300, min_count=20, window=5)
    return model
Example #22
context_size = 7

#downsampling setting for frequent words
#0 - 1e-5 is good for this
downsampling = 1e-3

#seed for the RNG, to make the results reproducible
#random number generator
#deterministic, good for debugging
seed = 1

thrones2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)


thrones2vec.build_vocab(sentences)

print("Word2Vec vocabulary lenght:", len(thrones2vec.wv.vocab))

thrones2vec.train(sentences)  # on gensim >= 1.0 this call also needs total_examples and epochs

if not os.path.exists("trained"):
	os.makedirs("trained")
Example #23
# -*- coding: utf-8 -*-

from gensim.models import word2vec
import sys

args = sys.argv

data = word2vec.Text8Corpus(args[1]+'.txt')
model = word2vec.Word2Vec(data, size=200, min_count=1)

model.save(args[1]+".model")
Example #24
    sentences += review_to_sentences(review, tokenizer)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

num_features = 50
min_word_count = 1
num_workers = 4
context = 10
downsampling = 1e-3

from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling, sg=0)

model.init_sims(replace=True)

model_name = "50features_1minwords_1000context"
model.save(model_name)


def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features, ), dtype="float32")
    nwords = 0.

    index2word_set = set(model.wv.index2word)

    for word in words:
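        # completion sketch (not in the original excerpt): accumulate in-vocabulary
        # word vectors and return their average
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model.wv[word])
    # avoid division by zero when none of the words are in the vocabulary
    if nwords > 0.:
        featureVec = np.divide(featureVec, nwords)
    return featureVec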
Example #25
def generate_word2vec():
    s = word2vec.LineSentence(novel_seg_path)
    model = word2vec.Word2Vec(s, size=20, window=5, min_count=5, workers=4)
    model.save(novel_wzv_path)
    return model
Example #26
    line = line.replace('\t', '').replace('\n', '').replace(' ', '')
    seg_list = jieba.cut(line, cut_all=False, HMM=True)
    f2.write(" ".join(seg_list))

f1.close()
f2.close()

##### train the model
from gensim.models import word2vec
import logging

# main program
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s',
                    level=logging.INFO)
sentences = word2vec.Text8Corpus(u"Word2vec_jieba.txt")
model = word2vec.Word2Vec(sentences, size=50)  # train the model (window defaults to 5)
print(model)

# 1. compute the similarity / relatedness of two words
try:
    y1 = model.wv.similarity("阿里", "万达")
except KeyError:
    y1 = 0
print("Similarity between [阿里] and [万达]: %s" % y1)
print("-----\n")

# 2. list the words most related to a given word
y2 = model.wv.most_similar("阿里", topn=30)  # the 30 most related words
print("Words most related to [阿里]:\r\n")
for item in y2:
    print(item[0], item[1])
Example #27
from gensim.models import word2vec
# read in the corpus --- (*1)
sentences = word2vec.Text8Corpus('./wiki_wakati.txt')
# build the model --- (*2)
model = word2vec.Word2Vec(sentences, sg=1, size=100, window=5)
# save the model --- (*3)
model.save("./wiki.model")
Example #28
            if line == '\n':
                continue
            temp = line.replace('\n', '').split('\t')
            temp[1] = ''.join(temp[1].split())
            temp[1] = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()~-]+|[A-Za-z0-9]+", "", temp[1])
            # sentences.append(temp[1])
            sentences.append(jieba.lcut(temp[1]))
    return sentences

sentences_train = read(trainDataSource)
sentences_train_validation = read(trainDataSource) + read(validationDataSource)

embeddingSize = 300
miniFreq = 1

word2VecModel_1 = word2vec.Word2Vec(sentences = sentences_train, size = embeddingSize,
    min_count = miniFreq, window = 10, workers = multiprocessing.cpu_count(), sg = 0, iter = 20)
word2VecModel_1.save('word2VecModel_1')

word2VecModel_2 = word2vec.Word2Vec(sentences = sentences_train_validation, size = embeddingSize,
    min_count = miniFreq, window = 10, workers = multiprocessing.cpu_count(), sg = 0, iter = 20)
word2VecModel_2.save('word2VecModel_2')

word2VecModel_3 = word2vec.Word2Vec(sentences = sentences_train, size = embeddingSize,
    min_count = miniFreq, window = 10, workers = multiprocessing.cpu_count(), sg = 1, iter = 20)
word2VecModel_3.save('word2VecModel_3')

word2VecModel_4 = word2vec.Word2Vec(sentences = sentences_train_validation, size = embeddingSize,
    min_count = miniFreq, window = 10, workers = multiprocessing.cpu_count(), sg = 1, iter = 20)
word2VecModel_4.save('word2VecModel_4')

import gensim
Example #29
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling,hashfxn=myhash)

the Python 2.x declaration would be

model = word2vec.Word2Vec(bagOfsentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling
                          )
'''

print("Training model...")

model = word2vec.Word2Vec(bagOfsentences,
                          workers=num_workers,
                          size=num_features,
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)
"""
If you don't plan to train the model any further, calling
init_sims will make the model much more memory-efficient
If `replace` is set, forget the original vectors and only keep the normalized
ones = saves lots of memory!
"""
model.init_sims(replace=False)

# save the model for later use
# for loading, call Word2Vec.load()

model.save("../../classifier/Word2VectforNLPTraining")
Example #30
    def train_word2vec(self):
        sentences = word2vec.Text8Corpus(self.args['all_text_path'])
        model = word2vec.Word2Vec(sentences, size=128, negative=5, min_count=2, window=5)
        model.save(self.args['save_word2vec_model'])