示例#1
0
import prepare_sentence_stream
import gensim

#model = gensim.models.Word2Vec.load_word2vec_format('../../Word2Vec/all.s200.w11.n1.v20.cbow.bin', binary=True, unicode_errors='ignore')
#print("Loaded model")

# Corpus path and the sentence stream the project helper builds from it.
file = '../text/ch.txt'
sentence_stream = prepare_sentence_stream.file_to_stream(file)

print(sentence_stream)

# Fit the phrase (bigram) detector on the stream.
phrase_settings = {'min_count': 5, 'threshold': 10}
bigram = gensim.models.phrases.Phrases(sentence_stream, **phrase_settings)
print(bigram)

# Run the fitted detector over the same stream and materialise the result.
# NOTE(review): sentence_stream is iterated a second time here; this assumes
# file_to_stream returns something re-iterable (not a one-shot generator) --
# confirm against prepare_sentence_stream.
phrased_stream = bigram[sentence_stream]
new_sentences = list(phrased_stream)

# Open question from the original author: do we need to convert all words to
# lower case? Disabled code kept for reference:
# for l in new_sentences:
#     for i in range(len(l)):
#         l[i] = l[i].lower()

print("Read sentences, building model...")

# Train Word2Vec on the phrase-merged sentences, then save it with
# save_word2vec_format(binary=True).
file_name = 'word2vec/w2v.bin'
model = gensim.models.word2vec.Word2Vec(new_sentences, size=25)
model.save_word2vec_format(file_name, binary=True)

text_file_name = 'word2vec/w.txt'
示例#2
0
import prepare_sentence_stream
import gensim

# Punctuation lists forwarded to file_to_stream as `seps` and `trs`.
# NOTE(review): presumably `seps` are sentence terminators and `trs` are tokens
# to discard -- confirm against prepare_sentence_stream.
separators = ['...', '.', '?', '!']
trash = ['"', '--', '(', ')', ':', ',', ';']


file = '../text/ch.txt'
# Stream built with file_to_stream's defaults; used to fit the phrase model.
sentence_stream = prepare_sentence_stream.file_to_stream(file)
# Stream of the same file built with the explicit lists above; this is the one
# that gets rewritten with detected bigrams.
text = prepare_sentence_stream.file_to_stream(file, seps=separators, trs=trash)

print(sentence_stream)

bigram = gensim.models.phrases.Phrases(sentence_stream, min_count=5, threshold=10)
print(bigram)

new_sentences = list(bigram[text])

# Write one sentence per line, each token followed by a single space.
# The context manager guarantees the file is flushed and closed even if a
# write fails (the handle was previously left open).
new_file = '../text/ch_bigrams.txt'
with open(new_file, 'w', encoding='utf-8') as f:
    for l in new_sentences:
        for word in l:
            f.write(word + ' ')
        f.write('\n')


示例#3
0
import prepare_sentence_stream
import gensim

#model = gensim.models.Word2Vec.load_word2vec_format('../../Word2Vec/all.s200.w11.n1.v20.cbow.bin', binary=True, unicode_errors='ignore')
#print("Loaded model")

# Corpus path and the sentence stream the project helper builds from it.
file = '../text/ch.txt'
sentence_stream = prepare_sentence_stream.file_to_stream(file)

print(sentence_stream)

# Fit the phrase (bigram) detector on the stream.
phrase_settings = {'min_count': 5, 'threshold': 10}
bigram = gensim.models.phrases.Phrases(sentence_stream, **phrase_settings)
print(bigram)

# Run the fitted detector over the same stream and materialise the result.
# NOTE(review): sentence_stream is iterated a second time here; this assumes
# file_to_stream returns something re-iterable (not a one-shot generator) --
# confirm against prepare_sentence_stream.
phrased_stream = bigram[sentence_stream]
new_sentences = list(phrased_stream)

# Open question from the original author: do we need to convert all words to
# lower case? Disabled code kept for reference:
# for l in new_sentences:
#     for i in range(len(l)):
#         l[i] = l[i].lower()

print("Read sentences, building model...")

# Train Word2Vec on the phrase-merged sentences, then save it with
# save_word2vec_format(binary=True).
file_name = 'word2vec/w2v.bin'
model = gensim.models.word2vec.Word2Vec(new_sentences, size=25)
model.save_word2vec_format(file_name, binary=True)

# Path of the text output opened by the statements that follow.
text_file_name = 'word2vec/w.txt'
f = open(text_file_name, 'w', encoding='utf-8')
for key in model.vocab.keys():
示例#4
0
import prepare_sentence_stream
import gensim

# Punctuation lists forwarded to file_to_stream as `seps` and `trs`.
# NOTE(review): presumably `seps` are sentence terminators and `trs` are tokens
# to discard -- confirm against prepare_sentence_stream.
separators = ['...', '.', '?', '!']
trash = ['"', '--', '(', ')', ':', ',', ';']

file = '../text/ch.txt'
# Stream built with file_to_stream's defaults; used to fit the phrase model.
sentence_stream = prepare_sentence_stream.file_to_stream(file)
# Stream of the same file built with the explicit lists above; this is the one
# that gets rewritten with detected bigrams.
text = prepare_sentence_stream.file_to_stream(file, seps=separators, trs=trash)

print(sentence_stream)

bigram = gensim.models.phrases.Phrases(sentence_stream,
                                       min_count=5,
                                       threshold=10)
print(bigram)

new_sentences = list(bigram[text])

# Write one sentence per line, each token followed by a single space.
# The context manager guarantees the file is flushed and closed even if a
# write fails (the handle was previously left open).
new_file = '../text/ch_bigrams.txt'
with open(new_file, 'w', encoding='utf-8') as f:
    for l in new_sentences:
        for word in l:
            f.write(word + ' ')
        f.write('\n')