/
lab7.py
24 lines (22 loc) · 927 Bytes
/
lab7.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# -*- coding: utf-8 -*-
"""Lab 7: word2vec queries over three 20-newsgroups categories.

A one-off training pass (kept commented out below) lemmatizes the posts
and trains a gensim Word2Vec model saved as 'texts.dat'; subsequent runs
just load the saved model and query it.
"""
from gensim import corpora, models, matutils
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer

# Restrict to 3 of the 20 categories to keep the corpus small.
dataset = fetch_20newsgroups(categories=['alt.atheism', 'talk.religion.misc', 'sci.space'])
vect = TfidfVectorizer()
tok = vect.build_tokenizer()  # sklearn's default tokenizer handles the raw posts well
lem = WordNetLemmatizer()

# One-off training pass (commented out; the saved model is reused below).
# NOTE(review): the original version appended every document's lemmas to one
# shared `lemms` list and appended that same list to `texts` each iteration,
# so every sentence was the whole ever-growing corpus; it also rebound the
# name `models`, shadowing the gensim module needed for load(). Both are
# fixed in this commented version.
#texts = []
#for text in dataset.data:
#    lemms = [lem.lemmatize(token) for token in tok(text)]
#    texts.append(lemms)
#w2v = models.Word2Vec(texts, size=100, window=5, min_count=5, workers=4)
#w2v.save('texts.dat')

model = models.Word2Vec.load('texts.dat')
#print(model['theory'])
#print(model.similarity('man', 'car'))
#print(model.most_similar(positive=['man'], negative=['computer']))
# Odd-one-out query: which word does not belong with the others?
print(model.doesnt_match("car wheel glass engine".split()))