def word_vec(self, word, use_norm=False):
    """Get the word's representation in vector space, as a 1D numpy array.

    Parameters
    ----------
    word : str
        A single word whose vector needs to be returned.
    use_norm : bool
        If True, returns the normalized vector.

    Returns
    -------
    :class:`numpy.ndarray`
        The word's representation in vector space, as a 1D numpy array.

    Raises
    ------
    KeyError
        For words with all ngrams absent, a KeyError is raised.

    Example
    -------
    >>> from gensim.models import FastText
    >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
    >>>
    >>> model = FastText(sentences, min_count=1)
    >>> meow_vector = model.word_vec('meow')  # get vector for word

    """
    return FastTextKeyedVectors.word_vec(self.wv, word, use_norm=use_norm)
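# The KeyError above is worth seeing in action: a FastText model composes
# vectors for out-of-vocabulary words from their character ngrams, and only
# fails when none of the word's ngrams were seen in training. A minimal sketch
# of the three outcomes, assuming the toy corpus from the docstring and the
# gensim 3.x-era API used throughout this section (newer releases hash ngrams
# into buckets and always return a vector):
from gensim.models import FastText

sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
model = FastText(sentences, min_count=1)

print(model.wv.word_vec('meow').shape)     # in-vocabulary: direct lookup
print(model.wv.word_vec('meowing').shape)  # OOV: composed from shared ngrams

try:
    model.wv.word_vec('XXXXX')             # shares no ngrams with the corpus
except KeyError as err:
    print(err)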
def test_persistence(self):
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model = FT_gensim(sentences, min_count=1)
    model.save(tmpf)
    self.models_equal(model, FT_gensim.load(tmpf))
    # test persistence of the KeyedVectors of a model
    wv = model.wv
    wv.save(tmpf)
    loaded_wv = FastTextKeyedVectors.load(tmpf)
    self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
    self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
def test_persistence(self):
    model = FT_gensim(sentences, min_count=1)
    model.save(testfile())
    self.models_equal(model, FT_gensim.load(testfile()))
    # test persistence of the KeyedVectors of a model
    wv = model.wv
    wv.save(testfile())
    loaded_wv = FastTextKeyedVectors.load(testfile())
    self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
    self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
    self.assertEqual(len(wv.ngrams), len(loaded_wv.ngrams))
def test_norm_vectors_not_saved(self):
    model = FT_gensim(sentences, min_count=1)
    model.init_sims()
    model.save(testfile())
    loaded_model = FT_gensim.load(testfile())
    self.assertTrue(loaded_model.wv.syn0norm is None)
    self.assertTrue(loaded_model.wv.syn0_ngrams_norm is None)

    wv = model.wv
    wv.save(testfile())
    loaded_kv = FastTextKeyedVectors.load(testfile())
    self.assertTrue(loaded_kv.syn0norm is None)
    self.assertTrue(loaded_kv.syn0_ngrams_norm is None)
def test_norm_vectors_not_saved(self):
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model = FT_gensim(sentences, min_count=1)
    model.init_sims()
    model.save(tmpf)
    loaded_model = FT_gensim.load(tmpf)
    self.assertTrue(loaded_model.wv.vectors_norm is None)
    self.assertTrue(loaded_model.wv.vectors_ngrams_norm is None)

    wv = model.wv
    wv.save(tmpf)
    loaded_kv = FastTextKeyedVectors.load(tmpf)
    self.assertTrue(loaded_kv.vectors_norm is None)
    self.assertTrue(loaded_kv.vectors_ngrams_norm is None)
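# Both variants above pin down the same contract across the old (syn0*) and
# new (vectors*) attribute names: the L2-normalized copies built by
# init_sims() are a derived cache that save() deliberately drops. A short
# sketch of the consequence, assuming the gensim 3.x API used in these tests
# (the temp-file name is made up for illustration):
from gensim.models import FastText
from gensim.models.keyedvectors import FastTextKeyedVectors
from gensim.test.utils import get_tmpfile

sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
tmpf = get_tmpfile('fasttext_norm_demo.kv')

model = FastText(sentences, min_count=1)
model.init_sims()        # builds vectors_norm / vectors_ngrams_norm
model.wv.save(tmpf)

loaded = FastTextKeyedVectors.load(tmpf)
print(loaded.vectors_norm)               # None: the cache was not persisted
loaded.init_sims()                       # rebuilt on demand after loading
print(loaded.vectors_norm is not None)   # True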
def test_persistence_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model = FT_gensim(corpus_file=corpus_file, min_count=1)
        model.save(tmpf)
        self.models_equal(model, FT_gensim.load(tmpf))
        # test persistence of the KeyedVectors of a model
        wv = model.wv
        wv.save(tmpf)
        loaded_wv = FastTextKeyedVectors.load(tmpf)
        self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
        self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
def initialize_word_vectors(self): """Initializes FastTextKeyedVectors instance to store all vocab/ngram vectors for the model.""" self.wv = FastTextKeyedVectors() self.wv.min_n = self.min_n self.wv.max_n = self.max_n
import datetime
import pickle as pkl

import pandas as pd
import pymorphy2

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from gensim.models.wrappers.fasttext import FastTextKeyedVectors
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel

# tokenizer and pretrained sentiment model from dostoevsky
tokenizer = RegexTokenizer()
model = FastTextSocialNetworkModel(tokenizer=tokenizer)

# morph analyzer for text lemmatization
morph = pymorphy2.MorphAnalyzer()

# pretrained fastText word vectors
fasttext = FastTextKeyedVectors.load('187/model.model')

# pickled logistic-regression classifiers for positive/negative sentiment
pos_log_reg = pkl.load(open('pickles/pos_log_reg.pkl', 'rb'))
neg_log_reg = pkl.load(open('pickles/neg_log_reg.pkl', 'rb'))
pos_log_reg_dost = pkl.load(open('pickles/pos_log_reg_dost.pkl', 'rb'))
neg_log_reg_dost = pkl.load(open('pickles/neg_log_reg_dost.pkl', 'rb'))

# merge the labelled training data with the original messages
old_data = pd.read_pickle('data/new_data.pkl')
old_data['index'] = old_data.index
training_data = pd.read_csv('data/training_data_with_razmetka_final.csv')
data_new = training_data.merge(old_data, on=['index', 'message'])

# truncate timestamps to calendar dates
cut_date = lambda x: datetime.date(x.year, x.month, x.day)
data_new['local_datetime'] = pd.to_datetime(data_new.local_datetime).apply(cut_date)
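# The snippet above only loads the pieces; how they fit together is not shown.
# Below is a hedged sketch of a plausible featurization step: each lemmatized
# message mapped to the mean of its fastText word vectors before being scored
# by the pickled classifiers. The mean-pooling strategy and the
# message_to_vector helper are illustrative assumptions, not the project's
# confirmed pipeline:
import numpy as np

def message_to_vector(message):
    """Average the fastText vectors of a message's lemmatized tokens (illustrative)."""
    lemmas = [morph.parse(tok)[0].normal_form for tok in message.split()]
    vectors = []
    for lemma in lemmas:
        try:
            vectors.append(fasttext.word_vec(lemma))
        except KeyError:  # all of the word's ngrams are absent from the model
            continue
    if not vectors:
        return np.zeros(fasttext.vector_size)
    return np.mean(vectors, axis=0)

features = np.vstack(data_new['message'].apply(message_to_vector))
pos_scores = pos_log_reg.predict_proba(features)[:, 1]  # hypothetical usage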