class SearchEngine:
    """Rank stored articles against a given article.

    A FastText model is fitted on article ``appendix`` strings when the engine
    is constructed. ``find_match`` then scores every other article by the
    number of shared named entities plus the FastText similarity between the
    two ``appendix`` strings.
    """

    def __init__(self, connection_provider, ft_embed_size, workers=None):
        """Build the FastText model from the provider's articles.

        Args:
            connection_provider: Object exposing ``get_all_articles()`` (whose
                result offers ``.iterator()``) and ``get_article(id)``.
            ft_embed_size: Forwarded to FastText as its ``size`` argument.
            workers: Number of training threads. ``None`` or a non-positive
                value selects one thread per CPU core.
        """
        print('Search engine initialization')
        self.conn_provider = connection_provider
        # gensim starts exactly `workers` training threads. The previous
        # hard-coded -1 therefore started none and train() returned without
        # touching the model, so a positive count is resolved here instead.
        self.fasttext = FastText(
            size=ft_embed_size,
            window=3,
            min_count=1,
            iter=100,
            workers=self._resolve_workers(workers),
            min_n=1,
            max_n=5,
        )

        # Every appendix is fed as a one-token "sentence". Collection stops
        # after the article at index 3000, i.e. at most 3001 names are used.
        name_corpus = []
        articles = self.conn_provider.get_all_articles().iterator()
        for i, app in enumerate(articles):
            name_corpus.append([app.appendix])
            if i >= 3000:
                break

        self.fasttext.build_vocab(name_corpus)
        corp_count = self.fasttext.corpus_count
        # train() is given its own explicit epoch count.
        n_iter = 1000
        self.fasttext.train(
            name_corpus, total_examples=corp_count, epochs=n_iter)
        print('Done')

    @staticmethod
    def _resolve_workers(workers):
        """Return a usable (>= 1) training thread count."""
        if workers is not None and workers > 0:
            return workers
        import os  # local import: the file's import block is not visible here
        return os.cpu_count() or 1

    def find_match(self, new_id, top_n=5):
        """Return the best-matching articles for the article ``new_id``.

        Each candidate's score is
        ``(shared_entities + appendix_similarity) / (len(new_entities) + 1)``
        where ``shared_entities`` counts the entries of the new article's
        ``named_entities`` that also occur in the candidate's.

        Args:
            new_id: Identifier passed to ``conn_provider.get_article``.
            top_n: Maximum number of results to return.

        Returns:
            A list of ``(global_id, score)`` tuples sorted by descending
            score. Ties keep provider iteration order. The article itself is
            excluded.
        """
        new = self.conn_provider.get_article(new_id)
        new_entities = new.named_entities
        # +1 accounts for the similarity term added to the overlap count.
        normaliser = len(new_entities) + 1

        scores = []
        for candidate in self.conn_provider.get_all_articles().iterator():
            if candidate.global_id == new.global_id:
                continue
            cand_entities = set(candidate.named_entities)
            overlap = sum(1 for ent in new_entities if ent in cand_entities)
            similarity = self.fasttext.similarity(
                new.appendix, candidate.appendix)
            scores.append(
                (candidate.global_id, (overlap + similarity) / normaliser))

        # Negated key + stable sort: descending score, ties in input order.
        scores.sort(key=lambda pair: -pair[1])
        return scores[:top_n]
size=25, window=5, min_count=2, workers=4, sg=1) if 0: from gensim.models import FastText model_rw = FastText(sentences=desc_token, size=25, window=5, min_count=2, workers=4, sg=1) model_rw.similarity('12 Linajes Reserva 2012', '13th Street Burger Blend Gamay Pinot Noir VQA') model_rw.save("word2vec_model_test.model") # model_rw_load = Word2Vec.load("word2vec_model_test.model") model_rw.wv.most_similar('dri fruit') model_rw.wv.vectors.shape len(model_rw.wv.vocab) model_rw.wv.vocab model_rw.vocabulary model_rw.wv.similarity('dri', 'fruit')
) #Saving The Model modelSGw2v.wv.save_word2vec_format("SGw2v.txt", binary=False) #Delete model in order not to load RAM a lot modelSGw2v = None #Creating CBOW FastText model modelCBOWFT = FastText( sentences=gensim.models.word2vec.LineSentence("path_to_data_corpus"), min_n=4, max_n=2) #Checking cosine similarity between two words modelCBOWFT.similarity('first_word', 'second_word') #Showing top 5 similar words to a given word with their cosine similarities modelCBOWFT.wv.most_similar("word", topn=5) #Checking Word Analogy modelCBOWFT.wv.most_similar( positive=["first_positive_word", "second_positive_word"], negative=["negative_word"], topn=1) #Checking syntactic and semantic, capital-country scores of the model, Intrinsic Evaluation print( f" syntactic score: - {modelCBOWFT.wv.evaluate_word_analogies('path_to_syntactic_inputs')[0]}" ) print(