def rp(dataframe, num_topics=300):
    """Returns an RP model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or generated
    then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    num_topics : int (default is 300)
        The number of topics to train the RP model with.

    Returns
    -------
    model : Gensim RpModel
        RP model for documents stored in the DataFrame.
    """
    filename = 'caches/models/rp.model'
    if not os.path.isfile(filename):
        dictionary = dictionary_corpus(dataframe)
        bow = bow_corpus(dataframe)
        tfidf_model = tfidf(dataframe)
        tfidf_corpus = tfidf_model[bow]
        rp_model = RpModel(tfidf_corpus, id2word=dictionary, num_topics=num_topics)
        rp_model.save(filename)
    else:
        rp_model = RpModel.load(filename)
    return rp_model
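# A minimal, self-contained sketch of the same TF-IDF -> RP pipeline that the
# function above caches, on a toy corpus. All names here are illustrative and
# not part of the original module.
from gensim.corpora import Dictionary
from gensim.models import RpModel, TfidfModel

toy_docs = [["human", "interface", "computer"],
            ["graph", "trees"],
            ["graph", "minors", "survey"]]
toy_dictionary = Dictionary(toy_docs)
toy_bow = [toy_dictionary.doc2bow(doc) for doc in toy_docs]
toy_tfidf = TfidfModel(toy_bow)

# Train the RP model on the TF-IDF-weighted corpus, then project one document.
toy_rp = RpModel(toy_tfidf[toy_bow], id2word=toy_dictionary, num_topics=2)
print(toy_rp[toy_tfidf[toy_bow[0]]])  # sparse list of (dimension, weight) pairs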
def rpmodel(self, corpus_t=None, topic=200, save=False, savename=None):
    """Train a Random Projections model on the given corpus.

    :param corpus_t: the (weighted) bag-of-words corpus to project
    :param topic: number of topics (dimensions) for the projection
    :param save: whether to persist the trained model to disk
    :param savename: path of the file to save the model to
    :return: the trained Gensim RpModel
    """
    print('using Random Projections model...')
    rp_model = RpModel(corpus=corpus_t, id2word=self.word_dict, num_topics=topic)
    if save:
        print('Saving RP model to file: {}'.format(savename))
        rp_model.save(savename)
    return rp_model
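# Hedged usage sketch of the save/load round trip that the method above (and
# the loaders further below) rely on; the corpus and file name are toy
# stand-ins, not values from the original code.
from gensim.corpora import Dictionary
from gensim.models import RpModel

docs = [["human", "interface"], ["graph", "trees"], ["graph", "minors", "survey"]]
dictionary = Dictionary(docs)
bow = [dictionary.doc2bow(doc) for doc in docs]

rp_model = RpModel(bow, id2word=dictionary, num_topics=2)
rp_model.save('rp.model')            # hypothetical path
restored = RpModel.load('rp.model')  # same API the loaders below use
print(restored[bow[0]])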
def create_doc_term_matrix(docs, id2word, tfidf=False, logentropy=False, random_projections=False):
    doc_term_matrix = [id2word.doc2bow(doc) for doc in docs]
    _save_model2(doc_term_matrix, 'doc_term_matrix')
    # Note: if more than one flag is set, each model is fit on the output of
    # the previous transformation, so the order below (RP, then TF-IDF, then
    # log-entropy) matters.
    if random_projections:
        rp_model = RpModel(corpus=doc_term_matrix, id2word=id2word, num_topics=params['num_topics'])
        doc_term_matrix = rp_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_random_projections')
    if tfidf:
        tfidf_model = TfidfModel(id2word=id2word, corpus=doc_term_matrix, normalize=True)
        doc_term_matrix = tfidf_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_tfidf')
    if logentropy:
        log_model = LogEntropyModel(corpus=doc_term_matrix, normalize=True)
        doc_term_matrix = log_model[doc_term_matrix]
        _save_model2(doc_term_matrix, 'doc_term_matrix_logentropy')
    return doc_term_matrix
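# Log-entropy weighting is the least common of the three options above; this
# is a small self-contained sketch of what that branch computes (toy data
# only, independent of the helpers used in the original function).
from gensim.corpora import Dictionary
from gensim.models import LogEntropyModel

le_docs = [["graph", "minors", "survey"], ["graph", "trees"], ["human", "interface"]]
le_dictionary = Dictionary(le_docs)
le_bow = [le_dictionary.doc2bow(doc) for doc in le_docs]

# Dampens raw counts with a log, down-weights terms spread evenly across
# documents (the entropy part), and L2-normalizes each document vector.
log_model = LogEntropyModel(le_bow, normalize=True)
print(list(log_model[le_bow]))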
def compute(self):
    vec_texts = [text.split() for text in self.texts]
    if self.debug:
        write("\n " + "-> Computing the dictionary".ljust(50, '.'))
    dictionary = Dictionary(vec_texts)
    if self.debug:
        write("[OK]")
    if self.debug:
        write("\n " + "-> Creating the bag-of-words space".ljust(50, '.'))
    corpus = [dictionary.doc2bow(vec) for vec in vec_texts]
    if self.debug:
        write("[OK]")
    if self.debug:
        write("\n " + ("-> Creating the %s space" % self.method).ljust(50, '.'))
    tfidf_space = TfidfModel(corpus)
    tfidf_corpus = tfidf_space[corpus]
    if self.method == 'TFIDF':
        self.space = tfidf_space
        self.index = MatrixSimilarity(tfidf_corpus)
    elif self.method == 'LSI':
        self.space = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])
    elif self.method == 'RP':
        self.space = RpModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])
    elif self.method == 'LDA':
        self.space = ldamodel.LdaModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])
    self.dictionary = dictionary
    if self.debug:
        write("[OK]\n")
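# A self-contained sketch of how the RP branch's similarity index answers a
# query, mirroring compute() with method == 'RP' on toy data. Note that a
# query must pass through the same transformations as the indexed corpus.
from gensim.corpora import Dictionary
from gensim.models import RpModel, TfidfModel
from gensim.similarities import MatrixSimilarity

texts = [["human", "interface", "computer"], ["graph", "trees"], ["graph", "minors", "survey"]]
q_dictionary = Dictionary(texts)
q_corpus = [q_dictionary.doc2bow(t) for t in texts]
q_tfidf = TfidfModel(q_corpus)
q_tfidf_corpus = q_tfidf[q_corpus]
space = RpModel(q_tfidf_corpus, id2word=q_dictionary, num_topics=2)
index = MatrixSimilarity(space[q_tfidf_corpus])

query = q_dictionary.doc2bow(["graph", "survey"])
sims = index[space[q_tfidf[query]]]  # cosine similarity against every document
print(list(enumerate(sims)))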
def fit_model(self, corpus: List):
    """
    This method creates the model, using Gensim Random Projection.
    The model isn't returned; it is stored in the 'model' class attribute.
    """
    dictionary = Dictionary(corpus)
    # RpModel expects a bag-of-words corpus of (token_id, count) pairs, not
    # raw token lists, so convert each document with doc2bow first.
    bow = [dictionary.doc2bow(doc) for doc in corpus]
    self.model = RpModel(bow, id2word=dictionary, **self.additional_parameters)
def fit(self):
    """
    This method creates the model, using Gensim Random Projection.
    The model isn't returned; it is stored in the 'model' class attribute.
    """
    corpus = self.extract_corpus()
    dictionary = Dictionary(corpus)
    # As above, RpModel operates on a bag-of-words corpus, not token lists.
    bow = [dictionary.doc2bow(doc) for doc in corpus]
    self.model = RpModel(bow, id2word=dictionary)
def __init__(self, file_path: str, embedding_type: str):
    super().__init__()
    self.__file_path: str = file_path
    embedding_type = embedding_type.lower()
    if embedding_type == "word2vec":
        self.model = KeyedVectors.load_word2vec_format(self.__file_path, binary=True)
    elif embedding_type == "doc2vec":
        self.model = Doc2Vec.load(self.__file_path)
    elif embedding_type == "fasttext":
        self.model = fasttext.load_facebook_vectors(self.__file_path)
    elif embedding_type == "ri":
        # The "ri" type is backed by Gensim's RpModel.
        self.model = RpModel.load(self.__file_path)
    else:
        raise ValueError(
            "Must specify a valid embedding model type for loading from binary file"
        )
def load_model(self):
    return RpModel.load(self.reference)
def main(chosen_model_no=0, num_items_displayed=10, use_spacy=False, use_soft_cosine_similarity=False,
         num_topics=None, no_below=5, no_above=0.5, normalize_vectors=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if num_topics is None:
        num_topics = 100

    possible_model_names = [
        'tf_idf',                 # 0
        'lsi_bow', 'lsi_tf_idf',  # 1, 2
        'rp_bow', 'rp_tf_idf',    # 3, 4
        'lda_bow', 'lda_tf_idf',  # 5, 6
        'hdp_bow', 'hdp_tf_idf',  # 7, 8
        'word2vec',               # 9
    ]
    chosen_model_name = possible_model_names[chosen_model_no]
    print(chosen_model_name)

    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    nlp = spacy.load('en_core_web_lg')

    documents = list(steam_tokens.values())

    dct = Dictionary(documents)
    print(len(dct))
    dct.filter_extremes(no_below=no_below, no_above=no_above)
    print(len(dct))

    corpus = [dct.doc2bow(doc) for doc in documents]

    # Pre-processing
    pre_process_corpus_with_tf_idf = chosen_model_name.endswith('_tf_idf')
    tfidf_model = TfidfModel(corpus, id2word=dct, normalize=normalize_vectors)
    if pre_process_corpus_with_tf_idf:
        # Caveat: the leading underscore in '_tf_idf' is important. Do not use
        # this pre-processing if the chosen model is Tf-Idf itself!
        print('Corpus as Tf-Idf')
        pre_processed_corpus = tfidf_model[corpus]
    else:
        print('Corpus as Bag-of-Words')
        pre_processed_corpus = corpus

    # Model
    model = None
    wv = None
    index2word_set = None

    if chosen_model_name == 'tf_idf':
        print('Term Frequency * Inverse Document Frequency (Tf-Idf)')
        model = tfidf_model
    elif chosen_model_name.startswith('lsi'):
        print('Latent Semantic Indexing (LSI/LSA)')
        model = LsiModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)
    elif chosen_model_name.startswith('rp'):
        print('Random Projections (RP)')
        model = RpModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)
    elif chosen_model_name.startswith('lda'):
        print('Latent Dirichlet Allocation (LDA)')
        model = LdaModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)
    elif chosen_model_name.startswith('hdp'):
        print('Hierarchical Dirichlet Process (HDP)')
        model = HdpModel(pre_processed_corpus, id2word=dct)
    elif chosen_model_name == 'word2vec':
        use_a_lot_of_ram = False
        if use_a_lot_of_ram:
            model = None
            print('Loading Word2Vec based on Google News')
            # Warning: this takes a lot of time and uses a ton of RAM!
            wv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
        else:
            if use_spacy:
                print('Using Word2Vec with spaCy')
            else:
                print('Training Word2Vec')
                model = Word2Vec(documents)
                wv = model.wv
        if not use_spacy:
            wv.init_sims(replace=normalize_vectors)
            index2word_set = set(wv.index2word)
    else:
        print('No model specified.')
        model = None

    if chosen_model_name != 'word2vec':
        if not use_soft_cosine_similarity:
            index = MatrixSimilarity(model[pre_processed_corpus], num_best=10, num_features=len(dct))
        else:
            w2v_model = Word2Vec(documents)
            similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
            similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dct, tfidf_model, nonzero_limit=100)
            index = SoftCosineSimilarity(model[pre_processed_corpus], similarity_matrix)
    else:
        index = None

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    matches_as_app_ids = []

    for query_count, query_app_id in enumerate(query_app_ids):
        print('[{}/{}] Query appID: {} ({})'.format(query_count + 1, len(query_app_ids),
                                                    query_app_id, get_app_name(query_app_id, game_names)))

        query = steam_tokens[str(query_app_id)]

        if use_spacy:
            spacy_query = Doc(nlp.vocab, query)
        else:
            spacy_query = None

        if chosen_model_name != 'word2vec':
            vec_bow = dct.doc2bow(query)
            if pre_process_corpus_with_tf_idf:
                pre_processed_vec = tfidf_model[vec_bow]
            else:
                pre_processed_vec = vec_bow
            vec_lsi = model[pre_processed_vec]
            sims = index[vec_lsi]

            if use_soft_cosine_similarity:
                sims = enumerate(sims)

            similarity_scores_as_tuples = [(str(app_ids[i]), sim) for (i, sim) in sims]
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
        else:
            if use_spacy:
                similarity_scores = {}
                for app_id in steam_tokens:
                    reference_sentence = steam_tokens[app_id]
                    spacy_reference = Doc(nlp.vocab, reference_sentence)
                    similarity_scores[app_id] = spacy_query.similarity(spacy_reference)
            else:
                query_sentence = filter_out_words_not_in_vocabulary(query, index2word_set)

                similarity_scores = {}

                counter = 0
                num_games = len(steam_tokens)
                for app_id in steam_tokens:
                    counter += 1
                    if (counter % 1000) == 0:
                        print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

                    reference_sentence = steam_tokens[app_id]
                    reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence, index2word_set)

                    try:
                        similarity_scores[app_id] = wv.n_similarity(query_sentence, reference_sentence)
                    except ZeroDivisionError:
                        similarity_scores[app_id] = 0

        similar_app_ids = print_most_similar_sentences(similarity_scores,
                                                       num_items_displayed=num_items_displayed, verbose=False)
        matches_as_app_ids.append(similar_app_ids)

    print_ranking(query_app_ids, matches_as_app_ids, only_print_banners=True)

    return
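# Small self-contained illustration of the wv.n_similarity() call used in the
# word2vec branch above (toy sentences; exact scores vary between runs). The
# try/except around that call exists because Gensim raises ZeroDivisionError
# when either word list is empty, e.g. a query whose words are all out of
# vocabulary.
from gensim.models import Word2Vec

sentences = [["game", "strategy", "turn", "based"],
             ["game", "action", "shooter"],
             ["puzzle", "game", "relaxing"]]
w2v = Word2Vec(sentences, min_count=1, seed=0)
print(w2v.wv.n_similarity(["strategy", "game"], ["action", "game"]))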
def lsa_fact(tf, tokenized_docs):
    # Despite the name, this builds a Random Projections model, not LSA/LSI.
    # 'num_topics' is assumed to be defined at module level; 'tokenized_docs'
    # is unused here.
    rp_model = RpModel(corpus=tf.distance_matrix, id2word=tf.id2Word, num_topics=num_topics)
    return rp_model[tf.distance_matrix]
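# Since the function above is named lsa_fact but builds an RP model, here is a
# hedged side-by-side sketch of the two techniques on toy data: LSI learns its
# projection from the corpus (truncated SVD), while RP multiplies by a
# data-independent random matrix, trading some accuracy for speed and memory.
from gensim.corpora import Dictionary
from gensim.models import LsiModel, RpModel

cmp_docs = [["human", "interface"], ["graph", "trees"], ["graph", "minors", "survey"]]
cmp_dictionary = Dictionary(cmp_docs)
cmp_bow = [cmp_dictionary.doc2bow(doc) for doc in cmp_docs]

lsi = LsiModel(cmp_bow, id2word=cmp_dictionary, num_topics=2)
rp = RpModel(cmp_bow, id2word=cmp_dictionary, num_topics=2)
print(lsi[cmp_bow[0]])
print(rp[cmp_bow[0]])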