def create_full_corpus(n_topics=num_topics):
    lda_bow = LdaModel.load(os.path.join(models_path, 'lda_bow_multi'))
    print('Loaded model')

    docs = read_ap.get_processed_docs()
    docs = [d for i, d in docs.items()]

    with open('./objects/dictionary_lda', 'rb') as f:
        dictionary = pkl.load(f)

    # creating bow
    print('creating bow corpus')
    corpus_bow = [dictionary.doc2bow(d) for d in docs]

    # creating binary bow
    print('creating binary bow')
    corpus_binary = [[(i, 1) for i, _ in d] for d in corpus_bow]

    # convert every document to a dense topic vector
    corpus_full = [
        sparse2full(t_doc, n_topics) for t_doc in lda_bow[corpus_binary]
    ]

    with open('./objects/lda_bow_full', 'wb') as f:
        pkl.dump(corpus_full, f)

    return corpus_full
def train(n_topics=num_topics):
    docs = read_ap.get_processed_docs()
    docs = [d for i, d in docs.items()]

    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)

    # save the dictionary
    with open(os.path.join(folder_path_objects, 'dictionary_lsi_bow'), 'wb') as f:
        pickle.dump(dictionary, f)

    # create binary and regular bow corpus
    corpus_bow = [dictionary.doc2bow(d) for d in docs]
    corpus_binary = [[(i, 1) for i, _ in d] for d in corpus_bow]

    # save corpus
    with open(os.path.join(folder_path_objects, 'corpus_binary'), 'wb') as f:
        pickle.dump(corpus_binary, f)

    # create model
    print(f'{time.ctime()} Start training LSI (binary bow)')
    lsi_bin = LsiModel(
        corpus=corpus_binary,
        id2word=dictionary,
        chunksize=1000,
        num_topics=n_topics
    )

    # save model to disk
    os.makedirs(folder_path_models, exist_ok=True)
    lsi_bin.save('./models/lsi_bin_filtered')
def data_loader():
    '''
    Loads the documents (by id in a dict and concatenated in a list)
    and the word2id/id2word dicts.
    '''
    # Load documents
    if not os.path.exists("./pickles/processed_docs.pkl"):
        docs_by_id = ra.get_processed_docs()
    else:
        with open("./pickles/processed_docs.pkl", "rb") as reader:
            docs_by_id = pkl.load(reader)

    # Load word2id and id2word dicts
    if not os.path.exists("./pickles/word2id.pkl"):
        print("constructing word2id and id2word dicts")
        word2id, id2word = data_utils.counter_to_dicts(docs_by_id)
    else:
        with open("./pickles/word2id.pkl", "rb") as reader:
            word2id = pkl.load(reader)
        with open("./pickles/id2word.pkl", "rb") as reader:
            id2word = pkl.load(reader)

    # Load word2vec corpus
    if not os.path.exists("./pickles/word2vec_corpus.pkl"):
        print("creating train_corpus")
        word2vec_corpus = data_utils.create_word2vec_corpus(docs_by_id, word2id)
    else:
        with open("./pickles/word2vec_corpus.pkl", "rb") as reader:
            word2vec_corpus = pkl.load(reader)

    return word2id, id2word, word2vec_corpus, docs_by_id
def get_ranking(n_topics=num_topics):
    '''Get the ranking for all queries.'''
    # load queries
    qrels, queries = read_ap.read_qrels()

    # load model
    lda_bow = LdaModel.load(os.path.join(models_path, 'lda_bow_multi'))

    # load corpus of full vectors
    with open('./objects/lda_bow_full', 'rb') as f:
        corpus_full = pkl.load(f)

    # load dictionary
    with open('./objects/dictionary_lda', 'rb') as f:
        dictionary = pkl.load(f)

    # process docs and map corpus indices back to document ids
    processed_docs = read_ap.get_processed_docs()
    doc_keys = processed_docs.keys()
    idx2key = {i: key for i, key in enumerate(doc_keys)}

    overall_ser = {}

    # loop over queries
    for qid in tqdm(qrels):
        query_text = queries[qid]
        sims = get_sims(lda_bow, query_text, corpus_full, dictionary, n_topics)
        overall_ser[qid] = dict(
            (idx2key[idx], np.float64(score)) for idx, score in sims
        )

    with open('./objects/overal_ser_lda', 'wb') as f:
        pkl.dump(overall_ser, f)
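# A minimal evaluation sketch, not part of the original pipeline: score the
# saved overall_ser with pytrec_eval (MAP and NDCG), mirroring the doc2vec
# evaluation elsewhere in this repo. The output JSON path is an assumption.
import json
import pickle as pkl

import pytrec_eval

import read_ap

qrels, _ = read_ap.read_qrels()
with open('./objects/overal_ser_lda', 'rb') as f:
    overall_ser = pkl.load(f)

evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
metrics = evaluator.evaluate(overall_ser)

with open('./results/lda_bow.json', 'w') as writer:
    json.dump(metrics, writer, indent=1)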
def get_model(idx):
    # TF-IDF MODEL
    if idx == 1:
        docs_by_id = read_ap.get_processed_docs()
        model = TfIdfRetrieval(docs_by_id)
        return model
    # LSI BINARY MODEL
    elif idx == 2:
        return lsi_lda.LSIRetrieval('binary')
    # LSI TF-IDF MODEL
    elif idx == 3:
        return lsi_lda.LSIRetrieval('tfidf')
    # LDA MODEL
    elif idx == 4:
        return lsi_lda.LDARetrieval()
    # WORD2VEC MODEL
    elif idx == 5:
        return analysis.Word2Vec()
    # LSI BINARY, 5 TOPICS
    elif idx == 12:
        return lsi_lda.LSIRetrieval('binary', path="lsi/5topics", num_topics=5)
def get_corpus(self):
    docs_by_id = read_ap.get_processed_docs()
    docs = [doc for doc_id, doc in docs_by_id.items()]
    doc_bows = [self.dictionary.doc2bow(doc) for doc in docs]
    # binary bag-of-words: keep the term ids, drop the counts
    corpus = [[(idx, 1) for idx, _ in bow] for bow in doc_bows]
    return corpus
def get_data():
    print("Loading data ...")
    # load preprocessed data
    download_ap.download_dataset()
    docs_by_id = read_ap.get_processed_docs()
    return docs_by_id
def __init__(self, path='lda/', num_topics=500):
    self.path = path
    self.dictionary = self.get_dictionary()
    self.model = self.get_model(num_topics)
    self.index = self.get_index()
    # map similarity-index positions back to document ids
    self.doc_index_map = {
        i: doc_id
        for i, (doc_id, _) in enumerate(read_ap.get_processed_docs().items())
    }
def __init__(self, model_type, path='lsi/', num_topics=500):
    assert model_type in ("binary", "tfidf"), \
        "accepted model_type: 'binary' or 'tfidf'"
    self.path = path
    self.model_type = model_type
    self.dictionary = self.get_dictionary()
    self.model = self.get_model(num_topics)
    self.index = self.get_index()
    # map similarity-index positions back to document ids
    self.doc_index_map = {
        i: doc_id
        for i, (doc_id, _) in enumerate(read_ap.get_processed_docs().items())
    }
def get_dictionary(self):
    tmp_fname = self.path + "lda.dictionary"
    if os.path.exists(tmp_fname):
        return Dictionary.load_from_text(tmp_fname)
    else:
        print("Creating dictionary.")
        docs_by_id = read_ap.get_processed_docs()
        docs = [doc for doc_id, doc in docs_by_id.items()]
        dictionary = Dictionary(docs)
        dictionary.save_as_text(tmp_fname)
        return dictionary
def get_corpus(self):
    docs_by_id = read_ap.get_processed_docs()
    docs = [doc for doc_id, doc in docs_by_id.items()]
    doc_bows = [self.dictionary.doc2bow(doc) for doc in docs]
    if self.model_type == "binary":
        # binary bag-of-words: keep the term ids, drop the counts
        corpus = [[(idx, 1) for idx, _ in bow] for bow in doc_bows]
    elif self.model_type == "tfidf":
        # tf-idf weighting: log(1 + tf) divided by the document frequency
        df = self.dictionary.dfs
        corpus = [[(idx, np.log(1 + tf) / df[idx]) for idx, tf in bow]
                  for bow in doc_bows]
    return corpus
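# A toy illustration (not from the original code) of the two weighting
# schemes used above, applied to one hypothetical bag-of-words document;
# the term ids and document frequencies here are made up.
import numpy as np

bow = [(0, 3), (5, 1)]    # (term id, term frequency)
dfs = {0: 120, 5: 7}      # hypothetical document frequencies

binary = [(idx, 1) for idx, _ in bow]
tfidf = [(idx, np.log(1 + tf) / dfs[idx]) for idx, tf in bow]

print(binary)   # [(0, 1), (5, 1)]
print(tfidf)    # [(0, log(4) / 120), (5, log(2) / 7)]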
def get_dictionary(self):
    tmp_fname = self.path + self.model_type + "_dictionary"
    if os.path.exists(tmp_fname):
        return Dictionary.load_from_text(tmp_fname)
    else:
        print("Creating dictionary.")
        docs_by_id = read_ap.get_processed_docs()
        docs = [doc for doc_id, doc in docs_by_id.items()]
        dictionary = Dictionary(docs)
        # drop terms in fewer than 20 docs or in more than 50% of all docs
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        dictionary.save_as_text(tmp_fname)
        return dictionary
def __init__(self, window_size, vocab_size):
    # ensure dataset is downloaded
    download_ap.download_dataset()

    # pre-process the text
    docs_by_id = read_ap.get_processed_docs()

    self.word2id = dict()
    self.id2word = dict()
    self.window_size = window_size
    self.vocab_size = vocab_size
    self.docs_by_id = docs_by_id

    self.read_words(vocab_size)
def main():
    docs_by_id = ra.get_processed_docs()
    path = "./doc2vec_models/{}".format(config.model_name)

    if not os.path.exists(path):
        print("Model not yet trained, starting training now.")
        train_corpus = create_corpus(docs_by_id)
        model = train_doc2vec(train_corpus)
    else:
        print("Model already trained, loading the file.")
        model = gensim.models.doc2vec.Doc2Vec.load(path)

    qrels, queries = ra.read_qrels()
    print(queries)

    overall_ser = {}
    trec_path = "./results/trec_doc2vec.csv"

    # Write TREC results column headers to file
    with open(trec_path, "w") as f:
        f.write("query-id, Q0, document-id, rank, score, STANDARD\n")

    print("Evaluating doc2vec model:", config.model_name)

    # Loop over all queries and predict the most relevant docs
    for qid in tqdm(qrels):
        query_text = queries[qid]
        results, trec_results = rank_docs(model, query_text, qid,
                                          config.model_name)
        results = dict(results)
        overall_ser[qid] = results

        # Write all test queries to the TREC format file
        if not int(qid) in range(76, 100):
            with open(trec_path, "a+") as f:
                f.write("\n".join("{},{},{},{},{},{}".format(*x)
                                  for x in trec_results))
                f.write("\n")

    # run evaluation with `qrels` as the ground-truth relevance judgements;
    # here, we are measuring MAP and NDCG
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)

    # dump this to JSON
    # *Not* optional - this is submitted in the assignment!
    json_path = "./results/{}.json".format(config.model_name)
    with open(json_path, "w") as writer:
        json.dump(metrics, writer, indent=1)
def individual_query(query_text):
    docs = read_ap.get_processed_docs()
    doc_keys = docs.keys()
    idx2key = {i: key for i, key in enumerate(doc_keys)}

    # load model
    lda_bow = LdaModel.load(os.path.join(models_path, 'lda_bow_multi'))

    # load corpus of full vectors
    with open('./objects/lda_bow_full', 'rb') as f:
        corpus_full = pkl.load(f)

    # load dictionary
    with open('./objects/dictionary_lda', 'rb') as f:
        dictionary = pkl.load(f)

    sims = get_sims(lda_bow, query_text, corpus_full, dictionary, num_topics)
    ranking = dict((idx2key[idx], np.float64(score)) for idx, score in sims)
    return ranking
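# A hypothetical usage sketch (the query text is made up): rank all
# documents for a single query and print the ten highest-scoring doc ids.
ranking = individual_query("stock market crash")
for doc_id, score in sorted(ranking.items(), key=lambda kv: -kv[1])[:10]:
    print(doc_id, score)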
def preprocess(path=PROCESSED_DOCS_PATH):
    # Load the preprocessed docs_by_id file if it exists.
    if os.path.exists(path):
        print("Loading the preprocessed files...")
        with open(path, "rb") as reader:
            return pickle.load(reader)

    # (Down)load the dataset from the AP files and get it in the right form.
    download_ap.download_dataset()
    docs_by_id = read_ap.get_processed_docs()

    print("Filtering infrequent words...")
    docs_by_id = filter_infrequent(docs_by_id)

    print("Converting words to indices...")
    tok2idx, id2corpus = all_words_to_indices(docs_by_id)

    # Store the preprocessing results for faster future retrieval.
    print("Storing the preprocessed files...")
    with open(path, "wb") as writer:
        pickle.dump((tok2idx, id2corpus), writer)

    return tok2idx, id2corpus
def train(config):
    print(f"Training vec dim: {config.vector_dim}, "
          f"window size: {config.window_size}, "
          f"Vocab size: {config.vocab_size}")

    if not os.path.exists(config.model_file) or config.t:
        print("\n### Reading in the documents ###\n")
        docs_by_id = read_ap.get_processed_docs()

        print("\n### Converting to gensim standards ###\n")
        train_docs = list(AP2Gensim(docs_by_id))

        model = gensim.models.doc2vec.Doc2Vec(vector_size=config.vector_dim,
                                              window=config.window_size,
                                              min_count=config.min_count,
                                              dm=0,
                                              max_vocab_size=config.vocab_size,
                                              epochs=config.epochs)

        print("\n### Building vocab ###\n")
        model.build_vocab(train_docs)

        print("\n### Training model ###\n")
        model.train(train_docs, total_examples=model.corpus_count,
                    epochs=model.epochs)

        print("\n### Saving model ###\n")
        model.save(config.model_file)
    else:
        print("A model already exists so skipping training")
def search_doc2vec(model, query, docs_by_id=None,
                   result_len=MAX_NUMBER_OF_RESULTS):
    if docs_by_id is None:
        docs_by_id = read_ap.get_processed_docs()

    # Deleting the temporary training data is advised on the official
    # gensim website.
    model.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)

    print("Comparing the query embedding with all document embeddings...")

    # Get the cosine similarity of the query to every document.
    q_vec = model.infer_vector([q_tok for q_tok in read_ap.process_text(query)])
    q_vec = torch.FloatTensor(q_vec).unsqueeze(dim=0)
    cos = torch.nn.CosineSimilarity()

    results = {}
    for doc_id, doc in docs_by_id.items():
        vec = torch.FloatTensor(model.infer_vector(doc)).unsqueeze(dim=0)
        results[doc_id] = float(cos(vec, q_vec))

    # Rank by descending similarity and keep the top results.
    results = list(results.items())
    results.sort(key=lambda _: -_[1])
    return results[:result_len]
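# A hypothetical usage sketch: load a trained doc2vec model and print the
# top-ranked documents for a query. The model path and query text are
# assumptions.
import gensim

model = gensim.models.doc2vec.Doc2Vec.load("./doc2vec_models/example_model")
for doc_id, score in search_doc2vec(model, "presidential election campaign"):
    print(doc_id, score)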
def main():
    docs_by_id = get_processed_docs()
    doc_ids = list(docs_by_id)

    documents = []
    for key in docs_by_id.keys():
        doc = docs_by_id[key]
        documents.append(doc)

    # construct dictionary and corpus
    dictionary, bow_corpus, tfidf_corpus = create_corpus_and_dict(documents)

    model_path = "./LDA_MODELS/BOW_LDA_{}_TOPICS.model".format(config.topics)
    if not os.path.exists(model_path):
        print("Starting LDA with topics = {} with BOW training now.".format(
            config.topics))
        BOW_LDA = train_LDA(bow_corpus, dictionary, config.topics)
        BOW_LDA.save(model_path)
    else:
        print("LDA with BOW already trained, loading the file.")
        BOW_LDA = LdaModel.load(model_path)
def train(n_topics=num_topics):
    docs = read_ap.get_processed_docs()
    docs = [d for i, d in docs.items()]

    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)

    # save the dictionary
    with open(os.path.join(folder_path_objects, 'dictionary_lsi_bow'), 'wb') as f:
        pickle.dump(dictionary, f)

    # create regular bow corpus
    corpus_bow = [dictionary.doc2bow(d) for d in docs]

    # create tf-idf corpus
    tfidf = TfidfModel(corpus_bow)
    corpus_tfidf = tfidf[corpus_bow]

    with open(os.path.join(folder_path_objects, 'corpus_lsi_tfidf'), 'wb') as f:
        pickle.dump(corpus_tfidf, f)

    # create model
    print(f'{time.ctime()} Start training LSI (tf-idf)')
    lsi_tfidf = LsiModel(corpus=corpus_tfidf,
                         id2word=dictionary,
                         num_topics=n_topics)

    # save model to disk
    os.makedirs(folder_path_models, exist_ok=True)

    def filepath_out(model):
        return os.path.join('models', f'{model}_{n_topics}')

    lsi_tfidf.save(filepath_out('lsi_tfidf'))
def train(n_topics=num_topics):
    '''Train the LDA model.'''
    docs = read_ap.get_processed_docs()
    docs = [d for i, d in docs.items()]

    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)

    # save the dictionary
    with open('./objects/dictionary_lda', 'wb') as f:
        pkl.dump(dictionary, f)

    # creating bow
    print('creating bow corpus')
    corpus_bow = [dictionary.doc2bow(d) for d in docs]

    # creating binary bow
    print('creating binary bow')
    corpus_binary = [[(i, 1) for i, _ in d] for d in corpus_bow]

    print(f'{time.ctime()} Start training LDA (BOW)')
    lda_bow = LdaMulticore(workers=5,
                           corpus=corpus_binary,
                           id2word=dictionary,
                           chunksize=1000,
                           num_topics=n_topics,
                           dtype=np.float64)

    # save model to disk
    os.makedirs(models_path, exist_ok=True)
    lda_bow.save(os.path.join(models_path, 'lda_bow_multi'))
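# A small follow-up sketch (an assumption, not part of the original script):
# reload the saved multicore LDA model and inspect a few of its topics.
from gensim.models import LdaModel

lda_bow = LdaModel.load(os.path.join(models_path, 'lda_bow_multi'))
for topic_id, words in lda_bow.print_topics(num_topics=5, num_words=10):
    print(topic_id, words)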
        if query_term not in self.ii:
            continue
        for (doc_id, tf) in self.ii[query_term]:
            results[doc_id] += np.log(1 + tf) / self.df[query_term]

    results = list(results.items())
    results.sort(key=lambda _: -_[1])
    return results


if __name__ == "__main__":
    # ensure dataset is downloaded
    download_ap.download_dataset()

    # pre-process the text
    docs_by_id = read_ap.get_processed_docs()

    # Create instance for retrieval
    tfidf_search = TfIdfRetrieval(docs_by_id)

    # read in the qrels
    qrels, queries = read_ap.read_qrels()

    overall_ser = {}

    print("Running TFIDF Benchmark")
    # collect results
    for qid in tqdm(qrels):
        query_text = queries[qid]
        results = tfidf_search.search(query_text)
        overall_ser[qid] = dict(results)
def main():
    docs_by_id = get_processed_docs()
    doc_ids = list(docs_by_id)

    documents = []
    for key in docs_by_id.keys():
        doc = docs_by_id[key]
        documents.append(doc)

    # construct dictionary and corpus
    dictionary, bow_corpus, tfidf_corpus = create_corpus_and_dict(documents)

    # TRAINING LOOP: train LSI models and similarity indices
    topic_args = [10, 50, 100, 500, 1000, 2000, 5000, 10000]
    for topic_num in topic_args:
        print('starting training tfidf lsi and index with topic_num {}'.format(topic_num))
        lsi, index = train_lsi(tfidf_corpus, dictionary,
                               num_topics=topic_num, corpus_type='tfidf')
        print('finished training tfidf lsi and index with topic_num {}'.format(topic_num))

        print('starting training bow lsi and index with topic_num {}'.format(topic_num))
        lsi, index = train_lsi(bow_corpus, dictionary,
                               num_topics=topic_num, corpus_type='bow')
        print('finished training bow lsi and index with topic_num {}'.format(topic_num))

    # METRICS LOOP: compute metrics
    topic_args = [10, 50, 100, 500, 1000, 2000]
    for topic_num in topic_args:
        # retrieve model and index for TFIDF, compute and store metrics
        lsi, index = train_lsi(tfidf_corpus, dictionary,
                               num_topics=topic_num, corpus_type='tfidf')
        compute_metrics(dictionary=dictionary, model=lsi, index=index,
                        corpus_type='tfidf', num_topics=topic_num,
                        doc_ids=doc_ids)

        # retrieve model and index for BOW, compute and store metrics
        lsi, index = train_lsi(bow_corpus, dictionary,
                               num_topics=topic_num, corpus_type='bow')
        compute_metrics(dictionary=dictionary, model=lsi, index=index,
                        corpus_type='bow', num_topics=topic_num,
                        doc_ids=doc_ids)

    # TOPICS: print topics of the 500-dimensional models
    tfidf_lsi, index = train_lsi(tfidf_corpus, dictionary,
                                 num_topics=500, corpus_type='tfidf')
    print("top 50 TFIDF topics")
    pprint(tfidf_lsi.print_topics(num_topics=50))

    bow_lsi, index = train_lsi(bow_corpus, dictionary,
                               num_topics=500, corpus_type='bow')
    print("top 50 BOW topics")
    pprint(bow_lsi.print_topics(num_topics=50))
import read_ap
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel

# quick sanity check on gensim's toy corpus
model = LsiModel(common_corpus, id2word=common_dictionary)
vectorized_corpus = model[common_corpus]

docs = read_ap.get_processed_docs()
print(len(docs))

# tf-idf > w2v > lsa >> d2v
def get_docs_by_id():
    return read_ap.get_processed_docs()
    metrics = evaluator.evaluate(overall_ser)

    json_filename = f"./json_files/benchmark_{model_name}.json"
    # dump to JSON
    with open(json_filename, "w") as writer:
        json.dump(metrics, writer, indent=1)

    return json_filename


if __name__ == "__main__":
    np.random.seed(42)

    # retrieve docs as a list
    processed_docs = get_processed_docs()
    docs = processed_docs.values()
    doc_keys = processed_docs.keys()
    idx2key = {i: key for i, key in enumerate(doc_keys)}

    # convert to TaggedDocuments so that gensim can work with them
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
    print(f"Docs are loaded. {len(docs)} in total\n")

    # train the model
    model, model_name = training(documents,
                                 max_vocab_size=3000000,
                                 vector_dim=300,
                                 window_size=2,
                                 verbose=True)
def get_doc_keys():
    docs = read_ap.get_processed_docs()
    return list(docs.keys())
def get_doc_list():
    """
    Process documents and convert the doc dictionary to a list of lists of tokens.
    """
    docs = read_ap.get_processed_docs()
    return list(map(list, docs.values()))
if __name__ == "__main__":
    skipgram = SkipGram()
    skipgram._train(list(ID2CORPUS.values()))
    print(search_SkipGram(skipgram, "How are you"))

    train_doc2vec(read_ap.get_processed_docs(), batched=True)