def _setup_model(self, model):
    """Resolve the word-vector model and build the term similarity index.

    Args:
        model: One of
            * a ``Word2VecKeyedVectors`` instance, which is used as-is;
            * a ``str`` naming a model, which is fetched with ``api.load``;
            * ``None``, in which case ``self.default_model`` is fetched.

    Raises:
        ValueError: If ``model`` is none of the accepted kinds.

    Sets ``self.model``, ``self.similarity_index`` and ``self.model_ready``.
    """
    if isinstance(model, Word2VecKeyedVectors):
        # Ready-made vectors were supplied; nothing has to be loaded.
        self.model = model
    elif isinstance(model, str) or model is None:
        use_default = model is None
        if self.verbose:
            if use_default:
                print(
                    f"Loading default GloVe word vector model: {self.default_model}"
                )
            else:
                print(f"Loading word vector model: {model}")
        self.model = api.load(self.default_model if use_default else model)
        if self.verbose:
            print("Model loaded Succesfully")
    else:
        raise ValueError("Unable to load word vector model")

    self.similarity_index = WordEmbeddingSimilarityIndex(self.model)
    self.model_ready = True
def W2VH():
    """Print the soft cosine similarity between two halves of a Brown category.

    The words of the Brown corpus 'mystery' category are lower-cased, split
    into a first and second half, stripped of English stop words and turned
    into bag-of-words vectors over a shared dictionary. The normalized inner
    product of the two vectors under a Word2Vec-based term similarity matrix
    is printed as ``= <value>`` with four decimals. Nothing is returned.
    """
    # Keep the text as a list of word tokens. The previous implementation
    # appended ``str(list)`` reprs to one big string, so the halves, the
    # stop-word filter and the dictionary all worked on single characters.
    tokens = [w.lower() for w in brown.words(categories='mystery')]

    middle = len(tokens) // 2
    stop_words = set(stopwords.words('english'))  # set: O(1) membership tests
    docbrown1 = [w for w in tokens[:middle] if w not in stop_words]
    docbrown2 = [w for w in tokens[middle:] if w not in stop_words]

    dictionary = corpora.Dictionary([docbrown1, docbrown2])
    bow1 = dictionary.doc2bow(docbrown1)
    bow2 = dictionary.doc2bow(docbrown2)

    # NOTE(review): the embeddings are trained on `common_texts`, not on the
    # Brown documents compared here, so few dictionary terms are likely to
    # have a vector. Left unchanged; confirm whether this is intended.
    model = Word2Vec(common_texts, size=20, min_count=1)
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

    similarity = similarity_matrix.inner_product(bow1, bow2, normalized=True)
    print('= %.4f' % similarity)
def create_softcosine_resourse(model_source, all_in_sentence):
    """Build the dictionary and term similarity matrix used for soft cosine.

    Args:
        model_source: Location passed to ``gensim.models.Word2Vec.load``.
        all_in_sentence: Documents passed to ``gensim.corpora.Dictionary``.

    Returns:
        A ``(dictionary, similarity_matrix)`` tuple.
    """
    vocabulary = gensim.corpora.Dictionary(all_in_sentence)
    w2v = gensim.models.Word2Vec.load(model_source)
    term_matrix = SparseTermSimilarityMatrix(
        WordEmbeddingSimilarityIndex(w2v.wv), vocabulary)
    return vocabulary, term_matrix
def createW2VecIndex(reference_dict):
    """Train Word2Vec on the reference terms and build a soft cosine index.

    Every item obtained by iterating ``reference_dict`` is tokenized with
    ``word_tokenize``; the token lists are used both to train the word
    vectors and to build the dictionary / bag-of-words corpus.

    Args:
        reference_dict: Iterable of term strings.

    Returns:
        A ``(docsim_index, dictionary)`` tuple. The same tuple is also pickled
        to ``./models/W2VecIndexes.bin``.
    """
    from gensim.corpora import Dictionary
    from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
    from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
    import pickle
    import time

    print("Prepare Word2Vec model")
    t1 = time.time()

    corpus = [word_tokenize(term) for term in reference_dict]

    model = Word2Vec(corpus, size=20, min_count=1)  # train word-vectors
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    dictionary = Dictionary(corpus)
    bow_corpus = [dictionary.doc2bow(document) for document in corpus]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=3)

    t2 = time.time()
    print(" W2v index and dictionary in ", (t2 - t1) / 60, " minutes")

    # Use a context manager so the file is flushed and closed; the handle was
    # previously left open.
    with open("./models/W2VecIndexes.bin", 'wb') as f:
        pickle.dump((docsim_index, dictionary), f)
    return docsim_index, dictionary
def __init__(self, documents):
    """Preprocess ``documents`` and build the GloVe term similarity matrix.

    If the first document is a list, every list document is joined with
    spaces and wrapped in a one-element list (non-list documents are dropped).
    All documents are then converted with ``str`` before ``preprocess`` runs.

    Sets ``self.corpus``, ``self.documents``, ``self.similarity_index``,
    ``self.dictionary``, ``self.tfidf`` and ``self.similarity_matrix``.

    Args:
        documents: Non-empty sequence of documents (``documents[0]`` is read).
    """
    print("Initializing GloVe")
    if isinstance(documents[0], list):
        print("It is a list")
        joined = []
        for document in documents:
            if isinstance(document, list):
                joined.append([" ".join(document)])
        documents = joined

    documents = list(map(str, documents))
    self.corpus = [
        preprocess(document) for document in documents
        if type(document) is str
    ]
    self.documents = documents

    # The similarity matrix holds the similarity between each pair of words,
    # weighted using the term frequency.
    # Loading the model involves a big file and can take a while.
    glove = api.load("glove-wiki-gigaword-50")
    print("Document loaded")
    self.similarity_index = WordEmbeddingSimilarityIndex(glove)
    self.dictionary = Dictionary(self.corpus)
    self.tfidf = TfidfModel(dictionary=self.dictionary)
    print("Model is running")

    # Term similarity matrix over the corpus dictionary.
    self.similarity_matrix = SparseTermSimilarityMatrix(
        self.similarity_index, self.dictionary, self.tfidf)
    print("Everything has been initialized")
def initializeSimilarityMatrix(self):
    """Create the term similarity index and sparse matrix on the instance.

    Reads ``self.w2v_model``, ``self.dictionary`` and ``self.tfidf``; writes
    ``self.similarity_index`` and ``self.similarity_matrix``.
    """
    term_index = WordEmbeddingSimilarityIndex(self.w2v_model)
    self.similarity_index = term_index
    self.similarity_matrix = SparseTermSimilarityMatrix(
        term_index,
        self.dictionary,
        self.tfidf,
        nonzero_limit=100,
    )
def compute_sim_matrix(self):
    """Build the soft cosine document index over ``self.questions``.

    Trains a Word2Vec model on the questions, derives a TF-IDF weighted term
    similarity matrix and stores a ``SoftCosineSimilarity`` index returning
    the 10 best matches.

    Sets ``self.dictionary``, ``self.tfidf`` and ``self.docsim_index``.
    """
    self.dictionary = Dictionary(self.questions)
    self.tfidf = TfidfModel(dictionary=self.dictionary)

    embeddings = Word2Vec(
        self.questions,
        workers=cpu_count(),
        min_count=5,
        size=300,
        seed=12345,
    )
    term_matrix = SparseTermSimilarityMatrix(
        WordEmbeddingSimilarityIndex(embeddings.wv),
        self.dictionary,
        self.tfidf,
        nonzero_limit=100,
    )

    # Bag-of-words for every question first, then TF-IDF weighting.
    bags = [self.dictionary.doc2bow(question) for question in self.questions]
    weighted = [self.tfidf[bag] for bag in bags]
    self.docsim_index = SoftCosineSimilarity(weighted, term_matrix, num_best=10)
def get_sim_index(wv_model, bow_corpus, dictionary):
    """Return a soft cosine index over ``bow_corpus`` (10 best matches).

    Args:
        wv_model: Model exposing its word vectors as ``.wv``.
        bow_corpus: Bag-of-words documents to index.
        dictionary: Dictionary the term similarity matrix is built over.
    """
    term_matrix = SparseTermSimilarityMatrix(
        WordEmbeddingSimilarityIndex(wv_model.wv), dictionary)
    return SoftCosineSimilarity(bow_corpus, term_matrix, num_best=10)
def prepare_index(dictionary, model, tfidf, documents,
                  index_path='soft_cosine.index'):
    """Return the soft cosine index, building and caching it if necessary.

    If no file exists at ``index_path`` the index is built from ``documents``
    and saved there. In both cases the returned object is the one loaded back
    from ``index_path``.

    Args:
        dictionary: Dictionary used for ``doc2bow`` and the similarity matrix.
        model: Model exposing its word vectors as ``.wv``.
        tfidf: TF-IDF model applied to the bag-of-words corpus.
        documents: Iterable of tokenized documents.
        index_path: Cache file for the index. Defaults to the previously
            hard-coded ``'soft_cosine.index'``.

    Returns:
        The ``SoftCosineSimilarity`` index loaded from ``index_path``.
    """
    if not os.path.isfile(index_path):
        similarity_index = WordEmbeddingSimilarityIndex(model.wv)
        similarity_matrix = SparseTermSimilarityMatrix(
            similarity_index, dictionary, tfidf)
        bow_corpus = [dictionary.doc2bow(document) for document in documents]
        index = SoftCosineSimilarity(tfidf[bow_corpus], similarity_matrix)
        index.save(index_path)
    return SoftCosineSimilarity.load(index_path)
def __init__(self, cut_off=0.2, cleanup_urls=True, nltk_tokenizer=False):
    """Initialise the parent class and build the soft cosine index.

    Args:
        cut_off: Accepted for the caller's convenience; not used in this
            method.
        cleanup_urls: Forwarded to the parent initialiser.
        nltk_tokenizer: Forwarded to the parent initialiser.

    Reads ``self.w2vmodel`` and ``self.corpus`` (presumably provided by the
    parent initialiser, which is not visible here) and sets
    ``self.dictionary`` and ``self.softcosinesimilarity``.
    """
    super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)

    term_index = WordEmbeddingSimilarityIndex(self.w2vmodel.wv)
    self.dictionary = Dictionary(self.corpus)
    bags = [self.dictionary.doc2bow(tokens) for tokens in self.corpus]
    term_matrix = SparseTermSimilarityMatrix(term_index, self.dictionary)
    self.softcosinesimilarity = SoftCosineSimilarity(
        bags,
        term_matrix,
        num_best=10,
    )
def glove_score_1v1(query_string, documents):
    """Score ``documents`` against ``query_string`` with the soft cosine measure.

    Adapted from the gensim soft cosine tutorial notebook:
    https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb

    Unlike the regular cosine similarity, which is zero for vectors with no
    overlapping terms, the soft cosine measure also takes the similarity
    between different words into account.

    Args:
        query_string: Free-text query (e.g. a mentor description).
        documents: Sequence of free-text documents (e.g. mentee descriptions).

    Returns:
        The similarity scores of the query against each document, in the
        order of ``documents``. The query and the ten best-scoring documents
        are also printed.
    """
    # Preprocess the documents, including the query string.
    corpus = list(map(preprocess, documents))
    query = preprocess(query_string)

    # Load the model: this is a big file, can take a while to download and open.
    glove = api.load("glove-wiki-gigaword-50")
    similarity_index = WordEmbeddingSimilarityIndex(glove)

    # Term dictionary and TF-IDF model over documents plus query.
    print("Everything has been initialized")
    dictionary = Dictionary(corpus + [query])
    tfidf = TfidfModel(dictionary=dictionary)

    # Similarity between each pair of words, weighted using term frequency.
    similarity_matrix = SparseTermSimilarityMatrix(
        similarity_index, dictionary, tfidf)

    # Soft cosine measure between the query and every document.
    query_tf = tfidf[dictionary.doc2bow(query)]
    document_bags = [dictionary.doc2bow(tokens) for tokens in corpus]
    index = SoftCosineSimilarity(tfidf[document_bags], similarity_matrix)
    doc_similarity_scores = index[query_tf]

    # Report at most the ten highest-scoring documents, best first.
    ranking = np.argsort(doc_similarity_scores)[::-1]
    print("Mentee values: {}".format(query_string))
    for idx in ranking[:10]:
        print(
            f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
    return doc_similarity_scores
def calculate_soft_cosine_similarity(self, topic_models, sentences, *args,
                                     **kwargs):
    """Assign each sentence to its most similar topic by soft cosine similarity.

    Topics and sentences are lower-cased and whitespace-tokenized. For each
    sentence, the topic with the highest normalized inner product (after
    English stop-word removal) is selected; the sentence is recorded under
    that topic only if the score exceeds 0.3. On ties the earliest topic wins.

    Args:
        topic_models: Sequence of topic strings; each is also used as a key of
            the returned dict.
        sentences: Iterable of sentence strings; it is traversed twice.
        *args: Ignored.
        **kwargs: Ignored.

    Returns:
        Dict mapping every topic to the list of original (unmodified)
        sentences matched to it.
    """
    topic_claim_relations = {topic: [] for topic in topic_models}

    # The dictionary is built from the unfiltered tokens of all texts.
    documents = [topic.lower().split() for topic in topic_models]
    documents.extend(sentence.lower().split() for sentence in sentences)
    dictionary = corpora.Dictionary(documents)

    w2v_model = api.load("glove-wiki-gigaword-100")
    similarity_index = WordEmbeddingSimilarityIndex(w2v_model)
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)

    # Loop-invariant work hoisted: the stop-word list and the topic vectors
    # were previously rebuilt for every sentence / every comparison.
    stop_words = set(stopwords.words('english'))

    def to_bow(text):
        tokens = [w for w in text.lower().split() if w not in stop_words]
        return dictionary.doc2bow(tokens)

    topic_bows = [(topic, to_bow(topic)) for topic in topic_models]

    for sentence in sentences:
        sentence_bow = to_bow(sentence)
        best_cosine_result = 0
        matched_topic = None
        for topic, topic_bow in topic_bows:
            similarity = similarity_matrix.inner_product(
                topic_bow, sentence_bow, normalized=True)
            print('similarity = %.4f' % similarity)
            if similarity > best_cosine_result:
                best_cosine_result = similarity
                matched_topic = topic
        # A score above 0.3 implies matched_topic was set in the loop above.
        if best_cosine_result > 0.3:
            topic_claim_relations[matched_topic].append(sentence)
    return topic_claim_relations
def find_similarity(search_w, corpus_w):
    """Look up comma-separated search terms in a soft cosine index of a corpus.

    A dictionary, term similarity matrix and ``SoftCosineSimilarity`` index
    (10 best matches) are built from ``corpus_w`` using the word vectors of
    the module-level ``model``. ``search_w`` is split on commas; each stripped
    part is queried as a single-token document and counts as a match when the
    score of its best result is greater than 0.5.

    Args:
        search_w: String of comma-separated search terms.
        corpus_w: Iterable of tokenized documents.

    Returns:
        ``{'result': [{'word': [...], 'value': '<str of results>'}, ...]}``
        when at least one term matched, otherwise ``None``.
    """
    corpus_tokens = corpus_w
    search_tokens = [search_w]
    print(corpus_tokens)
    print(search_tokens)

    # Prepare a dictionary and a corpus.
    dictionary = corpora.Dictionary(corpus_tokens)
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus_tokens]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix,
                                        num_best=10)

    # Compute soft cosine similarity for every search term.
    matches = []
    for term in search_w.split(','):
        term = term.strip()
        lkup = [term]
        try:
            result = docsim_index[dictionary.doc2bow(lkup)]
        except Exception:
            # Best effort: a failed lookup counts as "no match". Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            result = [(0, 0)]
        print(f"looking for {lkup}, result {result}")
        if len(result) and result[0][1] > 0.5:
            matches.append({'word': term.split(), 'value': str(result)})

    return {'result': matches} if matches else None
def __init__(self, model):
    """Create the instance around a word-embedding model.

    Args:
        model: Word vectors handed to ``WordEmbeddingSimilarityIndex``.
    """
    # public properties
    self.model = model
    self.wordEmbedding = WordEmbeddingSimilarityIndex(self.model)
    self.dictionary = corpora.Dictionary()
    self.itemScores = []
def _setup_model(self):
    """Load the default model and build the term similarity index.

    The model at ``self.default_model`` is read with ``load_facebook_model``;
    its ``.wv`` vectors are stored as ``self.model``. Also sets
    ``self.similarity_index`` and ``self.model_ready``.
    """
    if self.verbose:
        print('Loading model')
    self.model = load_facebook_model(self.default_model).wv
    if self.verbose:
        print('Model loaded')

    self.similarity_index = WordEmbeddingSimilarityIndex(self.model)
    self.model_ready = True
def compute_msg_dist_matrix(data):
    """Return the pairwise soft cosine distance matrix of the messages.

    Args:
        data: Sequence of message strings; each is split on whitespace.

    Returns:
        ``1 - similarities`` as a numpy array, where the similarities come
        from iterating a ``SoftCosineSimilarity`` index built over the
        messages.
    """
    tokenized = [message.split() for message in data]

    # Train on the token lists. The model was previously trained on the raw
    # strings, i.e. on single characters, so the words in the dictionary had
    # no matching vectors.
    model = Word2Vec(tokenized, min_count=1)  # train word-vectors
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)

    dictionary = Dictionary(tokenized)
    bow_corpus = [dictionary.doc2bow(document) for document in tokenized]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix)

    return 1 - np.array(docsim_index)
def __init__(self, params):
    """Load the trained Word2Vec model and prepare the similarity resources.

    Args:
        params: Mapping forwarded to the parent initialiser; its
            ``'path_to_trained_model'`` entry locates the Word2Vec model.

    Sets ``self.new_model``, ``self.similarity_index``,
    ``self.similarity_matrix`` and ``self.dict_distance_dispatcher``. Reads
    ``self.dictionary`` (presumably set by the parent initialiser, which is
    not visible here).
    """
    super().__init__(params)

    model_path = params['path_to_trained_model']
    self.new_model = gensim.models.Word2Vec.load(model_path)
    # Normalizes the vectors in the word2vec class.
    self.new_model.init_sims(replace=True)

    # Cosine similarities between word embeddings.
    self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
    # Term similarity matrix used for the Soft Cosine Measure.
    self.similarity_matrix = SparseTermSimilarityMatrix(
        self.similarity_index, self.dictionary)

    # Metric identifier -> bound method implementing it.
    self.dict_distance_dispatcher = {
        DistanceMetric.COS: self.cos_scipy,
        SimilarityMetric.Pearson: self.pearson_abs_scipy,
        DistanceMetric.WMD: self.wmd_gensim,
        DistanceMetric.SCM: self.scm_gensim,
    }
def train(self, sentences):
    """Fit Word2Vec on ``sentences`` and build the term similarity matrix.

    Args:
        sentences: Tokenized sentences; used for both the dictionary and the
            Word2Vec training.

    Sets ``self.dictionary``, ``self.ft`` (the Word2Vec model) and
    ``self.matrix``.
    """
    vocabulary = Dictionary(sentences)
    w2v = Word2Vec(
        sentences,
        workers=cpu_count(),
        min_count=5,
        size=300,
        seed=12345,
    )
    term_matrix = SparseTermSimilarityMatrix(
        WordEmbeddingSimilarityIndex(w2v.wv), vocabulary)

    # Attributes are assigned only after everything has been built.
    self.dictionary = vocabulary
    self.ft = w2v
    self.matrix = term_matrix
def computeDocumentSimilarityIndex(self, corpus):
    """Build a soft cosine index over the sentences of ``self.itemScores``.

    Args:
        corpus: Dictionary used to build the term similarity matrix.

    Returns:
        SoftCosineSimilarity instance
    """
    # Create the embedding index on first use.
    if self.wordEmbedding is None:
        self.wordEmbedding = WordEmbeddingSimilarityIndex(self.model)

    term_matrix = SparseTermSimilarityMatrix(self.wordEmbedding, corpus)
    sentences = [item.sentence for item in self.itemScores]
    return SoftCosineSimilarity(sentences, term_matrix)
def get_embedding_files(self, num_best=10):
    """Load, or train and cache, the soft cosine resources for image tags.

    Sets ``self.dictionary``, ``self.bow_corpus``, ``self.similarity_matrix``
    and ``self.docsim_index``. If the cache file
    ``{constants.EMBEDDING_DIR}/soft_cosine.pkl`` exists, the first three are
    read from it and the index is rebuilt with ``num_best``. Otherwise a
    Word2Vec model is trained on ``{constants.DATA_DIR}/all_img_tags.pkl``
    and the results are written to the cache file.

    Args:
        num_best: Number of best matches the document index returns.

    Raises:
        FileNotFoundError: If neither the cache nor the tags file exists.
            (Previously the missing tags file was only printed and the
            method then failed on an unbound variable.)
    """
    cache_path = f'{constants.EMBEDDING_DIR}/soft_cosine.pkl'
    try:
        with open(cache_path, "rb") as f:
            # NOTE(security): pickle can execute arbitrary code on load; only
            # read cache files produced by this application.
            (self.dictionary, self.bow_corpus, self.similarity_matrix,
             _) = pickle.load(f)
    except FileNotFoundError:
        print('no file found, training word2vec to get bow_corpus, '
              'similarity matrix and docsim index')
    else:
        self.docsim_index = SoftCosineSimilarity(self.bow_corpus,
                                                 self.similarity_matrix,
                                                 num_best=num_best)
        return

    # No cache available: read in all tags and train from scratch.
    try:
        with open(f'{constants.DATA_DIR}/all_img_tags.pkl', 'rb') as fp:
            all_img_tags_lower = pickle.load(fp)
    except FileNotFoundError:
        print(f'no file found at {constants.DATA_DIR}/all_img_tags.pkl')
        # Training is impossible without the tags; surface the real cause.
        raise

    model = Word2Vec(all_img_tags_lower, size=20, min_count=1)  # train word2vec
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    self.dictionary = Dictionary(all_img_tags_lower)
    self.bow_corpus = [
        self.dictionary.doc2bow(document) for document in all_img_tags_lower
    ]
    self.similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, self.dictionary)  # construct similarity matrix
    self.docsim_index = SoftCosineSimilarity(self.bow_corpus,
                                             self.similarity_matrix,
                                             num_best=num_best)

    print(
        f'Saving bow_corpus, similarity matrix and docsim index to {constants.EMBEDDING_DIR}'
    )
    with open(cache_path, "wb") as f:
        pickle.dump((self.dictionary, self.bow_corpus,
                     self.similarity_matrix, self.docsim_index), f)
def calculate_distance(self, query_string, documents):
    """Return soft cosine similarity scores of ``documents`` to the query.

    Uses the module-level ``glove`` vectors for the term similarity index.
    When more than one document is given, the documents are also printed in
    descending order of score.

    Args:
        query_string: Free-text query.
        documents: Sequence of free-text documents.

    Returns:
        Similarity scores, one per document, in the order of ``documents``.
    """
    def preprocess(doc):
        # Replace markup and URLs, then tokenize and drop stop words.
        replacements = (
            (r'<img[^<>]+(>|$)', " image_token "),
            (r'<[^<>]+(>|$)', " "),
            (r'\[img_assist[^]]*?\]', " "),
            (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token "),
        )
        for pattern, replacement in replacements:
            doc = sub(pattern, replacement, doc)
        tokens = simple_preprocess(doc, min_len=0, max_len=float("inf"))
        return [token for token in tokens if token not in STOPWORDS]

    # Preprocess the documents, including the query string.
    corpus = [preprocess(text) for text in documents]
    query = preprocess(query_string)

    similarity_index = WordEmbeddingSimilarityIndex(glove)

    # Term dictionary and TF-IDF model over documents plus query.
    dictionary = Dictionary(corpus + [query])
    tfidf = TfidfModel(dictionary=dictionary)

    # Term similarity matrix.
    similarity_matrix = SparseTermSimilarityMatrix(
        similarity_index, dictionary, tfidf)

    query_tf = tfidf[dictionary.doc2bow(query)]
    document_bags = [dictionary.doc2bow(tokens) for tokens in corpus]
    index = SoftCosineSimilarity(tfidf[document_bags], similarity_matrix)
    doc_similarity_scores = index[query_tf]

    # Output the sorted similarity scores and documents.
    ranking = np.argsort(doc_similarity_scores)[::-1]
    if len(documents) > 1:
        for idx in ranking:
            print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
    return doc_similarity_scores
def __init__(self, documents):
    """Preprocess ``documents`` and create the GloVe term similarity index.

    If the first document is a list, every list document is joined with
    spaces and wrapped in a one-element list (non-list documents are dropped).
    All documents are then converted with ``str`` before ``preprocess`` runs.

    Sets ``self.corpus``, ``self.documents`` and ``self.similarity_index``.

    Args:
        documents: Non-empty sequence of documents (``documents[0]`` is read).
    """
    print(type(documents[0]))
    if isinstance(documents[0], list):
        print("It is a list")
        joined = []
        for document in documents:
            if isinstance(document, list):
                joined.append([" ".join(document)])
        documents = joined

    documents = list(map(str, documents))
    self.corpus = [
        preprocess(document) for document in documents
        if type(document) is str
    ]
    self.documents = documents

    # Loading the model involves a big file and can take a while.
    glove = api.load("glove-wiki-gigaword-50")
    self.similarity_index = WordEmbeddingSimilarityIndex(glove)
def calculate_softcosine_w2v(test_data):
    """Compute pairwise soft cosine similarities of ``test_data.text``.

    Uses the module-level ``w2v_model`` for the term similarity index. The
    full N x N matrix of normalized inner products is passed to
    ``calculate_max_similarity`` and the result is exported under the label
    ``'softsim_w2v'``.

    Args:
        test_data: Object whose ``.text`` has ``.tolist()`` yielding strings
            (presumably a pandas DataFrame - confirm against the caller).

    Returns:
        Whatever ``export_result`` returns for the computed maxima.
    """
    data = [text.split() for text in test_data.text.tolist()]
    dictionary = corpora.Dictionary(data)
    corpus = [dictionary.doc2bow(tokens) for tokens in data]

    similarity_index = WordEmbeddingSimilarityIndex(w2v_model)
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)

    n_docs = len(data)
    softsim_w2v_matrix = np.full((n_docs, n_docs), np.nan)
    for row, bow_a in enumerate(corpus):
        for col, bow_b in enumerate(corpus):
            softsim_w2v_matrix[row, col] = similarity_matrix.inner_product(
                bow_a, bow_b, normalized=True)

    doc_sim_max_index, doc_sim_max_values = calculate_max_similarity(
        softsim_w2v_matrix)
    softsim_w2v_df = export_result(test_data, doc_sim_max_index,
                                   doc_sim_max_values, 'softsim_w2v')
    print(
        "Similarity using soft cosine similarity using w2v vectors is calculated!!"
    )
    return softsim_w2v_df
def main(chosen_model_no=0, num_items_displayed=10, use_spacy=False, use_soft_cosine_similarity=False,
         num_topics=None, no_below=5, no_above=0.5, normalize_vectors=False):
    """Benchmark one text-similarity model on the Steam tokens.

    Builds the chosen model over all tokenized documents, queries it with
    every benchmarked appID and prints the resulting ranking.

    Args:
        chosen_model_no: Index into ``possible_model_names`` (0 = 'tf_idf',
            ..., 9 = 'word2vec').
        num_items_displayed: Forwarded to ``print_most_similar_sentences``.
        use_spacy: For 'word2vec', compare spaCy ``Doc`` objects instead of
            training a Word2Vec model.
        use_soft_cosine_similarity: For the non-'word2vec' models, index with
            ``SoftCosineSimilarity`` instead of ``MatrixSimilarity``.
        num_topics: Topic count for LSI / RP / LDA; 100 when ``None``.
        no_below: Forwarded to ``Dictionary.filter_extremes``.
        no_above: Forwarded to ``Dictionary.filter_extremes``.
        normalize_vectors: Used as ``normalize`` of the Tf-Idf model and as
            ``replace`` of ``wv.init_sims``.

    Returns:
        None.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if num_topics is None:
        num_topics = 100

    possible_model_names = [
        'tf_idf',  # 0
        'lsi_bow', 'lsi_tf_idf',  # 1, 2
        'rp_bow', 'rp_tf_idf',  # 3, 4
        'lda_bow', 'lda_tf_idf',  # 5, 6
        'hdp_bow', 'hdp_tf_idf',  # 7, 8
        'word2vec',  # 9
    ]
    chosen_model_name = possible_model_names[chosen_model_no]
    print(chosen_model_name)

    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    # NOTE(review): the spaCy model is loaded unconditionally, although `nlp`
    # is only read below when use_spacy is True.
    nlp = spacy.load('en_core_web_lg')

    documents = list(steam_tokens.values())

    # The two prints show the dictionary size before and after filtering.
    dct = Dictionary(documents)
    print(len(dct))
    dct.filter_extremes(no_below=no_below, no_above=no_above)
    print(len(dct))

    corpus = [dct.doc2bow(doc) for doc in documents]

    # Pre-processing

    pre_process_corpus_with_tf_idf = chosen_model_name.endswith('_tf_idf')

    tfidf_model = TfidfModel(corpus, id2word=dct, normalize=normalize_vectors)

    if pre_process_corpus_with_tf_idf:
        # Caveat: the leading underscore is important. Do not use this pre-processing if the chosen model is Tf-Idf!
        print('Corpus as Tf-Idf')
        pre_processed_corpus = tfidf_model[corpus]
    else:
        print('Corpus as Bag-of-Words')
        pre_processed_corpus = corpus

    # Model

    model = None
    wv = None
    index2word_set = None

    if chosen_model_name == 'tf_idf':
        print('Term Frequency * Inverse Document Frequency (Tf-Idf)')
        model = tfidf_model

    elif chosen_model_name.startswith('lsi'):
        print('Latent Semantic Indexing (LSI/LSA)')
        model = LsiModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('rp'):
        print('Random Projections (RP)')
        model = RpModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('lda'):
        print('Latent Dirichlet Allocation (LDA)')
        model = LdaModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('hdp'):
        print('Hierarchical Dirichlet Process (HDP)')
        model = HdpModel(pre_processed_corpus, id2word=dct)

    elif chosen_model_name == 'word2vec':
        # Hard-coded switch: the pre-trained Google News branch is disabled.
        use_a_lot_of_ram = False

        if use_a_lot_of_ram:
            model = None

            print('Loading Word2Vec based on Google News')
            # Warning: this takes a lot of time and uses a ton of RAM!
            wv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
        else:
            if use_spacy:
                # No Word2Vec model is trained here; `wv` stays None.
                print('Using Word2Vec with spaCy')
            else:
                print('Training Word2Vec')

                model = Word2Vec(documents)

                wv = model.wv

        if not use_spacy:
            wv.init_sims(replace=normalize_vectors)

            index2word_set = set(wv.index2word)

    else:
        print('No model specified.')
        model = None

    # Similarity index: only the non-word2vec models are indexed up front.
    if chosen_model_name != 'word2vec':
        if not use_soft_cosine_similarity:
            index = MatrixSimilarity(model[pre_processed_corpus], num_best=10, num_features=len(dct))
        else:
            # A separate Word2Vec model supplies the term similarities.
            w2v_model = Word2Vec(documents)
            similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
            similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dct, tfidf_model, nonzero_limit=100)
            index = SoftCosineSimilarity(model[pre_processed_corpus], similarity_matrix)
    else:
        index = None

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    # appIDs in the same order as `documents`, used to map index positions back.
    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    matches_as_app_ids = []

    for query_count, query_app_id in enumerate(query_app_ids):
        print('[{}/{}] Query appID: {} ({})'.format(query_count + 1, len(query_app_ids), query_app_id,
                                                    get_app_name(query_app_id, game_names)))

        query = steam_tokens[str(query_app_id)]

        if use_spacy:
            spacy_query = Doc(nlp.vocab, query)
        else:
            spacy_query = None

        if chosen_model_name != 'word2vec':
            # Transform the query exactly like the indexed corpus.
            vec_bow = dct.doc2bow(query)
            if pre_process_corpus_with_tf_idf:
                pre_preoccessed_vec = tfidf_model[vec_bow]
            else:
                pre_preoccessed_vec = vec_bow
            vec_lsi = model[pre_preoccessed_vec]
            sims = index[vec_lsi]

            if use_soft_cosine_similarity:
                # The soft cosine index was built without num_best, so the
                # scores are paired with their positions here (presumably it
                # returns a plain score sequence - confirm).
                sims = enumerate(sims)

            similarity_scores_as_tuples = [(str(app_ids[i]), sim) for (i, sim) in sims]
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
        else:
            if use_spacy:
                # Compare the query Doc against a Doc built for every game.
                similarity_scores = {}
                for app_id in steam_tokens:
                    reference_sentence = steam_tokens[app_id]
                    spacy_reference = Doc(nlp.vocab, reference_sentence)
                    similarity_scores[app_id] = spacy_query.similarity(spacy_reference)
            else:
                query_sentence = filter_out_words_not_in_vocabulary(query, index2word_set)

                similarity_scores = {}

                counter = 0
                num_games = len(steam_tokens)

                for app_id in steam_tokens:
                    counter += 1

                    # Progress message every 1000 games.
                    if (counter % 1000) == 0:
                        print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

                    reference_sentence = steam_tokens[app_id]
                    reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence, index2word_set)

                    try:
                        similarity_scores[app_id] = wv.n_similarity(query_sentence, reference_sentence)
                    except ZeroDivisionError:
                        # Score 0 when n_similarity divides by zero
                        # (presumably an empty filtered sentence).
                        similarity_scores[app_id] = 0

        similar_app_ids = print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed,
                                                       verbose=False)
        matches_as_app_ids.append(similar_app_ids)

    print_ranking(query_app_ids, matches_as_app_ids, only_print_banners=True)

    return
def create_model(storage_client, json_in, video_id):
    """Create, or fetch from storage, the soft cosine similarity model.

    If a blob named after the lower-cased ``video_id`` already exists in the
    module-level ``bucket_name`` bucket, its pickled contents are returned.
    Otherwise the model is built from the captions and uploaded under that
    name.

    Keywords arguments:
    storage_client -- a Storage instance, used for both download and upload
    json_in -- json returned from the YouTube Captions API
    video_id -- the Youtube video_id

    Returns:
    A ``(dictionary, index)`` tuple:
    - The dictionary of terms computed
    - A Soft Cosine Measure model (``SoftCosineSimilarity`` index)
    """
    video_id = video_id.lower()

    # Reuse a previously stored model when one exists.
    if blob_exists(storage_client, video_id):
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(video_id)  # The blob's name is the video ID
        # Download the stored pickle as a binary string.
        blob_str = blob.download_as_string()
        # NOTE(security): pickle.loads can execute arbitrary code; the bucket
        # contents must be trusted.
        dictionary, index = pickle.loads(blob_str)
        return dictionary, index

    # Download stop_words and glove.
    stop_words, glove = download_resources()
    # Create Glove similarity index.
    similarity_index = WordEmbeddingSimilarityIndex(glove)

    # Parse json captions into document form and tokenize them.
    documents = processInput(json_in)
    corpus = [preprocess(document, stop_words) for document in documents]

    # Dictionary and TF-IDF model from the documents.
    dictionary = Dictionary(corpus)
    tfidf = TfidfModel(dictionary=dictionary)

    # Term similarity matrix.
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary,
                                                   tfidf)

    # Soft Cosine Measure index over the documents.
    bow_corpus = [dictionary.doc2bow(document) for document in corpus]
    index = SoftCosineSimilarity(tfidf[bow_corpus], similarity_matrix)

    # Save index and dictionary with the caller's client. A fresh
    # storage.Client() used to replace the parameter at this point, which
    # ignored the client (and its credentials/project) passed by the caller.
    bin_tuple = pickle.dumps((dictionary, index))
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(video_id)
    blob.upload_from_string(bin_tuple)

    if debug_messages:
        print("Binary model with name {} and dictionary uploaded.".format(
            video_id))
    return dictionary, index
total_examples=w2v_model.corpus_count, epochs=30, report_delay=1) print('Time to train the model: {} mins'.format( round((time.time() - t) / 60, 2))) w2v_model.init_sims(replace=True) w2v_model.save("w2v-20newsgroups") print(w2v_model.vector_size) len(w2v_model.wv.vocab) termsim_index = WordEmbeddingSimilarityIndex(w2v_model.wv) # get termsim index # dictionary = Dictionary(df['tokenized']) # dictionary for model to use for indexing later # finding a similarity matrix def load_obj(name): with open(name + '.pkl', 'rb') as f: return pickle.load(f) dictionary = load_obj('LDADICT') bow_corpus = [dictionary.doc2bow(document) for document in df['tokenized']] # generate a bow corpus similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary) # Testing: Case 1 Atheism vs Windows
def _print_pairwise_similarities(labels, queries, index):
    """Print the similarity of every unordered pair of documents.

    labels -- document names, in the order the documents were indexed
    queries -- one query vector per document, in that same order
    index -- similarity index; index[query] is read as one score per
             indexed document
    """
    for i, query in enumerate(queries[:-1]):
        query_similarity = index[query]
        # Only look at j > i so each pair is reported once.
        for j in range(i + 1, len(query_similarity)):
            print(labels[i], "and", labels[j], "have a similarity of:",
                  query_similarity[j])
        print("")


def main():
    """Interactively print pairwise similarities of Brown corpus categories.

    Repeatedly asks for a model ("tfidf" or "word2vec") and a document
    layout ("within": one document per category, "between": each category
    split into two halves), builds the matching similarity index and prints
    the similarity of every pair of documents.
    """
    choice = ""
    while choice != "exit":
        choice = ""
        while choice not in ["tfidf", "word2vec", "exit"]:
            choice = input(
                "TF-IDF or Word2Vec? [TFIDF, Word2Vec, Exit]\n>").lower()
        if choice == "exit":
            break

        catType = ""
        while catType not in ["within", "between", "return"]:
            catType = input(
                "Within or between clusters? [Within, Between, Return]\n>"
            ).lower()
        if catType == "return":
            # NOTE(review): this leaves the outer loop, so "Return" ends the
            # program rather than going back to the first prompt -- confirm
            # that is intended.
            break

        # get all of the words for each document per category
        # The mapping is rebuilt on every round: labels left over from a
        # previous round would no longer line up with the rows of the index
        # built below.
        browndict = {}
        for c in brown.categories():
            words = NormalizeWords(brown.words(categories=c))
            if catType == "within":
                browndict[c] = words
            else:  # "between"
                half = len(words) // 2
                browndict[c + "1/2"] = words[:half]
                browndict[c + "2/2"] = words[half:]
        labels = list(browndict)
        texts = list(browndict.values())

        # create the corpora dictionary built from gensim
        corporadict = corpora.Dictionary(texts)
        # create a corpus for the training
        corpus = [corporadict.doc2bow(text) for text in texts]

        if choice == "tfidf":
            # create the tfidf model from our built corpus
            tfidf = TfidfModel(corpus=corpus)
            # Index the TF-IDF vectors (not the raw counts) so the indexed
            # documents and the queries live in the same vector space.
            queries = [tfidf[bow] for bow in corpus]
            index = MatrixSimilarity(queries,
                                     num_features=len(corporadict))
        else:  # choice == "word2vec"
            word2vec = Word2Vec(brown.sents())
            # build term similiarity matrix from our models word-vector
            termSimilarityIndex = WordEmbeddingSimilarityIndex(word2vec.wv)
            # build sparse similarity matrix
            sparseSimiliarityMatrix = SparseTermSimilarityMatrix(
                termSimilarityIndex, corporadict)
            # build similarity word-vector
            queries = corpus
            index = SoftCosineSimilarity(corpus, sparseSimiliarityMatrix)

        _print_pairwise_similarities(labels, queries, index)
n_topics = len(topics)

# pre-process topic keyword lists
topics_cleaned = [clean_sentence(topic_keywords) for topic_keywords in topics]

# build complete dictionary (topics first, then the negative review sentences)
tokenized_neg_reviews_and_topics = (
    topics_cleaned + list(df_negative_sentences['review_sentence_cleaned'])
)
neg_dictionary = corpora.Dictionary(tokenized_neg_reviews_and_topics)

# create bag-of-words vectors
corpus_neg_reviews = [
    neg_dictionary.doc2bow(text)
    for text in df_negative_sentences['review_sentence_cleaned']
]
corpus_neg_topics = [neg_dictionary.doc2bow(text) for text in topics_cleaned]

# build similarity matrix of word embeddings
print('Building similarity matrix of word embeddings. Might take a few minutes...')
termsim_index = WordEmbeddingSimilarityIndex(fasttext_model300)
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, neg_dictionary)
print('done')

# compute soft cosine similarity between sentences and topics:
# one row per review sentence, one column per topic
print('Computing soft cosine similarity between sentences and topics. Might take a few minutes...')
neg_data_topics = []
for review_item in corpus_neg_reviews:
    review_item_topics = [
        similarity_matrix.inner_product(review_item, topic, normalized=True)
        for topic in corpus_neg_topics
    ]
    neg_data_topics.append(review_item_topics)
print('done')

# extract topic with highest soft cosine similarity
# I set a minimum threshold (0.10) that needs to be reached in order to assign a topic.
embedding1 = np.average([model.wv[token] for token in tokfreqs1], axis=0, weights=weights1).reshape(1, -1) embedding2 = np.average([model.wv[token] for token in tokfreqs2], axis=0, weights=weights2).reshape(1, -1) sim = cosine_similarity(embedding1, embedding2)[0][0] sims.append(sim) return sims # Create Term Similarity Index from Word2Vec model termsim_index = WordEmbeddingSimilarityIndex(model.wv) # Create Corpus List corpus_list = [] for data in dataset: docs = "" for sentence in data['gejala']: docs += " " + sentence corpus_list.append(docs) # Create token list for all document corpus corpus_list_token = [preprocess(doc) for doc in corpus_list] dictionary = Dictionary(corpus_list_token) bow_corpus = [dictionary.doc2bow(document) for document in corpus_list_token]
stopwords = ['the', 'and', 'are', 'a']

# Preprocess the documents, including the query string
corpus = [preprocess(document) for document in documents]

# Each artifact is written inside a `with` block so the file is closed even
# if pickling fails part-way through.
with open("data/corpus.pickle", 'wb') as file_corpus_w:
    pickle.dump(corpus, file_corpus_w)

print("Preprocessing finished")
print(time.time() - start_time)

print("Loading model")
# Load the model: this is a big file, can take a while to download and open
glove = api.load("glove-wiki-gigaword-50")
similarity_index = WordEmbeddingSimilarityIndex(glove)

with open("data/sim_idx.pickle", 'wb') as file_sim_idx_w:
    pickle.dump(similarity_index, file_sim_idx_w)

print("Model loaded")
print(time.time() - start_time)

#####################
print("Building term dictionary and similarity matrix")
# Build the term dictionary, TF-idf model
dictionary = Dictionary(corpus)
tfidf = TfidfModel(dictionary=dictionary)

with open("data/tfidf.pickle", 'wb') as file_tfidf_w:
    pickle.dump(tfidf, file_tfidf_w)