def main():
    args = parser.parse_args()
    # e.g. dialect = ['pa', 'sy'], with corpus folders such as
    # clean_data/comparable/msa/ and clean_data/comparable/egypt/
    dialect = [args.dialect_one, args.dialect_two]
    folder = args.corpus_folder + '/'
    # the per-pair distance loop lives in comparable_corpus_distance() below
    comparable_corpus_distance(folder, dialect)
def comparable_corpus_distance(folder, dialect):
    """Average Hellinger and Jaccard distances over aligned document pairs."""
    dictionary, corpus = models.build_comparable_ldamodel_training(folder, dialect)
    lda_model = models.build_ldamodel(corpus, dictionary)
    folders = [folder + dialect[0] + '/', folder + dialect[1] + '/']
    hellinger_summation = 0.0
    jaccard_summation = 0.0
    pair_count = 0
    for file in os.listdir(folders[0]):
        if os.path.splitext(file)[1] != '.txt':
            continue
        first_filepath = os.path.join(folders[0], file)
        second_filepath = os.path.join(folders[1], file)
        try:
            with open(first_filepath, encoding='utf-8') as f:
                first_dialect = f.read().split()
            with open(second_filepath, encoding='utf-8') as f:
                second_dialect = f.read().split()
        except OSError:
            continue  # skip pairs with a missing or unreadable file
        # bag-of-words representations of the two aligned documents
        bow_first_dialect = lda_model.id2word.doc2bow(first_dialect)
        bow_second_dialect = lda_model.id2word.doc2bow(second_dialect)
        # LDA topic distributions for the two documents
        lda_bow_first_dialect = lda_model[bow_first_dialect]
        lda_bow_second_dialect = lda_model[bow_second_dialect]
        hellinger_distance = hellinger(lda_bow_first_dialect, lda_bow_second_dialect)
        jaccard_distance = jaccard(bow_first_dialect, bow_second_dialect)
        print('Hellinger distance between 1 and 2:', hellinger_distance)
        print('Jaccard distance:', jaccard_distance)
        hellinger_summation += hellinger_distance
        jaccard_summation += jaccard_distance
        pair_count += 1
    # average over the successfully processed file pairs
    # (the full corpus contains 10197 aligned pairs)
    print('total hellinger =', hellinger_summation / pair_count)
    print('total jaccard =', jaccard_summation / pair_count)
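For reference, a minimal sketch of how this helper might be invoked. The folder path is a placeholder based on the comments above, and the dialect codes come from the example in main(); neither is confirmed by the source.

# Hypothetical call: compare the 'pa' and 'sy' comparable corpora.
# 'clean_data/comparable/' is a placeholder path.
comparable_corpus_distance('clean_data/comparable/', ['pa', 'sy'])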
def test_distributions(self):
    # checking bag of words as inputs
    vec_1 = [(2, 1), (3, 4), (4, 1), (5, 1), (1, 1), (7, 2)]
    vec_2 = [(1, 1), (3, 8), (4, 1)]
    result = matutils.jaccard(vec_2, vec_1)
    expected = 1 - 0.3
    self.assertAlmostEqual(expected, result)

    # checking ndarray, csr_matrix as inputs
    vec_1 = np.array([[1, 3], [0, 4], [2, 3]])
    vec_2 = csr_matrix([[1, 4], [0, 2], [2, 2]])
    result = matutils.jaccard(vec_1, vec_2)
    expected = 1 - 0.388888888889
    self.assertAlmostEqual(expected, result)

    # checking ndarray, list as inputs
    vec_1 = np.array([6, 1, 2, 3])
    vec_2 = [4, 3, 2, 5]
    result = matutils.jaccard(vec_1, vec_2)
    expected = 1 - 0.333333333333
    self.assertAlmostEqual(expected, result)
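As a sanity check on the first expected value: for bag-of-words inputs, gensim's jaccard sums the minimum counts over shared ids for the intersection and the total weight of both vectors for the union. A small verification using the test's vectors:

# ids 1, 3 and 4 are shared between vec_1 and vec_2
intersection = min(1, 1) + min(4, 8) + min(1, 1)    # -> 6
# union is the total weight of both vectors
union = (1 + 4 + 1 + 1 + 1 + 2) + (1 + 8 + 1)       # -> 20
assert 1 - intersection / union == 1 - 0.3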
def distance_metrics_Jaccard(text_standart, textsList):
    # Get the list of lemmas for all texts
    data_lemmatized_list = [text_standart.lemma_text]
    for text in textsList:
        data_lemmatized_list.append(text.lemma_text)

    # LDA model for finding topics in the texts
    models = Models()
    model_LDA = models.text_LDA(data_lemmatized_list)

    # Get the bag of words for the reference text
    bow_text_standart = model_LDA.id2word.doc2bow(text_standart.lemma_text)
    for text in textsList:
        bow_text = model_LDA.id2word.doc2bow(text.lemma_text)
        # jaccard is a distance: 0 means similar, 1 means dissimilar
        text.jaccard_coeff = round(jaccard(bow_text_standart, bow_text), 2)
def Jaccard_similiarity(self, corpus, corpus_model_user_description, num_best=5):
    """For each user query, compute the Jaccard coefficient with respect to each hotel."""
    length = len(corpus_model_user_description)
    queryXhotel = np.zeros((length, len(corpus)))
    for i in range(length):
        for j in range(len(corpus)):
            queryXhotel[i][j] = jaccard(corpus_model_user_description[i], corpus[j])
    # np.save('jaccard_similiarity', queryXhotel)
    accuracy_array = self.make_accuracy_array(queryXhotel, num_best, bol=False)
    return accuracy_array
def process(self, udpipe: str, reference: str, other: str):
    # init
    reference_text = Text(reference, udpipe)
    other_text = Text(other, udpipe)

    # Get the list of lemmas for both texts
    data_lemmatized_list = [
        reference_text.lemma_text,
        other_text.lemma_text
    ]

    # LDA model for finding topics in the texts
    models = Models()
    model_LDA = models.text_LDA(data_lemmatized_list)

    # Get the bag of words for each text
    bow_reference = model_LDA.id2word.doc2bow(reference_text.lemma_text)
    bow_other = model_LDA.id2word.doc2bow(other_text.lemma_text)

    # jaccard is a distance: 0 means similar, 1 means dissimilar
    other_text.jaccard_coeff = round(jaccard(bow_reference, bow_other), 2)
    return other_text.jaccard_coeff
def corpus_distance(folder, dialect, corpus_files):
    dictionary, corpus = models.build_ldamodel_training(folder, dialect)
    lda_model = models.build_ldamodel(corpus, dictionary)

    # load the two dialect corpora to measure the distance between them
    with open(corpus_files[0], encoding='utf-8') as f:
        first_dialect = f.read().split()
    with open(corpus_files[1], encoding='utf-8') as f:
        second_dialect = f.read().split()

    # convert both into bag-of-words format
    bow_first_dialect = lda_model.id2word.doc2bow(first_dialect)
    bow_second_dialect = lda_model.id2word.doc2bow(second_dialect)

    # LDA topic distributions for the two corpora
    lda_bow_first_dialect = lda_model[bow_first_dialect]
    lda_bow_second_dialect = lda_model[bow_second_dialect]

    print('Hellinger distance between 1 and 2:')
    print(hellinger(lda_bow_first_dialect, lda_bow_second_dialect))
    print('Jaccard distance:')
    print(jaccard(bow_first_dialect, bow_second_dialect))
    # Kullback-Leibler divergence is asymmetric, so report both directions
    print('kullback_leibler between 1 and 2:')
    print(kullback_leibler(lda_bow_first_dialect, lda_bow_second_dialect))
    print('kullback_leibler between 2 and 1:')
    print(kullback_leibler(lda_bow_second_dialect, lda_bow_first_dialect))
tfidf_bow_water = tfidf[bow_water]
tfidf_bow_finance = tfidf[bow_finance]
tfidf_bow_bank = tfidf[bow_bank]

from gensim.matutils import kullback_leibler, jaccard, hellinger

hellinger(lda_bow_water, lda_bow_finance)
hellinger(lda_bow_finance, lda_bow_bank)
hellinger(lda_bow_bank, lda_bow_water)
hellinger(lda_bow_finance, lda_bow_water)

kullback_leibler(lda_bow_water, lda_bow_bank)
kullback_leibler(lda_bow_bank, lda_bow_water)

jaccard(bow_water, bow_bank)
jaccard(doc_water, doc_bank)
jaccard(['word'], ['word'])

def make_topics_bow(topic):
    # takes the string returned by model.show_topics()
    # (assumes the older show_topics() format without quotes around words)
    # split to get the words and their probabilities
    topic = topic.split('+')
    # list to store the topic bow
    topic_bow = []
    for word in topic:
        # split probability and word
        prob, word = word.split('*')
        # get rid of spaces
        word = word.replace(" ", "")
        # convert the word to its integer id via the model's dictionary
        word = model.id2word.doc2bow([word])[0][0]
        topic_bow.append((word, float(prob)))
    return topic_bow
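make_topics_bow can then be paired with jaccard to compare topics directly. A hypothetical usage sketch, assuming a trained gensim LdaModel named model whose show_topics() yields (topic_id, topic_string) pairs:

# Convert two topics to bag-of-words form and measure their distance.
topic_strings = [topic for _, topic in model.show_topics()]
topic_bow_1 = make_topics_bow(topic_strings[0])
topic_bow_2 = make_topics_bow(topic_strings[1])
print(jaccard(topic_bow_1, topic_bow_2))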
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.matutils import kullback_leibler, jaccard, hellinger, sparse2full
import numpy
import pickle
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models, similarities

stemmer = PorterStemmer()
document = []
topic_dict, cosine = pickle.load(open("../data/cosine.p", "rb"))
for key, value in topic_dict.items():
    text_tokens = [stemmer.stem(item) for item in key.split()]
    text_key = ' '.join(text_tokens)
    text = str(text_key + " " + value)
    document.append(text)

texts = [doc.split() for doc in document]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# pairwise Jaccard distances (as percentages) over the whole corpus,
# flattened row by row; this corpus contains 1680 documents
n = len(corpus)
jaccard_matrix = [
    round(jaccard(corpus[i], corpus[j]) * 100.0, 5)
    for i in range(n) for j in range(n)
]
pickle.dump(jaccard_matrix, open("../data/jaccard.p", "wb"))
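Because the list comprehension flattens the matrix row by row, a consumer can recover the square form with numpy. A small usage sketch reusing the names above:

# reshape the flat list back into an n x n distance matrix
jaccard_square = numpy.array(jaccard_matrix).reshape(n, n)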
threshold = 0.0
regex = re.compile('[%s]' % re.escape(string.punctuation))
for i in range(len(sentences)):
    for j in range(i + 1, len(sentences)):
        try:
            sen1 = [
                pStemmer.stem(word)
                for word in regex.sub('', sentences[i].lower()).split(" ")
                if word not in STOPWORDS
            ]
            sen2 = [
                pStemmer.stem(word)
                for word in regex.sub('', sentences[j].lower()).split(" ")
                if word not in STOPWORDS
            ]
        except TypeError:
            continue  # skip pairs that cannot be tokenised
        except UnicodeError:
            # retry with explicitly decoded words (Python 2 byte strings)
            sen1 = [
                pStemmer.stem(word.decode("utf-8"))
                for word in regex.sub('', sentences[i].lower()).split(" ")
                if word not in STOPWORDS
            ]
            sen2 = [
                pStemmer.stem(word.decode("utf-8"))
                for word in regex.sub('', sentences[j].lower()).split(" ")
                if word not in STOPWORDS
            ]
        # jaccard() returns a distance, so 1 - distance is a similarity
        simScore = 1 - jaccard(sen1, sen2)
        if simScore > threshold:
            similarities.append((simScore, sentences[i], sentences[j]))
    filecontent = filecontent + word + ' '
documents.append(filecontent)

stoplist = set(stopwords.words('english'))
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# flatten the per-document token lists into one long token list
basetext = []
for tokens in texts:
    basetext.extend(tokens)

bow_1 = lda.id2word.doc2bow(basetext)
lda_1 = lda[bow_1]
print("******************", filename)
print("hellinger", hellinger(lda_1, lda_2))
print("kullback_leibler", kullback_leibler(lda_1, lda_2))
# jaccard here compares the two topic distributions as weighted bags
print("jaccard", jaccard(lda_1, lda_2))
file.close()
    max_p = max(probabilities)
    topic = topics[probabilities.index(max_p)]
    return topic

colors = ["skyblue", "pink", "red", "green", "yellow",
          "cyan", "purple", "magenta", "orange", "blue"]

def get_node_color(i):
    return colors[get_most_likely_topic(texts[i])]

G = nx.Graph()
for i, _ in enumerate(texts):
    G.add_node(i)

for (i1, i2) in itertools.combinations(range(len(texts)), 2):
    bow1, bow2 = texts[i1], texts[i2]
    distance = jaccard(bow1, bow2)
    # invert the distance so that similar documents get heavier edges;
    # the cutoff also guards against division by (near-)zero
    if distance > 0.001:
        G.add_edge(i1, i2, weight=1 / distance)

pos = nx.spring_layout(G)

threshold = 1.04
elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > threshold]
esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= threshold]

node_colors = [get_node_color(i) for (i, _) in enumerate(texts)]
nx.draw_networkx_nodes(G, pos, node_size=700, node_color=node_colors)
nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2)
nx.draw_networkx_edges(G, pos, edgelist=esmall, width=2, alpha=0.2,
                       edge_color='b', style='dashed')
nx.draw_networkx_labels(G, pos, font_size=20, font_family='sans-serif')
plt.show()
# Leibler_divergence>`_ and `Hellinger
# <https://en.wikipedia.org/wiki/Hellinger_distance>`_ to figure out what suits
# your needs.
#

###############################################################################
# Jaccard
# -------
#
# Let us now look at the `Jaccard distance
# <https://en.wikipedia.org/wiki/Jaccard_index>`_ metric for similarity between
# bags of words (i.e., documents).
#
from gensim.matutils import jaccard

print(jaccard(bow_water, bow_bank))
print(jaccard(doc_water, doc_bank))
print(jaccard(['word'], ['word']))

###############################################################################
# The three examples above feature two different input methods.
#
# In the first case, we pass jaccard document vectors that are already in
# bag-of-words format. The distance is defined as 1 minus the size of the
# intersection divided by the size of the union of the two vectors.
#
# We can see (on manual inspection as well) that the distance is likely to be
# high - and it is.
#
# The last two examples illustrate jaccard's ability to accept even plain
# lists of tokens (i.e., documents) as inputs.
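###############################################################################
# As one more self-contained illustration (the two token lists below are made
# up for this example rather than drawn from the corpus above), two documents
# sharing two of five distinct tokens give a distance of 1 - 2/5:
#
print(jaccard(['bank', 'water', 'river'], ['bank', 'water', 'finance', 'loan']))
# 0.6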