def jaccard_distance_candidates(mention_spans, concept_spans, n_jaccard):
    candidate_indices = []
    for mention in mention_spans:
        distances = [nltk.jaccard_distance(set(mention), set(concept))
                     for concept in concept_spans]
        # argpartition with -n_jaccard followed by the [-n_jaccard:] slice returns the
        # indices of the n_jaccard largest Jaccard distances (as written, the least
        # similar concepts).
        indices = np.argpartition(np.array(distances), -n_jaccard)[-n_jaccard:].tolist()
        candidate_indices.append(indices)
    return candidate_indices
def spell_correction_english(input_sentence):
    input_tokens = nltk.word_tokenize(input_sentence)
    corrected = ""
    for token in input_tokens:
        bg_input_word = set(nltk.ngrams(token, n=2))
        words = nltk.corpus.words.words()
        min_jaccard = 5
        suggested_words = []
        for word in words:
            bg_word = set(nltk.ngrams(word, n=2))
            jd = nltk.jaccard_distance(bg_input_word, bg_word)
            if jd < min_jaccard:
                # New best distance: reset the candidate list.
                suggested_words = []
                min_jaccard = jd
                suggested_words.append(word)
            elif jd == min_jaccard:
                # Tie with the current best distance.
                suggested_words.append(word)
        print(suggested_words)
        # Break ties on bigram Jaccard distance with plain edit distance.
        min_edit_distance = 1000
        return_words = []
        for word in suggested_words:
            ed = nltk.edit_distance(token, word)
            if ed < min_edit_distance:
                return_words = []
                min_edit_distance = ed
                return_words.append(word)
            elif ed == min_edit_distance:
                return_words.append(word)
        corrected = corrected + " " + return_words[0]
    return corrected
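# Usage sketch for spell_correction_english() above (assumed, not from the original
# project): the function relies on `import nltk` plus the 'punkt' tokenizer models
# and the 'words' corpus. The input sentence below is illustrative only.
import nltk

nltk.download("punkt")   # tokenizer data used by nltk.word_tokenize
nltk.download("words")   # word list scanned as the candidate vocabulary

print(spell_correction_english("the weathr is nice"))
# Note: the result carries a leading space because each corrected token is appended
# to an initially empty string with " " as the separator.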
def add_jackard_distance(self):
    # Pairwise Jaccard distance between the whitespace-tokenized text of every pair
    # of rows in self.df ("jackard" spelling kept to match the existing attribute).
    for i in range(len(self.df)):
        head = self.df.loc[i]
        for j in range(len(self.df)):
            node = self.df.loc[j]
            self.jackard_distance[i, j] = jaccard_distance(
                set(head.text.split()), set(node.text.split()))
def token_sim(s1, s2):
    """Jaccard similarity between two strings, computed over word-token sets."""
    try:
        aset = set(nltk.word_tokenize(s1))
        dset = set(nltk.word_tokenize(s2))
        # jaccard_distance returns a distance in [0, 1]; convert it to a similarity.
        return 1.0 - nltk.jaccard_distance(aset, dset)
    except Exception:
        return 0
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    recommendations = []
    for e in entries:
        # Candidates share the entry's first letter and have length > 2;
        # rank them by Jaccard distance over character 4-grams.
        distances = [(nltk.jaccard_distance(set(nltk.ngrams(e, n=4)),
                                            set(nltk.ngrams(a, n=4))), a)
                     for a in correct_spellings if a[0] == e[0] and len(a) > 2]
        recommendations.append(sorted(distances)[0][1])
    return recommendations
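# The several answer_ten() variants in this collection come from the same
# spelling-recommender exercise and all assume a module-level word list named
# correct_spellings. A minimal setup sketch (assumed, not taken from the original
# files):
import nltk
from nltk.corpus import words

nltk.download("words")
correct_spellings = words.words()

print(answer_ten(['cormulent', 'incendenece', 'validrate']))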
def nameCheck(str1, str2):
    # Compare two names by Jaccard distance over character trigrams;
    # treat them as a match when the distance is at most 0.75.
    tokens_1 = set(nltk.ngrams(str1, n=3))
    tokens_2 = set(nltk.ngrams(str2, n=3))
    distance = nltk.jaccard_distance(tokens_1, tokens_2)
    return distance <= 0.75
def jaccard_similarity(s1, s2):
    if s1 is None or len(s1) == 0:
        return 0.0
    elif s2 is None or len(s2) == 0:
        return 0.0
    else:
        jd = nltk.jaccard_distance(set(s1), set(s2))
        return 1.0 - jd
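# Illustrative check (assumed usage, not from the original project): because
# jaccard_similarity() builds sets directly from its arguments, passing strings
# compares character sets, while passing token lists compares word sets.
import nltk  # word_tokenize also needs the 'punkt' data: nltk.download('punkt')

a = "the cat sat on the mat"
b = "the cat sat on the hat"

char_level = jaccard_similarity(a, b)  # sets of characters
word_level = jaccard_similarity(nltk.word_tokenize(a), nltk.word_tokenize(b))  # sets of tokens
print(char_level, word_level)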
def process_input():
    test_jaccard = []
    test_edit = []
    train_jaccard = []
    train_edit = []
    test_scores = test_gs.readlines()
    train_scores = train_gs.readlines()
    global gs
    for score in test_scores:
        gs.append(float(score.rstrip('\n')))
    for score in train_scores:
        gs.append(float(score.rstrip('\n')))
    test_lines = test.readlines()
    train_lines = train.readlines()
    for line in test_lines:
        pair = line.split('\t')
        j_score = nltk.jaccard_distance(set(nltk.word_tokenize(pair[0])),
                                        set(nltk.word_tokenize(pair[1])))
        test_jaccard.append(j_score)
        edit_score = nltk.edit_distance(nltk.word_tokenize(pair[0]),
                                        nltk.word_tokenize(pair[1]))
        test_edit.append(edit_score)
    for line in train_lines:
        pair = line.split('\t')
        j_score = nltk.jaccard_distance(set(nltk.word_tokenize(pair[0])),
                                        set(nltk.word_tokenize(pair[1])))
        train_jaccard.append(j_score)
        edit_score = nltk.edit_distance(nltk.word_tokenize(pair[0]),
                                        nltk.word_tokenize(pair[1]))
        train_edit.append(edit_score)
    global jaccard
    jaccard = test_jaccard + train_jaccard
    global edit
    edit = test_edit + train_edit
def match(csv, phrases):
    df = pd.read_csv(csv)
    indexes = []
    for item in df.iterrows():
        text = item[1]['comment']
        jaccards = float('inf')
        matched = ''
        tolerance = 0.0
        for phrase in phrases:
            dist = nltk.jaccard_distance(set(phrase), set(text))
            if dist < jaccards:
                # Length-based tolerance; the 0.8 factor can be tinkered with.
                tolerance = (abs(len(phrase) - len(text)) /
                             max([len(phrase), len(text)])) * 0.8
                matched = phrase
                jaccards = dist
        # Keep the row index if its best match falls within the tolerance.
        if jaccards < tolerance:
            indexes.append(item[0])
    return indexes
def content_distance_check(html1, html2):
    txt1 = extract_usable_text(html1)
    txt2 = extract_usable_text(html2)
    # Collapse whitespace runs before comparing.
    txt1 = re.sub(r'\s+', ' ', txt1)
    txt2 = re.sub(r'\s+', ' ', txt2)
    # Character-set Jaccard distance between the two extracted texts.
    distance = jaccard_distance(set(txt1), set(txt2))
    return distance
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    rcmd = []
    for entry in entries:
        spell_list = [spell for spell in correct_spellings
                      if spell.startswith(entry[0]) and len(spell) > 2]
        dist_list = [nltk.jaccard_distance(set(nltk.ngrams(entry, n=4)),
                                           set(nltk.ngrams(spell, n=4)))
                     for spell in spell_list]
        min_idx = dist_list.index(min(dist_list))
        rcmd.append(spell_list[min_idx])
    return rcmd
def jaccard_dist(test_code, train_code):
    tc = transform_to_ngram(test_code)
    trc = transform_to_ngram(train_code)
    try:
        return nltk.jaccard_distance(tc, trc)
    except Exception:
        # Dump the offending inputs before aborting.
        print(test_code)
        print(train_code)
        exit()
def jaccardDistance(text1, text2):
    # Jaccard distance over the sets of word tokens; by definition it only
    # considers unique tokens, so duplicates within a text do not matter.
    set1 = set(nltk.word_tokenize(text1))
    set2 = set(nltk.word_tokenize(text2))
    return nltk.jaccard_distance(set1, set2)
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    result = []
    import operator
    for entry in entries:
        spell_list = [spell for spell in correct_spellings
                      if spell.startswith(entry[0]) and len(spell) > 2]
        distance_list = [(spell, nltk.jaccard_distance(set(nltk.ngrams(entry, n=4)),
                                                       set(nltk.ngrams(spell, n=4))))
                         for spell in spell_list]
        # Sort by the distance (second tuple element) and keep the closest word.
        result.append(sorted(distance_list, key=operator.itemgetter(1))[0][0])
    return result
def compare_edge_labels(e_a_list, e_b_list):
    score = 0
    for ea in e_a_list:
        for eb in e_b_list:
            # score += GSA.compare_strings_glove(ea, eb)
            # score += max(0, (1 - nltk.edit_distance(ea, eb) / max(len(ea), len(eb))))
            # Jaccard distance with each character as a set element.
            score += max(0, (1 - nltk.jaccard_distance(set(ea), set(eb))))
            # score += max(0, (1 - nltk.jaccard_distance(set(ea.split()), set(eb.split()))))  # Jaccard distance with each word as a set element
            # score += GSA.compare_strings(ea, eb)
    return score / (1 + len(e_a_list))
def evaluate_individual_sentence(self, original_sentence, paraphrase) -> Dict:
    original_sentence_tokens = nltk.word_tokenize(
        normalize_spaces_remove_urls(original_sentence))
    paraphrase_tokens = nltk.word_tokenize(
        normalize_spaces_remove_urls(paraphrase))

    # BLEU score (sentence_bleu expects token lists, not raw strings).
    bleu_score = nltk.translate.bleu_score.sentence_bleu(
        [original_sentence_tokens], paraphrase_tokens)

    # Sentence-embedding cosine similarity
    emb1 = self.model.encode(original_sentence)
    emb2 = self.model.encode(paraphrase)
    cos_sim = util.pytorch_cos_sim(emb1, emb2)

    # Levenshtein distance over tokens, normalized by the longer sentence
    edit_distance = pylev.levenshtein(original_sentence_tokens, paraphrase_tokens)
    length = max(len(original_sentence_tokens), len(paraphrase_tokens))
    normalized_edit_distance = (length - edit_distance) / length

    # Jaccard distance over token sets
    jaccard = nltk.jaccard_distance(set(original_sentence_tokens),
                                    set(paraphrase_tokens))

    # Jaccard * cosine similarity
    jaccard_embedding_factor = jaccard * cos_sim.item()

    metrics = {
        'original_sentence': original_sentence,
        'paraphrase': paraphrase,
        'bleu_score': bleu_score,
        'normalized_original_sentence': normalize_spaces_remove_urls(original_sentence),
        'normalized_paraphrase': normalize_spaces_remove_urls(paraphrase),
        'embedding_cosine_similarity': cos_sim.item(),
        'edit_distance': edit_distance,
        'normalized_edit_distance': normalized_edit_distance,
        'jaccard': jaccard,
        'jaccard_embedding_factor': jaccard_embedding_factor
    }
    return metrics
def text_to_code(text: str) -> Tuple[str, float]:
    text = text.strip()
    print(text)
    # Jaccard distance between the text's word set and each predefined definition set.
    dists = [
        nltk.jaccard_distance(definition, set(text.split(" ")))
        for definition in definitions
    ]
    min_dist_index = np.argmin(dists)
    min_dist = dists[min_dist_index]
    code = codes[min_dist_index]
    return code, min_dist
def display_concept(i_txt_file, concept_chosen, beginning, filename):
    with open(os.path.join('Uploads', filename)) as csvfile:
        csvdata = csv.reader(csvfile)
        next(csvdata, None)  # skip the header row
        for row in csvdata:
            if row[12] == beginning and row[1] == i_txt_file:
                hof = str2bool(row[6])
                negated = str2bool(row[10])
                location = row[8]
                concept_to_display = row[3]
                range_txt_display = row[13]
                cosine_similarity_value = combined_cosine_similarity(
                    concept_to_display.lower(), range_txt_display.lower())
                jaccard_dist = nltk.jaccard_distance(set(concept_to_display),
                                                     set(range_txt_display))
    if request.method == 'POST':
        if (not request.form['correct'] or not request.form['location']
                or not request.form['hof'] or not request.form['negation']):
            flash('Please insert yes or no.')
            return redirect(request.url)
        else:
            correct_answ = request.form['correct']
            location_answ = request.form['location']
            hof_answ = request.form['hof']
            negation_answ = request.form['negation']
            db = get_db()
            db.execute(
                'INSERT INTO feedback (concept, negation, hof, location, correct_answ,'
                ' hof_answ, location_answ, negation_answ, cosine_sim, jaccard_dist)'
                ' VALUES (?,?,?,?,?,?,?,?,?,?)',
                (str(concept_chosen), int(negated), int(hof), str(location),
                 str2int(correct_answ), str2int(hof_answ), str2int(location_answ),
                 str2int(negation_answ), float(cosine_similarity_value),
                 float(jaccard_dist)))
            db.commit()
            return redirect(
                url_for('note.sentence',
                        i_txt_file=i_txt_file,
                        filename=filename,
                        conc=concept_chosen))
    return render_template('note/concept_display.html',
                           hof=hof,
                           negated=negated,
                           range_txt=range_txt_display,
                           location=location,
                           concept=concept_to_display,
                           filename=filename,
                           cosine=cosine_similarity_value,
                           jaccard=jaccard_dist,
                           i_txt_file=i_txt_file,
                           concept_chosen=concept_chosen)
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    result = []
    n = 4
    for entry in entries:
        words = [word for word in correct_spellings if word[0] == entry[0]]
        distances = [(nltk.jaccard_distance(set(nltk.ngrams(entry, n=n)),
                                            set(nltk.ngrams(match, n=n))), match)
                     for match in words]
        result.append(sorted(distances)[0][1])
    return result
def similitud():
    # Pairwise Jaccard similarity (1 - distance) over the character sets of all comments.
    for i in comments:
        first = i['text']
        corr_f = []
        vars.append([first, i['value'], get_color(i['value'])])
        print(first)
        for j in comments:
            second = j['text']
            jd = nltk.jaccard_distance(set(first), set(second))
            corr_f.append(round(1 - jd, 2))
        corr.append(corr_f)
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    spellings = []
    for entry in entries:
        entry_spellings = [x for x in correct_spellings if x[0] == entry[0]]
        jac_distance = [
            nltk.jaccard_distance(set(nltk.ngrams(entry, n=4)),
                                  set(nltk.ngrams(x, n=4)))
            for x in entry_spellings
        ]
        spellings.append(entry_spellings[np.argmin(jac_distance)])
    return spellings
def nearestNeighbourGenre(movie):
    closestMovie = None
    closestDistance = 1
    for m in movies.values():
        if m.title != movie.title:
            distance = nltk.jaccard_distance(movie.lemmas, m.lemmas)
            if distance < closestDistance:
                closestDistance = distance
                closestMovie = m
    if closestMovie is None:
        # No neighbour closer than the maximum distance of 1 was found.
        return None
    print(closestMovie.title)
    return closestMovie.genre
def save_distance_word_mapping(word_list, embeddings, target_path_edit, target_path_jaccard):
    print(len(word_list))
    # For every word, write its nearest embedding-vocabulary word by edit distance ...
    with open(target_path_edit, 'w', encoding='utf-8') as f:
        for word in word_list:
            distance_words = [(emb_word, nltk.edit_distance(emb_word, word))
                              for emb_word in embeddings if emb_word != word]
            new_word = min(distance_words, key=lambda t: t[1])
            f.write(str(word) + ' ' + str(new_word[0]) + ' ' + str(new_word[1]) + "\n")
    # ... and by Jaccard distance over character sets.
    with open(target_path_jaccard, 'w', encoding='utf-8') as g:
        for word in word_list:
            distance_words = [(emb_word, nltk.jaccard_distance(set(emb_word), set(word)))
                              for emb_word in embeddings if emb_word != word]
            new_word = min(distance_words, key=lambda t: t[1])
            g.write(str(word) + ' ' + str(new_word[0]) + ' ' + str(new_word[1]) + "\n")
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    outcomes = []
    for entry in entries:
        spellings = [w for w in correct_spellings if w.startswith(entry[0])]
        distances = ((nltk.jaccard_distance(set(nltk.ngrams(entry, n=4)),
                                            set(nltk.ngrams(word, n=4))), word)
                     for word in spellings)
        closest = min(distances)
        outcomes.append(closest[1])
    return outcomes
def spellchecker(misspelled_word='caaar', n_gram=1):
    # Candidate corrections share the first letter with the misspelled word.
    correct_words = [i for i in correct_spelling
                     if i.startswith(misspelled_word[0]) and len(i) > 1]
    min_distance = 1
    min_word = ''
    for word in correct_words:
        distance = nltk.jaccard_distance(set(nltk.ngrams(misspelled_word, n=n_gram)),
                                         set(nltk.ngrams(word, n=n_gram)))
        if distance < min_distance:
            min_distance = distance
            min_word = word
    return min_word
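# Assumed setup for spellchecker() above (note the singular `correct_spelling` name
# used by this snippet); illustrative only.
import nltk
from nltk.corpus import words

nltk.download("words")
correct_spelling = words.words()

print(spellchecker('caaar', n_gram=2))  # nearest dictionary word by bigram Jaccard distance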
def calculate_similarity(docs_list, similarity_type, threshold):
    '''Calculate vector similarity of all possible pairs in list'''
    results = list()
    counter = 0
    # Get all possible combinations of tweets that have the same NER
    all_combinations = list(combinations(docs_list, 2))
    # Filter handles, hashtags, emoticons, etc.
    for tweet_pair in all_combinations:
        tweet_pair[0].filter("*")
        tweet_pair[1].filter("*")
        # Filter out pairs with identical sentences
        if tweet_pair[0].clean_text != tweet_pair[1].clean_text:
            # Filter out sentences shorter than 4 words
            if tweet_pair[0].tweet_len() > 3 and tweet_pair[1].tweet_len() > 3:
                # Filter out combinations with excessive word-count differences
                if abs(tweet_pair[0].tweet_len() - tweet_pair[1].tweet_len()) < 4:
                    if similarity_type == "jaccard":
                        settext1 = tweet_pair[0].word_set()
                        settext2 = tweet_pair[1].word_set()
                        d = jaccard_distance(settext1, settext2)
                    elif similarity_type == "jaro_winkler":
                        d = 1 - distance.jaro_winkler_similarity(
                            tweet_pair[0].clean_text, tweet_pair[1].clean_text)
                    elif similarity_type == "levenshtein":
                        d = damerau_levenshtein_distance(
                            tweet_pair[0].clean_text, tweet_pair[1].clean_text)
                    # Only keep pairs whose distance falls below the threshold
                    if d < threshold:
                        # Put the tweet with more OOV words on the source side and
                        # apply the extra target filter to the other one
                        if tweet_pair[0].oov_words() > tweet_pair[1].oov_words():
                            bi_combination = (tweet_pair[0].source_filter(),
                                              tweet_pair[1].target_filter())
                        else:
                            bi_combination = (tweet_pair[1].source_filter(),
                                              tweet_pair[0].target_filter())
                        if bi_combination not in results:
                            results.append(bi_combination)
                            counter += 1
                            sys.stdout.write("\rAdding combinations...")
                            sys.stdout.flush()
    return results
def jaccard_similarity():
    for community_key in community_users_hashtags_dict.keys():
        for community_key_inside in community_users_hashtags_dict.keys():
            try:
                print("==============================================")
                print(community_key, community_key_inside)
                # Jaccard distance between the trigram sets of the two communities' hashtag lists
                jd_sent_1_2 = nltk.jaccard_distance(
                    set(nltk.ngrams(community_users_hashtags_dict[community_key].hashtags_list, n=3)),
                    set(nltk.ngrams(community_users_hashtags_dict[community_key_inside].hashtags_list, n=3)))
                print(jd_sent_1_2)
            except Exception as ex:
                print("General Exception:", ex)
def jacc_trigramTOKEN(tupla1, tupla2):
    # Jaccard similarity (1 - distance) over word-token trigrams of the two tuples.
    sent1 = concatenate_list_data(tupla1)
    sent2 = concatenate_list_data(tupla2)
    tokens1 = nltk.word_tokenize(sent1)
    tokens2 = nltk.word_tokenize(sent2)
    ng1_tokens = set(nltk.ngrams(tokens1, n=3))
    ng2_tokens = set(nltk.ngrams(tokens2, n=3))
    jd_sent_1_2 = 1 - nltk.jaccard_distance(ng1_tokens, ng2_tokens)
    vector = [jd_sent_1_2]
    return vector
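# Standalone sketch of the same token-trigram measure (assumed inputs; it skips the
# project's concatenate_list_data helper and works on plain strings instead).
import nltk  # word_tokenize also needs the 'punkt' data: nltk.download('punkt')

s1 = "the quick brown fox jumps over the lazy dog"
s2 = "the quick brown fox leaps over the lazy dog"
ng1 = set(nltk.ngrams(nltk.word_tokenize(s1), n=3))
ng2 = set(nltk.ngrams(nltk.word_tokenize(s2), n=3))
print(1 - nltk.jaccard_distance(ng1, ng2))  # shared trigrams / all trigrams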
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    results = dict()
    for mistake in entries:
        words_to_check = [i for i in correct_spellings if i[0] == mistake[0]]
        results[mistake] = [
            (nltk.jaccard_distance(set(nltk.ngrams(mistake, n=4)),
                                   set(nltk.ngrams(word, n=4))), word)
            for word in words_to_check
        ]
    return [sorted(results[entry])[0][1] for entry in entries]
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    recommend = []
    for entry in entries:
        # Match first letter: input_spell contains all words in correct_spellings
        # with the same first letter as the entry.
        input_spell = [x for x in correct_spellings if x[0] == entry[0]]
        # Jaccard distance between the entry word and every candidate, over character 4-grams.
        jaccard_dist = [nltk.jaccard_distance(set(nltk.ngrams(entry, n=4)),
                                              set(nltk.ngrams(x, n=4)))
                        for x in input_spell]
        # Recommend the word in input_spell with the minimum Jaccard distance.
        recommend.append(input_spell[np.argmin(jaccard_dist)])
    return recommend
""" Tokenizes string s, removes stopwords, and returns a set of k-shingles """ s, k, stopwords = args return kshinglize(s, k, stopwords) def kshinglize(s, k=KSHINGLES, stopwords=STOPWORDS): """ Tokenizes string s, removes stopwords, and returns a set of k-shingles """ s = s.strip().lower() tokens_raw = twokenize.tokenize(s) tokens = filterstopwords(tokens_raw, stopwords) return tokens_to_kshingles(tokens, k) def _calculate_distance(((eid_a, shingles_a), (eid_b, shingles_b))): if (shingles_a and shingles_b): jd = nltk.jaccard_distance(shingles_a, shingles_b) else: # One of the elements has no shingles jd = 1.0 return ((eid_a, eid_b), jd) def build_distance_table(kshingles): """ Create a hash table of the Jaccard distance between all elements in the dict kshingles. kshingles = { id : set(shingle1, shingle2, ...) } """ distance = ReversibleKeyDict() with closing(multiprocessing.Pool()) as pool: for (eid_a, eid_b), jd in pool.imap_unordered(_calculate_distance, itertools.combinations(kshingles.iteritems(), 2), 250): distance[eid_a, eid_b] = jd