def mergeItemsDiscretes(item1, item2): diff_price = feature_diffPrice(item1['price'], item2['price']) diff_price = discretizaDiffPrice(diff_price) jw_title = jf.jaro_winkler(item1['title'], item2['title']) jw_desc = jf.jaro_winkler(item1['description'], item2['description']) simi_json,eq_keys_json = feature_attrJson(item1, item2) diff_latlon = feature_diffLatLon(item1['lon'], item1['lat'], item2['lon'], item2['lat']) metroID1 = '0' if(item1['metroID'] != ''): metroID1 = item1['metroID'] metroID2 = '0' if(item2['metroID'] != ''): metroID2 = item2['metroID'] json = { 'x': [ int(item1['itemID'] == item2['itemID']), int(item1['categoryID']), int(item2['categoryID']), int(item1['categoryID'] == item2['categoryID']), int(item1['metroID'] == ''), int(item2['metroID'] == ''), int(round(float(metroID1))), int(round(float(metroID2))), int(item1['metroID'] == item2['metroID']), int(item1['locationID'] == ''), int(item2['locationID'] == ''), int(item1['locationID']), int(item2['locationID']), int(item1['locationID'] == item2['locationID']), int(item1['price'] == item2['price']), int(item1['price'] == ''), int(item2['price'] == ''), int(item1['price'] == '1.0'), int(item2['price'] == '1.0'), diff_price, round(jw_title * 10), round(jw_desc * 10), int(item1['attrsJSON'] == ''), int(item2['attrsJSON'] == ''), round(round(simi_json * 100)/10), round(round(eq_keys_json * 100)/10), round(((1/diff_latlon)/1000)) ] } return json
def get_closest_jaro_winkler(needle, haystack):
    closest = None
    for x in haystack:
        candidate = (x, jellyfish.jaro_winkler(needle, x))
        if closest is None or candidate[1] > closest[1]:
            closest = candidate
    if closest is None:
        return None
    return closest[0]
def test_jellyfish(): text1 = 'Телефон в хорошем состоянии, трещин и сколов нет, за все время менялся только аккумулятор(поэтому заряд держит хорошо), остальное все родное, в целом работает отлично! В комплекте кабель. Обмен не интересен.' text2 = 'Продам телефон в хорошем состоянии Полностью рабочий есть WiFi' lst1 = normalize(text1) lst2 = normalize(text2) text_norm1 = ' '.join(lst1) text_norm2 = ' '.join(lst2) print(jellyfish.jaro_distance(text1, text2)) print(jellyfish.jaro_distance(text_norm1, text_norm2)) print(jellyfish.jaro_winkler(text1, text2)) print(jellyfish.jaro_winkler(text_norm1, text_norm2)) print(jellyfish.nysiis(text1)) print(jellyfish.nysiis(text2)) exit()
def score(self,s,t): ''' Returns the similarity score ''' similar = namedtuple('Similar',['r1','r2','sim']) similarity=[] tfidfdict = self.builddict() for i,ti in enumerate(s.split(" ")): for j,tj in enumerate(t.split(" ")): dist = jf.jaro_winkler(ti,tj) if dist >= THRESHOLD: similarity.append(similar(i,j, dist*tfidfdict.get(ti)* tfidfdict.get(tj))) similarity.sort(reverse=True,key=lambda x:x.sim) sused = np.array([False]*len(s),dtype=bool) tused = np.array([False]*len(t),dtype=bool) #check that the term are counted only once sim = 0.0 for s in similarity: if(sused[s.r1] | tused[s.r2]): continue; sim+=s.sim sused[s.r1] = True tused[s.r2] = True return sim
def alldist(filex, filey): xread = open(filex, 'r').read() yread = open(filey, 'r').read() lvd = jellyfish.levenshtein_distance(xread,yread) dlvd= jellyfish.damerau_levenshtein_distance(xread,yread) spsum = spamsum.match(xread,yread) spsum = 100 - spsum spsum = float(spsum/100.00) # print lvd res = float( lvd / 100.00 ) dres= float(dlvd / 100.00 ) # print res # print "Levenshtein Distance=",res jaro = jellyfish.jaro_distance(xread,yread) ## Added jaro-winkler distance by fahim 20111011 jarowink = jellyfish.jaro_winkler(xread,yread) jaro = 1.0 - jaro jarowink = 1.0 - jarowink # print "Jaro Distance = ",jaro ham = jellyfish.hamming_distance(xread,yread) ham = float ( ham / 100.00) print "Hamming Distance = ", ham # print "KL-divergence between d1 and d2:", kldiv(tokenize(d1), tokenize(d2)) # print "KL-divergence between d2 and d1:", kldiv(tokenize(d2), tokenize(d1)) # print "Spamsum Match score: ", spsum kl = kldiv(tokenize(xread), tokenize(yread)) return res, dres , jaro, jarowink, ham, kl, spsum
def jaro_winkler_similarity(s, t):
    """ Jaro-Winkler Similarity """
    jw_sim = jellyfish.jaro_winkler(s, t)
    return jw_sim
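# Note (added, hedged): newer jellyfish releases rename jaro_winkler to
# jaro_winkler_similarity and eventually drop the legacy name; the exact
# version boundary is an assumption here, so treat this as a compatibility
# sketch rather than the library's documented upgrade path.
import jellyfish

try:
    _jaro_winkler = jellyfish.jaro_winkler_similarity
except AttributeError:  # older jellyfish still exposes only the legacy name
    _jaro_winkler = jellyfish.jaro_winkler


def jaro_winkler_compat(s, t):
    """Version-agnostic Jaro-Winkler similarity in [0, 1]."""
    return _jaro_winkler(s, t)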
def find_string_similarity(first_str, second_str, normalized=False, ignore_list=[]): """ Calculates matching ratio between two strings Args: first_str (str) : First String second_str (str) : Second String normalized (bool) : if True ,method removes special characters and extra whitespace from strings then calculates matching ratio ignore_list (list) : list has some characters which has to be substituted with "" in string Returns: Float Value : Returns a matching ratio between 1.0 ( most matching ) and 0.0 ( not matching ) using difflib's SequenceMatcher and and jellyfish's jaro_winkler algorithms with equal weightage to each Examples: >>> find_string_similarity("hello world","Hello,World!",normalized=True) 1.0 >>> find_string_similarity("entrepreneurship","entreprenaurship") 0.95625 >>> find_string_similarity("Taj-Mahal","The Taj Mahal",normalized= True,ignore_list=["the","of"]) 1.0 """ first_str = process_str_for_similarity_cmp(first_str, normalized=normalized, ignore_list=ignore_list) second_str = process_str_for_similarity_cmp(second_str, normalized=normalized, ignore_list=ignore_list) match_ratio = (difflib.SequenceMatcher(None, first_str, second_str).ratio() + jellyfish.jaro_winkler(unicode(first_str), unicode(second_str)))/2.0 return match_ratio
def row_stats(in_row):
    """Compute additional stats for each row"""
    out_row = {}
    index, row = in_row
    out_row['index'] = index
    out_row['positives'] = row.count()
    out_row['distincts'] = row.dropna().unique().size
    out_row['max'] = row.value_counts().max()
    labels = row.dropna()
    if labels.size < 2:
        # not enough labels for pairwise comparison
        out_row['resemblance'] = np.nan
        out_row['resemblance_min'] = np.nan
        out_row['resemblance_max'] = np.nan
    else:
        similarities = []
        for l1, l2 in itertools.combinations(labels, 2):
            s = jellyfish.jaro_winkler(l1, l2)
            similarities.append(s)
        out_row['resemblance'] = np.mean(similarities)
        out_row['resemblance_min'] = np.min(similarities)
        out_row['resemblance_max'] = np.max(similarities)
    return out_row
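# Usage sketch (added, hedged): row_stats expects the (index, Series) pairs
# produced by DataFrame.iterrows(); the frame and column names below are
# made up for illustration, and row_stats above is assumed to be in scope.
import itertools

import jellyfish
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"ann1": ["apple", "pear"],
     "ann2": ["aple", "pear"],
     "ann3": [None, "peer"]},
    index=["e1", "e2"],
)
stats = [row_stats(item) for item in df.iterrows()]
print(pd.DataFrame(stats))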
def score(s, t):
    similar = namedtuple('Similar', ['r1', 'r2', 'sim'])
    similarity = []
    for i, ti in enumerate(s.split(" ")):
        for j, tj in enumerate(t.split(" ")):
            dist = jf.jaro_winkler(ti, tj)
            if dist >= THRESHOLD:
                similarity.append(similar(i, j, dist * tfidfdict[ti] * tfidfdict[tj]))
    similarity.sort(reverse=True, key=lambda x: x.sim)
    sused = np.array([False] * len(s), dtype=bool)
    tused = np.array([False] * len(t), dtype=bool)
    # count each term only once
    sim = 0.0
    for pair in similarity:
        if sused[pair.r1] or tused[pair.r2]:
            continue
        sim += pair.sim
        sused[pair.r1] = True
        tused[pair.r2] = True
    return sim
def best_match(s, categories, top_n=5): """Return the top N best matches from your categories with the best match in the 0th position of the return list. Usage: >>> best_match('ilinois', ['Michigan', 'Ohio', 'Illinois'], 2) [('Illinois', 96), ('Michigan', 22)] :param s: str value to find best match :param categories: list values to compare against :param top_n: number of matches to return :returns: list of tuples (guess, percentage) """ scores = [] for cat in categories: scores.append((cat, jellyfish.jaro_winkler( s.encode('ascii', 'replace').upper(), cat.encode('ascii', 'replace').upper() ))) scores = sorted(scores, key=lambda x: x[1]) scores = scores[-top_n:] scores = [(score[0], int(score[1] * 100)) for score in scores] scores.reverse() return scores
def get_jaro_winkler_avg(row1, row2):
    total = 0
    for columnIndex in xrange(1, 15):
        a = row1[columnIndex]
        b = row2[columnIndex]
        total += jellyfish.jaro_winkler(a, b)
    return total / 14.0
def best_match(s, categories, top_n=5): """ Return the top N best matches from your categories with the best match in the 0th position of the return list. The comparison does not check the first element of the category name, only the second element. Usage: >>> best_match('illinois', [ ('_', 'Michigan'), ('_', 'Ohio', ('_', 'Illinois') ], 2) [('Illinois', 96), ('Michigan', 22)] Args: s: str value to find best match categories: list of tuples to compare against. needs to be [('table1', 'value1'), ('table2', 'value2')] top_n: number of matches to return Returns: list of tuples (table, guess, percentage) """ # print 'starting match on {}'.format(s) scores = [] for cat in categories: # verify that the category has two elements, if not, then just # return _ for the first category. Need this because fuzzy_in_set uses the # same method table_name = '_' category = None if isinstance(cat, tuple): table_name = cat[0] category = cat[1] else: category = cat scores.append( ( table_name, category, jellyfish.jaro_winkler( s.encode('ascii', 'replace').lower(), category.encode('ascii', 'replace').lower() ) ) ) # sort first by the ones # print 'all scores for {} are {}'.format(s, scores) scores = sorted(scores, cmp=sort_scores) # take the top n number of matches scores = scores[:top_n] # convert to hundreds scores = [(score[0], score[1], int(score[2] * 100)) for score in scores] # print 'ending all categories match of {} with scores {}'.format(s, scores) return scores
def is_same_label(label1, label2): # noop cannot be same with non-noop if (label1 == "noop" and label2 != "noop") or (label1 != "noop" and label2 == "noop"): return [False, "", "noop", 0] if label1 == label2: return [True, label1, "identical", 1] l1 = label1.lower() l2 = label2.lower() if l1 == l2: return [True, l1, "case", 1] fl1 = get_filtered_label(l1) fl2 = get_filtered_label(l2) if get_filtered_label(l1) == get_filtered_label(l2): return [True, get_filtered_label(l1), "stopword", 1] # TODO: string comparison, sentence analysis # import difflib # sim_score = difflib.SequenceMatcher(None, fl1,fl2).ratio() import jellyfish sim_score = jellyfish.jaro_winkler(fl1, fl2) if sim_score >= SIM_THRESHOLD: # print "fuzzy", sim_score, # print "[fuzzy]", sim_score, l1, "===", l2 # fl1, "===", fl2, "|||", final_label = max([l1, l2], key=len) return [True, final_label, "sim", sim_score] # print "diff", sim_score, l1, "===", l2 return [False, "", "", sim_score]
def build_uid_hash(self): for user in self.users: self.uid_hash[user.id] = [user] for i, user_i in enumerate(self.users): if user_i.id in self.marked: continue for j, user_j in enumerate(self.users): if j < i + 1: continue cn_i = user_i.data['cn'] cn_j = user_j.data['cn'] distance = jaro_winkler(cn_i, cn_j) if distance >= DuplicatesKiller.RESEMBLANCE_CUANTUM: self.uid_hash[user_i.id].append(user_j) self.marked.add(user_j.id) for k, v in self.uid_hash.items(): # at least one resemblance for current object if v.__len__() >= 2: self.groups.append(v) return self.groups
def organize_results(self): self.structured_references = defaultdict(lambda: defaultdict(list)) self.tagContradictions = [] self.notReferenced = [] #print self.scrape_results for category in self.scrape_results: for file_name in self.scrape_results[category]: for tag in self.scrape_results[category][file_name]: finds = self.scrape_results[category][file_name][tag][0] known_files = [] if len(self.scrape_results[category][file_name][tag]) > 1: known_files = self.scrape_results[category][file_name][tag][1:] referenced = False for find in finds: idtag, title = find[0], find[1] print idtag, title for tag2 in self.structured_references: for id in self.structured_references[tag2]: if id in idtag or idtag in id: referenced = True if len(self.structured_references[tag2][id]) == 0: if title.strip() != '': self.structured_references[tag2][id] = [title, {title: [file_name]}] elif title in self.structured_references[tag2][id][1] or max( [jf.jaro_winkler(unicode(title), unicode(t)) for t in self.structured_references[tag2][id][1]]) > 0.85: if title.strip() != '': self.structured_references[tag2][id][1][title].append(file_name) else: if title.strip() != '': self.structured_references[tag2][id][1][title] = [file_name] else: if len(self.structured_references[tag2][id]) != 0: if title in self.structured_references[tag2][id][1] or max( [jf.jaro_winkler(unicode(title), unicode(t)) for t in self.structured_references[tag2][id][1]]) > 0.85: if idtag.strip() == '' and title.strip() != '': referenced = True self.structured_references[tag2][id][1][title].append(file_name) elif title.strip() != '': self.tagContradictions.append( [file_name, self.scrape_results[category][file_name], tag2, id]) if not referenced: self.notReferenced.append([file_name, self.scrape_results[category][file_name]]) print len(self.structured_references) self.vote_and_restructure() self.flip_references()
def mc_is_close_match(self):
    """True if the given candidate is a close match only missing a
    word like Inc or Ltd
    """
    stripped = list(
        self.kb.common_stripped.get(self.mc['candidate'], ['']))[0]
    if jf.jaro_winkler(self.mention_text, stripped) > 0.95:
        return True
    return False
def test_jaro_winkler(self):
    cases = [("dixon", "dicksonx", 0.8133),
             ("dicksonx", "dixon", 0.8133),
             ("martha", "marhta", 0.9611),
             ("dwayne", "duane", 0.84)]
    for (s1, s2, value) in cases:
        actual = jellyfish.jaro_winkler(s1, s2)
        self.assertAlmostEqual(actual, value, places=4)
def jaro_winkler_apply(x):
    try:
        return jellyfish.jaro_winkler(x[0], x[1])
    except Exception as err:
        if pandas.isnull(x[0]) or pandas.isnull(x[1]):
            return np.nan
        else:
            raise err
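# Usage sketch (added, hedged): jaro_winkler_apply is written for row-wise
# pandas application; the frame and column names below are hypothetical,
# and the helper above is assumed to be in scope. Missing values come back
# as NaN instead of raising.
import jellyfish
import numpy as np
import pandas

pairs = pandas.DataFrame({
    "name_a": ["martha", "dixon", None],
    "name_b": ["marhta", "dicksonx", "smith"],
})
pairs["jw"] = pairs.apply(
    lambda row: jaro_winkler_apply((row["name_a"], row["name_b"])), axis=1)
print(pairs)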
def cluster(df):
    df1 = df[['nname', 'nzip', 'snum']].reset_index()
    dfm = df1.merge(df1, on='nzip')
    # dfm = dfm[dfm.index_x != dfm.index_y]
    dfm['g1'] = dfm[['nname_x', 'nname_y']].apply(
        lambda x: 1 if jellyfish.jaro_winkler(x[0], x[1]) > 0.8 else 0, axis=1)
    dfm['g2'] = dfm[['snum_x', 'snum_y']].apply(
        lambda x: 1 if x[0] == x[1] else 0, axis=1)
    dfc = clusterdf(dfm)
    dd = dfc.groupby(['cluster_id', 'index_x']).first().reset_index()
    ddd = dfc.groupby('cluster_id').first()
    return df.ix[ddd.index_x]
def best_match(s, categories, top_n=5):
    """Return the top N best matches from your categories,
    best match first."""
    scores = []
    for cat in categories:
        scores.append((cat, jellyfish.jaro_winkler(s.upper(), cat.upper())))
    scores = sorted(scores, key=lambda x: x[1])
    scores = scores[-top_n:]
    scores = [(score[0], int(score[1] * 100)) for score in scores]
    scores.reverse()
    return scores
def measure_mrn_similarity(ssn1, ssn2, sign):
    if ssn1 == "" or ssn2 == "" or ssn1 is None or ssn2 is None:
        return 0
    r1 = jellyfish.jaro_winkler(ssn1, ssn2)
    r2 = 1 - jellyfish.hamming_distance(ssn1, ssn2) / len(ssn1)
    if sign == "t":
        print("jw-{} vs hd-{}".format(r1, r2))
    elif sign == "w":
        return max(r1, r2)
def fuzzy_match_with_ref_list(nouns, skill_list):
    possible_skills = set()
    for noun in nouns:
        max_score = 0.00
        ref_skill = None
        for skill in skill_list:
            jaro_score = jellyfish.jaro_winkler(skill, noun)
            if jaro_score > max_score:
                max_score = jaro_score
                ref_skill = skill
        if max_score >= 0.88:
            possible_skills.add(ref_skill)
    return possible_skills
def searchstratergy1(record1, record2):
    '''
    Input: Two strings
    Output: The similarity score if it is above the threshold
    Example: Jaro-Winkler with threshold 0.9
    '''
    score = j.jaro_winkler(record1, record2)
    if score >= JARO_THRESHOLD:
        return score
    else:
        return 0
def jaro_match(tup):
    import jellyfish
    birth = tup[0]
    death = tup[1]
    birth_name = birth.first_name + ' ' + birth.last_name
    death_name = death.first_name + ' ' + death.last_name
    if jellyfish.jaro_winkler(birth_name, death_name) > JARO_THRESH:
        return True
    return False
def match_title(anidb_title, absolute_titles):
    max_simi = 0
    for title in absolute_titles:
        simi = jellyfish.jaro_winkler(anidb_title, title.encode('utf-8'))
        if simi > 0.9 and simi > max_simi:
            max_simi = simi
    return max_simi
def person_similarity(p1, p2, year_window, parent_sims=False,
                      people_dict1=None, people_dict2=None):
    # Don't match inaccurate and accurate dates (unrealistic)
    if (p1.byear is not None) != (p2.byear is not None):
        return 0
    # Don't match if dates are too different
    if (p1.byear is not None) and (p2.byear is not None) and \
            abs(p1.byear - p2.byear) > year_window:
        return 0
    terms = 0
    sim_sum = 0
    sim_sum += jellyfish.jaro_winkler(p1.clean_first_name, p2.clean_first_name)
    terms += 1
    sim_sum += jellyfish.jaro_winkler(p1.clean_last_name, p2.clean_last_name)
    terms += 1
    sim = sim_sum / float(terms)
    if parent_sims and people_dict1 is not None and people_dict2 is not None:
        parent_factor = 0.5
        n_parents = 0
        recursive = False
        if p1.dad is not None and p2.dad is not None:
            sim += parent_factor * person_similarity(
                people_dict1[p1.dad], people_dict2[p2.dad], year_window,
                recursive, people_dict1, people_dict2)
            n_parents += 1
        if p1.mom is not None and p2.mom is not None:
            sim += parent_factor * person_similarity(
                people_dict1[p1.mom], people_dict2[p2.mom], year_window,
                recursive, people_dict1, people_dict2)
            n_parents += 1
        sim /= float(1 + parent_factor * n_parents)
    return sim
def commit_localization(graph):
    """Computes the relative number of directories modified by a commit."""
    result = []
    for commit in graph.iterate_commits():
        paths = map(os.path.dirname, graph.commit_files[commit])
        if len(paths) <= 1:
            result.append(len(paths))
            continue
        similarity_scores = []
        for pair_of_files in itertools.combinations(paths, 2):
            distance = jellyfish.jaro_winkler(*pair_of_files)
            similarity_scores.append(distance)
        result.append(float(sum(similarity_scores)) / max(len(similarity_scores), 1))
    return result
def addressMatches(self, businessAddress, addressTerms):
    print "For %s, comparing %s to %s..." % (self.NICKNAME, businessAddress, addressTerms)
    # Compare the City, State and Zip first.. if these don't match, then exit.
    for addressTerm in addressTerms[1:]:
        if addressTerm not in businessAddress[1:]:
            return False
    # use the Jaro-Winkler distance algo on the street address
    jarowinkdist = jellyfish.jaro_winkler(businessAddress[0], addressTerms[0])
    if jarowinkdist > 0.80:
        print "Match!"
        return True
    return False
def mergeItemsGeneral(item1, item2): diff_price = feature_diffPrice(item1['price'], item2['price']) diff_latlon = feature_diffLatLon(item1['lon'], item1['lat'], item2['lon'], item2['lat']) simi_json,eq_keys_json = feature_attrJson(item1, item2) json = { 'x': [ int(item1['categoryID'] == item2['categoryID']), int(item1['locationID'] == item2['locationID']), int(item1['metroID'] == item2['metroID']), int(item1['metroID'] == ''), int(item2['metroID'] == ''), int(item1['price'] == item2['price']), int(item1['price'] == ''), int(item2['price'] == ''), int(item1['price'] == '1.0'), int(item2['price'] == '1.0'), diff_price, jf.jaro_winkler(item1['title'], item2['title']), jf.jaro_winkler(item1['description'], item2['description']), int(item1['attrsJSON'] == ''), int(item2['attrsJSON'] == ''), diff_latlon, simi_json, eq_keys_json ] } # no for seguinte, estou assumindo que os dois itens tem a mesma categoria sempre! for cat in categories.keys(): if(int(item1['categoryID']) == cat): json['x'].append(1) else: json['x'].append(0) return json
def score(self, s, t):
    '''
    Input: s - multi-word string
           t - multi-word string
    Output: score
    Note: for single-word strings, score = Jaro-Winkler score
    '''
    cummax = 0
    for ws in s.split(" "):
        # best Jaro-Winkler match for this word of s against all words of t
        maxscore = 0
        for wt in t.split(" "):
            maxscore = max(maxscore, j.jaro_winkler(ws, wt))
        cummax += maxscore
    return cummax / len(s.split(" "))
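# Standalone sketch (added, hedged): the same per-word best-match averaging
# (a Monge-Elkan style score) without the class context; the function name
# monge_elkan is mine, not from the source above.
import jellyfish


def monge_elkan(s, t):
    # Average, over the words of s, of the best Jaro-Winkler match in t.
    words_t = t.split(" ")
    best_per_word = [
        max(jellyfish.jaro_winkler(ws, wt) for wt in words_t)
        for ws in s.split(" ")
    ]
    return sum(best_per_word) / len(best_per_word)


print(monge_elkan("john r smith", "jon smith"))   # close to 1.0
print(monge_elkan("john r smith", "mary jones"))  # noticeably lower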
def are_strings_similar(string_a, string_b):
    d = jellyfish.jaro_winkler(string_a, string_b)
    return d >= 0.9
def social_graph_creation(G, dataframe): import numpy as np actor_tot_list = [] actor_buffer_list = [ ] # List used to check if an article is a perfect replica of the previous one for actor_list, theme_list in zip(dataframe.V2ENHANCEDPERSONS.unique(), dataframe.V2ENHANCEDTHEMES.unique()): actor_temp_list, offset_temp_list = [], [] if not isinstance(actor_list, float): max_offset_diff = maximum_offset_difference(actor_list, theme_list) for actor in actor_list.split(';'): [actor_temp, offset_temp] = actor.split(',') if offset_temp not in offset_temp_list: offset_temp_list.append(offset_temp) # Compute similarity between actor_temp and all actors in the tot_list if actor_tot_list: similarity_max = np.max([ jellyfish.jaro_winkler(actor_temp, actor2) for actor2 in actor_tot_list ]) index_max = np.argmax([ jellyfish.jaro_winkler(actor_temp, actor2) for actor2 in actor_tot_list ]) actor_max = actor_tot_list[index_max] nb_identical_names = len( set(actor_temp.split(' ')) & set(actor_max.split(' '))) else: similarity_max = 0 nb_identical_names = 0 # Condition to correct the name if there is a misdetected 'A' if actor_temp[0:2] == 'A ': actor_temp = actor_temp[2:] if 'Kanzler Joseph' in actor_temp: actor_max = 'Youssef Chahed' similarity_max, nb_identical_names = 1, 1 if similarity_max > 0.7 and nb_identical_names > 0: # This actor is already present in the list actor_temp = actor_max else: actor_tot_list.append(actor_temp) G.add_node(actor_temp) if actor_temp not in actor_temp_list: actor_temp_list.append(actor_temp) if actor_temp_list != actor_buffer_list: actor_buffer_list = actor_temp_list nb_actors = len(actor_temp_list) #print("Actor list: ", nb_actors, actor_temp_list) # Edge creation between the actors of the article for index1 in range(0, len(actor_temp_list)): actor1 = actor_temp_list[index1] offset1 = int(offset_temp_list[index1]) for index2 in range(index1 + 1, len(actor_temp_list)): actor2 = actor_temp_list[index2] offset2 = int(offset_temp_list[index2]) weight_edge = np.abs(offset2 - offset1) / ( max_offset_diff * nb_actors) #print("Weight: ", weight_edge) if G.has_edge(actor1, actor2): G[actor1][actor2]['weight'] += weight_edge else: G.add_edge(actor1, actor2, weight=weight_edge)
def fuzzy_value_scoring(values_list1, values_list2): """ string pairwise matcher NB only best matches are taken this is not all by all gets fuzzy pair match based on jarowinkler returns dict with mean, stc and 0.9 qualtile for jarowinkler, damerau levenshtein and hamming distances If the number of values is too long (>1000) the most frequently used values are taken as best representatives. This is to make computation doable. """ if len(values_list1) > 0 and len(values_list2) > 0: if len(values_list1) > 1000 or len(values_list2) > 1000: if len(values_list1) > 1000: x = value_info.get(facet1) value_df = pd.DataFrame(columns=['frequency']).from_dict( x, orient='index').reset_index().rename(columns={ "index": "value", 0: "frequency" }).sort_values(['frequency'], ascending=False).head(n=1000) values_list1 = value_df['value'].tolist() if len(values_list2) > 1000: x = value_info.get(facet2) value_df = pd.DataFrame(columns=['frequency']).from_dict( x, orient='index').reset_index().rename(columns={ "index": "value", 0: "frequency" }).sort_values(['frequency'], ascending=False).head(n=1000) values_list2 = value_df['value'].tolist() if len(values_list1) > len(values_list2): short_list = values_list2 long_list = values_list1 else: short_list = values_list1 long_list = values_list2 # calculate the best fuzzy matches best_match_list = [] for value1 in short_list: jaro_distance_list = [] for value2 in long_list: try: damerau_levenshtein_distance = jellyfish.damerau_levenshtein_distance( value1, value2) except ValueError: damerau_levenshtein_distance = py_jellyfish.damerau_levenshtein_distance( value1, value2) jaro_winkler = jellyfish.jaro_winkler(value1, value2) hamming_distance = jellyfish.hamming_distance(value1, value2) jaro_tuple = (value1, value2, jaro_winkler, damerau_levenshtein_distance, hamming_distance) jaro_distance_list.append(jaro_tuple) best_match = max(jaro_distance_list, key=lambda x: x[2]) best_match_list.append(best_match) df = pd.DataFrame(best_match_list, columns=[ 'facet1', 'facet2', 'jaro_distance', 'damerau_levenshtein_distance', 'hamming_distance' ]) jaro_distance_quant = df['jaro_distance'].quantile(0.9) jaro_distance_mean = df['jaro_distance'].mean() jaro_distance_std = df['jaro_distance'].std() damerau_levenshtein_distance_quant = df[ 'damerau_levenshtein_distance'].quantile(0.9) damerau_levenshtein_distance_mean = df[ 'damerau_levenshtein_distance'].mean() damerau_levenshtein_distance_std = df[ 'damerau_levenshtein_distance'].std() hamming_distance_quant = df['hamming_distance'].quantile(0.9) hamming_distance_mean = df['hamming_distance'].mean() hamming_distance_std = df['hamming_distance'].std() results = { 'jaro_distance_quant': jaro_distance_quant, 'jaro_distance_mean': jaro_distance_mean, 'jaro_distance_std': jaro_distance_std, 'damerau_levenshtein_distance_quant': damerau_levenshtein_distance_quant, 'damerau_levenshtein_distance_mean': damerau_levenshtein_distance_mean, 'damerau_levenshtein_distance_std': damerau_levenshtein_distance_std, 'hamming_distance_quant': hamming_distance_quant, 'hamming_distance_mean': hamming_distance_mean, 'hamming_distance_std': hamming_distance_std } # so a good match will be a high mean, low std. The quantile is prob better than mean. return results else: # 'N.A.' returned if one or both of the facets dont have any values. 
results = {'jaro_distance_quant':'N.A.', \ 'jaro_distance_mean':'N.A.', \ 'jaro_distance_std':'N.A.', \ 'damerau_levenshtein_distance_quant':'N.A.', \ 'damerau_levenshtein_distance_mean':'N.A.', \ 'damerau_levenshtein_distance_std':'N.A.', \ 'hamming_distance_quant':'N.A.', \ 'hamming_distance_mean':'N.A.', \ 'hamming_distance_std':'N.A.'} return results
def score(self, s, t): """ Returns the soft tf-idf similarity """ # Check to see whether a model exists; otherwise default to degenerate solution if (self.LOG_IDF is None) | (self.CORPUS_VOCAB is None) | (self.OOV_IDF_VAL is None): self.logger.info( "Either (or both) IDF or corpus vocabulary parameters not given " + "Defaulting to degenerate mode where corpus consists only of the " + "two strings given as input.") self.compute_query_idf([s, t]) # Get V(w,S) and V(w,T) (along with vocab lists for s and t) try: (s_vocab, vprime_ws, vprime_ws_norm) = self.compute_VwS(s) (t_vocab, vprime_wt, vprime_wt_norm) = self.compute_VwS(t) except ValueError: self.logger.info("string got stop-listed; most likely b/c " \ "it is of length 1, with the only character being a " \ "non-normalized punctuation mark. (i.e. '.')") sim = 0.0 return sim #compute D(w,T) for all w max_vT = dict() jw_sims = dict() for w in s_vocab: max_vT[w] = dict() max_vT[w]['score'] = 0.0 max_vT[w]['max_v'] = '' jw_sims[w] = dict() for v in t_vocab: dist = jf.jaro_winkler(w, v) jw_sims[w][v] = dist if (dist >= max_vT[w]['score']): max_vT[w]['score'] = dist max_vT[w]['max_v'] = v self.logger.debug("max_vT: {0}".format(max_vT)) # compute soft tf-idf sim sim = 0.0 self.logger.debug(s_vocab) for w in s_vocab: for v in t_vocab: if (jw_sims[w][v] >= self.THRESHOLD): inner_sum = (vprime_ws[w] / vprime_ws_norm) * ( vprime_wt[max_vT[w]['max_v']] / vprime_wt_norm) * max_vT[w]['score'] self.logger.debug( u"(w,vprime_ws[w],vprime_ws_norm): ({0},{1},{2})". format(w, vprime_ws[w], vprime_ws_norm)) self.logger.debug( u"(max_vT[w]['max_v'],vprime_wt[max_vT['max_v'],vprime_wt_norm): ({0},{1},{2})" .format(max_vT[w]['max_v'], vprime_wt[max_vT[w]['max_v']], vprime_wt_norm)) self.logger.debug(u"(max_vT[w]['score']): ({0})".format( max_vT[w]['score'])) self.logger.debug(u"(w,v,inner_sum): ({0},{1},{2})".format( w, v, inner_sum)) sim += inner_sum break self.logger.debug("Soft TF-IDF Similarity: {0}".format(sim)) return sim
sum2 = sum([vec2[x]**2 for x in vec2.keys()]) denominator = math.sqrt(sum1) * math.sqrt(sum2) if not denominator: return 0.0 else: return float(numerator) / denominator def text_to_vector(text): words = WORD.findall(text) return Counter(words) text1 = u'I am happy.' text2 = u'I am very happy.' vector1 = text_to_vector(text1) vector2 = text_to_vector(text2) cosine = get_cosine(vector1, vector2) print "sentences are \n", text1, "\n", text2 print 'Cosine distance :', cosine ########################################################## #--> JAro distance between sentences and other distance as well lvd = j.damerau_levenshtein_distance((text1), (text2)) jd = j.jaro_winkler((text1), (text2)) print "levenshtein_distance :", lvd print "Jaro distance :", jd
def closest_word(word):
    # check closest GLOVE word
    return max(glove, key=lambda x: jellyfish.jaro_winkler(x, word))
X_test = np.zeros((len(X_test_1a), maxsents, maxlen), dtype = 'int32') print('Loading Death certificates...') death_cert = [X_test_1a, X_test_1b, X_test_1c, X_test_1d, X_test_2] for m in range(len(death_cert)): part = death_cert[m] for i, sentences in enumerate(part): sentences = tokenize.sent_tokenize( sentences ) k = 0 for j, sent in enumerate(sentences): wordTokens = text_to_word_sequence(sent) for _ , word in enumerate(wordTokens): if word_index.get(word) == None: aux = [(jellyfish.jaro_winkler(k,word),v) for k,v in word_index.items()] if k < maxlen and max(aux)[1] < max_features: X_test[i,m,k] = max(aux)[1] k = k + 1 else: if k < maxlen and word_index.get(word) < max_features: X_test[i,m,k] = word_index.get(word) k = k + 1 print('Loading bic...') bic_components = [X_test_bic, X_test_bic_admiss, X_test_bic_sit] for m in range(len(bic_components)): bic_part = bic_components[m] for i, sentences in enumerate(bic_part): sentences = tokenize.sent_tokenize( sentences )
def getFeatures(self, a, b): # feature vector f = {} aa, ab = self.authors[a], self.authors[b] name_para = (('mid', 'name_middle'), ('first', 'name_first'), ('last', 'name_last')) for id_f, id_o in name_para: la, lb = len(aa[id_o]), len(ab[id_o]) if la == 0 or lb == 0: #at least one lacks the name part f[id_f] = 3 if (la == lb) else 2 elif aa[id_o] == ab[id_o]: #full name match f[id_f] = 5 if (la > 1) else 4 elif la > 1 and lb > 1: #full names supplied and no match f[id_f] = 0 elif aa[id_o][0] == ab[id_o][0]: #at least one is initial and initials match f[id_f] = 4 else: #initials don't match f[id_f] = 1 if aa['fullname_tfidf'] is not None and ab['fullname_tfidf'] is not None: f['fullname_sharedidf'] = shared_terms_sum(aa['fullname_tfidf'], ab['fullname_tfidf']) if aa['affil_tfidf'] is not None and ab['affil_tfidf'] is not None: f['has_affil'] = 2 elif aa['affil_tfidf'] is not None or ab['affil_tfidf'] is not None: f['has_affil'] = 1 else: f['has_affil'] = 0 if f['has_affil'] != 2: f['affil_sharedidf'] = np.nan else: f['affil_sharedidf'] = shared_terms_sum(aa['affil_tfidf'], ab['affil_tfidf']) if aa['name_last'] == ab['name_last'] and ( (aa['name_first'] == ab['name_middle'] and not aa['name_middle']) or (ab['name_first'] == aa['name_middle'] and not ab['name_middle']) ): if len(aa['name_first']) > 1: f['firstmidswap'] = 2 else: f['firstmidswap'] = 1 else: f['firstmidswap'] = 0 # 1 = off by two, 2 = off by one f['offbylastone'] = 0 la, lb = len(aa['fullname']), len(ab['fullname']) if aa['fullname'].startswith(ab['fullname']): f['subsetprefix'] = lb if la - lb <= 2: f['offbylastone'] = 3 - (la - lb) elif ab['fullname'].startswith(aa['fullname']): f['subsetprefix'] = la if lb - la <= 2: f['offbylastone'] = 3 - (lb - la) else: f['subsetprefix'] = 0 f['lastidf'] = 0 if (aa['name_last'] != ab['name_last'] or not aa['name_last']) else aa['lastname_idf'] f['iFfLidf'] = 0 if (aa['iFfL'] != ab['iFfL'] or not aa['iFfL']) else aa['iFfL_idf'] f['exact'] = int(aa['fullname_joined'] == ab['fullname_joined'] and len(aa['fullname_joined']) > 0) f['jaro_distance'] = 0 if (':' in aa['fullname'] or ':' in ab['fullname']) else jellyfish.jaro_distance(aa['fullname'], ab['fullname']) f['jaro_winkler'] = 0 if (':' in aa['fullname'] or ':' in ab['fullname']) else jellyfish.jaro_winkler(aa['fullname'], ab['fullname']) f['jarow_first'] = jellyfish.jaro_winkler(aa['name_first'], ab['name_first']) f['jarow_mid'] = jellyfish.jaro_winkler(aa['name_middle'], ab['name_middle']) f['jarow_last'] = jellyfish.jaro_winkler(aa['name_last'], ab['name_last']) f['jarow_firstmid'] = jellyfish.jaro_winkler(aa['name_first']+aa['name_middle'], ab['name_first']+ab['name_middle']) f['jarow_midlast'] = jellyfish.jaro_winkler(aa['name_middle']+aa['name_last'], ab['name_middle']+ab['name_last']) f['suffix'] = int(aa['name_suffix'] == ab['name_suffix'] and len(aa['name_suffix']) > 0) f['metaphone'] = int(aa['metaphone_fullname'] == ab['metaphone_fullname'] and len(aa['metaphone_fullname']) > 0) f.update(self.PFG.getEdgeFeatures(a, b)) return f
def get_target_pert_indices(gse_gsm_info): """ Best match the perturbation samples with control samples Args: gse_gsm_info: the GSE and GSM info tuple Returns: the GSE and GSM info tuple """ key, val = gse_gsm_info gse_id, pert_agent, gsm_ids, ctrl_text, ctrl_indices = key pert_texts, pert_indices = val target_index = max_score = max_days_diff = None # Search for time-based samples ctrl_days_text = re.search("\d+\s*(d(ays?)?|h((ours?)|(r|rs)?))", ctrl_text, flags=re.IGNORECASE) pert_days_texts = [ re.search("\d+\s*(d(ays?)?|h((ours?)|(r|rs)?))", x, flags=re.IGNORECASE) for x in pert_texts ] if ctrl_days_text is None: ctrl_days_text = re.search("\d+", "0") # If both control and perturbation samples contain time-based texts, # Match the perturbation sample with the maximum time difference to the control sample if ctrl_days_text is not None and any(x is not None for x in pert_days_texts): ctrl_days_num = int(re.search("\d+", ctrl_days_text.group()).group()) for i, pert_days_text in enumerate(pert_days_texts): if pert_days_text is not None: pert_days_num = int( re.search("\d+", pert_days_text.group()).group()) days_diff = pert_days_num - ctrl_days_num if days_diff >= 0 and (max_days_diff is None or days_diff > max_days_diff): max_days_diff = days_diff target_index = i # Match the perturbation sample with the highest text similarity with the control sample else: for i, pert_text in enumerate(pert_texts): score = jellyfish.jaro_winkler(ctrl_text, pert_text) if max_score is None or score > max_score: max_score = score target_index = i if target_index is None: return None gsm_ids = np.array(gsm_ids) ctrl_indices = list(ctrl_indices) target_pert_indices = pert_indices[target_index] # Create string for microarray analysis microarray_grouping = np.chararray(len(gsm_ids), unicode=True) microarray_grouping[:] = "X" microarray_grouping[ctrl_indices] = "0" microarray_grouping[target_pert_indices] = "1" microarray_grouping = "".join(microarray_grouping) return gse_id, (pert_agent, "|".join(gsm_ids[ctrl_indices]), "|".join(gsm_ids[target_pert_indices]), microarray_grouping)
def partition_tuples(zagat, fodors, match_tuples,\ unmatch_tuples, possible_tuples): ''' Iterates through all possible combinations of entries from zagat and fodors dataframes and computes tuples. Sends each possible combination to its respective dataframe Inputs: zagat(Pandas Dataframe): zagat dataframe fodors(Pandas Dataframe): fodors dataframe match_tuples(list): list of tuples to be classified as matches unmatch_tuples(list): list of tuples to be classified as unmatches possible_tuples(list): list of tuples to be classified as possible matches Outputs: matches_df: dataframe of matches possible_df: dataframe of possible matches unmatches_df: dataframe of non matches ''' column_index = (['z_restaurant', 'z_city', 'z_address',\ 'f_restaurant', 'f_city', 'f_address']) matches_rows = [] unmatches_rows = [] possible_rows = [] for i in range(len(zagat) - 1): for j in range(len(fodors) - 1): z_restaurant = zagat['restaurant'][i] f_restaurant = fodors['restaurant'][j] z_city = zagat['city'][i] f_city = fodors['city'][j] z_address = zagat['address'][i] f_address = fodors['address'][j] r_score = jellyfish.jaro_winkler(z_restaurant, f_restaurant) c_score = jellyfish.jaro_winkler(z_city, f_city) a_score = jellyfish.jaro_winkler(z_address, f_address) tup = (util.get_jw_category(r_score), util.get_jw_category\ (c_score), util.get_jw_category(a_score)) if tup in match_tuples: matches_rows.append([z_restaurant, z_city, z_address,\ f_restaurant, f_city, f_address]) elif tup in unmatch_tuples: unmatches_rows.append([z_restaurant, z_city, z_address,\ f_restaurant, f_city, f_address]) elif tup in possible_tuples: possible_rows.append([z_restaurant, z_city, z_address,\ f_restaurant, f_city, f_address]) matches_df = pd.DataFrame(data=matches_rows, columns=column_index) unmatches_df = pd.DataFrame(data=unmatches_rows, columns=column_index) possible_df = pd.DataFrame(data=possible_rows, columns=column_index) return matches_df, possible_df, unmatches_df
def closest_match_neighbors(self, search_name):
    line_penalty = lambda x: 100 * (x.parent_station[0] in search_name.split("_")[-1])
    d = lambda x: fish.jaro_winkler(unicode(x.stop_name.lower()),
                                    unicode(search_name.lower())) + line_penalty(x)
    distances = self.data.apply(d, axis=1)
    i = np.argmax(distances)
    return (self.data.stop_name[i], self.data.stop_lat[i],
            self.data.stop_lon[i], self.data.parent_station[i])
def parse(self, bedes_version, schema_version): # parse correct bedes version bedes = BedesParser(bedes_version) bedes.save() # check for manual mappings CSV file the_path = os.path.join(os.path.dirname(__file__), '../../lib/bedes', bedes_version) if not os.path.isfile("%s/manual_mapping_table.csv" % (the_path)): raise FileNotFoundError( "Cannot find the manual_mapping_table.csv file in lib/bedes/{} directory" .format(bedes_version)) # read data from manual mappings CSV file and store in local dict variable csv_file = open("%s/manual_mapping_table.csv" % (the_path), mode='r') manual_mappings_file = csv.DictReader(csv_file) manual_mappings = {} for term in manual_mappings_file: manual_mappings[term['BSync String']] = term['BEDES String'] # read the fields from the database, right now default to schema 0.3 schema = Schema.objects.filter(version=schema_version).first() results = {} for attribute in schema.attributes.all().order_by('id'): # use id as the key since name is not unique results[attribute.id] = [] # run function to find and replace words that are in the manual mappings table bsync_term = self.manual_mapping(attribute.name, manual_mappings) for bt in bedes.terms: distance = jellyfish.jaro_winkler(bsync_term.lower(), bt['Term'].lower()) if distance >= 0.98: results[attribute.id].append({ "attribute_name": attribute.name, "transformed_name": bsync_term, "attribute_path": attribute.path, "bedes_term": bt['Term'], "bedes_object": bt, "distance": distance, "term_or_lo": 'Term' }) # if no matches found in BEDES terms, check list options if not results[attribute.id]: for be in bedes.enumerations: # .lower() function used to neutralize upper/lower case discrepancies (there are many in enumerations/list options) distance = jellyfish.jaro_winkler( bsync_term.lower(), be['List-Option'].lower()) if distance >= 0.98: results[attribute.id].append({ "attribute_name": attribute.name, "transformed_name": bsync_term, "attribute_path": attribute.path, "bedes_term": be['List-Option'], "bedes_object": be, "distance": distance, "term_or_lo": 'List-Option' }) # sort matched terms by distance value (highest matched in first index position) results[attribute.id] = sorted(results[attribute.id], key=lambda k: -k['distance']) if not results[attribute.id]: # didn't find any term-to-term or term-to-list-option matches, start word-level matching words_data = defaultdict(list) bsync_words = [] # split BSync term into a list of individual words bsync_words = self.acronym_check( re.findall('[^:^(^)^,^ ][^,^ ^:^(^)]*', bsync_term)) # go through entire list, converting to an array that assigns an availability flag to each individual word for i in range(len(bsync_words)): bsync_words[i] = [bsync_words[i], True] if len(bsync_words) > 2: restart_process = True while restart_process: restart_process = False # generate word groups, starting with largest groups first for number_of_words in range( len(bsync_words) - 1, 1, -1): word_groups = {} # search through bsync_words for all possible combinations of word groups of proper length # made up of concurrent words with availability flags set to True, save in dict word_groups for starting_index in range( len(bsync_words) - number_of_words + 1): word_construction = '' word_construction_success = True for i in range( starting_index, starting_index + number_of_words): if bsync_words[i][1]: word_construction += bsync_words[i][ 0] + ' ' else: word_construction_success = False break if word_construction_success: word_groups[word_construction.strip( ' ')] = starting_index # attempt to 
match each word group generated against BEDES terms and list options using jaro_winkler distance for word_group in word_groups.keys(): for bt in bedes.terms: distance = jellyfish.jaro_winkler( word_group.lower(), bt['Term'].lower()) if distance >= 0.98: for i in range( word_groups[word_group], word_groups[word_group] + number_of_words): bsync_words[i][1] = False words_data['matched_to_term'].append( word_group) words_data['matched_term_URL'].append( bt['URL']) words_data['term_or_lo'].append('Term') # start process to break out of loops, restart entire word grouping process (now with appropriate words unavailable for grouping) restart_process = True break if restart_process: break else: # if no matches found in BEDES terms, check list options for be in bedes.enumerations: distance = jellyfish.jaro_winkler( word_group.lower(), be['List-Option'].lower()) if distance >= 0.98: for i in range( word_groups[word_group], word_groups[word_group] + number_of_words): bsync_words[i][1] = False words_data[ 'matched_to_term'].append( word_group) words_data[ 'matched_term_URL'].append( be['URL']) words_data['term_or_lo'].append( 'List-Option') # start process to break out of loops, restart entire word grouping process (now with appropriate words unavailable for grouping) restart_process = True break if restart_process: break # check through remaining individual words that weren't matched as part of a word group # individual words will only match 1-to-1 and will also search through individual words # of BEDES terms and list options for partial matches for i in range(len(bsync_words)): if bsync_words[i][1]: bsync_word = bsync_words[i][0] # run word matching function against BEDES terms and list options (enumerations) term_match_status, term_match_URL = self.word_matching( bsync_word, bedes.terms) lo_match_status, lo_match_URL = self.word_matching( bsync_word, bedes.enumerations) if term_match_status == 'Matched-Term': words_data['matched_to_term'].append(bsync_word) words_data['matched_term_URL'].append( term_match_URL) words_data['term_or_lo'].append('Term') else: if lo_match_status == 'Matched-Term': words_data['matched_to_term'].append( bsync_word) words_data['matched_term_URL'].append( lo_match_URL) words_data['term_or_lo'].append('List-Option') else: if term_match_status == 'Matched-Word': words_data['matched_to_word'].append( bsync_word) words_data[ 'matched_word_example_URL'].append( term_match_URL) else: if lo_match_status == 'Matched-Word': words_data['matched_to_word'].append( bsync_word) words_data[ 'matched_word_example_URL'].append( lo_match_URL) else: words_data['unmatched_words'].append( bsync_word) results[attribute.id].append({ "attribute_name": attribute.name, "transformed_name": bsync_term, "attribute_path": attribute.path, "word_matching": True, "term_or_lo": words_data['term_or_lo'], "matched_to_term": words_data['matched_to_term'], "matched_term_URL": words_data['matched_term_URL'], "matched_to_word": words_data['matched_to_word'], "matched_word_example_URL": words_data['matched_word_example_URL'], "unmatched_words": words_data['unmatched_words'] }) # store the results to CSV the_path = os.path.join(os.path.dirname(__file__), '../../lib/bedes', bedes_version, "schema" + schema_version) print("THE PATH: {}".format(the_path)) if not os.path.exists(the_path): os.makedirs(the_path) unique_column_words = {} unique_column_words['matched_to_term'] = [] unique_column_words['matched_to_word'] = [] unique_column_words['unmatched_words'] = [] content_uuids = [] with 
open("%s/bedes-mappings-terms.csv" % (the_path), 'w', newline='') as file: writer = csv.writer(file, delimiter=',') # write row of column headers writer.writerow([ 'attribute_name', 'transformed_name', 'attribute_id', 'attribute_path', 'bedes_content_uuid', 'bedes_term', 'bedes_category', 'bedes_definition', 'bedes_url', 'distance', 'match_type', 'matched_to_term', 'term_or_lo', 'matched_term_URL', 'matched_to_word', 'matched_word_example_URL', 'unmatched_words' ]) for id, be in results.items(): if len(be) > 0 and 'bedes_object' in be[0]: # 'if' structures to grab relevant information from appropriate fields depending on whether a term or a list option was matched if be[0]['term_or_lo'] == 'Term': output_category = be[0]['bedes_object']['Category'] output_definition = be[0]['bedes_object'][ 'Term-Definition'] output_match_type = 'Term-to-Term Match' elif be[0]['term_or_lo'] == 'List-Option': output_category = '' output_definition = be[0]['bedes_object'][ 'List-Option-Definition'] output_match_type = 'Term-to-List-Option Match' out = [ be[0]['attribute_name'], be[0]['transformed_name'], id, be[0]['attribute_path'], be[0]['bedes_object']['Content-UUID'], be[0]['bedes_term'], output_category, output_definition, be[0]['bedes_object']['URL'], be[0]['distance'], output_match_type, '', '', '', '', '', '' ] content_uuids.append(be[0]['bedes_object']['Content-UUID']) else: # output word matching data if no direct term matches were found if 'word_matching' in be[0]: # determine the match type tag depending on the specific combination of word match types if len(be[0]['matched_to_term']) > 0: if len(be[0]['matched_to_word']) > 0: if len(be[0]['unmatched_words']) > 0: output_match_type = 'Words: Term, Word, Unmatched' else: output_match_type = 'Words: Term, Word' elif len(be[0]['unmatched_words']) > 0: output_match_type = 'Words: Term, Unmatched' else: output_match_type = 'Words: all Term' elif len(be[0]['matched_to_word']) > 0: if len(be[0]['unmatched_words']) > 0: output_match_type = 'Words: Word, Unmatched' else: output_match_type = 'Words: all Word' else: output_match_type = 'Words: all Unmatched' # store all unique words from each word-matching column for key in unique_column_words.keys(): if be[0][key] != []: for word in be[0][key]: if word not in unique_column_words[key]: unique_column_words[key].append(word) out = [ be[0]['attribute_name'], be[0]['transformed_name'], id, be[0]['attribute_path'], '', '', '', '', '', '', output_match_type, ', '.join(be[0]['matched_to_term']), ', '.join(be[0]['term_or_lo']), ', '.join(be[0]['matched_term_URL']), ', '.join(be[0]['matched_to_word']), ', '.join(be[0]['matched_word_example_URL']), ', '.join(be[0]['unmatched_words']) ] else: # this code should never run if working properly. 
Can search 'debug134' in output file to make sure it hasn't run out = [ be[0]['attribute_name'], '', id, be[0]['attribute_path'], '', '', 'debug134', '', '', '', '', '', '', '', '', '', '' ] writer.writerow(out) list_set = set(content_uuids) # convert the set to the list unique_cnt = len(list(list_set)) self.stdout.write( '*******There are {} unique BEDES terms to add*******'.format( unique_cnt) ) # not sure if this code is relevent or accurate anymore # output file with list of unique words from each word-matching column - useful data to find candidates for manual mapping with open("%s/bsync_unique_words.csv" % (the_path), 'w', newline='') as file: writer = csv.writer(file, delimiter=',') # write out row of column headers writer.writerow([ 'Unique Words Matched to Term', 'Unique Words Matched to Words', 'Unique Unmatched Words' ]) mt_length = len(unique_column_words['matched_to_term']) mw_length = len(unique_column_words['matched_to_word']) uw_length = len(unique_column_words['unmatched_words']) rows = max(mt_length, mw_length, uw_length) for i in range(rows): val1 = '' val2 = '' val3 = '' if i + 1 <= mt_length: val1 = unique_column_words['matched_to_term'][i] if i + 1 <= mw_length: val2 = unique_column_words['matched_to_word'][i] if i + 1 <= uw_length: val3 = unique_column_words['unmatched_words'][i] writer.writerow([val1, val2, val3]) results = {} for enumeration in Enumeration.objects.filter(schema=schema): results[enumeration.id] = [] # retrieve associated attribute ID for CSV attrs = AttributeEnumerationClass.objects.filter( enumeration_class=enumeration.enumeration_class) associated_attrs = [] for attr in attrs: associated_attrs.append(attr.attribute_id) print(associated_attrs) for be in bedes.enumerations: distance = jellyfish.jaro_winkler(enumeration.name, be['List-Option']) if distance >= 0.95: results[enumeration.id].append({ "enumeration_name": enumeration.name, "bedes_term": be['List-Option'], "bedes_object": be, "distance": distance, "associated_attribute_ids": ' '.join([str(item) for item in associated_attrs]) }) results[enumeration.id] = sorted(results[enumeration.id], key=lambda k: -k['distance']) if not results[enumeration.id]: # didn't find anything results[enumeration.id].append({ "enumeration_name": enumeration.name, "associated_attribute_ids": ' '.join([str(item) for item in associated_attrs]) }) # store the results to CSV content_uuids = [] with open("%s/bedes-mappings-enumerations.csv" % (the_path), 'w', encoding='utf-8', newline='') as file: writer = csv.writer(file, delimiter=',') # headers: enumeration name, enumeration id, # bedes Content-UUID, bedes term, bedes definition, bedes URL, bedes Related Term UUID, distance writer.writerow([ 'enum_name', 'enum_id', 'bedes_content_uuid', 'bedes_term', 'bedes_definition', 'bedes_url', 'bedes_related_term_uuid', 'distance', 'associated_attribute_ids' ]) for enum, be in results.items(): if len(be) > 0 and 'bedes_object' in be[0]: content_uuids.append(be[0]['bedes_object']['Content-UUID']) out = [ be[0]['enumeration_name'], enum, be[0]['bedes_object']['Content-UUID'], be[0]['bedes_term'], be[0]['bedes_object']['List-Option-Definition'], be[0]['bedes_object']['URL'], be[0]['bedes_object']['Related-Term-UUID'], be[0]['distance'], be[0]['associated_attribute_ids'] ] else: out = [ be[0]['enumeration_name'], enum, '', '', '', '', '', '', be[0]['associated_attribute_ids'] ] writer.writerow(out) list_set = set(content_uuids) # convert the set to the list unique_cnt = len(list(list_set)) self.stdout.write( '*******There are {} 
unique BEDES enum values to add*******'. format(unique_cnt)) self.stdout.write('Finished parsing bedes')
def _is_similar(self, line1, line2):
    similarity_score = jellyfish.jaro_winkler(line1, line2)
    log.debug('comparing: ({}, {}) similarity_score: {}'.format(
        line1, line2, similarity_score))
    return similarity_score >= self.threshold
import pandas as pd
import jellyfish
import nltk

data = pd.read_csv('TestdafFT01.txt', sep='\t', encoding='UTF-8')
df = pd.DataFrame()
df = df.append(data)

pattern = input("Please enter search pattern:\n")
distance = []
df['dist'] = -1
for index, row in df.iterrows():
    dist = jellyfish.jaro_winkler(row['Antwort'], pattern)
    if dist > 0:
        distance.append(dist)
    else:
        distance.append(0)
df['dist'] = distance

final_df = df.sort_values('dist', ascending=False)
# final_df = final_df[(final_df['lexicalfuzz'] > (lexicalVariance))]
final_df.to_csv('jwdist.tsv', index=False, sep='\t')
def headquarters(): positive = 0 negative = 0 not_found = 0 f_not_found = open("not_found.txt", "w") f_negative = open("negative.txt", "w") f_positive = open("positive.txt", "w") tuples_not_found = set() for t in results: # first, try a direct match org_extracted = t[0].decode("utf8").upper().strip() locations_groundtruth = ground_truth.get(org_extracted) # if its a direct match with a ground truth organization, compare the locations if locations_groundtruth: loc_extracted = t[1].decode("utf8").upper().strip() found = False for locations in locations_groundtruth: # some locations in DBpedia contain diferente references, e.g., city,state # e.g.,: AUBURN HILLS, MICHIGAN # split and compare with both # in case it was found and got outside the for-loop below # no need to check more references if found == True: break locations_parts = locations.split(",") for loc in locations_parts: # match locations with Jaro-Winkler, keep those >=0.8 similarity score score = jellyfish.jaro_winkler( loc_extracted.encode("utf8"), loc.strip().encode("utf8")) if score >= 0.8: f_positive.write(t[0] + '\t' + t[1] + '\n') positive += 1 found = True break # if ground-truth (from DBpedia) is a country, and extracted is a city # check if the city is in that country elif loc in countries: if loc_extracted.encode("utf8") in country_cities[loc]: f_positive.write(t[0] + '\t' + t[1] + '\t' + '\n') positive += 1 found = True break #TODO # if ground-truth (from DBpedia) is a city, and extracted location is a country # check if that city is located in that country only # elif if found == False: negative += 1 f_negative.write( t[0] + '\t' + t[1] + '\t\t:' + ';'.join(locations_groundtruth).encode("utf8") + '\n') else: tuples_not_found.add(t) # try to expand the acronyms names_found = set() for name in tuples_not_found: # if it is a single token with all uppercase letters if len(name[0].split()) == 1 and name[0].isupper(): found = False # get all the possible expansions that match this acronym expansions = acronyms.get(name[0]) if expansions: # check if any of these expansions is an organization in the # ground_truth database and if it is, extract the locations for e in expansions: locations_groundtruth = ground_truth.get(e.upper()) if locations_groundtruth: for location in locations_groundtruth: locations_parts = location.split(",") for loc in locations_parts: # approximate similarity score = jellyfish.jaro_winkler( loc.encode("utf8"), name[1].upper()) if score >= 0.8: #f_positive.write(name[0]+' ('+e+')\t'+name[1]+'\t'+str(avg_score)+'\n') f_positive.write(name[0] + ' (' + e + ')\t' + name[1] + '\n') positive += 1 found = True names_found.add(name) break if (found == True): break for n in names_found: tuples_not_found.remove(n) # for tuples not found query Freebase # cache of strings that were already queried to Freebase queried = [] for line in fileinput.input( '/home/dsbatista/gigaword/ground-truth/freebase-queried.txt'): queried.append(line.strip()) fileinput.close() # file to save Freebase query results output = codecs.open( '/home/dsbatista/gigaword/ground-truth/freebase-output.txt', 'a', "utf-8") # open file for append, update 'freebase-queried.txt' with new issue queries f_queried = open( '/home/dsbatista/gigaword/ground-truth/freebase-queried.txt', "a") tuples_found = [] for t in tuples_not_found: org = t[0].strip() # for now do not query acronyms to Freebase with ~=, too many false positives if not (len(t[0].split()) == 1 and name[0].isupper()): # first check if that query string was already issued to Freebase # if 
not, query Freebase and save the result if org not in queried: if org == "Star-Times": continue response = queryFreebase(org) queried.append(org) if response != 'error': try: if response['result']: print "found:\t", org parseResponse(org, response, output) else: print "not found:\t", org f_queried.write(org + '\n') f_queried.flush() except TypeError, e: print org print e print response f_queried.close() output.close() sys.exit(0) except Exception, e: print org print e print response f_queried.close() output.close() sys.exit(0)
def jaro_dist(scan_res, desired):
    scan_line = get_file_as_string(scan_res)
    desired_line = get_file_as_string(desired)
    return jellyfish.jaro_winkler(scan_line, desired_line, long_tolerance=True)
def similarity(ori, inp):
    return jellyfish.jaro_winkler(inp, ori)
def get_similarity(value_to_check: str, against: str) -> float:
    result = jellyfish.jaro_winkler(value_to_check, against)
    if value_to_check.startswith(against):
        result += 1.0
    return result
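# Note (added, hedged): the +1.0 prefix bonus above deliberately lifts
# exact-prefix candidates out of the usual [0, 1] Jaro-Winkler range, so
# these scores should only be compared with each other. The candidate list
# below is made up to show the effect, assuming get_similarity is in scope.
import jellyfish

candidates = ["status", "estate", "stats"]
query = "stat"
ranked = sorted(candidates, key=lambda c: get_similarity(c, query), reverse=True)
print(ranked)  # prefix matches ("status", "stats") rank above "estate"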
def word_similarity(word_to_compare='Vignir', list_of_words=["Heigigr","Beðurni"], return_top_n=20, use_cut_off=False, cut_off = 0.5, sim_measure='Levenshtein' ,#SequenceMatcher #Jaro-Winkler #Hamming, min_characters=2, #Null for no restriction, filter_non_capital_letters = True ): """Compare similarity between a word and a list of words Returns list of similar words/names based on a similarity measure Args: word_to_compare (str) -word to compare with each value in list list_of_words (lst) - list of strings to compare against return_top_n (int) - return only top n 10 results based on similarity measure use_cut_off (bool) - whether to use a cut off value based on similarity cut_off (int) - cut off value Returns: Returns two ints; average epoc_loss and epoch_accuracy """ word_similarity_list=[] for word in list_of_words: dict_Words ={} dict_Words['word_to_compare']=word_to_compare dict_Words['word_to_compare_against']=word if sim_measure=='Levenshtein': ##dict_Words['similarity']=Levenshtein.ratio(word_to_compare, word) dict_Words['similarity']=jellyfish.levenshtein_distance(word_to_compare, word)*-1 dict_Words['similarity_measure']='Levenshtein' elif sim_measure=='SequenceMatcher': dict_Words['similarity']=SequenceMatcher(None,word_to_compare, word).ratio() dict_Words['similarity_measure']='SequenceMatcher' #https://docs.python.org/2.4/lib/sequencematcher-examples.html elif sim_measure=='Jaro-Winkler': dict_Words['similarity']=jellyfish.jaro_winkler(word_to_compare, word) dict_Words['similarity_measure']='Jaro-Winkler' elif sim_measure=='Hamming': dict_Words['similarity']=jellyfish.hamming_distance(word_to_compare, word)*-1 dict_Words['similarity_measure']='Hamming' word_similarity_list.append(dict_Words) #Convert to frame df_word_similarity = pd.DataFrame(word_similarity_list) #Sort df_word_similarity=df_word_similarity.sort_values(by='similarity', ascending=False) #Return top results if return_top_n>0: if len(df_word_similarity)>return_top_n: df_word_similarity=df_word_similarity[0:return_top_n] else: return df_word_similarity[0:0] #Whether to use cutoff if use_cut_off: df_word_similarity=df_word_similarity[df_word_similarity.similarity>cut_off] #Filter min characters if min_characters>0: df_word_similarity=df_word_similarity[df_word_similarity.word_to_compare_against.str.len()>min_characters] #Filter out words that does not start with a large character if filter_non_capital_letters: df_word_similarity=df_word_similarity[df_word_similarity.word_to_compare_against.str.istitle()] return df_word_similarity
def get_version_name_similarity(self, candidate):
    import jellyfish
    return jellyfish.jaro_winkler(self.version_name, candidate.version_name)
def jaro_winkler_distance(self, row):
    gn_name = self.df_source.loc[row['geonamesid'], 'name']
    sn_name = self.df_target.loc[row['swissnamesid'], 'NAME']
    dist = jellyfish.jaro_winkler(gn_name, sn_name)
    return dist
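# Sketch (assumption): tiny stand-ins for the GeoNames / SwissNames frames, showing how the
# method above would be applied row-wise over a table of candidate id pairs. Despite its
# name it returns Jaro-Winkler *similarity* (1.0 = identical), not a distance.
import pandas as pd
import jellyfish

class NameMatcher:
    def __init__(self, df_source, df_target):
        self.df_source = df_source  # indexed by geonamesid, with a 'name' column
        self.df_target = df_target  # indexed by swissnamesid, with a 'NAME' column

    def jaro_winkler_distance(self, row):
        gn_name = self.df_source.loc[row['geonamesid'], 'name']
        sn_name = self.df_target.loc[row['swissnamesid'], 'NAME']
        return jellyfish.jaro_winkler(gn_name, sn_name)

df_source = pd.DataFrame({'name': ['Zuerich', 'Genf']}, index=[1, 2])
df_target = pd.DataFrame({'NAME': ['Zürich', 'Genève']}, index=[10, 20])
pairs = pd.DataFrame({'geonamesid': [1, 2], 'swissnamesid': [10, 20]})

matcher = NameMatcher(df_source, df_target)
pairs['jw'] = pairs.apply(matcher.jaro_winkler_distance, axis=1)
print(pairs)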
def word_similarity(s1, s2):
    return jellyfish.jaro_winkler(unicode(s1.lower()), unicode(s2.lower()))
def jaro(self, cand1, cand2):
    return jellyfish.jaro_winkler(cand1, cand2)
def theme_network_creation(G_themes, list_actor, dataframe, themes_of_interest, tf_idf):
    '''
    Creation of a graph between the actors and the themes.
    For each theme mentioned in the articles, we draw an edge between this theme and the
    closest actor in terms of offset. This will give us a bipartite graph, with the actors
    on one side and the themes on the other side. The goal is to see if some actors are
    strongly linked to very specific themes, as detected by GDELT.
    '''
    uncommon_theme = [
        'GOV_DIVISIONOFPOWER', 'HATE_SPEECH', 'INFO_HOAX',
        'POLITICAL_PRISONER', 'MEDIA_CENSORSHIP'
    ]
    for actor_list, theme_list, doc_id in zip(dataframe.V2ENHANCEDPERSONS.unique(),
                                              dataframe.V2ENHANCEDTHEMES.unique(),
                                              dataframe.GKGRECORDID):
        actor_list_temp, offset_list_temp = [], []
        #print("begin: ", actor_list, theme_list, doc_id)
        if not isinstance(actor_list, float):
            for actor in actor_list.split(';'):
                actor_list_temp.append(actor.split(',')[0])
                offset_list_temp.append(int(actor.split(',')[1]))
        # First, we need to get the themes and their respective offsets in two separate lists
        if not isinstance(theme_list, float) and not isinstance(actor_list, float):
            #print("Here: ", doc_id)
            number_theme = len(theme_list)
            max_offset_diff = maximum_offset_difference(actor_list, theme_list)
            for theme in theme_list.split(';'):
                if theme:
                    theme_temp = theme.split(',')[0]
                    offset_temp = int(theme.split(',')[1])
                    if theme_temp in themes_of_interest:
                        if not G_themes.has_node(theme_temp):
                            G_themes.add_node(theme_temp)
                        index_actor = np.argmin(
                            np.abs([offset - offset_temp for offset in offset_list_temp]))
                        actor_offset = actor_list_temp[index_actor]
                        # We need to find this actor among the nodes of the network
                        scores = [jellyfish.jaro_winkler(actor_offset, actor2)
                                  for actor2 in list_actor]
                        similarity_max = np.max(scores)
                        index_max = np.argmax(scores)
                        actor_max = list_actor[index_max]
                        '''
                        for (actor, offset_actor) in zip(actor_list_temp, offset_list_temp):
                            offset_diff = np.abs(offset_actor - offset_temp)
                            similarity_max = [jellyfish.jaro_winkler(actor, actor2) for actor2 in list_actor]
                            index_max = np.argmax(similarity_max)
                            actor_max = list_actor[index_max]
                            # The weight associated with this theme and article is extracted from the tf-idf dictionary
                            weight_theme = tf_idf[doc_id][theme_temp] * (1 - offset_diff / max_offset_diff)
                            # Now that we have the theme and the actor, we can draw an edge between the two
                            if G_themes.has_edge(actor_max, theme_temp):
                                G_themes[actor_max][theme_temp]['weight'] += weight_theme
                            else:
                                #print("New edge! ", actor_max, theme_temp)
                                G_themes.add_edge(actor_max, theme_temp, weight=weight_theme)
                        '''
                        #print("Theme: ", doc_id, theme_temp)
                        weight_theme = tf_idf[doc_id][theme_temp]
                        if G_themes.has_edge(actor_max, theme_temp):
                            G_themes[actor_max][theme_temp]['weight'] += weight_theme
                        else:
                            G_themes.add_edge(actor_max, theme_temp, weight=weight_theme)
    return G_themes
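# Minimal sketch (assumption, made-up names and weights): the core matching step of the
# function above — take the actor whose character offset is closest to a theme mention,
# map that surface form onto the canonical actor list with Jaro-Winkler, then add or
# update the weighted actor-theme edge.
import jellyfish
import networkx as nx

list_actor = ["VLADIMIR PUTIN", "ANGELA MERKEL"]            # canonical node names
actors = [("VLADIMIR PUTIN", 120), ("A MERKEL", 480)]       # (surface form, offset) pairs
theme, theme_offset, weight = "MEDIA_CENSORSHIP", 455, 0.7  # hypothetical tf-idf weight

G = nx.Graph()
nearest = min(actors, key=lambda a: abs(a[1] - theme_offset))[0]               # closest by offset
canonical = max(list_actor, key=lambda a: jellyfish.jaro_winkler(nearest, a))  # fuzzy match to node
if G.has_edge(canonical, theme):
    G[canonical][theme]['weight'] += weight
else:
    G.add_edge(canonical, theme, weight=weight)
print(canonical, "->", theme, G[canonical][theme]['weight'])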
def _get_jaro(word1, word2):
    """ Calculate the Jaro-Winkler similarity between two words """
    return jellyfish.jaro_winkler(unicode(word1), unicode(word2))
def tracker_message_handler(message):
    tracker_magnitude_regexes = [{
        "label": "Spend",
        "units": "$",
        "regex": re.compile(r"(\$[0-9]?[0-9]\.?[0-9]?[0-9]?)|[0-9]?[0-9]\.[0-9][0-9]",
                            flags=re.IGNORECASE)
    }, {
        "label": "Calories",
        "units": "Cal",
        "regex": re.compile(r"[0-9][0-9]+ ?cal|calo?r?i?e?s? ?[0-9][0-9]",
                            flags=re.IGNORECASE)
    }, {
        "label": "Distance",
        "units": "km",
        "regex": re.compile(r"[0-9][0-9]* ?(k.?m?|m.?i?)", flags=re.IGNORECASE)
    }]
    from_user = message.from_user
    chat_info = message.chat
    dash_message_id = str(datetime.datetime.now()) + str(from_user.id)
    dash_message_id = hashlib.md5(dash_message_id.encode('utf-8')).hexdigest()
    latest_tracker_message = {
        "message_id": dash_message_id,
        "chat_id": chat_info.id,
        "type": "tracker",
        "status": "Unassigned",
        "title": "Unassigned",
        "user_id": from_user.id,
        "user_name": "{} {}".format(from_user.first_name, from_user.last_name),
        "datetime_logged": str(datetime.datetime.now()),
        "message_date": message.date,
        "input_datetime": str(datetime.datetime.now()),
        "content": "Unassigned",
        "magnitude": 30,
        "units": "$",
        "attributes": {
            "example_attribute_1": "example_attribute_value"
        },
        "estimate": "NA"
    }
    for search_logic in tracker_magnitude_regexes:
        regexp = search_logic["regex"]
        label = search_logic["label"]
        regex_search_result = regexp.search(message.text)  # look for the magnitude
        if regex_search_result:
            # figure out what type of magnitude it is
            latest_tracker_message["status"] = label
            magnitude = regex_search_result.group(1)
            try:
                latest_tracker_message["magnitude"] = float(
                    re.sub("[a-z]|[A-Z]|\\$", "", magnitude))
                latest_tracker_message["estimate"] = "{} {}".format(
                    re.sub("[a-z]|[A-Z]|\\$", "", magnitude), search_logic["units"])
            except Exception as e:
                print(e)
                # set_trace()
            description = message.text.replace(magnitude, "").replace("/t", '').strip()
            message_distance_arr = [(x, jellyfish.jaro_winkler(description, x))
                                    for x in distinct_message_bins
                                    if jellyfish.jaro_winkler(description, x) > 0.91]
            # assign it accordingly
            # TODO: handle the exact-match case better
            # TODO NEXT: this is also writing empty entries when it doesn't know the bin
            if len(message_distance_arr) >= 1:
                most_likely_title, dist = max(message_distance_arr,
                                              key=operator.itemgetter(1))
                latest_tracker_message["title"] = most_likely_title
                latest_tracker_message["content"] = most_likely_title
                # update_firebase_with_message(latest_tracker_message)
                update_google_sheet_tracker(latest_tracker_message)
                # with open("{}/dash_app/data/tracker_data/{}.json".format(TELEGRAMPA_PROJECT_HOME, dash_message_id), "w") as f:
                #     json.dump(latest_tracker_message, f, indent=2)
            else:
                # if it is unrecognized, ask the user whether they want to create a new list
                latest_tracker_message["title"] = description
                latest_tracker_message["content"] = description
                msg = bot.reply_to(message, 'Would you like to create a new tracker list?')
                try:
                    bot.register_next_step_handler(
                        msg, lambda x: confirm_new_tracker(x, latest_tracker_message))
                    update_google_sheet_tracker(latest_tracker_message)
                except Exception as e:
                    print(e)
sys.stdout.flush()
all_res.append(snapshot_res)
if stratified_attribute == 'screen_name':
    source_name = source_screen_names[source_user]
    target_name = target_screen_names[testing_map[source_user]]
else:
    source_name = source_user_names[source_user]
    target_name = target_user_names[testing_map[source_user]]
if source_name is None:
    source_name = u''
if target_name is None:
    target_name = u''
name_dis_list.append(jellyfish.jaro_winkler(source_name, target_name))

print time.time() - start_time, 'seconds used.'
print 'all_res', len(all_res)
all_res = np.array(all_res)
name_dis_list = np.array(name_dis_list)
np.save(PATH + OUTPUT + 'all_res.npy', all_res)
np.save(PATH + OUTPUT + 'name_dis_list.npy', name_dis_list)
precision = np.mean(all_res, axis=0) if len(all_res) > 0 else 0.
print precision
f = open(PATH + OUTPUT + 'report.txt', 'w')
f.write(str(precision))
def test_jaro_winkler_deprecation(jf):
    # backwards compatibility function
    from jellyfish import jaro_winkler
    with pytest.deprecated_call():
        assert jaro_winkler("a", "a") == 1
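# Sketch (assumption): newer jellyfish releases expose jaro_winkler_similarity (and
# jaro_similarity), keeping jaro_winkler only as a deprecated alias, which is what the
# test above exercises. If that holds, new code can avoid the warning like this:
import warnings
import jellyfish

with warnings.catch_warnings():
    warnings.simplefilter("error", DeprecationWarning)  # turn any deprecated call into an error
    score = jellyfish.jaro_winkler_similarity(u"martha", u"marhta")
print(round(score, 3))  # ~0.961 for this classic example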
info = sentence.replace("\n", "").split(" ")
## print str(info) + " vs. " + str(cl)
if distance == "stringdist":
    caseline = StringDist.compare(info, cl.split(" "))
if distance == "levensthein":
    dist = jellyfish.levenshtein_distance(
        unicode(sentence.replace("\n", "")), unicode(cl))
    caseline = (decimal.Decimal(max(len(sentence.replace("\n", "")), len(cl))) - dist) \
        / decimal.Decimal(max(len(sentence.replace("\n", "")), len(cl)))
    ## print "levensthein distance:" + str(caseline)
    ## print "Jaro distance:" + str(caseline)
if distance == "jaro-winkler":
    try:
        caseline = jellyfish.jaro_winkler(
            unicode(sentence.replace("\n", "")), unicode(cl))
    except:
        caseline = 0
    ## print "jaro-winkler distance:" + str(caseline)
if distance == "w2vec":
    model = train_W2vecmodel()
    caseline = StringDist.compare_Word2vec(info, cl, model)
#print caseline
if caseline == 1:
    break
if caseline > best_case:
    best_case = caseline
    decided_class = cl
if best_case >= 0.3:
    nb_individuals += 1
    if dict_classes.has_key(decided_class):
def get_handle_similarity(self, candidate):
    import jellyfish
    return jellyfish.jaro_winkler(self.handle, candidate.handle)