def maximizeProbabilities(originalWord, words, paperText):
    """Pick the candidate from ``words`` that occurs most often in ``paperText``.

    Candidates are counted as literal, non-overlapping substrings.  When a
    candidate ties with the current best count, it replaces the current
    choice only if it is strictly closer (edit distance) to ``originalWord``.

    Args:
        originalWord: word the candidates are compared against on ties.
        words: iterable of candidate strings.
        paperText: text in which the occurrences are counted.

    Returns:
        The selected candidate; ``''`` if no candidate was ever selected
        (for example when ``words`` is empty).
    """
    bestOccurrence = 0
    result = ''
    for word in words:
        # Escape the candidate: it is a plain word, not a pattern.  Without
        # this, "c++" raises re.error and "a.c" also counts "abc".
        occurences = len(re.findall(re.escape(word), paperText))
        if occurences > bestOccurrence:
            result = word
            bestOccurrence = occurences
        elif occurences == bestOccurrence:
            # Tie on frequency: prefer the candidate nearer to the original.
            if editdistance.eval(originalWord, result) > editdistance.eval(originalWord, word):
                result = word
    return result
def parse_grades(text_path):
    """Build one row per non-blank page of the text file at ``text_path``.

    The file is split on ``PAGE_SEPARATOR``.  For every page the first line
    whose first token is within edit distance 1 of ``SPRING``, ``FALL`` or
    ``SUMMER`` provides the term, and its second token (through
    ``parse_int``) the year.  Every line within edit distance 2 of
    ``GRADE_TITLE`` re-reads the tallies from the lines 2 and 5 positions
    below it, so the last such line on a page wins.

    Returns:
        A list of rows ``[page_number, year, term] + tallies`` where
        ``page_number`` starts at 1.
    """
    with open(text_path) as handle:
        raw_text = handle.read()

    rows = []
    non_blank_pages = [chunk for chunk in raw_text.split(PAGE_SEPARATOR) if chunk.strip()]
    for page_number, page in enumerate(non_blank_pages, start=1):
        lines = [stripped
                 for stripped in (raw.strip() for raw in page.splitlines())
                 if stripped]

        # Term and year come from the first line that starts with a term name.
        year = term = UNDEFINED
        for line in lines:
            tokens = line.split()
            if len(tokens) < 2:
                continue
            first_token, second_token = tokens[0], tokens[1]
            for candidate in (SPRING, FALL, SUMMER):
                if editdistance.eval(first_token, candidate) <= 1:
                    term = candidate
                    year = parse_int(second_token)
                    break
            else:
                # No term matched on this line; look at the next one.
                continue
            break

        # Default used when the page has no grade title line.
        tallies = [UNDEFINED] * 22
        for position, line in enumerate(lines):
            if editdistance.eval(line, GRADE_TITLE) <= 2:
                high_grade_line = lines[position + 2]
                low_grade_line = lines[position + 5]
                tallies = parse_tally_line(high_grade_line) + parse_tally_line(low_grade_line)

        rows.append([page_number, year, term] + tallies)
    return rows
def findMatches(self, record):
    """Return ``record`` followed by every queried record that closely matches it.

    Candidates are fetched from ``self.querier.query`` using the record's
    ``"bkv"`` value.  A candidate matches when each of surname, forename,
    title, occupation and address is within edit distance 1 of the same
    field in ``record``.
    """
    compared_fields = ("surname", "forename", "title", "occupation", "address")
    candidates = self.querier.query(bkv=record["bkv"])

    def is_close(candidate):
        # Fields are checked in order and checking stops at the first miss.
        return all(
            int(editdistance.eval(candidate[field], record[field])) < 2
            for field in compared_fields
        )

    matches = [record]
    matches.extend(candidate for candidate in candidates if is_close(candidate))
    return matches
def annotate(self, tokens):
    """Run the model on ``tokens`` and return the decoded annotations.

    Args:
        tokens: the token sequence to annotate; passed to the preprocessor
            and the pretrainer and echoed back under ``'tokens'``.

    Returns:
        A dict with ``'tokens'`` and, depending on the ``include_*`` flags,
        ``'lemmas'``, ``'postcorrect_lemmas'`` (only when
        ``self.postcorrect`` is set), ``'pos'`` and ``'morph'``.
    """
    X_focus = self.preprocessor.transform(tokens=tokens)['X_focus']
    X_context = self.pretrainer.transform(tokens=tokens)

    # get predictions:
    new_in = {}
    if self.include_token:
        new_in['focus_in'] = X_focus
    if self.include_context:
        new_in['context_in'] = X_context
    preds = self.model.predict(new_in)
    # A single-output model returns a bare array; normalise to a list so the
    # *_out_idx lookups below work either way.
    if isinstance(preds, np.ndarray):
        preds = [preds]

    annotation_dict = {'tokens': tokens}
    if self.include_lemma:
        pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=preds[self.lemma_out_idx])
        annotation_dict['lemmas'] = pred_lemmas
        if self.postcorrect:
            # Build a separate list: correcting in place would also rewrite
            # annotation_dict['lemmas'], which must keep the raw predictions.
            # NOTE(review): the result is always a list; confirm no caller
            # relies on the type returned by inverse_transform_lemmas.
            annotation_dict['postcorrect_lemmas'] = [
                lemma if lemma in self.known_lemmas
                else min(self.known_lemmas,
                         key=lambda known, lemma=lemma: editdistance.eval(known, lemma))
                for lemma in pred_lemmas
            ]
    if self.include_pos:
        pred_pos = self.preprocessor.inverse_transform_pos(predictions=preds[self.pos_out_idx])
        annotation_dict['pos'] = pred_pos
    if self.include_morph:
        pred_morph = self.preprocessor.inverse_transform_morph(predictions=preds[self.morph_out_idx])
        annotation_dict['morph'] = pred_morph
    return annotation_dict
def calculateRatioNGram(self, itemChildrenTextFile, path, htmlFileBackgroundKnowledge, nodeBackgroundKnowledge):
    '''
    N-Gram distance measure - to be developed (n-gram overlap).

    Currently uses edit distance: for every child text, the distance to each
    background-knowledge text is normalised by the mean of the two lengths,
    the median of those values is divided by the number of child texts, and
    the mean over all children is returned.  With no child texts the result
    is the mean of ``[0]``.

    References:
        https://pythonhosted.org/ngram/index.html
        http://odur.let.rug.nl/~vannoord/TextCat/textcat.pdf
    '''
    if len(itemChildrenTextFile) == 0:
        return mean([0])

    # These two lookups are not used in the result, but they are kept
    # because they fail on missing 'extractCount' / path entries.
    _sumBkn = sum([entry['extractCount'] for entry in
                   (htmlFileBackgroundKnowledge[key] for key in htmlFileBackgroundKnowledge.keys())])
    _extractCount = htmlFileBackgroundKnowledge[path]['extractCount']

    child_count = len(itemChildrenTextFile)
    ratioList = []
    for child_text in itemChildrenTextFile:
        per_background = [
            editdistance.eval(child_text, background_text) * 2 / (len(child_text) + len(background_text))
            for background_text in nodeBackgroundKnowledge
        ]
        ratioList.append(median(per_background) / child_count)
    return mean(ratioList)
def dist(v1, v2):
    """Return the per-component differences between two records as a tuple.

    Indices read from each record: 0 identifier, 1 and 5 sequences of
    ``(key, count)``-style entries, 2 a number divided by 86400 after
    subtraction, 3 a value compared by edit distance, 4 a value compared
    for equality.

    Returns:
        ``(str(v1[0]), str(v2[0]), timedf, audf, fndf, int(comdf),
        int(codedf), tokennum(v1[5]), tokennum(v2[5]))``.
    """
    import editdistance

    def l1_distance(left, right):
        # Keys present on both sides contribute |a - b|; keys on one side
        # only contribute their own value.
        totals = {}
        for entry in left:
            totals[entry[0]] = entry[1]
        for entry in right:
            key, value = entry[0], entry[1]
            totals[key] = math.fabs(totals[key] - value) if key in totals else value
        return sum(totals.values())

    def token_total(entries):
        return sum(entry[1] for entry in entries)

    time_delta = (v1[2] - v2[2]) / 86400
    author_delta = 0 if v1[4] == v2[4] else 1
    name_delta = int(editdistance.eval(v1[3], v2[3]))
    comment_delta = l1_distance(v1[1], v2[1])
    code_delta = l1_distance(v1[5], v2[5])
    return (
        str(v1[0]),
        str(v2[0]),
        time_delta,
        author_delta,
        name_delta,
        int(comment_delta),
        int(code_delta),
        token_total(v1[5]),
        token_total(v2[5]),
    )
def match_states(self): for pref_d, vp_aspth_dict_d in self.trace_state.iteritems(): print pref_d # if pref_d not in self.pref_d_c: # self.pref_d_c[pref_d] = {} if pref_d not in self.pref_c_d: self.pref_c_d[pref_d] = {} best_match_val = sys.maxint best_match_lst = [] # raw_input('...') for vp_d, aspath_d in vp_aspth_dict_d.iteritems(): # if vp_d not in self.pref_d_c[pref_d]: # self.pref_d_c[pref_d][vp_d] = {} vp_aspth_dict_c = self.bgp_state[pref_d] for vp_c, aspath_c in vp_aspth_dict_c.iteritems(): dist = editdistance.eval(aspath_c.split(), aspath_d.split()) # self.pref_d_c[pref_d][vp_d][vp_c] = [dist, max(len(aspath_d.split()), len(aspath_c.split()))] if vp_c not in self.pref_c_d[pref_d]: self.pref_c_d[pref_d][vp_c] = {} # if vp_d not in self.pref_c_d[pref_d][vp_c]: # self.pref_c_d[pref_d][vp_c][vp_d] = {} self.pref_c_d[pref_d][vp_c][vp_d] = [dist, max(len(aspath_d.split()), len(aspath_c.split()))] if dist < best_match_val: best_match_val = dist best_match_lst = [vp_c] elif dist == best_match_val: best_match_lst.append(vp_c) # print vp_d, '%', vp_c, '=>', editdistance.eval(aspath_c.split(), aspath_d.split()) # print vp_d, '%', best_match_val, '=>', best_match_lst """ NetworkX """
def is_warning(line):
    """Return True when ``line`` is close to any entry of ``WARNINGS``.

    One trailing comma or period is removed first.  A line matches an entry
    when twice their edit distance is smaller than the entry's length.
    """
    trimmed = re.sub(r'[,.]$', '', line)
    return any(2 * editdistance.eval(base, trimmed) < len(base) for base in WARNINGS)
def dist(v1, v2):
    """Return a single distance between two records as a sum of components.

    Components: ``(v1[2] - v2[2]) / 86400``, 0/1 for equality of index 4,
    the edit distance between index 3, and the L1 distances between the
    ``(key, count)``-style entries at indices 1 and 5.
    """
    import editdistance

    def l1_distance(left, right):
        # Keys present on both sides contribute |a - b|; keys on one side
        # only contribute their own value.
        totals = {}
        for entry in left:
            totals[entry[0]] = entry[1]
        for entry in right:
            key, value = entry[0], entry[1]
            totals[key] = math.fabs(totals[key] - value) if key in totals else value
        return sum(totals.values())

    time_delta = (v1[2] - v2[2]) / 86400
    author_delta = 0 if v1[4] == v2[4] else 1
    name_delta = int(editdistance.eval(v1[3], v2[3]))
    comment_delta = l1_distance(v1[1], v2[1])
    code_delta = l1_distance(v1[5], v2[5])
    return time_delta + author_delta + name_delta + comment_delta + code_delta
def average_similar(vocab, embs, dist): print "Starting average of similar words" dim = len(embs.values()[0]) added = 0 for i,w1 in enumerate(vocab): similar = [] if i%10==0: print "\rVocab word", i, sys.stdout.flush() if w1 in embs: continue for w2 in embs: if w1[:3] == w2[:3] and editdistance.eval(w1, w2) <= dist: #same language similar.append(w2) if len(similar) > 0: added += 1 print "\r{}".format(added), sys.stdout.flush() v = np.zeros(dim) for w2 in similar: if len(v) != len(embs[w2]): print "Mismatched dimensions" ipy.embed() v += embs[w2] v /= len(similar) embs[w1] = v return embs
def analyzing_sender_profile(self):
    """Print summary statistics about ``self.sender_profile``.

    Pretty-prints a tally (built with ``self.add_dict``) of the edit
    distance between the first two orderings of every sender that has more
    than one, then prints details for senders with more than four orderings
    and for senders whose ``sender_to_newformat`` value exceeds 1.
    """
    # Histogram of ordering lengths per sender; only its printout is disabled.
    len_ordering = {}
    for sender in self.sender_profile.keys():
        histogram = len_ordering[sender] = {}
        for ordering in self.sender_profile[sender]:
            width = len(ordering.split(" "))
            histogram[width] = histogram.get(width, 0) + 1

    # key: diff val: number of times I've seen
    count_diff = {}
    for sender, ordering in self.sender_profile.items():
        if len(ordering) <= 1:
            continue
        one = list(ordering)[0].split(" ")
        two = list(ordering)[1].split(" ")
        count_diff = self.add_dict(count_diff, int(editdistance.eval(one, two)))
    pprint.pprint(count_diff)

    for sender, ordering in self.sender_profile.items():
        if len(ordering) > 4:
            print(len(ordering))
            print(self.sender_to_newformat[sender])
            print("===up===")

    for sender, smt in self.sender_to_newformat.items():
        if smt > 1:
            print(smt)
            print(len(self.sender_profile[sender]))
            print("===uppp===")
def base32_distances(base32_nmers, metric='levenshtein'): """ Get pairwise distances (different metrics) This takes a little while """ N = len(base32_nmers) total = N*(N-1.0)/2 print 'Calculating', N*(N-1)/2, 'pairwise distances.' d = np.empty(shape=(N, N), dtype=np.float) n = 0 for i in xrange(N): for j in xrange(i, N): n += 1 if n%500000 == 0: sys.stdout.write('\r'+'%.4f' % (float(n*100)/total)+'%') sys.stdout.flush() if metric == 'levenshtein': dij = editdistance.eval(base32_nmers[i], base32_nmers[j]) elif metric == 'bespoke': dij = bespoke_distance(base32_nmers[i], base32_nmers[j]) else: raise NotImplementedError d[i, j] = dij d[j, i] = dij print '' return d
def filter_hits(self, hits):
    """
    Prefer hits whose street name is an exact match.

    Solves a nasty edge case:
    'Nes 33 1012KC Amsterdam' -> 'Nes 33-H', NOT 'Aert van Nesstraat 33'.

    Returns only the hits whose lower-cased 'straatnaam' has edit distance 0
    to self.straatnaam; when there are none, the original hits are returned.
    """
    exact_hits = [
        hit for hit in hits
        if editdistance.eval(hit['straatnaam'].lower(), self.straatnaam) == 0
    ]
    # An exact street-name match trumps every other match.
    return exact_hits if exact_hits else hits
def getLabels(ontology_file, outputFile):
    """Collect alternative labels and definitions for ontology classes.

    Parses the N-Triples file ``ontology_file``, keeps the OWL classes whose
    identifier starts with ``oboe`` or ``http://purl.dataone.org/odo/``, and
    for each class looks at the mentions returned by ``extract_mentions``.
    The first mention whose label is within edit distance 3 of the class
    label contributes its ``skos:altLabel`` values and ``rdfs:comment`` to
    the output graph and one summary row per alternative label.

    Writes ``outputFile + ".nt"`` (the output graph) and
    ``outputFile + ".csv"`` (the summary sorted by ``combined``, descending).
    """
    # ontology_file = 'https://github.com/DataONEorg/sem-prov-ontologies/blob/master/observation/d1-ECSO.owl'
    ontology = ConjunctiveGraph()
    # Close the input file deterministically instead of leaking the handle.
    with open(ontology_file) as ontology_handle:
        ontology.parse(ontology_handle, format="nt")
    classes = [ontology.resource(c) for c in ontology[: RDF.type : OWL.Class]]
    classes = [
        c
        for c in classes
        if c.identifier.startswith(oboe) or c.identifier.startswith("http://purl.dataone.org/odo/")
    ]
    output = ConjunctiveGraph()
    summary = pd.DataFrame(
        columns=["uri", "label", "resource", "dbpl", "score", "combined", "editdist", "altLabel", "definition"]
    )
    for c in classes:
        label = c.label()
        mentions = extract_mentions(label)
        if len(mentions) == 0:
            continue
        g = get([x for x, score in mentions])
        for uri, score in mentions:
            dbpl = g.label(uri)
            if dbpl is None:
                continue
            editdist = editdistance.eval(label.value.replace("_", " "), dbpl)
            if editdist < 4:
                alt_labels = list(g.objects(uri, skos.altLabel))
                for alt_label in alt_labels:
                    output.add((uri, skos.altLabel, alt_label))
                defn = g.value(uri, RDFS.comment)
                if defn is not None:
                    output.add((uri, skos.definition, defn))
                # The loop variable used to be called `label`, which
                # overwrote the class label, so the "label" column repeated
                # the altLabel.  With a distinct name, "label" holds the
                # class label and "altLabel" the alternative.
                for alt_label in alt_labels:
                    summary = summary.append(
                        dict(
                            uri=c,
                            label=label,
                            resource=uri,
                            dbpl=dbpl,
                            score=score,
                            combined=score / (0.1 + editdist),
                            editdist=editdist,
                            definition=defn,
                            altLabel=alt_label,
                        ),
                        ignore_index=True,
                    )
                # Only the first sufficiently close mention is used.
                break
    with open(outputFile + ".nt", "w") as f:
        f.write(output.serialize(format="ntriples"))
    # NOTE(review): DataFrame.append / DataFrame.sort only exist in old
    # pandas releases; left unchanged to match the pinned environment.
    summary.sort("combined", ascending=False).to_csv(outputFile + ".csv", encoding="utf-8")
def closest_by_edit_distance(self, x):
    """Return ``(closest_item, distance)`` for ``x`` among this multiset's items.

    Shortcuts, in order: ``x`` itself is present (distance 0); a previous
    answer stored in ``cache1`` or ``cache2``.  Otherwise every item from
    ``most_common()`` is scanned.  A distance-1 hit ends the scan early and
    is stored in ``cache1``; a full scan's result is stored in ``cache2``.
    """
    if x in self:
        # Nothing can be closer than the value itself.
        return (x, 0)

    # Reuse a previously computed answer (cache2 is only consulted on a
    # cache1 miss).
    remembered = self.cache1.get(x) or self.cache2.get(x)
    if remembered:
        return remembered

    best_item = None
    best_distance = None
    for candidate, _count in self.most_common():
        current = editdistance.eval(x, candidate)
        if not best_distance or current < best_distance:
            best_item, best_distance = candidate, current
            if current == 1:
                # x is not in the multiset, so distance 0 is impossible and
                # distance 1 cannot be beaten.
                answer = (best_item, best_distance)
                self.cache1.put(x, answer)
                return answer

    answer = (best_item, best_distance)
    self.cache2.put(x, answer)
    return answer
def beng_word(word):
    """Return 1 if ``word`` looks like a word from the Bengali lists, else 0.

    A word qualifies when it is within edit distance 1 of any entry on the
    first line of ``./beng_words.txt`` (comma separated), or when it ends
    with any suffix from the first line of ``./beng_suffix.csv`` while
    keeping a stem of at least two characters.
    """
    word = word.lower()

    # `with` guarantees the files are closed even if a later step raises.
    with open("./beng_words.txt", 'r') as bengdict:
        dict_words = bengdict.readline().split(",")
    for dict_word in dict_words:
        if editdistance.eval(dict_word.strip().lower(), word) <= 1:
            return 1

    with open("./beng_suffix.csv", 'r') as beng_suff:
        suff_list = beng_suff.readline().split(",")
    for suff in suff_list:
        # strip() (not strip(" ")) so the last entry loses its newline.
        suff = suff.strip()
        if not suff:
            # An empty entry (e.g. from a trailing comma) is not a suffix.
            continue
        # endswith() tests the real suffix; find() stopped at the first
        # occurrence and missed words where the suffix text also appears
        # earlier.  The stem must still be at least two characters long.
        if len(word) - len(suff) >= 2 and word.endswith(suff):
            return 1
    return 0
def phones_for_closest_match(word):
    """Brute force: find the entry of ``pronouncing.pronunciations`` nearest to ``word``.

    Entries are ranked by edit distance, reduced by 1 for a shared first
    character and by 1 for a shared last character; ties are broken by the
    difference in length and finally by the entry itself.

    Returns:
        ``(suggestion, pronouncing.phones_for_word(suggestion))``.
    """
    def rank(possibility):
        # levenshtein
        score = editdistance.eval(possibility, word)
        # bonus for same first letter / last letter
        if possibility.startswith(word[0]):
            score -= 1
        if possibility.endswith(word[-1]):
            score -= 1
        # (score, length difference, entry) sorts exactly like the tie-break
        # rules described above.
        return (score, abs(len(possibility) - len(word)), possibility)

    best = min(rank(possibility) for possibility in pronouncing.pronunciations)
    suggestion = best[2]
    return suggestion, pronouncing.phones_for_word(suggestion)
def edit_dist(self, anotherVector):
    """Return the mean normalised edit distance over the shared features.

    Features present in both vectors (excluding ``self.na_metadata``) are
    compared through their ``featuresText`` values; each distance is divided
    by the longer value's length.  Features whose values are both empty are
    skipped.  Returns 0.0 when nothing could be compared.
    """
    shared = [
        name
        for name in self.featureSet.intersection(anotherVector.featureSet)
        if name not in self.na_metadata
    ]

    total = 0.0
    compared = 0
    for name in shared:
        mine = self.featuresText[name]
        theirs = anotherVector.featuresText[name]
        longest = max(len(mine), len(theirs))
        if longest == 0:
            # Both values empty: nothing to normalise by.
            continue
        total += float(editdistance.eval(mine, theirs)) / longest
        compared += 1

    return total / compared if compared else total
def _finalize_cache(self):
    """Replace each cache entry by its ordered passwords and their edit distances.

    Every key is removed from ``self.cache``.  Keys with more than one
    password are re-added (when the path helper returns a non-empty path)
    as ``{'password': path, 'edit_distance': [0] + consecutive distances}``.
    Consecutive pairs are also indexed by distance in
    ``cache_key_edit_distance_list``, and each re-added entry is indexed by
    its mean distance (rounded to two decimals) in
    ``cache_key_edit_distance_keep_user_struct``.
    """
    for key in list(self.cache.keys()):
        passwords = list(self.cache[key])
        del self.cache[key]
        if len(passwords) <= 1:
            continue

        path = list(find_shortest_hamiltonian_path_in_complete_graph(passwords, False))
        if len(path) == 0:
            continue  # shortest_hamiltonian_path did not return well.

        distances = []
        for previous, current in zip(path, path[1:]):
            step = editdistance.eval(previous, current)
            distances.append(step)
            self.cache_key_edit_distance_list.setdefault(step, []).append((previous, current))

        padded_distances = [0] + distances
        self.cache[key] = {'password': path, 'edit_distance': padded_distances}

        mean_key = float('{0:.2f}'.format(np.mean(distances)))
        self.cache_key_edit_distance_keep_user_struct.setdefault(mean_key, []).append(
            {'password': path, 'edit_distance': padded_distances, 'email': key}
        )
def fix_ambiguous(ambiguous_sbi):
    """
    For each ambiguous sbi code, find the most likely candidate.

    Row layout:
    0 vs.id, 1 vs.naam, 2 codes.hr_code, 3 codes.alt_code, 4 codes.title,
    5 codes.alt_title, 6 codes.sub_cat, 7 codes.alt_sub_cat,
    8 codes.mks_title

    When the alternative title (5) is closer to the mks title (8) than the
    regular title (4), the activity with the regular code is updated to the
    alternative code and saved.
    """
    kept = 0
    replaced = 0

    for row in ambiguous_sbi:
        normalcode = row[2]
        zerocode = row[3]
        desc1 = row[4]
        desc2 = row[5]
        original = row[8]

        distance_desc1 = editdistance.eval(desc1, original)
        distance_desc2 = editdistance.eval(desc2, original)

        if distance_desc1 <= distance_desc2:
            # The default code is fine; nothing to change.
            kept += 1
        else:
            # The alternative (leading-zero) match is better: fix and save.
            replaced += 1
            vestiging = hrmodels.Vestiging.objects.get(id=row[0])
            activiteit = vestiging.activiteiten.get(sbi_code=normalcode)
            activiteit.sbi_code = zerocode
            activiteit.save()

        log.debug(f'{normalcode}, {zerocode}, {desc1[:18]}, {desc2[:18]}, {original[:18]}, {distance_desc1}, {distance_desc2}')  # noqa

    log.debug("%s-%s = Original-Suggestion", kept, replaced)
def compute_score_withjson(json_input_file, outCSV, acceptTypes, allKeys):
    """Write pairwise metadata similarity scores for the docs in a JSON file.

    Args:
        json_input_file: path to a JSON file with the documents under
            ``response.docs``; each document needs an ``"id"``.
        outCSV: path of the CSV to write (header plus one row per pair:
            id 1, id 2, similarity).
        acceptTypes: not used by this function.
        allKeys: when true, every feature present in only one document adds
            a distance of 1 and counts towards the average.

    The similarity is ``1 - mean normalised edit distance`` over the
    features (``"resourceName"`` excluded).  Pairs raising ``KeyError`` are
    skipped.  The parsed documents are also dumped to ``test.json`` in the
    current working directory.
    """
    na_metadata = ["resourceName"]
    # The dump file used to be opened and never closed; the `with` block
    # releases it when the function finishes.
    # NOTE(review): "wb" with csv.writer is the Python 2 idiom; under
    # Python 3 this would need mode "w" with newline="".
    with open(os.getcwd()+"\\test.json","w") as f, open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate","y-coordinate","Similarity_score"])

        metadata_dict = {}
        with open(json_input_file) as inputfile:
            parsedData = json.load(inputfile)
            parsedData = parsedData.get("response").get("docs")
            f.write(str(parsedData))
            for doc in parsedData:
                metadata_dict[doc["id"]] = doc

        files_tuple = itertools.combinations(metadata_dict.keys(), 2)
        for file1, file2 in files_tuple:
            try:
                row_edit_distance = [file1, file2]
                file1_metadata = metadata_dict[file1]
                file2_metadata = metadata_dict[file2]

                intersect_features = set(file1_metadata.keys()) & set(file2_metadata.keys())
                intersect_features = [feature for feature in intersect_features if feature not in na_metadata]

                file_edit_distance = 0.0
                for feature in intersect_features:
                    file1_feature_value = stringify(file1_metadata[feature])
                    file2_feature_value = stringify(file2_metadata[feature])
                    longest = max(len(file1_feature_value), len(file2_feature_value))
                    if longest == 0:
                        # Both values empty: identical, and avoids 0/0.
                        feature_distance = 0.0
                    else:
                        feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value)) / longest
                    file_edit_distance += feature_distance

                if allKeys:
                    file1_only_features = set(file1_metadata.keys()) - set(intersect_features)
                    file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]
                    file2_only_features = set(file2_metadata.keys()) - set(intersect_features)
                    file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]
                    # increment by 1 for each disjunct feature in (A-B) & (B-A)
                    file_edit_distance += len(file1_only_features) + len(file2_only_features)
                    file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))
                else:
                    file_edit_distance /= float(len(intersect_features))  # average edit distance

                row_edit_distance.append(1-file_edit_distance)
                a.writerow(row_edit_distance)
            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
    return
def _edit_distance(self,
                   entity: str,
                   entity_text: List[Token],
                   token: Token,
                   token_index: int,
                   tokens: List[Token]) -> float:
    """Return ``1 - edit_distance / len(token.text)`` for the entity text.

    The entity text is the space-joined ``text`` of ``entity_text``; it is
    compared with ``token.text``.  ``entity``, ``token_index`` and
    ``tokens`` are not used here.
    """
    entity_string = ' '.join(piece.text for piece in entity_text)
    raw_distance = float(editdistance.eval(entity_string, token.text))
    return 1.0 - raw_distance / len(token.text)
def get_normalized_editscore_words(sent1, sent2):
    """Return a similarity score: 1 minus the length-normalised edit distance.

    The edit distance between ``sent1`` and ``sent2`` is divided by the
    length of the longer one.  Two empty inputs are identical and score 1.0
    (previously this divided by zero).
    """
    longest = max(len(sent1), len(sent2))
    if longest == 0:
        return 1.0
    dist = editdistance.eval(sent1, sent2)
    return 1 - dist / (1.0 * longest)
def string_sim(n1, n2):
    """Return 1 minus the edit distance divided by the longer length.

    Returns 0 when either argument is empty or otherwise falsy.
    """
    if not (n1 and n2):
        return 0
    longest = max(len(n1), len(n2))
    return 1 - (editdistance.eval(n1, n2) / longest)
def accept_pair(premise, hypothesis):
    """Return True when the pair passes the dictionary and distance checks.

    The premise must pass ``d.check`` and be alphanumeric, and the edit
    distance between the two strings must be at least
    ``min(len(premise), len(hypothesis), 4)``.
    """
    if not (d.check(premise) and str.isalnum(premise)):
        return False
    threshold = min(len(premise), len(hypothesis), 4)
    return editdistance.eval(premise, hypothesis) >= threshold
def similar_strings(s1, s2, threthold): new_s1 = s1 new_s2 = s2 if len(s1) < len(s2): new_s1 = s1 + ' '.ljust(len(s2) - len(s1)) new_s2 = s2 elif len(s1) > len(s2): new_s2 = s2 + ' '.ljust(len(s1) - len(s2)) new_s1 = s1 length = len(new_s1) hamming = distance.hamming(new_s1,new_s2,normalized=True) print "hamming %f, threthold: %f" %(hamming, threthold) #if hamming >= threthold: # return True print "calculating levenshtein ...length: %d vs %d " %(len(s1), len(s2)) integer_threthold = 0 if min(len(s1), len(s2)) > 15000: s1_arr = re.split('[;,]',s1) s2_arr = re.split('[;,]',s2) print "using fast levenshtein algorithm: s1-len:%d s2-len:%d" \ %(len(s1_arr), len(s2_arr)) levenshtein = editdistance.eval(s1_arr, s2_arr) integer_threthold = min(len(s1_arr), len(s2_arr)) * (1 - threthold) else: print "using standard levenshtein algorithm" levenshtein = editdistance.eval(s1, s2) integer_threthold = min(len(s1), len(s2)) * (1 - threthold) print "result levenshtein %d vs threthold %d " %(levenshtein,integer_threthold) if levenshtein <= integer_threthold: return True else: return False print "Done ",str(levenshtein) #jaccard = distance.jaccard(s1,s2) #print str(jaccard) ''' levenshtein = distance.levenshtein(s1,s2) print "levenshtein %d, threthold:%d" %(levenshtein, int(length * threthold)) if levenshtein < length - length * threthold: return True ''' return False
def most_similar(self, chat_line):
    """Return ``(archive_date, line, distance)`` for the archived line nearest to ``chat_line``.

    Lines are compared through their ``text`` by edit distance; the first
    line with the smallest distance wins.  With no archived lines the result
    is ``(None, None, inf)``.
    """
    best_date, best_line, best_distance = None, None, float("inf")
    for archive_date, lines in self._by_date.items():
        for line in lines:
            current = editdistance.eval(line.text, chat_line.text)
            if current < best_distance:
                best_date, best_line, best_distance = archive_date, line, current
    return (best_date, best_line, best_distance)
def edit_dist_less_two(words1, words2):
    """Count the words of ``words1`` that have a near-duplicate in ``words2``.

    Two words are near-duplicates when both are longer than five
    characters, start with the same character and are within edit distance
    2.  Each word of ``words1`` is counted at most once.
    """
    matches = 0  # formerly named `sum`, which shadowed the builtin
    for word1 in words1:
        if len(word1) <= 5:
            continue
        for word2 in words2:
            # The cheap length and first-letter tests run first, so the edit
            # distance is only computed for plausible pairs.
            if (len(word2) > 5
                    and word1[0] == word2[0]
                    and editdistance.eval(word1, word2) <= 2):
                matches += 1
                break
    return matches
def computeScores(inputDir, outCSV, acceptTypes, allKeys): na_metadata = ["resourceName"] with open(outCSV, "wb") as outF: a = csv.writer(outF, delimiter=',') a.writerow(["x-coordinate","y-coordinate","Similarity_score"]) filename_list = [] for root, dirnames, files in os.walk(inputDir): dirnames[:] = [d for d in dirnames if not d.startswith('.')] for filename in files: if not filename.startswith('.'): filename_list.append(os.path.join(root, filename)) filename_list = [filename for filename in filename_list if parser.from_file(filename)] if acceptTypes: filename_list = [filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes] else: print "Accepting all MIME Types....." files_tuple = itertools.combinations(filename_list, 2) for file1, file2 in files_tuple: row_edit_distance = [file1, file2] file1_parsedData = parser.from_file(file1) file2_parsedData = parser.from_file(file2) intersect_features = set(file1_parsedData["metadata"].keys()) & set(file2_parsedData["metadata"].keys()) intersect_features = [feature for feature in intersect_features if feature not in na_metadata ] file_edit_distance = 0.0 for feature in intersect_features: file1_feature_value = stringify(file1_parsedData["metadata"][feature]) file2_feature_value = stringify(file2_parsedData["metadata"][feature]) feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value))/(len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value)) file_edit_distance += feature_distance if allKeys: file1_only_features = set(file1_parsedData["metadata"].keys()) - set(intersect_features) file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata] file2_only_features = set(file2_parsedData["metadata"].keys()) - set(intersect_features) file2_only_features = [feature for feature in file2_only_features if feature not in 
na_metadata] file_edit_distance += len(file1_only_features) + len(file2_only_features) file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features)) else: file_edit_distance /= float(len(intersect_features)) #average edit distance row_edit_distance.append(1-file_edit_distance) a.writerow(row_edit_distance)
def calculate_edit(allKeys):
    """Write pairwise similarity scores for documents fetched from Solr.

    Up to 1000 documents are requested from the ``/select`` handler of
    ``http://localhost:8984/solr/polrsolr``.  For every pair, the similarity
    ``1 - mean normalised edit distance`` over the shared fields is written
    to the file ``csvOutput``.  When ``allKeys`` is true, each field present
    in only one document adds a distance of 1 and counts towards the
    average.  Pairs raising ``KeyError`` are skipped.
    """
    with open("csvOutput", "wb") as outF:
        writer = csv.writer(outF, delimiter=',')
        writer.writerow(["x-coordinate","y-coordinate","Similarity_score"])

        excluded = []
        connection = solr.Solr("http://localhost:8984/solr/polrsolr")
        select = solr.SearchHandler(connection, "/select")
        row_count = 1000
        response = select(q="*", rows=row_count)

        docs_by_id = {}
        doc_ids = []
        for document in response.results:
            docs_by_id[document['id']] = document
            doc_ids.append(document['id'])

        for first_id, second_id in itertools.combinations(doc_ids, 2):
            try:
                first_doc = docs_by_id[first_id]
                second_doc = docs_by_id[second_id]

                shared = [key for key in set(first_doc.keys()) & set(second_doc.keys())
                          if key not in excluded]

                distance_sum = 0.0
                for key in shared:
                    left = stringify(str(first_doc[key]))
                    right = stringify(str(second_doc[key]))
                    longest = max(len(left), len(right))
                    if longest == 0:
                        ratio = 0.0
                    else:
                        ratio = float(editdistance.eval(left, right)) / longest
                    distance_sum += ratio

                if allKeys:
                    only_first = [key for key in set(first_doc.keys()) - set(shared)
                                  if key not in excluded]
                    only_second = [key for key in set(second_doc.keys()) - set(shared)
                                   if key not in excluded]
                    # increment by 1 for each disjunct feature in (A-B) & (B-A)
                    distance_sum += len(only_first) + len(only_second)
                    distance_sum /= float(len(shared) + len(only_first) + len(only_second))
                else:
                    distance_sum /= float(len(shared))  # average edit distance

                writer.writerow([first_id, second_id, 1 - distance_sum])
            except KeyError:
                continue
def computer_cer(preds, labels):
    """Return ``(total edit distance, total label length)``.

    The distance is summed over the label/prediction pairs (paired up in
    order, stopping at the shorter input); the length is summed over all
    labels.
    """
    total_distance = 0
    for label, pred in zip(labels, preds):
        total_distance += editdistance.eval(label, pred)
    reference_length = sum(map(len, labels))
    return total_distance, reference_length
def infer():
    """Decode the test set (or dev set) with the latest checkpoint and report CER/WER.

    Uses ``args.dataset_test`` when set, otherwise ``args.dataset_dev``.
    Samples whose word error count is non-zero are written to
    ``<args.dir_model.name>_decode.txt``; running and overall error rates
    are printed to stdout and the final rates are logged.
    """
    tensor_global_step = tf.train.get_or_create_global_step()

    model_infer = args.Model(
        tensor_global_step,
        encoder=args.model.encoder.type,
        decoder=args.model.decoder.type,
        training=False,
        args=args)

    dataset_dev = args.dataset_test if args.dataset_test else args.dataset_dev

    saver = tf.train.Saver(max_to_keep=40)
    size_variables()

    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.gpu_options.allow_growth = True
    config.log_device_placement = False

    with tf.train.MonitoredTrainingSession(config=config) as sess:
        # Restore the newest checkpoint found in args.dirs.checkpoint_init.
        checkpoint = tf.train.latest_checkpoint(args.dirs.checkpoint_init)
        saver.restore(sess, checkpoint)

        # Corpus-level accumulators (errors and reference lengths).
        total_cer_dist = 0
        total_cer_len = 0
        total_wer_dist = 0
        total_wer_len = 0
        with open(args.dir_model.name + '_decode.txt', 'w') as fw:
            for i, sample in enumerate(dataset_dev):
                if not sample:
                    continue
                # One sample per run: add a leading batch axis of size 1.
                dict_feed = {
                    model_infer.list_pl[0]: np.expand_dims(sample['feature'], axis=0),
                    model_infer.list_pl[1]: np.array([len(sample['feature'])])
                }
                sample_id, shape_batch, _ = sess.run(model_infer.list_run, feed_dict=dict_feed)
                # decoded, sample_id, decoded_sparse = sess.run(model_infer.list_run, feed_dict=dict_feed)
                res_txt = array2text(sample_id[0], args.data.unit, args.idx2token, args.token2idx)
                ref_txt = array2text(sample['label'], args.data.unit, args.idx2token, args.token2idx)

                # Character-level and word-level views of hypothesis/reference.
                list_res_char = list(res_txt)
                list_ref_char = list(ref_txt)
                list_res_word = res_txt.split()
                list_ref_word = ref_txt.split()
                cer_dist = ed.eval(list_res_char, list_ref_char)
                cer_len = len(list_ref_char)
                wer_dist = ed.eval(list_res_word, list_ref_word)
                wer_len = len(list_ref_word)
                total_cer_dist += cer_dist
                total_cer_len += cer_len
                total_wer_dist += wer_dist
                total_wer_len += wer_len
                # Empty reference: substitute a denominator of 1000 for the
                # per-sample rates below (the totals above are unaffected).
                if cer_len == 0:
                    cer_len = 1000
                    wer_len = 1000
                # Only samples with at least one word error are written out.
                if wer_dist / wer_len > 0:
                    fw.write('id:\t{} \nres:\t{}\nref:\t{}\n\n'.format(
                        sample['id'], res_txt, ref_txt))
                sys.stdout.write(
                    '\rcurrent cer: {:.3f}, wer: {:.3f};\tall cer {:.3f}, wer: {:.3f} {}/{} {:.2f}%'
                    .format(cer_dist / cer_len, wer_dist / wer_len, total_cer_dist / total_cer_len,
                            total_wer_dist / total_wer_len, i, len(dataset_dev),
                            i / len(dataset_dev) * 100))
                sys.stdout.flush()
        logging.info('dev CER {:.3f}: WER: {:.3f}'.format(
            total_cer_dist / total_cer_len, total_wer_dist / total_wer_len))
data = json.loads(response.read()) responsep = urllib.urlopen(urlp) rsp = eval(responsep.read()) print "QTime=", rsp['responseHeader']['QTime'] print "number of matches=", rsp['response']['numFound'] #print out the name field for each returned document sumx = 0 mindist = 255 + len(userin) pick = '' #note: was referencing doc['title'][0] for doc in rsp['response']['docs']: dist = editdistance.eval(userin, doc['title']) if (dist < mindist): mindist = dist pick = doc['title'] print "mindist=", mindist for doc in rsp['response']['docs']: dist = editdistance.eval(userin, doc['title']) print 'title field =', doc['title'], " score=", doc[ 'score'], " dist=", dist if (dist == mindist): sumx += doc['score'] # do a roulette wheel selection based on the sum rndpoint = random.uniform(0, sumx) sumy = 0 for doc in rsp['response']['docs']:
def computeScores(inputDir, outCSV, acceptTypes, allKeys): na_metadata = ["resourceName"] with open(outCSV, "wb") as outF: a = csv.writer(outF, delimiter=',') a.writerow(["x-coordinate","y-coordinate","Similarity_score"]) filename_list = [] for root, dirnames, files in os.walk(inputDir): dirnames[:] = [d for d in dirnames if not d.startswith('.')] for filename in files: if not filename.startswith('.'): filename_list.append(os.path.join(root, filename)) if acceptTypes: filename_list = [filename for filename in filename_list if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1] in acceptTypes] else: print "Accepting all MIME Types....." files_tuple = itertools.combinations(filename_list, 2) num = 0 for file1, file2 in files_tuple: try: row_edit_distance = [file1, file2] print num fp = open(file1, "r") fp2 = open(file2, "r") file1_parsedData = {} file1_parsedData["metadata"] = json.load(fp) file2_parsedData = {} file2_parsedData["metadata"] = json.load(fp2) intersect_features = set(file1_parsedData["metadata"].keys()) & set(file2_parsedData["metadata"].keys()) intersect_features = [feature for feature in intersect_features if feature not in na_metadata ] file_edit_distance = 0.0 for feature in intersect_features: file1_feature_value = stringify(file1_parsedData["metadata"][feature]) file2_feature_value = stringify(file2_parsedData["metadata"][feature]) if len(file1_feature_value) == 0 and len(file2_feature_value) == 0: feature_distance = 0.0 else: feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value))/(len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value)) file_edit_distance += feature_distance if allKeys: file1_only_features = set(file1_parsedData["metadata"].keys()) - set(intersect_features) file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata] file2_only_features = 
set(file2_parsedData["metadata"].keys()) - set(intersect_features) file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata] file_edit_distance += len(file1_only_features) + len(file2_only_features) # increment by 1 for each disjunct feature in (A-B) & (B-A), file1_disjunct_feature_value/file1_disjunct_feature_value = 1 file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features)) else: file_edit_distance /= float(len(intersect_features)) #average edit distance row_edit_distance.append(1-file_edit_distance) a.writerow(row_edit_distance) num += 1 except ConnectionError: sleep(1) except KeyError: continue
def calculate_edit_distance(test_name, train_names_features):
    """Store the edit distance from ``test_name`` to every training name.

    Results go into the module-level ``edit_distance`` dict under
    ``edit_distance[test_name][name]``; any previous entry for
    ``test_name`` is replaced.
    """
    global edit_distance
    per_name = edit_distance[test_name] = {}
    for name in train_names_features:
        per_name[name] = editdistance.eval(test_name, name)
def forward(self, model, sample, reduction="sum", log_probs=True):
    """Run the model on *sample* and compute the loss or the eval error counts.

    In training mode the loss is the cross-entropy loss plus the quantity
    loss, the latter scaled by ``args.lambda_qua * ntokens / nsentences``.
    In evaluation mode no loss is computed; instead the edit distance
    between the argmax-decoded units (cut to the predicted length) and the
    target units (pad and eos removed) is accumulated for logging.

    Args:
        model: Callable model exposing a ``training`` flag.
        sample: Batch dict with ``net_input``, ``target`` and ``ntokens``.
        reduction: Passed through to ``self.compute_loss``.
        log_probs: Passed through to ``self.compute_loss``.

    Returns:
        tuple: ``(loss, sample_size, logging_output)``.  In evaluation mode
        ``loss`` and ``sample_size`` are ``0.0`` and ``logging_output``
        carries ``c_errors`` and ``c_total``.
    """
    net_output = model(**sample["net_input"])
    num_output = net_output["num_output"].int()

    if model.training:
        lprobs, qua_loss, ce_loss = self.compute_loss(
            model, net_output, sample, reduction, log_probs
        )
        nsentences = sample["target"].size(0) + 1.0
        ntokens = sample["ntokens"]
        weighted_qua = self.args.lambda_qua * qua_loss * ntokens / nsentences
        loss = weighted_qua + ce_loss
        sample_size, logging_output = self.get_logging_output(
            sample, lprobs, loss, qua_loss, ce_loss
        )
        return loss, sample_size, logging_output

    # Evaluation: only error statistics are produced.
    import editdistance

    loss = sample_size = 0.0
    logging_output = {
        "ntokens": sample["ntokens"],
        "nsentences": sample["target"].size(0),
        "sample_size": sample_size,
    }
    total_errors = 0
    total_units = 0
    with torch.no_grad():
        triples = zip(net_output['logits'], num_output, sample["target"])
        for logits, pred_len, target in triples:
            hypothesis = logits.argmax(dim=-1)[:pred_len].tolist()
            keep = (target != self.task.target_dictionary.pad()) & (
                target != self.task.target_dictionary.eos()
            )
            reference = target[keep].tolist()
            total_errors += editdistance.eval(hypothesis, reference)
            total_units += len(reference)

        logging_output["c_errors"] = total_errors
        logging_output["c_total"] = total_units

    return loss, sample_size, logging_output
def dist3(word):
    """Return every entry of ``Dictionary`` at edit distance exactly 3 from *word*."""
    return [entry for entry in Dictionary if editdistance.eval(word, entry) == 3]
def main():
    """Compile, compare and test every student submission in a folder.

    Expects one command-line argument: a folder holding one sub-directory
    per student.  The function

    1. compiles each sub-directory's ``.cpp`` files unless a ``test``
       executable already exists there,
    2. sums, for every pair of students, the edit distance between their
       same-named ``.cpp`` files and prints the two closest pairs,
    3. runs the test cases against each student's ``test`` executable.
    """
    if len(sys.argv) != 2:
        print('wrong number of arguments')
        sys.exit(1)
    folder_path = sys.argv[1]
    dirs = [
        entry for entry in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, entry))
    ]
    compiled_set = set()
    for student_dir in dirs:
        full_dir_path = os.path.join(folder_path, student_dir)
        print(f'checking: {student_dir}')
        cpp_list = [x for x in os.listdir(full_dir_path) if '.cpp' in x]
        if not os.path.isfile(os.path.join(full_dir_path, 'test')):
            stdout, stderr, code = compile_file(' '.join(cpp_list),
                                                full_dir_path)
            # Any compiler output at all is treated as a failed build.
            if code != 0 or len(stdout) != 0 or len(stderr) != 0:
                print('error on compiling!')
                print(stdout, stderr)
            else:
                print('compiled!')
                compiled_set.add(student_dir)
        else:
            compiled_set.add(student_dir)
            print('already compiled')

    print('-----check code similarity------')
    similarity = np.zeros((len(dirs), len(dirs)))
    for i in range(len(dirs) - 1):
        for j in range(i + 1, len(dirs)):
            i_full_path = os.path.join(folder_path, dirs[i])
            j_full_path = os.path.join(folder_path, dirs[j])
            cpp_list = [x for x in os.listdir(i_full_path) if '.cpp' in x]
            total = 0
            for cpp in cpp_list:
                i_path = os.path.join(i_full_path, cpp)
                j_path = os.path.join(j_full_path, cpp)
                try:
                    with open(i_path, 'r') as i_file, \
                            open(j_path, 'r') as j_file:
                        i_content = i_file.read()
                        j_content = j_file.read()
                    total = total + editdistance.eval(i_content, j_content)
                except (OSError, UnicodeDecodeError):
                    print(
                        'invalid file encoding found, not valid ascii charaters in: '
                    )
                    # An unreadable file makes the pair incomparable.
                    total = np.inf
                    # Re-read each side alone to report which one is broken.
                    for owner_path, file_path in ((i_full_path, i_path),
                                                  (j_full_path, j_path)):
                        try:
                            with open(file_path, 'r') as probe:
                                probe.read()
                        except (OSError, UnicodeDecodeError):
                            print(
                                f'invalid characters found in file {owner_path}')
            similarity[i][j] = total
            similarity[j][i] = total
    # A student must never be reported as closest to themselves.
    for i in range(len(dirs)):
        similarity[i][i] = np.inf
    print(similarity)
    print('-----done code similarity check------')

    print('-----similarity report-----')
    print('min is:')
    print(np.min(similarity))
    mix_arg = np.argmin(similarity)
    x, y = np.unravel_index(mix_arg, similarity.shape)
    print(f'index at: ({x}, {y})')
    print(f'student: {dirs[x]}, {dirs[y]}')
    print('second highest:')
    # Mask the best pair so the next argmin finds the runner-up.
    similarity[x][y] = np.inf
    similarity[y][x] = np.inf
    print(np.min(similarity))
    mix_arg = np.argmin(similarity)
    x, y = np.unravel_index(mix_arg, similarity.shape)
    print(f'index at: ({x}, {y})')
    print(f'student: {dirs[x]}, {dirs[y]}')
    print('-----end of similarity report-----')

    for username in dirs:
        exec_path = os.path.join(folder_path, username, 'test')
        try:
            if test(exec_test_case, exec_path):
                print(f'{username} successfully pass all the test case')
            else:
                print(f'{username} failed the test case')
        except Exception:
            print(
                f'{username} have invalid char in the program, please double check'
            )
def levenshtein_avg(weights, seq1, seq2):
    """Return a similarity: 1 minus edit distance over the mean length.

    Args:
        weights: Not used by this function; kept for the call signature.
        seq1: First sequence.
        seq2: Second sequence.

    Returns:
        ``1 - distance / (0.5 * (len(seq1) + len(seq2)))``.  Two empty
        sequences are identical and yield ``1.0`` rather than dividing by
        zero.
    """
    norm = .5 * (len(seq1) + len(seq2))
    if norm == 0:
        return 1.0
    return 1 - (editdistance.eval(seq1, seq2) / norm)
# Script fragment: for every test document, compare each vocabulary term of
# the vectorizer with each field value of every item's specimen, using the
# edit distance of the normalised strings (threshold: distance < 5, or exact
# equality).  A count of at least 10 term matches prints "specimen match" and
# inserts an {oid, uuid} document into db.epandda_match.
# NOTE(review): Python 2 syntax (print statement, dict.iteritems); `record`
# is not defined in the visible code, so confirm where it comes from.
# NOTE(review): the original nesting was lost when the line was collapsed.
for test in test_set: #bow = vectorizer.transform(test) smatrix = vectorizer.transform(test) tfidf = TfidfTransformer(norm="l2") tfidf.fit(smatrix) for item in items: term_match = 0 matched_on = [] specimen = utils.init_fields(item['data']) for term in vectorizer.vocabulary_: for key, value in specimen.iteritems(): clean_term = utils.normalize(term) clean_value = utils.normalize(value) dist = editdistance.eval(clean_term, clean_value) if term == value or dist < 5: term_match += 1 if key not in matched_on: matched_on.append(key) if term_match >= 10: print "specimen match" print ", ".join(matched_on) result = db.epandda_match.insert_one({ "oid": record['oid'], "uuid": item['uuid'] })
def calc_str_distance(command, candidate):
    """Return the Levenshtein (edit) distance between *command* and *candidate*."""
    distance = editdistance.eval(command, candidate)
    return distance
def forward(self, model, sample, reduction="sum", log_probs=True):
    """Run the model on *sample* and compute the loss or the eval error counts.

    In training mode the loss is the cross-entropy loss plus the quantity
    loss (scaled by ``args.lambda_qua * ntokens / nsentences``) plus the CTC
    loss (scaled by ``args.lambda_ctc``).  In evaluation mode no loss is
    computed; the unit edit distance and the length error between the
    argmax-decoded prediction (cut to the rounded predicted length) and the
    unpadded target are accumulated for logging.

    Args:
        model: Callable model exposing a ``training`` flag.
        sample: Batch dict with ``net_input``, ``target``, ``ntokens`` and,
            for training, ``target_lengths``.
        reduction: Passed through to ``self.compute_loss``.
        log_probs: Passed through to ``self.compute_loss``.

    Returns:
        tuple: ``(loss, sample_size, logging_output)``.  In evaluation mode
        ``loss`` and ``sample_size`` are ``0.0`` and ``logging_output``
        carries ``c_errors``, ``c_total`` and ``e_len``.
    """
    nsentences = sample["target"].size(0)
    ntokens = sample["ntokens"]

    if model.training:
        net_output = model(**sample["net_input"])
        num_output = torch.round(net_output["num_output"]).int()
        gold_rate = net_output["gold_rate"] if "gold_rate" in net_output else 0.0
        lprobs, ctc_loss, qua_loss, ce_loss = self.compute_loss(
            model, net_output, sample, reduction, log_probs
        )
        # Summed absolute difference between true and predicted lengths.
        e_len = int(sum(abs(sample["target_lengths"].data - num_output.data)))
        loss = ce_loss + \
            self.args.lambda_qua * qua_loss * ntokens / nsentences + \
            self.args.lambda_ctc * ctc_loss

        sample_size, logging_output = self.get_logging_output(
            sample, lprobs, e_len, loss, ctc_loss, qua_loss, ce_loss, gold_rate
        )
    else:
        import editdistance

        net_output = model(**sample["net_input"])
        num_output = torch.round(net_output["num_output"]).int()

        loss = sample_size = 0.0
        logging_output = {
            "ntokens": ntokens,
            "nsentences": nsentences,
            "sample_size": sample_size
        }
        c_err = 0
        c_len = 0
        e_len = 0
        with torch.no_grad():
            # Iterate the whole batch: the former range(9999) index was never
            # read and silently capped the number of scored sentences.
            for logits, l, t in zip(net_output['logits'], num_output, sample["target"]):
                p = t != self.task.target_dictionary.pad()
                decoded = logits.argmax(dim=-1)[:l]
                targ = t[p]
                targ_units_arr = targ.tolist()
                pred_units_arr = decoded.tolist()
                c_err += editdistance.eval(pred_units_arr, targ_units_arr)
                c_len += len(targ_units_arr)
                e_len += abs(len(targ_units_arr) - len(pred_units_arr)) * 1.0
            # Called once per batch with the last decoded pair.
            logits2sent(pred_units_arr, targ_units_arr, model.tgt_dict, rate=0.03)
            logging_output["c_errors"] = c_err
            logging_output["c_total"] = c_len
            logging_output["e_len"] = e_len

    return loss, sample_size, logging_output
# Script fragment: for the second eighth of the sorted files in `parent`,
# compare the references returned by getRef() with those of every other
# file.  Two references with the same 'year' whose 'authors' or 'ref'
# strings differ by an edit distance below 6 cause the other file's name to
# be collected; the collected set is written to `citis`.
# NOTE(review): Python 2 syntax (xrange, integer `/`).  `each_wiki is
# list_files[i]` compares identity, not string equality.
# NOTE(review): the original nesting was lost when the line was collapsed.
list_files = os.listdir(parent) list_files.sort() division = len(list_files) / 8 for i in xrange(division * 1, division * 2): curr_ref = getRef(list_files[i]) citis.write("********" + list_files[i] + "********\n") if curr_ref is None: continue for each_wiki in list_files: if each_wiki is list_files[i]: continue ref_each_wiki = getRef(each_wiki) if ref_each_wiki is None: continue print_citis = list() for each_curr_ref in curr_ref: for each_ref_each_wiki in ref_each_wiki: if each_curr_ref['year'] == each_ref_each_wiki['year']: edit_dis_authors = editdistance.eval( each_curr_ref['authors'], each_ref_each_wiki['authors']) edit_dis_ref = editdistance.eval(each_curr_ref['ref'], each_ref_each_wiki['ref']) else: continue if edit_dis_authors < 6 or edit_dis_ref < 6: print_citis.append(each_wiki) # less6.write(str(each_curr_ref)+"\n"+str(each_ref_each_wiki)+"\n"+"{"+str(int(edit_dis_authors))+","+str(int(edit_dis_ref))+"}\n") citis.write(str(set(print_citis)) + "\n")
def __eq__(self, other):
    """Fuzzy equality: True when *other* is a ``stringa`` whose text is
    within an edit distance of 5 (exclusive) of this object's text."""
    return (
        isinstance(other, stringa)
        and editdistance.eval(self.stringa, other.stringa) < 5
    )
def get_max_edit_dist(target):
    """Estimate an edit-distance cutoff for *target* from random sequences.

    Measures the edit distance from *target* to 1000 sequences produced by
    ``rand_seq(len(target))`` and returns a low percentile of those
    distances, capped at 10.

    Args:
        target: Sequence to compare against.

    Returns:
        ``min(10, p)`` where ``p`` is the 0.5th percentile of the distances.
    """
    # range replaces the Python 2-only xrange; the sample count is unchanged.
    dists = [
        editdistance.eval(target, rand_seq(len(target)))
        for _ in range(1000)
    ]
    # NOTE(review): np.percentile takes q in [0, 100], so 0.5 is the 0.5th
    # percentile, not the median. Confirm this is intended.
    return min(10, np.percentile(dists, 0.5))
def levenshtein_max(weights, seq1, seq2):
    """Return a similarity: 1 minus edit distance over the longer length.

    Args:
        weights: Not used by this function; kept for the call signature.
        seq1: First sequence.
        seq2: Second sequence.

    Returns:
        ``1 - distance / max(len(seq1), len(seq2))``.  Two empty sequences
        are identical and yield ``1.0`` rather than dividing by zero.
    """
    norm = 1.0 * max(len(seq1), len(seq2))
    if norm == 0:
        return 1.0
    return 1 - (editdistance.eval(seq1, seq2) / norm)
# Truncated fragment of a sentence-pair filter; its enclosing loop and first
# `try` are outside the visible code.  A (src, trg) pair is skipped when
# language detection raises, when either side is not detected as 'en', or
# when the edit distance of the two texts (whitespace, commas and periods
# removed, lower-cased) divided by the shorter length is below 0.4.
# NOTE(review): `min_len` is 0 when either stripped text is empty, which
# would make the ratio raise ZeroDivisionError.
print(f"Exception: {e.get_code()}") print(src +'\t'+ trg) continue try: lang2 = detect(trg) except Exception as e: print(f"Exception: {e.get_code()}") print(src +'\t'+ trg) continue if lang1 != 'en' or lang2 != 'en': print("NOT ENGLISH:\t" + src +'\t'+ trg) continue zh_tmp = re.sub(r"[\s,\.]", "", src) en_tmp = re.sub(r"[\s,\.]", "", trg) min_len = min(len(zh_tmp), len(en_tmp)) dist = editdistance.eval(zh_tmp.lower(), en_tmp.lower()) ratio = dist * 1.0 / min_len if ratio < 0.4: print(f"OVERLAP {ratio}:\t" + src +'\t'+ trg) continue if len(src) > len(trg): long_txt = src shrt_txt = trg len_short = len(trg) else: long_txt = trg shrt_txt = src len_short = len(src) segment1 = long_txt[:len_short] segment2 = long_txt[-len_short:] dist1 = editdistance.eval(segment1.lower(), shrt_txt.lower())
def forward(self, xs_pad, ilens, ys_pad):
    """Run the end-to-end forward pass and report the loss.

    Args:
        xs_pad (torch.Tensor): batch of padded input sequences (B, Tmax, idim)
        ilens (torch.Tensor): batch of lengths of input sequences (B)
        ys_pad (torch.Tensor): batch of padded character id sequence tensor (B, Lmax)

    Returns:
        loss (torch.Tensor): the value returned by the decoder, also stored
        on ``self.loss``
    """
    # Optional frontend followed by feature transformation.
    if self.frontend is None:
        hs_pad, hlens = xs_pad, ilens
    else:
        hs_pad, hlens, _ = self.frontend(to_torch_tensor(xs_pad), ilens)
        hs_pad, hlens = self.feature_transform(hs_pad, hlens)

    # Encoder, then decoder loss.
    hs_pad, hlens, _ = self.enc(hs_pad, hlens)
    loss = self.dec(hs_pad, hlens, ys_pad)

    # CER/WER are only computed in evaluation and only when requested;
    # decoding every utterance is slow, so this is meant for debugging.
    want_error_rates = not self.training and (self.report_cer or self.report_wer)
    if want_error_rates:
        batch_nbest = []
        for b in six.moves.range(int(hs_pad.size(0))):
            if self.beam_size == 1:
                hyps = self.dec.recognize(hs_pad[b], self.recog_args)
            else:
                hyps = self.dec.recognize_beam(hs_pad[b], self.recog_args)
            batch_nbest.append(hyps)

        # Best hypothesis per utterance, without its first symbol.
        y_hats = [hyps[0]['yseq'][1:] for hyps in batch_nbest]

        word_errors = word_total = char_errors = char_total = 0
        for i, y_hat in enumerate(y_hats):
            hyp_symbols = [self.char_list[int(idx)] for idx in y_hat]
            # -1 marks padding in the reference ids.
            ref_symbols = [
                self.char_list[int(idx)] for idx in ys_pad[i] if int(idx) != -1
            ]
            hyp_text = "".join(hyp_symbols).replace(self.recog_args.space, ' ')
            ref_text = "".join(ref_symbols).replace(self.recog_args.space, ' ')

            hyp_words = hyp_text.split()
            ref_words = ref_text.split()
            word_errors += editdistance.eval(hyp_words, ref_words)
            word_total += len(ref_words)

            hyp_chars = hyp_text.replace(' ', '')
            ref_chars = ref_text.replace(' ', '')
            char_errors += editdistance.eval(hyp_chars, ref_chars)
            char_total += len(ref_chars)

        if self.report_wer:
            wer = float(word_errors) / word_total
        else:
            wer = 0.0
        if self.report_cer:
            cer = float(char_errors) / char_total
        else:
            cer = 0.0
    else:
        cer, wer = 0.0, 0.0

    self.loss = loss
    loss_data = float(self.loss)
    if math.isnan(loss_data):
        logging.warning('loss (=%f) is not correct', loss_data)
    else:
        self.reporter.report(loss_data, cer, wer)

    return self.loss
def score(true, pred, iou_threshold=0.5, similarity_threshold=0.5, translator=None):
    """Score predicted text boxes against ground truth boxes.

    Args:
        true: The ground truth boxes provided as a dictionary of {image_id: annotations}
            mappings. `annotations` should be lists of dicts with a `text` and `vertices` key.
            `vertices` should be a list of (x, y) coordinates. Optionally, an "ignore" key can be
            added to indicate that detecting an annotation should neither count as a false positive
            nor should failure to detect it count as a false negative.
        pred: The predicted boxes in the same format as `true`.
        iou_threshold: The minimum IoU to qualify a box as a match.
        similarity_threshold: The minimum text similarity required to qualify
            a text string as a match.
        translator: A translator acceptable by `str.translate`. Used to
            modify ground truth / predicted strings. For example,
            `str.maketrans(string.ascii_uppercase, string.ascii_lowercase, string.punctuation)`
            would yield a translator that changes all strings to lowercase and removes punctuation.

    Returns:
        A results dictionary reporting false positives, false negatives, true positives
        and near matches (IoU > iou_threshold but similarity < similarity_threshold) along
        with the computed (precision, recall) tuple. Precision and recall are reported as
        0.0 when their denominator is zero.

    Raises:
        AssertionError: If `true` and `pred` do not have exactly the same keys.
    """
    true_ids = sorted(true)
    pred_ids = sorted(pred)
    # Compare the full key lists; zip() would silently ignore extra keys.
    assert (
        true_ids == pred_ids
    ), "true and pred dictionaries must have the same keys"
    results: typing.Dict[str, typing.List[dict]] = {
        "true_positives": [],
        "false_positives": [],
        "near_true_positives": [],
        "false_negatives": [],
    }
    for image_id in true_ids:
        true_anns = true[image_id]
        pred_anns = copy.deepcopy(pred[image_id])
        pred_matched = set()
        for true_index, true_ann in enumerate(true_anns):
            match = None
            for pred_index, pred_ann in enumerate(pred_anns):
                iou = iou_score(true_ann["vertices"], pred_ann["vertices"])
                if iou >= iou_threshold:
                    match = {
                        "true_idx": true_index,
                        "pred_idx": pred_index,
                        "image_id": image_id,
                    }
                    pred_matched.add(pred_index)
                    true_text = true_ann["text"]
                    pred_text = pred_ann["text"]
                    if true_ann.get("ignore", False):
                        # We recorded that this prediction matched something,
                        # so it won't be a false positive. But we're also ignoring
                        # this ground truth label so we won't count it as a true
                        # positive or a near true positive.
                        continue
                    if translator is not None:
                        true_text = true_text.translate(translator)
                        pred_text = pred_text.translate(translator)
                    edit_distance_norm = max(len(true_text), len(pred_text))
                    if edit_distance_norm == 0:
                        # Two empty strings are a perfect match.
                        similarity = 1
                    else:
                        similarity = 1 - (
                            editdistance.eval(true_text, pred_text)
                            / edit_distance_norm
                        )
                    if similarity >= similarity_threshold:
                        results["true_positives"].append(match)
                    else:
                        results["near_true_positives"].append(match)
            if match is None and not true_ann.get("ignore", False):
                results["false_negatives"].append(
                    {"image_id": image_id, "true_idx": true_index}
                )
        results["false_positives"].extend(
            {"pred_index": pred_index, "image_id": image_id}
            for pred_index, _ in enumerate(pred_anns)
            if pred_index not in pred_matched
        )
    fns = len(results["false_negatives"])
    fps = len(results["false_positives"])
    # Count each ground truth box once even if several predictions hit it.
    tps = len(
        set(
            (true_positive["image_id"], true_positive["true_idx"])
            for true_positive in results["true_positives"]
        )
    )
    # Guard the empty cases instead of raising ZeroDivisionError.
    precision = tps / (tps + fps) if (tps + fps) else 0.0
    recall = tps / (tps + fns) if (tps + fns) else 0.0
    return results, (precision, recall)
def cer(predict, truth):
    """Return the mean per-pair error rate of *predict* against *truth*.

    Each pair's rate is its edit distance divided by the length of the
    reference item from *truth*.
    """
    rates = []
    for hypothesis, reference in zip(predict, truth):
        rates.append(1.0 * editdistance.eval(hypothesis, reference) / len(reference))
    return np.array(rates).mean()
def edit_cer_from_list(truth, pred):
    """Return the summed edit distance over the paired items of *truth* and *pred*."""
    return sum(
        editdistance.eval(reference, hypothesis)
        for reference, hypothesis in zip(truth, pred)
    )
# Truncated fragment from inside a loop over diseases (the loop header is
# not visible).  It computes the edit distance between `actDisease` and each
# ontology name, sorts the names by distance, and records the id of the
# closest node in disease2obo[disease].  Short disease names (< 8 chars) are
# only accepted when `actDisease` is a substring of the best name.
# NOTE(review): `foundOne` is initialised to True, so `if not foundOne:
# continue` can never fire and the word filter has no effect; it probably
# should start as False.
obo2dist = {} allDiseaseWords = actDisease.split(' ') for oboname in oboname2node: foundOne = True for word in allDiseaseWords: if len(word) > 4 and word in oboname: foundOne = True break if not foundOne: continue dist = editdistance.eval(oboname, actDisease) obo2dist[oboname] = dist if len(obo2dist) == 0: continue bestDistances = sorted(obo2dist.items(), key=lambda x: x[1]) print(disease, bestDistances[0]) if len(disease) < 8 and actDisease not in bestDistances[0][0]: continue # add best distance disease2obo[disease].add(oboname2node[bestDistances[0][0]].id)
# Truncated fragment of a blend-word analysis; the enclosing loop and the
# definitions of blend1/blend2, candis, i and order are not visible.  For a
# blend that equals a candidate, the blend and its tab-separated source text
# are each split into a front and an end part around the position from
# find_vowel(); edit distance, ged() and NGram() scores of the paired parts
# are stored per `order` index, and three of the lists are passed to analyze().
# NOTE(review): the original nesting was lost when the line was collapsed.
index2 = blend2.find('\t') blend3 = blend2[index2+1:len(blend2)] blend2 = blend2[0:index2] #print(blend1,blend2,blend3) for candi in candis: candi = candi.strip() #print (candi) if blend1 == candi: recall_list[ i ]= blend1 i += 1 pos=find_vowel(blend1) fpc = first_part(blend1,pos) fpd = front_part(blend2,pos) spc = second_part(blend1,pos) epd = end_part(blend2,len(blend2)-len(blend1)+pos) ed_list1[order] = editdistance.eval(fpc,fpd) ged_list1[order] = ged(fpc,fpd) ngarm_list1[order] = NGram(fpc,fpd) ed_list2[order] = editdistance.eval(spc,epd) #ged_list2[order] = ged(spc,epd) ngarm_list2[order] = NGram(spc,epd) order += 1 print("for ed_list1") analyze(ed_list1) print("for ed_list2") analyze(ed_list2) print("for ged_list1") analyze(ged_list1)
def main():
    """Map every tweet to its closest movie title and save the matches.

    Titles are read from ``./data/imdb_titles.json`` and extended with the
    part before a ``-`` (episode removed) and the part before a ``:`` (short
    title).  Each cleaned tweet is compared with every cleaned title by
    word-level edit distance, with character-level distance as tie-breaker.
    A match is kept when it differs by at most 3 words and by fewer words
    than the tweet contains.  Results go to ``./data/mapped_titles.json``
    and the elapsed time is printed.
    """
    start = time.time()

    tweets = list(get_tweets())  # get tweets
    # Context manager so the title file is closed after loading.
    with codecs.open('./data/imdb_titles.json', 'r', encoding='utf-8') as titles_file:
        movies = json.load(titles_file)  # get movie titles
    movie_titles = list(movies.keys())

    # movie title without episode
    movie_titles_without_eps = [
        _title.split('-')[0].strip() for _title in movie_titles if '-' in _title
    ]

    # short title
    short_movie_titles = [
        _title.split(':')[0].strip() for _title in movie_titles if ':' in _title
    ]

    movies = list(set(movie_titles + movie_titles_without_eps + short_movie_titles))

    result = {}
    # for each tweet, find closest movie title
    for tweet in tweets:
        tweet_clean_text = clean_text(tweet['text'])
        if not tweet_clean_text.strip():
            continue

        # Split once per tweet instead of once per (tweet, movie) pair.
        tweet_words = tweet_clean_text.split(' ')

        closest = tweet_clean_text
        closest_dist = sys.maxsize
        closest_dist_word = sys.maxsize
        for movie in movies:
            movie_title = clean_text(movie)

            # character-level edit distance
            dist = editdistance.eval(tweet_clean_text, movie_title)
            # word-level edit distance
            dist_word = editdistance.eval(tweet_words, movie_title.split(' '))

            if dist_word < closest_dist_word:
                closest_dist = dist
                closest_dist_word = dist_word
                closest = movie_title
            elif dist_word == closest_dist_word and dist < closest_dist:
                closest_dist = dist
                closest_dist_word = dist_word
                closest = movie_title

        # at most 3 words difference and diff must be less than # of words in the tweet
        if closest_dist_word <= 3 and closest_dist_word < len(tweet_words):
            print(tweet_clean_text, '\t', closest, '\t', closest_dist)
            result[tweet['id']] = {
                'tweet': tweet['text'],
                'tweet_clean': tweet_clean_text,
                'matched_title': closest,
                'dist_char': closest_dist,
                'dist_word': closest_dist_word
            }

    end = time.time()

    # Context manager so the output is flushed and closed deterministically.
    with codecs.open("./data/mapped_titles.json", "w", encoding="utf-8") as out_file:
        json.dump(result, out_file, indent=4)
    print(end - start)
def features_for_rank(self, proc, results):
    """Build one ranking feature row per hit of a geonames query.

    Parameters
    ----------
    proc : dict
        Must provide ``proc['word']`` (the searched name) and
        ``proc['features']`` with ``code_mention`` and ``class_mention``.
    results : dict
        Query response; the hits are read from ``results['hits']['hits']``.

    Returns
    --------
    X : numpy matrix
        One row per hit with, in order: has_pop, pop, log(pop), rank
        feature, number of alternative names, its log, good feature class
        flag, class-mention match flag, code-mention match flag, and edit
        distance between the searched name and the hit's name.
    meta : list of dicts
        ``self.format_geonames(entry)`` for every hit.
    """
    hits = results['hits']['hits']
    search_name = proc['word']
    code_mention = proc['features']['code_mention']
    class_mention = proc['features']['class_mention']

    feature_list = []
    meta = []
    for rank, entry in enumerate(hits):
        # Population: missing or unparsable values count as "no population".
        try:
            pop = int(entry['population'])
            has_pop = 1
        except Exception:
            pop = 0
            has_pop = 0
        logp = np.log(pop) if pop > 0 else 0

        # Position in the result list; earlier hits get a larger value.
        adj_rank = 1 / np.log(rank + 2)

        # Number of alternative names and its log.
        len_alt = len(entry['alternativenames'])
        adj_alt = np.log(len_alt)

        # Boost feature classes "A" and "P".
        is_a = entry['feature_class'] == "A"
        good_type = 1 if (is_a or entry['feature_class'] == "P") else 0

        # Agreement with the class / code mentioned in the text.
        good_class_mention = 1 if entry['feature_class'] == class_mention else 0
        good_code_mention = 1 if entry['feature_code'] == code_mention else 0

        # Edit distance between the searched name and the hit's name.
        ed = editdistance.eval(search_name, entry['name'])

        row = [
            has_pop, pop, logp, adj_rank, len_alt, adj_alt, good_type,
            good_class_mention, good_code_mention, ed
        ]
        formatted = self.format_geonames(entry)
        feature_list.append(row)
        meta.append(formatted)

    X = np.asmatrix(feature_list)
    return (X, meta)
def normalized_levenshtein(str_a, str_b):
    '''
    Edit distance divided by the length of str_b, capped at 1.0.
    '''
    distance = editdistance.eval(str_a, str_b)
    # The tiny epsilon keeps the division defined when str_b is empty.
    ratio = distance / (len(str_b) + 1e-16)
    return min(ratio, 1.0)
# Truncated fragment of a spelling-correction script (path setup is not
# visible, and the code continues past the end of this line).  For each
# misspelling read from the Birkbeck file it collects the dictionary words
# at minimum edit distance, then compares those candidates with the
# misspelling using bigram similarity (ngram.NGram.compare with N=2).
# NOTE(review): the original nesting was lost when the line was collapsed.
# birkbeck_predict_abs_path = os.path.join(script_dir, birkbeck_predict_rel_path) # read dict with open(dict_abs_path, "r") as f: dictionary_list = f.read().splitlines() # birkbeck with open(birkbeck_misspell_abs_path, "r") as fr_misspell: with open(birkbeck_correct_abs_path, "r") as fr_correct: birkbeck_misspell = fr_misspell.read().splitlines() birkbeck_correct = fr_correct.read().splitlines() for i in range(0, len(birkbeck_misspell)): birkbeck_predict = [] dist = math.inf for each in dictionary_list: temp = editdistance.eval(birkbeck_misspell[i], each) if temp < dist: dist = temp birkbeck_predict = [each] if temp == dist: if (each not in birkbeck_predict): birkbeck_predict.append(each) birkbeck_predict_new = [] dist_ngram = -math.inf for predict in birkbeck_predict: temp_ngram = ngram.NGram.compare(predict, birkbeck_misspell[i], N=2) if temp_ngram > dist_ngram: dist_ngram = temp_ngram birkbeck_predict_new = [predict]
def compute_score2(json_input_list, outCSV, acceptTypes, allKeys):
    """Write pairwise metadata similarity scores from pre-extracted JSON.

    Every file in ``json_input_list`` holds a JSON list of single-key
    objects that map a file name to its metadata.  For each pair of file
    names the edit distance between the values of every shared metadata key
    is normalised by the longer value, averaged, and ``1 - average`` is
    written to ``outCSV`` as the similarity score.

    Args:
        json_input_list: Paths of the JSON input files.
        outCSV: Path of the CSV file to create.
        acceptTypes: Not used by this function; kept for the call signature.
        allKeys: When truthy, every key present in only one file of a pair
            adds a distance of 1 and is counted in the average.
    """
    na_metadata = ["resourceName"]
    # csv.writer needs a text-mode handle opened with newline="" on Python 3.
    with open(outCSV, "w", newline="") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        json_list = []
        for each in json_input_list:
            with open(each) as json_input_file:
                json_list.extend(json.load(json_input_file))

        # Each object in json_list has the file name as its key and the
        # metadata object as its value.
        metadata_dict = {}
        for entry in json_list:
            # dict views are not subscriptable on Python 3; take the first key.
            key = next(iter(entry))
            metadata_dict[key] = entry[key]

        files_tuple = itertools.combinations(metadata_dict.keys(), 2)
        for file1, file2 in files_tuple:
            try:
                row_edit_distance = [file1, file2]

                file1_metadata = metadata_dict[file1]
                file2_metadata = metadata_dict[file2]

                intersect_features = set(file1_metadata.keys()) & set(file2_metadata.keys())
                intersect_features = [feature for feature in intersect_features if feature not in na_metadata]

                file_edit_distance = 0.0
                for feature in intersect_features:
                    file1_feature_value = stringify(file1_metadata[feature])
                    file2_feature_value = stringify(file2_metadata[feature])

                    if len(file1_feature_value) == 0 and len(file2_feature_value) == 0:
                        feature_distance = 0.0
                    else:
                        longest = max(len(file1_feature_value), len(file2_feature_value))
                        feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value)) / longest

                    file_edit_distance += feature_distance

                # NOTE(review): both averages below raise ZeroDivisionError
                # when the pair has no usable keys; that error is not caught.
                if allKeys:
                    file1_only_features = set(file1_metadata.keys()) - set(intersect_features)
                    file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]

                    file2_only_features = set(file2_metadata.keys()) - set(intersect_features)
                    file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]

                    # Each key found in only one file counts as distance 1.
                    file_edit_distance += len(file1_only_features) + len(file2_only_features)
                    file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))
                else:
                    file_edit_distance /= float(len(intersect_features))  # average edit distance

                row_edit_distance.append(1 - file_edit_distance)
                a.writerow(row_edit_distance)

            except ConnectionError:
                sleep(1)
            except KeyError:
                continue

    return
# Script fragment: after pickling plots_tasks_year_after, take the task
# names present in both the "after" and "before" dicts and, for each one,
# collect the other shared names whose edit distance divided by the longer
# name's length is below 0.3.  Names with at least one such neighbour are
# stored in `similar_tasks`, which is pickled to a fixed absolute path.
# NOTE(review): the first statement appears to belong to a `with` block that
# starts outside the visible code.
pickle.dump(plots_tasks_year_after, handle, protocol=pickle.HIGHEST_PROTOCOL) def intersection(lst1, lst2): lst3 = [value for value in lst1 if value in lst2] return lst3 intersecting_tasks = intersection(list(plots_tasks_year_after.keys()), list(plots_tasks_year_before.keys())) import editdistance, pickle similar_tasks = {} for elem1 in intersecting_tasks: elems = [] for elem2 in intersecting_tasks: if (editdistance.eval(elem1, elem2) / max(len(elem1), len(elem2)) < 0.3): if (elem1 != elem2): elems.append(elem2) if (len(elems) > 0): similar_tasks[elem1] = elems with open( '/mnt/c/Users/D.MONDAL/Downloads/tdm-kg-new/tdm-kg/ACLAnthTill2015_Prediction/similar_tasks.pickle', 'wb') as handle: pickle.dump(similar_tasks, handle, protocol=pickle.HIGHEST_PROTOCOL)
def wer(predict, truth):
    """Return the mean per-pair word error rate of *predict* against *truth*.

    Both sides are split on single spaces; each pair's rate is the
    word-level edit distance divided by the number of reference words.
    """
    rates = []
    for hypothesis, reference in zip(predict, truth):
        hyp_words = hypothesis.split(' ')
        ref_words = reference.split(' ')
        rates.append(1.0 * editdistance.eval(hyp_words, ref_words) / len(ref_words))
    return np.array(rates).mean()