def get_del_ins_num(errs, corrs):
    """Count deletions and insertions along a minimal edit path.

    Walks the Levenshtein DP table backwards from its last cell to (0, 0)
    and tallies the edit operations.  A substitution step is counted as
    one deletion plus one insertion.

    NOTE(review): assumes levenshtein_distance returns (i, j, dp) where
    dp is the full distance matrix and (i, j) are its final indices --
    confirm against that helper's definition.
    """
    i, j, dp = levenshtein_distance(errs, corrs)
    del_num, ins_num = 0, 0
    while i > 0 or j > 0:
        # Choose the predecessor cell: 0 = diagonal (match/substitution),
        # 1 = up (deletion), 2 = left (insertion).  On a table edge only
        # one move is possible.
        if i == 0:
            min_idx = 2
        elif j == 0:
            min_idx = 1
        else:
            dp_val = [dp[i - 1][j - 1], dp[i - 1][j], dp[i][j - 1]]
            min_idx = dp_val.index(min(dp_val))
        if dp[i][j] == dp[i - 1][j - 1] and min_idx == 0:
            # Diagonal move with no cost increase: characters matched.
            i -= 1
            j -= 1
        elif min_idx == 0:
            # Costly diagonal move: substitution, tallied as delete+insert.
            del_num += 1
            ins_num += 1
            i -= 1
            j -= 1
        elif min_idx == 1:
            del_num += 1
            i -= 1
        else:
            ins_num += 1
            j -= 1
    return del_num, ins_num
def average_levenshtein(predicted_plain: List[str], correct_plain: List[str]) -> float:
    """Return the mean Levenshtein distance over paired sentences.

    Each predicted string is compared with the correct string at the same
    position; the summed distances are divided by the number of correct
    strings.
    """
    total = sum(
        levenshtein_distance(reference, prediction)
        for prediction, reference in zip(predicted_plain, correct_plain)
    )
    return total / len(correct_plain)
def align_err_cor(errs, corrs):
    """Align two sequences along a minimal Levenshtein edit path.

    Returns (err_vals, corr_vals): two equal-length lists where a matched
    or substituted position holds the corresponding elements, a deletion
    puts None in corr_vals, and an insertion puts None in err_vals.

    NOTE(review): assumes levenshtein_distance returns (i, j, dp) with the
    full DP matrix -- confirm against that helper's definition.
    """
    i, j, dp = levenshtein_distance(errs, corrs)
    err_vals, corr_vals = [], []
    while i > 0 or j > 0:
        # Predecessor choice: 0 = diagonal, 1 = up (deletion), 2 = left
        # (insertion); table edges force the only legal move.
        if i == 0:
            min_idx = 2
        elif j == 0:
            min_idx = 1
        else:
            dp_val = [dp[i - 1][j - 1], dp[i - 1][j], dp[i][j - 1]]
            min_idx = dp_val.index(min(dp_val))
        if dp[i][j] == dp[i - 1][j - 1] and min_idx == 0:
            # Match: keep both elements.
            i -= 1
            j -= 1
            corr_vals.append(corrs[j])
            err_vals.append(errs[i])
        elif min_idx == 0:
            # Substitution: keep both (differing) elements.
            i -= 1
            j -= 1
            corr_vals.append(corrs[j])
            err_vals.append(errs[i])
        elif min_idx == 1:
            # Deletion: the errs element has no counterpart.
            i -= 1
            corr_vals.append(None)
            err_vals.append(errs[i])
        else:
            # Insertion: the corrs element has no counterpart.
            j -= 1
            corr_vals.append(corrs[j])
            err_vals.append(None)
    # The traceback built the alignment back-to-front.
    corr_vals.reverse()
    err_vals.reverse()
    return err_vals, corr_vals
def process(text):
    """Clean and filter one tab-separated "error<TAB>correction" line.

    Both halves are width-normalized with mojimoji and language-checked.
    The pair survives only when the Levenshtein deletion/insertion counts
    stay under the module-level thresholds (d_num / i_num, and under 40%
    of each side's length), the correction ends with a sentence-final
    mark, its penultimate character is not in numlist, it contains no
    '__', and it is longer than six characters.  Surviving pairs are
    stripped of enumeration markers / dialogue tags and returned;
    otherwise (None, None) is returned.
    """
    result_err, result_corr = None, None
    fields = text.split("\t")
    if len(fields) != 2:
        return result_err, result_corr

    def _normalize(raw):
        # Half-width ASCII/digits, full-width kana.
        half = mojimoji.zen_to_han(raw.rstrip('\n'), kana=False)
        return mojimoji.han_to_zen(half, ascii=False, digit=False)

    err = _normalize(fields[0])
    corr = _normalize(fields[1])
    err_lang = utils.lang_check(err, lang)
    corr_lang = utils.lang_check(corr, lang)
    if err_lang and corr_lang:
        errs, corrs = list(err), list(corr)
        del_num, ins_num = ld.levenshtein_distance(errs, corrs)
        del_portion = del_num / len(errs)
        ins_portion = ins_num / len(corrs)
        keep = (del_num < d_num and ins_num < i_num
                and del_portion < 0.4 and ins_portion < 0.4
                and (corrs[-1] == '。' or corrs[-1] == '?' or corrs[-1] == '!')
                and corrs[-2] not in numlist
                and '__' not in corr
                and len(corr) > 6)
        if keep:
            # Strip list markers and dialogue tags; order matters (e.g.
            # "1.)" must be tried before the bare "1." pattern).
            for pattern in (
                    "\d+\)\s+",    # "1) "
                    "\(\s",        # "( "
                    "\s\)",        # " )"
                    "\d+\.\)\s*",  # "1.)"
                    "\d+\.\s*",    # "1."
                    "・\s+",        # leading "・"
                    "\*\s+",       # "* "
                    "\*\*\s+",     # "** "
                    "-\s+",        # "- "
                    "A:\s*",       # conversation tag
                    "B:\s*",       # conversation tag
            ):
                err = re.sub(pattern, "", err)
                corr = re.sub(pattern, "", corr)
            result_err, result_corr = err, corr
    return result_err, result_corr
def process(text):
    """Filter one tab-separated "error<TAB>correction" line and print it.

    The line is split on a tab; both halves are width-normalized with
    mojimoji, ASCII/language-checked, word-tokenized, and kept only when
    the Levenshtein deletion/insertion counts and their per-length
    proportions fall under the module-level thresholds d_num / i_num and
    0.33.  The surviving pair is printed (tokens space-joined); returns
    None.
    """
    err_corr = text.split("\t")
    if len(err_corr) == 2:
        err = mojimoji.zen_to_han(err_corr[0].rstrip('\n'), kana=False)
        corr = mojimoji.zen_to_han(err_corr[1].rstrip('\n'), kana=False)
        err_lang = utils.lang_check(err, lang) if check_ascii(err) else False
        corr_lang = utils.lang_check(corr, lang) if check_ascii(corr) else False
        if err_lang and corr_lang:
            errs = tokenize.word_tokenize(err)
            corrs = tokenize.word_tokenize(corr)
            # Guard against empty token lists, which would divide by zero.
            if not errs or not corrs:
                return
            del_num, ins_num = ld.levenshtein_distance(errs, corrs)
            del_portion = del_num / len(errs)
            ins_portion = ins_num / len(corrs)
            if del_num < d_num and ins_num < i_num and del_portion < 0.33 and ins_portion < 0.33:
                # BUG FIX: errs/corrs are token lists, so the original
                # `errs + "\t" + corrs` raised TypeError (list + str).
                # Join the tokens back into strings before printing.
                print(" ".join(errs) + "\t" + " ".join(corrs))
def process(text):
    """Filter one tab-separated "error<TAB>correction" line and print it.

    Both halves are width-normalized with mojimoji (ASCII/digits to
    half-width, kana kept full-width) and language-checked, then compared
    character-wise with a Levenshtein alignment; the pair is printed only
    when deletions/insertions stay under the module-level thresholds
    d_num / i_num and under 40% of each side's length.
    """
    parts = text.split("\t")
    if len(parts) != 2:
        return

    def _normalize(raw):
        half = mojimoji.zen_to_han(raw.rstrip('\n'), kana=False)
        return mojimoji.han_to_zen(half, ascii=False, digit=False)

    err = _normalize(parts[0])
    corr = _normalize(parts[1])
    err_lang = utils.lang_check(err, lang)
    corr_lang = utils.lang_check(corr, lang)
    if not (err_lang and corr_lang):
        return
    errs, corrs = list(err), list(corr)
    del_num, ins_num = ld.levenshtein_distance(errs, corrs)
    del_portion = del_num / len(errs)
    ins_portion = ins_num / len(corrs)
    if (del_num < d_num and ins_num < i_num
            and del_portion < 0.4 and ins_portion < 0.4):
        print(err + "\t" + corr)
def f(a, b):
    # Debug wrapper (Python 2 print statement): echo the two operands,
    # then delegate to the module-level levenshtein_distance.
    print a, b
    return levenshtein_distance(a, b)
def extractMostCommonGroup(url, html=None):
    """Find the largest group of similar outbound links on a page.

    Fetches `url` (unless `html` is supplied), collects the hrefs of
    anchors that wrap an <img> and contain a digit, clusters them via
    `kmeans` on their Levenshtein distance to the page's own domain, and
    returns the biggest cluster -- or, when that cluster holds no direct
    image links, the largest-image URL extracted from each linked page.

    NOTE(review): Python 2 code (print statements, dict.has_key).
    """
    ## REQUIRING A HOSTNAME MATCH DOESN'T WORK FOR RELATIVE URLS..
    # match = url_hostname_re.match(url)
    # if match is not None:
    #     hostname = match.group(2)
    # else:
    #     raise Exception('Failed to extract hostname from the supplied url?')
    # (kcluster experiment kept for reference)
    # a=array([[1,2,9,10,99,100], [3,4,10,11,99,150], [99, 100, 10, 13, 400, -3]])
    # mask=array([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]])
    # clusterid, error, nfound = kcluster(a,
    #     nclusters=2, mask=mask, weight=array([1, 1, 1, 1, 1, 1]),
    #     transpose=0, npass=1000, method='a', dist='e')
    # print clusterid
    # print error
    # print nfound
    # return
    if html is None:
        html = wget(url)
    # Work on the <body> contents only, when it can be isolated.
    bodyOnlyMatch = bodyOnlyRe.match(html)
    if bodyOnlyMatch:
        d = pq(bodyOnlyMatch.group(1))
    else:
        print 'HTML: %s' % html
        print 'Warning: <body> tag region could not be extracted - this is bad'
        d = pq(html)

    def _print(x):
        # Debug helper usable inside the comprehension's condition below.
        print x
        return True

    #print [urlparse.urljoin(url, a.attrib['href']) for a in d('a')]
    # Candidate links: anchors that wrap at least one <img>, have an href,
    # and whose href contains a digit.
    lst = list(excludeDuplicates([
        urlparse.urljoin(url, a.attrib['href']) for a in d('a') if (
            #_print(pq(a).children('img')) and
            len(pq(a).children('img')) > 0 and
            a.attrib.has_key('href') and
            containsNumberRe.search(a.attrib['href'])
        )
    ]))
    #print lst
    domain = urlparse.urlsplit(url).netloc
    # Feature per link: edit distance between the link (scheme stripped)
    # and the page's own domain.
    diffs = dict([
        (item, levenshtein_distance(lst[i].replace('http://', ''), domain))
        for i, item in enumerate(lst)
    ])
    # avg = mean(diffs)
    # print avg
    # print min(diffs)
    # print max(diffs)
    # pprint (lst)

    def f(a, b):
        # Scalar distance metric handed to kmeans.
        return abs(a - b)

    # NOTE(review): integer division under Python 2.
    numGroups = len(diffs) / 2
    print 'Num groups:', numGroups
    if len(diffs) == 0:
        print 'LargestImage extract.. error: no diffs!'
        return []
    groups = kmeans(diffs, numGroups, f)
    # Select and return the largest group of similar links.
    maxIdx = -1
    conflictedIdx = -1
    maxSz = 0
    for idx, group in groups.items():
        l = len(group)
        print l
        if l > maxSz:
            maxIdx = idx
            maxSz = l
            # Any previous conflict is no longer relevant.
            conflictedIdx = -1
        elif l == maxSz:
            # Mark conflicted state.
            conflictedIdx = idx
    print 'groups = %s' % groups
    # Make sure we got a result.
    if maxIdx == -1:
        raise Exception('No groups were found? Very odd.. groups = %s' % groups)
    # Check to see if the largest group had conflicts.
    if conflictedIdx != -1:
        print 'WARNING: There was a group of equal size which was not selected.'
    # Does the winning group contain any direct image URLs?
    imageLinks = False
    for link in groups[maxIdx]:
        if imageRe.match(link):
            imageLinks = True
            break
    if not imageLinks:
        print 'no image links were found.. for url=%s' % url
        # Fall back to scraping the largest image out of each linked page.
        out = []
        for link in groups[maxIdx]:
            out.append(extractLargestImageUrlFromUrl(link))
        return out
    #print 'imageLinks = %s' % imageLinks
    return groups[maxIdx]
#find incorrect words if token not in word2index: if verbose == 1: print ">> incorrect word:", token #generate context context = re.sub(token, '[]', tweet) context_proposition = explore_context2vec(context, w, word2index, index2word, model, target_exp, n_results) #find clother word in context min_dist = np.inf for proposition in context_proposition: dist = levenshtein_distance(token, proposition) if dist < min_dist: min_dist = dist correct_word = proposition[0] if verbose == 1: print ">> correction:", correct_word print '>> Levenshtein distance:', min_dist correct_tweet = re.sub(token, correct_word, tweet) else: correct_tweet = tweet if verbose == 1: print correct_tweet, '\n' normalised_rtweet_list.append(correct_tweet) print 'Writing file...' normalised_corpus = open('normalised_' + corpus_file, 'w')
def test_generic(self):
    """Each (string1s[i], string2s[i]) pair must yield distance[i]."""
    for expected, s1, s2 in zip(self.distance, self.string1s, self.string2s):
        self.assertEqual(expected, levenshtein_distance(s1, s2))