def song_comparison(s, t):
    """Return a minimal distance between two song titles treated as word lists.

    Three strategies, chosen by the relative word counts:
      * more query words than candidate words -> plain string Levenshtein;
      * more than 2 query words -> minimum-weight assignment of query words
        to candidate words (delegated to min_weight_max_matching);
      * otherwise -> brute-force over all word permutations of t taken n at
        a time, summing per-word Levenshtein distances.
    """
    s, t = s.split(), t.split()
    # Baseline: whole-string distance on the normalised (re-joined) titles.
    similarity = levenshtein(' '.join(s), ' '.join(t))
    n, m = len(s), len(t)
    if n > m:
        # Query has more words than the candidate: no word matching possible.
        return similarity
    if n > 2:
        #print(s, t)
        # Cost matrix of pairwise word distances, seeded with a large sentinel.
        arr = [[10**9 for i in range(m)] for j in range(n)]
        #print(arr)
        for i, u in enumerate(s):
            for j, v in enumerate(t):
                #print(i, j)
                arr[i][j] = levenshtein(u, v)
        #print(arr)
        return min_weight_max_matching(n, m, arr)
    # Small n: exhaustive search over ordered word assignments is cheap.
    d = {(u, v): levenshtein(u, v) for u in s for v in t}
    for p in permutations(t, n):
        #print(s, p)
        similarity = min(similarity, sum(d[(u, v)] for (u, v) in zip(s, p)))
    return similarity
def check_first_insertion(query, good, bad):
    """Assert that, under first-insertion-weighted costs, `good` is a strictly
    closer match to `query` than `bad` is.

    Insertions are cheap (1) while deletions/substitutions are expensive (100),
    so the metric favours completions of `query`; the first inserted character
    carries its own cost, discounted further when prepended (5) or appended (10).
    """
    args = dict(
        deletion_cost=100,
        insertion_cost=1,
        first_insertion_cost=50,
        prepend_first_insertion_cost=5,
        append_first_insertion_cost=10,
        substitution_cost=100,
    )
    assert_less(
        levenshtein(query, good, memo=[], precol=[], **args),
        levenshtein(query, bad, memo=[], precol=[], **args)
    )
def update(self, old, new):
    """Refresh this node's memoised Levenshtein state after its query text
    changed from `old` to `new`.

    The memo columns are valid up to the common prefix of the two strings,
    so everything past that prefix is discarded and recomputed; `precol` is
    re-seeded from the parent node's last memo column (empty at the root).
    """
    # Keep only memo columns still valid for the shared prefix (+1 for the
    # initial column).
    self.memo = self.memo[:_common_prefix_length(old, new) + 1]
    self.precol = [] if self.parent is None else [x[-1] for x in self.parent.memo]
    # Called for its side effect: repopulates self.memo/self.precol in place.
    levenshtein(
        new, self.text,
        deletion_cost=self.tree.deletion_cost,
        insertion_cost=self.tree.insertion_cost,
        substitution_cost=self.tree.substitution_cost,
        transposition_cost=self.tree.transposition_cost,
        memo=self.memo,
        precol=self.precol
    )
def compareDataset(in1, in2, path):
    """Compare the dataset at `path` in two HDF5-like containers.

    Returns None when the datasets match (or the path is ignored), the string
    'Missing' when the second container lacks the path, and otherwise a
    numeric/derived difference:
      * string datasets  -> Levenshtein distance of the first elements;
      * float datasets   -> absolute difference if above the 0.1 tolerance;
      * integer datasets -> absolute difference if non-zero.
    """
    d1 = in1.get(path)
    d2 = in2.get(path)
    if toIgnore(d1):
        # print('Ignoring ' + path)
        return None
    if d2 is None:
        # print('Second misses path: ' +path)
        return 'Missing'
    diff = testSpecial(d1, d2, path)
    # Fixed: identity check ('is not None') instead of '!= None' comparison.
    if diff is not None:
        return diff
    # NOTE(review): np.string_ is removed in NumPy 2.x — confirm the pinned
    # NumPy version before upgrading.
    if d1.dtype.type is np.string_:
        if d1[0] != d2[0]:
            return levenshtein(d1[0], d2[0])
        else:
            return None
    elif d1.dtype.kind == 'f':
        diff = abs((d1[0] - d2[0]))
        if diff > .1:
            return diff
        else:
            return None
    else:
        # print('Integer DS: ' + path)
        diff = abs(d1[0] - d2[0])
        if diff > 0:
            return diff
        else:
            return None
def find5(word, C):
    """Return up to five words from C closest to `word` by edit distance.

    Exact matches (distance 0) are skipped, as are candidates farther than the
    current fifth-best; distances are seeded at len(word), so only strictly
    closer candidates can displace the initial empty placeholders. Ties keep
    earlier candidates ahead (insertion after equal distances), matching the
    original hand-written 5-way cascade, which this replaces with bisect.
    """
    import bisect

    #word = request.args.get('q')
    minimum = len(word)
    top5 = ["", "", "", "", ""]
    edit_dist5 = [minimum] * 5
    for c in C:
        edit_dist = levenshtein.levenshtein(word, c)
        #print(edit_dist)
        if edit_dist == 0 or edit_dist > edit_dist5[4]:
            continue
        # bisect_right inserts after equal values, exactly like the original
        # strict '<' cascade; position 5 means "not better than the fifth".
        pos = bisect.bisect_right(edit_dist5, edit_dist)
        if pos < 5:
            edit_dist5.insert(pos, edit_dist)
            top5.insert(pos, c)
            edit_dist5.pop()
            top5.pop()
    return top5
def spellability(name, test=False):
    """
    Return a 0-1 score representing how spellable this name is. 0 is bad.

    How? To start off, we look at all the names with the same Metaphone key,
    then assign a penalty depending on collisions with any common names. The
    size of the penalty will depend on how common the other name is relative
    to this name (so this is also somewhat of a popularity weighting), and the
    Levenshtein distance to any names with Metaphone collisions, since
    Metaphone ignores vowels and we do care about actual vowel and letter
    differences here.

    There might be more we could do here, but I can't think of it now.
    """
    score = 1
    for metaphone in name.metaphones:
        for other in _metaphone_index[metaphone]:
            if other is name:
                continue
            # Relative popularity; the 'or 0.000001' guards a zero denominator.
            pop_ratio = (other.get_popularity(emphasize_recent=True) /
                         (name.get_popularity(emphasize_recent=True) or 0.000001))
            if pop_ratio < 0.01:
                continue  # levenshtein is expensive
            distance = levenshtein(other.name, name.name)
            # NOTE(review): distance == 0 (distinct objects, identical spelling)
            # would raise ZeroDivisionError here — confirm the index precludes it.
            penalty = math.log(1 + pop_ratio) / (distance ** 2)
            if test and name.name == "Eliza" and penalty > 0.1:
                print (name.name, "took a hit of", penalty, "from", other.name, pop_ratio, distance)
            # Each collision can at worst halve the score.
            score *= max(0.5, (1 - penalty))
            if test:
                print "%s vs. %s: pop_ratio %.5f"%(other, name, pop_ratio)
    if test:
        print "%s got spellability score %.3f"%(name, score)
    return score
def findPhonetics(subject, bookshelf):
    """Find words in the `bookshelf` word-list file whose metaphone contains
    the metaphone of `subject`.

    Returns a list of (word, levenshtein-distance) pairs sorted by distance,
    where the distance is measured against the slice of the word aligned with
    `subject` via findPhoneticIndex. Returns None if `subject` has no
    metaphone.
    """
    try:
        subPhonetic = metaphone(subject)
    except TypeError:
        print("Soundex broke")
        return
    similar = []
    # 'with' guarantees the word-list handle is closed (the original leaked it).
    with open(bookshelf) as dictionary:
        for line in dictionary:
            if "-" not in line:
                line = line.split("\n")[0]  # strip trailing newline
                try:
                    linePhonetic = metaphone(line)
                    if line != subject and linePhonetic.find(subPhonetic) != -1:
                        subjectIndex = findPhoneticIndex(subject, line)
                        similar.append(
                            (line,
                             levenshtein(
                                 line[subjectIndex:subjectIndex + len(subject)],
                                 subject)))
                except TypeError:
                    #print("Broke on " + line + "... Continuing")
                    continue
    similar.sort(key=lambda tup: tup[1])
    return similar
def submitGuess(phoneNumber, guess, self): currentPlayers = User.User.gql("") # for each current person in the db logging.info("Guess is " + guess) for player in currentPlayers: logging.info("player number " + str(player.number)) if player.number == phoneNumber or not player.word: continue distance = levenshtein(guess.lower(), player.word.lower()) logging.info("distance: " + str(distance)) if distance <= GUESS_THRESHOLD: r = twilio.Response() # Award points guesserScore = addPoints(phoneNumber, CORRECT_GUESS) guesserMessage = "You guessed " + player.word + " correct! Your score is now " + str(guesserScore) + "." if guesserScore >= MAX_SCORE: guesserMessage += " You win!" r.addSms(guesserMessage, to=phoneNumber) guessedScore = addPoints(player.number, NAME_GUESSED) guessedMessage = "You were guessed! Score: " + str(guessedScore) + "." if guessedScore >= MAX_SCORE: guessedMessage += " You win!" else: guessedMessage += " Next: " + assignNextName(player.number) r.addSms(guessedMessage, to=player.number) self.response.out.write(r) if guesserScore >= MAX_SCORE or guessedScore >= MAX_SCORE: resetGame() return player.word
def find5(word, C):
    """Return up to five words from C closest to `word` by edit distance.

    Exact matches (distance 0) are skipped, as are candidates farther than the
    current fifth-best; distances are seeded at len(word), so only strictly
    closer candidates can displace the initial empty placeholders. Ties keep
    earlier candidates ahead (insertion after equal distances), matching the
    original hand-written 5-way cascade, which this replaces with bisect.
    """
    import bisect

    #word = request.args.get('q')
    minimum = len(word)
    top5 = ["", "", "", "", ""]
    edit_dist5 = [minimum] * 5
    for c in C:
        edit_dist = levenshtein.levenshtein(word, c)
        #print(edit_dist)
        if edit_dist == 0 or edit_dist > edit_dist5[4]:
            continue
        # bisect_right inserts after equal values, exactly like the original
        # strict '<' cascade; position 5 means "not better than the fifth".
        pos = bisect.bisect_right(edit_dist5, edit_dist)
        if pos < 5:
            edit_dist5.insert(pos, edit_dist)
            top5.insert(pos, c)
            edit_dist5.pop()
            top5.pop()
    return top5
def get_distance(variant1, variant2, distanceMatrix):
    """Return the Levenshtein distance between two variants, memoised in the
    dict-of-dicts `distanceMatrix` (mutated in place).

    A freshly computed distance is also mirrored under (variant2, variant1);
    as in the original, an already-cached value is NOT re-mirrored.
    """
    # setdefault collapses the original's two duplicated branches.
    row = distanceMatrix.setdefault(variant1, dict())
    if row.get(variant2, None) is None:
        row[variant2] = levenshtein(variant1, variant2)
        # Mirror the fresh value so the symmetric lookup is also cached.
        distanceMatrix.setdefault(variant2, dict())[variant1] = row[variant2]
    return row[variant2]
def condition(movie, titles):
    """Return True when `movie` is highly rated (>= 8.0), well voted (>= 1000)
    and its title is within Levenshtein distance 3 of any entry in `titles`.

    NOTE(review): the placement of the trailing 'return False' relative to the
    rating/vote guards could not be recovered exactly from the mangled source;
    confirm that non-qualifying movies should return False rather than None.
    """
    if float(movie.rank) >= 8.0:
        if int(movie.votes) >= 1000:
            for needed in titles:
                if lnsh.levenshtein(movie.title, needed) <= 3:
                    #print('is less than 3')
                    return True
    return False
def correct(self, incorrect):
    """Return the model's best-probability correction for `incorrect` among the
    10 dictionary words nearest by edit distance; falls back to the literal
    string 'gopdebate' when even the best correction is more than 2 edits away.
    """
    test_pairs = [(incorrect, candidate) for candidate in heapq.nsmallest(
        10, self.dictionary,
        key=lambda x: levenshtein.levenshtein(incorrect, x))]
    gx_test = self.fe.transform(test_pairs)
    # Pr is a list of probability, corresponding to each correction pair in test_pairs
    pr = self.m.predict_proba(gx_test)
    print pr
    cr = zip(pr, test_pairs)
    print cr
    # We use the one with largest probability as the correction of the incorrect word
    cr = max(cr, key=lambda x: x[0][0])
    if levenshtein.levenshtein(incorrect, cr[1][1]) > 2:
        return 'gopdebate'
    else:
        return cr[1][1]
def test_mutar(self):
    """Mutating a generated solution must change it, but by at most half."""
    sol = self.__tsp.generar()
    mutada = self.__tsp.mutar(sol)
    distancia = levenshtein(sol, mutada)
    # the solutions must not be identical
    self.assertGreater(distancia, 0)
    # the solutions must not differ by more than 50%
    self.assertLessEqual(distancia, ceil(len(sol) / 2))
def test_algorith(self):
    """Test the algorithm for some strings"""
    name = "abba"
    word_list = [
        "abba",
        "acba",
        "abbba",
        "ackbar"
    ]
    answer_list = [0, 1, 1, 3]
    # zip pairs each word with its expected distance, replacing the original's
    # manual index counter.
    for word, expected in zip(word_list, answer_list):
        self.assertEqual(expected, levenshtein.levenshtein(name, word))
def __getDistanceVariants(self, variant1, variant2):
    """Memoised Levenshtein distance between two variants.

    Values are cached one-way (variant1 -> variant2) in the instance's
    distance matrix; a cache hit skips the computation entirely.
    """
    cache_row = self.__distanceMatrix.setdefault(variant1, dict())
    if variant2 in cache_row:
        return cache_row[variant2]
    cache_row[variant2] = levenshtein(variant1, variant2)
    return cache_row[variant2]
def correct(self, incorrect):
    """Return the most probable correction for `incorrect`.

    The 100 dictionary words nearest by edit distance are scored by the
    classifier; the pair with the highest probability wins.
    """
    candidates = heapq.nsmallest(
        100, self.dictionary,
        key=lambda w: levenshtein.levenshtein(incorrect, w))
    test_pairs = [(incorrect, candidate) for candidate in candidates]
    # One probability row per (incorrect, candidate) pair.
    probabilities = self.m.predict_proba(self.fe.transform(test_pairs))
    best = max(zip(probabilities, test_pairs), key=lambda pair: pair[0][0])
    return best[1][1]
def build_alignment(input_file, ref_file_norm, ref_file_mut, quality):
    """For each quality-filtered read, find the closest normal reference gene
    and record its edit distance to both the normal and the mutant reference.
    """
    # here we will have a dict with gene names and how many reads aligns to norm and mutant form
    gene_list = collections.defaultdict(list)
    ref_norm_dict = parse_reference(ref_file_norm)
    ref_mut_dict = parse_reference(ref_file_mut)
    for seq in parse_fastq(input_file, quality):
        min_dist = 1000  # sentinel: larger than any expected distance
        ref_min = ""
        seq_small = seq
        #print('SEQ %s' % (seq))
        for ref_norm in ref_norm_dict:
            dist = levenshtein(ref_norm_dict[ref_norm], seq_small)
            #print(dist, ref_norm)
            if dist < min_dist:
                min_dist = dist
                ref_min = ref_norm
        # NOTE(review): if ref_norm_dict is empty, ref_min stays "" and this
        # lookup raises KeyError — confirm references are never empty.
        mut_dist = levenshtein(ref_mut_dict[ref_min], seq_small)
        # Later reads for the same gene overwrite earlier ones.
        gene_list[ref_min] = [min_dist, mut_dist]
    print(gene_list)
def stringCheck(x):
    """Return False when `x` (length >= 6) is within one edit of a whitelisted
    word, True otherwise.

    The length guard restricts the fuzzy match to longer words so short
    strings cannot accidentally collide with the whitelist.
    """
    # 'with' guarantees the handle is closed (the original leaked it).
    with open('whitelist.csv', 'r') as whitelistRaw:
        whitelist = whitelistRaw.readlines()
    for w in range(len(whitelist)):
        whitelist[w] = whitelist[w].replace('\n', '')
    for w in whitelist:
        # allows for minor misspellings of innocuous word, but only in longer words
        if levenshtein(w, x) <= 1 and len(x) >= 6:
            # print 'variant of ' + w
            return False
    return True
def correct(dictionary, word):
    """Print the 5 dictionary lines closest to `word` by edit distance,
    with progress output every 10000 lines."""
    lines = [line for line in codecs.open(dictionary, encoding="utf-8")]
    lines_map = {}
    i = 0
    for line in lines:
        lines_map[line] = levenshtein(line, word)
        i += 1
        if i % 10000 == 0:
            # Python 2 print statement; integer division yields a whole percent.
            print(i * 100) / len(lines), "%"
    for line, score in nsmallest(5, lines_map.items(), key=itemgetter(1)):
        print line, " : ", score
def levenshtein_kmers(kmerfile, bh_pvalue_cutoff, outfile):
    """Build an all-vs-all Levenshtein distance matrix for significant kmers
    and write it to `outfile`, headed by the ordered kmer list.

    Kmers passing the BH p-value cutoff are kept; case encodes direction
    (upper = enriched or NA, lower = depleted), but distances ignore case.
    """
    enrichments = {}
    levenshteins = {}
    kmers = []
    kmersfh = open(kmerfile, 'r')
    for line in kmersfh:
        line = line.strip().split('\t')
        kmer = line[0]
        enrichment = line[3]
        bh_pvalue = line[5]
        #Define case by enrichment (greater than or less than 1)
        if kmer != 'kmer':  # skip the header row
            if float(bh_pvalue) <= float(bh_pvalue_cutoff):
                if enrichment == 'NA':
                    kmers.append(kmer.upper())
                    enrichments[kmer] = enrichment
                elif float(enrichment) > 1:
                    kmers.append(kmer.upper())
                    enrichments[kmer] = enrichment
                elif float(enrichment) < 1:
                    kmers.append(kmer.lower())
                    enrichments[kmer] = enrichment
    kmersfh.close()
    #Compare every kmer to every other kmer. Make a list of distances for every kmer.
    #Make sure kmers are converted to upper case so distances do not include case.
    #Put that list in a dictionary with kmer as key
    for kmer in kmers:
        levenshtein_list = []
        for KMER in kmers:
            levenshtein_list.append(
                levenshtein.levenshtein(kmer.upper(), KMER.upper()))
        levenshteins[kmer] = levenshtein_list
    #Turn these lists into an array
    levenshtein_array = np.array(levenshteins[kmers[0]])
    for idx, kmer in enumerate(kmers):
        if idx > 0:
            levenshtein_array = np.vstack(
                [levenshtein_array, levenshteins[kmer]])
    print levenshtein_array
    #Output array
    np.savetxt(outfile, levenshtein_array, delimiter='\t', fmt='%2f')
    #Put kmer list (in order) at top of output
    with open(outfile, 'r+') as f:
        old = f.read()
        f.seek(0)
        f.write(('\t').join(kmers) + '\n' + old)
def get_most_likely_individuals(self, query, distance=3):
    """Names within `distance` edits of `query`, best matches first,
    deduplicated while keeping that ordering."""
    scored = (
        (levenshtein(query, candidate), candidate)
        for candidate in self.get_individuals_for_levenshtein()
    )
    close = [pair for pair in scored if pair[0] <= distance]
    # dict.fromkeys keeps the first occurrence, so sorting first preserves
    # distance order through deduplication.
    unique = dict.fromkeys(sorted(close))
    return [name for _, name in unique]
def glueto(hw):
    """Resolve a headword of the form 'pre(mid)post' by deciding whether the
    parenthesised variant `mid` replaces the tail of `pre` or the head of
    `post`, using Levenshtein proximity plus a few special-case patterns.

    Returns the glued word suffixed with ':<rule-number>' identifying which
    rule fired; ':404' means no rule applied.
    """
    global counter1, counter2
    hw = hw.replace(' ','')
    hw = hw.replace('*','')
    m = re.search('(.*)[(](.+)[)](.*)',hw)
    # Strip non-letters from the three segments around the parentheses.
    pre, mid, post = re.sub('[^a-zA-Z]','',m.group(1)), re.sub('[^a-zA-Z]','',m.group(2)), re.sub('[^a-zA-Z]','',m.group(3))
    # decide the place to change
    prelev = levenshtein.levenshtein(pre[-len(mid):],mid)
    postlev = levenshtein.levenshtein(post[:len(mid)],mid)
    out = hw+":404"  # fallback when no rule matches
    if re.search('.{1}[(].{1}[)]',hw): # a(A)nEpuRa
        #print hw, "5"
        out = mid+post+":5"
    elif re.search('abBr.{1}[(]Br.{1}[)]',hw): # abBra(Bra)puzpa, abBro(Bro)tTa
        #print hw, "6"
        out = "a"+mid+post+":6"
    elif re.search('UrdDa[(]rdDva[)]',hw): # abBra(Bra)puzpa, abBro(Bro)tTa
        #print hw, "7"
        out = "UrdDva"+post+":7"
    elif prelev < postlev:
        # mid is closer to the tail of pre: replace that tail.
        if pre[-len(mid):].startswith(mid[0]) and len(pre)>=len(mid) and not pre[-len(mid):]==mid:
            #print hw, "1"
            out = pre[:-len(mid)]+mid+post+":1"
        elif pre[-len(mid):].endswith(mid[-1]) and len(pre)>=len(mid) and not pre[-len(mid):]==mid:
            #print hw, "2"
            out = pre[:-len(mid)]+mid+post+":2"
    elif postlev < prelev:
        # mid is closer to the head of post: replace that head.
        if post[:len(mid)].startswith(mid[0]) and len(post)>=len(mid) and not post[:len(mid)]==mid:
            #print hw, "3"
            out = pre+mid+post[len(mid):]+":3"
        elif post[:len(mid)].endswith(mid[-1]) and len(post)>=len(mid) and not post[:len(mid)]==mid:
            #print hw, "4"
            out = pre+mid+post[len(mid):]+":4"
    # Tie on edit distance: fall back to an alphabet-distance heuristic.
    elif alphabetdistance.distancescore(pre[:-len(mid)],mid) < alphabetdistance.distancescore(post[:len(mid)],mid):
        #print hw, "8"
        out = pre[:-len(mid)]+mid+post+":8"
    elif alphabetdistance.distancescore(pre[:-len(mid)],mid) > alphabetdistance.distancescore(post[:len(mid)],mid):
        #print hw, "9"
        out = pre+mid+post[len(mid):]+":9"
    return out
def compare(word1, word2):
    """Similarity score for two words after normalising case and whitespace.

    Returns 1.0 for an exact normalised match; otherwise a ratio derived from
    the Levenshtein distance relative to both word lengths.
    """
    word1 = word1.strip().lower().replace(u' ', u'')
    word2 = word2.strip().lower().replace(u' ', u'')
    word1_len = len(word1)
    word2_len = len(word2)
    lev = levenshtein(word1, word2)
    if lev == 0:
        return 1.0
    else:
        # NOTE(review): algebraically (lev/a) * 1/(lev/b) reduces to b/a, so
        # the edit distance cancels out of both branches — the result is just
        # the shorter/longer length ratio. Confirm this is the intended formula.
        if word1_len > word2_len:
            return (float(lev) / word1_len) * 1 / (float(lev) / word2_len)
        else:
            return (float(lev) / word2_len) * 1 / (float(lev) / word1_len)
def findByORC(wordToFix, lexicon):
    """Repair `wordToFix` by substituting the lexicon candidate with the same
    ORC code whose decoded word is Levenshtein-closest.

    The returned string splices the best candidate (first character, then 'TT',
    then its remainder) between fixed-position slices of the original.
    """
    # Candidates sharing wordToFix's ORC code, re-assembled from fixed offsets.
    sameORC = [
        seq[1][4] + seq[1][7:11] for seq in lexicon
        if orcDecode.extractORC(seq[1]) == orcDecode.extractORC(wordToFix)
    ]
    print sameORC
    word = orcDecode.extractword(wordToFix)
    levDist = [levenshtein.levenshtein(word, possible) for possible in sameORC]
    print levDist
    # min on (candidate, distance) pairs keyed by distance picks the closest.
    levPerPossible = min(zip(sameORC, levDist), key=lambda x: x[1])
    print levPerPossible
    return wordToFix[:4] + levPerPossible[0][0] + 'TT' + levPerPossible[0][
        1:] + wordToFix[11:]
def best_filename(year, abbreviated_court, court_url, citations):
    """Choose the best name for this judgment from the available citations. Is a generator, returning alternative versions."""
    # The citation closest to "[year] court " scores best; heavier deletion and
    # substitution costs favour citations that extend the dummy prefix.
    dummy_citation = "[%d] %s " % (year, abbreviated_court)
    (distance, name) = min((levenshtein.levenshtein(dummy_citation, s, deletion_cost=2, substitution_cost=2), s)
                           for s in citations)
    basic_name = str(year) + "/" + name.replace(' ', '_').replace('/', '__')
    yield os.path.join(court_url, basic_name + ".html")
    # Up to 99 numbered fallbacks before giving up.
    for c in range(1, 100):
        yield os.path.join(court_url, basic_name + "_%d" % c + ".html")
    raise StandardConversionError("something's going wrong: we can't give this a filename")
def main():
    """Print dictionary words that share the argument's first letter and are
    within Levenshtein distance 2 of it (excluding the word itself)."""
    subject = sys.argv[1]
    levenNumber = 2
    similar = []
    # 'with' closes the dictionary handle (the original leaked it).
    with open("/usr/share/dict/web2") as dictionary:
        for line in dictionary:
            line = line.split("\n")[0]  # strip trailing newline
            if subject != line and subject[0] == line[0] and levenshtein(
                    subject, line) <= levenNumber:
                similar.append(line)
    print(similar)
def distance_to(self, text):
    """Levenshtein distance from this object's text to `text`, using the
    instance's configured edit costs and its memo/precol caches (which the
    call may extend in place)."""
    return levenshtein(
        self.text, text,
        deletion_cost=self.deletion_cost,
        insertion_cost=self.insertion_cost,
        first_insertion_cost=self.first_insertion_cost,
        prepend_first_insertion_cost=self.prepend_first_insertion_cost,
        append_first_insertion_cost=self.append_first_insertion_cost,
        substitution_cost=self.substitution_cost,
        transposition_cost=self.transposition_cost,
        memo=self.memo,
        precol=self.precol
    )
def __combineTracesAndTree(self, traces):
    """Attach each trace to the tree sequence with the smallest Levenshtein
    distance to the trace's own sequence."""
    #We transform the set of sequences into a list and sort it, to discretize the behaviour of the algorithm
    sequencesTree = list(self.__getAllPotentialSequencesTree(self.tree, ""))
    sequencesTree.sort()
    for trace in traces:
        bestSequence = ""
        lowestDistance = sys.maxsize  # sentinel: any real distance beats this
        traceSequence = self.traceToSequenceDict[trace]
        for treeSequence in sequencesTree:
            currentDistance = levenshtein(traceSequence, treeSequence)
            # Strict '<' keeps the first (lexicographically smallest) minimum.
            if currentDistance < lowestDistance:
                bestSequence = treeSequence
                lowestDistance = currentDistance
        self.__addCaseToTree(trace, bestSequence)
def edit_distance():
    """ Calculates the edit distance between two strings via JSON POST """
    try:
        req_json = request.get_json()
        a, b = req_json['a'], req_json['b']
    except (KeyError, TypeError):
        # TypeError covers a request with no JSON body, where get_json()
        # returns None and the subscript fails (previously an unhandled 500).
        return api_error("Please enter two values")
    data = {
        'result': levenshtein(a, b)
    }
    return api_success(data)
def calculate_trace(s1, s2, model):
    """Build a Trace for s1 vs s2 by normalising the levenshtein trace matrix.

    The backward sweep shifts certain diagonal markers down one row so the
    resulting path is canonical before being wrapped in a Trace.
    """
    edits, trace_matrix = levenshtein(s1, s2, model)[1:]
    for i in reversed(range(len(trace_matrix))):
        for j in reversed(range(len(trace_matrix[i]))):
            try:
                if trace_matrix[i][j] and trace_matrix[i + 1][
                        j + 1] and trace_matrix[i][j - 1] and not trace_matrix[
                            i + 1][j] and not trace_matrix[i - 1][j]:
                    trace_matrix[i + 1][j] = True
                    trace_matrix[i][j] = False
            except (IndexError, KeyError):
                # Out-of-range neighbours are skipped. The original caught only
                # KeyError, which list-of-lists matrices never raise — their
                # out-of-range access raises IndexError and would crash.
                pass
    return Trace(s1, s2, trace_matrix)
def closest_match(input_word: str, words: Set) -> str:
    """Return a word from `words` with the minimal edit distance to input_word.

    The original built a dict keyed by distance, which both wasted memory and
    collapsed equal-distance words arbitrarily; min() with a key performs the
    same selection directly.
    """
    real_word(input_word)
    print("Calculating edit distance...")
    return min(words, key=lambda w: levenshtein(input_word, w))
def default(self, name):
    """Render the detail page for kaomoji `name`, alongside the 100 guesses
    most similar by edit distance (ties broken by length difference)."""
    name = name.decode('utf-8')
    # Bare lookup used as an existence check: unknown names raise KeyError.
    # NOTE(review): presumably the framework turns that into an error page —
    # confirm.
    map_kaomoji[name]
    similar_kaomoji = sorted(
        (levenshtein(name, i), abs(len(i)-len(name)), i)
        for i in kaomoji_guess_list
    )[:100]
    # Keep only the kaomoji strings, dropping the sort keys.
    similar_kaomoji = [i[-1] for i in similar_kaomoji]
    return self._template('kaomoji/kaomoji_item.html', {
        'cur_kaomoji': name,
        'kaomoji_list': map_kaomoji[name],
        'similar_kaomoji': similar_kaomoji
    })
def test_values(self):
    """ Tests for levenshtein. """
    cases = (
        ('Nebraska', 'Bill Brasky', 7),
        ('aa', '', 2),
        ('', 'aa', 2),
        ('AA', 'Aa', 1),
        ('ab', 'Aa', 2),
        ('aa', 'ab', 1),
        ('a', 'abc', 2),
    )
    for left, right, want in cases:
        self.assertEqual(levenshtein(left, right), want)
def get_features_tgt(self, target, parallelsentence):
    """
    Calculates Levenshtein distance for the given target sentence, against the reference sentence
    @param simplesentence: The target sentence to be scored
    @type simplesentence: sentence.sentence.SimpleSentence
    @rtype: dict
    @return: dictionary containing Levenshtein distance as an attribute
    """
    target_untokenized = target.get_string()
    try:
        ref_untokenized = parallelsentence.get_reference().get_string()
        wer_value = levenshtein(target_untokenized, ref_untokenized)
        return {'ref-lev': str(wer_value)}
    except Exception:
        # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt still
        # propagate; a missing reference simply yields no feature.
        return {}
def find10(word, C):
    """Return up to ten words from C closest to `word` by edit distance.

    Exact matches and candidates farther than the current tenth-best are
    skipped; distances are seeded at len(word). Maintains two parallel sorted
    lists (distances and words) via an insertion cascade.
    """
    minimum = len(word)
    top10 = ["", "", "", "", "", "", "", "", "", ""]
    edit_dist10 = [minimum] * 10
    for c in C:
        edit_dist = levenshtein.levenshtein(word, c)
        #print(edit_dist)
        if edit_dist > edit_dist10[9]:
            continue
        elif edit_dist == 0:
            continue
        elif edit_dist < edit_dist10[0]:
            edit_dist10 = [edit_dist] + edit_dist10[:9]
            top10 = [c] + top10[:9]
        elif edit_dist < edit_dist10[1]:
            edit_dist10 = [edit_dist10[0]] + [edit_dist] + edit_dist10[1:9]
            top10 = [top10[0]] + [c] + top10[1:9]
        elif edit_dist < edit_dist10[2]:
            edit_dist10 = edit_dist10[:2] + [edit_dist] + edit_dist10[2:9]
            top10 = top10[:2] + [c] + top10[2:9]
        elif edit_dist < edit_dist10[3]:
            edit_dist10 = edit_dist10[:3] + [edit_dist] + edit_dist10[3:9]
            top10 = top10[:3] + [c] + top10[3:9]
        elif edit_dist < edit_dist10[4]:
            # BUG FIX: the original assigned this branch's result to
            # 'edit_dist5' and spliced in entries from top10 (strings),
            # corrupting the distance list for every later comparison.
            edit_dist10 = edit_dist10[:4] + [edit_dist] + edit_dist10[4:9]
            top10 = top10[:4] + [c] + top10[4:9]
        elif edit_dist < edit_dist10[5]:
            edit_dist10 = edit_dist10[:5] + [edit_dist] + edit_dist10[5:9]
            top10 = top10[:5] + [c] + top10[5:9]
        elif edit_dist < edit_dist10[6]:
            edit_dist10 = edit_dist10[:6] + [edit_dist] + edit_dist10[6:9]
            top10 = top10[:6] + [c] + top10[6:9]
        elif edit_dist < edit_dist10[7]:
            edit_dist10 = edit_dist10[:7] + [edit_dist] + edit_dist10[7:9]
            top10 = top10[:7] + [c] + top10[7:9]
        elif edit_dist < edit_dist10[8]:
            edit_dist10 = edit_dist10[:8] + [edit_dist] + edit_dist10[8:9]
            top10 = top10[:8] + [c] + top10[8:9]
        elif edit_dist < edit_dist10[9]:
            edit_dist10 = edit_dist10[:-1] + [edit_dist]
            top10 = top10[:-1] + [c]
    print(top10)
    return top10
def suggesthw(inputword):
    """Suggest headwords for `inputword` from hw11.txt.

    An exact hit returns the input itself; otherwise candidates with the same
    first letter and length are ranked by edit distance (ascending), with
    initial-match score as a tiebreaker, and all candidates sharing the best
    edit distance are returned as (headword, edit, initmatch) tuples.
    """
    #hw1 = h.hw1()
    fin = codecs.open('hw11.txt','r','utf-8')
    hw1 = fin.readlines()
    hw1 = map(triming,hw1)
    if inputword in hw1:
        print "word found in hw1"
        return inputword
    else:
        output = []
        # Candidate pool: same initial letter and same length as the input.
        typicalheadwords = [member for member in hw1 if (re.search('^'+inputword[0],member) and len(inputword)==len(member))]
        for headword in typicalheadwords:
            output.append(
                (headword,lev.levenshtein(inputword,headword),initmatch(inputword,headword))
            )
        # Stable sorts: initmatch descending first, then edit distance
        # ascending, so distance dominates and initmatch breaks ties.
        output = sorted(output,key=lambda x: x[2], reverse=True)
        output = sorted(output,key=lambda x: x[1])
        leasteditdistance = output[0][1]
        leastinitmatch = output[0][2]
        return [(hw,edit,init) for (hw,edit,init) in output if edit==leasteditdistance]
def test(self):
    """Evaluate the corrector on held-out pairs and print its accuracy.

    NOTE(review): the indentation of the two trailing prints relative to the
    loop could not be recovered with certainty from the mangled source.
    """
    count = 0
    for incorrect, correct in self.ppairs_test:
        # Get the top 100 candidats with smallest levenshtein distance
        test_pairs = [(incorrect, candidate)
                      for candidate in heapq.nsmallest(100, self.dictionary,
                      key=lambda x: levenshtein.levenshtein(incorrect, x))]
        gx_test = self.fe.transform(test_pairs)
        # Pr is a list of probability, corresponding to each correction pair in test_pairs
        pr = self.m.predict_proba(gx_test)
        cr = zip(pr, test_pairs)
        # We use the one with largest probability as the correction of the incorrect word
        cr = max(cr, key=lambda x: x[0][0])
        if cr[1][1] == correct:
            count += 1
        else:
            # Show the expected pair and the (wrong) prediction on one line.
            print (incorrect, correct),
            print cr[1][1]
    print
    # Fraction of test pairs corrected exactly.
    print count/float(len(self.ppairs_test))
def __init__(self, corpfile, formcount, errorfile):
    """Build word-frequency and error-distance statistics.

    Reads `corpfile` to count word occurrences and the total word count, and
    `errorfile` (semicolon-separated wrong;right pairs) to count errors and
    bucket them by Levenshtein distance.
    """
    self.word_count = 0
    self.word_occurences = {}
    self.accepted_form_count = formcount
    self.error_occurences = {}
    # 'with' closes each handle; the original leaked the first file when the
    # variable was rebound to the second.
    with codecs.open(corpfile, encoding="utf-8") as f:
        for line in f:
            line = self._prepare_line(line)
            for word in re.split("\\s", line):
                self.word_occurences[word] = self.word_occurences.get(word, 0) + 1
                self.word_count += 1
    self.error_count = 0
    with codecs.open(errorfile, encoding="utf-8") as f:
        for line in f:
            self.error_count += 1
            splited = line.split(";")
            lev = levenshtein(splited[0], splited[1])
            self.error_occurences[lev] = self.error_occurences.get(lev, 0) + 1
def Results(self): ''' Display all identified unique log event types @return None ''' #if options.outfile == true: dump to file print "\n========== Potential Unique Log Events ==========\n" self.BuildResultsTree(self.rootNode) #Todo - commandline args to toggle levenshtein identification of dupes previous = '' for entry in self.entries: if levenshtein.levenshtein(entry, previous) < ClusterGroup.VarDistance : print "\t" + entry else: print entry previous = entry
def computeDifference(entry, malBehavior):
    """
    Compute the difference between the malicious segments and the inserted malicious behavior
    and check if the segment contains the malicious behavior
    :param entry: the most malicious segment of a trace
    :param malBehavior: malicious behavior that was inserted into the benign traces
    :type entry: string
    :type malBehavior: list
    :return: the results of the appearance and difference metric for the currently regarded malicious segment
    :rtype: string
    """
    # The segment's call list is the text before the first ';', serialised as
    # a Python literal.
    segment_calls = ast.literal_eval(entry.split(';')[0])
    distance = lev.levenshtein(segment_calls, malBehavior)
    # '1' when the segment contains the inserted behavior, '0' otherwise.
    contained = '1' if checkForBehavior(segment_calls, malBehavior) else '0'
    return contained + ';' + str(distance)
def guess_simple2(data, word, lim):
    """Return a dict of the first `lim` ranked candidates mapped to their data.

    Candidates at or below the running-minimum edit distance are pushed to the
    front of the ordering; all others are appended (as str) at the back.
    Raises if `lim` exceeds the number of candidates, as before.
    """
    words = []
    smallest = 100  # sentinel: larger than any expected distance
    for w in data:
        distance = levenshtein.levenshtein(word, w)
        if distance <= smallest:
            words.insert(0, w)
            smallest = distance
        else:
            # NOTE(review): str(w) may diverge from the dict key if data keys
            # are not strings — confirm callers pass string keys.
            words.append(str(w))
    # The original kept a parallel, never-read 'distances' list and a dead
    # 'i += 1' inside the range-loop; both removed.
    return {words[i]: data[words[i]] for i in range(lim)}
def download_subtitles_for_path(path):
    """Fetch recent subtitles and attach each to a matching video file in
    `path`, matching titles by edit distance or prefix containment."""
    video_filenames = list(get_videos(path))
    print "checking against %d video files" % len(video_filenames)
    for title, sub_id in get_last_subs():
        print "checking", title
        for video_filename in video_filenames:
            basename, _ = os.path.splitext(video_filename)
            score = levenshtein(title, basename)
            if score < LEVENSHTEIN_DIST_BOUND or basename.startswith(title) or title.startswith(basename):
                print title, basename, score
                # Exactly one subtitle is expected for this id.
                [downloaded] = list(get_subtitle(sub_id, path))
                full_path_sub = os.path.join(path, downloaded)
                _, sub_ext = os.path.splitext(downloaded)
                full_path_vid = os.path.join(path, basename)
                name_for_sub = full_path_vid + sub_ext
                # Probe for a free '.N' suffixed name (py2 backticks == repr).
                i = 0
                while os.path.exists(name_for_sub):
                    name_for_sub = full_path_vid + '.' + `i` + sub_ext
                    i += 1
                # NOTE(review): the rename target ignores the probed
                # name_for_sub and may overwrite an existing subtitle — confirm
                # whether the loop's result was meant to be used here.
                os.rename(full_path_sub, full_path_vid + sub_ext)
def guess_simple2(data, word, lim):
    """Return a dict of the first `lim` ranked candidates mapped to their data.

    Candidates at or below the running-minimum edit distance are pushed to the
    front of the ordering; all others are appended (as str) at the back.
    Raises if `lim` exceeds the number of candidates, as before.
    """
    words = []
    smallest = 100  # sentinel: larger than any expected distance
    for w in data:
        distance = levenshtein.levenshtein(word, w)
        if distance <= smallest:
            words.insert(0, w)
            smallest = distance
        else:
            # NOTE(review): str(w) may diverge from the dict key if data keys
            # are not strings — confirm callers pass string keys.
            words.append(str(w))
    # The original kept a parallel, never-read 'distances' list and a dead
    # 'i += 1' inside the range-loop; both removed.
    return {words[i]: data[words[i]] for i in range(lim)}
def Results(self): ''' Display all identified unique log event types @return None ''' #if options.outfile == true: dump to file print "\n========== Potential Unique Log Events ==========\n" self.BuildResultsTree(self.rootNode) #Todo - commandline args to toggle levenshtein identification of dupes previous = '' for entry in self.entries: if levenshtein.levenshtein(entry, previous) < ClusterGroup.VarDistance: print "\t" + entry else: print entry previous = entry
def grouping_list_by_levenstein(urls, key=lambda x: x):
    """Group consecutive items (after sorting by `key`) whose keys differ by
    exactly one edit; items with no such neighbour are dropped.

    Returns a list of groups (lists of at least two items).
    """
    ret = []
    prev = None
    urls = sorted(urls, key=key)
    lret = []  # current run of adjacent near-duplicates
    for url in urls:
        if prev:
            if levenshtein(key(prev), key(url)) == 1:
                # Seed the run with its first member the first time it grows.
                if not lret:
                    lret.append(prev)
                lret.append(url)
            else:
                if lret:
                    ret.append(lret)
                    lret = []
                #urls.insert(0, url)
        prev = url
    else:
        # for-else: flush the trailing run after the loop completes normally.
        if lret:
            ret.append(lret)
    return ret
def analytic_score_sentences(self, sentence_tuples):
    """Average Levenshtein distance over (hypothesis, reference) pairs,
    returned under the 'ref-lev' key."""
    distances = [levenshtein(hyp, ref) for hyp, ref in sentence_tuples]
    return {'ref-lev': average(distances)}
def test_blank(self):
    """Two empty strings are zero edits apart."""
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(levenshtein('', ''), 0)
def levenshtein(text1, text2):
    """Edit distance between the two texts, normalised by len(text1).

    NOTE(review): raises ZeroDivisionError when text1 is empty — confirm
    callers never pass an empty first argument.
    """
    value = lev.levenshtein(text1, text2)
    return float(value)/float(len(text1))
def download_movies(page):
    """Fetch one page of Redbox movies, upsert them into the datastore, and
    enrich each with a Rotten Tomatoes-derived score.

    NOTE(review): indentation of the trailing score/metadata section was
    reconstructed from a mangled source — confirm against the original layout.
    """
    url = "https://api.redbox.com/v3/products/movies?pageSize=10&pageNum=%s&apiKey=%s"\
        % (page, REDBOX_APIKEY)
    logging.info("Fetching products...")
    try:
        response = fetch(url, headers={'Accept': 'application/json'})
        logging.info("complete!")
        movies = json.loads(response.content)
    except:
        # Best-effort fetch: any failure yields an empty page.
        movies = {}
    if 'Products' not in movies or \
       'Movie' not in movies['Products'] or \
       len(movies['Products']['Movie']) == 0:
        logging.info("Download complete!")
        return
    for obj in movies['Products']['Movie']:
        movie_id = obj['@productId']
        movie = Movie.get_by_id(movie_id)
        if movie is None:
            movie = Movie(id=movie_id)
        # Copy over all flat attributes, stripping the '@' prefix.
        properties = {}
        for key in obj:
            if type(obj[key]) != dict:
                properties[key.replace('@', '').lower()] = obj[key]
        movie.populate(**properties)
        if type(movie.title) != str and type(movie.title) != unicode:
            movie.title = unicode(movie.title)
        if 'RatingContext' in obj and \
           '@ratingReason' in obj['RatingContext']:
            movie.ratingReason = obj['RatingContext']['@ratingReason']
        if 'Actors' in obj and 'Person' in obj['Actors']:
            movie.actors = ", ".join(obj['Actors']['Person'])
        # Third box-art link is used as the thumbnail, when present.
        if 'BoxArtImages' in obj and 'link' in obj['BoxArtImages'] \
           and type(obj['BoxArtImages']['link']) == list \
           and len(obj['BoxArtImages']['link']) >= 3 \
           and '@href' in obj['BoxArtImages']['link'][2]:
            movie.thumb = obj['BoxArtImages']['link'][2]['@href']
        movie.put()
        # Don't recalc score if it's really bad
        if hasattr(movie, 'score') and movie.score < 50 and movie.score > 0:
            continue
        movie.score = -1  # sentinel: "needs recalculation"
        # Then look up Rotten Tomatoes scores
        url = "http://api.rottentomatoes.com/api/public/v1.0/movies.json?q=%s&apikey=%s"\
            % (urllib.quote(unicodedata.normalize('NFKD', movie.title).encode('ascii', 'ignore')), RT_APIKEY)
        response = fetch(url)
        if response.status_code != 200:
            logging.error("Could not retrieve Rotten Tomatoes information for %s: %s" % (obj['Title'], url))
            content = '{"movies":{}}'
            if response.status_code == 403:
                # Rate-limited: abort the whole page.
                return
        else:
            content = response.content
        for result in json.loads(content.strip())['movies']:
            # Accept the first result whose title is within 20% edit distance.
            if (not hasattr(movie, 'score') or movie.score == -1) and \
               levenshtein(movie.title, unicode(result['title'])) / \
               len(movie.title) < 0.2:
                # This is where the magic happens
                logging.info("Recalculating score for %s" % obj['Title'])
                movie.critics_score = result['ratings']['critics_score']
                movie.critics_consensus = result['critics_consensus'] \
                    if 'critics_consensus' in result else ''
                movie.audience_score = result['ratings']['audience_score']
                movie.score = int((
                    result['ratings']['critics_score'] +
                    result['ratings']['audience_score']
                ) / 2)
                if 'release_dates' in result and \
                   'dvd' in result['release_dates']:
                    movie.dvdreleasedate = result['release_dates']['dvd']
                if 'release_dates' in result and \
                   'theatre' in result['release_dates']:
                    movie.theatrereleasedate = \
                        result['release_dates']['theatre']
                # Adjust score based on release date
                try:
                    daysago = (datetime.now() - \
                               datetime.strptime(movie.dvdreleasedate, \
                               "%Y-%m-%d")).days
                except:
                    daysago = 90  # unknown release date: treat as 90 days old
                movie.daysago = daysago
                if daysago <= 30:
                    movie.score += 5
                if daysago <= 7:
                    movie.score += 10
                if daysago > 90:
                    movie.score -= 20
        if not hasattr(movie, 'score'):
            movie.score = 0
        # Save Rotten Tomatoes metadata
        try:
            movie.rottentomatoeslink = result['links']['alternate']
        except:
            # This way, it always goes *at least* to the RT site,
            # and we avoid putting more logic in the template.
            movie.rottentomatoeslink = 'http://www.rottentomatoes.com/'
        # Save and return movie
        movie.put()
def similarity(self, val):
    """Normalised similarity in [0, 1]: one minus the edit distance divided
    by the length of the longer value."""
    import levenshtein
    mine = self._val
    other = val.get_val()
    longest = float(max(len(mine), len(other)))
    return 1 - (levenshtein.levenshtein(mine, other) / longest)
def write_metadata_to_sql(d, cursor, rel_judgment_dir):
    "Inserts judgment metadata to SQL database"
    # make sure there's a record for the court
    def get_court(court_name):
        cursor.execute('SELECT courtid,abbreviated_name,url FROM courts WHERE name = ?', (court_name,))
        result = cursor.fetchone()
        return result
    result = get_court(d["court_name"])
    if not result:
        # Unknown court name: snap to the Levenshtein-closest known court
        # (min over (distance, short, long) tuples; [1:] drops the distance).
        abbreviated_court, d["court_name"] = min(
            (levenshtein.levenshtein(d["court_name"], long), short, long)
            for (short, long) in courts.courts)[1:]
        result = get_court(d["court_name"])
    if result:
        (courtid, abbreviated_court, court_url) = result
    else:
        court_url = os.path.join(rel_judgment_dir, abbreviated_court+'/')
        # Find the correct court category
        courtcategory = False
        for category, l in courts.categories.iteritems():
            if abbreviated_court in l:
                courtcategory = category
                break
        if not courtcategory:
            raise StandardConversionError("something's going wrong: we can't find a courtcategory for " + abbreviated_court)
        cursor.execute('SELECT courtcategoryid FROM courtcategories WHERE name = ?', (courtcategory,))
        result = cursor.fetchone()
        if result:
            courtcategoryid = result[0]
        else:
            cursor.execute('INSERT INTO courtcategories (name) VALUES (?)', (courtcategory,))
            courtcategoryid = cursor.lastrowid
        cursor.execute('INSERT INTO courts(name, courtcategoryid, abbreviated_name,url) VALUES (?,?,?,?)',
                       (d["court_name"], courtcategoryid, abbreviated_court, court_url))
        courtid = cursor.lastrowid
    # insert a record
    # best_filename yields alternative URLs; try each until one is unique.
    for judgmental_url in best_filename(d["date"].year, abbreviated_court, court_url, d["citations"]):
        try:
            cursor.execute('INSERT INTO judgments(title, date, courtid, filename, bailii_url, judgmental_url) VALUES (?, ?, ?, ?, ?, ?)',
                           (d["title"], d["date"], courtid, d["filename"], d["bailii_url"], judgmental_url))
            break
        except sqlite.IntegrityError:
            pass
    judgmentid = cursor.lastrowid
    # store the citations
    for c in d["citations"]:
        cursor.execute('SELECT citationcodeid FROM citationcodes WHERE citationcode = ?', (c,))
        result = cursor.fetchone()
        if result:
            i = result[0]
        else:
            cursor.execute('INSERT INTO citationcodes(citationcode) VALUES (?)', (c,))
            i = cursor.lastrowid
        cursor.execute('INSERT INTO judgmentcodes(citationcodeid, judgmentid) VALUES (?, ?)', (i, judgmentid))
    # store the parties
    for (i, n) in d["parties"]:
        cursor.execute('INSERT INTO parties(position, name, judgmentid) VALUES (?, ?, ?)', (i, n, judgmentid))