from ast import literal_eval

from fastDamerauLevenshtein import damerauLevenshtein


def get_most_matching(prefix, suffix, threshold, conn, cur, is_subject):
    # is_subject: True if the user inputs a non-existent department, False otherwise.
    # TODO: remove redundancies around the is_subject call
    matchers = []
    if is_subject:
        # fuzzy-match the prefix against every known subject code
        subject_list = cur.execute("SELECT subject FROM subjects").fetchall()
        unpacked_list = [subject[0] for subject in subject_list]
        for subject in unpacked_list:
            result = damerauLevenshtein(prefix, subject, similarity=True)
            if result >= threshold:
                matchers.append(subject)
    else:
        # the subject exists: fuzzy-match the course number within related subjects
        related_subjects = literal_eval(
            cur.execute("SELECT relatives FROM subjects WHERE subject=?",
                        (prefix,)).fetchone()[0])
        course_list = cur.execute(
            "SELECT * FROM classes WHERE subject IN (%s)"
            % ", ".join("?" * len(related_subjects)),
            related_subjects,
        ).fetchall()
        for course in course_list:
            num = course[1]
            result = damerauLevenshtein(suffix, num, similarity=True)
            if result >= threshold:
                matchers.append(course[0] + " " + course[1])
    # return at most four matches
    return matchers[:4]
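# Hypothetical usage sketch (not from the original project): the schema and
# sample rows below are inferred from the queries above; 'relatives' holds a
# stringified Python list, which is why literal_eval() is needed.
import sqlite3

conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute("CREATE TABLE subjects (subject TEXT, relatives TEXT)")
cur.execute("CREATE TABLE classes (subject TEXT, number TEXT)")
cur.execute("INSERT INTO subjects VALUES ('MATH', \"['MATH', 'STAT']\")")
cur.execute("INSERT INTO classes VALUES ('MATH', '101')")
cur.execute("INSERT INTO classes VALUES ('STAT', '110')")

# misspelled department: fuzzy-match the prefix against known subjects
print(get_most_matching('MATG', '', 0.7, conn, cur, is_subject=True))       # ['MATH']
# existing department: fuzzy-match the course number instead
print(get_most_matching('MATH', '101', 0.7, conn, cur, is_subject=False))   # ['MATH 101']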
import fastDamerauLevenshtein


def test_damerauLevenshtein(self):
    # the third positional argument is the similarity flag:
    # False returns the raw distance, True (the default) a score in [0, 1]
    assert fastDamerauLevenshtein.damerauLevenshtein("ca", "abc", False) == 2.0
    assert fastDamerauLevenshtein.damerauLevenshtein("a cat", "a abct", False) == 2.0
    assert fastDamerauLevenshtein.damerauLevenshtein(["ab", "cd"], ["ab"], False) == 1.0
    assert fastDamerauLevenshtein.damerauLevenshtein("car", "cars") == 0.75
    assert fastDamerauLevenshtein.damerauLevenshtein("", "", False) == 0.0
    assert fastDamerauLevenshtein.damerauLevenshtein("", "") == 1.0
    assert fastDamerauLevenshtein.damerauLevenshtein([], [], False) == 0.0
    assert fastDamerauLevenshtein.damerauLevenshtein([], []) == 1.0
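# Note (inferred from the assertions above, not stated in the original): the
# similarity values are consistent with normalizing the distance by the longer
# input, similarity = 1 - distance / max(len(a), len(b)); for ('car', 'cars')
# the distance is 1 and the longer length is 4, giving 0.75.
assert 1 - 1 / max(len("car"), len("cars")) == 0.75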
from fastDamerauLevenshtein import damerauLevenshtein


def match_score(alpha, beta):
    # shift the [0, 1] similarity down so dissimilar pairs score negative
    sim = damerauLevenshtein(alpha, beta, similarity=True, replaceWeight=1)
    return sim - 0.5
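# Worked example: damerauLevenshtein('car', 'cars', similarity=True) is 0.75
# (see the tests above), so the shifted score is 0.25; identical inputs score
# 0.5 and completely dissimilar ones approach -0.5.
print(match_score('car', 'cars'))  # 0.25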
# This software is free software; thus, it is licensed under the GNU General Public License.
# Python implementation of the Smith-Waterman algorithm for Homework 1 of a Bioinformatics class.
# Forrest Bao, Sept. 26 <http://fsbao.net> <forrest.bao aT gmail.com>
from fastDamerauLevenshtein import damerauLevenshtein
from texttable import Texttable

damerauLevenshtein('car', 'cars', similarity=True)  # expected result: 0.75


# zeros() was originally from NumPy.
# This version is implemented by alevchuk 2011-04-10
def zeros(shape):
    retval = []
    for x in range(shape[0]):
        retval.append([])
        for y in range(shape[1]):
            retval[-1].append(0)
    return retval


gap_penalty = -1


def match_score(alpha, beta):
    sim = damerauLevenshtein(alpha, beta, similarity=True, replaceWeight=1)
    return sim - 0.5
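# The snippet above defines the helpers but omits the matrix-fill step. Below
# is a minimal sketch of that step, assuming the standard Smith-Waterman
# recurrence; the name smith_waterman_score is hypothetical and not part of
# the original homework code.
def smith_waterman_score(seq1, seq2):
    rows, cols = len(seq1) + 1, len(seq2) + 1
    H = zeros((rows, cols))  # scoring matrix; first row and column stay 0
    best = 0
    for i in range(1, rows):
        for j in range(1, cols):
            diag = H[i - 1][j - 1] + match_score(seq1[i - 1], seq2[j - 1])
            up = H[i - 1][j] + gap_penalty    # gap in seq2
            left = H[i][j - 1] + gap_penalty  # gap in seq1
            H[i][j] = max(0, diag, up, left)  # local alignment floors at 0
            best = max(best, H[i][j])
    return best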
import numpy as np
from fastDamerauLevenshtein import damerauLevenshtein

# predict_editdist and predict_prequential are defined elsewhere in the module.


def get_median_error(error_function, row, ID, objects, coordinates,
                     start_coordinates, c, k, dimension, sequence,
                     distances_dict, n=1):
    '''
    Return median error for chosen error measure (editdist or prequential)
    for n trials.

    Parameters
    ----------
    error_function : str
        Error measure to use: editdist or prequential.
    row : int
        Row number in dataframe.
    ID : str
        Identifier for episode.
    objects : list
        Objects in episode.
    coordinates : dictionary
        Coordinates of objects.
    start_coordinates : list
        List of coordinates where subject is standing before each
        picking-up action.
    c : dictionary
        Parameter values for containment for all objects.
    k : dictionary
        Parameter values for relational dependencies for all objects.
    dimension : list [int, str]
        Dimension in which to consider distances. The default is [3, ].
    sequence : str
        Observed sequence of objects in episode.
    distances_dict : dictionary
        Dictionary containing distances between objects in all dimensions.
    n : int, optional
        Number of iterations. The default is 1.

    Returns
    -------
    median : float
        Median error value.
    '''
    error_list = []
    for x in range(0, n):
        # error using edit distance (predict the whole sequence, then compare)
        if error_function == 'editdist':
            # get predicted sequence for list of objects
            prediction = ''.join(
                predict_editdist(distances_dict, ID, objects, coordinates,
                                 start_coordinates, sequence, c, k, dimension))
            # calculate normalized error between predicted and given sequence
            dl = 1 - damerauLevenshtein(sequence, prediction)
            error_list.append(dl)
        # summed error using the prequential method (predict each next step only)
        elif error_function == 'prequential':
            errors = predict_prequential(distances_dict, ID, objects,
                                         coordinates, start_coordinates,
                                         sequence, c, k, dimension)
            summed = sum(errors)
            error_list.append(summed)
    median = np.nanmedian(error_list)
    return median
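# Worked illustration of the editdist error above (an addition, not part of
# the original module): damerauLevenshtein defaults to similarity=True, so one
# adjacent swap in a four-character sequence gives similarity 0.75, error 0.25.
print(1 - damerauLevenshtein('ABCD', 'ABDC'))  # 0.25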
from spellchecker import SpellChecker

spell = SpellChecker()

# find those words that may be misspelled
misspelled = spell.unknown(['something', 'is', 'hapenning', 'here'])

for word in misspelled:
    # Get the one `most likely` answer
    print(spell.correction(word))
    # Get a list of `likely` options
    print(spell.candidates(word))

from Levenshtein import distance as levenshtein_distance
from fastDamerauLevenshtein import damerauLevenshtein

dist = levenshtein_distance('aaa_cb', 'aaa_bc')
print(dist)

dist2 = damerauLevenshtein('aaa_cb', 'aaa_bc', similarity=False)
print(dist2)
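# Expected output: the plain Levenshtein distance is 2 (two substitutions),
# while the Damerau-Levenshtein distance is 1.0, since the adjacent swap
# 'cb' -> 'bc' counts as a single transposition; that is the point of the
# comparison.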
import pickle

import numpy as np
import pandas as pd
from fastDamerauLevenshtein import damerauLevenshtein
from keras.optimizers import Adadelta

# PlaqueOCR, decode_batch, and decode_true are defined elsewhere in the project.

with open('data/val_ocr_2823.pickle', 'rb') as f:
    input_val, output_val = pickle.load(f)
with open('data/test_ocr_707.pickle', 'rb') as f:
    input_test, output_test = pickle.load(f)

print('Predict the test set')
POCR1 = PlaqueOCR(shape=(128, 64, 3), shapes=[10], gru=512,
                  weight='data/weight/OCR_11.h5', optimizers=Adadelta())
POCR2 = PlaqueOCR(shape=(128, 64, 3), shapes=[10], gru=512,
                  weight='data/weight/OCR_12.h5', optimizers=Adadelta())
POCR3 = PlaqueOCR(shape=(128, 64, 3), shapes=[10], gru=512,
                  weight='data/weight/OCR_13.h5', optimizers=Adadelta())

y_hat1 = POCR1.predict(input_test['train_input'])
y_hat2 = POCR2.predict(input_test['train_input'])
y_hat3 = POCR3.predict(input_test['train_input'])

# ensemble the three models by multiplying their output probabilities
y_hat = y_hat1 * y_hat2 * y_hat3

pred = decode_batch(y_hat)
true = decode_true(input_test['the_labels'])

res = pd.DataFrame()
res['true'] = true
res['pred'] = pred
# per-plaque raw Damerau-Levenshtein distance between truth and prediction
res['score'] = [damerauLevenshtein(true[i], pred[i], similarity=False)
                for i in range(len(pred))]

score = np.mean(res['score'].values)
print(score)
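# Optional follow-up (hypothetical, not in the original script): with
# similarity=False the mean score is an average raw edit distance, so 0.0
# means perfect predictions; an accuracy-style number in [0, 1] can be
# computed from the similarity form instead.
acc = np.mean([damerauLevenshtein(true[i], pred[i], similarity=True)
               for i in range(len(pred))])
print(acc)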
from fastDamerauLevenshtein import damerauLevenshtein


def StringDistance(s1, s2):
    # raw Damerau-Levenshtein distance between the two strings
    return damerauLevenshtein(s1, s2, similarity=False)
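# Example (values taken from the library tests above):
print(StringDistance('ca', 'abc'))    # 2.0: one transposition plus one insertion
print(StringDistance('car', 'cars'))  # 1.0: one insertion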