Example #1
0
def get_most_matching(prefix, suffix, threshold, conn, cur, is_subject):
    """Return up to four fuzzy matches for a course-search query.

    Parameters
    ----------
    prefix : str
        Subject code the user typed (e.g. a department abbreviation).
    suffix : str
        Course number the user typed; only used when ``is_subject`` is False.
    threshold : float
        Minimum Damerau-Levenshtein similarity (0..1) to count as a match.
    conn :
        Database connection; unused here but kept for interface compatibility.
    cur :
        Cursor over a database with ``subjects`` and ``classes`` tables.
    is_subject : bool
        True if the user entered a non-existent subject (fuzzy-match against
        subject names); False to fuzzy-match course numbers within the
        subjects related to ``prefix``.

    Returns
    -------
    list[str]
        At most four matching subject codes, or "SUBJECT NUMBER" strings.
    """
    # TODO: remove redundancies around the is_subject call sites.

    matchers = []

    if is_subject:
        # Fuzzy-match the typed prefix against every known subject code.
        rows = cur.execute("SELECT subject FROM subjects").fetchall()
        for (subject,) in rows:
            if damerauLevenshtein(prefix, subject, similarity=True) >= threshold:
                matchers.append(subject)
    else:
        row = cur.execute("SELECT relatives FROM subjects WHERE subject=?",
                          (prefix, )).fetchone()
        if row is None:
            # Unknown subject: nothing to match against (original code
            # raised TypeError on fetchone() returning None).
            return []

        # `relatives` is stored as a stringified Python list.
        related_subjects = literal_eval(row[0])
        if not related_subjects:
            # Avoid building an invalid "IN ()" clause for an empty list.
            return []

        # Parameterized IN (...) clause: one '?' placeholder per subject.
        placeholders = ", ".join("?" * len(related_subjects))
        course_list = cur.execute(
            "SELECT * FROM classes WHERE subject IN (%s)" % placeholders,
            related_subjects,
        ).fetchall()

        # Fuzzy-match the typed course number within the related subjects.
        for course in course_list:
            num = course[1]
            if damerauLevenshtein(suffix, num, similarity=True) >= threshold:
                matchers.append(course[0] + " " + course[1])

    return matchers[:4]
 def test_damerauLevenshtein(self):
     """Spot-check edit distances and similarity scores on strings and lists."""
     dL = fastDamerauLevenshtein.damerauLevenshtein
     # (first, second, similarity flag or None for the default, expected value)
     cases = [
         ("ca", "abc", False, 2.0),
         ("a cat", "a abct", False, 2.0),
         (["ab", "cd"], ["ab"], False, 1.0),
         ("car", "cars", None, 0.75),
         ("", "", False, 0.0),
         ("", "", None, 1.0),
         ([], [], False, 0.0),
         ([], [], None, 1.0),
     ]
     for first, second, flag, expected in cases:
         if flag is None:
             assert dL(first, second) == expected
         else:
             assert dL(first, second, flag) == expected
Example #3
0
def match_score(alpha, beta):
    """Score a pair for alignment.

    Maps Damerau-Levenshtein similarity (in [0, 1]) to a centered score in
    [-0.5, 0.5]: positive for similar pairs, negative for dissimilar ones.
    """
    similarity = damerauLevenshtein(alpha, beta, similarity=True, replaceWeight=1)
    return similarity - 0.5
Example #4
0
# This software is a free software. Thus, it is licensed under GNU General Public License.
# Python implementation to Smith-Waterman Algorithm for Homework 1 of Bioinformatics class.
# Forrest Bao, Sept. 26 <http://fsbao.net> <forrest.bao aT gmail.com>

from fastDamerauLevenshtein import damerauLevenshtein
from texttable import Texttable

# Library sanity-check call; the return value (0.75) is intentionally discarded.
damerauLevenshtein('car', 'cars', similarity=True)  # expected result: 0.75


# zeros() was origianlly from NumPy.
# This version is implemented by alevchuk 2011-04-10
def zeros(shape):
    """Return a shape[0] x shape[1] nested list of integer zeros.

    Pure-Python stand-in for ``numpy.zeros``; ``shape`` is a (rows, cols)
    pair. Each row is a fresh list, so rows never alias each other.
    """
    rows, cols = shape[0], shape[1]
    # Comprehension replaces the original manual append loop (same result).
    return [[0] * cols for _ in range(rows)]


# Gap penalty used by the Smith-Waterman alignment scoring.
gap_penalty = -1


def match_score(alpha, beta):
    """Return the Damerau-Levenshtein similarity of *alpha* and *beta*,
    shifted so the result lies in [-0.5, 0.5] (0.5-similarity maps to 0)."""
    return damerauLevenshtein(alpha, beta, similarity=True, replaceWeight=1) - 0.5

Example #5
0
def get_median_error(error_function,
                     row,
                     ID,
                     objects,
                     coordinates,
                     start_coordinates,
                     c,
                     k,
                     dimension,
                     sequence,
                     distances_dict,
                     n=1):
    '''
    Return the median error over n trials for the chosen error measure.

    Parameters
    ----------
    error_function : function
        Error measure to use: editdist or prequential.
    row : int
        Row number in dataframe.
    ID : str
        Identifier for episode.
    objects : list
        Objects in episode.
    coordinates : dictionary
        Coordinates of objects.
    start_coordinates : list
        List of coordinates where subject is standing before each picking-up action.
    c : dictionary
        Parameter values for containment for all objects.
    k : dictionary
        Parameter values for relational dependencies for all objects.
    dimension : list [int, str]
        Dimension in which to consider distances. The default is [3, ].
    sequence : str
        Observed sequence of objects in episode.
    distances_dict : dictionary
        Dictionary containing distances between objects in all dimensions.
    n : int, optional
        Number of iterations. The default is 1.

    Returns
    -------
    median : float
        Median error value.

    '''
    trial_errors = []

    for _ in range(n):
        if error_function == 'editdist':
            # Edit-distance mode: predict the whole sequence at once, then
            # compare it against the observed sequence.
            predicted = predict_editdist(distances_dict, ID, objects,
                                         coordinates, start_coordinates,
                                         sequence, c, k, dimension)
            # Normalized error = 1 - normalized Damerau-Levenshtein similarity.
            trial_errors.append(1 - damerauLevenshtein(sequence,
                                                       ''.join(predicted)))
        elif error_function == 'prequential':
            # Prequential mode: predict only the next step each time and sum
            # the per-step errors for this trial.
            step_errors = predict_prequential(distances_dict, ID, objects,
                                              coordinates, start_coordinates,
                                              sequence, c, k, dimension)
            trial_errors.append(sum(step_errors))

    # NaN-aware median across trials (NaN entries are ignored).
    return np.nanmedian(trial_errors)
Example #6
0
# Demo: spell correction plus two flavors of edit distance.
from spellchecker import SpellChecker

spell = SpellChecker()

# find those words that may be misspelled
misspelled = spell.unknown(['something', 'is', 'hapenning', 'here'])

for word in misspelled:
    # Get the one `most likely` answer
    print(spell.correction(word))

    # Get a list of `likely` options
    print(spell.candidates(word))

# Compare plain Levenshtein with Damerau-Levenshtein on a transposition:
# 'cb' -> 'bc' costs 2 edits for the former but only 1 for the latter.
from Levenshtein import distance as levenshtein_distance
from fastDamerauLevenshtein import damerauLevenshtein

dist = levenshtein_distance('aaa_cb', 'aaa_bc')

print(dist)
# similarity=False returns the raw edit-distance count, not a [0, 1] score.
dist2 = damerauLevenshtein('aaa_cb', 'aaa_bc', similarity=False)
print(dist2)
# Load the held-out validation and test splits (inputs + ground-truth labels).
# NOTE(review): pickle.load is only safe on trusted, locally produced files.
with open('data/val_ocr_2823.pickle', 'rb') as f:
    input_val, output_val = pickle.load(f)

with open('data/test_ocr_707.pickle', 'rb') as f:
    input_test, output_test = pickle.load(f)

print('Predict the test set')
# Three independently trained OCR models whose predictions are ensembled below.
POCR1 = PlaqueOCR(shape=(128, 64, 3), shapes=[10], gru=512, weight='data/weight/OCR_11.h5', optimizers=Adadelta())
POCR2 = PlaqueOCR(shape=(128, 64, 3), shapes=[10], gru=512, weight='data/weight/OCR_12.h5', optimizers=Adadelta())
POCR3 = PlaqueOCR(shape=(128, 64, 3), shapes=[10], gru=512, weight='data/weight/OCR_13.h5', optimizers=Adadelta())
y_hat1 = POCR1.predict(input_test['train_input'])
y_hat2 = POCR2.predict(input_test['train_input'])
y_hat3 = POCR3.predict(input_test['train_input'])

# Ensemble by multiplying the three models' outputs elementwise.
# (Removed a dead store: `y_hat = y_hat1 * y_hat2` was immediately
# overwritten by the three-way product below.)
y_hat = y_hat1 * y_hat2 * y_hat3

pred = decode_batch(y_hat)
true = decode_true(input_test['the_labels'])

res = pd.DataFrame()
res['true'] = true
res['pred'] = pred
# Per-sample Damerau-Levenshtein edit distance (raw count, not similarity).
res['score'] = [damerauLevenshtein(true[i], pred[i], similarity=False) for i in range(len(pred))]

# Mean edit distance over the test set: lower is better.
score = np.mean(res['score'].values)
print(score)

Example #8
0
def StringDistance(s1, s2):
    """Return the Damerau-Levenshtein edit distance between *s1* and *s2*.

    ``similarity=False`` makes the library return the raw edit count rather
    than a normalized [0, 1] similarity score.
    """
    # Removed unreachable leftover code after the return statement, which
    # referenced undefined names (`d`, `lenstr1`, `lenstr2`).
    return damerauLevenshtein(s1, s2, similarity=False)