예제 #1
0
    def edit_distance(self):
        """Average token-level edit distance between ``self.gt`` and ``self.gen``.

        For every key of ``self.gt`` the first entry of ``self.gt[key]`` and of
        ``self.gen[key]`` is split on whitespace and the two token lists are
        compared with ``EditDistance.compute``.  The operations reported by
        ``EditDistance.operations`` are tallied per operation code
        ('m', 'i', 'd', 'r' -- presumably match/insert/delete/replace; confirm
        against EditDistance).

        Returns:
            A 4-tuple ``(mean_dist, mean_norm_dist, op_count, op_count_norm)``:
            the mean raw distance, the mean distance divided by the longer
            sequence length, and the per-example mean of the raw and
            length-normalised operation counts (both dicts keyed by op code).
        """
        calculator = EditDistance()

        op_count = {'m': 0, 'i': 0, 'd': 0, 'r': 0}
        op_count_norm = {'m': 0, 'i': 0, 'd': 0, 'r': 0}
        dist_sum = 0
        norm_dist_sum = 0

        # Never divide by zero, even when there are no examples at all.
        divisor = float(max(len(self.gt), 1))

        for key in self.gt:
            reference = self.gt[key][0].split()
            candidate = self.gen[key][0].split()

            # Normalise by the longer sequence, but by at least 1 so that two
            # empty sequences do not divide by zero.
            longest = max(float(max(len(reference), len(candidate))), 1.0)

            distance = calculator.compute(reference, candidate)
            dist_sum += distance
            norm_dist_sum += distance / longest

            for op in calculator.operations():
                op_count[op] += 1
                op_count_norm[op] += 1.0 / longest

        # Turn the accumulated tallies into per-example means.
        for tally in (op_count, op_count_norm):
            for op in tally:
                tally[op] /= divisor

        return dist_sum / divisor, norm_dist_sum / divisor, op_count, op_count_norm
예제 #2
0
    def exec_second(self, parole):
        """Time edit-distance and n-gram lookups for each word in ``parole``.

        For every word, the dictionary file is scanned and entries whose
        ``op_sequence`` cost is below ``self.sogliaCosto`` are collected; then
        the ``<numberOfGrams>_grams.txt`` index is scanned and entries whose
        Jaccard coefficient exceeds ``self.sogliaJaccard`` are collected.

        Returns:
            ``[tempi, n_vicine_trovate]`` where each list holds one
            ``[edit_value, ngram_value]`` pair per input word: elapsed times
            in the first list, number of matches in the second.
        """
        distance_tool = EditDistance()
        gram_tool = Ngram()

        timings = []
        match_counts = []

        for parola in parole:
            with open('60000_parole_italiane.txt', 'r') as dictionary:
                # --- edit distance pass (only the scan itself is timed) ---
                edit_hits = []
                started = timer()
                for raw in dictionary:
                    candidate = raw.rstrip()
                    _, ops = distance_tool.edit_distance(parola, candidate)
                    cost = distance_tool.op_sequence(
                        ops, len(parola) - 1, len(candidate) - 1, [])
                    if cost < self.sogliaCosto:
                        edit_hits.append((candidate, cost))
                edit_elapsed = timer() - started

                # --- n-gram pass (query n-grams are built outside the timer) ---
                gram_hits = []
                query_grams = gram_tool.ngram(parola, self.numberOfGrams)
                with open("%s_grams.txt" % self.numberOfGrams, 'r') as gram_index:
                    started = timer()
                    for raw in gram_index:
                        fields = raw.split(' -> ')
                        candidate, grams = fields[0], fields[1]
                        score = gram_tool.jaccard(query_grams, grams)
                        if score > self.sogliaJaccard:
                            gram_hits.append((candidate, score))
                    gram_elapsed = timer() - started

                timings.append([edit_elapsed, gram_elapsed])
                match_counts.append([len(edit_hits), len(gram_hits)])

        return [timings, match_counts]
def main():
    """Print the edit distance for one fixed pair of strings."""
    # The generated strings are discarded: the fixed pair below is what is
    # actually compared.  The calls are kept so that whatever
    # gen_random_string() does when invoked still happens.
    gen_random_string()
    gen_random_string()
    string_a, string_b = "bread", "really"

    print("String 1: {}\tString 2:{}".format(string_a, string_b))
    ed = EditDistance(string_a, string_b).get_edit_dist()
    print("Edit Dist: {}".format(ed))
 def search_candidates(self, input_string):
     """Record substrings of ``input_string`` that match every ancestor pattern.

     For each occurrence ``index`` of each leaf in ``self.leaves``, walk up
     the chain of ``parent`` links.  At every ancestor, a window of
     ``input_string`` is compared with ``parent.pattern`` using
     ``EditDistance.compute``; the ancestor is accepted when the distance is
     at most ``parent.error``.  If every ancestor up to the root is accepted,
     the last window is stored in ``self.indicesDict`` keyed by its start.

     Args:
         input_string: the text that ``leaf.indices`` positions refer to.

     Returns:
         None; results are written to ``self.indicesDict``.
     """
     editDistance = EditDistance()
     for leaf in self.leaves:
         for index in leaf.indices:
             i = leaf.start
             parent = leaf.parent
             cand = True
             p1 = -1
             # NOTE(review): if leaf.parent is None the loop below never runs,
             # so p1 stays -1 and p2 keeps its value from an earlier iteration
             # (or is unbound on the very first one) when used after the loop.
             while cand and parent != None:
                 # Project the parent's [start, end] span onto input_string,
                 # using the leaf occurrence at `index` as the anchor.
                 p1 = index - (i - parent.start)
                 p2 = index + (parent.end - i) + 1
                 # Clamp the window to the bounds of input_string.
                 if p1 < 0:
                     p1 = 0
                 if p2 > len(input_string):
                     p2 = len(input_string)
                 distance = editDistance.compute(input_string[p1:p2],
                                                 parent.pattern)
                 if distance <= parent.error:
                     # Window matches within tolerance: move one level up.
                     parent = parent.parent
                 else:
                     # Retry with the left edge moved out by parent.error,
                     # then pulled back in one position per failed attempt.
                     # NOTE(review): p1 is not re-clamped here, so it can
                     # become negative and the slice would then count from
                     # the end of input_string -- confirm this is intended.
                     p1 -= parent.error
                     counter = parent.error
                     withp1 = False
                     while counter != 0 and not withp1:
                         distance = editDistance.compute(
                             input_string[p1:p2], parent.pattern)
                         if distance <= parent.error:
                             parent = parent.parent
                             withp1 = True
                         else:
                             counter -= 1
                             p1 += 1
                     if not withp1:
                         # Left-side widening failed (p1 is back at its
                         # pre-retry value); now widen the right edge by
                         # parent.error and shrink it one step per attempt.
                         p2 += parent.error
                         counter = parent.error
                         withp2 = False
                         while counter != 0 and not withp2:
                             distance = editDistance.compute(
                                 input_string[p1:p2], parent.pattern)
                             if distance <= parent.error:
                                 parent = parent.parent
                                 withp2 = True
                             else:
                                 counter -= 1
                                 p2 -= 1
                         if not withp2:
                             # No window matched this ancestor: give up on
                             # this occurrence.
                             cand = False
             if cand:
                 # All ancestors matched; keep the last window examined.
                 self.indicesDict[p1] = input_string[p1:p2]
예제 #5
0
    def exec_fifth(self):

        e = EditDistance()
        a = Ngram()

        originale = raw_input("**** Inserisci parola --> ")
        parola = self.storpia(originale)
        print '**** Parola storpiata -->', parola

        # edit distance
        print '----- EDIT DISTANCE'
        # costi: 1, 2, 3, 4, 5
        for c in range(1, 6):
            with open('60000_parole_italiane.txt', 'r') as f:
                e_results = []
                for line in f:
                    p = line.rstrip()
                    _, op = e.edit_distance(parola, p)
                    costo = e.op_sequence(op, len(parola) - 1, len(p) - 1, [])
                    if costo < c:
                        e_results.append((p, costo))
                if any(originale in a for a in e_results):
                    w = 'parola originale trovata!'
                else:
                    w = 'parola originale non trovata!'
                print w, '(soglia costo %s, %s risultati)' % (
                    c, len(e_results)), '-->', sorted(e_results, key=get(1))

        # ngram
        print '----- NGRAM'
        b = a.ngram(parola, self.numberOfGrams)
        # coefficienti: 0.5, 0.6, 0.7, 0.8, 0.9
        for j in np.arange(0.5, 1.0, 0.1):
            with open("%s_grams.txt" % self.numberOfGrams, 'r') as f:
                g_results = []
                for line in f:
                    s = line.split(' -> ')
                    p, g = s[0], s[1]
                    f = a.jaccard(b, g)
                    if f > j:
                        g_results.append((p, f))
                if any(originale in a for a in g_results):
                    w = 'parola originale trovata!'
                else:
                    w = 'parola originale non trovata!'
                print w, '(jaccard %s, %s risultati)' % (
                    j, len(g_results)), '-->', sorted(g_results,
                                                      key=get(1),
                                                      reverse=True)
예제 #6
0
    def exec_third(self):
        """Count matches for a user-supplied word across a range of thresholds.

        Reads a word from the user, then counts dictionary entries whose
        ``op_sequence`` cost is below each threshold in 1..5, and n-gram index
        entries whose Jaccard coefficient exceeds each threshold in
        ``np.arange(0.5, 1.0, 0.1)``.

        Returns:
            ``[costi, coefficienti, risultati_edit, risultati_gram]``: the
            cost thresholds, the Jaccard thresholds, and the number of matches
            found for each of them respectively.
        """
        distance_tool = EditDistance()
        gram_tool = Ngram()

        costi, coefficienti = [], []
        risultati_edit, risultati_gram = [], []

        parola = raw_input("**** Inserisci parola --> ")

        # Edit distance: one full dictionary scan per cost threshold.
        for soglia in range(1, 6):
            costi.append(soglia)
            with open('60000_parole_italiane.txt', 'r') as dictionary:
                hits = 0
                for raw in dictionary:
                    candidate = raw.rstrip()
                    _, ops = distance_tool.edit_distance(parola, candidate)
                    cost = distance_tool.op_sequence(
                        ops, len(parola) - 1, len(candidate) - 1, [])
                    if cost < soglia:
                        hits += 1
                risultati_edit.append(hits)

        # N-grams: one full index scan per Jaccard threshold.
        query_grams = gram_tool.ngram(parola, self.numberOfGrams)
        for soglia in np.arange(0.5, 1.0, 0.1):
            coefficienti.append(soglia)
            with open("%s_grams.txt" % self.numberOfGrams, 'r') as gram_index:
                hits = 0
                for raw in gram_index:
                    fields = raw.split(' -> ')
                    grams = fields[1]
                    if gram_tool.jaccard(query_grams, grams) > soglia:
                        hits += 1
                risultati_gram.append(hits)

        return [costi, coefficienti, risultati_edit, risultati_gram]
예제 #7
0
    def exec_first(self):
        with open('60000_parole_italiane.txt', 'r') as f:

            e = EditDistance()
            a = Ngram()

            lines = f.readlines()
            rand = random.randint(0, len(lines))
            word = lines[rand].rstrip()
            print 'random word -->', word

            # test edit distance
            start = timer()
            for line in lines:
                p = line.rstrip()
                if p == word:
                    break
                _, op = e.edit_distance(word, p)
                _ = e.op_sequence(op, len(word) - 1, len(p) - 1, [])
            end = timer()
            time_edit = end - start
            # print 'tempo trascorso edit distance -->', time_edit

            # test ngrams
            b = a.ngram(word, self.numberOfGrams)
            with open("%s_grams.txt" % self.numberOfGrams, 'r') as r:
                start = timer()
                for line in r:
                    s = line.split(' -> ')
                    p, g = s[0], s[1]
                    if p == word:
                        break
                    _ = a.jaccard(b, g)
                end = timer()
            time_ngram = end - start
            # print 'tempo trascorso ngrams -->', time_ngram

            return [word, time_edit, time_ngram]
예제 #8
0
class Candidate:
    """Collects replacement candidates for a word from several generators."""

    def __init__(self, dictionary):
        """Build the generators; ``dictionary`` is handed to EditDistance and Teencode."""
        self.edit_distance = EditDistance(dictionary)
        self.telex = Telex()
        self.teencode = Teencode(dictionary)

    def generate_candidate(self, word):
        """Return the candidate list for ``word``.

        The list is made of, in order: the ``uni2telex`` form of the word
        (only when it differs from ``word``), the results of
        ``candidates_e1``, the results of ``candidate_acronym`` and the
        results of ``candidate_teen``.
        """
        from_edits = list(self.edit_distance.candidates_e1(word))
        telex_form = self.telex.uni2telex(word)
        from_acronyms = self.teencode.candidate_acronym(word)
        from_teencode = self.teencode.candidate_teen(word)

        # The telex conversion goes first, but only if it changed the word.
        merged = [] if telex_form == word else [telex_form]
        merged.extend(from_edits)
        merged.extend(from_acronyms)
        merged.extend(from_teencode)
        return merged
예제 #9
0
 def __init__(self, dictionary):
     """Create the helper objects; ``dictionary`` is passed to EditDistance and Teencode."""
     self.edit_distance = EditDistance(dictionary)
     # Telex is constructed without arguments.
     self.telex = Telex()
     self.teencode = Teencode(dictionary)
예제 #10
0
 def test_case_1(self):
     """'abcd' vs 'abdd' is expected to need exactly one change."""
     calculator = EditDistance()
     changes = calculator.calculateNumberOfChanges('abcd', 'abdd')
     self.assertEqual(changes, 1)
예제 #11
0
 def test_case_6(self):
     """Two long, partly overlapping words are expected to need seven changes."""
     calculator = EditDistance()
     changes = calculator.calculateNumberOfChanges(
         'dinitrophenylhydrazine', 'benzalphenylhydrazone')
     self.assertEqual(changes, 7)
예제 #12
0
 def test_case_5(self):
     """'intention' vs 'execution' is expected to need five changes."""
     calculator = EditDistance()
     changes = calculator.calculateNumberOfChanges('intention', 'execution')
     self.assertEqual(changes, 5)