Example #1
def get_similarities(Features, url_input):
    """
    similarity metrics include: Levenshtein, jaro, damerau levenshtein, normalized_damerau_levenshtein,
    and hamming distance
    :param Features: input dictionary to add things to
    :param url_input
    :return: Features: after adding all similarity metrics
    """
    for n in itertools.chain(product_domain_names, brand_names):
        Features['url_levenshtein_distance_' + n] = Levenshtein.distance(
            url_input, n)
        Features['fqdn_levenshtein_distance_' + n] = Levenshtein.distance(
            Features['fqdn'], n)
        Features['url_jaro_winkler_distance_' + n] = jw.get_jaro_distance(
            url_input, n)
        Features['fqdn_jaro_winkler_distance_' + n] = jw.get_jaro_distance(
            Features['fqdn'], n)
        Features['url_damerau_levenshtein_distance_' +
                 n] = dl.damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_distance_' +
                 n] = dl.damerau_levenshtein_distance(Features['fqdn'], n)
        Features['url_damerau_levenshtein_normalized_distance_' +
                 n] = dl.normalized_damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_normalized_distance_' +
                 n] = dl.normalized_damerau_levenshtein_distance(
                     Features['fqdn'], n)
        if len(n) == len(url_input):
            Features['url_length_equals_' + n] = 1
            Features['url_hamming_distance_' + n] = hamming(url_input, n)
            Features['fqdn_hamming_distance_' + n] = hamming(
                Features['fqdn'], n)
        else:
            Features['url_length_equals_' + n] = 0
    return Features
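
A minimal, self-contained sketch (not part of the original project) of the kind of per-brand features get_similarities() builds, using only pyxdameraulevenshtein; the Levenshtein, jw, dl, and hamming helpers above are assumed to come from other libraries imported elsewhere in that file, and the URL/brand values here are made up.

from pyxdameraulevenshtein import (
    damerau_levenshtein_distance,
    normalized_damerau_levenshtein_distance,
)

url_input = "paypa1-login.example.com"  # hypothetical lookalike URL
brand = "paypal"                        # hypothetical brand name
features = {
    "url_damerau_levenshtein_distance_" + brand:
        damerau_levenshtein_distance(url_input, brand),
    "url_damerau_levenshtein_normalized_distance_" + brand:
        normalized_damerau_levenshtein_distance(url_input, brand),
}
print(features)
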
Example #2
    def spell_score(misspelling, candidates, method=1):
        """
        Calculates the edit distance between a misspelling and each candidate according to the chosen method
        :param misspelling: misspelling
        :param candidates: list of candidates
        :param method: chosen method from [1, 2, 3, 4]
        :return: list of edit distances between misspelling and each candidate
        """
        lexical_scores = [damerau_levenshtein_distance(misspelling, candidate)
                          for candidate in candidates]

        if method == 1:
            return lexical_scores
        else:
            phonetic_scores = [damerau_levenshtein_distance(dm(misspelling)[0], dm(candidate)[0])
                               for candidate in candidates]

        if method == 2:
            return [phonetic_score if phonetic_score != 0 else 1 for phonetic_score in phonetic_scores]
        elif method == 3:
            return [0.5 * (a + b) for a, b in zip(lexical_scores, phonetic_scores)]
        elif method == 4:
            return [(2 * a + b) ** 2 for a, b in zip(lexical_scores, phonetic_scores)]
        else:
            raise ValueError('Method must be element from [1, 2, 3, 4]')
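
A hedged usage sketch for spell_score(): it assumes dm is metaphone.doublemetaphone (which the phonetic branch suggests) and that damerau_levenshtein_distance comes from pyxdameraulevenshtein; the misspelling and candidates are made up.

from pyxdameraulevenshtein import damerau_levenshtein_distance
from metaphone import doublemetaphone as dm  # assumed source of `dm`

misspelling = "recieve"
candidates = ["receive", "relieve", "recipe"]

lexical = [damerau_levenshtein_distance(misspelling, c) for c in candidates]
phonetic = [damerau_levenshtein_distance(dm(misspelling)[0], dm(c)[0])
            for c in candidates]
# method 3 above: the average of the lexical and phonetic distances
combined = [0.5 * (a + b) for a, b in zip(lexical, phonetic)]
print(lexical, phonetic, combined)
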
Example #3
def riclassifica_per_tag(p, tags, classi_da_riclassificare):
    parola = p[0]
    tags.append(parola)
    res = {}
    if len(classi_da_riclassificare) > 0:
        print 'riclassifica per tag solo per alcune classi'
        for classe in classi_da_riclassificare:  # only the classes with a similar or equal distance
            dam = []
            for campo_misurato in dictionary[classe]:
                for tag in tags:
                    dam.append(
                        damerau_levenshtein_distance(tag, campo_misurato))

            res[classe] = min(dam)
            # print 'dam', classe, '(', min(dam), ')-->', dam
        return res
    else:
        print 'riclassifica per tag per tutte le classi'
        for classe in dictionary.keys():  # all the classes
            dam = [[(damerau_levenshtein_distance(tag, campo_misurato))
                    for tag in tags] for campo_misurato in dictionary[classe]]
            res[classe] = min(dam)
            # print 'dam', classe, '----->', dam  # TODO !?!?!??!

        return res
Example #4
    def noisychannel_ranking(self, detection_list, candidates_list):
        """
        An approximate implementation of the ranking method described in (Lai et al. 2015)
        :param detection_list: list of misspellings
        :param candidates_list: list of candidate list per misspelling
        :param frequency_dict: corpus frequencies from training data
        :param k_best: if True, return k highest ranked candidates instead of single one
        :return: list with corrections or k-best corrections
        """

        correction_list = []
        confidences = []

        for misspelling, candidates in zip(detection_list, candidates_list):
            score_list = []
            for candidate in candidates:
                orthographic_edit_distance = damerau_levenshtein_distance(misspelling, candidate)
                phonetic_edit_distance = damerau_levenshtein_distance(dm(misspelling)[0], dm(candidate)[0])

                spell_score = (2 * orthographic_edit_distance + phonetic_edit_distance) ** 2  # P(m|c)

                try:
                    frequency = self.frequency_dict[candidate]
                except KeyError:
                    frequency = 1

                frequency_score = 1 / (1 + log(frequency))  # P(c)

                score = spell_score * frequency_score  # P(c|m) = P(m|c)*P(c)
                score_list.append(score)

            score_list = np.array(score_list)

            if len(score_list) > 1:
                sorted_distances = [score_list[i] for i in np.argsort(score_list)]
                top1 = sorted_distances[0]
                top2 = sorted_distances[1]
                confidence = abs(top1 - top2) / top1
                confidences.append(confidence)
            else:
                confidences.append(0)

            if self.k == 1:
                try:
                    correction_list.append(candidates[np.argmin(score_list)])
                except ValueError:
                    correction_list.append('')
            elif self.k > 1:
                correction_list.append([candidates[i] for i in np.argsort(score_list)[:self.k]])
            else:
                raise ValueError('k must be positive natural number')

        self.confidences = confidences

        return correction_list
Example #5
def merge(ci, cj):
    cluster1 = cluster_list[ci]
    cluster2 = cluster_list[cj]

    # get the LCS between the patterns and receive the difference ids
    pi = cluster1.p  # pattern of cluster1
    pj = cluster2.p

    # pi_ids has ids from events in (pi - p)
    (p, pi_ids, pj_ids) = lcs(pi, pj)  # get the common pattern
    p_ = copy.deepcopy(p)

    # Candidate Events Ec = (Pi - P) U (Pj - P)
    cE = candidateEvents(pi, pj, pi_ids, pj_ids)

    # Sort Ec by frequency in desc order
    e, f = frecuencySort(cE)  # e: event id, f: frequency

    L = -1

    ## Gi U Gj
    #G_u = np.concatenate(cluster1.g, cluster2.g)
    gi = cluster1.g  # event sequences ids forming cluster1
    gj = cluster2.g

    G_u = gi + gj

    n_edits_gi = sum([
        damerau_levenshtein_distance(list(Secuences[e_]), list(pi))
        for e_ in gi
    ])
    n_edits_gj = sum([
        damerau_levenshtein_distance(list(Secuences[e_]), list(pj))
        for e_ in gj
    ])

    # Pattern buildup phase
    for e in cE:
        p = add(p, e)
        n_edits_p = sum([
            damerau_levenshtein_distance(list(Secuences[e_]), list(p))
            for e_ in G_u
        ])
        L_ = len(pi) + len(pj) - len(p) + alpha * (n_edits_gi) + alpha * (
            n_edits_gj) - alpha * (n_edits_p) + lamda

        if L_ < 0 or L_ < L:
            break
        else:
            L = L_
            p_ = p
    c_ = Cluster(p_, G_u)
    return (L, c_)
Example #6
def score(network, dataset):
    score = 0.
    for x, y in dataset:
        predict = unconvert(network.activate(x))
        score += damerau_levenshtein_distance(predict, unconvert(y))
    score /= float(len(dataset))
    return score
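
The score is simply the mean Damerau-Levenshtein distance between decoded predictions and targets; a standalone sketch with plain string pairs (the network and unconvert() are out of scope here, the pairs are made up):

from pyxdameraulevenshtein import damerau_levenshtein_distance

pairs = [("smtih", "smith"), ("apple", "apple"), ("saturday", "sunday")]
mean_distance = sum(damerau_levenshtein_distance(p, y) for p, y in pairs) / float(len(pairs))
print(mean_distance)  # (1 + 0 + 3) / 3 ≈ 1.33
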
Example #7
def is_same(u1, u2):
    """Determine whether Djk ≤ 0.2 × Min.[|Tj|,|Tk|]"""
    D_jk = damerau_levenshtein_distance(u1, u2)
    t_j = len(u1)
    t_k = len(u2)
    min_ = min(t_j, t_k)
    return D_jk < 0.2 * min_
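
A quick check of the 20% rule with made-up ten-character traces; note the code uses a strict <, while the docstring says ≤.

from pyxdameraulevenshtein import damerau_levenshtein_distance

u1, u2 = "abcdefghij", "abcdefghji"          # one transposition
d = damerau_levenshtein_distance(u1, u2)     # -> 1
print(d < 0.2 * min(len(u1), len(u2)))       # 1 < 2.0 -> True, treated as "same"
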
Example #8
def score(network, dataset):
	score = 0.
	for x, y in dataset:
		predict = unconvert(network.activate(x))
		score += damerau_levenshtein_distance(predict,unconvert(y))
	score /= float(len(dataset))
	return score
Example #9
    def scan(self, filePath):
        '''
        Read the content of the file at filePath, extract the comments and preprocess them.
        Find the Damerau-Levenshtein distance between the preprocessed file content
        and the license text.

        :param filePath: Path of the file to scan
        :return: Returns the short name of the license with the smallest Damerau-Levenshtein distance
        '''
        processedData = super().loadFile(filePath)

        temp = exactMatcher(processedData, self.licenseList)
        if temp == -1:
            # Classify the license with minimum distance with scanned file
            globalDistance = sys.maxsize
            result = 0
            for idx in range(len(self.licenseList)):
                distance = damerau_levenshtein_distance(
                    processedData.split(" "),
                    self.licenseList.iloc[idx]['processed_text'].split(" "))
                if self.verbose > 0:
                    print(
                        str(idx) + "  " +
                        self.licenseList.iloc[idx]['shortname'] + "  " +
                        str(distance))
                if distance < globalDistance:
                    globalDistance = distance
                    result = idx

            return str(self.licenseList.iloc[result]['shortname'])
        else:
            return temp[0]
Example #10
def correct(string,typo_dictionary):
    corrections_dict = {}
    min_correct_len = float('inf')
    queue = sorted(list(set([string] + generate_deletes(string, threshold_levensthein))), key=len, reverse=True)
    while len(queue) > 0:
        q_item = queue.pop(0)
        if ((len(corrections_dict) > 0) and ((len(string) - len(q_item)) > min_correct_len)):
            break
        if (q_item in typo_dictionary) and (q_item not in corrections_dict):
            if (typo_dictionary[q_item][1] > 0):
                corrections_dict[q_item] = (typo_dictionary[q_item][1], len(string) - len(q_item))
                if len(string) == len(q_item):
                    break

                elif (len(string) - len(q_item)) < min_correct_len:
                    min_correct_len = len(string) - len(q_item)
            for sc_item in typo_dictionary[q_item][0]:
                if (sc_item not in corrections_dict):
                    if len(q_item) == len(string):
                        item_dist = len(sc_item) - len(q_item)

                    item_dist = damerau_levenshtein_distance(sc_item, string)

                    if item_dist > min_correct_len:
                        pass
                    elif item_dist <= threshold_levensthein:
                        corrections_dict[sc_item] = (typo_dictionary[sc_item][1], item_dist)
                        if item_dist < min_correct_len:
                            min_correct_len = item_dist

                    corrections_dict = {k: v for k, v in corrections_dict.items() if v[1] <= min_correct_len}

    return corrections_dict
Example #11
def calcola_precisione_classe(lista_cluster):
    totale_campi = len(lista_cluster)
    classi_attribuite = {}
    for elem in lista_cluster:
        # Add the true class of this word to the actual classes
        count_metriche[classe_reale(elem)]['effettive'] += 1  # TODO does the actual class refer to the cluster or to the field ?!!
        res = {}
        for classe in dictionary.keys():
            dam = (damerau_levenshtein_distance(elem, classe))
            res[classe] = dam

        classe_attribuita = min(res, key=res.get)
        count_metriche[classe_attribuita]['classificate'] += 1

        if classe_reale(elem) == classe_attribuita:
            count_metriche[classe_reale(elem)]['classificate_esatte'] += 1

        if classi_attribuite.get(classe_attribuita) is None:
            classi_attribuite[classe_attribuita] = 1
        else:
            classi_attribuite[classe_attribuita] += 1

    classe_piu_frequente = max(classi_attribuite, key=classi_attribuite.get)
    # 100 : totale_campi = x : classi_attribuite[classe_piu_frequente]
    percentuale = (100 * classi_attribuite[classe_piu_frequente])/totale_campi

    return classe_piu_frequente, percentuale
Example #12
def calcola_precisione_dizionario(lista_cluster):
    totale_campi = len(lista_cluster)
    classi_attribuite = {}
    # riempi_dizionario('coppieCampoClasse.csv')
    riempi_dizionario('training_set_1.csv')
    for elem in lista_cluster:
        res = {}
        for classe in dictionary.keys():
            dam = [(damerau_levenshtein_distance(elem, campo_misurato))
                   for campo_misurato in dictionary[classe]]
            res[classe] = min(dam)

        classe_attribuita = min(res, key=res.get)

        if classi_attribuite.get(classe_attribuita) is None:
            classi_attribuite[classe_attribuita] = 1
        else:
            classi_attribuite[classe_attribuita] += 1

    classe_piu_frequente = max(classi_attribuite, key=classi_attribuite.get)
    # 100 : totale_campi = x : classi_attribuite[classe_piu_frequente]
    percentuale = (100 *
                   classi_attribuite[classe_piu_frequente]) / totale_campi

    return classe_piu_frequente, percentuale
Example #13
    def rectify(self, word, prev_word):
        """
            Предсказания спеллера
        """

        # choose the number of candidates based on the query length
        self.n_candidates = 350 if len(word) <= 4 else 250 if len(word) <= 7 else self.n_candidates

        # compute the Levenshtein distance for each term
        candidates = sorted(self.words_list.find(word, 7))

        # look for a "good" correction among the top candidates
        # use the modified Damerau-Levenshtein distance (with transpositions)
        # and also look for the word with the fewest new letters
        suggests = list()
        for _, sugg in candidates[:self.n_candidates]:
            dist = damerau_levenshtein_distance(sugg, word)
            context_list = self.voc_vectorizer.transform([f"{prev_word} {sugg}"]).tocoo().col.tolist()
            if dist <= 5:
                suggs = [(sugg, dist, 0.0)]
                if context_list:
                    suggs = [(sugg, dist, self.voc.get(context, 0.0)) for context in context_list]

                suggests.extend(suggs)

        suggests = sorted(suggests, key=lambda tup: tup[1])

        minimal_distance = min(suggest[1] for suggest in suggests)
        candidates = sorted(
            [(suggest[0], suggest[2]) for suggest in suggests
             if suggest[1] == minimal_distance and set(suggest[0]) == set(word)],
            key=lambda tup: -tup[1])

        return candidates[0][0] if candidates and candidates[0][1] > 0 else suggests[0][0]
Example #14
def calcola_precisione_classe(lista_cluster):
    totale_campi = len(lista_cluster)
    classi_attribuite = {}
    for elem in lista_cluster:
        # Add the true class of this word to the actual classes
        count_metriche[classe_reale(
            elem
        )]['effettive'] += 1  # TODO does the actual class refer to the cluster or to the field ?!!
        res = {}
        for classe in dictionary.keys():
            dam = (damerau_levenshtein_distance(elem, classe))
            res[classe] = dam

        classe_attribuita = min(res, key=res.get)
        count_metriche[classe_attribuita]['classificate'] += 1

        if classe_reale(elem) == classe_attribuita:
            count_metriche[classe_reale(elem)]['classificate_esatte'] += 1

        if classi_attribuite.get(classe_attribuita) is None:
            classi_attribuite[classe_attribuita] = 1
        else:
            classi_attribuite[classe_attribuita] += 1

    classe_piu_frequente = max(classi_attribuite, key=classi_attribuite.get)
    # 100 : totale_campi = x : classi_attribuite[classe_piu_frequente]
    percentuale = (100 *
                   classi_attribuite[classe_piu_frequente]) / totale_campi

    return classe_piu_frequente, percentuale
Example #15
def makeNet(learning_rate):
	ds = SupervisedDataSet(20, 20)
	with open('data/misspellingssmall.csv', 'rbU') as f:
		reader = csv.reader(f)
		for row in reader:
			ds.addSample(convert(row[0]),convert(row[1]))

	testds, trainds = ds.splitWithProportion(0.2)  # uncommented: `testds` is used for scoring below

	net = buildNetwork(20, 20, 20)
	#trainer = BackpropTrainer(net, dataset=trainds, learningrate=learning_rate)
	trainer = BackpropTrainer(net, dataset=ds, learningrate=learning_rate)
	#trainer.train()
	#trainer.trainEpochs(5)
	trainer.trainUntilConvergence()

	score = 0
	for x, y in testds:
		predict = unconvert(net.activate(x))
		score += damerau_levenshtein_distance(predict,unconvert(y))

	global lastNet
	lastNet = net

	global netNum
	netNum += 1

	print "Network " + str(netNum) + " done with score " + str(score)
	
	return score
Example #16
File: test_pyxdl.py  Project: dpolad/ugpHT
 def test_damerau_levenshtein_distance(self):
     assert damerau_levenshtein_distance('smtih', 'smith') == 1
     assert damerau_levenshtein_distance('snapple', 'apple') == 2
     assert damerau_levenshtein_distance('testing', 'testtn') == 2
     assert damerau_levenshtein_distance('saturday', 'sunday') == 3
     assert damerau_levenshtein_distance('Saturday', 'saturday') == 1
     assert damerau_levenshtein_distance('orange', 'pumpkin') == 7
     assert damerau_levenshtein_distance('gifts', 'profit') == 5
     assert damerau_levenshtein_distance('Sjöstedt', 'Sjostedt') == 1
Example #17
    def candidate_search(self, We, threshold):
        candidate = {}
        for Wc in self.dictionary:
            dist = damerau_levenshtein_distance(Wc, We)
            if dist <= threshold:
                candidate[Wc] = self.dictionary[Wc]

        return (candidate)
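
A standalone sketch of the same candidate search, with a hypothetical word-frequency dict standing in for self.dictionary:

from pyxdameraulevenshtein import damerau_levenshtein_distance

dictionary = {"receive": 120, "relieve": 40, "recipe": 75}  # hypothetical word -> frequency

def candidate_search(We, threshold):
    return {Wc: freq for Wc, freq in dictionary.items()
            if damerau_levenshtein_distance(Wc, We) <= threshold}

print(candidate_search("recieve", 1))  # 'receive' and 'relieve' are each one edit away
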
Example #18
def wordsim2(word1, word2):
    '''return a similarity score for between the two words
    attempt to use a behavioral model of edit distances
    '''
    dist = damerau_levenshtein_distance(str(word1), str(word2))
    if dist > 3:
        return 0
    edit = editops(str(word1), str(word2))
    return match_model(edit)
Example #19
    def __call__(cls, *args, **kwargs):
        """ Checks for misprints in argument names """
        obj = super(UnitRegistry, cls).__call__(*args, **kwargs)
        if hasattr(cls, "DISABLE_KWARGS_CHECK") or not UnitRegistry.enabled:
            return obj

        def warning(*largs):
            obj.warning(*largs)
            if root.common.trace.misprints:
                obj.warning(
                    "Stack trace:\n%s", "".join(
                        format_list(
                            extract_stack(
                                inspect.currentframe().f_back.f_back))))

        # Build the matrix of differences
        matrix = {}
        matched = set()
        for given_kwarg in kwargs:
            for kwattr in cls.KWATTRS:
                if (kwattr, given_kwarg) in matrix:
                    continue
                matrix[(given_kwarg, kwattr)] = d = \
                    damerau_levenshtein_distance(given_kwarg, kwattr)
                if d == 0:
                    # perfect match, stop further comparisons
                    matched.add(given_kwarg)
                    break
        if len(matched) < len(kwargs):
            # Find replacement candidates with distance = 1
            ignored_kwargs = set()
            for given_kwarg in set(kwargs).difference(matched):
                candidates = []
                for kwattr in cls.KWATTRS:
                    d = matrix.get((given_kwarg, kwattr))
                    if d == 1:
                        candidates.append(kwattr)
                if len(candidates) == 0:
                    ignored_kwargs.add(given_kwarg)
                else:
                    warning(
                        "Creating %s: potential misprint in keyword argument "
                        "name: expected %s - got %s", obj,
                        " or ".join(candidates), given_kwarg)
            try:
                __IPYTHON__  # pylint: disable=E0602
                from IPython.terminal.interactiveshell import \
                    InteractiveShell
                ignored_kwargs -= set(InteractiveShell.instance().user_ns)
            except NameError:
                pass
            if len(ignored_kwargs) > 0:
                warning(
                    "Creating %s: ignored the following keyword arguments: %s",
                    obj, ", ".join(sorted(ignored_kwargs)))
        return obj
Example #20
    def noisychannel_ranking(self, candidates_list):
        """
        An approximate implementation of the ranking method described in Lai et al. (2015), 'Automated Misspelling Detection and Correction in Clinical Free-Text Records'
        :param candidates_list: list of candidate list per misspelling
        :return: list with corrections or k-best corrections
        """
        correction_list = []
        confidences = []

        for misspelling, candidates in zip(self.misspellings, candidates_list):

            if not candidates:
                correction_list.append('')
                continue

            score_list = []
            for candidate in candidates:
                orthographic_edit_distance = damerau_levenshtein_distance(
                    misspelling, candidate)
                phonetic_edit_distance = damerau_levenshtein_distance(
                    dm(misspelling)[0],
                    dm(candidate)[0])
                spell_score = (2 * orthographic_edit_distance +
                               phonetic_edit_distance)**2  # P(m|c)
                try:
                    frequency = self.frequency_dict[candidate]
                except KeyError:
                    frequency = 1
                frequency_score = 1 / (1 + log(frequency))  # P(c)
                score = spell_score * frequency_score  # P(c|m) = P(m|c)*P(c)
                score_list.append(score)

            score_list = np.array(score_list)
            if self.k == 1:
                try:
                    correction_list.append(candidates[np.argmin(score_list)])
                except ValueError:
                    correction_list.append('')
            else:
                correction_list.append(
                    [candidates[i] for i in np.argsort(score_list)[:self.k]])

        return correction_list
Example #21
def best(string,typo_dictionary):
    for word in attention_words:
        if damerau_levenshtein_distance(word, string) <= threshold_levensthein:
            return word
    try:
        as_list = correct(string,typo_dictionary).items()
        outlist = sorted(as_list, key=lambda item: (item[1][1], -item[1][0]))
        return outlist[0][0]
    except:
        return string
Example #22
File: unit_registry.py  Project: 2php/veles
    def __call__(cls, *args, **kwargs):
        """ Checks for misprints in argument names """
        obj = super(UnitRegistry, cls).__call__(*args, **kwargs)
        if hasattr(cls, "DISABLE_KWARGS_CHECK") or not UnitRegistry.enabled:
            return obj

        def warning(*largs):
            obj.warning(*largs)
            if root.common.trace.misprints:
                obj.warning("Stack trace:\n%s",
                            "".join(format_list(extract_stack(
                                inspect.currentframe().f_back.f_back))))

        # Build the matrix of differences
        matrix = {}
        matched = set()
        for given_kwarg in kwargs:
            for kwattr in cls.KWATTRS:
                if (kwattr, given_kwarg) in matrix:
                    continue
                matrix[(given_kwarg, kwattr)] = d = \
                    damerau_levenshtein_distance(given_kwarg, kwattr)
                if d == 0:
                    # perfect match, stop further comparisons
                    matched.add(given_kwarg)
                    break
        if len(matched) < len(kwargs):
            # Find replacement candidates with distance = 1
            ignored_kwargs = set()
            for given_kwarg in set(kwargs).difference(matched):
                candidates = []
                for kwattr in cls.KWATTRS:
                    d = matrix.get((given_kwarg, kwattr))
                    if d == 1:
                        candidates.append(kwattr)
                if len(candidates) == 0:
                    ignored_kwargs.add(given_kwarg)
                else:
                    warning(
                        "Creating %s: potential misprint in keyword argument "
                        "name: expected %s - got %s", obj,
                        " or ".join(candidates), given_kwarg)
            try:
                __IPYTHON__  # pylint: disable=E0602
                from IPython.terminal.interactiveshell import \
                    InteractiveShell
                ignored_kwargs -= set(InteractiveShell.instance().user_ns)
            except NameError:
                pass
            if len(ignored_kwargs) > 0:
                warning(
                    "Creating %s: ignored the following keyword arguments: %s",
                    obj, ", ".join(sorted(ignored_kwargs)))
        return obj
Example #23
File: TSparsing.py  Project: lbedogni/iot
def classificate(stream, x, y):

    # res is a dict that will hold, for each word to test, every class
    # mapped to the minimum distance found (against the words that 'compose' it)
    res = {}
    # **************** FIRST CLASSIFICATION *****************
    # For each class, compute the distance between its component words and the word under test
    for classe in dictionary.keys():

        # For each term in the dictionary of each class save the DL distance, then pick the minimum per class
        dam = [(damerau_levenshtein_distance(stream, campo_misurato)) for campo_misurato in
               dictionary[classe]]  # array of distances for this class

        if len(dam) > 0:
            res[classe] = min(dam)  # pick the minimum distance for each class

        #else:
            #res[classe] = 50  # to be changed, but for now it avoids problems with classes that have no words
        distanza_minima = res[min(res, key=res.get)]
        classi_con_stessa_distanza_minima = []  # fill a list to see whether the minimum distance found is duplicated
        for key, value in res.iteritems():  # TODO handle cases where several distances are equal !!
            if value == distanza_minima:
                # print 'distanza minima =', key
                classi_con_stessa_distanza_minima.append(key)

        if distanza_minima == 0:
            # TODO not sure whether to reclassify -> the same result may come out again
            if len(classi_con_stessa_distanza_minima) > 1:  # more than one class with distance 0 was found -> reclassify for those classes
                #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)
                res = res  # remove this
        else:
            """
            At this point, two conditions are checked:
            - whether the minimum distance found across all classes is greater than x% of len(stream)
            - whether there are two very similar distances whose difference is y% of the length
            """
            percent_lunghezza = (len(stream) * x) / 100
            # if the condition is not met, the class is accepted as good
            if distanza_minima > percent_lunghezza:
                # reclassify only for some classes !?
                # TODO look for the classes whose distances are similar to the minimum distance
                # add to the list of similar minimum distances the classes with different but similar distances
                for classe, dist in res.iteritems():
                    diff = (abs((distanza_minima - dist)) * y) / 100
                    if diff < percent_lunghezza and (dist != distanza_minima):
                        classi_con_stessa_distanza_minima.append(classe)

                    #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)

        # We finally decide the class and check whether it is right or wrong

    classe_attribuita = min(res, key=res.get)
    return classe_attribuita
Example #24
def wordsim1(word1, word2):
    '''return a similarity score for between the two words
    TODO: stem the words
    '''
    probs = [1, 0.7669349429912811, 0.1784037558685446,
             0.03386988598256204, 0.015090543259557344,
             0.004024144869215292, 0.001676727028839705]
    dist = damerau_levenshtein_distance(str(word1), str(word2))
    if dist<len(probs):
        return probs[dist]
    else:
        return 0
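
Worked example for wordsim1(): a Damerau-Levenshtein distance of 1 maps to probs[1] ≈ 0.767, and anything beyond six edits scores 0.

from pyxdameraulevenshtein import damerau_levenshtein_distance

probs = [1, 0.7669349429912811, 0.1784037558685446,
         0.03386988598256204, 0.015090543259557344,
         0.004024144869215292, 0.001676727028839705]
d = damerau_levenshtein_distance("color", "colour")  # 1 insertion
print(probs[d] if d < len(probs) else 0)             # 0.7669...
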
Example #25
 def get_candidates(self, word):
     """
     Damerau-Levenshtein edit distance is used to generate a candidate set of this word.
     :param word: source word used to generate a candidate set
     :return: the candidate set of this word
     """
     candidates = dict()
     for word_list_item in self.vocab_list:
         edit_distance = damerau_levenshtein_distance(word, word_list_item)
         if edit_distance <= 1:
             candidates[word_list_item] = edit_distance
     return sorted(candidates, key=candidates.get, reverse=False)
Example #26
    def rectify(self, word, prev_word):
        """
            Предсказания спеллера
        """

        if word == ',,':
            return ','
        if word == '..':
            return '...'

        # the query converted into n-grams
        char_ngrams_list = self.vectorizer.transform([word]).tocoo().col

        # choose the number of candidates based on the query length
        self.n_candidates = 350 if len(word) <= 4 else 250 if len(
            word) <= 7 else self.n_candidates

        # count n-gram matches for each term
        counter = Counter()

        for token_id in char_ngrams_list:
            for word_id in self.index[token_id]:
                counter[word_id] += 1

        # look for a "good" correction among the top n-gram matches

        # use the modified Levenshtein distance (with transpositions)
        # and also look for the word with the fewest new letters
        suggests = list()
        for suggest in counter.most_common(n=self.n_candidates):
            sugg = self.words_list[suggest[0]]
            dist = damerau_levenshtein_distance(sugg, word)
            context_list = self.voc_vectorizer.transform(
                [f"{prev_word} {sugg}"]).tocoo().col.tolist()
            if dist <= 5:
                suggs = [(sugg, dist, 0.0)]
                if context_list:
                    suggs = [(sugg, dist, self.voc.get(context, 0.0))
                             for context in context_list]

                suggests.extend(suggs)

        suggests = sorted(suggests, key=lambda tup: tup[1])

        minimal_distance = min(suggest[1] for suggest in suggests)
        candidates = sorted([
            (suggest[0], suggest[2]) for suggest in suggests
            if suggest[1] == minimal_distance and set(suggest[0]) == set(word)
        ],
                            key=lambda tup: -tup[1])

        return candidates[0][
            0] if candidates and candidates[0][1] > 0 else suggests[0][0]
Example #27
def compare(a, b):
    results = {
        'editdistance':
        editdistance.eval(a, b),
        'pylev':
        pylev.levenshtein(a, b),
        'python-Levenshtein':
        Levenshtein.distance(a, b),
        'pyxdameraulevenshtein':
        pyxdameraulevenshtein.damerau_levenshtein_distance(a, b),
    }
    return results
Example #28
def calculate_similarity(docs_list, similarity_type, threshold):
    ''' Calculate vector similarity of all possible pairs in list '''
    results = list()
    counter = 0

    # Get all possible combinations of tweets that have same NER
    all_combinations = list(combinations(docs_list, 2))

    # Filter handles, hashtags, emoticons, etc.

    for tweet_pair in all_combinations:
        tweet_pair[0].filter("*")
        tweet_pair[1].filter("*")

        # Filter out pairs with exact sentences
        if tweet_pair[0].clean_text != tweet_pair[1].clean_text:

            # Filter out sentences shorter than 4 words
            if tweet_pair[0].tweet_len() > 3 and tweet_pair[1].tweet_len() > 3:

                # Filter out those combinations with excessive word-count differences
                if abs(tweet_pair[0].tweet_len() - tweet_pair[1].tweet_len()) < 4:


                    if similarity_type == "jaccard":
                        settext1 = tweet_pair[0].word_set()
                        settext2 = tweet_pair[1].word_set()
                        d = jaccard_distance(settext1, settext2)

                    if similarity_type == "jaro_winkler":
                        d = 1 - distance.jaro_winkler_similarity(tweet_pair[0].clean_text, tweet_pair[1].clean_text)

                    if similarity_type == "levenshtein":
                        d = damerau_levenshtein_distance(tweet_pair[0].clean_text, tweet_pair[1].clean_text)

                    # Only return those results above the threshold
                    if d < threshold:

                        # Put in source sentences with more oov words and extra filter target
                        if tweet_pair[0].oov_words() > tweet_pair[1].oov_words():
                            bi_combination = tweet_pair[0].source_filter(), tweet_pair[1].target_filter()
                        else:
                            bi_combination = tweet_pair[1].source_filter(), tweet_pair[0].target_filter()

                        if bi_combination not in results:
                            results.append(bi_combination)
                            counter += 1


        sys.stdout.write(f"\rAdding combinations...")
        sys.stdout.flush()
    return results
Example #29
File: cluster.py  Project: smilli/clust
def dl_ngram_dist(ngram1, ngram2):
    """
    Compute distance between ngrams by summing the Damerau-Levenshtein distance
    for consecutive words in ngrams.

    Params:
        ngram1: [tuple] Tuple of words.
        ngram2: [tuple] Tuple of words.

    Returns:
        distance [int] Measure of distance between two ngrams.
    """
    return sum(damerau_levenshtein_distance(w1, w2) for w1, w2 in zip(ngram1,
        ngram2))
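
Usage example for dl_ngram_dist() (assuming the function above is in scope): word-by-word distances are summed, so a single in-word transposition contributes 1 to the total.

print(dl_ngram_dist(("new", "york", "city"), ("new", "yrok", "city")))  # 1 (one transposition)
print(dl_ngram_dist(("big", "red", "dog"), ("big", "red", "dog")))      # 0 (identical)
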
Example #30
def dl_ngram_dist(ngram1, ngram2):
    """
    Compute distance between ngrams by summing the Damerau-Levenshtein distance
    for consecutive words in ngrams.

    Params:
        ngram1: [tuple] Tuple of words.
        ngram2: [tuple] Tuple of words.

    Returns:
        distance [int] Measure of distance between two ngrams.
    """
    return sum(
        damerau_levenshtein_distance(w1, w2) for w1, w2 in zip(ngram1, ngram2))
Example #31
def riclassifica_per_tag(p, tags, classi_da_riclassificare):
    parola = p[0]
    tags.append(parola)
    res = {}
    if len(classi_da_riclassificare) > 0:
        print 'riclassifica per tag solo per alcune classi'
        for classe in classi_da_riclassificare:  # only the classes with a similar or equal distance
            dam = []
            for campo_misurato in dictionary[classe]:
                for tag in tags:
                    dam.append(damerau_levenshtein_distance(tag,campo_misurato))

            res[classe] = min(dam)
            # print 'dam', classe, '(', min(dam), ')-->', dam
        return res
    else:
        print 'riclassifica per tag per tutte le classi'
        for classe in dictionary.keys():  # all the classes
            dam = [[(damerau_levenshtein_distance(tag, campo_misurato)) for tag in tags] for campo_misurato in dictionary[classe]]
            res[classe] = min(dam)
            # print 'dam', classe, '----->', dam  # TODO !?!?!??!

        return res
Example #32
def convert_candidates(metaphone_candidates, detection, metaphone_dict):
    """
    :param metaphone_candidates: replacement candidates in Double Metaphone representation
    :param detection: misspelling
    :param metaphone_dict: output of load_metaphones()
    :return: candidates converted from Double Metaphone representation to normal lexical representation
    """

    converted_candidates = []
    for i, candidate in enumerate(metaphone_candidates):
        for item in metaphone_dict[candidate]:
            if len(set(item).intersection(set(
                    candidate))) >= 1:  # have at least one character in common
                if damerau_levenshtein_distance(
                        item, detection) <= 3:  # enough overlap
                    converted_candidates.append(item)

    return converted_candidates
Example #33
def calculate_global_dissimilarity_score(test_fingerprint,
                                         sampled_fingerprints):

    scores_list = []  # store scores between each (test_fingerprint, sampled_fingerprint) pair
    test_fingerprint_word, sampled_fingerprint_word_list = fingerprint2word(
        test_fingerprint, sampled_fingerprints)

    for sampled_fingerprint_word in sampled_fingerprint_word_list:

        distance = damerau_levenshtein_distance(sampled_fingerprint_word,
                                                test_fingerprint_word)
        scores_list.append(distance)

    normalized_scores_list = scores_list / float(max(scores_list))

    global_score = sum(normalized_scores_list)
    return global_score
Example #34
def compute_distances(file_path):
    hashes_dic = read_fuzzies("ris_androdump_safe/hashes.txt")
    hashes_dic_db = read_fuzzies("ris_androdump_safe/hashes_database.txt")

    with open(file_path, "w") as h_file:
        for apk_path, fuzzies_list in hashes_dic.items():
            for apk_path2, fuzzies_list2 in hashes_dic_db.items():
                sim_list = list()
                fam = apk_path.split("/")[-2]
                fam2 = apk_path2.split("/")[-2]
                if fam != fam2:
                    for fuzzy in fuzzies_list:
                        for fuzzy2 in fuzzies_list2:
                            #  Compute edit distance between two sub-fuzzies
                            dist = damerau_levenshtein_distance(fuzzy, fuzzy2)
                            if 0 < dist < 5:
                                sim_list.append((dist, (fuzzy, fuzzy2)))
                    # write down couple with delimiter
                    couple_str = fam + "/" + apk_path.split("/")[-1] + "@" + fam2 + "/" + apk_path2.split("/")[-1]
                    h_file.write(str((couple_str, sim_list))+"\n")
Example #35
def processtxn(txn, choices):
    maxscoreJ = 0
    matchstrJ = ""
    maxscoreDL = 0
    matchstrDL = ""
    maxscoreNDL = 0
    matchstrNDL = ""
    for c in choices:
        scoreJ = jaro.jaro_metric(txn, c)
        scoreDL = 1000 - damerau_levenshtein_distance(txn, c)
        scoreNDL = 1 - normalized_damerau_levenshtein_distance(txn, c)
        if scoreJ > maxscoreJ:
            matchstrJ = c
            maxscoreJ = scoreJ
        if scoreDL > maxscoreDL:
            matchstrDL = c
            maxscoreDL = scoreDL
        if scoreNDL > maxscoreNDL:
            matchstrNDL = c
            maxscoreNDL = scoreNDL
    return {'jaro': matchstrJ, 'dl': matchstrDL, 'ndl': matchstrNDL}
Example #36
def CorrectSpelling(speech, vocab_init, vocab_endanlegt):
    """Use Damerau Levenshtein distance to correct the spelling
    in the intermediate texts"""

    for word in vocab_init:
        #word_dict={}
        replaced = 0
        for w_endanlegt in vocab_endanlegt:
            #dist=MinEditDist(word,w_endanlegt)
            dist = damerau_levenshtein_distance(word, w_endanlegt)
            if dist == 1:
                speech = re.sub(r"\b%s\b" % word, w_endanlegt, speech)
                replaced = 1
                break
        #     else:
        #         word_dict[dist]=w_endanlegt

        # # Need to find the min dist and substitute if not already substituted
        # if replaced == 0:
        #     speech = re.sub(r"\b%s\b" % word,word_dict[min(word_dict,key=int)],speech)

    return speech
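
A self-contained sketch of the same idea with made-up vocabularies: every vocab_init word that is exactly one edit away from some vocab_endanlegt word gets rewritten in the text.

import re
from pyxdameraulevenshtein import damerau_levenshtein_distance

speech = "the wether is nice"
vocab_init = ["wether"]                   # hypothetical intermediate vocabulary
vocab_endanlegt = ["weather", "whether"]  # hypothetical final vocabulary

for word in vocab_init:
    for w_endanlegt in vocab_endanlegt:
        if damerau_levenshtein_distance(word, w_endanlegt) == 1:
            speech = re.sub(r"\b%s\b" % word, w_endanlegt, speech)
            break
print(speech)  # "the weather is nice"
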
Example #37
def levenshtein_candidates(word, vocab_dict, editdistance=2):
    """
    Generates candidates
    :param word: the misspelling for which to generate replacement candidates
    :param vocab_dict: the output of load_vocab()
    :param editdistance: the maximum Damerau-Levenshtein edit distance
    :return: list of unique candidates within the maximum edit distance
    """

    candidates = []

    word_len = len(word)
    set_len = len(set(word))

    if word_len <= 2:
        word_lengths = range(word_len, word_len + 1 + editdistance)
    else:
        word_lengths = range(word_len - editdistance,
                             word_len + 1 + editdistance)

    if set_len - editdistance > 0:
        set_lengths = range(set_len - editdistance, set_len + 1 + editdistance)
    else:
        set_lengths = range(set_len, set_len + 1 + editdistance)

    selection = []

    for i in word_lengths:
        key = vocab_dict[i]
        for j in set_lengths:
            selection += key[j]

    for item in set(selection):
        if damerau_levenshtein_distance(word, item) <= editdistance:
            candidates.append(item)

    full_candidates = list(set(candidates))

    return full_candidates
Example #38
def calculate_global_dissimilarity_score(test_fingerprint,
                                         sampled_fingerprints):

    scores_list = np.array([])  # store scores between each (test_fingerprint, sampled_fingerprint) pair
    test_fingerprint_word, sampled_fingerprint_word_list = fingerprint2word(
        test_fingerprint, sampled_fingerprints)

    cheat_flag = False  # true if the testing sample itself is sampled
    for sampled_fingerprint_word in sampled_fingerprint_word_list:

        distance = damerau_levenshtein_distance(sampled_fingerprint_word,
                                                test_fingerprint_word)
        if distance == 0:
            cheat_flag = True
        scores_list = np.append(scores_list, distance)

    normalized_scores_list = scores_list / float(max(scores_list))

    global_score = sum(normalized_scores_list)
    return global_score, cheat_flag
Example #39
    def rectify(self, word):
        """
            Предсказания спеллера
        """

        # the query converted into n-grams
        char_ngrams_list = self.vectorizer.transform([word]).tocoo().col

        # choose the number of candidates based on the query length
        self.n_candidates = 350 if len(word) <= 4 else 250 if len(word) <= 7 else self.n_candidates

        # count n-gram matches for each term
        counter = Counter()

        for token_id in char_ngrams_list:
            for word_id in self.index[token_id]:
                counter[word_id] += 1

        # look for a "good" correction among the top n-gram matches

        # use the modified Levenshtein distance (with transpositions)
        # and also look for the word with the fewest new letters
        suggests = list()
        for suggest in counter.most_common(n=self.n_candidates):
            sugg = self.words_list[suggest[0]]
            dl_distance = damerau_levenshtein_distance(sugg, word)
            fitted_sugg_list = self.voc_vectorizer.transform([sugg]).tocoo().col
            if dl_distance <= 5:
                suggests.append((sugg, dl_distance, self.voc[fitted_sugg_list[0]] if fitted_sugg_list else 0))

        suggests = sorted(suggests, key=lambda tup: tup[1])

        minimal_distance = min([suggest[1] for suggest in suggests])
        swap_words = sorted(
            [(suggest[0], suggest[2]) for suggest in suggests
             if suggest[1] == minimal_distance and set(suggest[0]) == set(word)],
            key=lambda tup: -tup[1])

        return swap_words[0][0] if swap_words and swap_words[0][1] > 0 else suggests[0][0]
Example #40
def calcola_precisione_dizionario(lista_cluster):
    totale_campi = len(lista_cluster)
    classi_attribuite = {}
    # riempi_dizionario('coppieCampoClasse.csv')
    riempi_dizionario('training_set_1.csv')
    for elem in lista_cluster:
        res = {}
        for classe in dictionary.keys():
            dam = [(damerau_levenshtein_distance(elem, campo_misurato)) for campo_misurato in dictionary[classe]]
            res[classe] = min(dam)

        classe_attribuita = min(res, key=res.get)

        if classi_attribuite.get(classe_attribuita) is None:
            classi_attribuite[classe_attribuita] = 1
        else:
            classi_attribuite[classe_attribuita] += 1

    classe_piu_frequente = max(classi_attribuite, key=classi_attribuite.get)
    # 100 : totale_campi = x : classi_attribuite[classe_piu_frequente]
    percentuale = (100 * classi_attribuite[classe_piu_frequente])/totale_campi

    return classe_piu_frequente, percentuale
Example #41
def classificazione(x, y):
    parole = get_words('test_set_1.csv')
    for p in parole:

        # Extract parts of the record
        parola = p[0]
        classe_effettiva = p[1]
        if len(p) > 4:
            tags = []
            for i in range(4, len(p)):
                tags.append(p[i])

        # res is a dict that will hold, for each word to test, every class
        # mapped to the minimum distance found (against the words that 'compose' it)
        res = {}
        # **************** FIRST CLASSIFICATION *****************
        # For each class, compute the distance between its component words and the word under test
        for classe in dictionary.keys():
            if classe == classe_effettiva:
                count_metriche[classe]['effettive'] += 1  # for each class in the dictionary, track how many really belong to it

            # For each term in the dictionary of each class save the DL distance, then pick the minimum per class
            dam = [(damerau_levenshtein_distance(p[0], campo_misurato)) for campo_misurato in
                   dictionary[classe]]  # array of distances for this class
            res[classe] = min(dam)  # pick the minimum distance for each class

        distanza_minima = res[min(res, key=res.get)]
        classi_con_stessa_distanza_minima = []  # fill a list to see whether the minimum distance found is duplicated
        print 'PAROLA', parola, 'CLASSE', classe_effettiva, '-', x, '%', (len(parola) * x) / 100, 'distanza minima:', distanza_minima
        for key, value in res.iteritems():  # TODO handle cases where several distances are equal !!
            if value == distanza_minima:
                # print 'distanza minima =', key
                classi_con_stessa_distanza_minima.append(key)
        lista_distanze = []

        for c in res.keys():
            lista_distanze.append(res[c])

        if distanza_minima == 0:
            # print 'LA DISTANZA MINIMA è 0'
            # TODO not sure whether to reclassify -> the same result may come out again
            if lista_distanze.count(0) > 1:  # more than one class with distance 0 was found -> reclassify for those classes
                #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)
                res = res  # remove this
        else:
            # print 'LA DISTANZA MINIMA NON è 0'
            """
            At this point, two conditions are checked:
            - whether the minimum distance found across all classes is greater than x% of len(parola)
            - whether there are two very similar distances whose difference is y% of the length
            """
            percent_lunghezza = (len(parola) * x) / 100

            if distanza_minima > percent_lunghezza:
                # reclassify only for some classes !?
                # TODO look for the classes whose distances are similar to the minimum distance
                # add to the list of similar minimum distances the classes with different but similar distances

                for classe, dist in res.iteritems():
                    diff = (abs((distanza_minima - dist)) * y) / 100
                    if diff < percent_lunghezza and (dist != distanza_minima):
                        classi_con_stessa_distanza_minima.append(classe)

                    #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)

        # We finally decide the class and check whether it is right or wrong
        classe_attribuita = min(res, key=res.get)
        count_metriche[classe_attribuita]['classificate'] += 1
        if classe_effettiva == classe_attribuita:
            count_metriche[classe_attribuita]['classificate_esatte'] += 1
        print 'CLASSE ATTRIBUITA', classe_attribuita, 'distanza', res[classe_attribuita]

    return count_metriche
Example #42
                              'effettive': 0,
                              'classificate_esatte': 0}

file2 = open('risultatoDictionary.csv', 'a+')
# For each word to test, compute the distance between that word and every word in the list of field names stored in the dictionary
for p in parole:
    classe_effettiva = p[1]
    # res is a dict that will hold, for each word to test, every class mapped to the minimum distance found (against the words that 'compose' it)
    res = {}
    # For each class, compute the distance between its component words and the word under test
    for classe in dictionary.keys():
        if classe == classe_effettiva:
            count_metriche[classe][
                'effettive'] += 1  # among the set of words, how many actually belong to this class

        dam = [(damerau_levenshtein_distance(p[0], campo_misurato)) for campo_misurato in
               dictionary[classe]]  # array of distances for this class
        res[classe] = min(dam)
        # if len(dam) != 0:  #
        #     res[classe] = min(dam)

    # print res
    classe_attribuita = min(res, key=res.get)
    # print 'Classe attribuita -->', classe_attribuita, '\n'
    count_metriche[classe_attribuita]['classificate'] += 1  # how many times this class is assigned (right or wrong)
    # file2.write(str(p[0]) + ',' + ',' + str(p[1]) + ',' + str(min(res, key=res.get)) + '\n')
    if classe_effettiva == classe_attribuita:
        count_metriche[classe_attribuita]['classificate_esatte'] += 1


print count_metriche
Example #43
File: stats.py  Project: amitdo/nidaba
def text_edit_ratio(doc, method=u'text_edit_ratio', ground_truth=None,
                    xml_in=True, gt_format='tei', clean_in=True, clean_gt=True,
                    divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the Damerau-Levenshtein distance. The result is a value between 0.0
    (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of ground
                                truths to choose from. When more than one is
                                given, the file sharing the longest prefix with
                                the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain
                       without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format + ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    edist = 1.0 - normalized_damerau_levenshtein_distance(text, gt)
    logger.debug('Edit distance: {}'.format(damerau_levenshtein_distance(text, gt)))
    logger.debug('Accuracy: {}'.format(edist))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(edist))
        return output_path
    else:
        return {'edit_ratio': edist, 'ground_truth': ground_truth, 'doc': doc}
Example #44
 def error_prob(self, error, poss):
   dist = damerau_levenshtein_distance(error, poss)
   prob = (1/(2**dist))
   return prob
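
error_prob() halves the probability with each edit: distance 0 gives 1.0, distance 1 gives 0.5, distance 2 gives 0.25, and so on. A quick standalone check:

from pyxdameraulevenshtein import damerau_levenshtein_distance

dist = damerau_levenshtein_distance("adress", "address")  # 1 (missing 'd')
print(1.0 / (2 ** dist))                                  # 0.5
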
Example #45
	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
	INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
	SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
	WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

from pyxdameraulevenshtein import damerau_levenshtein_distance, normalized_damerau_levenshtein_distance
import random
import string
import timeit

print('#edit distances (low edit distance means words are similar):')
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('smtih', 'smith', damerau_levenshtein_distance('smtih', 'smith')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('snapple', 'apple', damerau_levenshtein_distance('snapple', 'apple')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('testing', 'testtn', damerau_levenshtein_distance('testing', 'testtn')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('saturday', 'sunday', damerau_levenshtein_distance('saturday', 'sunday')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('Saturday', 'saturday', damerau_levenshtein_distance('Saturday', 'saturday')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('orange', 'pumpkin', damerau_levenshtein_distance('orange', 'pumpkin')))
print("damerau_levenshtein_distance('%s', '%s') = %d #unicode example\n" % ('Sjöstedt', 'Sjostedt', damerau_levenshtein_distance('Sjöstedt', 'Sjostedt'))) #unicode example

print('#normalized edit distances (low ratio means words are similar):')
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('smtih', 'smith', normalized_damerau_levenshtein_distance('smtih', 'smith')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('snapple', 'apple', normalized_damerau_levenshtein_distance('snapple', 'apple')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('testing', 'testtn', normalized_damerau_levenshtein_distance('testing', 'testtn')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('saturday', 'sunday', normalized_damerau_levenshtein_distance('saturday', 'sunday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('Saturday', 'saturday', normalized_damerau_levenshtein_distance('Saturday', 'saturday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('orange', 'pumpkin', normalized_damerau_levenshtein_distance('orange', 'pumpkin')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f #unicode example\n" % ('Sjöstedt', 'Sjostedt', normalized_damerau_levenshtein_distance('Sjöstedt', 'Sjostedt'))) #unicode example
Example #46
def test_damerau_levenshtein_distance(self):
    assert damerau_levenshtein_distance('smtih', 'smith') == 1
    assert damerau_levenshtein_distance('snapple', 'apple') == 2
    assert damerau_levenshtein_distance('testing', 'testtn') == 2
    assert damerau_levenshtein_distance('saturday', 'sunday') == 3
    assert damerau_levenshtein_distance('Saturday', 'saturday') == 1
    assert damerau_levenshtein_distance('orange', 'pumpkin') == 7
    assert damerau_levenshtein_distance('gifts', 'profit') == 5
    assert damerau_levenshtein_distance('Sjöstedt', 'Sjostedt') == 1
    assert damerau_levenshtein_distance([1, 2, 3], [1, 3, 2]) == 1
    assert damerau_levenshtein_distance((1, 2, 3), (1, 3, 2)) == 1
    assert damerau_levenshtein_distance((1, 2, 3), [1, 3, 2]) == 1
    assert damerau_levenshtein_distance([], []) == 0
    assert damerau_levenshtein_distance(list(range(10)), list(range(1, 11))) == 2
    assert damerau_levenshtein_distance([1, 2, 3, 4, 5, 6], [7, 8, 9, 7, 10, 11, 4]) == 7
def descr_damerau_levenshtein(row):
    return damerau_levenshtein_distance(row['description_1'], row['description_2'])
def title_damerau_levenshtein(row):
    return damerau_levenshtein_distance(row['title_1'], row['title_2'])
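# Usage sketch for the two row-level helpers above: they are presumably applied across a
# table of item pairs. The pandas DataFrame and its contents below are illustrative
# assumptions, not data from the original project.
import pandas as pd

df = pd.DataFrame({
    'title_1': ['iphone 6s', 'galaxy s7'],
    'title_2': ['iphone 6', 'galaxy s7 edge'],
    'description_1': ['gold, 64gb', 'black, unlocked'],
    'description_2': ['gold 64 gb', 'black unlocked phone'],
})
# apply the helpers row by row to produce per-pair edit-distance features
df['title_dl'] = df.apply(title_damerau_levenshtein, axis=1)
df['descr_dl'] = df.apply(descr_damerau_levenshtein, axis=1)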
예제 #49
0
파일: levenshtein.py 프로젝트: bmccary/csvu
def score_g():
    for row in row_g:
        row[target] = damerau_levenshtein_distance(row[column], string)
        yield row
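# Usage sketch for score_g above: row_g, column, target and string are presumably defined
# in the enclosing scope of the original csvu code; the stand-ins below are illustrative
# assumptions only.
row_g = iter([{'name': 'jon'}, {'name': 'jhon'}, {'name': 'joan'}])
column, target, string = 'name', 'name_distance', 'john'

for scored in score_g():
    print(scored)  # each row gains a 'name_distance' field with the DL distance to 'john'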
예제 #50
0
def edit_distance_norm(word1, word2):
    dmdist = float(damerau_levenshtein_distance(word1, word2))
    #return dmdist / (float(len(word1) + len(word2)) / 2)  # mean of the two lengths
    #return dmdist                                         # raw distance
    #return dmdist / float(max(len(word1), len(word2)))    # max of the two lengths
    return dmdist / float(len(word1) + len(word2))         # sum of the two lengths
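# Quick sanity check of the chosen normalization (distance divided by the sum of the two
# lengths); the example words and expected values are illustrative.
print(edit_distance_norm('smtih', 'smith'))    # DL distance 1 over 5 + 5 characters -> 0.1
print(edit_distance_norm('snapple', 'apple'))  # DL distance 2 over 7 + 5 characters -> ~0.167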
예제 #51
0
        if classi_attribuite.get(classe_attribuita) is None:
            classi_attribuite[classe_attribuita] = 1
        else:
            classi_attribuite[classe_attribuita] += 1

    classe_piu_frequente = max(classi_attribuite, key=classi_attribuite.get)
    # proportion: 100 : totale_campi = percentuale : classi_attribuite[classe_piu_frequente]
    percentuale = (100 * classi_attribuite[classe_piu_frequente]) / totale_campi

    return classe_piu_frequente, percentuale
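# The fragment above looks like the tail of a majority-vote helper that tallies the
# assigned classes and reports the share of the most frequent one. A self-contained
# sketch of the same tally-and-percentage logic; the function name and input values are
# hypothetical, not part of the original code.
def classe_piu_frequente_con_percentuale(classi_assegnate):
    classi_attribuite = {}
    for classe in classi_assegnate:
        classi_attribuite[classe] = classi_attribuite.get(classe, 0) + 1
    totale_campi = len(classi_assegnate)
    classe_piu_frequente = max(classi_attribuite, key=classi_attribuite.get)
    percentuale = (100 * classi_attribuite[classe_piu_frequente]) / totale_campi
    return classe_piu_frequente, percentuale

# e.g. 3 of 4 fields assigned to 'price' -> 'price', 75%
print(classe_piu_frequente_con_percentuale(['price', 'price', 'date', 'price']))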

# Affinity propagation
words = np.asarray(parole)  # So that indexing with a list will work

dam = np.array([[damerau_levenshtein_distance(w1, w2) for w1 in words] for w2 in words])

distance_matrix = dam  # matrix of pairwise edit distances
affinity_matrix = 1 - distance_matrix

# AFFINITY PROPAGATION CLUSTERING #
mymat = -1 * distance_matrix
print mymat
# Perform Affinity Propagation Clustering of data
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
# Use the negated Damerau-Levenshtein distance matrix as the precomputed affinity, then apply affinity propagation clustering.
affprop.fit(mymat)

percentuali = 0
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
예제 #52
0
import numpy as np  # from numpy package
import sklearn.cluster  # from sklearn package
import distance  # from distance package
import jaro
from pyxdameraulevenshtein import damerau_levenshtein_distance
import sys

from bozza import words     # TODO: remove this import
#words = []

# words = np.asarray(words)  # So that indexing with a list will work
words = np.asarray(words)  # So that indexing with a list will work

dam = np.array([[damerau_levenshtein_distance(w1, w2) for w1 in words] for w2 in words])  # Damerau-Levenshtein distance matrix

distance_matrix = dam  # matrix of pairwise edit distances
affinity_matrix = 1 - distance_matrix

## AFFINITY PROPAGATION CLUSTERING ##
mymat = -1 * distance_matrix
print mymat
# Perform Affinity Propagation Clustering of data
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
# Use the negated Damerau-Levenshtein distance matrix as the precomputed affinity, then apply affinity propagation clustering.
affprop.fit(mymat)
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
    cluster_str = ", ".join(cluster)
    #print(" - *%s:* %s" % (exemplar, cluster_str))  # (exemplar, cluster_str))
    print exemplar + "-----------"
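# A possible refinement (not what the snippet above does): raw Damerau-Levenshtein
# distances are unbounded, so the unused `affinity_matrix = 1 - distance_matrix` only
# yields a sensible similarity with the normalized distance, which lies in [0, 1].
# The sketch below reuses `words`, np and sklearn.cluster from above; it is an
# assumption about how one might do this, not part of the original project.
from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

norm_dist = np.array([[normalized_damerau_levenshtein_distance(w1, w2) for w1 in words]
                      for w2 in words])
similarity = 1 - norm_dist  # 1 for identical words, close to 0 for very different ones
affprop_norm = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
affprop_norm.fit(similarity)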
예제 #53
0
def classificazione(test_set, x, y, number, current_prefix):
    l = len(test_set) - 1
    progress = 0
    for stream in test_set:
        printProgress(progress, l, prefix=current_prefix, suffix='Complete', barLength=50)
        field = stream["field_name"].strip().replace(' ', '_')
        #print "Classifying field: " + field

        # res is a dict that will hold, for each word under test, every class
        # together with the minimum distance found for it (against the words that make up that class)
        res = {}
        # **************** FIRST CLASSIFICATION *****************
        # For each class, compute the distance between the word under test and each word the class is composed of
        for classe in dictionary.keys():
            if classe == stream["field_class"]:
                count_metriche[classe]['effettive'] += 1  # for each class in the dictionary, count how many fields really belong to it

            # For each term in the class dictionary, compute the DL distance to the field; the minimum per class is kept below
            dam = [damerau_levenshtein_distance(field, campo_misurato) for campo_misurato in
                dictionary[classe]]  # one array of distances per class

            if len(dam) > 0:
                res[classe] = min(dam)  # keep the minimum distance for each class
            #else:
                #res[classe] = 50  # to be revised, but for now it avoids problems with classes that have no words

        distanza_minima = res[min(res, key=res.get)]
        classi_con_stessa_distanza_minima = []  # build a list to check whether the minimum distance found is shared by several classes
        for key, value in res.iteritems():  # TODO: handle cases where several distances are equal !!
            if value == distanza_minima:
                # print 'minimum distance =', key
                classi_con_stessa_distanza_minima.append(key)
        '''
        lista_distanze = []
        for c in res.keys():
            lista_distanze.append(res[c])
        print "lista_distanze",lista_distanze
        '''
        if distanza_minima == 0:
            # TODO: not sure whether to reclassify -> it may yield the same result again
            if len(classi_con_stessa_distanza_minima) > 1:  # more than one class was found at distance 0 -> reclassify among those classes only
                #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)
                res = res  # placeholder, remove this
        else:
            """
            At this point, two conditions are checked:
            - whether the minimum distance found across all classes is greater than x% of len(field)
            - whether there are two very close distances whose difference is within y% of the length
            """
            percent_lunghezza = (len(field) * x) / 100

            # if the condition below is not met, the minimum distance is accepted as good
            if distanza_minima > percent_lunghezza:
                # reclassify only for some classes !?
                # TODO: look for the classes whose distances are close to the minimum distance
                # add to the list of classes sharing the minimum distance those whose distances differ but are close to it
                for classe, dist in res.iteritems():
                    diff = (abs(distanza_minima - dist) * y) / 100
                    if diff < percent_lunghezza and (dist != distanza_minima):
                        classi_con_stessa_distanza_minima.append(classe)

                    #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)

        # Finally decide the class and check whether it is right or wrong
        classe_attribuita = min(res, key=res.get)
        count_metriche[classe_attribuita]['classificate'] += 1
        if stream["field_class"] == classe_attribuita:
            count_metriche[classe_attribuita]['classificate_esatte'] += 1
        
        y_real.append(stream["field_class"])
        y_assigned.append(classe_attribuita)
        #print 'CLASSE ATTRIBUITA', classe_attribuita, 'distanza', res[classe_attribuita]
        db.store_classification(stream,classe_attribuita,res[classe_attribuita],number,"ClassificationRelevant")
        progress = progress + 1
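# The loop above maintains per-class counters ('effettive', 'classificate',
# 'classificate_esatte'). A hedged sketch of how per-class precision and recall could be
# derived from such counters after the loop; the helper name is hypothetical and not part
# of the original code.
def metriche_per_classe(count_metriche):
    risultati = {}
    for classe, conteggi in count_metriche.items():
        classificate = conteggi['classificate']     # fields assigned to this class
        effettive = conteggi['effettive']           # fields that really belong to it
        esatte = conteggi['classificate_esatte']    # correctly assigned fields
        precision = float(esatte) / classificate if classificate else 0.0
        recall = float(esatte) / effettive if effettive else 0.0
        risultati[classe] = (precision, recall)
    return risultati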