def three_recommended_items(request, id):
    """
    Builds the list of products recommended to the user, taking care not to
    recommend the product itself. Uses the textdistance package and
    Levenshtein similarity.
    :param id: id of the product
    :return: queryset of the recommended products
    """
    import textdistance
    from django.db.models import Q

    all_products = Product.objects.all()
    nome = Product.objects.get(id=id).name
    user_products = Product.objects.filter(user__email=request.user.email)
    all_products = all_products.difference(user_products)

    all_products_names = [p.name for p in all_products]
    if nome in all_products_names:
        all_products_names.remove(nome)
    # names of the user's own products, used as the similarity reference
    user_products_names = [p.name for p in user_products]

    if len(all_products_names) < 3 or not user_products_names:
        return 0

    # rank the candidates by Levenshtein distance to the user's first
    # product and keep the three closest names
    reference = user_products_names[0]
    closest = sorted(
        all_products_names,
        key=lambda name: textdistance.levenshtein(reference, name))[:3]
    return Product.objects.filter(
        Q(name=closest[0]) | Q(name=closest[1]) | Q(name=closest[2]))
def dictionary_searcher(query, dictionary):
    # Let's find the most similar disease to what we've got
    keys = list(dictionary.keys())
    nearest_key = keys[0]
    min_distance = textdistance.levenshtein(query, nearest_key)
    for dict_key in keys[1:]:
        distance = textdistance.levenshtein(query, dict_key)
        if distance < min_distance:
            min_distance = distance
            nearest_key = dict_key
    return nearest_key, dictionary[nearest_key]
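# Hedged usage sketch for dictionary_searcher (the dictionary below is
# hypothetical): a misspelled query resolves to the closest key.
diseases = {'influenza': 'viral', 'pneumonia': 'bacterial'}
print(dictionary_searcher('influensa', diseases))  # ('influenza', 'viral')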
def levenshtein_accuracy(input, output, token):
    if token:
        gap = len(output) - len(input)
        if gap == 0:
            # same number of tokens: average the per-token edit distances
            total = 0
            for i in range(len(input)):
                total += td.levenshtein(input[i], output[i])
            accuracy = total / len(input)
        else:
            # token counts differ: fall back to the joined-string comparison
            # used in the non-token path below (assumption; the source left
            # the unequal-length branches unfinished)
            accuracy = td.levenshtein.normalized_similarity(
                ' '.join(input), ' '.join(output))
    else:
        input_string = ' '.join(input)
        output_string = ' '.join(output)
        accuracy = td.levenshtein.normalized_similarity(
            input_string, output_string)
    return accuracy
def merge_substrings(entities):
    """
    This function eliminates entities which are already substrings of other
    entities.

    e.g.:
    input:  ['Ana Lourenço', 'Ana Dias Lourenço', 'Ana Afonso Dias Lourenço']
    output: ['Ana Afonso Dias Lourenço']

    Based on the principle that if a polysemous word appears two or more
    times in a written discourse, it is extremely likely that they will all
    share the same sense. (see: https://www.aclweb.org/anthology/H92-1045.pdf)
    """
    new_entities = []
    # sort the entities by length
    entities_sorted = sorted(
        [EntityLinking.clean_entity(x) for x in entities], key=len)
    # starting with the shortest one, see if it's a substring (or within
    # edit distance 3) of any of the longer ones
    for idx, x in enumerate(entities_sorted):
        found = False
        for other in entities_sorted[idx + 1:]:
            if x in other or textdistance.levenshtein(x, other) <= 3:
                found = True
                break
        if not found and x not in new_entities:
            new_entities.append(x)
    return new_entities
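# A minimal, standalone illustration of the matching rule in merge_substrings:
# an entity is merged away when it is a contiguous substring of a longer one,
# or within Levenshtein distance 3 of it.
import textdistance

longer = 'Ana Afonso Dias Lourenço'
print('Dias Lourenço' in longer)                                   # True: merged
print(textdistance.levenshtein('Ana Dias Lourenço', longer) <= 3)  # False (distance 7)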
def getRegionTypoRecommendation(s):
    r = getRegionNames()
    s2 = s.lower() if s.isupper() else s
    return min(r, key=lambda x: levenshtein(s2, x))
def BFS(self, letters, words, index):
    # Recursively checks whether the remaining letters can be matched
    # against the remaining words.
    if len(letters) == 0:
        return index
    if letters[0] != words[0][0]:
        return -1
    if len(letters) == 1:
        return index + 1
    if len(words) == 1:
        # the remaining letters must all be deletions from the last word
        if textdistance.levenshtein(
                words[0], letters) == len(words[0]) - len(letters):
            return index + 1
        else:
            return -1
    queue = []
    queue.append((letters[1:], words[1:]))
    j = 1
    for i in range(1, len(letters)):
        if j != len(words[0]) and letters[i] in words[0][j:]:
            queue.append((letters[i + 1:], words[1:]))
            j += 1
    while len(queue) != 0:
        (l, w) = queue.pop(0)
        temp = self.BFS(l, w, index + 1)
        if temp == -1 and len(queue) == 0:
            return temp
        elif temp != -1:
            return temp
def distanceLevenhstein(centroid: str,
                        mutants: List[str],
                        latent_mutants: np.ndarray = None,
                        latent_mutants_encoder: np.ndarray = None) -> Dict:
    lev = [levenshtein(centroid, mutant) for mutant in mutants]
    lev_min = float(np.min(lev))
    lev_max = float(np.max(lev))
    avg = float(np.mean(lev))
    var = float(np.var(lev))
    rang = lev_max - lev_min
    if latent_mutants is not None:
        latent_distances = pdist(latent_mutants, 'euclidean')
        latent_distances_encoder = pdist(latent_mutants_encoder, 'euclidean')
        corrOuts = corrLevenshtein(mutants, latent_distances,
                                   latent_distances_encoder)
    else:
        corrOuts = {}
    return {
        **corrOuts,
        'avg_l': avg,
        'var_l': var,
        'max_l': lev_max,
        'min_l': lev_min,
        'rang_l': rang,
    }
def ref_levenshtein(parser_N, src_name, ld_max):
    """
    Info:
        Calculates the adjacency matrix with the plain textdistance
        Levenshtein distance, to verify that our algorithm is correct.
    Args:
        parser_N: number of sequences to load
        src_name: name of the sequence source
        ld_max: distance threshold below which two sequences count as adjacent
    Returns:
        - (writes the adjacency matrix to two_number_indices_ref.txt)
    """
    plotData.clusterMinLen = 1
    plotData.N = parser_N  # 10**6 / 4*10**3
    plotData.min_ldVal = -1
    plotData.maxVal = 3
    plotData.max_ldVal = plotData.maxVal
    gpu_l = 8000  # 5000
    step = 0
    _, _, seq, _, _ = dic.loadSequence(step, plotData, isExtractNum=False,
                                       src=src_name)
    a = np.zeros([len(seq), len(seq)])
    for i in range(len(seq)):
        for j in range(len(seq)):
            dist = td.levenshtein(seq[i], seq[j])
            a[i][j] = 1 if dist < ld_max else 0
    name = "two_number_indices_ref.txt"
    with open(name, 'w') as f:
        np.savetxt(f, a, fmt='%i')
    print("\n saved under: ", name)
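# Tiny self-contained illustration of the thresholding in ref_levenshtein:
# sequences with distance below ld_max are marked adjacent (1), others 0.
import numpy as np
import textdistance as td

seq = ['ACGT', 'ACGA', 'TTTT']
ld_max = 2
a = np.array([[1 if td.levenshtein(s, t) < ld_max else 0 for t in seq]
              for s in seq])
print(a)  # [[1 1 0] [1 1 0] [0 0 1]]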
def find_route_list(list_eng, route, check=0):
    # check determines whether the function counts the matched destinations
    # (check == 0) or collects the list of matched destinations (check == 1)
    count = 0
    index = 0
    for i in range(len(list_eng)):
        ratio = 0
        for j in range(len(route)):
            # similarity ratio between the strings
            test_ratio = sm(None, list_eng[i], route[j]).ratio()
            # Levenshtein distance between the strings
            val = td.levenshtein(list_eng[i], route[j])
            if test_ratio > 0.73 and test_ratio > ratio and val < 5:
                ratio = test_ratio
                index = j  # index of the most probable destination
        if ratio > 0.73:
            if check == 0:
                count = count + 1
            elif check == 1:
                list_final.append(route[index])
    if check == 0:
        count_final.append(count)
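# Standalone sketch of the dual test in find_route_list, assuming sm is
# difflib.SequenceMatcher as in the snippet (hypothetical stop names): a
# candidate matches only when the similarity ratio exceeds 0.73 AND the
# Levenshtein distance is under 5.
from difflib import SequenceMatcher as sm
import textdistance as td

a, b = 'Majestic Bus Stand', 'Majestic Bus Stop'
print(sm(None, a, b).ratio() > 0.73, td.levenshtein(a, b) < 5)  # True True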
def levenshtein(c0, centres, dim):
    # note: dim is not used
    c0 = tuple(c0)
    distances = np.empty(len(centres))
    for idx, c1 in enumerate(centres):
        c1 = tuple(c1)
        distances[idx] = textdistance.levenshtein(c0, c1)
    return distances
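# textdistance.levenshtein accepts arbitrary hashable sequences, not just
# strings, so the helper above can compare cluster centres as tuples of
# numbers (hypothetical data).
import numpy as np

centres = np.array([(1, 2, 4), (1, 2, 3)])
print(levenshtein((1, 2, 3), centres, dim=3))  # [1. 0.]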
def levenshtein(string1: str, string2: str) -> int:
    """The minimum number of single-character edits (insertions, deletions
    or substitutions) required to change one word into the other.

    https://en.wikipedia.org/wiki/Levenshtein_distance
    """
    dist: int = textdistance.levenshtein(string1, string2)
    return dist
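# Quick check of the definition above: 'kitten' -> 'sitting' needs three
# edits (substitute k->s, substitute e->i, insert g), so the distance is 3.
print(levenshtein('kitten', 'sitting'))  # 3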
def get_recruit_tags(img):
    import textdistance
    vw, vh = common.get_vwvh(img)
    # crop the five tag regions (three on the top row, two on the bottom)
    tagimgs = [
        img.crop((50 * vw - 36.481 * vh, 50.185 * vh, 50 * vw - 17.315 * vh, 56.111 * vh)).convert('L'),
        img.crop((50 * vw - 13.241 * vh, 50.185 * vh, 50 * vw + 6.111 * vh, 56.111 * vh)).convert('L'),
        img.crop((50 * vw + 10.000 * vh, 50.185 * vh, 50 * vw + 29.259 * vh, 56.111 * vh)).convert('L'),
        img.crop((50 * vw - 36.481 * vh, 60.278 * vh, 50 * vw - 17.315 * vh, 66.019 * vh)).convert('L'),
        img.crop((50 * vw - 13.241 * vh, 60.278 * vh, 50 * vw + 6.111 * vh, 66.019 * vh)).convert('L'),
    ]
    tagimgs = [
        Image.fromarray(
            cv2.threshold(img.array, 127, 255, cv2.THRESH_BINARY_INV)[1])
        for img in tagimgs
    ]
    eng = ocr.acquire_engine_global_cached('zh-cn')
    recognize = lambda img: eng.recognize(
        img, int(vh * 20), hints=[ocr.OcrHint.SINGLE_LINE],
        char_whitelist=known_tagchars).text.replace(' ', '')
    cookedtags = []
    for img in tagimgs:
        logger.logimage(img)
        tag = recognize(img)
        logger.logtext(tag)
        if not tag:
            continue
        if tag in known_tags:
            cookedtags.append(tag)
            continue
        # autocorrect the OCR output to the nearest known tag
        distances = [(target, textdistance.levenshtein(tag, target))
                     for target in known_tags.difference(cookedtags)]
        distances.sort(key=lambda x: x[1])
        mindistance = distances[0][1]
        matches = [x[0] for x in distances if x[1] == mindistance]
        if mindistance > 2:
            logger.logtext('autocorrect: minimum distance %d too large' % mindistance)
            cookedtags.append(tag)
        elif len(matches) == 1:
            logger.logtext('autocorrect to %s, distance %d' % (matches[0], mindistance))
            cookedtags.append(matches[0])
        else:
            logger.logtext('autocorrect: failed to match in %s with distance %d'
                           % (','.join(matches), mindistance))
            cookedtags.append(tag)
    return cookedtags
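# Standalone sketch of the autocorrect rule in get_recruit_tags (hypothetical
# tag set): accept the unique nearest known tag when its distance is at most
# 2, otherwise keep the raw OCR output.
import textdistance

known = {'治疗', '输出', '生存'}
tag = '冶疗'  # hypothetical OCR misread of '治疗'
distances = sorted((textdistance.levenshtein(tag, t), t) for t in known)
d0 = distances[0][0]
matches = [t for d, t in distances if d == d0]
print(matches[0] if d0 <= 2 and len(matches) == 1 else tag)  # '治疗'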
def get_reward(reference, hypothesis, source):
    # return gleu_calc.sentence_gleu(hypothesis, reference, source)
    try:
        reward = reward_function((reference, hypothesis))
    except Exception:
        # fall back to negative edit distance if the reward function fails
        return -textdistance.levenshtein(reference, hypothesis)
    return reward
def motSimilaire(motCle, Dico):
    liste = []
    if len(motCle) > 2:
        for key in Dico.keys():
            if textdistance.levenshtein(motCle, key) < 2:
                liste.append(key)
    else:
        liste.append(motCle)
    return liste
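# Hedged usage sketch for motSimilaire (hypothetical dictionary): the < 2
# threshold keeps only keys within one edit of the keyword.
import textdistance

Dico = {'chien': 1, 'chat': 2, 'chats': 3}
print(motSimilaire('chat', Dico))  # ['chat', 'chats'] ('chien' is 3 edits away)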
def Levenshtein(str1, match_against):
    best_match = ['', float('inf')]
    str_comparison = [[x, textdistance.levenshtein(str1, x)]
                      for x in match_against]
    for item in str_comparison:
        if item[1] < best_match[1]:
            best_match = item
    return best_match
def findClosestStopName(self, input):
    stoplist = self.stopslist
    input = input.lower().replace(" ", "_")
    scores = [
        textdistance.levenshtein(input, x[0].lower().replace(" ", "_"))
        for x in stoplist
    ]
    best_score = min(scores)
    best_name = stoplist[scores.index(best_score)]
    return [best_name, best_score]
def get_winner(self, notes):
    top_chords = self.top_chords.copy()
    query = notes_to_chroma(librosa.midi_to_note(notes))
    top_chords['chroma_dist'] = np.array([
        textdistance.levenshtein(query, chord)
        for chord in top_chords.chroma.values
    ])
    top_chords['dist'] = np.array([
        textdistance.levenshtein(notes, chord)
        for chord in top_chords.midi_notes
    ])
    # invert so that a larger chroma_dist means a closer chroma match
    top_chords['chroma_dist'] = (top_chords.chroma_dist.max()
                                 - top_chords.chroma_dist)
    candidates = top_chords[
        top_chords.dist == top_chords.dist.min()].sort_values(
            ['dist', 'chroma_dist'], ascending=False)
    winner = candidates.iloc[0]
    return winner.midi_notes
def _check_lieferando(entry_name: str, city: str) -> Optional[float]:
    probabilities = list()
    for e in restaurants:
        levenshtein_distance = textdistance.levenshtein(entry_name.lower(),
                                                        e.lower())
        # normalize into [0, 1]: 1.0 is an exact match
        score = max(1 - (levenshtein_distance / len(e)), 0)
        probabilities.append(score)
    return max(probabilities)
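# Worked example of the score above (hypothetical restaurant names): one edit
# against a nine-character name gives max(1 - 1/9, 0) ≈ 0.889.
import textdistance

d = textdistance.levenshtein('pizza hut', 'pizza hat')  # 1
print(max(1 - d / len('pizza hat'), 0))                 # 0.888...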
def distance(ground_truth, recognised_text):
    """Return the normalized number of edits between tokens.

    Edits are removals, insertions and replacements, weighted equally.
    The number of edits is divided by the number of tokens (in the
    ground_truth). A lower number is better.
    """
    yield textdistance.levenshtein(ground_truth,
                                   recognised_text) / len(ground_truth)
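# Hedged usage sketch of the generator above: textdistance.levenshtein also
# accepts token lists, so the edits counted here are whole-token removals,
# insertions and replacements (hypothetical transcripts).
gt = ['the', 'quick', 'brown', 'fox']
rec = ['the', 'quick', 'browm', 'fox', 'jumps']
print(next(distance(gt, rec)))  # 2 edits / 4 tokens = 0.5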
def compare(self, str1, str2):
    if self.debug:
        self.log("levenshtein comparison")
    self.start_time()
    self.result.distance = levenshtein(str1, str2)
    self.end_time()
    self.result.nos = max(len(str1), len(str2))
    self.result.threshold = 90
    # convert the edit distance into a percentage similarity
    self.result.similarity = (100.0 / float(self.result.nos)) * (
        self.result.nos - self.result.distance)
    return self.result
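# Worked example of the percentage formula in compare(): 'sunday' vs
# 'saturday' has distance 3 and longer length 8, so the similarity is
# (100 / 8) * (8 - 3) = 62.5.
import textdistance

d = textdistance.levenshtein('sunday', 'saturday')  # 3
nos = max(len('sunday'), len('saturday'))           # 8
print((100.0 / nos) * (nos - d))                    # 62.5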
def rank_hypotheses(toks, hypotheses, grams, leven_penalty=0.2, order=5):
    scores = []
    for hypo in hypotheses:
        scores.append([
            ' '.join(hypo),
            get_log_proba(hypo, grams=grams, order=order),
            len(hypo),
        ])
    d = pd.DataFrame(scores, columns=['text', 'lm', 'n'])
    d['leven'] = d.text.apply(
        lambda x: textdistance.levenshtein(x, ' '.join(toks)))
    # each edit adds log(leven_penalty) (a negative value) to the LM score
    d['penalty'] = np.log(leven_penalty) * d.leven
    d['score'] = d.lm + d.penalty
    d.sort_values('score', ascending=False, inplace=True)
    return d
def assignRemoveAttributesTitles(self):
    """
    If there are columns in the csv files that are attributes of a larger
    class, this function will group them in the appropriate dictionary.

    A column is considered an attribute of another column if its title has
    the following structure: columnname_attributename
    E.g. verb_translation, given that there is a separate column entitled
    'verb'.
    """
    primary = set()
    attribute = {}
    headers = set()
    for col_name in self.headers:
        if self.delimiter not in col_name:
            headers.add(col_name)
    if self.order:
        primary = set(self.order)
        for o in primary:
            if o not in headers:
                print("------------------")
                print("[FATAL ERROR]\n{} - Invalid column name. "
                      "Check order.csv file.".format(o))
                # suggest near-miss column names (within two edits)
                misspelt = [h for h in headers
                            if textdistance.levenshtein(o, h) <= 2]
                if misspelt:
                    print("Did you mean: {}?\n".format(", ".join(misspelt)))
                print("------------------")
                sys.exit()
    else:
        primary = headers
    for col_name in self.headers:
        if self.delimiter in col_name:
            prim_attr = col_name.split(self.delimiter)
            if prim_attr[0] in primary:
                attribute[col_name] = {
                    "primary": prim_attr[0],
                    "attribute": prim_attr[1],
                }
    return primary, attribute
def heuristic_renames(vcs_system_id, revision_hash):
    """Return the most probable rename from all FileActions; the rest count as DEL/NEW.

    There may be multiple renames of the same file in the same commit, e.g.,
    A->B, A->C. This is due to pygit2 and the Git heuristic for rename
    detection. This function uses another heuristic to detect renames by
    employing a string distance metric on the file name. This captures things
    like commons-math renames org.apache.math -> org.apache.math3.

    :param vcs_system_id: vcs system of the commit
    :param revision_hash: revision hash of the commit for which the renames are determined
    :return: Tuple of renames and added files. The renames are a list of
        tuples, where the first element in the tuple is the old name and the
        second element is the new name. The added files are a list.
    """
    renames = {}
    commit = Commit.objects(vcs_system_id=vcs_system_id,
                            revision_hash=revision_hash).only('id').get()
    for fa in FileAction.objects(commit_id=commit.id, mode='R'):
        new_file = File.objects.get(id=fa.file_id)
        old_file = File.objects.get(id=fa.old_file_id)
        if old_file.path not in renames.keys():
            renames[old_file.path] = []
        renames[old_file.path].append(new_file.path)

    true_renames = []
    added_files = []
    for old_file, new_files in renames.items():
        # only one file, easy
        if len(new_files) == 1:
            true_renames.append((old_file, new_files[0]))
            continue
        # multiple files, find the best match by edit distance
        min_dist = float('inf')
        probable_file = None
        for new_file in new_files:
            d = levenshtein(old_file, new_file)
            if d < min_dist:
                min_dist = d
                probable_file = new_file
        true_renames.append((old_file, probable_file))
        for new_file in new_files:
            if new_file == probable_file:
                continue
            added_files.append(new_file)
    return true_renames, added_files
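# Standalone sketch of the tie-breaking rule in heuristic_renames
# (hypothetical paths): among the candidate new paths, pick the one with the
# smallest Levenshtein distance to the old path.
from textdistance import levenshtein

old = 'src/org/apache/math/Solver.java'
candidates = ['src/org/apache/math3/Solver.java',
              'src/org/apache/util/Helper.java']
print(min(candidates, key=lambda new: levenshtein(old, new)))
# -> 'src/org/apache/math3/Solver.java'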
def listeSimilaire(request, Dico):
    liste = []
    taille = len(request)
    compteur = 1
    for motCle in request:
        # progress bar
        progress_bar_test.print_progress_bar(
            compteur, taille,
            prefix='Search word processed: ' + str(compteur) + '/' + str(taille),
            suffix='')
        compteur = compteur + 1
        if len(motCle) > 2:
            for key in Dico.keys():
                if textdistance.levenshtein(motCle, key) < 2:
                    liste.append(key)
        else:
            liste.append(motCle)
    return liste
def findClosest(txt, collection, collection_transformer=None, max_chars=None):
    best_score = float('inf')
    best_item = None
    for item in collection:
        if collection_transformer is not None:
            item = collection_transformer(item)
        item_comp = item
        if max_chars is not None:
            item_comp = item[:max_chars]
        d = textdistance.levenshtein(txt, item_comp)
        if d < best_score:
            best_score = d
            best_item = item
    return best_item, best_score
def sequenceDistance(dfEnsp, ref_dic, newcolresult, hamming, hammingNorm,
                     levenshtein, levenshteinNorm):
    res = []
    ham = []
    hamnorm = []
    lev = []
    levnorm = []
    serSeq = dfEnsp['proSequence'].copy()
    serID = dfEnsp['stableID_key'].copy()
    for inx, val in serSeq.items():
        p = str(val).strip()
        idd = str(serID[inx])
        # look up the canonical peptide sequence in the reference dict
        mypep = ref_dic[idd]
        if mypep == p:
            # identical to canonical
            res.append('True')
            ham.append('identical')
            hamnorm.append('identical')
            lev.append('identical')
            levnorm.append('identical')
        else:
            # not identical to canonical
            res.append('False')
            # Hamming distance penalizes positional differences (edit based)
            ham.append(textdistance.hamming(mypep, p))
            # normalized Hamming = mismatched positions / length of the longer sequence
            hamnorm.append(textdistance.hamming.normalized_distance(mypep, p))
            # Levenshtein is edit based but does not penalize position:
            # an insertion at position 1 is just 1 difference
            lev.append(textdistance.levenshtein(mypep, p))
            levnorm.append(
                textdistance.levenshtein.normalized_distance(mypep, p))
    dfEnsp.loc[:, newcolresult] = res
    dfEnsp.loc[:, hamming] = ham
    dfEnsp.loc[:, hammingNorm] = hamnorm
    dfEnsp.loc[:, levenshtein] = lev
    dfEnsp.loc[:, levenshteinNorm] = levnorm
    return dfEnsp
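# Why sequenceDistance records both metrics: Hamming penalizes every shifted
# position, while Levenshtein charges only for the single insertion
# (hypothetical peptide fragments).
import textdistance

print(textdistance.hamming('MKWVT', 'AMKWVT'))      # 6: every position shifted
print(textdistance.levenshtein('MKWVT', 'AMKWVT'))  # 1: one insertion at the front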
def inner_duplicates(hList):
    # map each holiday to its lowercase word set
    set_list = {h: set(h.lower().split()) for h in hList}
    new_list = []  # output list
    banned = set()
    for i, h1 in enumerate(hList):
        if h1 in banned:
            continue
        s1 = set_list[h1]
        matches = []
        for h2 in hList[i + 1:]:
            s2 = set_list[h2]
            intersection = s1.intersection(s2)
            difference = s1.symmetric_difference(s2)
            # accounts for single-word differences ("National" added or not)
            # and spelling differences (apostrophes)
            if len(intersection) > 1 and 1 <= len(difference) <= 2:
                if len(difference) > 1:
                    # checking edit distance changed the dataset from 5366
                    # entries to 5588 entries; only a match if the two
                    # strings are very similar
                    if levenshtein(h1.lower(), h2.lower()) < 3:
                        matches.append(h2)
                else:
                    # difference == 1 means just one word was added or
                    # removed, so they match
                    matches.append(h2)
        if matches:
            matches.append(h1)
            matches.sort(key=lambda x: len(x), reverse=True)  # longest first
            new_list.append(matches[0])  # the longest title is kept
            for m in matches:
                banned.add(m)
        else:
            new_list.append(h1)  # no matches, h1 is unique, keep it
    return new_list
def rank_primers(df):
    df['levenshtein'] = df.apply(lambda x: levenshtein(x.primer, x.parent),
                                 axis=1)
    df['gc_content'] = df.apply(lambda x: gc_content(x.primer), axis=1)
    df['gc_clamp'] = df.apply(lambda x: gc_clamp(x.primer), axis=1)
    df['gc_balance'] = abs(df.gc_content - 0.5)
    df.sort_values(['gc_balance', 'levenshtein', 'gc_clamp'],
                   ascending=[True, True, False], inplace=True)
    df.drop('gc_balance', axis='columns', inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df
def Seq_StringDistance(str_seq, str_ref, method="hamming"):
    # Note: the method names must be compared with ==, not `is`
    # (identity comparison of strings is unreliable).
    methods = {
        "hamming": textdistance.hamming,
        "levenshtein": textdistance.levenshtein,
        "damerau_lev": textdistance.damerau_levenshtein,
        "j-winkler": textdistance.jaro_winkler,
        "smith-waterman": textdistance.smith_waterman,
        "jaccard": textdistance.jaccard,
        "sorensen-dice": textdistance.sorensen_dice,
        "tversky": textdistance.tversky,
        "tanimoto": textdistance.tanimoto,
        "cosine": textdistance.cosine,
        "ratcliff": textdistance.ratcliff_obershelp,
        "bwt": textdistance.bwtrle_ncd,
    }
    dist = methods[method]
    return [dist(str_seq_i, str_ref) for str_seq_i in str_seq]
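# Hedged usage sketch of the dispatch above (hypothetical sequences).
seqs = ['ACGT', 'AGGT', 'TTTT']
print(Seq_StringDistance(seqs, 'ACGT', method='levenshtein'))  # [0, 1, 3]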
# In[9]:

hamming = textdistance.Hamming(external=False)
hamming('text', 'testit')

# # Levenshtein
#
# https://itnext.io/string-similarity-the-basic-know-your-algorithms-guide-3de3d7346227

# In[10]:

textdistance.levenshtein('arrow', 'arow')

# In[11]:

textdistance.levenshtein.normalized_similarity('arrow', 'arow')

# # Jaro Winkler
#
# https://itnext.io/string-similarity-the-basic-know-your-algorithms-guide-3de3d7346227

# In[12]: