Python levenshtein_distance示例，jellyfish.levenshtein_distance Python示例

示例#1

0

显示文件

def get_grades_for_course():
    """Return the current grade for the course that best matches the query.

    Reads ``access_token`` and ``course`` from the request arguments, requests
    the ``courses`` collection (including total scores) through
    ``canvas_requests`` and picks the course whose name or course code has the
    smallest Levenshtein distance to the query.

    Returns:
        A dict with ``grade``, ``score`` and ``course`` on success.
        ``(response, status)`` unchanged when the upstream status is not 200.
        ``({'error': ...}, 404)`` when the upstream call returns no courses.
    """
    token = request.args['access_token']
    query = request.args['course']
    status, response = canvas_requests.get(
        token, 'courses', params={'include[]': 'total_scores'})

    if status != 200:
        return response, status

    if not response:
        # Nothing to match against; previously this crashed with IndexError.
        return {'error': 'no courses found'}, 404

    def distance_to_query(course):
        # Levenshtein is a distance: smaller means more similar, so a course
        # is as close as the better of its name and its course code.
        return min(
            jellyfish.levenshtein_distance(course['name'], query),
            jellyfish.levenshtein_distance(course['course_code'], query))

    course = min(response, key=distance_to_query)
    enrollment = course['enrollments'][0]

    return {
        'grade': enrollment['computed_current_grade'],
        'score': enrollment['computed_current_score'],
        'course': course['name']
    }

示例#2

0

显示文件

文件： cho_utils.py 项目： Casist/lorewalker-cho

def levenshtein_ratio(source, target, ignore_case=True):
    """Calculates the levenshtein ratio between two strings.

    The ratio is computed as follows:
        (len(source) + len(target) - distance) / (len(source) + len(target))

    When ``ignore_case`` is true both strings are lower-cased and stripped
    first, and the lengths used in the ratio are those of the normalized
    strings, so the distance and the lengths describe the same text.

    This function has been ported from (MIT license):
        https://github.com/texttheater/golang-levenshtein/blob/4041401c6e7f6a2b49815c4aea652e518ca8e92e/levenshtein/levenshtein.go#L115-L130

    :param str source: first string
    :param str target: second string
    :param bool ignore_case: lower-case and strip both strings before comparing
    :rtype: float
    :return: the ratio; ``1.0`` when both (normalized) strings are empty
    """
    if ignore_case:
        source = source.lower().strip()
        target = target.lower().strip()

    total_len = len(source) + len(target)
    if total_len == 0:
        # Two empty strings are identical; this also avoids dividing by zero.
        return 1.0

    distance = jellyfish.levenshtein_distance(source, target)
    return (total_len - distance) / total_len

示例#3

0

显示文件

def distance_filter(df,
                    c,
                    thresh=3,
                    suffix1='_x',
                    suffix2='_y',
                    col1=None,
                    col2=None,
                    nonull=None):
    """Keep the rows whose two compared columns are within ``thresh`` edits.

    The compared columns are ``col1 + suffix1`` and ``col2 + suffix2`` when
    both ``col1`` and ``col2`` are given, otherwise ``c + suffix1`` and
    ``c + suffix2``. A ``distance`` column is written onto ``df`` itself
    before filtering.

    When ``nonull`` is None, rows with a null in either column get the fixed
    distance 10; with any other ``nonull`` value the distance is computed
    directly for every row.

    Returns the filtered frame (rows with ``distance <= thresh``).
    """
    use_explicit = (col1 is not None) and (col2 is not None)
    left = (col1 if use_explicit else c) + suffix1
    right = (col2 if use_explicit else c) + suffix2

    def raw_distance(row):
        return jf.levenshtein_distance(row[left], row[right])

    def null_safe_distance(row):
        # Rows with a missing value get the fixed placeholder distance.
        if pd.isnull(row[left]) | pd.isnull(row[right]):
            return 10
        return raw_distance(row)

    scorer = raw_distance if nonull is not None else null_safe_distance
    df['distance'] = df.apply(scorer, axis=1)

    return df[df.distance <= thresh]

示例#4

0

显示文件

文件： string_metrics.py 项目： francescoinfante/identity

 def extract(self, x, y):
     """Return the Levenshtein distance or similarity between ``x`` and ``y``.

     Both values are converted with ``unicode`` (the Python 2 builtin) once,
     and the converted strings are used for the distance and the lengths.

     Returns 0 when either value is None. When ``self.similarity`` is true
     the result is ``1 - distance / max(len)`` (1.0 for two empty strings);
     otherwise it is the raw edit distance.
     """
     if x is None or y is None:
         return 0
     ux, uy = unicode(x), unicode(y)
     distance = levenshtein_distance(ux, uy)
     if not self.similarity:
         return distance
     longest = max(len(ux), len(uy))
     if longest == 0:
         # Two empty strings are identical; avoids dividing by zero.
         return 1.0
     return 1 - float(distance) / longest

示例#5

0

显示文件

文件： stringsimilarity.py 项目： saelyne/ReLecture

def token_set_ratio(old_text, new_text):
    """Score the similarity of two texts on a 0-100 scale using token sets.

    Both texts are split into word tokens (letters, digits, underscores and
    apostrophes). The tokens are sorted and arranged as "shared tokens first,
    then the remaining ones", and three Levenshtein-based percentages are
    computed: shared vs. old, shared vs. new, and old vs. new. The best of
    the three is returned, rounded to one decimal.

    Returns 0 when either text contains no tokens.
    """
    word_pattern = r"[\w']+"
    old_tokens = re.findall(word_pattern, old_text)
    new_tokens = re.findall(word_pattern, new_text)
    if not old_tokens or not new_tokens:
        return 0

    old_tokens.sort()
    new_tokens.sort()

    shared = get_intersection(old_tokens, new_tokens)
    old_only = sorted(get_difference(shared, old_tokens))
    new_only = sorted(get_difference(shared, new_tokens))
    shared = sorted(shared)

    shared_text = " ".join(str(tok) for tok in shared)
    old_joined = " ".join(str(tok) for tok in shared + old_only)
    new_joined = " ".join(str(tok) for tok in shared + new_only)

    def percent(left, right, length):
        return 100 - jellyfish.levenshtein_distance(left, right) / length * 100

    scores = (
        percent(shared_text, old_joined, len(old_joined)),
        percent(shared_text, new_joined, len(new_joined)),
        percent(old_joined, new_joined,
                max(len(old_joined), len(new_joined))),
    )
    return round(max(scores), 1)

示例#6

0

显示文件

文件： parser.py 项目： peawyoyoyin/drug-reminder

def findToken(data, token, max_distance=2):
    """Fuzzy-search ``data`` for ``token`` using sliding windows.

    Windows of length ``len(token) + j``, ``len(token)`` and ``len(token) - j``
    (for ``j`` in ``1..max_distance``) are slid over ``data`` and scored with
    the Levenshtein distance to ``token``. Each window length contributes one
    row of scores to ``result``.

    Returns a tuple ``(matched_slice, remainder_of_data)`` when the lowest
    score is at most ``max_distance``. Returns None otherwise (including when
    any score row is empty).
    """
    result = []

    # Rows for windows LONGER than the token (len(token) + 1, + 2, ...).
    for j in range(1, max_distance + 1):
        tkl = len(token) + j
        if len(data) >= tkl:
            dl = []
            # NOTE(review): range(len(data) - tkl) stops one short of the last
            # full window (the one ending at the end of data) -- confirm
            # whether that window is skipped on purpose.
            for i in range(len(data) - tkl):
                distance = jf.levenshtein_distance(data[i:i + tkl], token)
                dl.append(distance)
            # Pad with the window length so the row has len(data) entries.
            for i in range(tkl):
                dl.append(tkl)
            result.append(dl)
        else:
            # data is shorter than the window: fill the row with len(token).
            dl = []
            for i in range(len(data)):
                dl.append(len(token))
            result.append(dl)

    # Row for windows of exactly len(token).
    if len(data) >= len(token):
        dl = []
        for i in range(len(data) - len(token)):
            distance = jf.levenshtein_distance(data[i:i + len(token)], token)
            dl.append(distance)
        for i in range(len(token)):
            dl.append(len(token))
        result.append(dl)
    else:
        dl = []
        for i in range(len(data)):
            dl.append(len(token))
        result.append(dl)

    # Rows for windows SHORTER than the token (len(token) - 1, - 2, ...).
    # NOTE(review): tkl can reach zero or go negative when
    # max_distance >= len(token); the rows then no longer have len(data)
    # entries -- confirm callers never hit this.
    for j in range(1, max_distance + 1):
        tkl = len(token) - j
        if len(data) >= tkl:
            dl = []
            for i in range(len(data) - tkl):
                distance = jf.levenshtein_distance(data[i:i + tkl], token)
                dl.append(distance)
            for i in range(tkl):
                dl.append(tkl)
            result.append(dl)
        else:
            dl = []
            for i in range(len(data)):
                dl.append(len(token))
            result.append(dl)

    if len(result) == 0:
        return
    for dl in result:
        if len(dl) == 0:
            return
    # Locate the globally smallest score as (row, position in data).
    eachResult = np.array(result)
    lowest_i = np.unravel_index(np.argmin(eachResult), eachResult.shape)
    if eachResult[lowest_i[0]][lowest_i[1]] <= max_distance:
        # The matched length is derived from the row index as
        # len(token) + max_distance - row.
        # NOTE(review): that formula matches rows ordered from the longest
        # window to the shortest, but the first loop above appends the longer
        # windows in ascending order -- confirm the intended row order.
        next_i = lowest_i[1] + len(token) + max_distance - lowest_i[0]
        return data[lowest_i[1]:next_i], data[next_i:]

示例#7

0

显示文件

文件： assistant.py 项目： sgvolpe/river

def get_matrix_distance(words_list, diagonal=True):
    """Build a square matrix of pairwise Levenshtein distances.

    With ``diagonal`` true only the diagonal and the cells above it are
    filled; the cells below it keep their initial empty-list placeholder.
    With ``diagonal`` false every cell is filled.
    """
    size = len(words_list)
    # Every cell starts as an empty list placeholder.
    matrix = [[[] for _ in range(size)] for _ in range(size)]
    for row, left in enumerate(words_list):
        for col, right in enumerate(words_list):
            if not diagonal or col >= row:
                matrix[row][col] = levenshtein_distance(left, right)
    return matrix

示例#8

0

显示文件

文件： fuzzy_string_comparison.py 项目： bcgrendel/Speechcoder

def get_closest_levenshtein(needle, haystack):
    """Return the item of ``haystack`` with the smallest edit distance.

    Ties are resolved in favour of the item that appears first. Returns None
    when ``haystack`` is empty.
    """
    closest = None
    closest_distance = None
    for candidate in haystack:
        distance = jellyfish.levenshtein_distance(needle, candidate)
        # Strict "<" keeps the earliest candidate on ties.
        if closest_distance is None or distance < closest_distance:
            closest, closest_distance = candidate, distance
    return closest

示例#9

0

显示文件

文件： bigram.py 项目： diahnuri/TMSS

def bigram_corr(line):
    """Correct the words of ``line`` using the known bigrams in ``fdist``.

    For every pair of adjacent words, each known bigram ``(first, second)`` is
    checked: if the second words are equal and the first word is within an
    edit distance below 5 of ``first``, the first word of the pair is replaced;
    otherwise, if the first words are equal and the second word is within an
    edit distance below 5 of ``second``, the second word of the pair is
    replaced. When several bigrams match, the last one iterated wins.

    Returns the corrected words joined by single spaces.
    """
    words = line.split()
    # The pairs come from slices taken up front, so replacements made below do
    # not change which original words are compared later.
    for idx, (word1, word2) in enumerate(zip(words[:-1], words[1:])):
        for first, second in fdist:
            if word2 == second and jf.levenshtein_distance(word1, first) < 5:
                # Replace the first word of THIS pair (was always words[0]).
                words[idx] = first
            elif word1 == first and jf.levenshtein_distance(word2, second) < 5:
                # Replace the second word of THIS pair (was always words[1]).
                words[idx + 1] = second
    return " ".join(words)

示例#10

0

显示文件

文件： geotag.py 项目： vidit-sh/SLOR_Back

def levProDistance(str1, str2):
    """Return a symmetric, word-level average Levenshtein distance.

    Both strings are split on single spaces. Each word is scored by its
    smallest edit distance to any word of the other string; the scores are
    averaged per string and the two averages are averaged again.
    """
    words1 = str1.split(" ")
    words2 = str2.split(" ")

    def total_nearest(words, others):
        # Sum, over ``words``, of the distance to the nearest word in ``others``.
        return sum(
            min([jf.levenshtein_distance(word, other) for other in others])
            for word in words)

    forward = total_nearest(words1, words2)
    backward = total_nearest(words2, words1)
    return ((backward * 1.0 / len(words2)) + (forward * 1.0 / len(words1))) / 2

示例#11

0

显示文件

def get_closest_levenshtein(needle, haystack):
    """Pick the entry of ``haystack`` nearest to ``needle`` by edit distance.

    The first entry wins on equal distances; an empty ``haystack`` gives None.
    """
    best = None
    for item in haystack:
        scored = (item, jellyfish.levenshtein_distance(needle, item))
        if best is None or scored[1] < best[1]:
            best = scored
    return None if best is None else best[0]

示例#12

0

显示文件

文件： matcher_by_text.py 项目： Raul-diffindo/Django-Matcher

 def compare_two_texts(self, string_a, string_b, normalize_value=True):
     """
     Compare two strings with the Levenshtein algorithm.

     Both arguments must be of the same kind (both ``unicode`` or both
     ``str``); otherwise TypeError is raised. With ``normalize_value`` true
     the distance is passed through ``self.__normalized_value``; otherwise
     the raw distance is returned.
     """
     same_kind = (
         (isinstance(string_a, unicode) and isinstance(string_b, unicode)) or
         (isinstance(string_a, str) and isinstance(string_b, str)))
     if not same_kind:
         raise TypeError

     distance = jellyfish.levenshtein_distance(string_a, string_b)
     if normalize_value:
         return self.__normalized_value(distance)
     return distance

示例#13

0

显示文件

文件： similar.py 项目： OlamideD/dcl

def jelly():
    """Print several similarity measures for two sample hospital names.

    Uses single-argument ``print(...)`` calls, which produce the same output
    under Python 2 and are valid syntax under Python 3.
    """
    import jellyfish
    a = u'Korle Bu Teaching Hospital Sickle Cell Dept'
    b = u'Korle Bu Teaching Hospital'
    print(jellyfish.levenshtein_distance(a, b))
    # NOTE(review): newer jellyfish releases expose this as jaro_similarity;
    # confirm against the installed version.
    print(jellyfish.jaro_distance(a, b))
    print(jellyfish.damerau_levenshtein_distance(a, b))

    from fuzzywuzzy import fuzz

    print(fuzz.ratio(a, b))

示例#14

0

显示文件

文件： server.py 项目： my13/GeoSpell

def final(mlf):
    """Suggest a corpus word for every non-empty input word.

    Soundex codes are obtained for the input (``call_soundex(mlf)``) and for
    the corpus (``call_soundex(utf_corpus())``). Every input/corpus pair is
    recorded with the edit distance between the soundex codes (``sx_dist``)
    and between the words themselves (``lv_dist``). For each input word the
    candidates with ``lv_dist <= 2`` are ranked and the corpus word occurring
    most often among the top ten is selected.

    Returns the list of selected corpus words.
    Raises ValueError (from ``idxmax``) when a word has no candidate.
    """
    print(mlf)
    sdx_input = call_soundex(mlf)
    sdx_raw = call_soundex(utf_corpus())

    rows = []
    for (i, j), (k, v) in product(sdx_input.items(), sdx_raw.items()):
        # Keys look like "<word>-<suffix>"; only the word part is compared.
        word = i.split('-')[0]
        corpus_word = k.split('-')[0]
        rows.append((
            word,
            j,
            corpus_word,
            v,
            jellyfish.levenshtein_distance(j, v),
            jellyfish.levenshtein_distance(word, corpus_word),
        ))
    df = pd.DataFrame(rows,
                      columns=[
                          'wrd',
                          'sx_wrd',
                          'cpr',
                          'sx_cpr',
                          'sx_dist',
                          'lv_dist',
                      ])
    print(df.head(5))

    min_df_lv = df[df['lv_dist'] <= 2]

    words = list(mlf)
    selected = []
    for i in range(0, len(mlf)):
        if len(mlf[i]) > 0:
            # NOTE(review): ascending=False keeps the ten LARGEST soundex
            # distances -- confirm this ordering is intended.
            x = min_df_lv[min_df_lv['wrd'] == words[i]].sort_values(
                by='sx_dist', ascending=False).head(10)
            s = x.groupby(['cpr'])['wrd'].transform('count')
            # .ix was removed from pandas; idxmax() returns an index label,
            # so label-based .loc is the direct replacement.
            choice = x['cpr'].loc[s.idxmax()]
            selected.append(choice)
            print(choice)
    return selected

示例#15

0

显示文件

文件： learning.py 项目： hellais/ooni-lyzer

def suggest_normalizations(sample, threshold=1.0):
    """
    Attempts to identify spelling mistakes between pairs of terms (a, b) using the Levenshtein distance metric, which is
    defined as the minimum edit distance between two strings. In order to identify candidates for replacement, we define
    a similarity measure (s) as follows:

    d = jellyfish.levenshtein_distance(a, b)
    s = floor((len(a) / d + len(b) / d) / 2)

    Each unordered pair of distinct terms is evaluated once, in order of decreasing term frequency.

    :param sample: a list of terms to use
    :param threshold: the threshold the similarity must exceed
    :return: a list of Suggestion candidates for normalization
    :raises ValueError: if sample is not a list
    """
    if not isinstance(sample, list):
        raise ValueError("normalize() expects scalar-valued arrays as input (e.g. a = [1, 2, 3])")

    terms = [term for term, _ in collections.Counter(sample).most_common()]
    suggestions = []
    for idx, t1 in enumerate(terms):
        # Only look at later terms so each unordered pair is visited once.
        for t2 in terms[idx + 1:]:
            d = jellyfish.levenshtein_distance(t1, t2)
            if d == 0:
                # Distinct terms that compare as identical: treat as maximally similar
                # instead of dividing by zero.
                similarity = float('inf')
            else:
                similarity = ((len(t1) / d) + (len(t2) / d)) // 2.0
            if similarity > threshold:
                suggestions.append(Suggestion(a=t1, b=t2, edit_distance=d))
    return suggestions

示例#16

0

显示文件

文件： load-data.py 项目： grdscarabe/information-design

def get_insee(postcode, name):
    """
    Convert a postcode to an insee code.

    If the upper-cased town name is not an exact match for the postcode, the
    candidate with the smallest Levenshtein distance is used (the first one
    on ties) and the name is recorded in ``problematicTown``. Unknown
    postcodes are recorded in ``problematicPost``.

    Returns the insee code, or None when the postcode is unknown or has no
    candidate towns.
    """
    global problematicTown
    global problematicPost

    # "in" replaces dict.has_key(), which does not exist on Python 3.
    if postcode not in post2insee:
        # No match on postcode...
        problematicPost.add(postcode)
        return None

    towns = post2insee[postcode]
    town = name.upper()
    if town in towns:
        # Perfect match!
        return towns[town]

    # No perfect match, look for best candidate
    best = None
    if towns:
        best = min(
            towns,
            key=lambda candidate: jellyfish.levenshtein_distance(town, candidate))
    problematicTown.add(town)
    if best is None:
        return None
    return towns[best]

示例#17

0

显示文件

文件： create_code_switching_feats.py 项目： praveen-1/alignment-with-openfst

def get_fuzzy_dict_features(w, s, dict_name=u'fuzzy', distance=5):
    """Return a feature dict flagging a fuzzy word-list match for ``w``.

    The feature ``wordlist-<dict_name>`` is set to 1 when any candidate in
    ``s`` is within an edit distance below ``distance`` of the lower-cased
    word. Every candidate is checked (previously only the first one was), and
    an empty dict is returned when nothing matches or ``s`` is empty.
    """
    import jellyfish
    to_return = dict()
    lowered = w.lower()
    if any(jellyfish.levenshtein_distance(lowered, cand) < distance
           for cand in s):
        to_return[u'wordlist-{}'.format(dict_name)] = 1
    return to_return

示例#18

0

显示文件

文件： mtg_find_card_tesseract.py 项目： olikraus/scad

def find_card(carddic, s):
  """Find the card name in ``carddic`` closest to the string ``s``.

  Both strings are first normalized with a translation table that maps
  look-alike characters (dashes, o-like, i-like, accented vowels, ...) onto a
  single representative, then compared by Levenshtein distance. Each time a
  better candidate is found, the normalized pair is printed.

  Returns ``[carddic[name], name, distance]`` for the closest name (the first
  one on ties).
  Raises KeyError when ``carddic`` is empty.
  """
  t = { 
  8209: 45, 8211:45, # convert dash
  48: 111, 79: 111, # convert zero and uppercase O to small o
  211: 111, 212: 111, 214: 111, # other chars similar to o
  242: 111, 243: 111, 244: 111, 245: 111, 246: 111, # other chars similar to o
  959:111, 1086:111, 8009:111, 1054:111,    # other chars similar to o
  73:105, 74:105, 106:105, 108:105, 124:105, # convert upper i, upper j, small j, small l and pipe symbol to small i
  161:105, 205:105, 206:105, 236:105, 237:105, 238:105, 239:105, 1575:105,  # convert other chars to i
  192: 65, 193: 65, 194: 65, 196: 65, 1040:65, 1044:65,         # upper A
  200: 69, 201: 69, 202: 69, 1045:69,   # upper E
  85:117,  # convert upper U to small u
  218: 117, 220: 117,  # other conversions to small u
  249: 117, 250: 117, 251: 117, 252: 117, # other conversions to small u
  956: 117, 1094: 117,
  224: 97, 225: 97, 226: 97, 227: 97, 228: 97, 229: 97, # small a conversion
  232: 101, 233: 101, 234: 101, 235: 101 # small e conversion
  }

  # The query does not change inside the loop, so normalize it once.
  target = s.translate(t)
  dmin = None
  smin = None
  for c in carddic:
    normalized = c.translate(t)
    d = jellyfish.levenshtein_distance(normalized, target)
    # No numeric sentinel: the first candidate always becomes the best so far.
    if dmin is None or dmin > d:
      dmin = d
      smin = c
      print(normalized + "/"+ target)
  if smin is None:
    raise KeyError("carddic is empty; no card to match against")
  return [carddic[smin], smin, dmin]

示例#19

0

显示文件

    def find_match_levenshtein(self, token, canonical):
        """Collect dictionary words close to ``token`` by edit distance.

        Each entry of ``self.dicts`` is decoded as UTF-8 and lower-cased
        before being compared with ``token``. An entry is kept when its
        distance does not exceed the current threshold, which starts at 2 and
        is lowered to the distance of every kept entry.

        Returns ``(candidates, is_match)`` where ``candidates`` holds the
        lower-cased kept entries and ``is_match`` tells whether ``canonical``
        is among them.
        """
        threshold = 2
        candidates = []
        for entry in self.dicts:
            distance = jellyfish.levenshtein_distance(
                token, entry.decode("utf-8").lower())
            if distance > threshold:
                continue
            threshold = distance
            candidates.append(entry.lower())

        is_match = any(candidate == canonical for candidate in candidates)

        return candidates, is_match

示例#20

0

显示文件

文件： sanity_check.py 项目： serinachang5/bechdel

def checkID_gorinski(movies):
    """Check that each movie's title matches the title stored for its ID.

    For every item, the movie is fetched by ``item[-1]`` and its string form
    is compared with ``str(item[0])``; an edit distance of 15 or more counts
    as a mismatch.

    Returns a tuple ``(percent_correct_as_str, mismatching_items)``. For an
    empty ``movies`` sequence the result is ``("0.0", [])`` instead of a
    division by zero.
    """
    if len(movies) == 0:
        return ("0.0", [])

    movie_db = imdb.IMDb()
    id_mismatch = []
    checked = 0

    for item in movies:
        movie_by_ID = movie_db.get_movie(item[-1])
        checked += 1
        if jelly.levenshtein_distance(str(item[0]), str(movie_by_ID)) >= 15:
            id_mismatch.append(item)

    correct = checked - len(id_mismatch)
    return (str(correct / checked * 100), id_mismatch)

示例#21

0

显示文件

文件： integrator.py 项目： liangsi/2d_nebula

def union_names(anidb_names, absolute_names):
    """Merge two name lists, dropping near-duplicates of ``anidb_names``.

    A name from ``absolute_names`` is a near-duplicate when its UTF-8
    encoding is within an edit distance below 5 of some anidb name. Each
    absolute name can be matched by at most one anidb name.

    Returns all anidb names followed by the absolute names that were not
    matched. When one input is empty the other is returned as is; neither
    input list is modified.
    """
    if not anidb_names and not absolute_names:
        return []

    if not anidb_names:
        return absolute_names

    if not absolute_names:
        return anidb_names

    unmatched = list(absolute_names)
    matched = set()

    # Walk the anidb names from last to first, as the original pop() did.
    for anidb_name in reversed(list(anidb_names)):
        for index, name in enumerate(unmatched):
            simi = jellyfish.levenshtein_distance(
                anidb_name, name.encode('utf-8'))

            if simi < 5:
                # Remove the name that actually matched (not the first one),
                # then stop iterating the list we just modified.
                del unmatched[index]
                matched.add(name)
                break

    # Build a new list so the caller's anidb_names is left untouched.
    return list(anidb_names) + [
        name for name in absolute_names if name not in matched
    ]

示例#22

0

显示文件

文件： abdb_prepdata_sup_fig1.py 项目： evqlv-tech/manuscript_ab_epitope_interaction

def get_levenshtein_agseq():
    '''
    Compute the Levenshtein distance between the antigen sequences of every
    ordered pair of distinct pdbids and write the result to a CSV file.

    The antigen sequence of a pdbid is the ``a_sequence`` of its first row in
    the input file. The output is written next to the input with the suffix
    ``_antigen_full_ld.csv``.
    :return: None
    '''
    infile = 'abdb_outfiles_2019/heavy_light_ag_aaseq.csv'
    df = pd.read_csv(infile)
    print(df.info())
    # One (pdbid, sequence) pair per pdbid, taken from its first row and kept
    # in order of first appearance; computed once instead of re-filtering the
    # frame inside the nested loop.
    firsts = df.drop_duplicates(subset='pdbid')
    antigens = list(zip(firsts.pdbid, firsts.a_sequence))
    data = []
    for i, (pdbid, agseq1) in enumerate(antigens):
        print('computing %s #%s' % (pdbid, i))
        for pdbid2, agseq2 in antigens:
            if pdbid2 != pdbid:
                ld = jellyfish.levenshtein_distance(agseq1, agseq2)
                data.append([pdbid, pdbid2, agseq1, agseq2, ld])
    colnames = ['pdbid1', 'pdbid2', 'agseq1', 'agseq2', 'ld']
    lddf = pd.DataFrame(data, columns=colnames)
    outname = infile[:-4] + '_antigen_full_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)

示例#23

0

显示文件

文件： sanity_check.py 项目： serinachang5/bechdel

def checkID_agarwal(movies):
    """Sanity-check movie IDs against titles, with a script-file fallback.

    For every item, the movie is fetched by ``item[-1]`` and its string form
    is compared with ``str(item[0])``. If the edit distance is 10 or more,
    the first 20 lines of the file ``item[2]`` are searched for the movie's
    year and for any of its writers; the item only fails when neither the
    year nor any writer is found.

    Returns a tuple ``(percent_correct_as_str, mismatching_items)``. For an
    empty ``movies`` sequence the result is ``("0.0", [])`` instead of a
    division by zero.
    """
    if len(movies) == 0:
        return ("0.0", [])

    movie_db = imdb.IMDb()
    correct = 0
    incorrect = 0
    id_mismatch = []

    for item in movies:
        movie_by_ID = movie_db.get_movie(item[-1])

        # if levenshtein distance test fails for movie title, continue to
        # check for movie year and writers
        if jelly.levenshtein_distance(str(item[0]), str(movie_by_ID)) >= 10:
            year = str(movie_by_ID["year"])
            writers = [str(w) for w in movie_by_ID["writer"]]
            with open(item[2]) as fp:
                content = fp.readlines()[:20]
            year_found = any(year in line for line in content)
            # True when ANY writer appears in ANY line (previously only the
            # last writer counted, and an empty writer list raised NameError).
            writer_found = any(w in line for w in writers for line in content)
            if not year_found and not writer_found:
                print("Sanity check failed: \n Year or writer mismatch found. \n {} {}".format(item[-1], item[0]), "\n")
                incorrect += 1
                id_mismatch.append(item)
            else:
                print("Sanity check passed: \n {} {}".format(item[-1], item[0]), "\n")
                correct += 1
        else:
            print("Sanity check passed: \n {} {}".format(item[-1], item[0]), "\n")
            correct += 1

    return (str(correct / (correct + incorrect) * 100), id_mismatch)

示例#24

0

显示文件

def stringLevensteinFraction(s1, s2, recogHash=False):
    """Return the Levenshtein similarity of two strings as a fraction.

    Spaces are removed from both strings before comparing; with
    ``recogHash`` true they are first passed through ``removeHashNSpace``.
    The result is ``1 - distance / max(len(s1), len(s2))``, or ``1.0`` when
    both strings are empty after cleaning.
    """
    if recogHash:
        s1 = removeHashNSpace(s1)
        s2 = removeHashNSpace(s2)
    s1 = s1.replace(" ", "")
    s2 = s2.replace(" ", "")
    longest = max(len(s1), len(s2))
    if longest == 0:
        # Two empty strings are identical; avoids dividing by zero.
        return 1.0
    # float() keeps this a true division on Python 2 as well.
    return 1 - jf.levenshtein_distance(s1, s2) / float(longest)

示例#25

0

显示文件

文件： main.py 项目： MiLk/adventofcode

def p2(lines):
    """Return ``common(l1, l2)`` for the first two lines one edit apart.

    Pairs are examined in input order. Returns None when no two lines have a
    Levenshtein distance of exactly 1.
    """
    import itertools
    from jellyfish import levenshtein_distance
    # combinations() visits each unordered pair once; the first hit is the
    # same pair the full product would find, with half the comparisons and no
    # self-pairs.
    for l1, l2 in itertools.combinations(lines, 2):
        if levenshtein_distance(l1, l2) == 1:
            return common(l1, l2)
    return None

示例#26

0

显示文件

文件： abdb_prepdata_sup_fig1.py 项目： evqlv-tech/manuscript_ab_epitope_interaction

def get_levenshtein_epitopeseq():
    '''
    Compute the Levenshtein distance between the epitope sequences of every
    ordered pair of rows with different pdbids and write the result to a CSV
    file.

    Rows without an epitope are dropped first, as in
    ``get_levenshtein_segments_epitope``, because the distance is only
    defined for strings.

    NOTE(review): the output name ``..._antigen_epitope_ld.csv`` is the same
    one ``get_levenshtein_segments_epitope`` writes to -- confirm that
    overwriting is intended.
    :return: None
    '''
    infile = 'abdb_outfiles_2019/heavy_light_ag_aaseq.csv'
    df = pd.read_csv(infile).iloc[:]
    print(df.info())
    df = df.dropna(subset=['epitope'])
    data = []
    for _, row in df.iterrows():
        pdbid = row.pdbid
        epitopeseq1 = row.epitope
        for _, row2 in df.iterrows():
            pdbid2 = row2.pdbid
            if pdbid2 != pdbid:
                epitopeseq2 = row2.epitope
                ld = jellyfish.levenshtein_distance(epitopeseq1, epitopeseq2)
                datum = [pdbid, pdbid2, epitopeseq1, epitopeseq2, ld]
                data.append(datum)
    colnames = ['pdbid1', 'pdbid2', 'epitopeseq1', 'epitopeseq2', 'ld']
    lddf = pd.DataFrame(data, columns=colnames)
    print(lddf.head())
    outname = infile[:-4] + '_antigen_epitope_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)

示例#27

0

显示文件

文件： abdb_prepdata_sup_fig1.py 项目： evqlv-tech/manuscript_ab_epitope_interaction

def get_levenshtein_segments_epitope():
    '''
    Compute, region by region, the Levenshtein distance between the epitopes
    of every ordered pair of rows with different pdbids, and write the result
    to a CSV file. Rows without an epitope are dropped first.
    :return: None
    '''
    infile = 'abdb_outfiles_2019/heavy_light_ag_aaseq.csv'
    frame = pd.read_csv(infile).iloc[:]
    print(frame.info())
    frame = frame.dropna(subset=['epitope'])
    records = []
    for region in frame.region.unique():
        region_rows = frame[frame.region == region]
        print(region)
        print(region_rows.shape)
        for _, first in region_rows.iterrows():
            print('seq1 %s' % first.epitope)
            seq1 = first.epitope
            pdbid = first.pdbid
            for _, second in region_rows.iterrows():
                pdbid2 = second.pdbid
                if pdbid == pdbid2:
                    # Only rows from different structures are compared.
                    continue
                print('seq2 %s' % second.epitope)
                seq2 = second.epitope
                ld = jellyfish.levenshtein_distance(seq1, seq2)
                records.append([pdbid, pdbid2, region, seq1, seq2, ld])
    colnames = ['pdbid1', 'pdbid2', 'region', 'epitope1', 'epitope2', 'ld']
    lddf = pd.DataFrame(records, columns=colnames)
    print(lddf.head())
    outname = infile[:-4] + '_antigen_epitope_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)

示例#28

0

显示文件

文件： abdb_prepdata_sup_fig1.py 项目： evqlv-tech/manuscript_ab_epitope_interaction

def get_levenshtein_segments():
    '''
    Compute, segment by segment, the Levenshtein distance between the segment
    sequences of every ordered pair of rows with different pdbids, and write
    the result to a CSV file next to the input (suffix ``_ld.csv``).
    :return: None
    '''
    infile = 'abdb_outfiles_2019/abdb_segment_absequence_full_vgene_imgt_vgene.csv'
    frame = pd.read_csv(infile)
    print(frame.info())
    records = []
    for segment in frame.segment.unique():
        segment_rows = frame[frame.segment == segment]
        print(segment)
        print(segment_rows.shape)
        # Progress counter: number of rows processed so far in this segment.
        for counter, (_, first) in enumerate(segment_rows.iterrows(), start=1):
            print(counter)
            seq1 = first.segment_seq
            pdbid = first.pdbid
            for _, second in segment_rows.iterrows():
                pdbid2 = second.pdbid
                if pdbid == pdbid2:
                    continue
                seq2 = second.segment_seq
                ld = jellyfish.levenshtein_distance(seq1, seq2)
                records.append([pdbid, pdbid2, segment, seq1, seq2, ld])
    colnames = ['pdbid1', 'pdbid2', 'segment', 'seq1', 'seq2', 'ld']
    lddf = pd.DataFrame(records, columns=colnames)
    print(lddf.head())
    outname = infile[:-4] + '_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)

示例#29

0

显示文件

        def levenshtein_apply(pair):
            """Return the best normalized Levenshtein similarity of a pair.

            ``pair`` holds two collections of strings. Every source/target
            combination is scored as ``1 - distance / longest length`` and the
            highest score is returned. NaN is returned when the pair contains
            null values; a combination that raises TypeError because one of
            its members is null is scored with ``self.missing_value``.
            """
            if _pair_has_any_null(pair):
                LOGGER.debug(
                    "Can't compute Levenshtein distance, "
                    "the pair contains null values: %s",
                    pair,
                )
                return np.nan

            sources, targets = pair
            similarities = []

            for src in sources:
                for tgt in targets:
                    try:
                        distance = jellyfish.levenshtein_distance(src, tgt)
                        longest = np.max([len(src), len(tgt)])
                        similarities.append(1 - distance / longest)
                    except TypeError:
                        # Only null members are tolerated; anything else is
                        # a genuine error.
                        if not (pd.isnull(src) or pd.isnull(tgt)):
                            raise
                        similarities.append(self.missing_value)

            return max(similarities)

示例#30

0

显示文件

文件： main.py 项目： LJWill/kt-assignment1

def apply_soundex(misspell, dictionary):
    """Predict corrections for misspelled words by comparing soundex codes.

    For every word of ``misspell``:
      * a word found in ``dictionary`` or containing '/' is kept as is;
      * otherwise the five dictionary words whose soundex code has the
        smallest edit distance to the word's soundex code are collected
        (ties keep dictionary order).

    Progress is printed on a single, carriage-return-terminated line.

    Returns a list with one entry per input word: either the word itself or
    the list of (up to five) predicted words.
    """
    count = 0
    result = []
    # Soundex codes of the dictionary, computed once on first use rather than
    # once per misspelled word.
    dict_codes = None

    for mis_word in misspell:
        if mis_word in dictionary or '/' in mis_word:
            # Known word, or a word containing '/' (not predicted).
            result.append(mis_word)
        else:
            if dict_codes is None:
                dict_codes = [(dict_word, jf.soundex(dict_word))
                              for dict_word in dictionary]
            soundex_mis = jf.soundex(mis_word)
            predict_words = [
                (dict_word, jf.levenshtein_distance(soundex_mis, code))
                for dict_word, code in dict_codes
            ]
            first_five_pred = sorted(predict_words,
                                     key=operator.itemgetter(1))[:5]
            result.append([word for word, _ in first_five_pred])

        count += 1
        print("Processing: {} / {}".format(count, len(misspell)), end='\r')

    return result

示例#31

0

显示文件

文件： merger.py 项目： andriiaprysiazhnyk/recipeRecommendation

def max_distance(set1, set2):
    """Return the highest normalized Levenshtein similarity across two sets.

    Every element of ``set1`` is compared with every element of ``set2`` and
    scored as ``1 - distance / longest length``; the best score is returned.
    Returns 0 when either collection is empty.
    """
    if 0 in (len(set1), len(set2)):
        return 0

    similarities = [
        1 - jellyfish.levenshtein_distance(e1, e2) / max(len(e1), len(e2))
        for e2 in set2
        for e1 in set1
    ]
    return max(similarities)

示例#32

0

显示文件

def getSimilarityRpt(similar, hash, base_tlsh):
    """Build a JSON similarity report from a TLSH query response.

    ``similar`` is the JSON text of the response. When its ``query_status``
    is not "ok", that status string is returned. Otherwise every entry of
    ``data`` whose ``sha1_hash`` differs from ``hash`` is printed and added
    to the report with a similarity percentage derived from the Levenshtein
    distance between ``base_tlsh`` and the entry's ``tlsh``.

    Returns the report as a JSON string indented by four spaces.
    """
    # hash is a sha1, support other main hashes
    report = json.loads(similar)
    base_len = len(base_tlsh)
    summary = {
        'sha1_hash': hash,
        'tlsh': base_tlsh,
        'data': [],
    }
    if report['query_status'] != "ok":
        return report['query_status']

    print(str(len(report['data']) - 1) + " similar files to ")
    print('Base : ' + hash + '  tlsh: ' + base_tlsh)
    for entry in report['data']:
        dist = jellyfish.levenshtein_distance(base_tlsh, entry['tlsh'])
        percent = str(round(100 * ((base_len - dist) / base_len), 2)) + "%"
        if entry['sha1_hash'] == hash:
            # The queried file itself is not listed.
            continue
        print('Sha1 : ' + entry['sha1_hash'] + '  tlsh: ' + entry['tlsh'] +
              ' Similar at : ' + percent + '  tags:  ' +
              str(entry['tags']))
        summary['data'].append({
            'sha1_hash': entry['sha1_hash'],
            'tlsh': entry['tlsh'],
            'similar': percent,
        })
    return json.dumps(summary, indent=4)

示例#33

0

显示文件

文件： selectOfertas.py 项目： raulolles/Retrogaming

def select_busqueda(origen_datos, id_user, palabra_busq):
    """Select items for a user based on a fuzzy search term.

    For every item, the lower-cased first field is split into words and the
    item is scored with the smallest Levenshtein distance between the search
    term and any of those words (0 when the term is contained in a word).
    The scores are handed to ``crea_tabla_slc`` and ``ejecuta_seleccion``,
    whose result is returned.
    """
    unid_select = 15
    y, r, items = importa_tablas_2(origen_datos)
    r0 = r[:, id_user]

    # Build the list of (Levenshtein) distances, one per item.
    palabra_busq = palabra_busq.lower()
    distancia = []
    for i in range(len(r0)):
        titulo = items.loc[i][0].lower()
        mejor = np.inf
        for parte in titulo.split():
            if palabra_busq in parte:
                # A substring hit counts as distance zero.
                mejor = min(mejor, 0)
            mejor = min(mejor, jel.levenshtein_distance(palabra_busq, parte))
        distancia.append(mejor)

    jugado = 3

    tabla_slc = crea_tabla_slc(distancia, r0, False, jugado)
    return ejecuta_seleccion(id_user, items, y, r, unid_select, tabla_slc)

示例#34

0

显示文件

文件： classifyAndCalculateFMeasure.py 项目： crizzy/dpdc-exercises

def get_levenshtein_avg(row1, row2):
    """Return the mean Levenshtein similarity over columns 1 to 14.

    Each column pair is scored as ``1 - distance / longest length``; two
    empty fields count as identical (similarity 1) instead of dividing by
    zero. ``range`` is used so the function runs on Python 2 and Python 3.
    """
    total = 0
    for column_index in range(1, 15):
        a = row1[column_index]
        b = row2[column_index]
        longest = max(len(a), len(b))
        if longest == 0:
            total += 1
            continue
        total += 1 - jellyfish.levenshtein_distance(a, b) / float(longest)
    return total / 14.0

示例#35

0

显示文件

文件： commission.py 项目： kuwas/AzurLaneAutoScript

    def commission_name_parse(self, string):
        """
        Map a commission name to its genre by fuzzy keyword matching.

        ASCII characters are stripped from the name, which is then compared
        with every keyword in ``dictionary_jp``; the genre of the closest
        keyword is used when its edit distance is below 3. Otherwise a
        warning is logged and ``self.valid`` is set to False.

        Args:
            string (str): Commission name, such as 'NYB要员护卫'.

        Returns:
            str: Commission genre, such as 'urgent_gem', or '' if unknown.
        """
        import jellyfish
        best_key = ''
        best_distance = 100
        string = re.sub(r'[\x00-\x7F]', '', string)
        for genre, keywords in dictionary_jp.items():
            for keyword in keywords:
                distance = jellyfish.levenshtein_distance(keyword, string)
                if distance >= best_distance:
                    continue
                best_key, best_distance = genre, distance

        if best_distance >= 3:
            logger.warning(f'Name with unknown genre: {string}')
            self.valid = False
            return ''
        return best_key

示例#36

0

显示文件

文件： searcher.py 项目： opendatamonitor/ckanext-harmonisation

    def near_dup_search(self,data,max_dist,content,md5,query,db_conn):
        """Find exact and approximate duplicates of a document.

        The entries of ``query`` are concatenated into a signature and used
        to look up documents with the same ``content_sg``, a greater ``_id``
        and a different ``catalogue_url``. A result with the same md5 hash is
        an exact match; otherwise it is an approximate match when its content
        is within ``max_dist * len(content)`` edits of ``content``.

        Returns a defaultdict(list) with the keys 'Exact' and/or
        'Approximate'. Approximate matches are dropped when exact matches
        exist.
        """
        q = ''.join(str(mh) for mh in query)
        results = db_conn[self.db][self.collection].find({'content_sg':q,
            '_id':{'$gt':ObjectId(data['_id'])},
            'catalogue_url':{'$ne':data['catalogue_url']},
            })

        matches = defaultdict(list)
        for result in results:
            if md5 == result['md5_hash']:
                matches['Exact'].append(result)
            elif jellyfish.levenshtein_distance(content,
                    result['content']) < max_dist*len(content):
                matches['Approximate'].append(result)

        # ('Exact' and 'Approximate') evaluated to the string 'Approximate',
        # so the old check iterated its characters and never fired. Membership
        # tests do not create keys on a defaultdict.
        if 'Exact' in matches and 'Approximate' in matches:
            del matches['Approximate']

        return matches

示例#37

0

显示文件

文件： alldist-with-graph.py 项目： fabbasi/IDS-scripts

def alldist(filex, filey):
    """Compute several distance measures between the contents of two files.

    :param filex: path of the first file.
    :param filey: path of the second file.
    :return: tuple ``(res, dres, jaro, jarowink, ham, kl, spsum)`` where
        ``res``/``dres``/``ham`` are the Levenshtein, Damerau-Levenshtein and
        Hamming distances divided by 100, ``jaro``/``jarowink`` are one minus
        the jellyfish Jaro / Jaro-Winkler values, ``kl`` is
        ``kldiv(tokenize(x), tokenize(y))`` and ``spsum`` is
        ``(100 - spamsum.match(x, y)) / 100``.
    """
    # Context managers guarantee both handles are closed, even on a read error.
    with open(filex, 'r') as handle:
        xread = handle.read()
    with open(filey, 'r') as handle:
        yread = handle.read()

    lvd = jellyfish.levenshtein_distance(xread, yread)
    dlvd = jellyfish.damerau_levenshtein_distance(xread, yread)

    # spamsum.match result is turned into a distance: (100 - score) / 100.
    spsum = spamsum.match(xread, yread)
    spsum = 100 - spsum
    spsum = float(spsum / 100.00)

    res = float(lvd / 100.00)
    dres = float(dlvd / 100.00)

    jaro = jellyfish.jaro_distance(xread, yread)
    ## Added jaro-winkler distance by fahim 20111011
    jarowink = jellyfish.jaro_winkler(xread, yread)
    # Convert the two values into distances.
    jaro = 1.0 - jaro
    jarowink = 1.0 - jarowink

    ham = jellyfish.hamming_distance(xread, yread)
    ham = float(ham / 100.00)
    # Single-argument print call: emits the same text as the former
    # Python 2 statement `print "Hamming Distance = ", ham` on 2 and 3.
    print("%s %s" % ("Hamming Distance = ", ham))

    kl = kldiv(tokenize(xread), tokenize(yread))

    return res, dres, jaro, jarowink, ham, kl, spsum

示例#38

0

显示文件

def add_query_features(df, inc, exc, k1list, k2list):
    """
    Build a copy of ``df`` enriched with constant query-summary columns.

    Added columns, in order: the number of entries and the longest entry
    length of ``k1list`` and ``k2list``, followed by five similarity /
    distance values computed between ``inc`` and ``exc`` (Jaro, Levenshtein,
    Jaccard, Sorensen, Ratcliff-Obershelp). ``df`` itself is not modified.
    """
    longest_k1 = max([len(entry) for entry in k1list])
    longest_k2 = max([len(entry) for entry in k2list])

    enriched = df.copy()
    enriched['k1_count'] = len(k1list)
    enriched['k2_count'] = len(k2list)
    enriched['k1_max'] = longest_k1
    enriched['k2_max'] = longest_k2

    # Pairwise measures between the include and exclude texts; the tuple
    # order fixes both the evaluation order and the column order.
    pairwise = (
        ('inc_jaro_exc', jellyfish.jaro_distance(inc, exc)),
        ('inc_lev_exc', jellyfish.levenshtein_distance(inc, exc)),
        ('inc_ji_exc', textdistance.jaccard(inc, exc)),
        ('inc_sd_exc', textdistance.sorensen(inc, exc)),
        ('inc_ro_exc', textdistance.ratcliff_obershelp(inc, exc)),
    )
    for column, value in pairwise:
        enriched[column] = value
    return enriched

示例#39

0

显示文件

文件： find-missing-insee.py 项目： grdscarabe/information-design

def get_insee(postcode, name, distmax=5):
	"""
	Convert a postcode to an insee code.
	If no exact match, choose best candidate but record it as problematic.

	:param postcode: key looked up in the module-level ``post2insee`` mapping.
	:param name: town name; the part matched by ``reg_cedex`` is removed and
		the rest is upper-cased before comparison.
	:param distmax: a candidate is accepted only when its Levenshtein
		distance to the upper-cased name is strictly below this value.
	:return: ``(town, post2insee[postcode][town])`` for the matched town, or
		``None`` when the postcode is unknown or no candidate is close enough.

	Side effects: unknown postcodes are added to ``problematicPost``; names
	without an exact match are added to ``problematicTown``.
	"""
	global problematicTown
	global problematicPost

	# Handle cedex stuff
	if reg_cedex.search(name):
		name = reg_cedex.sub("", name)

	# `in` replaces dict.has_key(), which no longer exists in Python 3.
	if postcode not in post2insee:
		# No match on postcode...
		problematicPost.add(postcode)
		return None

	towns = post2insee[postcode]
	town = name.upper()
	if town in towns:
		# Perfect match!
		return (town, towns[town])

	# No perfect match, look for best candidate
	best = None
	best_score = None
	for candidate in towns.keys():
		score = jellyfish.levenshtein_distance(town, candidate)
		if (best_score is None) or (score < best_score):
			best_score = score
			best = candidate
	problematicTown.add(town)
	if (best is not None) and (best_score < distmax):
		return (best, towns[best])
	return None

示例#40

0

显示文件

文件： song_finder.py 项目： shreyasparbat/song-lyric-analyser

    def find_min_dist(lyrics):
        nonlocal min_dist
        nonlocal min_dist_idx
        nonlocal phrase
        nonlocal idx

        # Find best match phrase in lyrics
        min_dist_this_lyrics = 10000
        min_dist_start_idx = 0
        min_dist_end_idx = 0
        lyrics_met = jellyfish.metaphone(lyrics).split(' ')
        for i in range(0, len(lyrics_met) - len(test_met)):
            this_lyrics_met = lyrics_met[i:i + len(test_met)]
            if this_lyrics_met[0] == test_met[0]:
                dist = jellyfish.levenshtein_distance(''.join(test_met), ''.join(this_lyrics_met))
                if dist < min_dist_this_lyrics:
                    min_dist_this_lyrics = dist
                    min_dist_start_idx = i
                    min_dist_end_idx = i + len(test_met)

        # Check against global min
        if min_dist_this_lyrics < min_dist:
            min_dist = min_dist_this_lyrics
            min_dist_idx = idx
            phrase = ' '.join(lyrics.split(' ')[min_dist_start_idx:min_dist_end_idx])

        # Increment global idx
        idx += 1

示例#41

0

显示文件

文件： ncddist-with-graph-for-binary.py 项目： fabbasi/IDS-scripts

def alldist(filex, filey):
    """Compute Levenshtein-based distances between the contents of two files.

    :param filex: path of the first file.
    :param filey: path of the second file.
    :return: tuple ``(res, dres, jaro, jarowink, ham, kl)``. ``res`` and
        ``dres`` are the Levenshtein and Damerau-Levenshtein distances
        divided by 100. The remaining four positions are ``None``: their
        computations (Jaro, Jaro-Winkler, Hamming, KL-divergence) are
        disabled in this variant and are kept only so the tuple shape
        stays the same for callers.
    """
    # Context managers guarantee both handles are closed, even on a read error.
    with open(filex, "r") as handle:
        xread = handle.read()
    with open(filey, "r") as handle:
        yread = handle.read()

    lvd = jellyfish.levenshtein_distance(xread, yread)
    dlvd = jellyfish.damerau_levenshtein_distance(xread, yread)

    res = float(lvd / 100.00)
    dres = float(dlvd / 100.00)

    # The code assigning these four names was commented out while the
    # return statement still referenced them, which raised NameError on
    # every call. Explicit placeholders keep the 6-tuple contract.
    # NOTE(review): re-enable the jellyfish / kldiv computations here if
    # these metrics are needed for this script.
    jaro = jarowink = ham = kl = None

    return res, dres, jaro, jarowink, ham, kl

示例#42

0

显示文件

文件： mitll_string_matcher.py 项目： mitll/LLString

    def levenshtein_similarity(self,s,t):
        """ Levenshtein Similarity

        Returns ``1 - distance / max(len(s), len(t))``: 1.0 for identical
        strings, 0.0 when every position has to be edited. Two empty strings
        are reported as identical (1.0) instead of dividing by zero.
        """
        longest = max(len(s), len(t))
        if longest == 0:
            # Both inputs are empty: nothing differs and the ratio below
            # would be 0/0.
            return 1.0

        return 1.0 - jellyfish.levenshtein_distance(s, t) / float(longest)

示例#43

0

显示文件

文件： metrics.py 项目： paintception/ZeroAccuracySystems

def get_avg_word_distance(target_words, predicted_words):
    """Average normalised Levenshtein similarity over aligned word pairs.

    Words are stripped of surrounding whitespace and paired positionally
    (extra words on either side are ignored). Each pair scores
    ``1 - distance / max(len(target), len(predicted))``. Returns 0 when any
    division by zero occurs, i.e. when there are no pairs or when a pair
    consists of two empty words.
    """
    try:
        cleaned_targets = [item.strip() for item in target_words]
        cleaned_predictions = [item.strip() for item in predicted_words]

        total = 0
        pair_count = 0
        for target, predicted in zip(cleaned_targets, cleaned_predictions):
            edits = jellyfish.levenshtein_distance(target, predicted)
            total += 1 - edits / max(len(target), len(predicted))
            pair_count += 1
        return total / pair_count
    except ZeroDivisionError:
        return 0

示例#44

0

显示文件

文件： string.py 项目： J535D165/recordlinkage

    def levenshtein_apply(x):
        """Normalised Levenshtein similarity of the pair ``x[0]``, ``x[1]``.

        Computes ``1 - distance / max(len(x[0]), len(x[1]))``. If that fails
        and either element is null according to pandas, ``np.nan`` is
        returned; any other failure is re-raised.
        """
        try:
            edits = jellyfish.levenshtein_distance(x[0], x[1])
            longest = np.max([len(x[0]), len(x[1])])
            return 1 - edits / longest
        except Exception:
            # Only missing values are tolerated; everything else propagates.
            if not (pandas.isnull(x[0]) or pandas.isnull(x[1])):
                raise
            return np.nan

示例#45

0

显示文件

文件： LSHIndex.py 项目： shrinivaasanka/asfer-github-code

	def find_nearest_neighbour(self, e, neighbours):
		"""Return the neighbour closest to ``e`` by Levenshtein distance.

		Both ``e`` and each neighbour are converted with ``unicode()`` before
		being compared. On ties the first neighbour encountered wins.

		:param e: element to match.
		:param neighbours: iterable of candidate neighbours.
		:return: tuple ``(neighbour, distance)``; ``("", 100000000000.0)``
			when ``neighbours`` is empty.
		"""
		minneighbour = ""
		mindistance = 100000000000.0
		for n in neighbours:
			# This line used to be indented with spaces followed by tabs,
			# which Python 3 rejects (TabError); tabs only now.
			d = jellyfish.levenshtein_distance(unicode(e), unicode(n))
			if d < mindistance:
				mindistance = d
				minneighbour = n
		return minneighbour, mindistance

示例#46

0

显示文件

文件： test.py 项目： jprobst21/jellyfish

    def test_levenshtein_distance(self):
        """Check jellyfish.levenshtein_distance against known distances."""
        expected_by_pair = (
            (("", ""), 0),
            (("abc", ""), 3),
            (("bc", "abc"), 1),
            (("kitten", "sitting"), 3),
            (("Saturday", "Sunday"), 3),
        )

        for (first, second), expected in expected_by_pair:
            actual = jellyfish.levenshtein_distance(first, second)
            self.assertEqual(actual, expected)

示例#47

0

显示文件

文件： differ.py 项目： SCOAP3/invenio

def compare_strings(str1, str2):
    """Compare two strings with the Levenshtein distance.

    Returns a normalized value between 0.0 (totally different) and 1.0
    (exactly the same), computed as
    ``(max_len - distance) / max_len`` where ``max_len`` is the length of
    the longer argument.
    """
    # Equal inputs short-circuit; this also covers two empty strings.
    if str1 == str2:
        return 1.0

    longest = max(len(str1), len(str2))
    if longest == 0:
        # Unequal inputs that are both empty.
        return 0.0

    edits = jellyfish.levenshtein_distance(str1, str2)
    return (longest - edits) / float(longest)

示例#48

0

显示文件

文件： nerd_tweets.bk.py 项目： rugebiker/WIR-DM

def bestcandidate(wrd):
    """Pick a replacement for ``wrd`` from its Brown word cluster.

    The cluster of ``wrd`` is looked up in ``bcluster``; every word of that
    cluster that passes ``chant.check`` and is within an edit distance of 2
    of ``wrd``, or whose metaphone code is within an edit distance of 1 of
    the metaphone code of ``wrd``, is kept. The last kept word is returned.
    Any failure (unknown word, no surviving candidate, ...) yields ``'No'``.
    """
    accepted = []
    try:
        #Check the Brown word clusters
        for entry in bcluster._word[wrd]:
            cluster_id = entry['cluster']

        for entry in bcluster._cluster[cluster_id]:
            candidate = entry['word']
            edit_dist = jellyfish.levenshtein_distance(wrd, candidate)
            word_code = jellyfish.metaphone(wrd)
            candidate_code = jellyfish.metaphone(candidate)
            if not chant.check(candidate):
                continue
            #Filter the candidates within a specific character and phonetic distance
            close_spelling = edit_dist <= 2
            if close_spelling or jellyfish.levenshtein_distance(word_code, candidate_code) <= 1:
                accepted.append((candidate, entry['count']))

        return accepted[-1][0]
    except Exception:
        return 'No'

示例#49

0

显示文件

文件： hooks.py 项目： 15502119602/beets

def _string_dist_basic(str1, str2):
    """Length-normalized edit distance between two strings.

    Both inputs are transliterated with ``unidecode``, lower-cased and
    stripped of every character outside ``a-z0-9`` before comparison. The
    Levenshtein distance is divided by the length of the longer cleaned
    string; two strings that clean down to nothing have distance 0.0.
    """
    cleaned1, cleaned2 = [
        re.sub(r'[^a-z0-9]', '', text.lower())
        for text in (unidecode(str1), unidecode(str2))
    ]
    if not (cleaned1 or cleaned2):
        return 0.0
    longest = max(len(cleaned1), len(cleaned2))
    return levenshtein_distance(cleaned1, cleaned2) / float(longest)

示例#50

0

显示文件

文件： c.py 项目： adriansoghoian/sec-honeywords

def find_similar_pws(pw, pw_list, num_passwords):
	"""Sample passwords of moderate similarity to ``pw`` and look them up.

	Every entry of ``pw_list`` is scored by Levenshtein distance to ``pw``;
	the ``(index, distance)`` pairs are sorted by distance, positions
	2000-99999 of that ranking are kept, and 1000 of them are drawn at
	random (kept in ranking order) and handed to ``lookup_pwds`` together
	with ``pw_list`` and ``num_passwords``. The result of ``lookup_pwds``
	is returned.
	"""
	ranked = [
		(position, jf.levenshtein_distance(pw, candidate))
		for position, candidate in enumerate(pw_list)
	]
	ranked.sort(key=lambda pair: pair[1])

	# Skip the 2000 closest entries and cap the pool at rank 100000.
	pool = ranked[2000:100000]
	chosen = random.sample(xrange(len(pool)), 1000)
	chosen.sort()
	sampled = [pool[slot] for slot in chosen]

	return lookup_pwds(sampled, pw_list, num_passwords)

示例#51

0

显示文件

文件： Similarity.py 项目： ChunchuanLv/anlp3

	def compute(self,m0,m1,keys = ['DOC_SIM','WIN_SIM','SENT_SIM','OVERLAP']):
		"""Compute similarity features for the mention pair ``m0``, ``m1``.

		:param m0: first mention; ``mention_text`` and ``win_tf_idf`` are read
			and it is passed to ``self.extractTF_IDF``.
		:param m1: second mention, read the same way.
		:param keys: not used by the current implementation.
		:return: dict with ``'WIN_SIM'`` (cosine similarity of the two
			``win_tf_idf`` values) and ``'win_SIMVe'`` (cosine similarity of
			``self.extractTF_IDF(m, 'win')`` for both mentions).
		"""
		# The body used to mix tab-indented and space-indented lines, which
		# Python 3 rejects (TabError); it is indented with tabs only now.
		sims = {}

		# Edit distance between the mention texts. It is still computed but
		# is not part of the result (its MT_SIM return is disabled below).
		mt_sim = jellyfish.levenshtein_distance(unicode(m0['mention_text']),unicode(m1['mention_text']))
		#return {'MT_SIM': mt_sim}
		#sims['DOC_SIM'] = self.cos_sim(m0['doc_tf_idf'],m1['doc_tf_idf'])
		sims['WIN_SIM'] = self.cos_sim(m0['win_tf_idf'],m1['win_tf_idf'])
		#sims['SENT_SIM'] = self.cos_sim(m0['sentence_tf_idf'],m1['sentence_tf_idf'])
		#sims['OVERLAP'] = self.overlap(m0['NER_tags'],m1['NER_tags'],2)
		#sims['jaccard'] = self.jaccard(m0['win_VEs'],m1['win_VEs'])
		#sims['overlapVe'] = self.overlap(m0['sentence_VEs'],m1['sentence_VEs'],3)
		sims['win_SIMVe'] = self.cos_sim(self.extractTF_IDF(m0,'win'),self.extractTF_IDF(m1,'win'))
		#sims['sentence_SIMVe'] = self.cos_sim(self.extractTF_IDF(m0,'sentence'),self.extractTF_IDF(m1,'sentence'))
		#sims['doc_SIMVe'] = self.cos_sim(self.extractTF_IDF(m0,'doc'),self.extractTF_IDF(m1,'doc'))
		return sims

示例#52

0

显示文件

文件： basic.py 项目： branjbar/miss-project

def string_compare(str1, str2, method='JARO'):
    ''' (string, string, string) -> number or None
    Compares str1 and str2 with the measure named by method.

    "LEV"  -> Levenshtein distance of the lower-cased strings (an integer
              larger or equal to zero; smaller means more similar).
    "JARO" -> result of jellyfish.jaro_distance on the strings as given.
    Any other method prints an error message and returns None.
    '''
    if method == "JARO":
        return jellyfish.jaro_distance(str1, str2)
    elif method == "LEV":
        lowered1 = str1.lower()
        lowered2 = str2.lower()
        return jellyfish.levenshtein_distance(lowered1, lowered2)
    else:
        print("ERROR: Choose the right string similarity measure : LEV or JARO")
        return None

示例#53

0

显示文件

文件： features.py 项目： pjknkda/kddcup2013-kaist-pjkp

    def calculator(aid, pid):
        """Edit distance between an author's and a paper-author's affiliation.

        :param aid: author id, looked up in ``authors`` and, together with
            ``pid``, in ``paper_authors``.
        :param pid: paper id.
        :return: Levenshtein distance between the two affiliation strings
            after ``unidecode`` and lower-casing, or ``np.nan`` when either
            row is missing or either affiliation is the empty string.
        """
        a_row = authors.get(aid)
        pa_row = paper_authors.get(pid, aid)

        if a_row is None or pa_row is None:
            return np.nan

        author_aff = a_row[Authors.IDX_AFF]
        paper_aff = pa_row[PaperAuthors.IDX_AFF]
        # The closing parenthesis used to sit before `== ''`, turning the
        # test into `(A == '' or B) == ''`, which is False whenever the
        # author affiliation is empty. Compare each side on its own.
        if author_aff == '' or paper_aff == '':
            return np.nan

        sim = levenshtein_distance(
            unidecode(author_aff).lower(),
            unidecode(paper_aff).lower()
        )
        return sim

示例#54

0

显示文件

文件： views.py 项目： dbarlett/namespect

def distance(string_1, string_2):
    """Return a JSON response with several comparison measures of two strings.

    The response maps a measure name to the value the corresponding
    jellyfish / pymailcheck function returns for the pair.
    """
    measures = (
        ("levenshtein", jellyfish.levenshtein_distance),
        ("damerau-levenshtein", jellyfish.damerau_levenshtein_distance),
        ("jaro", jellyfish.jaro_distance),
        ("jaro-winkler", jellyfish.jaro_winkler),
        ("match_rating_codex", jellyfish.match_rating_comparison),
        ("sift3", pymailcheck.sift3_distance),
    )
    payload = {}
    for label, measure in measures:
        payload[label] = measure(string_1, string_2)
    return jsonify(payload)

示例#55

0

显示文件

文件： autoextractor.py 项目： djipko/uTorrent-Auto-Extractor

def levenshteincmpr(string, list):
    """Find the entry of ``list`` closest to ``string`` by Levenshtein distance.

    Both sides are stringified, lower-cased, passed through ``strip_name``
    and stripped of surrounding whitespace before being compared. On ties
    the first entry encountered wins.

    :param string: name to match.
    :param list: candidate titles (the parameter name shadows the builtin
        and is kept for callers that pass it by keyword).
    :return: ``False`` when ``list`` is empty, otherwise
        ``{'lev': best_distance, 'title': best_normalised_title}``.
    """
    if len(list) == 0:
        return False
    best_lev_match = 999999999
    # Defined up front so the return below can never hit an unbound name.
    best_match = None
    fixed_string = strip_name(str(string).lower()).strip()
    debug = options['Global']['debug'] == 1
    for item in list:
        # Single-argument print calls emit the same text on Python 2 and 3.
        if debug:
            print(".....Literating through {}".format(item))
        fixed_itemstring = strip_name(str(item).lower()).strip()
        levdist = levenshtein_distance(fixed_itemstring, fixed_string)
        if debug:
            print("..........file <{}> vs imdb <{}> gave {} levenshtein distance".format(fixed_string, fixed_itemstring, levdist))
        if best_lev_match > levdist:
            best_lev_match = levdist
            best_match = fixed_itemstring

    return {'lev': best_lev_match, 'title': best_match}

示例#56

0

显示文件

文件： align_test.py 项目： timothyjamesbecker/tHUB_Tools

def test_edit_dist(x):
    """Time edit-distance implementations on a fixed pair of sequences.

    Runs ``jellyfish.levenshtein_distance`` and ``Levenshtein.editops``
    ``int(x)`` times each on two digit strings, then evaluates
    ``Levenshtein.distance`` and ``align.Align.edit_dist`` once, and prints
    the two resulting distances and the elapsed time of the last two timed
    sections. The jellyfish section is timed but not reported.

    :param x: number of repetitions (converted with ``int``).
    """
    s1 = '12012014321231200112211'
    s2 = '1300201231200112211'
    seq1 = [1,2,0,1,2,0,1,4,3,2,1,2,3,1,2,0,0,1,1,2,2,1,1]
    seq2 = [1,3,0,0,2,0,1,2,3,1,2,0,0,1,1,2,2,1,1]
    pos = np.asarray([[0,0],[0,1],   #0 and 1 are nn
                      [2,0],[2,1],   #2 and 3 are nn
                      [4,0],[4,1],   #4 and 5 are nn
                      [6,0],[6,1],   #6 and 7 are nn
                      [8,0],[8,1],   #8 and 9 are nn
                      [9,0],[9,1],   #10 and 11 are nn
                      [10,0],[10,1]],#12 and 13 are nn
                      dtype=float)

    #modify this to ensure it is a non-connected k-nn
    nn = distance.ann(pos,1)[1][:,1:]
    k = 0
    rp = 1
    # Per-operation cost functions handed to the aligner. This table used to
    # share the name `w` with the result variable further down.
    weights = {'M':lambda x:0,'I':lambda x:1,'D':lambda x:1,
               'S':lambda x:2, 'P':lambda x:0.5 }
    a = align.Align(weights,rp,nn,k)

    u,v = 0,0
    t0 = time.time()
    for i in range(0,int(x)):
        u = jellyfish.levenshtein_distance(s1,s2)
    t1 = time.time()
    t2 = time.time()
    for i in range(0,int(x)):
        v = Levenshtein.editops(s1,s2)
    v = Levenshtein.distance(s1,s2)
    t3 = time.time()
    t4 = time.time()
    for i in range(0,int(x)):
        # Placeholder body: the sequence aligner is not run inside the loop.
        seq_dist = 1
    seq_dist = a.edit_dist(seq1,seq2)
    t5 = time.time()
    print('editdist  dist = %s'%v)
    print('seq edit  dist = %s'%seq_dist)
    print('editdist  runtime is %s seconds'%(t3-t2))
    # This line reports elapsed time; it used to be labelled "dist".
    print('seq edit  runtime is %s seconds'%(t5-t4))

示例#57

0

显示文件

文件： fromdiff.py 项目： leonardomaccari/fromdiff

def diff_string(string1, string2, algorithm="RO"):
    """Similarity of two strings: 1 == same string, 0 == no similarity.

    Defaults to Ratcliff-Obershelp ("RO", via SequenceMatcher.ratio()).
    With "LE" the Levenshtein distance is used instead; since that is a
    distance rather than a similarity it is rescaled to
    1 - distance / max(len(string1), len(string2)).
    Any other algorithm name raises an Exception."""

    if algorithm == "RO":
        return SequenceMatcher(None, string1, string2).ratio()

    if algorithm == "LE":
        edits = jf.levenshtein_distance(string1, string2)
        if edits == 0:
            return 1
        longest = max(len(string1), len(string2))
        return 1 - float(edits)/longest

    raise Exception("Wrong algorithm chosen for difference match:"
                    + algorithm)

示例#58

0

显示文件

文件： handlers.py 项目： peterburrell/openstates

    def results(self, query):
        """Return reconciliation candidates for ``query``, best score first.

        :param query: dict with a ``'query'`` string and an optional
            ``'properties'`` list of ``{'pid': ..., 'v': ...}`` filters, of
            which only ``state`` and ``chamber`` are honoured.
        :return: list of dicts with ``id``, ``name``, ``score``, ``match``
            and ``type``, sorted by descending ``score``. A single hit scores
            100 and is flagged as a match; otherwise an exact last-name hit
            scores 90 and every other hit ``100 / (1 + edit distance)``.
        """
        # Look for the query to be a substring of a legislator name
        # (case-insensitive). The text is escaped so that characters such
        # as "(" or "*" in the query are matched literally instead of being
        # interpreted (or rejected) as regular-expression syntax.
        pattern = re.compile(".*%s.*" % re.escape(query['query']),
                             re.IGNORECASE)

        spec = {'full_name': pattern}

        for prop in query.get('properties', []):
            # Allow filtering by state or chamber for now
            if prop['pid'] in ('state', 'chamber'):
                spec[prop['pid']] = prop['v']

        legislators = db.legislators.find(spec)
        # Asked once instead of once per returned legislator.
        single_hit = legislators.count() == 1

        results = []
        for leg in legislators:
            if single_hit:
                match = True
                score = 100
            else:
                match = False
                if leg['last_name'] == query['query']:
                    score = 90
                else:
                    distance = levenshtein_distance(leg['full_name'].lower(),
                                                    query['query'].lower())
                    score = 100.0 / (1 + distance)

            # Note: There's a bug in Refine that causes reconciliation
            # scores to be overwritten if the same legislator is returned
            # for multiple queries. see:
            # http://code.google.com/p/google-refine/issues/detail?id=185

            results.append({"id": leg['_id'],
                            "name": leg['full_name'],
                            "score": score,
                            "match": match,
                            "type": [
                                {"id": "/openstates/legislator",
                                 "name": "Legislator"}]})

        # key/reverse gives the same stable descending order as the former
        # cmp-based call and also works on Python 3.
        return sorted(results, key=lambda item: item['score'], reverse=True)