Python jaro_winkler示例，textdistance.jaro_winkler Python示例

示例#1

0

显示文件

文件： visualization_blends.py 项目： Forcrush/Knowledge-Technology-Project

def jaro_winkler_sim_of_blends():
    """Plot the Jaro-Winkler score of each origin word against its two blends.

    Reads ``data/blends.txt`` line by line. The first three
    whitespace-separated tokens of a line are taken as the origin word, the
    first blend and the second blend. Lines with fewer than three tokens
    (for example blank lines) are skipped instead of raising ``IndexError``.

    Two curves are drawn over the line index: red triangles for
    origin-vs-first-blend and blue squares for origin-vs-second-blend. A
    fixed summary string is printed before the plot window is shown.
    """
    first_scores, second_scores = [], []
    with open("data/blends.txt", 'r') as f:
        for line in f:
            tokens = line.split()
            # Blank or truncated lines cannot be unpacked into three words.
            if len(tokens) < 3:
                continue
            origin, first, second = tokens[:3]
            first_scores.append(textdistance.jaro_winkler(origin, first))
            second_scores.append(textdistance.jaro_winkler(origin, second))

    # One x position per scored line; both score lists have the same length.
    x = np.arange(len(first_scores))
    y1 = np.array(first_scores)
    y2 = np.array(second_scores)

    plt.plot(x, y1, color="r", linestyle="-", marker="^", linewidth=1)
    plt.plot(x, y2, color="b", linestyle="-", marker="s", linewidth=1)

    plt.xlabel("x")
    plt.ylabel("y")
    plt.title("jaro_winkler similarity", fontsize=12, color='g')
    print(
        "first blend: 0.7 ~ 0.95 / 0.52 ~ 0.65\n second blend: 0.7 ~ 0.95 / 0.4 ~ 0.6/ 0.0"
    )
    plt.show()

示例#2

0

显示文件

def _passes_selected_metrics(word, candidate, first_flag, second_flag, limit):
    """Return the combined verdict of the metrics selected by ``limit``.

    Seven similarity scores between ``word`` and ``candidate`` are computed
    and each is handed to ``metric.qualified`` together with the two
    positional flags. ``limit`` is a string of 1-based digits picking which
    of the seven verdicts must hold; the selected verdicts are AND-ed.
    """
    scores = (
        ("jw_sim", textdistance.jaro_winkler(word, candidate)),
        ("levenshtein_sim",
         textdistance.levenshtein.normalized_similarity(word, candidate)),
        ("ro_sim", textdistance.ratcliff_obershelp(word, candidate)),
        ("needleman_wunsch", textdistance.needleman_wunsch(word, candidate)),
        ("smith_waterman", textdistance.smith_waterman(word, candidate)),
        ("gotoh", textdistance.gotoh(word, candidate)),
        ("strcmp95", textdistance.strcmp95(word, candidate)),
    )
    metric_pool = [
        metric.qualified(name, value, first_flag, second_flag)
        for name, value in scores
    ]
    verdict = True
    for digit in limit:
        # Digits are 1-based positions into the metric list above.
        verdict &= metric_pool[int(digit) - 1]
    return verdict


def filter_blends(word, first_blend, second_blend, limit="1237"):
    """Keep only the blend candidates that pass the selected metrics.

    Args:
        word: the word every candidate is compared against.
        first_blend: iterable of candidates, checked with the flags
            ``(True, False)`` passed to ``metric.qualified``.
        second_blend: iterable of candidates, checked with the flags
            ``(False, True)``.
        limit: string of digits ``1``-``7`` selecting which metrics must
            pass (1 jaro-winkler, 2 levenshtein, 3 ratcliff-obershelp,
            4 needleman-wunsch, 5 smith-waterman, 6 gotoh, 7 strcmp95).

    Returns:
        A pair ``(refined_first_blend, refined_second_blend)`` of lists that
        preserve the input order of the surviving candidates.
    """
    # NOTE(review): the two flags presumably tell metric.qualified which
    # blend position is being judged - confirm against its definition.
    refined_first_blend = [
        w for w in first_blend
        if _passes_selected_metrics(word, w, True, False, limit)
    ]
    refined_second_blend = [
        w for w in second_blend
        if _passes_selected_metrics(word, w, False, True, limit)
    ]
    return refined_first_blend, refined_second_blend

示例#3

0

显示文件

def test_jaro_winkler():
    """Check jaro_winkler against three reference scores (tolerance 0.01)."""
    reference_pairs = (
        ("campell", "campbell", 0.9792),
        ("shakelford", "shakleford", 0.9848),
        ("dwayne", "duane", 0.84),
    )
    for left, right, expected in reference_pairs:
        score = textdistance.jaro_winkler(left, right)
        assert score == pytest.approx(expected, abs=0.01)

示例#4

0

显示文件

文件： mycourses.py 项目： JuezUN/INGInious

    def filter_courses(self, courses, query):
        """
        Filter and rank the courses against a free-text query.

        The lower-cased course name and query are both split into tokens made
        of word characters and apostrophes. Every course-name token is
        compared with every query token using Jaro-Winkler. Scores of at
        least 0.8 are summed, and a score of exactly 1 counts double. A
        course is kept when its summed score reaches 0.9.

        :param courses: mapping whose values are course objects
        :param query: search text; an empty string disables filtering
        :return: ``courses`` unchanged for an empty query, otherwise an
            OrderedDict of course id -> course sorted by descending score
        """
        if query == "":
            return courses

        # The query is the same for every course, so tokenize it only once.
        query_tokens = re.findall(r"[\w']+", query.lower())

        scored_courses = []
        for course in courses.values():
            course_name = course.get_name(self.user_manager.session_language())
            course_tokens = re.findall(r"[\w']+", course_name.lower())
            total_score = 0
            for course_name_token in course_tokens:
                for query_token in query_tokens:
                    score = textdistance.jaro_winkler(course_name_token, query_token)
                    if score < 0.8:
                        # Weak matches do not contribute to the course score.
                        continue
                    # Identical tokens are weighted twice as much.
                    total_score += 2 * score if score == 1 else score
            # The total starts at 0 and only grows by accepted matches, so
            # reaching 0.9 already implies at least one accepted match.
            if total_score >= 0.9:
                scored_courses.append((course, total_score))

        # list.sort is stable, so equally scored courses keep their order.
        scored_courses.sort(key=lambda pair: -pair[1])
        return OrderedDict((course.get_id(), course) for course, _ in scored_courses)

示例#5

0

显示文件

 def create_jw_blocks(self, list_of_lawyers):
     """
     Group lawyer names into blocks by pairwise Jaro-Winkler comparison.

     Walks the list in order. Each name not yet assigned to a block starts
     a new block in ``self.blocks`` keyed by itself. Every later unassigned
     name whose ``jaro_winkler`` score against that key reaches
     ``self.THRESHOLD`` joins the block. Repeated occurrences of an
     already assigned name are skipped.

     The resulting ``self.blocks`` is pickled to ``lawyer.pickle`` inside
     ``self.disambig_folder``.
     """
     consumed = set()
     print('Doing pairwise Jaro-Winkler...',
           len(list_of_lawyers),
           flush=True)
     for i, primary in enumerate(list_of_lawyers):
         if primary in consumed:
             continue
         consumed.add(primary)
         self.blocks[primary].append(primary)
         # Position i is the primary itself and is already consumed, so the
         # scan can start right after it. Later duplicates of the primary
         # are consumed too and are skipped by the check below.
         for secondary in list_of_lawyers[i + 1:]:
             if secondary in consumed:
                 continue
             if jaro_winkler(primary, secondary, 0.0) >= float(
                     self.THRESHOLD):
                 consumed.add(secondary)
                 self.blocks[primary].append(secondary)
     # Close the file deterministically instead of leaking the handle.
     with open(self.disambig_folder + '/' + 'lawyer.pickle', 'wb') as out:
         pickle.dump(self.blocks, out)
     print('lawyer blocks created!', flush=True)

示例#6

0

显示文件

    def _response_false(self, triple):
        """Print a "not found" answer plus up to three related suggestions.

        Queries every outgoing relation of the triple's head entity, scores
        each relation name against the triple's relation name with
        Jaro-Winkler, and prints the three best-scoring relations with
        their target entities. Nothing is returned.
        """
        head = str(triple.head.name)
        relation = str(triple.relation.name)
        tail = str(triple.tail.name)
        separator = "===============================================================\n"
        triple_line = "Triple extracted: " + head + " " + relation + " " + tail + "\n"

        # NOTE(review): the head name is concatenated into the query text
        # unescaped; if it can contain a quote this breaks the query. Confirm
        # whether self._execute supports parameters.
        query_text = "match (n:Entity {name:'" + head + "'})-[r]->(m:Entity) return r.name, m.name"
        rows = self._execute(query_text)

        entity_by_relation = {}
        score_by_relation = {}
        for row in rows.values("r.name", "m.name"):
            relation_name = row[0]
            entity_by_relation[relation_name] = row[1]
            score_by_relation[relation_name] = textdistance.jaro_winkler(
                triple.relation.name, relation_name)

        if not score_by_relation:
            print("Answer: We do not have this news\n")
            print(triple_line)
            print(separator)
            return

        print("Answer: We do not have this news, may be you want to know st:\n")
        print(triple_line)

        # Best matches first; only the top three are shown.
        ranked = sorted(score_by_relation.items(),
                        key=lambda item: item[1],
                        reverse=True)
        for relation_name, similarity in ranked[:3]:
            print(">>> " + head + " " + str(relation_name) + " " +
                  str(entity_by_relation.get(relation_name)))
            print("relation has similarity: ", similarity)
            print("\n")
        print(separator)

示例#7

0

显示文件

def execute():
    """Grade a submitted free-text answer and render ``result.html``.

    For a POST request with form data, the submitted ``text`` field gets a
    grammar mark from the number of issues LanguageTool reports. It also
    gets a content mark (0-5): the share of reference-answer tokens that
    have a submitted word with a rounded Jaro-Winkler score above 0.5. The
    two marks are added. For any other request the total is 0.
    """
    # Default so the template still renders for non-POST requests or an
    # empty form (previously this path raised UnboundLocalError).
    total = 0
    if request.method == "POST" and request.form:
        answer = "Cloud computing allows consumers and businesses to use applications without installation and access their personal files at any computer with internet access. Cloud-based services are ideal for businesses with growing or fluctuating bandwidth demands."
        # A missing field is graded as an empty submission.
        text = request.form.get("text", "")
        tool = language_check.LanguageTool("en-US")
        matches = tool.check(text)
        # NOTE(review): the corrected text is discarded, as in the original.
        language_check.correct(text, matches)

        # NOTE(review): the original ladder also had an unreachable
        # "more than 10 issues -> 3" branch, and 1-5 issues score 0. The
        # thresholds look unintended; behavior is kept until confirmed.
        issue_count = len(matches)
        if issue_count == 0:
            mark = 5
        elif issue_count > 5:
            mark = 4
        else:
            mark = 0

        # Context based comparison
        stopwords = set(nltk.corpus.stopwords.words("english"))
        punctuation = {",", ".", ";", "'", ""}
        words = [
            w for w in nltk.tokenize.word_tokenize(text)
            if w not in stopwords and w not in punctuation
        ]
        allWordDist = nltk.FreqDist(w.lower() for w in words)

        answer_tokens = nltk.tokenize.word_tokenize(answer)
        length = len(answer_tokens)
        if length > 15:
            # Count reference tokens that resemble any submitted word.
            matched = sum(
                1 for a in answer_tokens
                if any(
                    round(float(textdistance.jaro_winkler(a, r)), 2) > 0.500
                    for r in allWordDist))
            marks2 = int(matched / length * 5)
            total = mark + marks2
        else:
            total = 0
    return render_template("result.html", total=total)

示例#8

0

显示文件

文件： snapshot_mapping_edgelist.py 项目： QuantLaw/legal-data-preprocessing

def cached_text_distance(s1, s2, cache, dry_run):
    """Return the Jaro-Winkler score of ``(s1, s2)``, memoized in ``cache``.

    In dry-run mode nothing is computed: the pair is recorded in ``cache``
    with the value ``None`` (overwriting any existing entry) and ``None``
    is returned. Otherwise a cached value is reused when present, and a
    fresh score is computed and stored when it is not.
    """
    pair = (s1, s2)
    if dry_run:
        cache[pair] = None
        return None
    if pair in cache:
        return cache[pair]
    score = textdistance.jaro_winkler(s1, s2)
    cache[pair] = score
    return score

示例#9

0

显示文件

文件： plugin.py 项目： fred0r/limnoria-plugins

 def answer(self, msg):
     """Check a chat message against the accepted answers and score it.

     Does nothing when the game is not active or the question has already
     been answered. Otherwise the guess (``msg.args[1]``) is compared with
     every entry of ``self.a`` in three stages: exact match after whitespace
     and case normalization, exact match after ``self.clean``, and a fuzzy
     match when ``self.flexibility`` lies strictly between 0.5 and 1. On
     success the sender's scores are updated, a reply is sent and the next
     question is started.
     """
     if not self.active or self.correct:
         return
     # self.correct is always False here because of the early return above.
     if not self.correct:
         for ans in self.a:
             # Collapse runs of whitespace and ignore case on both sides.
             ans = " ".join(ans.split()).strip().lower()
             guess = " ".join(msg.args[1].split()).strip().lower()
             if guess == ans:
                 self.correct = True
                 break
             elif not self.correct:
                 # Second stage: compare the cleaned forms.
                 answer = self.clean(ans)
                 guess = self.clean(guess)
             if not self.correct and guess == answer:
                 self.correct = True
                 break
             elif (not self.correct and self.flexibility < 1
                   and self.flexibility > 0.5):
                 # Third stage: fuzzy comparison, only for 0.5 < flexibility < 1.
                 dist = textdistance.jaro_winkler(guess, answer)
                 log.debug(
                     "Jeopardy: guess: {0}, answer: {1}, length: {2}, "
                     "distance: {3}, flexibility: {4}".format(
                         guess, answer, len(answer), dist,
                         self.flexibility))
                 if dist >= self.flexibility:
                     self.correct = True
                     break
                 # NOTE: `and` binds tighter than `or`, so this reads as
                 # (dist < flexibility and "," in a[0]) or ("&" in a[0]).
                 # It falls back to Jaccard when the first accepted answer
                 # contains a comma or an ampersand.
                 elif (dist < self.flexibility and "," in self.a[0]
                       or "&" in self.a[0]):
                     dist = textdistance.jaccard(guess, answer)
                     if dist >= self.flexibility:
                         self.correct = True
                         break
         if self.correct:
             # Credit self.p points to both the overall and the round score.
             if not msg.nick in self.scores:
                 self.scores[msg.nick] = 0
             self.scores[msg.nick] += self.p
             if not msg.nick in self.roundscores:
                 self.roundscores[msg.nick] = 0
             self.roundscores[msg.nick] += self.p
             self.unanswered = 0
             # The reply always shows the first accepted answer.
             reply = self.correct_template.render(
                 nick=msg.nick,
                 answer=self.a[0],
                 points=self.p,
                 round=self.roundscores[msg.nick],
                 total=self.scores[msg.nick],
             )
             self.reply(reply)
             self.correct = True
             self.answered += 1
             self.clear()
             self.newquestion()

示例#10

0

显示文件

文件： Log.py 项目： LeenJooken/CollaborationVisualizer

    def checkIfTypo(self, name1, name2):
        """Return True when the Jaro-Winkler score of the names exceeds 0.92."""
        similarity = textdistance.jaro_winkler(name1, name2)
        return similarity > 0.92

示例#11

0

显示文件

def address_similarity_scorer(a: str, b: str):
    """Return 1 when two address strings are considered a match, else 0.

    The strings match when their Jaro-Winkler score, as computed by
    ``textdistance.jaro_winkler``, is strictly greater than 0.9.
    """
    return 1 if textdistance.jaro_winkler(a, b) > 0.9 else 0

示例#12

0

显示文件

def key_input_search(key):
    """Return the job label whose title best matches ``key``.

    Scans ``titles`` under the string keys ``"0"``, ``"1"``, ... and skips
    missing values. The remaining titles are compared with ``key``
    case-insensitively using Jaro-Winkler. The first title with the
    strictly highest score wins; index 0 is used when nothing scores
    above 0.
    """
    best_score = 0.0
    best_index = 0
    for position in range(len(titles)):
        title = titles[str(position)]
        if pd.isna(title):
            continue
        current = textdistance.jaro_winkler(title.lower(), key.lower())
        if current > best_score:
            best_score, best_index = current, position
    return joblabel[str(best_index)]

示例#13

0

显示文件

def three_recommended_items(request):
    """Recommend up to three available products similar to the user's own.

    Candidates are the available products that do not belong to the
    requesting user. Each candidate name is scored by its best
    Jaro-Winkler score (rounded to 4 decimals) against any of the user's
    product names, and the three best-scoring distinct names are selected.

    Returns:
        ``0`` when there are fewer than three candidates or the user has
        no products; otherwise a queryset of available products whose name
        is one of the selected names.
    """
    import heapq

    import textdistance

    all_products = Product.objects.filter(disponible=True)
    user_products = Product.objects.filter(user__email=request.user.email)
    all_products = all_products.difference(user_products)

    all_products_names = [p.name for p in all_products]
    user_products_names = [p.name for p in user_products]

    if len(all_products_names) < 3 or len(user_products_names) < 1:
        return 0

    # Jaro-Winkler favours strings that share their beginning. Keep the best
    # score per candidate name. The earlier version seeded all three slots
    # with the score of the first candidate and could insert one product
    # several times.
    best_score = {}
    for candidate in all_products_names:
        score = max(
            round(textdistance.jaro_winkler(candidate, owned), 4)
            for owned in user_products_names)
        if score > best_score.get(candidate, -1.0):
            best_score[candidate] = score

    top_names = heapq.nlargest(3, best_score, key=best_score.get)
    return Product.objects.filter(name__in=top_names, disponible=True)

示例#14

0

显示文件

文件： classify.py 项目： kortukov/hpc_similarity

def calculate_distances(job1, job2):
    """Return one Jaro-Winkler score per sensor for two jobs.

    Both jobs must expose the same sensor keys in the same order, and that
    order must equal ``config.SENSORS_LIST``; this is checked with
    assertions. The result lists the scores of the per-sensor strings in
    sensor order.
    """
    sensors = list(job1.keys())
    assert sensors == list(job2.keys())
    assert sensors == config.SENSORS_LIST
    # Other string metrics (e.g. levenshtein) can be swapped in here.
    return [
        textdistance.jaro_winkler(job1[name], job2[name]) for name in sensors
    ]

示例#15

0

显示文件

def last_name_similarity_scorer(a: str, b: str):
    """Return 1 when two last-name strings are considered a match, else 0.

    The strings match when their Jaro-Winkler score, as computed by
    ``textdistance.jaro_winkler``, is strictly greater than 0.8.
    """
    return 1 if textdistance.jaro_winkler(a, b) > 0.8 else 0

示例#16

0

显示文件

def match_jaro_winkler(ee, platforms):
    """Fill empty ``possible_stops`` cells of ``ee`` by fuzzy name matching.

    For every row whose ``possible_stops`` is the empty string, the
    platforms whose ``routes_wkd`` contains the row's ``line`` are selected.
    Among their distinct stop names the one with the highest Jaro-Winkler
    score against ``station_name`` is chosen. When that score exceeds 0.79,
    the stop ids sharing that name are written back as a ``', '``-joined
    string. ``ee`` is modified in place and returned.
    """
    for index, row in ee.iterrows():
        if row.possible_stops != '':
            continue
        subset = platforms[platforms.routes_wkd.str.contains(row.line)]
        if subset.shape[0] == 0:
            continue
        candidate_names = subset.stop_name.unique()
        scores = [
            textdistance.jaro_winkler(row.station_name, name)
            for name in candidate_names
        ]
        best = np.argmax(scores)
        if scores[best] > 0.79:
            same_name = subset[subset.stop_name == candidate_names[best]]
            ee.loc[index, 'possible_stops'] = ', '.join(same_name.stop_id)
    return ee

示例#17

0

显示文件

文件： strs_jaro_winkler.py 项目： stefandragomir/string_similarity

    def compare(self, str1, str2):
        """Compare two strings with ``jaro_winkler`` and fill ``self.result``.

        The call to ``jaro_winkler`` is bracketed by ``self.start_time()``
        and ``self.end_time()``. The result object receives the raw value
        (``distance``), the longer string's length (``nos``), a fixed
        threshold of 90 and a percentage ``similarity``. Returns
        ``self.result``.
        """

        if self.debug:
            self.log("jaro winkler comparison")

        self.start_time()

        self.result.distance = jaro_winkler(str1, str2)

        self.end_time()

        self.result.nos = max(len(str1), len(str2))
        self.result.threshold = 90
        # NOTE(review): this treats the jaro_winkler value as a distance
        # (0 = identical). If the imported function returns a similarity
        # instead, this percentage is inverted - confirm against the import.
        self.result.similarity = (1 - self.result.distance) * 100

        return self.result

示例#18

0

显示文件

文件： text_distances.py 项目： Anis-Bensaid/signals-data-wrangling

def brands_custom_distance(row):
    """
    Score how well ``row['Brand']`` matches ``row['ELC_Brand']``.

    Both values are lower-cased and stripped of dots. ``&`` becomes ``and``
    in the brand and ``é`` becomes ``e`` in the ELC brand. The result is
    the weighted average of the fuzzy partial ratio (weight 0.4, scaled
    to 0-1) and the Jaro-Winkler score (weight 0.6). If the partial ratio
    raises ``ValueError``, the Jaro-Winkler score alone is returned.
    """
    brand = str(row['Brand']).lower().replace('.', '').replace('&', 'and')
    elc_brand = str(row['ELC_Brand']).lower().replace('.', '').replace('é', 'e')
    jaro = textdistance.jaro_winkler(brand, elc_brand)
    try:
        partial = fuzz.partial_ratio(brand, elc_brand) / 100
    except ValueError:
        return jaro
    return np.average([partial, jaro], weights=[0.4, 0.6])

示例#19

0

显示文件

文件： clean_data.py 项目： VRanaV/Rank_Up-model-service

def get_skill_header(headers, skills):
    """Pick, remove and return the header that best matches any skill.

    Each header is scored by its highest Jaro-Winkler score against the
    entries of ``skills``. The first header with the overall highest score
    is removed from ``headers`` (in place) and returned. Raises
    ``ValueError`` when no header could be scored.
    """
    best_per_header = []
    for header in headers:
        scores = [textdistance.jaro_winkler(header, skill) for skill in skills]
        # Without skills there is nothing to score this header against.
        if scores:
            best_per_header.append((header, max(scores)))

    winner, _ = max(best_per_header, key=lambda pair: pair[1])
    headers.remove(winner)
    return winner

示例#20

0

显示文件

文件： text_distances.py 项目： Anis-Bensaid/signals-data-wrangling

def item_description_custom_distance(row):
    """
    Score how well ``row['Item_Description']`` matches ``row['Product']``.

    The description is lower-cased and a fixed list of brand terms is
    removed from it, in order. The result is the weighted average of the
    fuzzy partial ratio (weight 0.95, scaled to 0-1) and the Jaro-Winkler
    score (weight 0.05). If the partial ratio raises ``ValueError``, the
    Jaro-Winkler score alone is returned.
    """
    brand_terms = ('clinique', 'origins', 'tom ford', 'la mer',
                   'estee lauder', 'mac', 'bb', 'bobbi', 'brown')
    item_description = str(row['Item_Description']).lower()
    for term in brand_terms:
        item_description = item_description.replace(term, '')
    product = str(row['Product']).lower()

    jaro = textdistance.jaro_winkler(item_description, product)
    try:
        partial = fuzz.partial_ratio(item_description, product) / 100
    except ValueError:
        return jaro
    return np.average([partial, jaro], weights=[0.95, 0.05])

示例#21

0

显示文件

文件： text_distances.py 项目： Anis-Bensaid/signals-data-wrangling

def subcategory_custom_distance(row):
    """
    Score how well ``row['Sub_Category']`` matches the product text.

    The product text is the lower-cased ``ELC_Solution_Type``, a space, and
    the lower-cased ``Item_Description`` with dots removed and ``&`` turned
    into ``and``. The result is the weighted average of the fuzzy partial
    ratio (weight 0.95, scaled to 0-1) and the Jaro-Winkler score
    (weight 0.05). If the partial ratio raises ``ValueError``, the
    Jaro-Winkler score alone is returned.
    """
    sub_category = str(row['Sub_Category']).lower()
    # Only the description part is cleaned; the solution type is kept as is.
    description = str(row['Item_Description']).lower().replace(
        '.', '').replace('&', 'and')
    product_text = str(row['ELC_Solution_Type']).lower() + ' ' + description

    jaro = textdistance.jaro_winkler(sub_category, product_text)
    try:
        partial = fuzz.partial_ratio(sub_category, product_text) / 100
    except ValueError:
        return jaro
    return np.average([partial, jaro], weights=[0.95, 0.05])

示例#22

0

显示文件

文件： clean_data.py 项目： VRanaV/Rank_Up-model-service

def get_otherHeaders(headers, otherHeaders):
    """Return the set of headers that closely match any known other-header.

    A header is selected when its highest Jaro-Winkler score against the
    entries of ``otherHeaders`` is strictly greater than 0.7. With no
    ``otherHeaders`` the result is an empty set.
    """
    chosen = set()
    for header in headers:
        scores = [
            textdistance.jaro_winkler(header, known) for known in otherHeaders
        ]
        if scores and max(scores) > 0.7:
            chosen.add(header)
    return chosen

示例#23

0

显示文件

def search_trigrams(query):
    """Return the best ``NUM_RESULTS`` matches for ``query``.

    Every three-character window of ``query`` is looked up in ``trigrams``
    and the hits are merged into one candidate set. Each candidate is
    wrapped in a ``Match`` built from its ``bands`` entry and its
    ``jaro_winkler`` score against the query. Matches are returned sorted
    by descending ``score``.
    """
    candidates = set()
    for start in range(len(query) - 2):
        hits = trigrams.get(query[start:start + 3])
        # A plain union is used; unknown trigrams simply contribute nothing.
        if hits is not None:
            candidates.update(hits)

    ranked = [
        Match(bands[name], jaro_winkler(query, name)) for name in candidates
    ]
    ranked.sort(key=attrgetter("score"), reverse=True)
    return ranked[:NUM_RESULTS]

示例#24

0

显示文件

def recommend():
    """Render movie recommendations for the title submitted in the form.

    Reads the ``word`` form field. When it is not an exact entry of
    ``indices['movie_title']``, the entry with the highest Jaro-Winkler
    score is used instead. Five titles are then picked from ``data_api``
    by descending ``cosine_sim`` score and rendered into ``index.html``.
    """
    wrd = str(request.form.get('word'))
    message = "Displaying recommentations for " + wrd + ":\n"
    if wrd not in indices['movie_title'].unique():
        # NOTE: `temp` is an alias, not a copy, so the helper column below
        # is added to the shared `indices` frame as well.
        temp = indices
        temp['movie_name_distance'] = indices.apply(
            lambda row: td.jaro_winkler(row['movie_title'], wrd), axis=1)
        # Take the first column of the best-scoring row as the title.
        # NOTE(review): assumes column 0 holds the movie title - confirm.
        wrd = temp.sort_values('movie_name_distance', ascending=False).iloc[0,
                                                                            0]
        message = "Couldn't find the input movie, displaying recommendations for " + wrd + ":<br/>"
    # NOTE(review): `[0]` selects a column labelled 0 from the filtered
    # frame; presumably that column stores the row index used by
    # cosine_sim - verify against how `indices` is built.
    idx = int(indices[indices['movie_title'] == wrd][0])
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Skip the top entry (presumably the movie itself) and keep the next five.
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    recommendations = data_api['movie_title'].iloc[movie_indices].str.cat(
        sep='<br/>')
    return render_template('index.html', prediction=message + recommendations)

示例#25

0

显示文件

def run_distance_matching(company_name, patstat_name,
                          elastic_score, query):
    """
    Score how well a company name matches a PATSTAT name.

    Computes a token-sort ratio and a Jaro-Winkler score on the lower-cased
    names and feeds both, with the word count of ``company_name``, into
    ``score.calculate_distance_score``. The result is then combined with
    the elastic score and the query by ``score.pam_score``. The elastic
    score is reduced by 10 for company names of more than five words.

    Returns a dict with the keys ``levensthein_score``,
    ``jaro_winkler_score`` and ``pam_score``.
    """
    company_lower = company_name.lower()
    patstat_lower = patstat_name.lower()

    ratio = fuzz.token_sort_ratio(company_lower, patstat_lower)
    jaro_winkler_score = textdistance.jaro_winkler(company_lower,
                                                   patstat_lower)

    name_length = len(company_name.split())
    if name_length > 5:
        elastic_score -= 10

    distance_score = score.calculate_distance_score(ratio,
                                                    jaro_winkler_score,
                                                    name_length)
    pam_score = score.pam_score(query, elastic_score, distance_score)

    # NOTE(review): 'levensthein_score' carries the PAM score, not `ratio`;
    # this looks like a copy-paste slip - confirm with the consumers.
    return {
        'levensthein_score': pam_score,
        'jaro_winkler_score': jaro_winkler_score,
        'pam_score': pam_score,
    }

示例#26

0

显示文件

文件： metadata_similarity.py 项目： pelagios/recogito2-similarity-scan

    def compute_pairwise(documents):
        """Find pairs of documents from different owners with similar titles.

        Every unordered pair is considered once. Pairs are skipped when both
        documents have the same owner, when either title is exactly
        ``'New document'``, or when either title starts with ``test``
        (case-insensitive). The remaining pairs are reported when their
        Jaro-Winkler title score is strictly above 0.9.

        Returns a list of dicts with the keys ``doc_a``, ``doc_b`` and
        ``score``.
        """
        similar_pairs = []

        for outer_idx, outer_doc in enumerate(documents):
            for inner_idx, inner_doc in enumerate(documents):
                # Visit each unordered pair once and never pair a document
                # with itself.
                if outer_idx >= inner_idx:
                    continue
                if outer_doc.owner == inner_doc.owner:
                    continue

                titles = (outer_doc.title, inner_doc.title)
                if 'New document' in titles:
                    continue
                if any(title.lower().startswith('test') for title in titles):
                    continue

                score = textdistance.jaro_winkler(outer_doc.title,
                                                  inner_doc.title)
                if score > 0.9:
                    similar_pairs.append({
                        'doc_a': outer_doc,
                        'doc_b': inner_doc,
                        'score': score
                    })

        return similar_pairs

示例#27

0

显示文件

文件： search.py 项目： SakalikPeter/VINF

def similarity(type, a, b):
    """
    Compute a string similarity between ``a`` and ``b``.

    ``type`` selects the metric: ``hamming`` and ``levenshtein`` use the
    normalized similarity. ``jaro_winkler`` and ``ratcliff_obershelp``
    work on the raw strings. ``jaccard`` and ``sorensen`` work on the
    whitespace-separated tokens. Any other value yields ``None``.

    Example: type='hamming', a='John', b='John Snow'.
    """
    # Lambdas keep each metric lazy: only the selected one is evaluated.
    metrics = {
        'hamming':
        lambda: textdistance.hamming.normalized_similarity(a, b),
        'levenshtein':
        lambda: textdistance.levenshtein.normalized_similarity(a, b),
        'jaro_winkler':
        lambda: textdistance.jaro_winkler(a, b),
        'jaccard':
        lambda: textdistance.jaccard(a.split(), b.split()),
        'sorensen':
        lambda: textdistance.sorensen(a.split(), b.split()),
        'ratcliff_obershelp':
        lambda: textdistance.ratcliff_obershelp(a, b),
    }
    compute = metrics.get(type)
    if compute is None:
        return None
    return compute()

示例#28

0

显示文件

# Minimum Jaro-Winkler similarity for two listings to be treated as one event.
_MATCH_THRESHOLD = 0.97


def _truncate3(value):
    """Truncate ``value`` to three decimal places (no rounding)."""
    return int(value * 1000) / 1000


def _evaluate_fork(x_name, x_row, o_name, o_row, swapped):
    """Check one matched event for an arbitrage ("fork") between two bookmakers.

    x_name / o_name: display names of the two bookmakers.
    x_row / o_row:   table rows; index 0 date, 1 time, 2 league, 3 and 4 the
                     two opponents, 5 and 6 the odds for opponent 3 and 4.
    swapped:         True when the second bookmaker lists the opponents in
                     the opposite order to the first one.

    Returns ``(is_fork, row)``.  ``is_fork`` is True when at least one of the
    two complementary bet combinations has an inverse-odds sum below 1.
    ``row`` is the 17-column result for the combination with the smaller
    sum, or None when there is no fork or both sums are equal.
    Raises ValueError / ZeroDivisionError when odds are not positive numbers.
    """
    xopp1, xopp2, xp1, xp2 = x_row[3], x_row[4], x_row[5], x_row[6]
    odata, otime, league = o_row[0], o_row[1], o_row[2]
    oopp1, oopp2, op1, op2 = o_row[3], o_row[4], o_row[5], o_row[6]

    # "hedge1" is the second bookmaker's outcome that complements a bet on
    # xopp1, "hedge2" the one that complements a bet on xopp2.  Which column
    # that is depends on whether the opponents are listed in the same order.
    if swapped:
        hedge1_team, hedge1_odds = oopp1, op1
        hedge2_team, hedge2_odds = oopp2, op2
    else:
        hedge1_team, hedge1_odds = oopp2, op2
        hedge2_team, hedge2_odds = oopp1, op1

    total1 = 1 / float(xp1) + 1 / float(hedge1_odds)
    total2 = 1 / float(hedge2_odds) + 1 / float(xp2)
    if not (total1 < 1.0 or total2 < 1.0):
        return False, None

    # Event columns always come from the second bookmaker's row so that
    # date, time, league and opponents describe the same listing.
    event = [
        odata, otime, league, oopp1, oopp2, x_name, xp1, xp2, o_name, op1,
        op2
    ]

    if total1 < 1.0 and total1 < total2:
        # Percentage of the bankroll to stake on xopp1 at the first bookmaker.
        share = _truncate3((1 / float(xp1) / total1) * 100)
        return True, event + [
            x_name + ' ' + xopp1,
            str(share) + '%',
            str(share * float(xp1) - 100) + '%',
            o_name + ' ' + hedge1_team,
            str(100 - share) + '%',
            str((100 - share) * float(hedge1_odds) - 100) + '%',
        ]

    if total2 < 1.0 and total2 < total1:
        # Percentage of the bankroll to stake at the second bookmaker; it is
        # always reported together with the odds it was derived from.
        share = _truncate3((1 / float(hedge2_odds) / total2) * 100)
        return True, event + [
            o_name + ' ' + hedge2_team,
            str(share) + '%',
            str(share * float(hedge2_odds) - 100) + '%',
            x_name + ' ' + xopp2,
            str(100 - share) + '%',
            str((100 - share) * float(xp2) - 100) + '%',
        ]

    # Both combinations have exactly the same sum: reported as a fork on
    # stdout, but no row is produced.
    return True, None


def _scan_bookmaker_pair(x_name, x_rows, o_name, o_rows):
    """Match every event of one bookmaker against another and collect forks.

    Two listings are the same event when the Jaro-Winkler similarity of
    their lower-cased "date,time,opp1,opp2" strings reaches
    ``_MATCH_THRESHOLD``; the second bookmaker's opponents are also tried in
    reversed order.  Progress is printed to stdout.  Returns the list of
    result rows in scan order.
    """
    forks = []
    label = x_name + ' - ' + o_name

    for x_row in x_rows:
        xbet = ','.join((x_row[0], x_row[1], x_row[3], x_row[4]))
        x_key = xbet.lower().strip()

        for o_row in o_rows:
            obet = ','.join((o_row[0], o_row[1], o_row[3], o_row[4]))
            obet2 = ','.join((o_row[0], o_row[1], o_row[4], o_row[3]))
            direct = textdistance.jaro_winkler(x_key, obet.lower().strip())
            flipped = textdistance.jaro_winkler(x_key, obet2.lower().strip())

            if direct >= _MATCH_THRESHOLD and direct > flipped:
                swapped = False
            elif flipped >= _MATCH_THRESHOLD and direct < flipped:
                swapped = True
            else:
                # Not similar enough, or the two orderings tie.
                continue

            print(label, xbet, obet, direct, flipped, end=' ')
            is_fork, row = _evaluate_fork(x_name, x_row, o_name, o_row,
                                          swapped)
            if not is_fork:
                print("No")
                continue

            print("Yes", obet2 if swapped else obet)
            print('--------------------------------------------')
            if row is not None:
                forks.append(row)

    return forks


def find():
    """Search the scraped bookmaker tables for arbitrage opportunities.

    Reads the tables BET1x, OLIMPBET and FONBET from the sqlite database
    'FOR.db', compares 1xBET with OLIMPBET, 1xBET with FONBET and FONBET with
    OLIMPBET (in that order), prints the collected rows, writes them to
    'FORK.csv' (overwriting it) and returns them.

    Each row has 17 columns: date, time, league, opp1, opp2, first bookmaker
    and its two odds, second bookmaker and its two odds, then for each of the
    two bets its label, the percentage of the bankroll to stake and the
    resulting profit percentage.
    """
    import csv

    conn = sqlite3.connect('FOR.db')
    try:
        c = conn.cursor()
        # Each table is loaded once; nothing in this function modifies them.
        bet1x_rows = c.execute('select * from BET1x').fetchall()
        olimp_rows = c.execute('select * from OLIMPBET').fetchall()
        fonbet_rows = c.execute('select * from FONBET').fetchall()
    finally:
        # The connection is only needed for the three reads above.
        conn.close()

    ans = []
    ans.extend(_scan_bookmaker_pair('1xBET', bet1x_rows, 'OLIMPBET',
                                    olimp_rows))
    ans.extend(_scan_bookmaker_pair('1xBET', bet1x_rows, 'FONBET',
                                    fonbet_rows))
    ans.extend(_scan_bookmaker_pair('FONBET', fonbet_rows, 'OLIMPBET',
                                    olimp_rows))

    print(ans)
    with open('FORK.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(ans)

    return ans


# for a in ans:
#     c.execute('''INSERT INTO FORK (data, time, league, opp1, opp2, bet1, coef1_bet1, coef2_bet1, bet2, coef1_bet2, coef2_bet2, win_opp1, percent_of_your_money_for_win_opp1, profit1, win_opp2, percent_of_your_money_for_win_opp2, profit2) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)''',(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16]))
#     conn.commit()
# conn.close()
# print('ok')

示例#29

0

显示文件


# In[11]:


# Normalized Levenshtein similarity of two spellings that differ by one
# missing letter.
textdistance.levenshtein.normalized_similarity('arrow', 'arow')


# # Jaro Winkler
#
# Background reading on the algorithm:
# https://itnext.io/string-similarity-the-basic-know-your-algorithms-guide-3de3d7346227

# In[12]:


# The first string is a prefix of the second.
textdistance.jaro_winkler("mes", "messi")


# In[13]:


# Same word with the final letter dropped.
textdistance.jaro_winkler("crate", "crat")


# In[14]:


# Same four letters as "crat" in the previous cell, but with the two halves
# swapped, so the strings no longer share a prefix.
textdistance.jaro_winkler("crate", "atcr")


# # Jaccard Index

示例#30

0

显示文件

def Seq_StringDistance(str_seq, str_ref, method="hamming"):

    if (method is "hamming"):
        return [
            textdistance.hamming(str_seq_i, str_ref) for str_seq_i in str_seq
        ]

    elif (method is "levenshtein"):
        return [
            textdistance.levenshtein(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif (method is "damerau_lev"):
        return [
            textdistance.damerau_levenshtein(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif (method is "j-winkler"):
        return [
            textdistance.jaro_winkler(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif (method is "smith-waterman"):
        return [
            textdistance.smith_waterman(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif (method is "jaccard"):
        return [
            textdistance.jaccard(str_seq_i, str_ref) for str_seq_i in str_seq
        ]

    elif (method is "sorensen-dice"):
        return [
            textdistance.sorensen_dice(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif (method is "tversky"):
        return [
            textdistance.tversky(str_seq_i, str_ref) for str_seq_i in str_seq
        ]

    elif (method is "tanimoto"):
        return [
            textdistance.tanimoto(str_seq_i, str_ref) for str_seq_i in str_seq
        ]

    elif (method is "cosine"):
        return [
            textdistance.cosine(str_seq_i, str_ref) for str_seq_i in str_seq
        ]

    elif (method is "tanimoto"):
        return [
            textdistance.tanimoto(str_seq_i, str_ref) for str_seq_i in str_seq
        ]

    elif (method is "ratcliff"):
        return [
            textdistance.ratcliff_obershelp(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]

    elif (method is "bwt"):
        return [
            textdistance.bwtrle_ncd(str_seq_i, str_ref)
            for str_seq_i in str_seq
        ]