def jaro_winkler_sim_of_blends():
    blend1, blend2 = [], []
    count = 0
    with open("data/blends.txt", 'r') as f:
        for line in f:
            s = line.split()
            origin, first, second = s[0], s[1], s[2]
            blend1.append(textdistance.jaro_winkler(origin, first))
            blend2.append(textdistance.jaro_winkler(origin, second))
            count += 1
            # print(textdistance.jaro_winkler())
    x = np.array([i for i in range(count)])
    y1 = np.array(blend1)
    y2 = np.array(blend2)
    plt.plot(x, y1, color="r", linestyle="-", marker="^", linewidth=1)
    plt.plot(x, y2, color="b", linestyle="-", marker="s", linewidth=1)
    plt.xlabel("x")
    plt.ylabel("y")
    plt.title("jaro_winkler similarity", fontsize=12, color='g')
    print("first blend: 0.7 ~ 0.95 / 0.52 ~ 0.65\n"
          " second blend: 0.7 ~ 0.95 / 0.4 ~ 0.6 / 0.0")
    plt.show()
def filter_blends(word, first_blend, second_blend, limit="1237"):
    refined_first_blend, refined_second_blend = [], []
    for w in first_blend:
        qua1 = metric.qualified("jw_sim", textdistance.jaro_winkler(word, w), True, False)
        qua2 = metric.qualified("levenshtein_sim",
                                textdistance.levenshtein.normalized_similarity(word, w),
                                True, False)
        qua3 = metric.qualified("ro_sim", textdistance.ratcliff_obershelp(word, w), True, False)
        qua4 = metric.qualified("needleman_wunsch", textdistance.needleman_wunsch(word, w), True, False)
        qua5 = metric.qualified("smith_waterman", textdistance.smith_waterman(word, w), True, False)
        qua6 = metric.qualified("gotoh", textdistance.gotoh(word, w), True, False)
        qua7 = metric.qualified("strcmp95", textdistance.strcmp95(word, w), True, False)
        metric_pool = [qua1, qua2, qua3, qua4, qua5, qua6, qua7]
        statis = True
        for i in limit:
            statis &= metric_pool[int(i) - 1]
        if statis:
            refined_first_blend.append(w)
    for w in second_blend:
        qua1 = metric.qualified("jw_sim", textdistance.jaro_winkler(word, w), False, True)
        qua2 = metric.qualified("levenshtein_sim",
                                textdistance.levenshtein.normalized_similarity(word, w),
                                False, True)
        qua3 = metric.qualified("ro_sim", textdistance.ratcliff_obershelp(word, w), False, True)
        qua4 = metric.qualified("needleman_wunsch", textdistance.needleman_wunsch(word, w), False, True)
        qua5 = metric.qualified("smith_waterman", textdistance.smith_waterman(word, w), False, True)
        qua6 = metric.qualified("gotoh", textdistance.gotoh(word, w), False, True)
        qua7 = metric.qualified("strcmp95", textdistance.strcmp95(word, w), False, True)
        metric_pool = [qua1, qua2, qua3, qua4, qua5, qua6, qua7]
        statis = True
        for i in limit:
            statis &= metric_pool[int(i) - 1]
        if statis:
            refined_second_blend.append(w)
    return refined_first_blend, refined_second_blend
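# Usage sketch (illustrative, not from the original source): `limit` is a string
# of digits selecting which of the seven metrics above (1 = Jaro-Winkler,
# 2 = Levenshtein, 3 = Ratcliff-Obershelp, 4 = Needleman-Wunsch,
# 5 = Smith-Waterman, 6 = Gotoh, 7 = strcmp95) must all pass the thresholds
# encoded in `metric.qualified`. The candidate words here are made up.
refined_first, refined_second = filter_blends(
    "brunch", ["breakfast", "break"], ["lunch", "munch"], limit="137")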
def test_jaro_winkler():
    """Confirm that the Jaro-Winkler implementation matches the original paper."""
    assert textdistance.jaro_winkler("campell", "campbell") == pytest.approx(0.9792, abs=0.01)
    assert textdistance.jaro_winkler("shakelford", "shakleford") == pytest.approx(0.9848, abs=0.01)
    assert textdistance.jaro_winkler("dwayne", "duane") == pytest.approx(0.84, abs=0.01)
def filter_courses(self, courses, query):
    """
    Filter the courses with respect to the query.

    First, the course names and the query are tokenized, removing spaces, hyphens,
    and punctuation characters. Then a score is computed by comparing each token of
    the query with each token of the course name, using the Jaro-Winkler algorithm
    to measure the distance between two tokens. Courses whose accumulated score is
    greater than or equal to 0.9 are kept, and the resulting courses are sorted by
    that score.
    """
    if query == "":
        return courses
    filtered_courses = []
    for course in courses.values():
        course_name = course.get_name(self.user_manager.session_language())
        course_tokens = re.findall(r"[\w']+", course_name.lower())
        query_tokens = re.findall(r"[\w']+", query.lower())
        total_distance = 0
        can_insert = False
        for course_name_token in course_tokens:
            for query_token in query_tokens:
                dist = textdistance.jaro_winkler(course_name_token, query_token)
                # Only distance values >= 0.8 are taken into account for the score.
                if dist >= 0.8:
                    # When the distance is exactly 1 the weight is doubled for equal tokens.
                    if dist == 1:
                        dist *= 2
                    total_distance += dist
                    can_insert = True
        if total_distance >= 0.9 and can_insert:
            filtered_courses.append((course, total_distance))
    return OrderedDict(map(lambda x: (x[0].get_id(), x[0]),
                           sorted(filtered_courses, key=lambda x: -x[1])))
def create_jw_blocks(self, list_of_lawyers):
    """
    Receives a list of blocks, where a block is a list of lawyers that all begin
    with the same letter. Within each block, does a pairwise Jaro-Winkler
    comparison to block lawyers together.
    """
    consumed = defaultdict(int)
    print('Doing pairwise Jaro-Winkler...', len(list_of_lawyers), flush=True)
    for i, primary in enumerate(list_of_lawyers):
        if consumed[primary]:
            continue
        consumed[primary] = 1
        self.blocks[primary].append(primary)
        for secondary in list_of_lawyers[i:]:
            if consumed[secondary]:
                continue
            if primary == secondary:
                self.blocks[primary].append(secondary)
                continue
            if jaro_winkler(primary, secondary, 0.0) >= float(self.THRESHOLD):
                consumed[secondary] = 1
                self.blocks[primary].append(secondary)
    pickle.dump(self.blocks,
                open(self.disambig_folder + '/' + 'lawyer.pickle', 'wb'))
    print('lawyer blocks created!', flush=True)
def _response_false(self, triple):
    qRelation = ("match (n:Entity {name:" + "'" + str(triple.head.name) +
                 "'})-[r]->(m:Entity) return r.name, m.name")
    results = self._execute(qRelation)
    dictRelationScore = {}
    dictRelationEntity = {}
    for r in results.values("r.name", "m.name"):
        dictRelationEntity[r[0]] = r[1]
        score = textdistance.jaro_winkler(triple.relation.name, r[0])
        dictRelationScore[r[0]] = score
    if len(dictRelationScore) == 0:
        print("Answer: We do not have this news\n")
        print("Triple extracted: " + str(triple.head.name) + " " +
              str(triple.relation.name) + " " + str(triple.tail.name) + "\n")
        print("===============================================================\n")
    else:
        print("Answer: We do not have this news, maybe you want to know something like:\n")
        print("Triple extracted: " + str(triple.head.name) + " " +
              str(triple.relation.name) + " " + str(triple.tail.name) + "\n")
        dictRelationScore = {k: v for k, v in sorted(dictRelationScore.items(),
                                                     key=lambda item: item[1],
                                                     reverse=True)}
        count = 0
        for r, s in dictRelationScore.items():
            print(">>> " + str(triple.head.name) + " " + str(r) + " " +
                  str(dictRelationEntity.get(r)))
            print("relation has similarity: ", s)
            print("\n")
            count = count + 1
            if count == 3:
                break
        print("===============================================================\n")
def execute():
    if request.method == "POST":
        if request.form:
            answer = ("Cloud computing allows consumers and businesses to use "
                      "applications without installation and access their personal "
                      "files at any computer with internet access. Cloud-based "
                      "services are ideal for businesses with growing or fluctuating "
                      "bandwidth demands.")
            text = request.form.get("text")
            tool = language_check.LanguageTool("en-US")
            matches = tool.check(text)
            language_check.correct(text, matches)
            # Grammar marks: fewer LanguageTool matches earn more points.
            # The larger count is checked first so that both branches are reachable.
            mark = 0
            if len(matches) == 0:
                mark = mark + 5
            elif len(matches) > 10:
                mark = mark + 3
            elif len(matches) > 5:
                mark = mark + 4
            # Context based comparison
            allWords = nltk.tokenize.word_tokenize(text)
            stopwords = nltk.corpus.stopwords.words("english")
            words = []
            point = [",", ".", ";", "'", ""]
            for w in allWords:
                if w not in stopwords:
                    if w not in point:
                        words.append(w)
            allWordDist = nltk.FreqDist(w.lower() for w in words)
            allWordExceptStopDist = nltk.FreqDist(
                w.lower() for w in allWordDist if w not in stopwords)
            mostCommon = allWordDist.most_common()
            allWord = nltk.tokenize.word_tokenize(answer)
            stopwords = nltk.corpus.stopwords.words("english")
            word = []
            point = [",", ".", ";", "'", "-"]
            for w in allWord:
                if w not in allWordDist:
                    if w not in point:
                        word.append(w)
            mark1 = 0
            length = len(allWord)
            if length > 15:
                for a in allWord:
                    for r in allWordDist:
                        dist1 = float(textdistance.jaro_winkler(a, r))
                        dist = round(dist1, 2)
                        if dist > 0.500:
                            mark1 = mark1 + 1
                            break
                marks2 = int(mark1 / length * 5)
                total = mark + marks2
            else:
                total = 0
            return render_template("result.html", total=total)
def cached_text_distance(s1, s2, cache, dry_run):
    key = (s1, s2)
    if dry_run:
        distance = None
        cache[key] = distance
    elif key not in cache:
        distance = textdistance.jaro_winkler(s1, s2)
        cache[key] = distance
    else:
        distance = cache[key]
    return distance
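# Usage sketch (illustrative): the cache is assumed to be a plain dict owned by
# the caller. The first call computes the Jaro-Winkler similarity and stores it;
# the second call returns the cached value; a dry run only reserves the key.
cache = {}
first = cached_text_distance("martha", "marhta", cache, dry_run=False)
second = cached_text_distance("martha", "marhta", cache, dry_run=False)  # served from cache
cached_text_distance("dwayne", "duane", cache, dry_run=True)             # stores None, computes nothing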
def answer(self, msg):
    if not self.active or self.correct:
        return
    if not self.correct:
        for ans in self.a:
            ans = " ".join(ans.split()).strip().lower()
            guess = " ".join(msg.args[1].split()).strip().lower()
            if guess == ans:
                self.correct = True
                break
            elif not self.correct:
                answer = self.clean(ans)
                guess = self.clean(guess)
                if not self.correct and guess == answer:
                    self.correct = True
                    break
                elif (not self.correct and self.flexibility < 1
                      and self.flexibility > 0.5):
                    dist = textdistance.jaro_winkler(guess, answer)
                    log.debug(
                        "Jeopardy: guess: {0}, answer: {1}, length: {2}, "
                        "distance: {3}, flexibility: {4}".format(
                            guess, answer, len(answer), dist, self.flexibility))
                    if dist >= self.flexibility:
                        self.correct = True
                        break
                    elif (dist < self.flexibility and "," in self.a[0]
                          or "&" in self.a[0]):
                        dist = textdistance.jaccard(guess, answer)
                        if dist >= self.flexibility:
                            self.correct = True
                            break
    if self.correct:
        if msg.nick not in self.scores:
            self.scores[msg.nick] = 0
        self.scores[msg.nick] += self.p
        if msg.nick not in self.roundscores:
            self.roundscores[msg.nick] = 0
        self.roundscores[msg.nick] += self.p
        self.unanswered = 0
        reply = self.correct_template.render(
            nick=msg.nick,
            answer=self.a[0],
            points=self.p,
            round=self.roundscores[msg.nick],
            total=self.scores[msg.nick],
        )
        self.reply(reply)
        self.correct = True
        self.answered += 1
        self.clear()
        self.newquestion()
def checkIfTypo(self, name1, name2):
    # normalizedHammingDistance = textdistance.hamming.normalized_similarity(name1, name2)
    # levenshteinDistance = textdistance.levenshtein.normalized_similarity(name1, name2)
    jaroWinkler = textdistance.jaro_winkler(name1, name2)
    # avgSimilarityScore = (normalizedHammingDistance + levenshteinDistance + jaroWinkler) / 3
    # if avgSimilarityScore > 0.78:
    if jaroWinkler > 0.92:
        return True
    else:
        return False
def address_similarity_scorer(a: str, b: str):
    """Compares two address strings, returns 1 if they match, 0 otherwise.

    Uses the Jaro-Winkler Distance Algorithm (JWDA)
    (en.wikipedia.org/wiki/Jaro–Winkler_distance). The JWDA "measurement scale
    is 0.0 to 1.0, where 0.0 is the least likely and 1.0 is a positive match."
    """
    jaro = textdistance.jaro_winkler(a, b)
    if jaro > 0.9:
        return 1
    else:
        return 0
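# Illustrative calls (exact scores depend on textdistance's Jaro-Winkler implementation):
print(address_similarity_scorer("12 Main Street", "12 Main Stret"))    # expected: 1 (typo only)
print(address_similarity_scorer("12 Main Street", "740 Park Avenue"))  # expected: 0 (unrelated)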
def key_input_search(key):
    max_score = 0.0
    index = 0
    l = len(titles)
    for i in range(l):
        if not pd.isna(titles[str(i)]):
            score = textdistance.jaro_winkler(titles[str(i)].lower(), key.lower())
            if score > max_score:
                max_score = score
                index = i
    return joblabel[str(index)]
def three_recommended_items(request):
    all_products = Product.objects.filter(disponible=True)
    user_products = Product.objects.filter(user__email=request.user.email)
    all_products = all_products.difference(user_products)
    all_products_names = []
    for p in all_products:
        all_products_names.append(p.name)
    user_products_names = []
    for p in user_products:
        user_products_names.append(p.name)
    if len(all_products_names) < 3 or len(user_products_names) < 1:
        return 0

    import textdistance

    # Seed the candidate list with the first three available products and their scores.
    list = [[user_products_names[0], all_products_names[0],
             round(textdistance.jaro_winkler(user_products_names[0], all_products_names[0]), 4)],
            [user_products_names[0], all_products_names[1],
             round(textdistance.jaro_winkler(user_products_names[0], all_products_names[1]), 4)],
            [user_products_names[0], all_products_names[2],
             round(textdistance.jaro_winkler(user_products_names[0], all_products_names[2]), 4)]]

    # Jaro-Winkler distance is a measure of edit distance which gives more similar
    # measures to words in which the beginning characters match.
    for i in all_products_names:
        for j in user_products_names:
            d = round(textdistance.jaro_winkler(i, j), 4)
            m = min([t[2] for t in list])
            if d > m:
                l = [j, i, d]
                for k in range(3):
                    if m == list[k][2]:
                        list[k] = l
                        break

    from django.db.models import Q
    list = Product.objects.filter(Q(name=list[0][1]) | Q(name=list[1][1]) | Q(name=list[2][1]),
                                  disponible=True)
    return list
def calculate_distances(job1, job2):
    sensors = list(job1.keys())
    assert sensors == list(job2.keys())
    assert sensors == config.SENSORS_LIST
    distances = []
    for sensor in sensors:
        string1, string2 = job1[sensor], job2[sensor]
        # Here we can try various string distance methods to see which works better
        distance = textdistance.jaro_winkler(string1, string2)
        # distance = textdistance.levenshtein(string1, string2)
        distances.append(distance)
    return distances
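# Usage sketch: the sensor names below are made up; in the real code the keys
# must match config.SENSORS_LIST exactly or the assertions above will fail.
job1 = {"temp": "aabbccdd", "gyro": "xyzxyz"}
job2 = {"temp": "aabccdd", "gyro": "xyzzyx"}
distances = calculate_distances(job1, job2)  # one Jaro-Winkler score per sensor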
def last_name_similarity_scorer(a: str, b: str):
    """Compares two last name strings, returns 1 if they match, 0 otherwise.

    Uses the Jaro-Winkler Distance Algorithm (JWDA)
    (en.wikipedia.org/wiki/Jaro–Winkler_distance). The JWDA "measurement scale
    is 0.0 to 1.0, where 0.0 is the least likely and 1.0 is a positive match.
    For our purposes, anything below a 0.8 is not considered useful."
    (source: SAP blog)
    """
    jaro = textdistance.jaro_winkler(a, b)
    if jaro > 0.8:
        return 1
    else:
        return 0
def match_jaro_winkler(ee, platforms):
    for index, row in ee.iterrows():
        if row.possible_stops == '':
            subset = platforms[platforms.routes_wkd.str.contains(row.line)]
            if subset.shape[0] > 0:
                subset_stop_names = pd.DataFrame(subset.stop_name.unique(), columns=['stop_name'])
                name_dist = [textdistance.jaro_winkler(row.station_name, y)
                             for y in subset_stop_names.stop_name]
                matched_station_name = subset_stop_names.iloc[np.argmax(name_dist), 0]
                matched_stop_ids = subset[subset.stop_name == matched_station_name][['stop_id']]
                score = max(name_dist)
                if score > 0.79:
                    ee.loc[index, 'possible_stops'] = ', '.join(matched_stop_ids.stop_id)
    return ee
def compare(self, str1, str2):
    if self.debug:
        self.log("jaro winkler comparison")
    self.start_time()
    self.result.distance = jaro_winkler(str1, str2)
    self.end_time()
    self.result.nos = max(len(str1), len(str2))
    self.result.threshold = 90
    self.result.similarity = (1 - self.result.distance) * 100
    return self.result
def brands_custom_distance(row):
    """
    Calculates a distance score between two sentences, in this case elc_brand
    and brand. The score is between 0 and 1, 1 being a good match.
    """
    # jaccard = textdistance.jaccard(str(row['brand']).lower().replace('.', '').replace('&', 'and'),
    #                                str(row['elc_brand']).lower().replace('.', '').replace('é', 'e'))
    jaro = textdistance.jaro_winkler(
        str(row['Brand']).lower().replace('.', '').replace('&', 'and'),
        str(row['ELC_Brand']).lower().replace('.', '').replace('é', 'e'))
    try:
        fuzzi = fuzz.partial_ratio(
            str(row['Brand']).lower().replace('.', '').replace('&', 'and'),
            str(row['ELC_Brand']).lower().replace('.', '').replace('é', 'e')) / 100
    except ValueError:
        return jaro
    return np.average([fuzzi, jaro], weights=[0.4, 0.6])
def get_skill_header(headers, skills):
    list_of_skills = list()
    for header in headers:
        skills_weight = list()
        for No_of_skill in range(len(skills)):
            skills_weight.append(
                (header, textdistance.jaro_winkler(header, skills[No_of_skill])))
            # When the end of the skills list is reached, keep only the highest-probability pair.
            if No_of_skill == len(skills) - 1:
                list_of_skills.append(max(skills_weight, key=lambda x: x[1]))
    skill_element = max(list_of_skills, key=lambda x: x[1])
    skill = skill_element[0]
    headers.remove(skill)
    return skill
def item_description_custom_distance(row):
    """
    Calculates a distance score between two sentences, in this case
    item_description and product. The score is between 0 and 1, 1 being a good match.
    """
    item_description = str(row['Item_Description']).lower().replace(
        'clinique', '').replace('origins', '').replace('tom ford', '').replace(
        'la mer', '').replace('estee lauder', '').replace('mac', '').replace(
        'bb', '').replace('bobbi', '').replace('brown', '')
    product = str(row['Product']).lower()
    jaro = textdistance.jaro_winkler(item_description, product)
    try:
        fuzzi = fuzz.partial_ratio(item_description, product) / 100
    except ValueError:
        return jaro
    return np.average([fuzzi, jaro], weights=[0.95, 0.05])
def subcategory_custom_distance(row):
    """
    Calculates a distance score between two sentences, in this case sub_category
    against elc_solution_type plus item_description. The score is between 0 and 1,
    1 being a good match.
    """
    jaro = textdistance.jaro_winkler(
        str(row['Sub_Category']).lower(),
        str(row['ELC_Solution_Type']).lower() + ' ' +
        str(row['Item_Description']).lower().replace('.', '').replace('&', 'and'))
    try:
        fuzzi = fuzz.partial_ratio(
            str(row['Sub_Category']).lower(),
            str(row['ELC_Solution_Type']).lower() + ' ' +
            str(row['Item_Description']).lower().replace('.', '').replace('&', 'and')) / 100
    except ValueError:
        return jaro
    return np.average([fuzzi, jaro], weights=[0.95, 0.05])
def get_otherHeaders(headers, otherHeaders):
    list_of_chosen_Headers = set()
    list_of_otherHeaders = list()
    for header in headers:
        otherHeaders_weight = list()
        for No_of_otherH in range(len(otherHeaders)):
            otherHeaders_weight.append(
                (header, textdistance.jaro_winkler(header, otherHeaders[No_of_otherH])))
            if No_of_otherH == len(otherHeaders) - 1:
                list_of_otherHeaders.append(
                    max(otherHeaders_weight, key=lambda y: y[1]))
    list_of_otherHeaders.sort(key=lambda y: y[1], reverse=True)
    for header in list_of_otherHeaders:
        if header[1] > 0.7:
            list_of_chosen_Headers.add(header[0])
    return list_of_chosen_Headers
def search_trigrams(query):
    subset = set()
    tri_search = [query[i:i + 3] for i in range(len(query) - 2)]
    # A union between all sets is constructed here, but this operation
    # could be altered depending on the substring submitted or where
    # the substring was found in the string, potentially handling high-
    # frequency substrings or substrings occurring word-initially
    # differently.
    for item in tri_search:
        if trigrams.get(item) is not None:
            subset.update(trigrams.get(item))
    # Jaro-Winkler is a good edit distance metric choice to use with trigrams, as it
    # favors strings which match from the beginning.
    # Place matches in objects so that they can be sorted later using attrgetter.
    matches = [Match(bands[item], jaro_winkler(query, item)) for item in subset]
    return sorted(matches, key=attrgetter("score"), reverse=True)[:NUM_RESULTS]
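# A sketch of how the module-level `trigrams` index used above might be built;
# the `bands` mapping, the `Match` class, and NUM_RESULTS are taken as given,
# and this helper name is illustrative rather than from the original source.
from collections import defaultdict

def build_trigram_index(names):
    """Map every 3-character substring to the set of names containing it."""
    index = defaultdict(set)
    for name in names:
        for i in range(len(name) - 2):
            index[name[i:i + 3]].add(name)
    return index

# e.g. trigrams = build_trigram_index(bands.keys())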
def recommend():
    wrd = str(request.form.get('word'))
    message = "Displaying recommendations for " + wrd + ":\n"
    if wrd not in indices['movie_title'].unique():
        temp = indices
        temp['movie_name_distance'] = indices.apply(
            lambda row: td.jaro_winkler(row['movie_title'], wrd), axis=1)
        wrd = temp.sort_values('movie_name_distance', ascending=False).iloc[0, 0]
        message = "Couldn't find the input movie, displaying recommendations for " + wrd + ":<br/>"
    idx = int(indices[indices['movie_title'] == wrd][0])
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    recommendations = data_api['movie_title'].iloc[movie_indices].str.cat(sep='<br/>')
    # return message + recommendations
    # return render_template('index.html',
    #                        prediction='Recommended movies {}'.format(json.dumps(message + recommendations)))
    return render_template('index.html', prediction=message + recommendations)
def run_distance_matching(company_name, patstat_name, elastic_score, query):
    """ run_distance_matching """
    ratio = fuzz.token_sort_ratio(company_name.lower(), patstat_name.lower())
    jaro_winkler_score = textdistance.jaro_winkler(
        company_name.lower(), patstat_name.lower())
    name_length = len(company_name.split())
    if name_length > 5:
        elastic_score -= 10
    distance_score = score.calculate_distance_score(ratio, jaro_winkler_score, name_length)
    pam_score = score.pam_score(query, elastic_score, distance_score)
    return {
        'levensthein_score': pam_score,
        'jaro_winkler_score': jaro_winkler_score,
        'pam_score': pam_score
    }
def compute_pairwise(documents):
    results = []
    for outer_idx, outer_doc in enumerate(documents):
        for inner_idx, inner_doc in enumerate(documents):
            if ((outer_idx < inner_idx)
                    and (outer_doc.owner != inner_doc.owner)
                    and (outer_doc.title != 'New document')
                    and (inner_doc.title != 'New document')
                    and (not outer_doc.title.lower().startswith('test'))
                    and (not inner_doc.title.lower().startswith('test'))):
                similarity = textdistance.jaro_winkler(outer_doc.title, inner_doc.title)
                if similarity > 0.9:
                    results.append({
                        'doc_a': outer_doc,
                        'doc_b': inner_doc,
                        'score': similarity
                    })
    return results
def similarity(type, a, b):
    """
    String similarity metrics

    input:
        type: hamming (similarity type)
        a: John (string 1)
        b: John Snow (string 2)
    output:
        0.73 (similarity score)
    """
    if type == 'hamming':
        return textdistance.hamming.normalized_similarity(a, b)
    elif type == 'levenshtein':
        return textdistance.levenshtein.normalized_similarity(a, b)
    elif type == 'jaro_winkler':
        return textdistance.jaro_winkler(a, b)
    elif type == 'jaccard':
        tokens_1 = a.split()
        tokens_2 = b.split()
        return textdistance.jaccard(tokens_1, tokens_2)
    elif type == 'sorensen':
        tokens_1 = a.split()
        tokens_2 = b.split()
        return textdistance.sorensen(tokens_1, tokens_2)
    elif type == 'ratcliff_obershelp':
        return textdistance.ratcliff_obershelp(a, b)
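# Example calls (values are approximate and depend on the textdistance version):
print(similarity("jaro_winkler", "dwayne", "duane"))    # ~0.84
print(similarity("levenshtein", "kitten", "sitting"))   # ~0.57
print(similarity("jaccard", "John Snow", "Snow John"))  # 1.0 -- identical token sets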
def find():
    ans = []
    conn = sqlite3.connect('FOR.db')
    c = conn.cursor()
    # c.execute('delete from FORK')
    conn.commit()

    # 1xBET vs OLIMPBET
    BET1x = c.execute('select * from BET1x')
    for i in BET1x.fetchall():
        # print('ok')
        xdata = i[0]
        xuakity = i[1]
        xopp1 = i[3]
        xopp2 = i[4]
        xbet = xdata + ',' + xuakity + ',' + xopp1 + ',' + xopp2
        xp1 = i[5]
        xp2 = i[6]
        OLIMP = c.execute('select * from OLIMPBET')
        for o in OLIMP.fetchall():
            odata = o[0]
            otime = o[1]
            league = o[2]
            oopp1 = o[3]
            oopp2 = o[4]
            obet = odata + ',' + otime + ',' + oopp1 + ',' + oopp2
            obet2 = odata + ',' + otime + ',' + oopp2 + ',' + oopp1
            op1 = o[5]
            op2 = o[6]
            uksastik = textdistance.jaro_winkler(xbet.lower().strip(), obet.lower().strip())
            uksastik2 = textdistance.jaro_winkler(xbet.lower().strip(), obet2.lower().strip())
            if uksastik >= 0.97 and uksastik > uksastik2:
                print('1xBET - OLIMPBET', end=' ')
                print(xbet, end=' ')
                print(obet, end=' ')
                print(uksastik, end=' ')
                print(uksastik2, end=' ')
                orta1 = 1 / float(xp1) + 1 / float(op2)
                orta2 = 1 / float(op1) + 1 / float(xp2)
                if (1 / float(xp1) + 1 / float(op2) < 1.0) or (1 / float(op1) + 1 / float(xp2) < 1.0):
                    print("Yes", end=' ')
                    print(obet)
                    print('--------------------------------------------')
                    if (1 / float(xp1) + 1 / float(op2) < 1.0) and \
                            (1 / float(xp1) + 1 / float(op2) < (1 / float(xp2) + 1 / float(op1))):
                        A1 = (1 / float(xp1) / orta1) * 100
                        A1 = int(float(A1) * 1000) / 1000
                        ans.append([
                            odata, otime, league, oopp1, oopp2, '1xBET', xp1, xp2,
                            'OLIMPBET', op1, op2,
                            '1xBET ' + xopp1, str(A1) + '%',
                            str(float(A1) * float(xp1) - 100) + '%',
                            'OLIMPBET ' + oopp2, str(100 - A1) + '%',
                            str(float(100 - A1) * float(op2) - 100) + '%'
                        ])
                    elif (1 / float(xp2) + 1 / float(op1) < 1.0) and \
                            (1 / float(xp2) + 1 / float(op1) < (1 / float(xp1) + 1 / float(op2))):
                        A2 = (1 / float(op1) / orta2) * 100
                        A2 = int(float(A2) * 1000) / 1000
                        ans.append([
                            odata, otime, league, oopp1, oopp2, '1xBET', xp1, xp2,
                            'OLIMPBET', op1, op2,
                            'OLIMPBET ' + oopp1, str(A2) + '%',
                            str(float(A2) * float(op1) - 100) + '%',
                            '1xBET ' + xopp2, str(100 - A2) + '%',
                            str(float(100 - A2) * float(xp2) - 100) + '%'
                        ])
                else:
                    print("No")
            elif uksastik < uksastik2 and uksastik2 >= 0.97:
                print('1xBET - OLIMPBET', end=' ')
                print(xbet, end=' ')
                print(obet, end=' ')
                print(uksastik, end=' ')
                print(uksastik2, end=' ')
                orta1 = 1 / float(xp1) + 1 / float(op1)
                orta2 = 1 / float(op2) + 1 / float(xp2)
                if (1 / float(xp1) + 1 / float(op1) < 1.0) or (1 / float(op2) + 1 / float(xp2) < 1.0):
                    print("Yes", end=' ')
                    print(obet2)
                    print('--------------------------------------------')
                    if (1 / float(xp1) + 1 / float(op1) < 1.0) and \
                            (1 / float(xp1) + 1 / float(op1) < (1 / float(xp2) + 1 / float(op2))):
                        A1 = (1 / float(xp1) / orta1) * 100
                        A1 = int(float(A1) * 1000) / 1000
                        ans.append([
                            odata, otime, league, oopp1, oopp2, '1xBET', xp1, xp2,
                            'OLIMPBET', op1, op2,
                            '1xBET ' + xopp1, str(A1) + '%',
                            str(float(A1) * float(xp1) - 100) + '%',
                            'OLIMPBET ' + oopp1, str(100 - A1) + '%',
                            str(float(100 - A1) * float(op1) - 100) + '%'
                        ])
                    elif (1 / float(xp2) + 1 / float(op2) < 1.0) and \
                            (1 / float(xp2) + 1 / float(op2) < (1 / float(xp1) + 1 / float(op1))):
                        A2 = (1 / float(op2) / orta2) * 100
                        A2 = int(float(A2) * 1000) / 1000
                        ans.append([
                            odata, otime, league, oopp1, oopp2, '1xBET', xp1, xp2,
                            'OLIMPBET', op1, op2,
                            'OLIMPBET ' + oopp2, str(A2) + '%',
                            str(float(A2) * float(op2) - 100) + '%',
                            '1xBET ' + xopp2, str(100 - A2) + '%',
                            str(float(100 - A2) * float(xp2) - 100) + '%'
                        ])
                else:
                    print("No")
            else:
                continue

    # 1xBET vs FONBET
    BET1x = c.execute('select * from BET1x')
    for i in BET1x.fetchall():
        xdata = i[0]
        xuakity = i[1]
        xopp1 = i[3]
        xopp2 = i[4]
        xbet = xdata + ',' + xuakity + ',' + xopp1 + ',' + xopp2
        xp1 = i[5]
        xp2 = i[6]
        # print('ok')
        FONBET = c.execute('select * from FONBET')
        for o in FONBET.fetchall():
            odata = o[0]
            otime = o[1]
            league = o[2]
            oopp1 = o[3]
            oopp2 = o[4]
            obet = odata + ',' + otime + ',' + oopp1 + ',' + oopp2
            obet2 = odata + ',' + otime + ',' + oopp2 + ',' + oopp1
            op1 = o[5]
            op2 = o[6]
            uksastik = textdistance.jaro_winkler(xbet.lower().strip(), obet.lower().strip())
            uksastik2 = textdistance.jaro_winkler(xbet.lower().strip(), obet2.lower().strip())
            if uksastik > uksastik2 and uksastik >= 0.97:
                print('1xBET - FONBET', end=' ')
                print(xbet, end=' ')
                print(obet, end=' ')
                print(uksastik, end=' ')
                print(uksastik2, end=' ')
                orta1 = 1 / float(xp1) + 1 / float(op2)
                orta2 = 1 / float(op1) + 1 / float(xp2)
                if (1 / float(xp1) + 1 / float(op2) < 1.0) or (1 / float(op1) + 1 / float(xp2) < 1.0):
                    print("Yes", end=' ')
                    print(obet)
                    print('--------------------------------------------')
                    if (1 / float(xp1) + 1 / float(op2) < 1.0) and \
                            (1 / float(xp1) + 1 / float(op2) < (1 / float(xp2) + 1 / float(op1))):
                        A1 = (1 / float(xp1) / orta1) * 100
                        A1 = int(float(A1) * 1000) / 1000
                        ans.append([
                            odata, otime, league, oopp1, oopp2, '1xBET', xp1, xp2,
                            'FONBET', op1, op2,
                            '1xBET ' + xopp1, str(A1) + '%',
                            str(float(A1) * float(xp1) - 100) + '%',
                            'FONBET ' + oopp2, str(100 - A1) + '%',
                            str(float(100 - A1) * float(op2) - 100) + '%'
                        ])
                    elif (1 / float(xp2) + 1 / float(op1) < 1.0) and \
                            (1 / float(xp2) + 1 / float(op1) < (1 / float(xp1) + 1 / float(op2))):
                        A2 = (1 / float(op1) / orta2) * 100
                        A2 = int(float(A2) * 1000) / 1000
                        ans.append([
                            odata, otime, league, oopp1, oopp2, '1xBET', xp1, xp2,
                            'FONBET', op1, op2,
                            'FONBET ' + oopp1, str(A2) + '%',
                            str(float(A2) * float(op1) - 100) + '%',
                            '1xBET ' + xopp2, str(100 - A2) + '%',
                            str(float(100 - A2) * float(xp2) - 100) + '%'
                        ])
                        print("Yes")
                else:
                    print(xbet, end=' ')
                    print(obet, end=' ')
                    print("No")
            else:
                if uksastik < uksastik2 and uksastik2 >= 0.97:
                    print('1xBET - FONBET', end=' ')
                    print(xbet, end=' ')
                    print(obet, end=' ')
                    print(uksastik, end=' ')
                    print(uksastik2, end=' ')
                    orta1 = 1 / float(xp1) + 1 / float(op1)
                    orta2 = 1 / float(op2) + 1 / float(xp2)
                    if (1 / float(xp1) + 1 / float(op1) < 1.0) or (1 / float(op2) + 1 / float(xp2) < 1.0):
                        print("Yes", end=' ')
                        print(obet2)
                        print('--------------------------------------------')
                        if (1 / float(xp1) + 1 / float(op1) < 1.0) and \
                                (1 / float(xp1) + 1 / float(op1) < (1 / float(xp2) + 1 / float(op2))):
                            A1 = (1 / float(xp1) / orta1) * 100
                            A1 = int(float(A1) * 1000) / 1000
                            ans.append([
                                odata, otime, league, xopp1, xopp2, '1xBET', xp1, xp2,
                                'FONBET', op1, op2,
                                '1xBET ' + xopp1, str(A1) + '%',
                                str(float(A1) * float(xp1) - 100) + '%',
                                'FONBET ' + oopp1, str(100 - A1) + '%',
                                str(float(100 - A1) * float(op1) - 100) + '%'
                            ])
                        elif (1 / float(xp2) + 1 / float(op2) < 1.0) and \
                                (1 / float(xp2) + 1 / float(op2) < (1 / float(xp1) + 1 / float(op1))):
                            A2 = (1 / float(op2) / orta2) * 100
                            A2 = int(float(A2) * 1000) / 1000
                            ans.append([
                                odata, otime, league, xopp1, xopp2, '1xBET', xp1, xp2,
                                'FONBET', op1, op2,
                                'FONBET ' + oopp2, str(A2) + '%',
                                str(float(A2) * float(op2) - 100) + '%',
                                '1xBET ' + xopp2, str(100 - A2) + '%',
                                str(float(100 - A2) * float(xp2) - 100) + '%'
                            ])
                    else:
                        print("No")
                else:
                    continue

    # FONBET vs OLIMPBET
    FONBET = c.execute('select * from FONBET')
    for i in FONBET.fetchall():
        xdata = i[0]
        xuakity = i[1]
        xopp1 = i[3]
        xopp2 = i[4]
        xbet = xdata + ',' + xuakity + ',' + xopp1 + ',' + xopp2
        xp1 = i[5]
        xp2 = i[6]
        OLIMP = c.execute('select * from OLIMPBET')
        for o in OLIMP.fetchall():
            # print('ok')
            odata = o[0]
            otime = o[1]
            league = o[2]
            oopp1 = o[3]
            oopp2 = o[4]
            obet = odata + ',' + otime + ',' + oopp1 + ',' + oopp2
            obet2 = odata + ',' + otime + ',' + oopp2 + ',' + oopp1
            op1 = o[5]
            op2 = o[6]
            uksastik = textdistance.jaro_winkler(xbet.lower().strip(), obet.lower().strip())
            uksastik2 = textdistance.jaro_winkler(xbet.lower().strip(), obet2.lower().strip())
            if uksastik > uksastik2 and uksastik >= 0.97:
                print('FONBET - OLIMPBET', end=' ')
                print(xbet, end=' ')
                print(obet, end=' ')
                print(uksastik, end=' ')
                print(uksastik2, end=' ')
                orta1 = 1 / float(xp1) + 1 / float(op2)
                orta2 = 1 / float(op1) + 1 / float(xp2)
                if (1 / float(xp1) + 1 / float(op2) < 1.0) or (1 / float(op1) + 1 / float(xp2) < 1.0):
                    print("Yes", end=' ')
                    print(obet)
                    print('--------------------------------------------')
                    if (1 / float(xp1) + 1 / float(op2) < 1.0) and \
                            (1 / float(xp1) + 1 / float(op2) < (1 / float(xp2) + 1 / float(op1))):
                        A1 = (1 / float(xp1) / orta1) * 100
                        A1 = int(float(A1) * 1000) / 1000
                        ans.append([
                            odata, otime, league, oopp1, oopp2, 'FONBET', xp1, xp2,
                            'OLIMPBET', op1, op2,
                            'FONBET ' + xopp1, str(A1) + '%',
                            str(float(A1) * float(xp1) - 100) + '%',
                            'OLIMPBET ' + oopp2, str(100 - A1) + '%',
                            str(float(100 - A1) * float(op2) - 100) + '%'
                        ])
                    elif (1 / float(xp2) + 1 / float(op1) < 1.0) and \
                            (1 / float(xp2) + 1 / float(op1) < (1 / float(xp1) + 1 / float(op2))):
                        A2 = (1 / float(op1) / orta2) * 100
                        A2 = int(float(A2) * 1000) / 1000
                        ans.append([
                            odata, otime, league, oopp1, oopp2, 'FONBET', xp1, xp2,
                            'OLIMPBET', op1, op2,
                            'OLIMPBET ' + oopp1, str(A2) + '%',
                            str(float(A2) * float(op1) - 100) + '%',
                            'FONBET ' + xopp2, str(100 - A2) + '%',
                            str(float(100 - A2) * float(xp2) - 100) + '%'
                        ])
                else:
                    print("No")
            if uksastik < uksastik2 and uksastik2 >= 0.97:
                print('FONBET - OLIMPBET', end=' ')
                print(xbet, end=' ')
                print(obet, end=' ')
                print(uksastik, end=' ')
                print(uksastik2, end=' ')
                orta1 = 1 / float(xp1) + 1 / float(op1)
                orta2 = 1 / float(op2) + 1 / float(xp2)
                if (1 / float(xp1) + 1 / float(op1) < 1.0) or (1 / float(op2) + 1 / float(xp2) < 1.0):
                    print("Yes", end=' ')
                    print(obet2)
                    print('--------------------------------------------')
                    if (1 / float(xp1) + 1 / float(op1) < 1.0) and \
                            (1 / float(xp1) + 1 / float(op1) < (1 / float(xp2) + 1 / float(op2))):
                        A1 = (1 / float(xp1) / orta1) * 100
                        A1 = int(float(A1) * 1000) / 1000
                        ans.append([
                            odata, otime, league, oopp1, oopp2, 'FONBET', xp1, xp2,
                            'OLIMPBET', op1, op2,
                            'FONBET ' + xopp1, str(A1) + '%',
                            str(float(A1) * float(xp1) - 100) + '%',
                            'OLIMPBET ' + oopp1, str(100 - A1) + '%',
                            str(float(100 - A1) * float(op1) - 100) + '%'
                        ])
                    elif (1 / float(xp2) + 1 / float(op2) < 1.0) and \
                            (1 / float(xp2) + 1 / float(op2) < (1 / float(xp1) + 1 / float(op1))):
                        A2 = (1 / float(op2) / orta2) * 100
                        A2 = int(float(A2) * 1000) / 1000
                        ans.append([
                            odata, otime, league, oopp1, oopp2, 'FONBET', xp1, xp2,
                            'OLIMPBET', op1, op2,
                            'FONBET ' + xopp2, str(A2) + '%',
                            str(float(A2) * float(xp2) - 100) + '%',
                            'OLIMPBET ' + oopp2, str(100 - A2) + '%',
                            str(float(100 - A2) * float(op2) - 100) + '%'
                        ])
                else:
                    print("No")
            else:
                continue
            # time.sleep(1)

    print(ans)
    import csv
    with open('FORK.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(ans)
    return ans
    # for a in ans:
    #     c.execute('''INSERT INTO FORK (data, time, league, opp1, opp2, bet1, coef1_bet1,
    #                  coef2_bet1, bet2, coef1_bet2, coef2_bet2, win_opp1,
    #                  percent_of_your_money_for_win_opp1, profit1, win_opp2,
    #                  percent_of_your_money_for_win_opp2, profit2)
    #                  VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)''',
    #               (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9], a[10],
    #                a[11], a[12], a[13], a[14], a[15], a[16]))
    # conn.commit()
    # conn.close()
    # print('ok')
# In[11]:

textdistance.levenshtein.normalized_similarity('arrow', 'arow')

# # Jaro Winkler
#
# https://itnext.io/string-similarity-the-basic-know-your-algorithms-guide-3de3d7346227

# In[12]:

textdistance.jaro_winkler("mes", "messi")

# In[13]:

textdistance.jaro_winkler("crate", "crat")

# In[14]:

textdistance.jaro_winkler("crate", "atcr")

# # Jaccard Index
def Seq_StringDistance(str_seq, str_ref, method="hamming"):
    if method == "hamming":
        return [textdistance.hamming(str_seq_i, str_ref) for str_seq_i in str_seq]
    elif method == "levenshtein":
        return [textdistance.levenshtein(str_seq_i, str_ref) for str_seq_i in str_seq]
    elif method == "damerau_lev":
        return [textdistance.damerau_levenshtein(str_seq_i, str_ref) for str_seq_i in str_seq]
    elif method == "j-winkler":
        return [textdistance.jaro_winkler(str_seq_i, str_ref) for str_seq_i in str_seq]
    elif method == "smith-waterman":
        return [textdistance.smith_waterman(str_seq_i, str_ref) for str_seq_i in str_seq]
    elif method == "jaccard":
        return [textdistance.jaccard(str_seq_i, str_ref) for str_seq_i in str_seq]
    elif method == "sorensen-dice":
        return [textdistance.sorensen_dice(str_seq_i, str_ref) for str_seq_i in str_seq]
    elif method == "tversky":
        return [textdistance.tversky(str_seq_i, str_ref) for str_seq_i in str_seq]
    elif method == "tanimoto":
        return [textdistance.tanimoto(str_seq_i, str_ref) for str_seq_i in str_seq]
    elif method == "cosine":
        return [textdistance.cosine(str_seq_i, str_ref) for str_seq_i in str_seq]
    elif method == "ratcliff":
        return [textdistance.ratcliff_obershelp(str_seq_i, str_ref) for str_seq_i in str_seq]
    elif method == "bwt":
        return [textdistance.bwtrle_ncd(str_seq_i, str_ref) for str_seq_i in str_seq]
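# Usage sketch with made-up sequences. Note that hamming/levenshtein return raw
# edit counts, while j-winkler and the token-based measures return similarities.
references = ["ACGTACGT", "ACGTTCGT", "TTTTACGT"]
print(Seq_StringDistance(references, "ACGTACGT", method="hamming"))    # edit counts per sequence
print(Seq_StringDistance(references, "ACGTACGT", method="j-winkler"))  # similarities in [0, 1]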