def WRatio(s1, s2):
    """Return an integer similarity score built from several ratio functions.

    Both inputs are normalized with ``utils.full_process``. If either
    normalized string fails ``utils.validate_string`` the score is 0.

    Otherwise the plain ``ratio`` is compared against:

    - token sort/set ratios scaled by 0.95, when the longer string is less
      than 1.5 times the length of the shorter one;
    - partial, partial token sort and partial token set ratios otherwise,
      scaled by 0.90 (0.6 when the length factor exceeds 8), with the
      token-based ones additionally scaled by 0.95.

    The largest candidate is truncated with ``int`` and returned.
    """
    p1 = utils.full_process(s1)
    p2 = utils.full_process(s2)
    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    token_weight = 0.95
    base = ratio(p1, p2)

    longest = max(len(p1), len(p2))
    shortest = min(len(p1), len(p2))
    len_ratio = float(longest) / shortest

    # Strings of similar length: partial matching is not attempted.
    if len_ratio < 1.5:
        candidates = [
            base,
            token_sort_ratio(p1, p2) * token_weight,
            token_set_ratio(p1, p2) * token_weight,
        ]
        return int(max(candidates))

    # One string is much shorter than the other: trust partials less.
    partial_weight = 0.6 if len_ratio > 8 else 0.90
    candidates = [
        base,
        partial_ratio(p1, p2) * partial_weight,
        partial_token_sort_ratio(p1, p2) * token_weight * partial_weight,
        partial_token_set_ratio(p1, p2) * token_weight * partial_weight,
    ]
    return int(max(candidates))
def WRatio(s1, s2):
    """Return an integer similarity score built from several ratio functions.

    The raw inputs are validated with ``utils.validate_string``, normalized
    with ``utils.full_process``, and then validated again. Any failed check
    yields a score of 0.

    Otherwise the plain ``ratio`` is compared against token sort/set ratios
    (scaled by 0.95) when the strings have similar length, or against the
    partial variants (scaled by 0.90, or 0.6 for very different lengths)
    when they do not. The largest candidate is truncated with ``int``.
    """
    if not utils.validate_string(s1):
        return 0
    if not utils.validate_string(s2):
        return 0

    p1 = utils.full_process(s1)
    p2 = utils.full_process(s2)

    # Re-validate after normalization: if processing leaves an empty string,
    # min(len(p1), len(p2)) below would be 0 and the division would raise
    # ZeroDivisionError.
    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # should we look at partials?
    try_partial = True
    unbase_scale = 0.95
    partial_scale = 0.90

    base = ratio(p1, p2)
    len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))

    # if strings are similar length, don't use partials
    if len_ratio < 1.5:
        try_partial = False

    # if one string is much much shorter than the other
    if len_ratio > 8:
        partial_scale = 0.6

    if try_partial:
        partial = partial_ratio(p1, p2) * partial_scale
        ptsor = partial_token_sort_ratio(p1, p2) * unbase_scale * partial_scale
        ptser = partial_token_set_ratio(p1, p2) * unbase_scale * partial_scale
        return int(max(base, partial, ptsor, ptser))

    tsor = token_sort_ratio(p1, p2) * unbase_scale
    tser = token_set_ratio(p1, p2) * unbase_scale
    return int(max(base, tsor, tser))
def QRatio(s1, s2):
    """Return ``ratio`` of the two strings after normalization.

    The raw inputs are validated with ``utils.validate_string``, normalized
    with ``utils.full_process``, and validated again. Returns 0 if any check
    fails, otherwise ``ratio(p1, p2)`` for the normalized strings.
    """
    if not utils.validate_string(s1):
        return 0
    if not utils.validate_string(s2):
        return 0

    p1 = utils.full_process(s1)
    p2 = utils.full_process(s2)

    # Validate the processed strings too, so that an input which normalizes
    # to something invalid (e.g. empty) scores 0 instead of reaching ratio().
    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    return ratio(p1, p2)
def QRatio(s1, s2, force_ascii=True):
    """Return ``ratio`` of the two strings after normalization.

    Each input is passed through ``utils.full_process`` with the given
    ``force_ascii`` flag. If either processed string fails
    ``utils.validate_string`` the result is 0; otherwise it is
    ``ratio(p1, p2)``.
    """
    p1, p2 = (
        utils.full_process(raw, force_ascii=force_ascii) for raw in (s1, s2)
    )

    if utils.validate_string(p1) and utils.validate_string(p2):
        return ratio(p1, p2)
    return 0
def _token_set(s1, s2, partial=True, force_ascii=True):
    """Score two strings by comparing their token sets.

    Find all whitespace-separated tokens in each processed string and:
        - treat them as a set
        - construct two strings of the form:
            <sorted_intersection><sorted_remainder>
        - take ratios of those strings
        - this controls for unordered partial matches

    Args:
        s1, s2: the strings to compare.
        partial: use ``partial_ratio`` when true, ``ratio`` otherwise.
        force_ascii: forwarded to ``utils.full_process``.

    Returns:
        0 if either processed string fails ``utils.validate_string``,
        otherwise the maximum of the three pairwise ratios.

    Raises:
        TypeError: if ``s1`` or ``s2`` is None.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # Tokenize the already-processed strings directly. Running full_process a
    # second time was redundant and dropped the caller's force_ascii choice.
    tokens1 = set(p1.split())
    tokens2 = set(p2.split())

    intersection = tokens1 & tokens2
    diff1to2 = tokens1 - tokens2
    diff2to1 = tokens2 - tokens1

    sorted_sect = " ".join(sorted(intersection))
    sorted_1to2 = " ".join(sorted(diff1to2))
    sorted_2to1 = " ".join(sorted(diff2to1))

    # strip() removes the stray separator left when either part is empty.
    combined_1to2 = (sorted_sect + " " + sorted_1to2).strip()
    combined_2to1 = (sorted_sect + " " + sorted_2to1).strip()
    sorted_sect = sorted_sect.strip()

    ratio_func = partial_ratio if partial else ratio

    pairwise = [
        ratio_func(sorted_sect, combined_1to2),
        ratio_func(sorted_sect, combined_2to1),
        ratio_func(combined_1to2, combined_2to1),
    ]
    return max(pairwise)
def _token_set(s1, s2, partial=True, force_ascii=True):
    """Score two strings by comparing their token sets.

    Find all whitespace-separated tokens in each processed string and:
        - treat them as a set
        - construct two strings of the form:
            <sorted_intersection><sorted_remainder>
        - take ratios of those strings
        - this controls for unordered partial matches

    Args:
        s1, s2: the strings to compare.
        partial: use ``partial_ratio`` when true, ``ratio`` otherwise.
        force_ascii: forwarded to ``utils.full_process``.

    Returns:
        0 if either processed string fails ``utils.validate_string``,
        otherwise the maximum of the three pairwise ratios.

    Raises:
        TypeError: if ``s1`` or ``s2`` is None.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # p1/p2 are already processed, so split them as they are. The previous
    # extra full_process call repeated the work without passing force_ascii.
    tokens1 = set(p1.split())
    tokens2 = set(p2.split())

    shared = tokens1.intersection(tokens2)
    only_in_1 = tokens1.difference(tokens2)
    only_in_2 = tokens2.difference(tokens1)

    sorted_sect = " ".join(sorted(shared))
    combined_1to2 = sorted_sect + " " + " ".join(sorted(only_in_1))
    combined_2to1 = sorted_sect + " " + " ".join(sorted(only_in_2))

    # Trim the separator that is left dangling when one side is empty.
    sorted_sect = sorted_sect.strip()
    combined_1to2 = combined_1to2.strip()
    combined_2to1 = combined_2to1.strip()

    if partial:
        ratio_func = partial_ratio
    else:
        ratio_func = ratio

    return max(
        ratio_func(sorted_sect, combined_1to2),
        ratio_func(sorted_sect, combined_2to1),
        ratio_func(combined_1to2, combined_2to1),
    )
def WRatio(s1, s2, force_ascii=True):
    """Return a measure of the sequences' similarity between 0 and 100,
    using different algorithms.

    Both inputs are normalized with ``utils.full_process`` (honouring
    ``force_ascii``); if either normalized string fails
    ``utils.validate_string`` the result is 0. Otherwise the plain ``ratio``
    is compared with scaled token-based ratios (similar lengths) or scaled
    partial ratios (dissimilar lengths), and the largest value is returned
    truncated with ``int``.
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)
    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    token_weight = 0.95
    base = ratio(p1, p2)

    longest = max(len(p1), len(p2))
    shortest = min(len(p1), len(p2))
    len_ratio = float(longest) / shortest

    # Strings of similar length: partial matching is not attempted.
    if len_ratio < 1.5:
        candidates = [
            base,
            token_sort_ratio(p1, p2, force_ascii=force_ascii) * token_weight,
            token_set_ratio(p1, p2, force_ascii=force_ascii) * token_weight,
        ]
        return int(max(candidates))

    # One string is much shorter than the other: trust partials less.
    partial_weight = 0.6 if len_ratio > 8 else 0.90
    candidates = [
        base,
        partial_ratio(p1, p2) * partial_weight,
        partial_token_sort_ratio(p1, p2, force_ascii=force_ascii)
        * token_weight * partial_weight,
        partial_token_set_ratio(p1, p2, force_ascii=force_ascii)
        * token_weight * partial_weight,
    ]
    return int(max(candidates))
def WRatio(s1, s2, force_ascii=True):
    """Return a measure of the sequences' similarity between 0 and 100,
    using different algorithms.

    Steps:
        1. Normalize both inputs via ``utils.full_process``.
        2. Return 0 if either normalized string fails
           ``utils.validate_string``.
        3. Compute the plain ``ratio`` and the length factor (longer length
           divided by shorter length).
        4. Below a factor of 1.5, also score with the token sort/set ratios
           (each scaled by 0.95); otherwise score with the partial variants
           (scaled by 0.90, or by 0.6 when the factor exceeds 8).
        5. Return the best score truncated with ``int``.
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    for processed in (p1, p2):
        if not utils.validate_string(processed):
            return 0

    unbase_scale = 0.95
    base = ratio(p1, p2)

    len1, len2 = len(p1), len(p2)
    len_ratio = float(max(len1, len2)) / min(len1, len2)

    use_partials = not len_ratio < 1.5
    if not use_partials:
        tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        best = max(base, tsor, tser)
    else:
        # A very short string against a very long one gets a heavier penalty.
        if len_ratio > 8:
            partial_scale = 0.6
        else:
            partial_scale = 0.90
        partial = partial_ratio(p1, p2) * partial_scale
        ptsor = (
            partial_token_sort_ratio(p1, p2, force_ascii=force_ascii)
            * unbase_scale
            * partial_scale
        )
        ptser = (
            partial_token_set_ratio(p1, p2, force_ascii=force_ascii)
            * unbase_scale
            * partial_scale
        )
        best = max(base, partial, ptsor, ptser)

    return int(best)
def _token_print_set(s1, s2, partial=True, force_ascii=True, full_process=True):
    """Print the shared and unshared token sets of two strings.

    The strings are optionally normalized with ``utils.full_process`` (when
    ``full_process`` is true), split on whitespace into token sets, and three
    sets are printed in order: the intersection, the tokens only in the first
    string, and the tokens only in the second string.

    Returns 0 if either (processed) string fails ``utils.validate_string``;
    otherwise falls off the end and returns None.

    NOTE(review): ``partial`` is never read, and the sorted/combined strings
    built at the end are never used or returned. This looks like an
    unfinished copy of a token-set scorer; confirm the intent.
    """
    if full_process:
        p1 = utils.full_process(s1, force_ascii=force_ascii)
        p2 = utils.full_process(s2, force_ascii=force_ascii)
    else:
        p1, p2 = s1, s2

    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    # pull tokens
    tokens1 = set(p1.split())
    tokens2 = set(p2.split())

    common = tokens1 & tokens2
    only_first = tokens1 - tokens2
    only_second = tokens2 - tokens1

    for token_group in (common, only_first, only_second):
        print(token_group)

    # Same string construction as the original; results are currently unused.
    sorted_sect = " ".join(sorted(common))
    combined_1to2 = (sorted_sect + " " + " ".join(sorted(only_first))).strip()
    combined_2to1 = (sorted_sect + " " + " ".join(sorted(only_second))).strip()
    sorted_sect = sorted_sect.strip()
def test_validate_strings(self):
    """Check which values ``utils.validate_string`` rejects and accepts."""
    # Values that must be rejected: None, empty string, and numbers.
    tester = None
    self.assertFalse(utils.validate_string(tester))
    tester = ""
    self.assertFalse(utils.validate_string(tester))
    tester = 0.00123
    self.assertFalse(utils.validate_string(tester))
    # No trailing ``L``: that suffix is a syntax error on Python 3, and
    # Python 2 already treats an integer literal this large as a long.
    tester = 999999999999999999999999999999999
    self.assertFalse(utils.validate_string(tester))

    # Non-empty strings must be accepted, including ones containing
    # whitespace control characters and ANSI escape sequences.
    tester = 'a'
    self.assertTrue(utils.validate_string(tester))
    tester = "This is a perfectly valid string"
    self.assertTrue(utils.validate_string(tester))
    tester = "This \n\ris \n\r\ra \n\r\t\033[49m\n \033[31mperfectly\033[39m \r\nvalid string"
    self.assertTrue(utils.validate_string(tester))