def get(self, snapa, snapb):
    """Return a JSON diff summary between two stored snapshots.

    :param snapa: id of the first snapshot
    :param snapb: id of the second snapshot
    :return: JSON response with the similarity ratio and the texts
        involved in insert/replace/delete opcodes.
    """
    sa = Snap.objects.get_or_404(id=snapa)
    sb = Snap.objects.get_or_404(id=snapb)
    fstcleanhtml = cleanhtml(sa.html)
    sndcleanhtml = cleanhtml(sb.html)
    # The a-sequence is snd and the b-sequence is fst, so opcode indices
    # i1:i2 refer to sndcleanhtml and j1:j2 to fstcleanhtml.
    sm = SequenceMatcher(None, sndcleanhtml, fstcleanhtml)
    txtinsert = []
    txtdel = []
    txtreplace = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        # FIX: the original sliced fstcleanhtml with i1:i2 and
        # sndcleanhtml with j1:j2 — swapped relative to how the
        # SequenceMatcher was constructed above.
        snd_part = "".join(sndcleanhtml[i1:i2])
        fst_part = "".join(fstcleanhtml[j1:j2])
        if tag == "replace":
            txtreplace.append(("%s <-> %s" % (fst_part, snd_part)).strip())
        elif tag == "insert":
            txtinsert.append(("%s %s" % (fst_part, snd_part)).strip())
        elif tag == "delete":
            txtdel.append(("%s %s" % (fst_part, snd_part)).strip())
    return jsonify({
        'diff': {'fst': {'id': str(sa.id), 'dthr': sa.dthr},
                 'snd': {'id': str(sb.id), 'dthr': sb.dthr},
                 'ratio': sm.ratio(),
                 'insert': txtinsert,
                 'replace': txtreplace,
                 'delete': txtdel}
    })
def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring
    as a number between 0 and 100."""
    # Normalise both inputs to the same string type before matching.
    # NOTE(review): assumes `utils` provides make_type_consistent and
    # intr (round-to-int) — confirm against the utils module.
    s1, s2 = utils.make_type_consistent(s1, s2)

    if len(s1) <= len(s2):
        shorter = s1
        longer = s2
    else:
        shorter = s2
        longer = s1

    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()

    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    #   e.g. shorter = "abcd", longer = XXXbcdeEEE
    #   block = (1,3,3)
    #   best score === ratio("abcd", "Xbcd")
    scores = []
    for block in blocks:
        # Window of len(shorter) inside `longer`, aligned with this block
        # and clamped so it never starts before index 0.
        long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
        long_end = long_start + len(shorter)
        long_substr = longer[long_start:long_end]

        m2 = SequenceMatcher(None, shorter, long_substr)
        r = m2.ratio()
        if r > .995:
            # Near-perfect window: nothing can beat it, short-circuit.
            return 100
        else:
            scores.append(r)

    return utils.intr(100 * max(scores))
def colordiff(a, b, highlight='red'):
    """Given two strings, return the same pair of strings except with
    their differences highlighted in the specified color.

    :param a: left-hand string
    :param b: right-hand string
    :param highlight: color name passed through to colorize()
    :return: tuple ``(a_highlighted, b_highlighted)``
    """
    a_out = []
    b_out = []
    # isjunk always returns False: every character is significant.
    matcher = SequenceMatcher(lambda x: False, a, b)
    for op, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        if op == 'equal':
            # In both strings.
            a_out.append(a[a_start:a_end])
            b_out.append(b[b_start:b_end])
        elif op == 'insert':
            # Right only.
            b_out.append(colorize(highlight, b[b_start:b_end]))
        elif op == 'delete':
            # Left only.
            a_out.append(colorize(highlight, a[a_start:a_end]))
        elif op == 'replace':
            # Right and left differ.
            a_out.append(colorize(highlight, a[a_start:a_end]))
            b_out.append(colorize(highlight, b[b_start:b_end]))
        else:
            # FIX: `assert(False)` is stripped under `python -O`, letting an
            # unexpected opcode pass silently; raise explicitly instead.
            raise ValueError('unexpected opcode %r' % op)
    return ''.join(a_out), ''.join(b_out)
def ratio(s1, s2):
    """Return the similarity of *s1* and *s2* as an integer percentage.

    :raises TypeError: when either argument is None.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    return int(100 * SequenceMatcher(None, s1, s2).ratio())
def dif_html(a, b, lower_criteria = 0.5, upper_criteria = 0.6, link_criteria = 0.4, img_criteria = 0.2, script_criteria = 0.2):
    """Fetch two URLs and decide whether their HTML is "the same" page.

    True when raw-HTML similarity clears upper_criteria, or when it is
    between the two criteria and the link/img/script structure is also
    similar enough (Jaccard thresholds).  Python 2 only (print
    statement, urllib2).
    """
    from urllib2 import urlopen
    from difflib import SequenceMatcher
    from my_stats import jaccard as jac
    # Look at the pages in detail.
    try:
        html_a, html_b = urlopen(a).read(), urlopen(b).read()
        matcher = SequenceMatcher()
        matcher.set_seq1(html_a)
        matcher.set_seq2(html_b)
        if matcher.ratio() >= lower_criteria:
            # At least lower_criteria but below upper_criteria: inspect the
            # link structure and other details before deciding.
            if matcher.ratio() >= upper_criteria:
                print "white", matcher.ratio(), upper_criteria, lower_criteria, a, b
                return True
            else:
                # Non-negative matrix factorization was planned here for a
                # more precise analysis, but it did not work well yet, so
                # it is pending (T.B.D.).
                print "grey", matcher.ratio(), upper_criteria, lower_criteria, a, b
                links_a, titles_a, imgs_a, scripts_a = analyze_html(html_a)
                links_b, titles_b, imgs_b, scripts_b = analyze_html(html_b)
                # NOTE(review): j_scripts compares imgs, not scripts —
                # looks like a copy/paste slip; confirm before relying on it.
                j_links, j_imgs, j_scripts = jac(links_a, links_b), jac(imgs_a, imgs_b), jac(imgs_a, imgs_b)
                if j_links >= link_criteria and j_imgs >= img_criteria and j_scripts >= script_criteria:
                    return True
                else:
                    return False
        # Anything less similar than lower_criteria is judged not similar.
        else:
            print "black", matcher.ratio(), upper_criteria, lower_criteria, a, b
            return False
    except:
        # If crawling fails, regard the pages as different.
        return False
def controlled_vocab_lookup(self, controlled_vocab, search_term):
    """
    Performs a semi-fuzzy search for a term match in specified vocabulary.

    :param controlled_vocab: iterable of vocabulary terms
    :param search_term: the term to look up
    :return: the search term on an exact or substring hit, the closest
        vocabulary term when similarity >= 0.8, otherwise None
    """
    best_ratio = 0
    best_term = None
    minimum_ratio = 0.8
    # FIX: removed the no-op `search_term = search_term`; hoisted the
    # lowercase form that was recomputed on every iteration.
    search_lower = search_term.lower()
    for term in controlled_vocab:
        # Exact match - exit with value
        if search_term == term:
            return search_term
        term_lower = term.lower()
        # Substring containment either way counts as a hit.
        # NOTE(review): returns search_term, not the vocabulary term —
        # preserved from the original; confirm that is intended.
        if term_lower in search_lower or search_lower in term_lower:
            return search_term
        # Let's see how similar the strings are
        ratio = SequenceMatcher(None, search_lower, term_lower).ratio()
        if ratio > best_ratio:
            best_ratio = ratio
            best_term = term
    # Examine ratio/term and see if we have anything reasonable
    if best_ratio >= minimum_ratio:
        return best_term
    return None
def _test_xml_diff(result, expected):
    """Compare two XML strings by using python's ``difflib.SequenceMatcher``.

    This is a character-by-character comparison and does not take
    into account the semantic meaning of XML elements and attributes.

    Parameters
    ----------
    result: str
        The result of running the test.
    expected: str
        The expected outcome.

    Returns
    -------
    bool
        Whether the result matches the expectations or not.
    """
    sequence_matcher = SequenceMatcher(None, result, expected)
    ratio = sequence_matcher.ratio()
    # Only an (effectively) exact match counts; approx guards against
    # floating-point rounding of the ratio.
    matches = ratio == pytest.approx(1.0)
    if not matches:
        print("Result does not match expected.")
        # Print a line-level unified diff to help debug the mismatch.
        diff = unified_diff(result.splitlines(), expected.splitlines())
        print("\n".join(list(diff)))
    return matches
def _blobs_similarity(removed_blob, added):
    # Find, among the added paths, the blob most similar to the removed
    # one; used to detect renames.  Returns a dict describing the match,
    # or None (implicitly) when nothing clears DIFF_SIMILARITY_THRESHOLD.
    # NOTE: closes over `self` from the enclosing method.
    best = dict(ratio=0, name='', blob=None)
    for added_name in added:
        added_blob = self.tree.get_obj_by_path(added_name)
        if not isinstance(added_blob, Blob):
            continue
        # quick_ratio is a cheap upper bound on ratio — good enough for
        # a rename heuristic.
        diff = SequenceMatcher(None, removed_blob.text, added_blob.text)
        ratio = diff.quick_ratio()
        if ratio > best['ratio']:
            best['ratio'] = ratio
            best['name'] = added_name
            best['blob'] = added_blob
        if ratio == 1:
            break  # we'll won't find better similarity than 100% :)
    if best['ratio'] > DIFF_SIMILARITY_THRESHOLD:
        diff = ''
        if best['ratio'] < 1:
            # Not identical: attach a unified diff between the two blobs.
            added_blob = best['blob']
            rpath = ('a' + removed_blob.path()).encode('utf-8')
            apath = ('b' + added_blob.path()).encode('utf-8')
            diff = ''.join(unified_diff(list(removed_blob), list(added_blob),
                                        rpath, apath))
        return dict(new=best['name'], ratio=best['ratio'], diff=diff)
def calc_similarity(s_standard, s_candidate):
    """Similarity of *s_candidate* to *s_standard* in [0, 1].

    Uses the plain ratio when the candidate is no longer than the
    standard string; otherwise slides a window of len(s_standard) over
    the candidate (aligned with each matching block) and returns the
    best window ratio.  Returns 0 when either argument is None.
    """
    if s_standard is None or s_candidate is None:
        return 0

    base = SequenceMatcher(None, s_standard, s_candidate)
    if len(s_candidate) <= len(s_standard):
        return base.ratio()

    # A block (i, j, size) aligns a candidate window starting at j - i
    # (clamped to 0) with the standard string.
    window = len(s_standard)
    best = 0.0
    for i, j, _size in base.get_matching_blocks():
        start = max(0, j - i)
        candidate_slice = s_candidate[start:start + window]
        score = SequenceMatcher(None, s_standard, candidate_slice).ratio()
        if score > best:
            best = score
    return best
def highlighted_ndiff(a, b):
    """Return a highlighted diff string, with bold characters where different.

    *a* and *b* are sequences of lines: removed lines are rendered with
    redline(), added lines with greenline(), and replaced lines get a
    character-level bold diff via bold_str_diff().
    """
    s = ''
    sm = SequenceMatcher()
    sm.set_seqs(a, b)
    # Matcher reused for the per-line, character-level diffs.
    linesm = SequenceMatcher()
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == REPLACE:
            # Pair up old/new lines; unpaired lines degrade to pure
            # deletions (red) or insertions (green).
            for aline, bline in zip_longest(a[i1:i2], b[j1:j2]):
                if bline is None:
                    s += redline(aline)
                elif aline is None:
                    s += greenline(bline)
                else:
                    s += bold_str_diff(aline, bline, sm=linesm)
        elif tag == DELETE:
            for aline in a[i1:i2]:
                s += redline(aline)
        elif tag == INSERT:
            for bline in b[j1:j2]:
                s += greenline(bline)
        elif tag == EQUAL:
            # Unchanged lines are indented by one space, no highlighting.
            for aline in a[i1:i2]:
                s += ' ' + aline + '\n'
        else:
            raise RuntimeError('tag not understood')
    return s
def get_music(a, b, key='C', mode='major'):
    """Turn the diff between sequences *a* and *b* into a MIDI melody.

    Equal regions repeat the current tone, deletions step the tone down
    the scale, insertions step it up, and replacements become rests.
    Returns a StringIO containing the rendered SMF (MIDI) data.
    """
    midi_out = StringIO()
    scale = build_scale(key, mode, octaves=1)
    matcher = SequenceMatcher(None, a, b)
    # Start on the key's tonic.
    tone = key.lower()
    melodies = [tone]
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        next_note = None
        if tag == 'replace':
            next_note = 'r'  # rest
        elif tag == 'equal':
            next_note = tone
        elif tag == 'delete':
            tone = tone_down(tone, scale)
            next_note = tone
        elif tag == 'insert':
            tone = tone_up(tone, scale)
            next_note = tone
        # Repeat the note once per source element covered by the opcode,
        # at least once (e.g. pure insertions have i2 == i1).
        melodies += [next_note] * ((i2 - i1) or 1)
    s = SMF([parse(" ".join(melodies))])
    s.write(midi_out)
    return midi_out
def handle_redirects(queued, target):
    """ This call is used to determine if a suggested redirect is valid.
    if it happens to be, we change the url entry with the redirected location and add it
    back to the call stack. """
    retries = queued.get("retries")
    # Give up once the entry has already been retried more than once.
    if retries and retries > 1:
        return
    if not retries:
        queued["retries"] = 0

    parsed_target = urlparse(target)
    redirect_path = parsed_target.path
    source_path = conf.target_base_path + queued.get("url")

    textutils.output_debug("Handling redirect from: " + source_path + " to " + redirect_path)

    matcher = SequenceMatcher(isjunk=None, a=redirect_path, b=source_path, autojunk=False)
    # Follow the redirect only when it stays close to the original path.
    if matcher.ratio() > 0.8:
        queued["url"] = redirect_path
        queued["retries"] += 1
        # Add back the timed-out item
        textutils.output_debug("Following redirect! " + str(matcher.ratio()))
        database.fetch_queue.put(queued)
    else:
        textutils.output_debug("Bad redirect! " + str(matcher.ratio()))
def remove_duplicates_stable(movies):
    """Drop near-duplicate movies while preserving input order.

    Two entries are duplicates when their names are more than 95%
    similar or one name is a prefix of the other, and the candidate's
    year is missing or matches.  On a duplicate, the entry with the
    longer name wins.
    """
    unique = []
    for candidate in movies:
        duplicate_of = None
        for kept in unique:
            similarity = SequenceMatcher(a=candidate["name"], b=kept["name"]).ratio()
            name_is_prefix = (candidate["name"].startswith(kept["name"])
                              or kept["name"].startswith(candidate["name"]))
            if similarity > 0.95 or name_is_prefix:
                # Same title with a different year is a different movie.
                if not candidate["year"] or candidate["year"] == kept["year"]:
                    duplicate_of = kept
                    break
        if duplicate_of is None:
            unique.append(candidate)
        elif len(candidate["name"]) > len(duplicate_of["name"]):
            # Prefer the longer, more descriptive title.
            unique.remove(duplicate_of)
            unique.append(candidate)
    return unique
def test_valid_result(content): is_valid_result = True # Encoding edge case # Must be a string to be compared to the 404 fingerprint if not isinstance(content, str): content = content.decode('utf-8', 'ignore') if not len(content): content = "" # empty file, still a forged 404 elif len(content) < conf.file_sample_len: content = content[0:len(content) - 1] else: content = content[0:conf.file_sample_len - 1] # False positive cleanup for some edge cases content = content.strip('\r\n ') # Test signatures for fingerprint in database.crafted_404s: textutils.output_debug("Testing [" + content + "]" + " against Fingerprint: [" + fingerprint + "]") matcher = SequenceMatcher(isjunk=None, a=fingerprint, b=content, autojunk=False) textutils.output_debug("Ratio " + str(matcher.ratio())) # This content is almost similar to a generated 404, therefore it's a 404. if matcher.ratio() > 0.8: textutils.output_debug("False positive detected!") is_valid_result = False break return is_valid_result
def test_valid_result(content): is_valid_result = True # Tweak the content len if len(content) > conf.file_sample_len: content = content[0 : conf.file_sample_len - 1] # False positive cleanup for some edge cases content = content.strip("\r\n ") # Test signatures for fingerprint in database.crafted_404s: textutils.output_debug( "Testing [" + content.encode("hex") + "]" + " against Fingerprint: [" + fingerprint.encode("hex") + "]" ) matcher = SequenceMatcher(isjunk=None, a=fingerprint, b=content, autojunk=False) textutils.output_debug("Ratio " + str(matcher.ratio())) # This content is almost similar to a generated 404, therefore it's a 404. if matcher.ratio() > 0.8: textutils.output_debug("False positive detected!") is_valid_result = False break return is_valid_result
def html_diff(str1, str2, max_lenght=80, html_same_class="blue", html_diff_class="red"):
    """Render two aligned lists of strings as HTML with chunk coloring.

    Matching chunks are wrapped in a span colored *html_same_class*,
    differing chunks in a span colored *html_diff_class*.  Returns a
    pair of lists of HTML strings, one entry per aligned line.
    """
    from difflib import SequenceMatcher

    str1, str2 = align_strings(str1, str2, max_lenght)
    matcher = SequenceMatcher(lambda x: x in " ")

    same_span = "<span style='color: %s'>" % html_same_class
    diff_span = "<span style='color: %s'>" % html_diff_class
    clos_span = "</span>"

    out_left = []
    out_right = []
    for left, right in zip(str1, str2):
        html_left = ""
        html_right = ""
        consumed = 0
        matcher.set_seqs(left, right)
        for start_l, start_r, size in matcher.get_matching_blocks():
            # NOTE(review): `consumed` advances with left-side offsets but
            # also slices the right-hand string — preserved as-is from the
            # original; confirm this is intended.
            html_left += diff_span + left[consumed:start_l] + clos_span
            html_left += same_span + left[start_l:start_l + size] + clos_span
            html_right += diff_span + right[consumed:start_r] + clos_span
            html_right += same_span + right[start_r:start_r + size] + clos_span
            consumed = start_l + size
        out_left.append(html_left)
        out_right.append(html_right)
    return out_left, out_right
def compare(self, statement_a, statement_b):
    """
    Compare the two input statements.

    :return: The percent of similarity between the text of the statements.
    :rtype: float
    """
    # Statements without text cannot be meaningfully compared.
    if not statement_a.text or not statement_b.text:
        return 0

    # Case-insensitive character-level comparison of the two texts.
    matcher = SequenceMatcher(
        None,
        str(statement_a.text.lower()),
        str(statement_b.text.lower())
    )

    # Decimal ratio rounded to two places, e.g. 0.87.
    return round(matcher.ratio(), 2)
def diff_stat(old, new):
    """Count lines added and removed between *old* and *new*.

    Returns a two-element list indexed by the module-level ADDED and
    REMOVED constants.
    """
    result = [0, 0]  # [ADDED, REMOVED]
    for tag, i1, i2, j1, j2 in SequenceMatcher(None, old, new).get_opcodes():
        # 'replace' counts on both sides; 'equal' on neither.
        if tag in ('insert', 'replace'):
            result[ADDED] += j2 - j1
        if tag in ('delete', 'replace'):
            result[REMOVED] += i2 - i1
    return result
def versioning(lists):
    """
    Compute the lifetime of every element from an iterable of sequences.
    It returns an iterable of :class:`Versioned` classes to indicate the
    lifetimes.

    The computation is backed by the built-in :mod:`difflib` module.  As
    such, every element must be hashable.
    """
    ci = chain.from_iterable
    sm = SequenceMatcher()
    # Slot i+1 holds the Versioned entries for element i of the previous
    # list; slot 0 accumulates entries whose element has disappeared.
    oldVersions = [ [] ]
    for newName, (oldList, newList) in enumerate(_pairwise(lists, [])):
        sm.set_seqs(oldList, newList)
        newVersions = [ oldVersions[0] ]
        for op, oldStart, oldEnd, newStart, newEnd in sm.get_opcodes():
            if op == 'equal':
                # Element survived: extend its lifetime to this version.
                for i in range(oldStart+1, oldEnd+1):
                    oldVersions[i][0].high = newName
                newVersions.extend(oldVersions[oldStart+1:oldEnd+1])
            if op == 'delete' or op == 'replace':
                # Element vanished: fold its history into the previous slot.
                newVersions[-1].extend(ci(oldVersions[oldStart+1:oldEnd+1]))
            if op == 'insert' or op == 'replace':
                # Fresh element: lifetime starts (and so far ends) here.
                newVersions.extend([Versioned(x, newName, newName)]
                                   for x in newList[newStart:newEnd])
        oldVersions = newVersions
    return ci(oldVersions)
def items(self):
    """Yield up to 15 search-result brains, skipping near-duplicate titles.

    Queries the catalog for this context's source and tag, newest first
    (limit 30), and drops entries whose title is more than 60% similar
    to the previously yielded one.
    """
    source = self.context.__parent__.__name__
    source = Eq('stiam.ro.source', source)
    tags = [self.context.__name__]
    tags = AnyOf('stiam.ro.tags', tags)
    query = SearchQuery(source).And(tags)
    brains = query.searchResults(sort_index='stiam.ro.effective',
                                 reverse=True, limit=30)
    duplicate = ""
    index = 0
    for brain in brains:
        if index >= 15:
            # FIX: raising StopIteration inside a generator has been a
            # RuntimeError since PEP 479 (Python 3.7); return instead.
            return
        title = getattr(brain, "title", "")
        # Skip entries too similar to the previously yielded title.
        s = SequenceMatcher(lambda x: x == "", title, duplicate)
        if s.ratio() > 0.6:
            continue
        duplicate = title
        index += 1
        yield brain
def drawFigureMono(self):
    """Compute the pairwise similarity matrix of self.listResp on a
    single core and write it to the text file "sortieR_<name>"."""
    print("Monocoeur")
    debutTime = time.time()
    # Create the empty square result matrix (list of lists).
    arr = []
    for n1 in range(0,len(self.listResp)):
        arr.append([])
        for n2 in range(0,len(self.listResp)):
            arr[n1].append([])
    # For every pair of responses (upper triangle, mirrored below).
    longueur = len(self.listResp)
    for n1 in range(0,longueur):
        print("Image R : ligne "+str(n1)+" sur "+str(longueur))
        for n2 in range(n1,longueur):
            # Responses at index n1 and n2.
            d1 = self.listResp[n1]
            d2 = self.listResp[n2]
            # Similarity ratio; spaces are treated as junk.
            s = SequenceMatcher(lambda x: x == " ",d1,d2)
            ratio = s.ratio()
            arr[n1][n2] = ratio
            arr[n2][n1] = ratio
    finTime = time.time()
    diffTime = finTime - debutTime
    difftuple = time.gmtime(diffTime)
    print("Executé en "+str(difftuple.tm_min)+" min et "+str(difftuple.tm_sec)+" sec")
    # Dump the matrix, one row per line, space separated.
    f = open("./sortieR_"+str(self.name), "w")
    for n1 in range(0,len(arr)):
        for n2 in range(0,len(arr)):
            f.write(str(arr[n1][n2])+" ")
        f.write("\n")
    f.close()
def WordByWord(self, str1, str2, bestRatio):
    '''Decide whether *str1* is a good title match for *str2* by
    comparing them word by word and averaging the best per-word ratios.'''
    try:
        # Getting best score word-by-word
        word1 = str1.split()
        word2 = str2.split()
        listing = []
        for w in word1:
            if len(w)>1:
                w = self.TextCleanup(w)
                highest = 0.0
                # [word, best matching word in str2, best ratio]
                curr_word = [w, '', highest]
                for v in word2:
                    if len(v)>1:
                        v = self.TextCleanup(v)
                        s = SequenceMatcher(None, w, v)
                        ratio = s.ratio()
                        if ratio >= highest:
                            highest = ratio
                            curr_word[1] = v
                            curr_word[2] = ratio
                if curr_word[2]>0.0:
                    listing.append(curr_word)
        # Checking average of matches
        sumed = 0.0
        hits = 0.0
        length = len(listing)
        for word in listing:
            sumed += word[2]
            if word[2]>=0.8:
                hits+=1
        # NOTE(review): raises ZeroDivisionError when nothing matched
        # (length == 0); swallowed by the broad except below, which then
        # returns None.
        average = (sumed/length)
        hitsPercentage = (hits/length)
        msg = " Best match is:\n\t ratio:\t\t"+str(bestRatio)+ \
            "\n\t best:\t\t"+str1+ \
            "\n\t original:\t"+str2+ \
            "\n\t average:\t"+str(average)+ \
            "\n\t hits:\t\t"+str(hitsPercentage)
        self.logger.debug(msg)
        # NOTE(review): `ratio` here is the last per-word ratio left over
        # from the loop above — bestRatio may have been intended; confirm.
        isGoodTitle = average >= ratio or hitsPercentage >= 0.7
        return isGoodTitle
    except Exception as inst:
        # Log the failure location, then fall through (returns None).
        exc_type, exc_obj, exc_tb = sys.exc_info()
        exception_fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        exception_line = str(exc_tb.tb_lineno)
        exception_type = str(type(inst))
        exception_desc = str(inst)
        self.logger.debug( " %s: %s in %s:%s"%(exception_type, exception_desc, exception_fname, exception_line ))
def ensure_files_present(original_file_dict, modified_file_dict):
    """
    Ensures that all files are available as keys in both dicts.

    :param original_file_dict: Dict of lists of file contents before changes
    :param modified_file_dict: Dict of lists of file contents after changes
    :return:                   Return a dictionary of renamed files.
    """
    original_files = set(original_file_dict)
    modified_files = set(modified_file_dict)
    all_files = original_files | modified_files
    only_in_original = all_files - modified_files

    renamed_files = {}
    # Files appearing only on the modified side are either brand new or
    # the result of a rename; look for a sufficiently similar original.
    for new_file in all_files - original_files:
        new_content = ''.join(modified_file_dict[new_file])
        for old_file in only_in_original:
            matcher = SequenceMatcher(
                None, new_content, ''.join(original_file_dict[old_file]))
            # real_quick_ratio is a cheap upper bound; compute the exact
            # ratio only when the bound clears the threshold.
            if matcher.real_quick_ratio() >= 0.5 and matcher.ratio() > 0.5:
                renamed_files[old_file] = new_file
                break
        else:
            original_file_dict[new_file] = []

    # Files that vanished from the modified side get an empty entry.
    for old_file in all_files - modified_files:
        modified_file_dict[old_file] = []

    return renamed_files
def scan_company_names(name_list, name1, results=0, ro_thresold=None):
    """Scan a list of company names, searching for best matches against
    the given name.  Notice that this function takes a list of strings,
    and not a list of dictionaries.  Python 2 only (unicode/has_key)."""
    if ro_thresold is not None:
        RO_THRESHOLD = ro_thresold
    else:
        RO_THRESHOLD = 0.6
    sm1 = SequenceMatcher()
    sm1.set_seq1(name1.lower())
    # Best match per id: {id: (ratio, (id, analyzed_name))}.
    resd = {}
    # A query without a trailing "[country]" tag gets candidates' country
    # tags stripped (with a small penalty) before comparing.
    withoutCountry = not name1.endswith(']')
    for i, n in name_list:
        # XXX: on Symbian, here we get a str; not sure this is the
        # right place to fix it.
        if isinstance(n, str):
            n = unicode(n, 'latin1', 'ignore')
        o_name = n
        var = 0.0
        if withoutCountry and n.endswith(']'):
            cidx = n.rfind('[')
            if cidx != -1:
                n = n[:cidx].rstrip()
                var = -0.05
        # Distance with the company name.
        ratio = ratcliff(name1, n, sm1) + var
        if ratio >= RO_THRESHOLD:
            # Keep only the best ratio seen for each id.
            if resd.has_key(i):
                if ratio > resd[i][0]:
                    resd[i] = (ratio, (i, analyze_company_name(o_name)))
            else:
                resd[i] = (ratio, (i, analyze_company_name(o_name)))
    # Sort by descending ratio and optionally truncate.
    res = resd.values()
    res.sort()
    res.reverse()
    if results > 0:
        res[:] = res[:results]
    return res
def WikiDocument(out, user_from, user_to, timestamp, subject, text): global previous ###url = get_url(id, prefix) ###header = '<doc id="%s" url="%s" title="%s">\n' % (id, url, title) ##############text = clean(text) subject = clean(subject) header = '%s\t%s\t%s\t%s\t' % (user_to, user_from, timestamp, subject) header = header.encode('utf-8') text = clean(text) ###find the diff s = SequenceMatcher(None, previous, text) opcodes = s.get_opcodes() diff = [] for i in opcodes: if i[0] == 'insert' or i[0] == 'replace': j1 = i[3] j2 = i[4] diff.append(text[j1:j2]) diff = "".join(diff) ###diff = clean(diff) ### out.reserve(len(header) + len(subject) + len(diff)) print >> out, header, print >> out, diff.encode('utf-8') previous = text
def get_from_text(self, old, new):
    """
    Gets the differences between `old` text and `new` text and
    returns a changeset

    :param old: old Text object
    :param new: new text string
    """
    olds = str(old)
    # No change -> no changeset.
    if olds == new:
        return None
    print repr(olds), repr(new)
    sm = SequenceMatcher(None, olds, new)
    print " CS LENS ", len(olds), len(new)
    # Changeset accumulator mutated by op_code_match below.
    csd = dict(old_len=len(olds), new_len=len(new), ops="", char_bank="")
    opcodes = [opcode_tup for opcode_tup in sm.get_opcodes()]
    # Trailing "equal" opcodes carry no edits; find the last real op so
    # they can be dropped.
    last_op = 0
    print " CS OPC 1", opcodes
    for i in range(0, len(opcodes)):
        if opcodes[i][0] != "equal":
            last_op = i
    print " CS OPC 2", opcodes[:last_op+1]
    # Fold each remaining opcode into the changeset dict (in place).
    for opcode_tup in opcodes[:last_op+1]:
        op_code_match(*opcode_tup, changeset=csd, sm=sm, text=old)
    print " CS CSD ", csd
    return csd
def _compare_lines(self, la, lb):
    # Produce side-by-side HTML for two lists of lines, wrapping the
    # differing character runs in <em class="str-diff"> markers.
    # NOTE(review): `Counter` here appears to be a project-local cursor
    # helper (it has progress/slice_diff/slice_match/next methods), not
    # collections.Counter — confirm.
    sa = '\n'.join(la)
    sb = '\n'.join(lb)
    ta_result = ''
    tb_result = ''
    str_diff_start = '<em class="str-diff">'
    str_diff_end = '</em>'
    s = SequenceMatcher(None, sa, sb)
    cnt_a = Counter()
    cnt_b = Counter()
    for block in s.get_matching_blocks():
        (a_idx, b_idx, nmatch) = block
        print("a[%d] and b[%d] match for %d elements" % block)
        # Advance both cursors to this matching block.
        cnt_a.progress(a_idx, nmatch)
        cnt_b.progress(b_idx, nmatch)
        diff_a = cnt_a.slice_diff(sa)
        same_a = cnt_a.slice_match(sa)
        diff_b = cnt_b.slice_diff(sb)
        same_b = cnt_b.slice_match(sb)
        # Emit the differing run (if any) followed by the matching run.
        if diff_a or diff_b:
            ta_result += self._enclose(str_diff_start, diff_a, str_diff_end,
                                       consider_newline = True)
        ta_result += same_a
        if diff_a or diff_b:
            tb_result += self._enclose(str_diff_start, diff_b, str_diff_end,
                                       consider_newline = True)
        tb_result += same_b
        cnt_a.next()
        cnt_b.next()
    return (ta_result.split('\n'), tb_result.split('\n'))
def sanitizedStrCheck(str1, str2, location):
    # Decide whether two strings are "the same" after a cascade of
    # normalisations: printable-only, whitespace, Unicode NFKD->ASCII,
    # fuzzy ratio, and URL percent-encoding.  Python 2 only (unicode,
    # urllib.quote_plus).  NOTE(review): `location` is unused here.
    if (isinstance(str1, str) and isinstance(str2, str)) or (isinstance(str1, unicode) and isinstance(str2, unicode)):
        m = SequenceMatcher(None, str1, str2)
        rat = m.ratio()
        str1 = str1.replace("\n", " ")
        str2 = str2.replace("\n", " ")
        # Identical once non-printable characters are stripped?
        if ''.join(filter(lambda c: c in string.printable, str1)) == ''.join(filter(lambda c: c in string.printable, str2)):
            return True
        elif str1.strip() == str2.strip():
            return True
        # Identical after Unicode decomposition down to ASCII?
        elif unicodedata.normalize('NFKD', str1).encode('ascii','ignore') == unicodedata.normalize('NFKD', str2).encode('ascii','ignore'):
            return True
        elif str1.replace(" ", "") == str2.replace(" ", ""):
            return True
        # Fuzzy match, but only for non-URL strings.
        elif rat > .8 and not str1.__contains__('http://'):
            return True
        # URLs: equal after percent-encoding spaces?
        elif str1.__contains__('http://') and str2.__contains__('http://') and urllib.quote_plus(str2.replace(" ", "%20")) == urllib.quote_plus(str1):
            return True
    return False
def partial_ratio(s1, s2):
    """Return the best "partial" similarity of the shorter string against
    any same-length window of the longer one, as an int in [0, 100].

    :raises TypeError: if either argument is None.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    if len(s1) <= len(s2):
        shorter, longer = s1, s2
    else:
        shorter, longer = s2, s1

    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()

    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    #   e.g. shorter = "abcd", longer = XXXbcdeEEE
    #   block = (1,3,3)
    #   best score === ratio("abcd", "Xbcd")
    scores = []
    for block in blocks:
        # FIX: clamp the window start to 0.  When a block matches later in
        # `shorter` than in `longer`, block[1] - block[0] is negative and
        # the unclamped slice silently produced a wrong (often empty)
        # window, under-scoring the match.
        long_start = max(0, block[1] - block[0])
        long_end = long_start + len(shorter)
        long_substr = longer[long_start:long_end]

        r = SequenceMatcher(None, shorter, long_substr).ratio()
        if r > .995:
            # A near-perfect window cannot be beaten; short-circuit.
            return 100
        scores.append(r)

    return int(100 * max(scores))
def checkNear(hash1, hash2):
    """Return True when the two hashes are at least 96% similar."""
    similarity = SequenceMatcher(None, hash1, hash2).ratio()
    return similarity >= 0.96
def similar(a, b):
    """Similarity ratio of *a* and *b*, from 0.0 to 1.0."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def match(self, a, b):
    """Return how similar *a* and *b* are as a ratio in [0, 1]."""
    sm = SequenceMatcher(None, a, b)
    return sm.ratio()
# get negated part of the sentence with open(data_dir + 'ntree_tmp', 'w') as fw: fw.write(t) s = re.sub('\([A-Z]*\$? |\(-[A-Z]+- |\)|\)|\(, |\(. ', '', t) print('neg part: ' + s) # find what neg term is matched and use its neg type try: m = '' for neg in [ x for x in sorted( neg_list['ITEM'].tolist(), key=len, reverse=True) ]: #for neg in ['negative for']: match = SequenceMatcher(None, s, neg).find_longest_match( 0, len(s), 0, len(neg)) matched_string = s[match.a:match.a + match.size] try: # if next char might be different, means partial match if s[match.a + match.size + 1] == neg[match.b + match.size + 1] and \ s[match.a + match.size + 2] == neg[match.b + match.size + 2]: if (len(matched_string) > len(m)) and \ ((matched_string[0] == s[0] and matched_string[1] == s[1]) or \ (matched_string[len(matched_string)-1] == s[len(s)-1] and matched_string[len(matched_string)-2] == s[len(s)-2])): # either match from the beginning or laast m = matched_string matched_neg_item = neg[match.b:match.b + match.size] if matched_neg_item[len(matched_neg_item) - 1] == ' ': matched_neg_item = matched_neg_item[ 0:len(matched_neg_item) - 1] else:
def compare_map(first_id, second_id):
    """Similarity ratio between the two ids, from 0.0 to 1.0."""
    return SequenceMatcher(a=first_id, b=second_id).ratio()
def strdistance(a, b):
    """Return difflib's similarity ratio of the two strings (0.0-1.0)."""
    comparison = SequenceMatcher(None, a, b)
    return comparison.ratio()
def check(message: discord.Message):
    # Accept the message when its content is more than 50% similar to
    # the expected answer and it was sent in the quiz channel.
    expected = str(answer).lower()
    given = str(message.content).lower()
    close_enough = SequenceMatcher(None, expected, given).ratio() > 0.5
    return close_enough and message.channel.id == chat_channel
def test_diff(self):
    # End-to-end test: diff two token sequences, then render the diff as
    # an animated series of pygame frames saved as PNGs.
    fLOG(
        __file__,
        self._testMethodName,
        OutputPrint=__name__ == "__main__")
    seq1 = "ab ab2 abc3 abcd abc4".split()
    seq2 = "ab ab2 abc3 abc4 abc adb".split()
    diff = SequenceMatcher(a=seq1, b=seq2)
    # The two sequences are expected to produce exactly 4 opcodes.
    nb = 0
    for opcode in diff.get_opcodes():
        fLOG(opcode)
        nb += 1
    self.assertEqual(nb, 4)
    h = 20
    size = 500, 500
    white = 255, 255, 255
    if is_travis_or_appveyor() in ("travis",):
        # pygame.error: No available video device
        return
    import pygame
    if is_travis_or_appveyor() == "circleci":
        # os.environ["SDL_VIDEODRIVER"] = "x11"
        flags = pygame.NOFRAME
    else:
        flags = 0
    pygame, screen, fonts = get_pygame_screen_font(h, size, flags=flags)
    from ensae_teaching_cs.helpers.pygame_helper import wait_event
    # Random bar heights, one per element of the second sequence.
    bars = [random.randint(10, 500) / 500.0 for s in seq2]
    screen.fill(white)
    build_diff_image(pygame, screen, h=h, maxw=size[1], seq1=seq1, seq2=seq2,
                     diff=diff, fonts=fonts, bars=bars)
    pygame.display.flip()
    temp = get_temp_folder(__file__, "temp_video_diff")
    # Render 21 animation frames and save each one as a PNG.
    for i in range(0, 21):
        screen.fill(white)
        build_diff_image(pygame, screen, h=h, maxw=size[0], seq1=seq1,
                         seq2=seq2, diff=diff, fonts=fonts, bars=bars,
                         progress=i / 20.0, prev_bars=None)
        pygame.time.wait(60)
        pygame.display.flip()
        pygame.image.save(screen, os.path.join(temp, "diff%d.png" % i))
    if __name__ == "__main__":
        # Only assemble the video and wait for input when run directly.
        from ensae_teaching_cs.helpers.video_helper import make_video
        png = [os.path.join(temp, _)
               for _ in os.listdir(temp) if ".png" in _]
        out = os.path.join(temp, "diff.avi")
        make_video(png, out, size=(350, 250), format="XVID", fps=5)
        wait_event(pygame)
    for font in fonts.values():
        del font
    pygame.quit()
def similar(str_a, str_b):
    """Ratio in [0.0, 1.0] describing how alike the two strings are."""
    return SequenceMatcher(None, str_a, str_b).ratio()
def string_sim(sent_pairs):
    """Create a matrix where every row is a pair of sentences and every
    column in a feature.  Feature (column) order is not important to the
    algorithm."""
    features = [
        "NIST", "BLEU", "Word Error Rate", "Longest common substring",
        "Levenshtein distance"
    ]
    nist_list = []
    bleu_list = []
    wer_list = []
    lcs_list = []
    dist_list = []
    for pair in sent_pairs:
        t1 = pair[0]
        t2 = pair[1]
        t1_token = word_tokenize(pair[0])
        t2_token = word_tokenize(pair[1])
        # NIST, scored in both directions and summed (symmetric).
        try:
            nist1 = nist_score.sentence_nist([
                t2_token,
            ], t1_token)
            nist2 = nist_score.sentence_nist([
                t1_token,
            ], t2_token)
            nist = nist1 + nist2
        except ZeroDivisionError:
            # NIST is undefined for some very short/disjoint pairs.
            nist = 0
        nist_list.append(nist)
        # BLEU, also symmetric (both directions summed).
        bleu1 = bleu_score.sentence_bleu([
            t1_token,
        ], t2_token)
        bleu2 = bleu_score.sentence_bleu([
            t2_token,
        ], t1_token)
        bleu_list.append(bleu1 + bleu2)
        # Longest common substring length (character level).
        s = SequenceMatcher(None, t1, t2)
        lcs = s.find_longest_match(0, len(t1), 0, len(t2))
        lcs_list.append(lcs[2])
        # Edit (Levenshtein) distance at character level.
        dist = edit_distance(t1, t2)
        dist_list.append(dist)
        # Word error rate, normalised by both token-sequence lengths.
        dist_wer = edit_distance(t1_token, t2_token)
        wer = dist_wer / len(t1_token) + dist_wer / len(t2_token)
        wer_list.append(wer)
    # Assemble the feature matrix, one column per feature.
    all_list = [nist_list, bleu_list, wer_list, lcs_list, dist_list]
    X = np.zeros((len(sent_pairs), len(features)))
    for i in range(len(all_list)):
        X[:, i] = np.asarray(all_list[i])
    return X
def similar_strings(string1, string2):
    """Similarity of the two strings, rounded to two decimal places."""
    matcher = SequenceMatcher(None, string1, string2)
    return round(matcher.ratio(), 2)
SubStationDict[subName].append(BusNumber) #print subName """ with open('subList.txt','w') as f: for name in list(subNameSet): f.write(name) f.write('\n') """ for planningBusName in NameMatchDictPlanning.keys(): similarityDict = {} for CAPEsubName in list(subNameSet): planningBusNameCompact = getCompactString(planningBusName) CAPEsubNameCompact = getCompactString(CAPEsubName) similarity = SequenceMatcher(None, planningBusNameCompact, CAPEsubNameCompact).ratio() if similarity > similarityThreshold: similarityDict[CAPEsubName] = similarity similarityDictSorted = sorted( similarityDict, key=similarityDict.get, reverse=True ) # gets the dictionary keys in descending order of the values NameMatchDictPlanning[planningBusName] = similarityDictSorted # output the sorted name match list with open(NameMatchSorted, 'w') as f: for planningBusName in NameMatchDictPlanning.keys(): string = planningBusName + ' -> ' for CAPEsubName in NameMatchDictPlanning[planningBusName]: string += str(CAPEsubName)
# NOTE(review): `guess`, `counter`, `bad_words`, `current` and
# printPageInfo come from earlier in the script (not shown here).
# Map lower-cased link names to targets for case-insensitive matching.
loweredToLink = {
    key.lower(): value
    for (key, value) in current.links.items()
}
lowered = loweredToLink
# Drop candidate links containing any banned word.
for bad_word in bad_words:
    lowered = list(filter(lambda x: not bad_word in x, lowered))
# Prompt until the user names one of the remaining links; 'help' lists
# them, and close-but-inexact input is fuzzy-matched.
while not guess in lowered:
    guess = input("> ").lower()
    if guess == 'help':
        print('\n'.join(lowered))
    else:
        # Rank candidates by similarity to the typed guess.
        candidates = list(
            map(lambda x: (x, SequenceMatcher(None, x, guess).ratio()),
                lowered))
        candidates = list(filter(lambda x: x[1] > 0.8, candidates))
        candidates.sort(reverse=True, key=lambda x: x[1])
        if len(candidates) == 1:
            # Exactly one close match: accept it as the guess.
            guess = candidates[0][0]
            print('-->', guess)
        else:
            # Ambiguous (or nothing close): show what is available.
            print('\n'.join([c[0] for c in candidates]))
current = loweredToLink[guess]
counter += 1
print('page', counter)
printPageInfo(current)
def get_similarity(a, b):
    """Return how similar *a* and *b* are, between 0.0 and 1.0."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()
def _get_product_distance_to_query_str(product):
    # Inverse similarity between the product name and the search query:
    # smaller means closer; zero-similarity products map to 0.
    similarity = SequenceMatcher(None, product.name, query_str).quick_ratio()
    if similarity:
        return 1 / similarity
    return 0
def longest_match_ratio(str1, str2):
    """Length of the longest common block divided by the shorter
    string's length (division guarded by MathUtil.try_divide)."""
    matcher = SequenceMatcher(lambda x: x == " ", str1, str2)
    longest = matcher.find_longest_match(0, len(str1), 0, len(str2))
    shorter_len = min(len(str1), len(str2))
    return MathUtil.try_divide(longest.size, shorter_len)
def __init__(self, buis):
    # Keep a reusable matcher and the title-cased business name.
    self.seq = SequenceMatcher()
    self.buis = buis.title()
def __similar(self, a, b):
    """Similarity ratio of *a* and *b* via difflib (0.0 - 1.0)."""
    sm = SequenceMatcher(None, a, b)
    return sm.ratio()
def compare_string(self, item1, item2):
    """
    Compare two strings and output similarities
    """
    from difflib import SequenceMatcher
    matcher = SequenceMatcher(None, item1, item2)
    return matcher.ratio()
def match_sequence(cls, a, b):
    """Similarity ratio between the space-joined token sequences."""
    joined_a = ' '.join(a)
    joined_b = ' '.join(b)
    return SequenceMatcher(a=joined_a, b=joined_b).ratio()
def longest_match_size(str1, str2):
    """Length of the longest matching block between the two strings
    (spaces treated as junk)."""
    matcher = SequenceMatcher(lambda x: x == " ", str1, str2)
    block = matcher.find_longest_match(0, len(str1), 0, len(str2))
    return block.size
def ratio(a, b):
    """Similarity of *a* and *b* as an integer percentage (0-100)."""
    similarity = SequenceMatcher(None, a, b).ratio()
    return int(round(100 * similarity))
def similar(lhs, rhs):
    """Return the 0.0-1.0 similarity ratio between *lhs* and *rhs*."""
    sm = SequenceMatcher(None, lhs, rhs)
    return sm.ratio()
help='Simplified intermediate delta (unstable)') group.add_argument('-c', '--compare', action='store_true', help='HTML comparison of tokenized diff to char diffs') data = parser.parse_args() lexer = pygments.lexers.get_lexer_by_name(data.lexername) a = data.file1.read() b = data.file2.read() data.unidiff = not data.verbose and not data.delta and not data.compare if data.verbose: lexa = list(pygments.lex(a, lexer)) lexb = list(pygments.lex(b, lexer)) sm = SequenceMatcher(None, lexa, lexb) for op, a1, a2, b1, b2 in sm.get_opcodes(): if op == 'equal': for item in lexa[a1:a2]: data.out.write(" %s: %s\n" % item) elif op == 'replace': data.out.write("~~~\n") for item in lexa[a1:a2]: data.out.write("- %s: %s\n" % item) for item in lexb[b1:b2]: data.out.write("+ %s: %s\n" % item) data.out.write("~~~\n") elif op == 'insert': for item in lexb[b1:b2]: data.out.write("+ %s: %s\n" % item) elif op == 'delete':
def get_font(name: FontType, size: int) -> '__font.Font':
    """
    Return a :py:class:`pygame.font.Font` object from a name or file.

    :param name: Font name or path
    :param size: Font size (px)
    :return: Font object
    :raises ValueError: If the name is empty, the size is not positive, or the
        name matches no system font (the message suggests the closest match)
    :raises IOError: If the font file cannot be loaded
    """
    assert_font(name)
    assert isinstance(size, int)
    font: Optional['__font.Font']

    # Already a Font object: return it unchanged
    if isinstance(name, __font.Font):
        font = name
        return font
    else:
        name = str(name)

        if name == '':
            raise ValueError('font name cannot be empty')
        if size <= 0:
            raise ValueError('font size cannot be lower or equal than zero')

        # Font is not a file, then use a system font
        if not path.isfile(name):
            font_name = name
            name = __font.match_font(font_name)

            if name is None:  # Show system available fonts
                from difflib import SequenceMatcher
                from random import randrange
                system_fonts = __font.get_fonts()

                # Find the system font most similar to the requested name
                most_similar = 0
                most_similar_index = 0
                for i in range(len(system_fonts)):
                    # noinspection PyArgumentEqualDefault
                    sim = SequenceMatcher(None, system_fonts[i], font_name).ratio()  # Similarity
                    if sim > most_similar:
                        most_similar = sim
                        most_similar_index = i
                sys_font_sim = system_fonts[most_similar_index]
                sys_suggestion = 'system font "{0}" unknown, use "{1}" instead'.format(
                    font_name, sys_font_sim)
                sys_message = 'check system fonts with pygame.font.get_fonts() function'

                # Gather a few random example font names for the error message
                examples_number = 3
                examples = []
                j = 0
                for i in range(len(system_fonts)):
                    font_random = system_fonts[randrange(0, len(system_fonts))]
                    if font_random not in examples:
                        examples.append(font_random)
                        j += 1
                    if j >= examples_number:
                        break
                examples.sort()
                fonts_random = ', '.join(examples)
                sys_message_2 = 'some examples: {0}'.format(fonts_random)

                # Raise the exception
                raise ValueError('{0}\n{1}\n{2}'.format(
                    sys_suggestion, sys_message, sys_message_2))

        # Try to load the font, consulting the cache first
        font = None
        if (name, size) in _cache:
            return _cache[(name, size)]
        try:
            font = __font.Font(name, size)
        except IOError:
            pass

        # If font was not loaded throw an exception
        if font is None:
            # BUGFIX: report the failing path/name — `font` is always None on
            # this branch, so the old message printed 'font file "None" ...'
            raise IOError('font file "{0}" cannot be loaded'.format(name))
        _cache[(name, size)] = font
        return font
def similar(self, choice):
    """Return the first entry of ``self.options`` whose similarity to
    *choice* exceeds 0.6, or ``False`` when none qualifies."""
    threshold = 0.6
    for candidate in self.options:
        if SequenceMatcher(None, choice, candidate).ratio() > threshold:
            return candidate
    return False
class Deduplicator:
    """Clusters near-duplicate headlines into groups.

    Each incoming headline is compared (via difflib ratio on tokenized text,
    boosted by shared named entities) against the representative headline of
    every existing group; it joins the best-scoring group above ``threshold``
    or starts a new group.
    """

    logger = util.get_logger("deduplicator.Deduplicator")
    # Minimum boosted similarity ratio required to join an existing group.
    threshold = 0.50
    # Score added per named entity shared between the two headlines.
    boost = 0.10

    def __init__(self):
        self.sm = SequenceMatcher()
        self.tokenizer = Tokenizer()
        self.ner = NER()
        self.headlines = dict()    # _id -> original headline text
        self._headlines = dict()   # _id -> tokenized/normalized headline
        self.parents = dict()      # _id -> group id it was assigned to
        self.groups = dict()       # group id -> list of member _ids

    def accept(self, _id: str, headline: str) -> str:
        """Assign *headline* to a group and return the chosen group id.

        Returns *_id* itself when the headline starts a new group.
        """
        self.headlines[_id] = headline
        tokens = self.tokenizer.tokenize(headline)
        _headline = ' '.join(tokens)
        self._headlines[_id] = _headline
        # First item ever seen: it founds its own group.
        if len(self.groups) == 0:
            self.logger.debug("[%s] %s - first item", _id, headline)
            self.parents[_id] = _id
            self.groups[_id] = []
            return _id
        matches = []
        a = _headline
        doc1 = self.ner.doc(headline)
        ents1 = util.lowercase(self.ner.entities(doc1))
        for group_id in self.groups:
            b = self._headlines[group_id]
            self.sm.set_seqs(a, b)
            ratio = self.sm.ratio()
            # Check if there are any named entities in common
            doc2 = self.ner.doc(self.headlines[group_id])
            ents2 = util.lowercase(self.ner.entities(doc2))
            ncommon = len(set(ents1) & set(ents2))
            boost = ncommon * self.boost
            ratio += boost
            self.logger.debug("[%s] %s <-> [%s] %s ==> %.2f (+%.2f)",
                              _id, a, group_id, b, ratio, boost)
            if ratio >= self.threshold:
                matches.append((ratio, group_id))
        # No group scored above threshold: start a new group.
        if not matches:
            self.logger.debug("[%s] %s - no matches found", _id, headline)
            self.parents[_id] = _id
            self.groups[_id] = []
            return _id
        # Sort ascending by ratio so pop() yields the best-scoring group.
        matches.sort(key=lambda x: x[0])
        highest_ratio, group_id = matches.pop()
        b = self._headlines[group_id]
        self.logger.debug("[%s] %s <-> [%s] %s ==> %.2f was the high score",
                          _id, a, group_id, b, highest_ratio)
        self.parents[_id] = group_id
        self.groups[group_id].append(_id)
        return group_id

    def print_tree(self, original=True):
        """Print each group and its members; *original* selects raw vs
        tokenized headline text."""
        headlines = self.headlines if original else self._headlines
        print("")
        for group_id in self.groups:
            print("[%s] %s" % (group_id, headlines[group_id]))
            if self.groups[group_id]:
                print(" |")
            for _id in self.groups[group_id]:
                print(" |-- [%s] %s" % (_id, headlines[_id]))
            if self.groups[group_id]:
                print("")
        print("")

    def export(self):
        """Return the deduplicator state as a plain dict (for serialization)."""
        return {'headlines': self.headlines, '_headlines': self._headlines,
                'parents': self.parents, 'groups': self.groups}
def cacl_similarRatio(a, b):
    """Return the similarity ratio (0.0-1.0) between two words."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def get_winner(year):
    """Extract the winner of every official Golden Globes award for *year*
    from the award-bucketed tweets, memoizing the result in GG_RESULT.

    Person awards: most frequent capitalized two-word name (minus stopwords).
    Other awards: most frequent non-stopword bigram, optionally validated
    against an IMDb title search.
    """
    make_year(year)
    global GG_RESULT
    # Memoized: return the previously computed winners.
    if "winner" in GG_RESULT:
        return GG_RESULT["winner"]
    # Lowercased tokens that disqualify a candidate person name.
    stop_list_people = [
        'asian', 'series', 'the', 'best', '-', 'award', 'for', 'or', 'made',
        'in', 'a', 'by', 'performance', 'an', 'golden', 'globes', 'role',
        'motion', 'picture', 'best', 'supporting'
    ]
    #stop_list_people =['Motion Picture','Best Actor','Best Supporting']
    tweets_by_awards(year)
    winners = {}
    for award in OFFICIAL_AWARDS:
        winners[award] = []
    # print(tweet_award_dict)
    # Two consecutive capitalized words, e.g. "Jane Doe".
    name_pattern = re.compile(r'[A-Z][a-z]+\s[A-Z][a-z]+')
    # Awards whose winner is a person rather than a film/show.
    award_list_person = []
    for award in OFFICIAL_AWARDS:
        for person in ["actor", "actress", "demille", "director"]:
            if person in award:
                award_list_person.append(award)
    # Collect candidate person names per person-award.
    for award in award_list_person:
        for tweet in TWEET_BY_AWARD_DICT[award]:
            names = re.findall(name_pattern, tweet)
            for name in names:
                # Discard the name if any of its words is a stopword.
                flag = False
                for name_item in name.lower().split():
                    if name_item in stop_list_people:
                        flag = True
                if flag == False:
                    winners[award] = winners[award] + [name]
    freq = {}
    for award in award_list_person:
        freq[award] = nltk.FreqDist(winners[award])
    # winner list for the rest
    award_list_not_person = []
    for award in OFFICIAL_AWARDS:
        if award not in award_list_person:
            award_list_not_person.append(award)
    for award in award_list_not_person:
        # Tokens disqualifying a bigram for non-person awards.
        winner_stoplist = [
            'globes', 'at', 'and', 'Motion', 'Picture', 'Best', 'Supporting',
            '-', 'animated', 'best', 'comedy', 'drama', 'feature', 'film',
            'foreign', 'globe', 'goes', 'golden', 'motion', 'movie',
            'musical', 'or', 'original', 'picture', 'rt', 'series', 'song',
            'television', 'to', 'tv', 'movies'
        ]
        bigrams_list = []
        ignore_list = ["@", "#"]
        # Words stripped from a candidate title in post-processing below.
        # NOTE(review): post_process is defined inside this loop but also read
        # after it ends — that relies on Python's loop-variable leaking.
        post_process = ['wins', 'goldenglobes']
        for tweet in TWEET_BY_AWARD_DICT[award]:
            # Strip punctuation, skip retweets.
            tweet = re.sub(r'[^\w\s]', '', tweet)
            if tweet[0:2] == "RT":
                #print (tweet)
                continue
            bigram = nltk.bigrams(tweet.split())
            temp = []
            for item in bigram:
                if item[0].lower() not in winner_stoplist and item[1].lower(
                ) not in winner_stoplist:
                    temp.append(item)
            # Drop bigrams whose words start with @ or # (mentions/hashtags).
            for item in temp:
                if item[0][0] not in ignore_list and item[1][0] not in ignore_list:
                    bigrams_list.append(item)
        # print(bigrams_list)
        freq[award] = nltk.FreqDist([' '.join(item) for item in bigrams_list])
    for award in OFFICIAL_AWARDS:
        #print(freq[award].most_common(1))
        # Most frequent candidate is the provisional winner.
        temp_winner = freq[award].most_common(1)[0][0]
        imdb_flag = True
        for word in post_process:
            if word in temp_winner.lower().split():
                #print ('check')
                #print (temp_winner)
                temp_winner = temp_winner.lower().replace(word, '').strip()
                #print (temp_winner)
                #print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
                # A "wins"/"goldenglobes" hit means the tweet text already
                # named the winner directly; skip the IMDb lookup.
                imdb_flag = False
                break
        # winners[award] = temp_winner.lower()
        if award in award_list_person:
            winners[award] = temp_winner.lower()
        else:
            if imdb_flag == False:
                winners[award] = temp_winner.lower()
            else:
                if award != 'best original song - motion picture':
                    # Validate the candidate against IMDb titles released the
                    # year before the ceremony.
                    movies = ia.search_movie(temp_winner)
                    ss = ''
                    #print (movies)
                    #print ('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
                    for item in movies:
                        try:
                            if item['year'] == int(year) - 1:
                                ss = item['title']
                                break
                        except KeyError:
                            continue
                    if ss == '':
                        winners[award] = temp_winner.lower()
                    else:
                        print(ss)
                        print(temp_winner)
                        print(SequenceMatcher(None, ss, temp_winner).ratio())
                        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
                        # Keep the tweet-derived candidate when it closely
                        # matches the IMDb title; otherwise trust IMDb.
                        if SequenceMatcher(None, ss, temp_winner).ratio() > 0.85:
                            winners[award] = temp_winner.lower()
                        else:
                            winners[award] = ss.lower()
                else:
                    winners[award] = temp_winner.lower()
    GG_RESULT["winner"] = winners
    return winners
def compare_output(s1, s2):
    """ Compare stdout strings s1 and s2.
        s1 is from readelf, s2 from elftools readelf.py
        Return pair success, errmsg. If comparison succeeds, success is True
        and errmsg is empty. Otherwise success is False and errmsg holds a
        description of the mismatch.
        Note: this function contains some rather horrible hacks to ignore
        differences which are not important for the verification of pyelftools.
        This is due to some intricacies of binutils's readelf which pyelftools
        doesn't currently implement, features that binutils doesn't support,
        or silly inconsistencies in the output of readelf, which I was
        reluctant to replicate. Read the documentation for more details.
    """
    def prepare_lines(s):
        # Lowercase and drop blank lines for a whitespace-insensitive compare.
        return [line for line in s.lower().splitlines() if line.strip() != '']

    def filter_readelf_lines(lines):
        # Drop everything between an '.eh_frame section' marker and the next
        # '.debug_frame'/'.zdebug_frame' marker, plus 'unknown: length' lines.
        filter_out = False
        for line in lines:
            if 'of the .eh_frame section' in line:
                filter_out = True
            elif 'of the .debug_frame section' in line or \
                 'of the .zdebug_frame section' in line:
                filter_out = False
            if not filter_out:
                if not line.startswith('unknown: length'):
                    yield line

    lines1 = prepare_lines(s1)
    lines2 = prepare_lines(s2)
    lines1 = list(filter_readelf_lines(lines1))
    flag_after_symtable = False
    if len(lines1) != len(lines2):
        return False, 'Number of lines different: %s vs %s' % (
            len(lines1), len(lines2))
    for i in range(len(lines1)):
        if 'symbol table' in lines1[i]:
            flag_after_symtable = True
        # Compare ignoring whitespace
        lines1_parts = lines1[i].split()
        lines2_parts = lines2[i].split()
        if ''.join(lines1_parts) != ''.join(lines2_parts):
            ok = False
            try:
                # Ignore difference in precision of hex representation in the
                # last part (i.e. 008f3b vs 8f3b)
                if (''.join(lines1_parts[:-1]) == ''.join(lines2_parts[:-1])
                        and int(lines1_parts[-1], 16) == int(
                            lines2_parts[-1], 16)):
                    ok = True
            except ValueError:
                pass
            # Character-level diff of the two lines, used both for the '@'
            # heuristic below and for the mismatch report.
            sm = SequenceMatcher()
            sm.set_seqs(lines1[i], lines2[i])
            changes = sm.get_opcodes()
            if flag_after_symtable:
                # Detect readelf's adding @ with lib and version after
                # symbol name.
                if (len(changes) == 2 and changes[1][0] == 'delete'
                        and lines1[i][changes[1][1]] == '@'):
                    ok = True
            elif 'at_const_value' in lines1[i]:
                # On 32-bit machines, readelf doesn't correctly represent
                # some boundary LEB128 numbers
                val = lines2_parts[-1]
                num2 = int(val, 16 if val.startswith('0x') else 10)
                if num2 <= -2**31 and '32' in platform.architecture()[0]:
                    ok = True
            elif 'os/abi' in lines1[i]:
                # Different readelf versions name the same ABI differently.
                if 'unix - gnu' in lines1[i] and 'unix - linux' in lines2[i]:
                    ok = True
            elif ('unknown at value' in lines1[i]
                  and 'dw_at_apple' in lines2[i]):
                # pyelftools knows Apple DWARF attributes readelf does not.
                ok = True
            else:
                # Symbol-type annotations readelf may or may not emit.
                for s in ('t (tls)', 'l (large)'):
                    if s in lines1[i] or s in lines2[i]:
                        ok = True
                        break
            if not ok:
                errmsg = 'Mismatch on line #%s:\n>>%s<<\n>>%s<<\n (%r)' % (
                    i, lines1[i], lines2[i], changes)
                return False, errmsg
    return True, ''
def get_diff(job_id, v1, v2, n1=None, n2=None):
    """Return a JSON payload describing the line diff between two stored
    log versions (*v1*, *v2*) of the job identified by *job_id*.

    The payload carries both line lists plus difflib opcodes so the client
    can render the diff. (*n1*/*n2* are accepted but unused here.)
    """
    job = fetch(Job, id=job_id)
    first = str_dict(job.logs[v1]).splitlines()
    second = str_dict(job.logs[v2]).splitlines()
    matcher = SequenceMatcher(None, first, second)
    payload = {'first': first, 'second': second,
               'opcodes': matcher.get_opcodes()}
    return jsonify(payload)