def find_common_substrings(content, dict_term, partial_match_min_size, partial_match_thresh):
    """
    Scan content for any common substrings shared with dict_term.
    For each possible common substring, only the first one is found.
    """
    results = []
    len_content = len(content)
    len_term = len(dict_term)
    i = 0
    while i < len_content:
        match_start = -1
        matched_chars = 0

        # Ignore whitespace
        if content[i].isspace():
            i += 1
            continue

        match = None
        for j in range(len_term):
            char_match = (i + matched_chars < len_content
                          and content[i + matched_chars] == dict_term[j])
            if char_match and match_start == -1:
                match_start = j
            elif match_start > -1 and not char_match:
                match = Match(i, match_start, j - match_start)
                break
            if char_match:
                matched_chars += 1

        # Check for a match that runs to the end of the term
        if match is None and match_start > -1:
            match = Match(i, match_start, len_term - match_start)

        # Process content match
        if match is not None:
            # Ignore matches if they aren't big enough.
            # No partial matches for small terms.
            if len_term <= partial_match_min_size:
                if match.size >= len_term:
                    results.append(match)
            # If the term is larger, we can have a partial content match.
            elif match.size >= int(len_term * partial_match_thresh):
                results.append(match)
            i += match.size
        else:
            i += 1

    # Compute word length for matched substrings.
    # The word is terminated by whitespace or '/', unless the character in
    # question is also present in the dictionary term at the same location.
    results_mod = []
    for res in results:
        start_idx = res.a
        start_idx_b = res.b
        while ((start_idx > 0
                and (content[start_idx - 1].isalpha() or content[start_idx - 1] == '_'))
               or (start_idx > 0 and start_idx_b > 0
                   and content[start_idx - 1] == dict_term[start_idx_b - 1])):
            start_idx -= 1
            start_idx_b -= 1

        end_idx = res.a
        end_idx_b = res.b
        while ((end_idx < len_content
                and (content[end_idx].isalpha() or content[end_idx] == '_'))
               or (end_idx < len_content and end_idx_b < len_term
                   and content[end_idx] == dict_term[end_idx_b])):
            end_idx += 1
            end_idx_b += 1

        content_word_length = end_idx - start_idx
        results_mod.append(IPSMatch(res.a, res.b, res.size, content_word_length))

    return results_mod
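# Minimal usage sketch (not part of the original source). It assumes Match is
# difflib's (a, b, size) namedtuple and that IPSMatch is a namedtuple with the
# fields (a, b, size, content_word_length), which matches how the function above
# builds its results; both definitions here are assumptions for illustration only.
from collections import namedtuple
from difflib import Match

IPSMatch = namedtuple('IPSMatch', ['a', 'b', 'size', 'content_word_length'])

# Example: look for occurrences of a dictionary term inside a line of content.
# With partial_match_thresh=0.75, a substring covering at least 75% of the term
# is reported even when the full term is not present.
hits = find_common_substrings("call get_user_name() here", "get_user_name",
                              partial_match_min_size=4, partial_match_thresh=0.75)
for h in hits:
    print(h.a, h.b, h.size, h.content_word_length)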
def align(options, path, included):
    ''' Display side by side. '''
    actual = get_src_file(path)
    if is_simple_inclusion(options, included, actual):
        return
    included = included.rstrip('\n').split('\n')
    actual = actual.rstrip('\n').split('\n')
    matches = [Match(0, 0, 0)] + \
        SequenceMatcher(a=included, b=actual).get_matching_blocks()
    result = []
    diffs_found = False
    fmt = '{{0}}|{{1:{}}}|{{2}}'.format(max([len(x) for x in included]))
    for i in range(len(matches) - 1):
        diffs_found |= align_one(result, options, fmt, included, actual,
                                 matches[i], matches[i + 1])
    if options['names_only']:
        if diffs_found:
            print(path)
    elif diffs_found or options['verbose']:
        print('\n-- {}'.format(path))
        for r in result:
            print(r)
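# Sketch of the pairing trick align() relies on (illustrative only, not from the
# original source): get_matching_blocks() is prefixed with a sentinel Match(0, 0, 0)
# so that each consecutive pair (matches[i], matches[i + 1]) brackets exactly one
# region where the two line lists may differ.
from difflib import SequenceMatcher, Match

included_lines = ['a', 'b', 'x', 'd']
actual_lines = ['a', 'b', 'y', 'd']
blocks = [Match(0, 0, 0)] + SequenceMatcher(a=included_lines, b=actual_lines).get_matching_blocks()
for prev, cur in zip(blocks, blocks[1:]):
    gap_a = included_lines[prev.a + prev.size:cur.a]
    gap_b = actual_lines[prev.b + prev.size:cur.b]
    if gap_a or gap_b:
        print('included:', gap_a, '!= actual:', gap_b)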
def _iter_diff_blocks(a, b):
    m = SequenceMatcher(a=a, b=b, autojunk=False).get_matching_blocks()
    for start, end in zip(chain((Match(0, 0, 0),), m), m):
        if start.a + start.size != end.a or start.b + start.size != end.b:
            yield start.a + start.size, end.a, start.b + start.size, end.b
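# Usage sketch for _iter_diff_blocks (assumes the imports below are present in
# the original module): each yielded tuple (a_start, a_end, b_start, b_end)
# spans one non-matching region between the two sequences.
from difflib import SequenceMatcher, Match
from itertools import chain

for a_start, a_end, b_start, b_end in _iter_diff_blocks('one two three', 'one 2 three'):
    print('a[%d:%d] -> b[%d:%d]' % (a_start, a_end, b_start, b_end))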
def __calculate_blocks(self, rev, min_threshold=0.6):
    """
    Calculate, line by line, which lines have changed based on min_threshold,
    then check for within-line changes (char by char), and return a list of
    matched code blocks that have remained the same.

    :param rev: text of new revision
    :type rev: str
    :param min_threshold: a percentage of similarity for line by line comparisons
    :type min_threshold: float bound between 0.0 and 1.0
    :return: matched code blocks that have remained the same, list of tuples
    :rtype: list [(start position in original text, start pos in new text, length), (), ()...]
    """
    matches = []  # contains tuples of matched parts
    original = self.code_text.splitlines(True)  # original text with split lines (retains \n as a char)
    new = rev.splitlines(True)  # the new submitted text
    found = True

    # Calculate start positions for each line in the original strings
    char_start_original = []
    char_start_new = []
    additive = 0
    for x in range(0, len(original)):
        char_start_original.append(additive)
        additive += len(original[x])
    additive = 0
    for y in range(0, len(new)):
        char_start_new.append(additive)
        additive += len(new[y])

    # Match identification code below
    # worst-case: O(x*y), or O(x^2) if x and y have equal length and changes exist in all lines
    # best-case: O(x)

    # Construct a hash-multiset to speed up the process later on
    cnt = Counter()
    for word in new:
        cnt[word] += 1

    diffs = []
    new_tmp = new  # Temporary object that we modify on the fly, used for reference
    y_list = list(range(0, len(new)))  # Temporary object for dynamic recursion
    counter = 0
    for x in range(0, len(original)):
        diffs.append([])
        if cnt[original[x]] > 0:  # it exists (this is O(1), which helps skip a lot of comparisons)
            y = y_list.index(new_tmp.index(original[x]))  # reference index number in y_list (iterable)
            # Add a matched record that simulates what difflib would find if it were to
            # compare the two strings. Basically the whole new line matches the old;
            # difflib always has a zero-size match as the last element.
            diffs[x].append([
                x, y_list[y], 1.0,
                [
                    Match(a=0, b=0, size=len(original[x])),
                    Match(a=len(original[x]), b=len(original[x]), size=0)
                ]
            ])
            del y_list[y]  # delete the existing object's line
            # This is like deleting the record for the purposes of retrieving the
            # index from the tmp object. Deleting would have shifted the numbers.
            new_tmp[new_tmp.index(original[x])] = 0
            cnt[original[x]] -= 1  # decrement
        else:
            # No duplicate, so we have to compare the item with the rest of the list
            # (code modified or removed)
            for y in range(0, len(y_list)):
                counter += 1
                line_diff_result = difflib.SequenceMatcher(
                    None, original[x], new[y_list[y]], autojunk=False)
                # Sanity check below: the hash-multiset should have removed all identical lines
                if line_diff_result.ratio() == 1:
                    diffs[x].append([
                        x, y_list[y],
                        line_diff_result.ratio(),
                        line_diff_result.get_matching_blocks()
                    ])
                    del y_list[y]
                    break
                else:
                    diffs[x].append([
                        x, y_list[y],
                        line_diff_result.ratio(),
                        line_diff_result.get_matching_blocks()
                    ])
    print("Total comparisons: " + str(counter))  # For visually checking that the optimizations work and we avoid n^2
    del cnt
    del new_tmp

    # Iterate through all the calculated diffs and figure out the best matches.
    # The loop keeps going until all possible matches are found (could be rewritten
    # as a recursive function). to_delete serves as a deleting agent so that, after
    # a match is appended, the lines found to match are eliminated from further
    # consideration.
    to_delete = -999  # init a nonsense number
    while found is True:
        found = False
        max_match = [0, 0, 0, 0]  # This will hold the best match between line_x and line_y
        for x in range(0, len(diffs)):
            for y in range(0, len(diffs[x])):
                if diffs[x][y][1] == to_delete:
                    diffs[x][y] = [0, 0, 0, 0]
                else:
                    if diffs[x][y][2] > min_threshold and max_match[2] < diffs[x][y][2]:
                        max_match = [
                            diffs[x][y][0], diffs[x][y][1],
                            diffs[x][y][2], diffs[x][y][3], x
                        ]
        if max_match[2] != 0:  # we found a line that looks similar enough and was likely moved
            found = True
            for m in max_match[3]:
                if m[2] != 0:  # make sure the matched content covers at least 1 char (sanity check)
                    matches.append([
                        char_start_original[max_match[0]] + m[0],
                        char_start_new[max_match[1]] + m[1],
                        m[2]
                    ])
            del diffs[max_match[4]]
            to_delete = max_match[1]
    return matches
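# Usage sketch (illustrative assumption, not computed by the method): given the
# documented return format above -- (start position in original text, start
# position in new text, length) -- the unchanged fragments can be recovered by
# slicing both revisions. The `blocks` values below are written by hand purely
# to show the tuple layout.
original_text = "a = 1\nb = 2\n"
new_text = "a = 1\nb = 3\n"
blocks = [[0, 0, 6], [6, 6, 4]]  # hand-written example offsets, not method output
for orig_start, new_start, length in blocks:
    assert original_text[orig_start:orig_start + length] == new_text[new_start:new_start + length]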
def add_match(self, page, match):
    # l('ADDING ' + str(match))
    info = RangeMatch(self, page, match)
    # l(info)
    pageno = page.info['number']
    pagenoval = rnum_to_int(pageno)
    if pagenoval == 0 and len(pageno) > 0:
        pagenoval = int(pageno)

    matchint = Interval.between(match.b, match.b + match.size)
    overlaps = [m for m in self.matches if m & matchint]

    # if nearnos matches either, mark flag and amp score
    if pageno:
        nearnos = self.find_nearnos(match)
        # l("GREPME near is [%s] pagenoval %s" % (nearnos, pagenoval))
        # for no in nearnos[1], nearnos[0]:
        if nearnos is None:  # XXX SHOULDN'T BE NEEDED!!!!!!!!!!!!
            nearnos = [None, None]  # placeholders so the index accesses below stay safe
        for no in nearnos[1], nearnos[0]:
            # for no in nearnos:
            if no is not None:
                # l(no.val)
                if no.val == pagenoval:
                    info.notes += 'nearno: %s' % pageno
                    # l("GOODMATCH tc %s, %s %s" % (self.page.index, pageno, self.score))
                    self.score += 1
                    info.nearno = no.word_index
                    break
                if no.val > pagenoval - 10 and match.a < 10:
                    self.score += .01
                    break

    # cases: no overlap
    if len(overlaps) == 0:
        self.matchinfo[matchint] = info
        self.matches = self.matches + IntervalSet([matchint])
    else:
        start = match.b
        end = match.b + match.size
        for i in overlaps:
            oinfo = self.matchinfo[i]
            ostart = oinfo.match.b
            oend = oinfo.match.b + oinfo.match.size
            scootback = 0
            if ostart < start:
                scootback = start - ostart
                start = ostart
            if oend > end:
                end = oend
            info.match = Match(info.match.a - scootback, start, end - start)
            if oinfo.nearno != -1:
                # assert(info.nearno == -1)
                info.nearno = oinfo.nearno
            # info.score += oinfo.score
            # info.pageno = oinfo.pageno
            # info.notes = info.notes + ' ' + info.notes
            # for opageno in oinfo.pagenos:
            #     opagecount = oinfo.pagenos[opageno]
            #     if opageno in info.pagenos:
            #         info.pagenos[opageno] += opagecount
            #     else:
            #         info.pagenos[opageno] = opagecount
        self.matches += IntervalSet([matchint])
        (new_i, ) = [m for m in self.matches if m & matchint]
        self.matchinfo[new_i] = info
def find_longest_match(self, alo, ahi, blo, bhi):
    """Find longest matching block in a[alo:ahi] and b[blo:bhi].

    If isjunk is not defined:

    Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
        alo <= i <= i+k <= ahi
        blo <= j <= j+k <= bhi
    and for all (i',j',k') meeting those conditions,
        k >= k'
        i <= i'
        and if i == i', j <= j'

    In other words, of all maximal matching blocks, return one that
    starts earliest in a, and of all those maximal matching blocks that
    start earliest in a, return the one that starts earliest in b.

    >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
    >>> s.find_longest_match(0, 5, 0, 9)
    Match(a=0, b=4, size=5)

    If isjunk is defined, first the longest matching block is
    determined as above, but with the additional restriction that no
    junk element appears in the block.  Then that block is extended as
    far as possible by matching (only) junk elements on both sides.  So
    the resulting block never matches on junk except as identical junk
    happens to be adjacent to an "interesting" match.

    Here's the same example as before, but considering blanks to be
    junk.  That prevents " abcd" from matching the " abcd" at the tail
    end of the second sequence directly.  Instead only the "abcd" can
    match, and matches the leftmost "abcd" in the second sequence:

    >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
    >>> s.find_longest_match(0, 5, 0, 9)
    Match(a=1, b=0, size=4)

    If no blocks match, return (alo, blo, 0).

    >>> s = SequenceMatcher(None, "ab", "c")
    >>> s.find_longest_match(0, 2, 0, 1)
    Match(a=0, b=0, size=0)
    """

    # CAUTION: stripping common prefix or suffix would be incorrect.
    # E.g.,
    #    ab
    #    acab
    # Longest matching block is "ab", but if common prefix is
    # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
    # strip, so ends up claiming that ab is changed to acab by
    # inserting "ca" in the middle.  That's minimal but unintuitive:
    # "it's obvious" that someone inserted "ac" at the front.
    # Windiff ends up at the same place as diff, but by pairing up
    # the unique 'b's and then matching the first two 'a's.

    a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
    besti, bestj, bestsize = alo, blo, 0
    # find longest junk-free match
    # during an iteration of the loop, j2len[j] = length of longest
    # junk-free match ending with a[i-1] and b[j]
    j2len = {}
    nothing = []
    for i in xrange(alo, ahi):
        # look at all instances of a[i] in b; note that because
        # b2j has no junk keys, the loop is skipped if a[i] is junk
        j2lenget = j2len.get
        newj2len = {}
        for j in b2j.get(a[i], nothing):
            # a[i] matches b[j]
            if j < blo:
                continue
            if j >= bhi:
                break
            k = newj2len[j] = j2lenget(j-1, 0) + 1
            if k > bestsize:
                besti, bestj, bestsize = i-k+1, j-k+1, k
        j2len = newj2len

    # Extend the best by non-junk elements on each end.  In particular,
    # "popular" non-junk elements aren't in b2j, which greatly speeds
    # the inner loop above, but also means "the best" match so far
    # doesn't contain any junk *or* popular non-junk elements.
    while besti > alo and bestj > blo and \
          not isbjunk(b[bestj-1]) and \
          self.match_function(a[besti-1], b[bestj-1]):
        besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
    while besti+bestsize < ahi and bestj+bestsize < bhi and \
          not isbjunk(b[bestj+bestsize]) and \
          self.match_function(a[besti+bestsize], b[bestj+bestsize]):
        bestsize += 1

    # Now that we have a wholly interesting match (albeit possibly
    # empty!), we may as well suck up the matching junk on each
    # side of it too.  Can't think of a good reason not to, and it
    # saves post-processing the (possibly considerable) expense of
    # figuring out what to do with it.  In the case of an empty
    # interesting match, this is clearly the right thing to do,
    # because no other kind of match is possible in the regions.
    while besti > alo and bestj > blo and \
          isbjunk(b[bestj-1]) and \
          self.match_function(a[besti-1], b[bestj-1]):
        besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
    while besti+bestsize < ahi and bestj+bestsize < bhi and \
          isbjunk(b[bestj+bestsize]) and \
          self.match_function(a[besti+bestsize], b[bestj+bestsize]):
        bestsize = bestsize + 1

    return Match(besti, bestj, bestsize)
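# Sketch of the pluggable comparison the extension loops above rely on (an
# assumption about intent, not part of stock difflib): self.match_function
# replaces the strict equality difflib normally uses when widening a match, so
# the hosting class can supply, say, a case-insensitive comparison. The
# hypothetical stub below only illustrates where such a hook would be attached;
# it does not re-run the matching algorithm.
class FuzzySequenceMatcherStub:
    def __init__(self, match_function=None):
        # default to plain equality, mirroring stock difflib behaviour
        self.match_function = match_function or (lambda x, y: x == y)


m = FuzzySequenceMatcherStub(match_function=lambda x, y: x.lower() == y.lower())
print(m.match_function('A', 'a'))  # True: 'A' and 'a' are treated as matching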
def adapt_match(match: Match) -> Match:
    return Match(a=match.a, b=match.b + lower, size=match.size + processing_offset)
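# Context sketch (an assumption, not from the original source): adapt_match reads
# `lower` and `processing_offset` from an enclosing scope, so it presumably runs
# as a nested helper that shifts matches found on a slice of `b` back into
# full-sequence coordinates. The wrapper below is hypothetical and only shows
# that closure pattern.
from difflib import SequenceMatcher, Match
from typing import List


def matches_in_window(a: str, b: str, lower: int, upper: int,
                      processing_offset: int = 0) -> List[Match]:
    # hypothetical wrapper: match `a` against a window of `b`, then remap offsets
    def adapt_match(match: Match) -> Match:
        return Match(a=match.a, b=match.b + lower, size=match.size + processing_offset)

    sm = SequenceMatcher(a=a, b=b[lower:upper], autojunk=False)
    return [adapt_match(m) for m in sm.get_matching_blocks()]


print(matches_in_window('abc', 'xxabcxx', lower=2, upper=5))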