def findDynamicContent(firstPage, secondPage):
    """
    Locate the regions that differ between two renderings of the same page.

    Both pages are aligned with difflib.SequenceMatcher. Matching blocks of
    DYNAMICITY_MARK_LENGTH characters or fewer are discarded, and every gap
    left between the surviving blocks (including a gap at the very start or
    end of firstPage) is reported by the static text that surrounds it.

    :param firstPage: content of the first response
    :param secondPage: content of the second response
    :return: list of (prefix, suffix) tuples. Each non-None element is a
        regex-escaped slice of firstPage; prefix is None for a gap at the
        start of the page and suffix is None for a gap at its end. The list
        is empty when either page is empty/None or no gap was found.
    """
    dynamicMarks = []

    # Nothing to align; this also keeps SequenceMatcher away from None input
    if not firstPage or not secondPage:
        return dynamicMarks

    # The matcher object itself can be neither sliced nor iterated, the
    # (a, b, size) triples have to be requested explicitly
    matches = SequenceMatcher(None, firstPage, secondPage).get_matching_blocks()

    # Removing too small matching blocks
    blocks = [block for block in matches if block[2] > DYNAMICITY_MARK_LENGTH]

    if not blocks:
        return dynamicMarks

    # Making of dynamic markings based on prefix/suffix principle; the None
    # sentinels stand for "start of page" and "end of page"
    blocks = [None] + blocks + [None]

    for current, following in zip(blocks, blocks[1:]):
        prefix = firstPage[current[0]:current[0] + current[2]] if current else None
        suffix = firstPage[following[0]:following[0] + following[2]] if following else None

        # First block starts at offset 0: nothing dynamic in front of it
        if prefix is None and following[0] == 0:
            continue

        # Last block reaches the end of the page: nothing dynamic after it
        if suffix is None and current[0] + current[2] >= len(firstPage):
            continue

        # Floor division keeps the slice bounds integral on Python 2 and 3
        dynamicMarks.append((
            re.escape(prefix[-DYNAMICITY_MARK_LENGTH // 2:]) if prefix else None,
            re.escape(suffix[:DYNAMICITY_MARK_LENGTH // 2]) if suffix else None,
        ))

    return dynamicMarks
def findDynamicContent(firstPage, secondPage):
    """
    This function checks if the provided pages have dynamic content. If they
    are dynamic, proper markings will be made

    Matching blocks of at most 2 * DYNAMICITY_BOUNDARY_LENGTH characters are
    ignored. Every gap between the remaining blocks is described by the
    static text around it. When both sides of a gap are known they are cut
    down to DYNAMICITY_BOUNDARY_LENGTH characters and, if the dynamic text
    itself starts/ends with an alphanumeric character, the corresponding
    boundary is passed through trimAlphaNum().

    :param firstPage: content of the first response
    :param secondPage: content of the second response
    :return: list of (prefix, suffix) tuples, where either element may be
        None; None (not a list) when one of the pages is empty

    The example assumes DYNAMICITY_BOUNDARY_LENGTH is 20:

    >>> head = "Lorem ipsum dolor sit amet, congue tation referrentur ei sed. Ne nec legimus habemus recusabo, natum reque et per. "
    >>> tail = "Facer tritani reprehendunt eos id, modus constituam est te. Usu sumo indoctum ad, pri paulo molestiae complectitur no."
    >>> findDynamicContent(head + tail, head + "<script src='ads.js'></script>" + tail)
    [('natum reque et per. ', 'Facer tritani repreh')]
    """
    if not firstPage or not secondPage:
        return

    matches = SequenceMatcher(None, firstPage, secondPage).get_matching_blocks()
    dynamicMarkings = []

    # Removing too small matching blocks
    blocks = [block for block in matches if block[2] > 2 * DYNAMICITY_BOUNDARY_LENGTH]

    if not blocks:
        return dynamicMarkings

    # Making of dynamic markings based on prefix/suffix principle; the None
    # sentinels stand for "start of page" and "end of page"
    blocks = [None] + blocks + [None]

    for current, following in zip(blocks, blocks[1:]):
        prefix = firstPage[current[0]:current[0] + current[2]] if current else None
        suffix = firstPage[following[0]:following[0] + following[2]] if following else None

        # First block starts at offset 0: nothing dynamic in front of it
        if prefix is None and following[0] == 0:
            continue

        # Last block reaches the end of the page: nothing dynamic after it
        if suffix is None and current[0] + current[2] >= len(firstPage):
            continue

        if prefix and suffix:
            prefix = prefix[-DYNAMICITY_BOUNDARY_LENGTH:]
            suffix = suffix[:DYNAMICITY_BOUNDARY_LENGTH]

            # Inspect the dynamic text of the first page that actually has
            # some between the two boundaries. Match objects cannot be
            # ordered on Python 3, so max() over them is not an option, and
            # a page without a match must not be dereferenced.
            pattern = r"(?s)%s(.+)%s" % (re.escape(prefix), re.escape(suffix))

            for page in (firstPage, secondPage):
                match = re.search(pattern, page)

                if match:
                    infix = match.group(1)

                    if infix[0].isalnum():
                        prefix = trimAlphaNum(prefix)

                    if infix[-1].isalnum():
                        suffix = trimAlphaNum(suffix)

                    break

        dynamicMarkings.append((prefix if prefix else None, suffix if suffix else None))

    return dynamicMarkings
def find_dynamic_content(first_page, second_page):
    """
    Find the parts that differ between two pages and record a marking for
    each of them in byz.dynamic_markings.

    Matching blocks of 32 characters or fewer are ignored. Every gap between
    the remaining blocks is described by the static text around it, trimmed
    of leading/trailing alphanumeric characters and cut to 16 characters.

    :param first_page: content of the first page
    :param second_page: content of the second page
    :return: None; each marking is appended to byz.dynamic_markings as a
        (prefix, suffix) tuple in which either element may be None
    """
    # Matching blocks of this size or smaller are treated as noise; a marking
    # keeps half of that many characters on each side of a gap
    min_block_length = 32
    mark_length = min_block_length // 2

    def trim_alpha_num(value):
        """
        Strip alphanumeric characters from both ends of the given string.

        :param value: string to trim (None and "" are returned unchanged)
        :return: the trimmed string
        """
        while value and value[-1].isalnum():
            value = value[:-1]
        while value and value[0].isalnum():
            value = value[1:]
        return value

    # Nothing to align; this also keeps SequenceMatcher away from None input
    if not first_page or not second_page:
        return

    matches = SequenceMatcher(None, first_page, second_page).get_matching_blocks()

    # Remove the matching blocks that are too small
    blocks = [block for block in matches if block[2] > min_block_length]

    if not blocks:
        return

    # Build the markings on the prefix/suffix principle; the None sentinels
    # stand for "start of page" and "end of page"
    blocks = [None] + blocks + [None]

    for current, following in zip(blocks, blocks[1:]):
        prefix = first_page[current[0]:current[0] + current[2]] if current else None
        suffix = first_page[following[0]:following[0] + following[2]] if following else None

        # First block starts at offset 0: nothing dynamic in front of it
        if prefix is None and following[0] == 0:
            continue

        # Last block reaches the end of the page: nothing dynamic after it
        if suffix is None and current[0] + current[2] >= len(first_page):
            continue

        prefix = trim_alpha_num(prefix)
        suffix = trim_alpha_num(suffix)

        byz.dynamic_markings.append((
            prefix[-mark_length:] if prefix else None,
            suffix[:mark_length] if suffix else None,
        ))
def findDynamicContent(firstPage, secondPage):
    """
    This function checks if the provided pages have dynamic content. If they
    are dynamic, proper markings will be made

    Matching blocks of DYNAMICITY_MARK_LENGTH characters or fewer are
    ignored. Every gap between the remaining blocks is described by the
    static text around it, passed through trimAlphaNum(), cut to half of
    DYNAMICITY_MARK_LENGTH and regex-escaped. The number of marked regions
    is printed when at least one was found.

    :param firstPage: content of the first response
    :param secondPage: content of the second response
    :return: list of (prefix, suffix) tuples, where either element may be
        None; empty when a page is empty or nothing dynamic was found
    """
    dynamicMarkings = []

    if not firstPage or not secondPage:
        return dynamicMarkings

    matches = SequenceMatcher(None, firstPage, secondPage).get_matching_blocks()

    # Removing too small matching blocks
    blocks = [block for block in matches if block[2] > DYNAMICITY_MARK_LENGTH]

    # Making of dynamic markings based on prefix/suffix principle
    if blocks:
        # The None sentinels stand for "start of page" and "end of page"
        blocks = [None] + blocks + [None]

        for current, following in zip(blocks, blocks[1:]):
            prefix = firstPage[current[0]:current[0] + current[2]] if current else None
            suffix = firstPage[following[0]:following[0] + following[2]] if following else None

            # First block starts at offset 0: nothing dynamic in front of it
            if prefix is None and following[0] == 0:
                continue

            # Last block reaches the end of the page: nothing dynamic after it
            if suffix is None and current[0] + current[2] >= len(firstPage):
                continue

            prefix = trimAlphaNum(prefix)
            suffix = trimAlphaNum(suffix)

            # Floor division keeps the slice bounds integral on Python 2 and 3
            dynamicMarkings.append((
                re.escape(prefix[-DYNAMICITY_MARK_LENGTH // 2:]) if prefix else None,
                re.escape(suffix[:DYNAMICITY_MARK_LENGTH // 2]) if suffix else None,
            ))

    if len(dynamicMarkings) > 0:
        infoMsg = "dynamic content marked for removal (%d region%s)" % (
            len(dynamicMarkings), 's' if len(dynamicMarkings) > 1 else '')
        # Single-argument call form is valid on both Python 2 and Python 3
        print(infoMsg)

    # Hand the markings back instead of dropping them with the local scope
    return dynamicMarkings
def findDynamicContent(firstPage, secondPage):
    """
    Locate the regions that differ between two renderings of the same page.

    Both pages are aligned with difflib.SequenceMatcher. Matching blocks of
    DYNAMICITY_MARK_LENGTH characters or fewer are discarded, and every gap
    left between the surviving blocks (including a gap at the very start or
    end of firstPage) is reported by the static text that surrounds it.

    :param firstPage: content of the first response
    :param secondPage: content of the second response
    :return: list of (prefix, suffix) tuples. Each non-None element is a
        regex-escaped slice of firstPage; prefix is None for a gap at the
        start of the page and suffix is None for a gap at its end. The list
        is empty when either page is empty/None or no gap was found.
    """
    dynamicMarks = []

    # Nothing to align; this also keeps SequenceMatcher away from None input
    if not firstPage or not secondPage:
        return dynamicMarks

    # The matcher object itself can be neither sliced nor iterated, the
    # (a, b, size) triples have to be requested explicitly
    matches = SequenceMatcher(None, firstPage, secondPage).get_matching_blocks()

    # Removing too small matching blocks
    blocks = [block for block in matches if block[2] > DYNAMICITY_MARK_LENGTH]

    if not blocks:
        return dynamicMarks

    # Making of dynamic markings based on prefix/suffix principle; the None
    # sentinels stand for "start of page" and "end of page"
    blocks = [None] + blocks + [None]

    for current, following in zip(blocks, blocks[1:]):
        prefix = firstPage[current[0]:current[0] + current[2]] if current else None
        suffix = firstPage[following[0]:following[0] + following[2]] if following else None

        # First block starts at offset 0: nothing dynamic in front of it
        if prefix is None and following[0] == 0:
            continue

        # Last block reaches the end of the page: nothing dynamic after it
        if suffix is None and current[0] + current[2] >= len(firstPage):
            continue

        # Floor division keeps the slice bounds integral on Python 2 and 3
        dynamicMarks.append((
            re.escape(prefix[-DYNAMICITY_MARK_LENGTH // 2:]) if prefix else None,
            re.escape(suffix[:DYNAMICITY_MARK_LENGTH // 2]) if suffix else None,
        ))

    return dynamicMarks
def checkDynamicContent(firstPage, secondPage):
    """
    This function checks if the provided pages have dynamic content. If they
    are dynamic, proper markings will be made.

    The check is skipped when kb.nullConnection or conf.longestCommon is
    set. Otherwise kb.dynamicMarkings is rebuilt from scratch: matching
    blocks of conf.minMatchBlock characters or fewer are ignored and every
    gap between the remaining blocks is stored as a (prefix, suffix) tuple
    of regex-escaped boundaries, each at most conf.dynMarkLength characters
    long (None at the start/end of the page). If conf.seqMatcher.a holds
    content, the marked regions are collapsed in it as well.

    :param firstPage: content of the first response
    :param secondPage: content of the second response
    :return: None
    """
    if kb.nullConnection:
        debugMsg = "dynamic content checking skipped "
        debugMsg += "because NULL connection used"
        logger.debug(debugMsg)
        return

    if conf.longestCommon:
        debugMsg = "dynamic content checking skipped "
        debugMsg += "because longest common comparison used"
        logger.debug(debugMsg)
        return

    infoMsg = "searching for dynamic content"
    logger.info(infoMsg)

    matches = SequenceMatcher(None, firstPage, secondPage).get_matching_blocks()
    kb.dynamicMarkings = []

    # Drop the matching blocks that are too small to be a reliable boundary
    blocks = [block for block in matches if block[2] > conf.minMatchBlock]

    if blocks:
        # The None sentinels stand for "start of page" and "end of page"
        blocks = [None] + blocks + [None]

        for current, following in zip(blocks, blocks[1:]):
            prefix = firstPage[current[0]:current[0] + current[2]] if current else None
            suffix = firstPage[following[0]:following[0] + following[2]] if following else None

            # First block starts at offset 0: nothing dynamic in front of it
            if prefix is None and following[0] == 0:
                continue

            # Last block reaches the end of the page: nothing dynamic after it
            if suffix is None and current[0] + current[2] >= len(firstPage):
                continue

            kb.dynamicMarkings.append((
                re.escape(prefix[-conf.dynMarkLength:]) if prefix else None,
                re.escape(suffix[:conf.dynMarkLength]) if suffix else None,
            ))

    if len(kb.dynamicMarkings) > 0:
        infoMsg = "dynamic content marked for removal (%d region%s)" % (len(kb.dynamicMarkings), 's' if len(kb.dynamicMarkings) > 1 else '')
        logger.info(infoMsg)

        if conf.seqMatcher.a:
            # NOTE(review): the markings are already re.escape()d and are
            # used here both as pattern and as replacement text, so the
            # backslashes added by re.escape() can end up in
            # conf.seqMatcher.a. Left as is because code outside this
            # function may apply the same substitution to the pages it
            # compares against - confirm before changing.
            for prefix, suffix in kb.dynamicMarkings:
                if prefix is None:
                    conf.seqMatcher.a = re.sub('(?s)^.+%s' % suffix, suffix, conf.seqMatcher.a)
                elif suffix is None:
                    conf.seqMatcher.a = re.sub('(?s)%s.+$' % prefix, prefix, conf.seqMatcher.a)
                else:
                    conf.seqMatcher.a = re.sub('(?s)%s.+%s' % (prefix, suffix), '%s%s' % (prefix, suffix), conf.seqMatcher.a)