Exemplo n.º 1
0
	def findDynamicContent(firstPage, secondPage):
		"""
		Locate the regions of firstPage that are not shared with secondPage.

		The two pages are compared with SequenceMatcher. Matching blocks whose
		length does not exceed DYNAMICITY_MARK_LENGTH are discarded, and every
		gap left between the remaining blocks (including a gap before the first
		or after the last one) produces one marking.

		Returns a list of (prefix, suffix) tuples. prefix is the regex-escaped
		tail of the matching block preceding the gap and suffix the
		regex-escaped head of the matching block following it; either one is
		None when there is no such block. An empty list is returned when no
		matching block is long enough.
		"""
		dynamicMarks = []

		# The SequenceMatcher object itself can be neither sliced nor mutated;
		# the (start in firstPage, start in secondPage, length) triples come
		# from get_matching_blocks()
		matcher = SequenceMatcher(None, firstPage, secondPage)

		# Removing too small matching blocks
		blocks = [
			block for block in matcher.get_matching_blocks()
			if block[2] > DYNAMICITY_MARK_LENGTH
		]

		# Making of dynamic markings based on prefix/suffix principle
		if blocks:
			# None sentinels stand for "no block before" / "no block after"
			blocks.insert(0, None)
			blocks.append(None)

			for before, after in zip(blocks, blocks[1:]):
				prefix = firstPage[before[0]:before[0] + before[2]] if before else None
				suffix = firstPage[after[0]:after[0] + after[2]] if after else None

				# The first kept block starts at offset 0 of firstPage, so
				# there is no leading gap to mark
				if prefix is None and after[0] == 0:
					continue

				# The last kept block reaches the end of firstPage, so there
				# is no trailing gap to mark
				if suffix is None and before[0] + before[2] >= len(firstPage):
					continue

				# Floor division keeps the slice bounds integral on Python 3
				# while giving the same values as "/" did on Python 2
				dynamicMarks.append((
					re.escape(prefix[-DYNAMICITY_MARK_LENGTH // 2:]) if prefix else None,
					re.escape(suffix[:DYNAMICITY_MARK_LENGTH // 2]) if suffix else None,
				))

		return dynamicMarks
Exemplo n.º 2
0
def findDynamicContent(firstPage, secondPage):
    """
    This function checks if the provided pages have dynamic content. If they
    are dynamic, proper markings are built and returned.

    The pages are compared with SequenceMatcher; matching blocks whose length
    does not exceed 2 * DYNAMICITY_BOUNDARY_LENGTH are ignored and every gap
    between the remaining blocks yields one (prefix, suffix) marking taken
    from firstPage. When both boundaries exist they are cut down to
    DYNAMICITY_BOUNDARY_LENGTH characters and passed through trimAlphaNum()
    if the text found between them starts / ends with an alphanumeric
    character.

    Returns None when either page is empty, otherwise a list of
    (prefix, suffix) tuples in which a missing boundary is None.
    """

    if not firstPage or not secondPage:
        return

    # Removing too small matching blocks
    blocks = [
        block
        for block in SequenceMatcher(None, firstPage, secondPage).get_matching_blocks()
        if block[2] > 2 * DYNAMICITY_BOUNDARY_LENGTH
    ]
    dynamicMarkings = []

    # Making of dynamic markings based on prefix/suffix principle
    if blocks:
        # None sentinels stand for "no block before" / "no block after"
        blocks.insert(0, None)
        blocks.append(None)

        for before, after in zip(blocks, blocks[1:]):
            prefix = firstPage[before[0]:before[0] + before[2]] if before else None
            suffix = firstPage[after[0]:after[0] + after[2]] if after else None

            # First kept block starts at offset 0 of firstPage: no leading gap
            if prefix is None and after[0] == 0:
                continue

            # Last kept block reaches the end of firstPage: no trailing gap
            if suffix is None and before[0] + before[2] >= len(firstPage):
                continue

            if prefix and suffix:
                prefix = prefix[-DYNAMICITY_BOUNDARY_LENGTH:]
                suffix = suffix[:DYNAMICITY_BOUNDARY_LENGTH]

                pattern = r"(?s)%s(.+)%s" % (re.escape(prefix), re.escape(suffix))

                # Use the first page in which the boundaries enclose some
                # text. Match objects (and None) cannot be ordered, so they
                # must not be fed to max(); if neither page matches the
                # boundaries are kept untrimmed instead of crashing.
                for page in (firstPage, secondPage):
                    match = re.search(pattern, page)
                    if match:
                        infix = match.group(1)

                        if infix[0].isalnum():
                            prefix = trimAlphaNum(prefix)

                        if infix[-1].isalnum():
                            suffix = trimAlphaNum(suffix)

                        break

            dynamicMarkings.append(
                (prefix if prefix else None, suffix if suffix else None))
    return dynamicMarkings
Exemplo n.º 3
0
Arquivo: common.py Projeto: su18/Curt
def find_dynamic_content(first_page, second_page, mark_length=32):
    """
    Find the parts that differ between the two pages and record a marking
    for each of them in byz.dynamic_markings.

    :param first_page: page the boundary markers are sliced from
    :param second_page: page that first_page is compared against
    :param mark_length: matching blocks whose length does not exceed this
        value are ignored, and each boundary marker is limited by half of it
        (defaults to 32, the value that used to be hard-coded)
    :return: None; (prefix, suffix) tuples are appended to
        byz.dynamic_markings, with None standing for a missing boundary
    """

    def _trim_alpha_num(value):
        """
        Strip alphanumeric characters from both ends of the given string.
        Falsy values (None, empty string) are returned unchanged.
        """
        while value and value[-1].isalnum():
            value = value[:-1]
        while value and value[0].isalnum():
            value = value[1:]
        return value

    # Drop matching blocks that are too small
    blocks = [
        block
        for block in SequenceMatcher(None, first_page,
                                     second_page).get_matching_blocks()
        if block[2] > mark_length
    ]

    # Build the dynamic markings based on the prefix/suffix principle
    if blocks:
        # None sentinels stand for "no block before" / "no block after"
        blocks.insert(0, None)
        blocks.append(None)

        for before, after in zip(blocks, blocks[1:]):
            prefix = first_page[before[0]:before[0] + before[2]] if before else None
            suffix = first_page[after[0]:after[0] + after[2]] if after else None

            # First kept block starts at offset 0 of first_page: no leading gap
            if prefix is None and after[0] == 0:
                continue

            # Last kept block reaches the end of first_page: no trailing gap
            if suffix is None and before[0] + before[2] >= len(first_page):
                continue

            prefix = _trim_alpha_num(prefix)
            suffix = _trim_alpha_num(suffix)

            byz.dynamic_markings.append((
                prefix[-mark_length // 2:] if prefix else None,
                suffix[:mark_length // 2] if suffix else None,
            ))
Exemplo n.º 4
0
def findDynamicContent(firstPage, secondPage):
    """
    This function checks if the provided pages have dynamic content. If they
    are dynamic, the differing regions are located and their number is
    printed.

    The pages are compared with SequenceMatcher; matching blocks whose length
    does not exceed DYNAMICITY_MARK_LENGTH are ignored and every gap between
    the remaining blocks yields one (prefix, suffix) marking made of
    regex-escaped text from firstPage (None for a missing boundary).

    Returns None in all cases; nothing happens when either page is empty.
    The markings themselves are not returned or stored, only counted for
    the printed message.
    """

    if not firstPage or not secondPage:
        return

    # Removing too small matching blocks
    blocks = [
        block
        for block in SequenceMatcher(None, firstPage, secondPage).get_matching_blocks()
        if block[2] > DYNAMICITY_MARK_LENGTH
    ]
    dynamicMarkings = []

    # Making of dynamic markings based on prefix/suffix principle
    if blocks:
        # None sentinels stand for "no block before" / "no block after"
        blocks.insert(0, None)
        blocks.append(None)

        for before, after in zip(blocks, blocks[1:]):
            prefix = firstPage[before[0]:before[0] + before[2]] if before else None
            suffix = firstPage[after[0]:after[0] + after[2]] if after else None

            # First kept block starts at offset 0 of firstPage: no leading gap
            if prefix is None and after[0] == 0:
                continue

            # Last kept block reaches the end of firstPage: no trailing gap
            if suffix is None and before[0] + before[2] >= len(firstPage):
                continue

            # NOTE(review): prefix or suffix may still be None here; this
            # assumes trimAlphaNum() tolerates None - confirm against its
            # definition
            prefix = trimAlphaNum(prefix)
            suffix = trimAlphaNum(suffix)

            # Floor division gives the same bounds "/" produced on Python 2
            # and keeps them integral on Python 3
            dynamicMarkings.append((
                re.escape(prefix[-DYNAMICITY_MARK_LENGTH // 2:]) if prefix else None,
                re.escape(suffix[:DYNAMICITY_MARK_LENGTH // 2]) if suffix else None,
            ))

    if dynamicMarkings:
        infoMsg = "dynamic content marked for removal (%d region%s)" % (
            len(dynamicMarkings), 's' if len(dynamicMarkings) > 1 else '')
        # Single-argument call form prints the same text on Python 2 and 3
        print(infoMsg)
Exemplo n.º 5
0
    def findDynamicContent(firstPage, secondPage):
        """
        Locate the regions of firstPage that are not shared with secondPage.

        The two pages are compared with SequenceMatcher. Matching blocks
        whose length does not exceed DYNAMICITY_MARK_LENGTH are discarded,
        and every gap left between the remaining blocks (including a gap
        before the first or after the last one) produces one marking.

        Returns a list of (prefix, suffix) tuples. prefix is the
        regex-escaped tail of the matching block preceding the gap and
        suffix the regex-escaped head of the matching block following it;
        either one is None when there is no such block. An empty list is
        returned when no matching block is long enough.
        """
        dynamicMarks = []

        # The SequenceMatcher object itself can be neither sliced nor
        # mutated; the (start in firstPage, start in secondPage, length)
        # triples come from get_matching_blocks()
        matcher = SequenceMatcher(None, firstPage, secondPage)

        # Removing too small matching blocks
        blocks = [
            block for block in matcher.get_matching_blocks()
            if block[2] > DYNAMICITY_MARK_LENGTH
        ]

        # Making of dynamic markings based on prefix/suffix principle
        if blocks:
            # None sentinels stand for "no block before" / "no block after"
            blocks.insert(0, None)
            blocks.append(None)

            for before, after in zip(blocks, blocks[1:]):
                prefix = firstPage[before[0]:before[0] + before[2]] if before else None
                suffix = firstPage[after[0]:after[0] + after[2]] if after else None

                # The first kept block starts at offset 0 of firstPage, so
                # there is no leading gap to mark
                if prefix is None and after[0] == 0:
                    continue

                # The last kept block reaches the end of firstPage, so there
                # is no trailing gap to mark
                if suffix is None and before[0] + before[2] >= len(firstPage):
                    continue

                # Floor division keeps the slice bounds integral on Python 3
                # while giving the same values as "/" did on Python 2
                dynamicMarks.append((
                    re.escape(prefix[-DYNAMICITY_MARK_LENGTH // 2:]) if prefix else None,
                    re.escape(suffix[:DYNAMICITY_MARK_LENGTH // 2]) if suffix else None,
                ))

        return dynamicMarks
Exemplo n.º 6
0
def checkDynamicContent(firstPage, secondPage):
    """
    This function checks if the provided pages have dynamic content. If they
    are dynamic, proper markings will be made.

    The check is skipped (with a debug message) when kb.nullConnection or
    conf.longestCommon is set. Otherwise kb.dynamicMarkings is reset and
    filled with (prefix, suffix) tuples of regex-escaped boundary text taken
    from firstPage, one per gap between the matching blocks longer than
    conf.minMatchBlock; a missing boundary is None. If any marking was made
    and conf.seqMatcher.a is non-empty, every marked region inside
    conf.seqMatcher.a is collapsed so that only its boundaries remain.

    Returns None.
    """

    if kb.nullConnection:
        debugMsg = "dynamic content checking skipped "
        debugMsg += "because NULL connection used"
        logger.debug(debugMsg)
        return

    if conf.longestCommon:
        debugMsg = "dynamic content checking skipped "
        debugMsg += "because longest common comparison used"
        logger.debug(debugMsg)
        return

    infoMsg = "searching for dynamic content"
    logger.info(infoMsg)

    blocks = SequenceMatcher(None, firstPage, secondPage).get_matching_blocks()
    kb.dynamicMarkings = []

    # Keep only the matching blocks longer than the configured minimum
    # (single pass instead of repeated list.remove() calls)
    blocks = [block for block in blocks if block[2] > conf.minMatchBlock]

    if blocks:
        # None sentinels stand for "no block before" / "no block after"
        blocks.insert(0, None)
        blocks.append(None)

        for before, after in zip(blocks, blocks[1:]):
            prefix = firstPage[before[0]:before[0] + before[2]] if before else None
            suffix = firstPage[after[0]:after[0] + after[2]] if after else None

            # First kept block starts at offset 0 of firstPage: no leading gap
            if prefix is None and after[0] == 0:
                continue

            # Last kept block reaches the end of firstPage: no trailing gap
            if suffix is None and before[0] + before[2] >= len(firstPage):
                continue

            kb.dynamicMarkings.append((
                re.escape(prefix[-conf.dynMarkLength:]) if prefix else None,
                re.escape(suffix[:conf.dynMarkLength]) if suffix else None,
            ))

    if kb.dynamicMarkings:
        infoMsg = "dynamic content marked for removal (%d region%s)" % (len(kb.dynamicMarkings), 's' if len(kb.dynamicMarkings) > 1 else '')
        logger.info(infoMsg)

        if conf.seqMatcher.a:
            # The stored boundaries are regex-escaped, so they are valid
            # patterns but not valid replacement text: used as a replacement
            # template their backslashes would end up in the page (or be
            # rejected as bad escapes). The boundaries are therefore captured
            # in groups and the text that actually matched is written back.
            for prefix, suffix in kb.dynamicMarkings:
                if prefix is None:
                    conf.seqMatcher.a = re.sub('(?s)^.+(%s)' % suffix, r'\g<1>', conf.seqMatcher.a)
                elif suffix is None:
                    conf.seqMatcher.a = re.sub('(?s)(%s).+$' % prefix, r'\g<1>', conf.seqMatcher.a)
                else:
                    conf.seqMatcher.a = re.sub('(?s)(%s).+(%s)' % (prefix, suffix), r'\g<1>\g<2>', conf.seqMatcher.a)