Example #1
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

# connection.from_settings_filter() is a project-local helper here, not part of
# upstream scrapy_redis; import it from wherever it lives in your project.
from . import connection
from BloomfilterOnRedis import BloomFilter  # Redis-backed Bloom filter (see Example #6 below)


class RFPDupeFilter(BaseDupeFilter):
    def __init__(self, server, key):
        self.server = server
        self.key = key
        self.bf = BloomFilter(server, key, blockNum=1)  # you can increase blockNum if you are filtering too many urls

    @classmethod
    def from_settings(cls, settings):

        server = connection.from_settings_filter(settings)
        key = "dupefilter:%s" % int(time.time())
        return cls(server, key)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        fp = request_fingerprint(request)
        if self.bf.isContains(fp):  # already seen
            return True
        else:
            self.bf.insert(fp)
            return False
        # This returns the number of values added, zero if already exists.
        #added = self.server.sadd(self.key, fp)
        #return added == 0

    def close(self, reason=''):
        self.clear()

    def clear(self):
        """Clears fingerprints data."""
        self.server.delete(self.key)
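
Every example here constructs BloomFilter(server, key, blockNum=1) and calls isContains() / insert(), but the class itself is never shown. The sketch below is the commonly circulated Redis-backed implementation of that interface; the bitmap size, hash seeds, and SimpleHash helper are illustrative assumptions, not code taken from the examples' own module.

# Minimal sketch of the Redis-backed BloomFilter these examples assume.
# Bitmap size, seeds, and the SimpleHash helper are illustrative only.
class SimpleHash(object):
    def __init__(self, cap, seed):
        self.cap = cap      # bitmap size in bits
        self.seed = seed    # per-function multiplier

    def hash(self, value):
        ret = 0
        for ch in value:
            ret = ret * self.seed + ord(ch)
        return (self.cap - 1) & ret


class BloomFilter(object):
    def __init__(self, server, key, blockNum=1):
        self.bit_size = 1 << 31   # one 256 MB Redis string per block
        self.seeds = [5, 7, 11, 13, 31, 37, 61]
        self.server = server
        self.key = key
        self.blockNum = blockNum
        self.hashfuncs = [SimpleHash(self.bit_size, seed) for seed in self.seeds]

    def isContains(self, fp):
        """Return True only if every bit for this fingerprint is already set."""
        if not fp:
            return False
        # Shard fingerprints over blockNum Redis keys to raise total capacity.
        name = self.key + str(int(fp[0:2], 16) % self.blockNum)
        for f in self.hashfuncs:
            if not self.server.getbit(name, f.hash(fp)):
                return False
        return True

    def insert(self, fp):
        """Set the bits for this fingerprint."""
        name = self.key + str(int(fp[0:2], 16) % self.blockNum)
        for f in self.hashfuncs:
            self.server.setbit(name, f.hash(fp), 1)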
Example #2
    def __init__(self, server, key):
        """Initialize duplication filter

        Parameters
        ----------
        server : Redis instance
        key : str
            Where to store fingerprints
        """
        self.server = server
        self.key = key
        self.bf = BloomFilter(server, key, blockNum=1)  # you can increase blockNum if you are filtering too many urls
Example #3
class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplication filter"""

    def __init__(self, server, key):
        """Initialize duplication filter

        Parameters
        ----------
        server : Redis instance
        key : str
            Where to store fingerprints
        """
        self.server = server
        self.key = key
        self.bf = BloomFilter(server, key, blockNum=1)  # you can increase blockNum if you are filtering too many urls

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings_filter(settings)
        # Create a one-time key; needed to support using this class as a
        # standalone dupefilter with Scrapy's default scheduler. If Scrapy
        # passed the spider to open(), this wouldn't be needed.
        key = "dupefilter:%s" % int(time.time())
        return cls(server, key)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        fp = request_fingerprint(request)
        if self.bf.isContains(fp):
            return True
        else:
            self.bf.insert(fp)
            return False
        # added = self.server.sadd(self.key, fp)
        # return not added

    def close(self, reason):
        """Delete data on close. Called by scrapy's scheduler"""
        self.clear()

    def clear(self):
        """Clears fingerprints data"""
        self.server.delete(self.key)
Example #4
    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.

        """
        self.server = server
        self.key = key
        self.debug = debug
        self.logdupes = True
        self.bf = BloomFilter(server, key, blockNum=1)  # added by me
Example #5
    def __init__(self, server, key):
        """Initialize duplication filter

        Parameters
        ----------
        server : Redis instance
        key : str
            Where to store fingerprints
        """
        self.server = server
        self.key = key
        self.bf = BloomFilter(server, key, blockNum=1)  # you can increase blockNum if you are filtering too many urls
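
The "increase blockNum" comment can be made concrete. Assuming the usual 256 MB bitmap per block and 7 hash functions (as in the sketch after Example #1), a rough false-positive estimate for a given number of stored URLs looks like this; raise blockNum when the estimate gets too high for your crawl:

import math

m = 2 ** 31    # bits per 256 MB block (assumed block size)
k = 7          # number of hash functions (assumed)
n = 10 ** 8    # URLs expected in one block -- placeholder figure

p = (1 - math.exp(-k * n / float(m))) ** k
print('approx false-positive rate: %.5f' % p)   # roughly 1e-4 for these numbers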
Example #6
# encoding=utf-8
import redis
from BloomfilterOnRedis import BloomFilter
from scrapy.utils.request import request_fingerprint
from scrapy import Request

rconn = redis.Redis('172.16.188.121', 6379)
bf = BloomFilter(rconn, 'spider_1:dupefilter')

if __name__ == '__main__':
    # while True:
    url = 'http://www.x14hack.com/'
    request = Request(url)
    fp = request_fingerprint(request)
    print(fp, end=' ')
    if bf.isContains(fp):
        print('exist!')
    else:
        print('not exist!')
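
To plug a filter like this into a crawl, Scrapy has to be told about it in settings.py. A sketch, assuming the RFPDupeFilter class lives in a hypothetical myproject.dupefilter module; the Redis address matches the test script above:

# settings.py -- sketch; the module path is a placeholder for wherever
# your RFPDupeFilter actually lives.
DUPEFILTER_CLASS = 'myproject.dupefilter.RFPDupeFilter'

# Optional: run the whole scheduler through scrapy_redis so the request
# queue and the fingerprint key share the same Redis instance.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
SCHEDULER_PERSIST = True

REDIS_HOST = '172.16.188.121'
REDIS_PORT = 6379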
Example #7
import logging
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

from scrapy_redis import defaults
from scrapy_redis.connection import get_redis_from_settings

from BloomfilterOnRedis import BloomFilter  # Redis-backed Bloom filter (see Example #6)

logger = logging.getLogger(__name__)


class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplicates filter.

    This class can also be used with default Scrapy's scheduler.

    """

    logger = logger

    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.

        """
        self.server = server
        self.key = key
        self.debug = debug
        self.logdupes = True
        self.bf = BloomFilter(server, key, blockNum=1)  # added by me

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.


        """
        server = get_redis_from_settings(settings)
        # XXX: This creates a one-time key; needed to support using this class
        # as a standalone dupefilter with Scrapy's default scheduler. If Scrapy
        # passed the spider to open(), this wouldn't be needed.
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

    @classmethod
    def from_crawler(cls, crawler):
        """Returns instance from crawler.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.

        """
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        """Returns True if request was already seen.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool

        """
        fp = self.request_fingerprint(request)
        # This returns the number of values added, zero if already exists.
        # added = self.server.sadd(self.key, fp)
        # return added == 0
        if self.bf.isContains(fp):
            return True
        else:
            self.bf.insert(fp)
            return False

    def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str

        """
        return request_fingerprint(request)

    def close(self, reason=''):
        """Delete data on close. Called by Scrapy's scheduler.

        Parameters
        ----------
        reason : str, optional

        """
        self.clear()

    def clear(self):
        """Clears fingerprints data."""
        self.server.delete(self.key)

    def log(self, request, spider):
        """Logs given request.

        Parameters
        ----------
        request : scrapy.http.Request
        spider : scrapy.spiders.Spider

        """
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request},
                              extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request},
                              extra={'spider': spider})
            self.logdupes = False
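
As a quick sanity check, this filter can also be driven directly from a Python shell; the Redis address and key below are placeholders:

import redis
from scrapy import Request

server = redis.StrictRedis('127.0.0.1', 6379)    # placeholder address
df = RFPDupeFilter(server, key='dupefilter:test', debug=True)

req = Request('http://www.x14hack.com/')
print(df.request_seen(req))   # False: first time, the fingerprint gets inserted
print(df.request_seen(req))   # True: the Bloom filter now reports it as seen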
Example #8
    def __init__(self, server, key):
        self.server = server
        self.key = key
        self.bf = BloomFilter(server, key, blockNum=1)  # you can increase blockNum if you are filtering too many urls