Example No. 1
    @classmethod
    def from_spider(cls, spider):
        """Returns an instance from the given spider.

        Parameters
        ----------
        spider : scrapy.spiders.Spider

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.

        """
        settings = spider.settings
        server = get_redis_from_settings(settings)
        dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY",
                                      defaults.SCHEDULER_DUPEFILTER_KEY)
        key = dupefilter_key % {'spider': spider.name}
        debug = settings.getbool('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG)
        bit = settings.getint('BLOOMFILTER_BIT', BLOOMFILTER_BIT)
        hash_number = settings.getint('BLOOMFILTER_HASH_NUMBER',
                                      BLOOMFILTER_HASH_NUMBER)
        print(key, bit, hash_number)
        instance = cls(server,
                       key=key,
                       debug=debug,
                       bit=bit,
                       hash_number=hash_number)
        return instance
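The setting names above come straight from the snippet; a minimal sketch of the corresponding project settings (the Bloom filter sizing values are illustrative assumptions, not defaults taken from the snippet):

# settings.py (sketch)
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:bloomfilter'  # interpolated with spider.name
DUPEFILTER_DEBUG = False       # log every filtered duplicate when True
BLOOMFILTER_BIT = 30           # assumed: filter size exponent (2**30 bits)
BLOOMFILTER_HASH_NUMBER = 6    # assumed: number of hash functions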
Example No. 2
    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.


        """
        server = get_redis_from_settings(settings)
        # XXX: This creates a one-time key, needed to support using this
        # class as a standalone dupefilter with Scrapy's default scheduler.
        # If Scrapy passed the spider in the open() method, this wouldn't
        # be needed.
        # TODO: Use the SCRAPY_JOB env var as default and fall back to a timestamp.
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)
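A quick check of what that one-time key looks like, assuming scrapy_redis's stock default DUPEFILTER_KEY = 'dupefilter:%(timestamp)s':

import time

key = 'dupefilter:%(timestamp)s' % {'timestamp': int(time.time())}
print(key)  # e.g. 'dupefilter:1700000000'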
Example No. 3
 @classmethod
 def from_crawler(cls, crawler):
     settings = crawler.settings
     if settings.getbool('REDIS_IP_PROXY_ENABLED'):
         ipproxy_key = settings.get('REDIS_IP_PROXY_KEY')
         server = get_redis_from_settings(settings)
         return cls(server=server, key=ipproxy_key)
     return cls(settings.get('PROXIES'))
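A hedged sketch of the settings the two branches above read; the key name and proxy value are placeholders:

REDIS_IP_PROXY_ENABLED = True                # take proxies from Redis
REDIS_IP_PROXY_KEY = 'proxies:pool'          # assumed key layout
PROXIES = ['http://127.0.0.1:8888']          # static fallback when disabled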
Example No. 4
 @classmethod
 def from_spider(cls, spider):
     settings = spider.settings
     server = get_redis_from_settings(settings)
     dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY",
                                   defaults.SCHEDULER_DUPEFILTER_KEY)
     key = dupefilter_key % {'spider': spider.name}
     debug = settings.getbool('DUPEFILTER_DEBUG')
     return cls(server, key=key, debug=debug)
Example No. 5
 @classmethod
 def from_settings(cls, settings, server=None, key=None, debug=None):
     redis_server = get_redis_from_settings(settings)
     debug = settings.getbool('DUPEFILTER_DEBUG')
     redis_db = settings.getint('REDIS_DB')
     redis_blockNum = settings.getint('REDIS_BLOCKNUM')
     redis_key = settings['REDIS_KEY']
     # pass the collected settings through to the filter instead of
     # discarding them with a bare cls()
     return cls(redis_server, key=key, debug=debug, db=redis_db,
                blockNum=redis_blockNum, redis_key=redis_key)
Example No. 6
 def compete_key(self):
     self.server = get_redis_from_settings(self.settings)
     self.redis_compete = self.settings.get('REDIS_COMPETE') % {'spider': self.name}
     self.redis_wait = self.settings.get('REDIS_WAIT') % {'spider': self.name}
     self.key = 1
     # Keep incrementing until SADD succeeds: it returns 0 when the member
     # already exists, so the first unclaimed integer wins.
     while self.server.sadd(self.redis_compete, self.key) == 0:
         self.key = self.key + 1
     self.logger.info("got key %s" % self.key)
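The loop terminates because SADD returns 1 only for the first client to add a given member, so concurrent spider instances racing over the same compete set each claim a distinct integer. A standalone sketch with a plain redis client (connection details and key name are placeholders):

import redis

server = redis.Redis()
key = 1
while server.sadd('myspider:compete', key) == 0:  # 0 means already claimed
    key += 1
print('claimed key', key)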
Example No. 7
 def __init__(self, spider_name, spider_num=psutil.cpu_count(logical=True), write_asyn=True):
     self.write_asyn = write_asyn
     self.spider_name = spider_name
     self.spider_num = spider_num
     self.start_urls_redis_key = "%(name)s:start_urls" % {"name": self.spider_name}
     self.items_redis_key = "%(name)s:items" % {"name": self.spider_name}
     self.setting = get_project_settings()
     self.logger = self.get_loger()
     self.redis = get_redis_from_settings(self.setting)
     # self.redis = redis.Redis(host='192.168.0.117', port=6379, db=0)
     self.logger.info(self.redis)
Example No. 8
    def __init__(self, settings):
        super(CookieMiddleware, self).__init__(settings)
        self.site = settings.get('SITE', None)
        if not self.site:
            raise CrwyScrapyPlugsException('SITE_NOT_SET')

        self.server = get_redis_from_settings(settings)

        self.h = RedisHash(
            'cookie_pool:{}'.format(self.site),
            server=self.server
        )
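The only required setting here is SITE; a sketch with a placeholder value:

SITE = 'example.com'  # cookies end up in the Redis hash 'cookie_pool:example.com'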
Example No. 9
 def __init__(self, crawler):
     self.crawler = crawler
     self.setting = crawler.settings
     self.spider = crawler.spider
     self.spider_name = self.spider.name
     self.http_proxies_queue_redis_key = self.setting.get(
         "HTTP_PROXIES_QUEUE_REDIS_KEY", "%(name)s:http_proxies_queue") % {
             "name": self.spider_name
         }
     self.logger = logging.getLogger(__name__)  # assumes logging is imported; self.logger is otherwise undefined
     self.logger.info(self.http_proxies_queue_redis_key)
     self.user_agent = UserAgent()
     self.redis = get_redis_from_settings(self.setting)
     self.current_proxy = self.get_new_proxy()
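The snippet calls self.get_new_proxy() without showing it. Given the queue-style key name, one plausible sketch (purely an assumption about the author's data layout) pops the next proxy from a Redis list:

 def get_new_proxy(self):
     proxy = self.redis.lpop(self.http_proxies_queue_redis_key)  # bytes or None
     return proxy.decode() if proxy is not None else None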
Example No. 10
    @classmethod
    def from_settings(cls, settings):

        key = 'isbnfilter:%(timestamp)s' % {'timestamp': int(time.time())}
        server = get_redis_from_settings(settings)

        mysql = {}
        mysql['host'] = settings.get('MYSQL_HOST')
        mysql['user'] = settings.get('MYSQL_USER')
        mysql['passwd'] = settings.get('MYSQL_PASSWD')
        mysql['db'] = settings.get('MYSQL_DB')
        mysql['table'] = settings.get('MYSQL_TABLE')

        return cls(server=server, key=key, mysql=mysql)
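The MYSQL_* settings this factory reads, with placeholder values:

MYSQL_HOST = 'localhost'
MYSQL_USER = 'scrapy'
MYSQL_PASSWD = 'secret'
MYSQL_DB = 'books'
MYSQL_TABLE = 'isbn_seen'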
Example No. 11
 @classmethod
 def from_settings(cls, settings):
     server = get_redis_from_settings(settings)
     debug = settings.getbool('DUPEFILTER_DEBUG')
     bot_name = settings.get('BOT_NAME')
     spider_name = settings.get('SPIDER_NAME')
     duperliter_delay_day = settings.getint('DUPEFILTER_DELAY_DAY', 0)
     do_hash = settings.getbool('DUPEFILTER_DO_HASH', True)
     if not spider_name:
         raise NotConfigured('%s - "SPIDER_NAME" is not found.' %
                             cls.__name__)
     return cls(debug=debug, server=server, bot_name=bot_name,
                spider_name=spider_name,
                duperliter_delay_day=duperliter_delay_day,
                do_hash=do_hash)
Example No. 12
 def __init__(self, redis_key, start_urls_num_redis_key, interval=1, bar_name=None):
     threading.Thread.__init__(self)
     self.start_urls_num_redis_key = start_urls_num_redis_key
     self.daemon = True  # setDaemon() is deprecated in favour of the daemon attribute
     self.setting = get_project_settings()
     self.redis = get_redis_from_settings(self.setting)
     self.redis_key = redis_key
     self.total = int(self.redis.get(self.start_urls_num_redis_key) or 0)  # key may be missing
     self.interval = interval
     if bar_name:
         self.bar_name = bar_name
     else:
         self.bar_name = self.redis_key
     self.stop = False
Example No. 13
 def __init__(self, redis_key, interval=1, bar_name=None):
     threading.Thread.__init__(self)
     self.daemon = True  # setDaemon() is deprecated in favour of the daemon attribute
     #self.redis = redis.Redis(host='192.168.0.117', port=6379, db=0)
     self.setting = get_project_settings()
     self.redis = get_redis_from_settings(self.setting)
     self.redis_key = redis_key
     self.total = self.redis.scard(self.redis_key)
     self.interval = interval
     if bar_name:
         self.bar_name = bar_name
     else:
         self.bar_name = self.redis_key
     self.stop = False
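Note the sizing call differs across these progress-bar variants: this one uses SCARD because a dupefilter key is a Redis set, while the list-backed variants below use LLEN. A minimal check (connection and key names are placeholders):

import redis

r = redis.Redis()
print(r.scard('myspider:dupefilter'))  # members in a set
print(r.llen('myspider:start_urls'))   # entries in a list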
Example No. 14
 def __init__(self, redis_key, interval=1, bar_name=None, buffer_size=512):
     threading.Thread.__init__(self)
     self.stop = False
     self.interval = interval
     self.buffer_size = buffer_size
     self.counter = 0
     #self.redis = redis.Redis(host='192.168.0.117', port=6379, db=0)
     self.setting = get_project_settings()
     self.redis = get_redis_from_settings(self.setting)
     self.redis_key = redis_key
     self.total = self.redis.llen(self.redis_key)
     if bar_name:
         self.bar_name = bar_name
     else:
         self.bar_name = self.redis_key
Example No. 15
 def __init__(self, redis_key, bar_name=None, buffer_size=512, show_pbar=True, stop_epoch=12*30, distinct_field=None):
     threading.Thread.__init__(self)
     self.distinct_field = distinct_field
     self.show_pbar = show_pbar
     self.stop = False
     self.stop_epoch = stop_epoch
     self.buffer_size = buffer_size
     self.counter = 0
     self.setting = get_project_settings()
     self.redis = get_redis_from_settings(self.setting)
     self.redis_key = redis_key
     self.total = self.redis.llen(self.redis_key)
     if bar_name:
         self.bar_name = bar_name
     else:
         self.bar_name = self.redis_key
     self.distinct_set = set()
Example No. 16
 def __init__(self, spider_name, spider_num=psutil.cpu_count(logical=True), start_id=0):
     self.spider_name = spider_name
     self.spider_num = spider_num
     self.setting = get_project_settings()
     configure_logging(self.setting)
     self.start_urls_redis_key = self.setting.get("START_URLS_KEY",
                                                  "%(name)s:start_urls") % {"name": self.spider_name}
     self.items_redis_key = self.setting.get("RESULT_ITEMS_REDIS_KEY", "%(name)s:items") % {"name": self.spider_name}
     self.start_urls_num_redis_key = self.setting.get("START_URLS_NUM_KEY",
                                                      "%(name)s:start_urls_num") % {"name": self.spider_name}
     self.http_proxies_queue_redis_key = self.setting.get("HTTP_PROXIES_QUEUE_REDIS_KEY",
                                                           "%(name)s:http_proxies_queue") % {"name": self.spider_name}
     self.dupefilter_redis_key = self.setting.get("SCHEDULER_DUPEFILTER_KEY",
                                                          "%(spider)s:dupefilter") % {"spider": self.spider_name}
     self.logger = logging.getLogger(__name__)
     self.redis = get_redis_from_settings(self.setting)
     self.logger.info(self.redis)
     self.start_id = start_id  # valid range: start_id >= 0 and start_id + spider_num <= 237
     if not (self.start_id >= 0 and self.start_id + self.spider_num <= 237):
         raise ValueError("invalid start_id/spider_num combination")
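A quick sanity check of the bound enforced above: with 237 task slots (the ceiling comes from the snippet's own comment), each launcher owns the ids start_id .. start_id + spider_num - 1. Values below are arbitrary examples:

spider_num = 4
start_id = 8
assert start_id >= 0 and start_id + spider_num <= 237
spider_ids = list(range(start_id, start_id + spider_num))  # [8, 9, 10, 11]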
Example No. 17
    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.


        """
        server = get_redis_from_settings(settings)

        # XXX: This creates a one-time key, needed to support using this
        # class as a standalone dupefilter with Scrapy's default scheduler.
        # If Scrapy passed the spider in the open() method, this wouldn't
        # be needed.
        # TODO: Use the SCRAPY_JOB env var as default and fall back to a timestamp.
        key = defaults.DUPEFILTER_KEY

        debug = settings.getbool('DUPEFILTER_DEBUG')

        mysql = {}
        mysql['host'] = settings.get('MYSQL_HOST')
        mysql['user'] = settings.get('MYSQL_USER')
        mysql['passwd'] = settings.get('MYSQL_PASSWD')
        mysql['db'] = settings.get('MYSQL_DB')
        mysql['table'] = settings.get('MYSQL_TABLE')

        return cls(server, key=key, debug=debug, mysql=mysql)
Example No. 18
 def spider_opened(self, spider):
     logger.info("Opened spider %s (redis spider idle; continuous idle limit: %d)", spider.name, self.idle_number)
     self.redis = get_redis_from_settings(self.settings)
     self.redis_key = self.settings.get('RESULT_ITEMS_REDIS_KEY', '%(name)s:items') % {"name": spider.name}
Example No. 19
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import base64
import random
from scrapy_redis.connection import get_redis_from_settings
from scrapy.utils.project import get_project_settings

redis_cli = get_redis_from_settings(get_project_settings())


class NewsSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None
Example No. 20
 @classmethod
 def from_settings(cls, settings):
     server = get_redis_from_settings(settings)
     key = settings.get('DUPEFILTER_KEY')
     debug = settings.getbool('DUPEFILTER_DEBUG')
     return cls(server, key=key, debug=debug)
Example No. 21
 @classmethod
 def from_spider(cls, spider):
     settings = spider.settings
     server = get_redis_from_settings(settings)
     key = spider.name
     debug = settings.getbool('DUPEFILTER_DEBUG')
     return cls(server, key=key, debug=debug)