    def from_settings(cls, settings, spidername):
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        retries = settings.get('SCHEUDLER_ITEM_RETRIES', 3)
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')

        my_level = settings.get('SC_LOG_LEVEL', 'DEBUG')
        my_name = "%s_%s" % (spidername, get_raspberrypi_ip_address())
        my_output = settings.get('SC_LOG_STDOUT', False)
        my_json = settings.get('SC_LOG_JSON', True)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = "%s_%s.log" % (spidername, get_raspberrypi_ip_address())
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = CustomLogFactory.get_instance(json=my_json,
                                               name=my_name,
                                               stdout=my_output,
                                               level=my_level,
                                               dir=my_dir,
                                               file=my_file,
                                               bytes=my_bytes,
                                               backups=my_backups)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex)
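Every example on this page calls get_raspberrypi_ip_address() to tag per-machine artifacts (log file names, proxy list files, public-IP fallbacks). The helper itself is not shown here; the following is only a minimal sketch of one common way such a function is written (the UDP-socket trick), an assumption rather than this project's actual implementation:

import socket

def get_raspberrypi_ip_address():
    # Hypothetical implementation (the real helper is not shown in these examples):
    # connect a UDP socket toward a public address to learn which local interface
    # the OS would route through, then return that interface's IP address.
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(("8.8.8.8", 80))  # UDP connect sends no packets
        return s.getsockname()[0]
    except socket.error:
        return "127.0.0.1"
    finally:
        s.close()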
Example #2

    def from_settings(cls, settings, spidername):
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
        hits = settings.get('QUEUE_HITS', 10)
        window = settings.get('QUEUE_WINDOW', 60)
        mod = settings.get('QUEUE_MODERATED', False)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
        add_type = settings.get('SCHEDULER_TYPE_ENABLED', False)
        add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
        retries = settings.get('SCHEUDLER_ITEM_RETRIES', 3)
        ip_regex = settings.get('IP_ADDR_REGEX', '.*')

        my_level = settings.get('SC_LOG_LEVEL', 'DEBUG')
        my_name = "%s_%s"%(spidername, get_raspberrypi_ip_address())
        my_output = settings.get('SC_LOG_STDOUT', False)
        my_json = settings.get('SC_LOG_JSON', True)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = "%s_%s.log"%(spidername, get_raspberrypi_ip_address())
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = CustomLogFactory.get_instance(json=my_json,
                                               name=my_name,
                                               stdout=my_output,
                                               level=my_level,
                                               dir=my_dir,
                                               file=my_file,
                                               bytes=my_bytes,
                                               backups=my_backups)

        return cls(server, persist, up_int, timeout, retries, logger, hits,
                   window, mod, ip_refresh, add_type, add_ip, ip_regex)
Example #3
    def from_crawler(cls, crawler):
        settings = crawler.settings
        my_level = settings.get('SC_LOG_LEVEL', 'DEBUG')
        my_name = "%s_%s" % (crawler.spidercls.name, get_raspberrypi_ip_address())
        my_output = settings.get('SC_LOG_STDOUT', False)
        my_json = settings.get('SC_LOG_JSON', True)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = "%s_%s.log" % (crawler.spidercls.name, get_raspberrypi_ip_address())
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        cls.logger = LogFactory.get_instance(json=my_json,
                                             name=my_name,
                                             stdout=my_output,
                                             level=my_level,
                                             dir=my_dir,
                                             file=my_file,
                                             bytes=my_bytes,
                                             backups=my_backups)
        return cls(crawler.settings)
Example #4
    def setup_logger(cls, settings, spidername):

        my_level = settings.get('SC_LOG_LEVEL', 'DEBUG')
        my_name = "%s_%s" % (spidername, get_raspberrypi_ip_address())
        my_output = settings.get('SC_LOG_STDOUT', False)
        my_json = settings.get('SC_LOG_JSON', True)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = "%s_%s.log" % (spidername, get_raspberrypi_ip_address())
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        logger = CustomLogFactory.get_instance(json=my_json,
                                               name=my_name,
                                               stdout=my_output,
                                               level=my_level,
                                               dir=my_dir,
                                               file=my_file,
                                               bytes=my_bytes,
                                               backups=my_backups)
        return logger
Example #5
    def from_crawler(cls, crawler):
        settings = crawler.settings
        my_level = settings.get('SC_LOG_LEVEL', 'DEBUG')
        my_name = "%s_%s" % (crawler.spidercls.name,
                             get_raspberrypi_ip_address())
        my_output = settings.get('SC_LOG_STDOUT', False)
        my_json = settings.get('SC_LOG_JSON', True)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        my_file = "%s_%s.log" % (crawler.spidercls.name,
                                 get_raspberrypi_ip_address())
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        cls.logger = LogFactory.get_instance(json=my_json,
                                             name=my_name,
                                             stdout=my_output,
                                             level=my_level,
                                             dir=my_dir,
                                             file=my_file,
                                             bytes=my_bytes,
                                             backups=my_backups)
        return cls(crawler.settings)
Example #6
    def setup(self, settings):
        '''
        Does the actual setup of the middleware
        '''
        # set up the default sc logger
        my_level = settings.get('SC_LOG_LEVEL', 'INFO')
        #my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
        my_name = "%s_%s" % (settings['SPIDER_NAME'], get_raspberrypi_ip_address())
        my_output = settings.get('SC_LOG_STDOUT', False)
        my_json = settings.get('SC_LOG_JSON', True)
        my_dir = settings.get('SC_LOG_DIR', 'logs')
        my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
        #my_file = settings.get('SC_LOG_FILE', 'main.log')
        my_file = "%s_%s.log" % (settings['SPIDER_NAME'], get_raspberrypi_ip_address())
        my_backups = settings.get('SC_LOG_BACKUPS', 5)

        self.logger = CustomLogFactory.get_instance(json=my_json,
                                                    name=my_name,
                                                    stdout=my_output,
                                                    level=my_level,
                                                    dir=my_dir,
                                                    file=my_file,
                                                    bytes=my_bytes,
                                                    backups=my_backups)

        #self.logger.setLevel(logging.DEBUG)
        self.retry_http_codes = set(int(x) for x in
                                    settings.getlist('RETRY_HTTP_CODES'))

        # stats setup
        self.stats_dict = {}
        self.settings = settings
        self.name = self.settings['SPIDER_NAME']
        if self.settings['STATS_STATUS_CODES']:
            self.redis_conn = redis.Redis(host=self.settings.get('REDIS_HOST'),
                                          port=self.settings.get('REDIS_PORT'))
            self._setup_stats_status_codes()
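For the middleware setup above to run, the class has to be registered in the Scrapy project settings. A minimal sketch, assuming a hypothetical module path crawling.log_retry_middleware.LogRetryMiddleware and example values for the custom keys referenced in the code (SPIDER_NAME, RETRY_HTTP_CODES, STATS_STATUS_CODES, REDIS_HOST/REDIS_PORT):

# settings.py (sketch; the module path and values are assumptions, not from this page)
SPIDER_NAME = 'example_spider'
RETRY_HTTP_CODES = [500, 502, 503, 504, 408]
STATS_STATUS_CODES = True
REDIS_HOST = 'localhost'
REDIS_PORT = 6379

DOWNLOADER_MIDDLEWARES = {
    'crawling.log_retry_middleware.LogRetryMiddleware': 510,
}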
Example #7
    def __init__(self, settings):
        # Record the last time a direct (proxy-less) connection was used
        self.last_no_proxy_time = datetime.now()
        # After this many minutes, switch back to a direct connection, since proxies slow things down
        self.recover_interval = 20
        # If a proxy keeps timing out before reaching this usage count, remove it from the
        # proxy file permanently. Set to 0 to never modify the proxy file.
        self.dump_count_threshold = 20
        # File holding the proxy list, one proxy per line in ip:port form (no http:// prefix).
        # Note that this file gets modified, so keep a backup.
        self.proxy_file = "%s_proxyes.list" % get_raspberrypi_ip_address()
        # Whether to invalidate a proxy when it times out
        self.invalid_proxy_flag = True
        # When the number of usable proxies (including the direct connection) drops below this,
        # fetch new proxies from the web. Set it high enough that each IP gets a long enough
        # rest after being asked for a captcha: with, say, ten usable proxies in rotation, each
        # IP only comes up again after several minutes and can serve some requests without a
        # captcha. If the number is too small (say two), IP A is banned after a few requests,
        # the next IP is banned just as quickly, and the whole crawler busy-waits, hurting
        # throughput.
        self.extend_proxy_threshold = 10
        # Initialize the proxy list
        #self.proxyes = [{"proxy": None, "valid": True, "count": 0}, {"proxy": "http://10.10.2.58:6666", "valid": True, "count": 0},
        #                {"proxy": "http://10.110.93.95:8088", "valid": True, "count": 0}]
        self.proxyes = [{"proxy": None, "valid": True, "count": 0}]
        # Start with proxy 0, i.e. the direct connection
        self.proxy_index = 0
        # Number of trusted proxies (e.g. self-hosted HTTP proxies) + 1 for the direct connection
        self.fixed_proxy = len(self.proxyes)
        # Last time new proxies were fetched
        self.last_fetch_proxy_time = datetime.now()
        # Force-fetch new proxies every this many minutes
        self.fetch_proxy_interval = 120
        # A proxy about to be marked invalid is kept if it has already fetched more than this
        # many pages successfully
        self.invalid_proxy_threshold = 200
        if not os.path.exists(self.proxy_file):
            open(self.proxy_file, "w").close()
        # Read the initial proxies from the file
        with open(self.proxy_file, "r") as fd:
            lines = fd.readlines()
            shuffle(lines)
            for line in lines:
                line = line.strip()
                if not line or self.url_in_proxyes("http://" + line):
                    continue
                self.proxyes.append({
                    "proxy": "http://" + line,
                    "valid": True,
                    "count": 0
                })
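Both proxy-middleware examples call self.url_in_proxyes(...), which is not shown on this page. A minimal sketch of what it presumably does, based only on how self.proxyes is built above (a hypothetical helper, not the project's code):

    def url_in_proxyes(self, url):
        # Hypothetical helper: True if this proxy URL is already present in the
        # self.proxyes list of {"proxy": ..., "valid": ..., "count": ...} dicts.
        return any(p["proxy"] == url for p in self.proxyes)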
Example #8
    def __init__(self, settings):
        # Record the last time a direct (proxy-less) connection was used
        self.last_no_proxy_time = datetime.now()
        # After this many minutes, switch back to a direct connection, since proxies slow things down
        self.recover_interval = 20
        # If a proxy keeps timing out before reaching this usage count, remove it from the
        # proxy file permanently. Set to 0 to never modify the proxy file.
        self.dump_count_threshold = 20
        # File holding the proxy list, one proxy per line in ip:port form (no http:// prefix).
        # Note that this file gets modified, so keep a backup.
        self.proxy_file = "%s_proxyes.list" % get_raspberrypi_ip_address()
        # Whether to invalidate a proxy when it times out
        self.invalid_proxy_flag = True
        # When the number of usable proxies (including the direct connection) drops below this,
        # fetch new proxies from the web. Set it high enough that each IP gets a long enough
        # rest after being asked for a captcha: with, say, ten usable proxies in rotation, each
        # IP only comes up again after several minutes and can serve some requests without a
        # captcha. If the number is too small (say two), IP A is banned after a few requests,
        # the next IP is banned just as quickly, and the whole crawler busy-waits, hurting
        # throughput.
        self.extend_proxy_threshold = 10
        # Initialize the proxy list
        #self.proxyes = [{"proxy": None, "valid": True, "count": 0}, {"proxy": "http://10.10.2.58:6666", "valid": True, "count": 0},
        #                {"proxy": "http://10.110.93.95:8088", "valid": True, "count": 0}]
        self.proxyes = [{"proxy": None, "valid": True, "count": 0}]
        # Start with proxy 0, i.e. the direct connection
        self.proxy_index = 0
        # Number of trusted proxies (e.g. self-hosted HTTP proxies) + 1 for the direct connection
        self.fixed_proxy = len(self.proxyes)
        # Last time new proxies were fetched
        self.last_fetch_proxy_time = datetime.now()
        # Force-fetch new proxies every this many minutes
        self.fetch_proxy_interval = 120
        # A proxy about to be marked invalid is kept if it has already fetched more than this
        # many pages successfully
        self.invalid_proxy_threshold = 200
        if not os.path.exists(self.proxy_file):
            open(self.proxy_file, "w").close()
        # Read the initial proxies from the file
        with open(self.proxy_file, "r") as fd:
            lines = fd.readlines()
            shuffle(lines)
            for line in lines:
                line = line.strip()
                if not line or self.url_in_proxyes("http://" + line):
                    continue
                self.proxyes.append({"proxy": "http://" + line,
                                     "valid": True,
                                     "count": 0})
Example #9

    def update_ipaddress(self):
        '''
        Updates the scheduler so it knows its own ip address
        '''
        # assign local ip in case of exception
        self.old_ip = self.my_ip
        self.my_ip = get_raspberrypi_ip_address()
        try:
            obj = urllib2.urlopen(
                settings.get('PUBLIC_IP_URL', 'http://ip.42.pl/raw'))
            results = self.ip_regex.findall(obj.read())
            if len(results) > 0:
                self.my_ip = results[0]
            else:
                raise IOError("Could not get valid IP Address")
            obj.close()
            self.logger.debug("Current public ip: {ip}".format(ip=self.my_ip))
        except IOError:
            self.logger.error("Could not reach out to get public ip")

        if self.old_ip != self.my_ip:
            self.logger.info("Changed Public IP: {old} -> {new}".format(
                old=self.old_ip, new=self.my_ip))
Example #10
    def update_ipaddress(self):
        '''
        Updates the scheduler so it knows its own ip address
        '''
        # assign local ip in case of exception
        self.old_ip = self.my_ip
        self.my_ip = get_raspberrypi_ip_address()
        try:
            obj = urllib2.urlopen(settings.get('PUBLIC_IP_URL',
                                  'http://ip.42.pl/raw'))
            results = self.ip_regex.findall(obj.read())
            if len(results) > 0:
                self.my_ip = results[0]
            else:
                raise IOError("Could not get valid IP Address")
            obj.close()
            self.logger.debug("Current public ip: {ip}".format(ip=self.my_ip))
        except IOError:
            self.logger.error("Could not reach out to get public ip")

        if self.old_ip != self.my_ip:
            self.logger.info("Changed Public IP: {old} -> {new}".format(
                             old=self.old_ip, new=self.my_ip))
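Examples #9 and #10 use the Python 2 urllib2 module; under Python 3 the same lookup goes through urllib.request. A minimal standalone sketch of the equivalent public-IP fetch, assuming the same default URL and a simple dotted-quad regex rather than the project's actual ip_regex:

import re
from urllib.request import urlopen
from urllib.error import URLError

def lookup_public_ip(url='http://ip.42.pl/raw',
                     ip_regex=re.compile(r'\d{1,3}(?:\.\d{1,3}){3}')):
    # Python 3 counterpart of the urllib2 call in the examples above (a sketch,
    # not the project's actual code). Returns the first match or None on failure.
    try:
        with urlopen(url, timeout=10) as obj:
            results = ip_regex.findall(obj.read().decode('utf-8'))
        return results[0] if results else None
    except URLError:
        return None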