def __init__(self, settings, crawler):
    """Initialize retry handling with a Redis-backed cookie pool."""
    RetryMiddleware.__init__(self, settings)
    # Build the Redis client from the project settings and fail fast
    # when the server is unreachable.
    client = connection.from_settings(settings)
    client.ping()
    self.redis = client
    # Seed the cookie pool for the running spider.
    initCookie(self.redis, crawler.spider.name)
def __init__(self, settings, crawler):
    """Set up retries plus the Redis connection used to store cookies."""
    RetryMiddleware.__init__(self, settings)
    # Fall back to a direct Redis connection when no pre-built 'RCONN'
    # object is supplied in the settings.
    fallback = redis.Redis(
        crawler.settings.get('REDIS_HOST', '192.168.195.1'),
        crawler.settings.get('REDIS_PORT', 6379))
    self.rconn = settings.get('RCONN', fallback)
    initCookie(self.rconn, crawler.spider.name)
def __init__(self, settings, crawler):
    """Initialize retries and a Redis connection for the cookie pool.

    Uses the pre-built connection from the 'RCONN' setting when present,
    otherwise connects directly using REDIS_HOST/REDIS_PORT.
    """
    RetryMiddleware.__init__(self, settings)
    # Bug fix: the fallback host was misspelled 'localhsot', which made
    # the default connection target a non-existent hostname.
    self.rconn = settings.get(
        "RCONN",
        redis.Redis(crawler.settings.get('REDIS_HOST', 'localhost'),
                    crawler.settings.get('REDIS_PORT', 6379)))
    initCookie(self.rconn, crawler.spider.name)
def __init__(self, settings, crawler):
    """Initialize retries and seed the Qichabao cookies into Redis."""
    RetryMiddleware.__init__(self, settings)
    # Perform the first login to obtain a fresh cookie set...
    cookies = QichabaoCookie().init_cookie()
    # ...and persist it as JSON under the shared Redis key.
    payload = json.dumps(cookies, ensure_ascii=False)
    PyRedis().get_redis().set("qichabao:Cookies", payload)
def __init__(self, settings, crawler): #自己获取的ip self.TIMES = 10 RetryMiddleware.__init__(self, settings) self.rconn = settings.get( "RCONN", redis.Redis(crawler.settings.get('REDIS_HOST', 'localhsot'), crawler.settings.get('REDIS_PORT', 6379)))
def __init__(self, settings, crawler):
    """Initialize retries and an authenticated Redis cookie connection.

    Uses the 'RCONN' setting when provided; otherwise connects with
    REDIS_HOST/REDIS_PORT/REDIS_PASSWORD from the crawler settings.
    """
    RetryMiddleware.__init__(self, settings)
    # Bug fix: default host was misspelled 'localhsot'.
    self.rconn = settings.get(
        "RCONN",
        redis.Redis(host=crawler.settings.get('REDIS_HOST', 'localhost'),
                    port=crawler.settings.get('REDIS_PORT', 6379),
                    password=crawler.settings.get('REDIS_PASSWORD', '')))
    initCookie(self.rconn, crawler.spider.name)
def __init__(self, settings, crawler):
    """Initialize retry handling and push login cookies into Redis."""
    # Delegate standard retry configuration to the parent class.
    RetryMiddleware.__init__(self, settings)
    # Connect to db 14 of the Redis instance configured in the Scrapy
    # settings; decode_responses=True makes values come back as str.
    self.redis_connection = redis.from_url(
        settings['REDIS_URL'], db=14, decode_responses=True)
    # Register the cookies for this spider, keyed by its name.
    init_cookie(self.redis_connection, crawler.spider.name)
def __init__(self, settings, crawler):
    """Initialize retries and the cookie-pool Redis connection.

    Uses the pre-built 'RCONN' connection when supplied; otherwise
    connects with the COOKIE_REDIS_* settings (positional args to
    redis.Redis are host, port, db).
    """
    RetryMiddleware.__init__(self, settings)
    # Bug fix: the fallback host was misspelled 'localhsot'.
    self.rconn = settings.get(
        "RCONN",
        redis.Redis(settings.get('COOKIE_REDIS_HOST', 'localhost'),
                    settings.get('COOKIE_REDIS_PORT', 6379),
                    settings.get('COOKIE_REDIS_DB', 2)))
    # Populate Redis with the initial cookies for this spider.
    init_cookie(self.rconn, crawler.spider.name)
def __init__(self, settings, crawler):
    """Initialize retries and the cookie store used by the xiaoman spider."""
    RetryMiddleware.__init__(self, settings)
    # xiaoman cookies live in db 1; decode_responses=True yields str
    # values. (The weixin spider previously used db 4 instead.)
    self.rconn = redis.from_url(
        settings['REDIS_URL'], db=1, decode_responses=True)
    tool = CookieTool()
    self.cookie_tool = tool
    tool.init_cookie(self.rconn, crawler.spider.name)
def __init__(self, settings, crawler):
    """Initialize retries and a Redis client for the cookies database."""
    RetryMiddleware.__init__(self, settings)
    # decode_responses=True so stored cookies come back as str, not bytes.
    self.r = redis.Redis(
        host=settings.get('REDIS_HOST'),
        port=settings.get('REDIS_PORT'),
        db=settings.get('COOKIES_DB'),
        decode_responses=True)
    # Load this spider's cookies into the pool.
    self.init_cookie(crawler.spider.name)
def __init__(self, settings, crawler):
    """Initialize retries and the Redis cookie store (db 2).

    Weibo cookies are managed per spider name; to unify them, every
    spider other than SoGouSpider/QxjSpider registers its cookies
    under the single name "SinaSpider".
    """
    RetryMiddleware.__init__(self, settings)
    self.rconn = settings.get(
        "RCONN",
        redis.Redis(crawler.settings.get('REDIS_HOST', 'localhost'),
                    crawler.settings.get('REDIS_PORT', 6379), 2))
    # Bug fix: `name` was only bound inside this branch, so reaching
    # initCookie with a SoGou/Qxj spider could raise NameError; cookie
    # registration is therefore scoped to the branch that defines it.
    if crawler.spider.name not in ["SoGouSpider", "QxjSpider"]:
        # Unify on the Weibo cookie namespace.
        name = "SinaSpider"
        initCookie(self.rconn, name)
def __init__(self, settings, crawler):
    """Load the shared cookie pool from MongoDB.

    The collection must already contain documents of the form
    ``{"username": "<email>", "cookies": "<json string>"}``; run the
    simulated-login helper ``init_cookies()`` once (currently commented
    out) to write them the first time.
    """
    self.logger = logging.getLogger("---Cookies池---")
    RetryMiddleware.__init__(self, settings)
    # init_cookies()  # simulated login that seeds the collection
    collection = pymongo.MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB_NAME][COOKIES_COLLECTION_NAME]
    # Each document stores its cookies as a JSON string; parse them all.
    self.cookies_pool = [json.loads(doc['cookies']) for doc in collection.find()]
def __init__(self, settings):
    """Parse PROXY_LIST entries into a ``{proxy_url: user_pass}`` mapping.

    Each entry looks like ``scheme://[user:pass@]host:port``; the
    credentials are stripped from the key and kept as the value
    (empty string when absent).
    """
    RetryMiddleware.__init__(self, settings)
    self.proxy_list = settings.get('PROXY_LIST')
    self.proxies = {}
    for line in self.proxy_list:
        # Raw string fixes the invalid '\w' escape sequences that warn
        # on modern Python.
        parts = re.match(r'(\w+://)(\w+:\w+@)?(.+)', line)
        if not parts:
            # Robustness fix: skip malformed entries instead of
            # crashing with AttributeError on parts.group(...).
            continue
        # Cut trailing '@' off the credentials group.
        if parts.group(2):
            user_pass = parts.group(2)[:-1]
        else:
            user_pass = ''
        self.proxies[parts.group(1) + parts.group(3)] = user_pass
def process_exception(self, request, exception, spider):
    """Record the failing URL, then defer to the stock retry logic."""
    outcome = RetryMiddleware.process_exception(self, request, exception, spider)
    # Stash the URL in meta so the failure log can reference it.
    request.meta['url'] = request.url
    self.record_failed('failed.txt', request, exception, 'url')
    return outcome
def __init__(self, settings):
    """Load the PROXY_LIST file into a ``{proxy_url: user_pass}`` mapping.

    Each line looks like ``scheme://[user:pass@]host:port``; credentials
    are stripped from the key and kept as the value (empty string when
    absent).
    """
    RetryMiddleware.__init__(self, settings)
    self.proxy_list = settings.get('PROXY_LIST')
    self.proxies = {}
    # `with` guarantees the handle is closed even if parsing raises
    # (the original leaked the file object on error).
    with open(self.proxy_list) as fin:
        for line in fin:
            # Raw string fixes the invalid '\w' escapes.
            parts = re.match(r'(\w+://)(\w+:\w+@)?(.+)', line)
            if not parts:
                # Skip malformed lines instead of raising AttributeError.
                continue
            # Cut trailing '@' off the credentials group.
            if parts.group(2):
                user_pass = parts.group(2)[:-1]
            else:
                user_pass = ''
            self.proxies[parts.group(1) + parts.group(3)] = user_pass
def __init__(self, settings):
    """Configure proxy selection according to PROXY_MODE.

    RANDOMIZE_PROXY_EVERY_REQUESTS / RANDOMIZE_PROXY_ONCE load the
    PROXY_LIST file into ``self.proxies``; RANDOMIZE_PROXY_ONCE also
    picks one proxy up front. SET_CUSTOM_PROXY parses the single
    CUSTOM_PROXY setting instead.

    Raises:
        KeyError: when a randomize mode is selected without PROXY_LIST.
        ValueError: when CUSTOM_PROXY is malformed.
    """
    RetryMiddleware.__init__(self, settings)
    self.mode = settings.get('PROXY_MODE')
    self.proxy_list = settings.get('PROXY_LIST')
    self.chosen_proxy = ''
    if self.mode == Mode.RANDOMIZE_PROXY_EVERY_REQUESTS or self.mode == Mode.RANDOMIZE_PROXY_ONCE:
        if self.proxy_list is None:
            raise KeyError('PROXY_LIST setting is missing')
        self.proxies = {}
        # `with` replaces the original try/finally and still guarantees
        # the handle is closed; raw strings fix the invalid '\w' escapes.
        with open(self.proxy_list) as fin:
            for line in fin:
                parts = re.match(r'(\w+://)(\w+:\w+@)?(.+)', line.strip())
                if not parts:
                    continue
                # Cut trailing '@' off the credentials group.
                if parts.group(2):
                    user_pass = parts.group(2)[:-1]
                else:
                    user_pass = ''
                self.proxies[parts.group(1) + parts.group(3)] = user_pass
        if self.mode == Mode.RANDOMIZE_PROXY_ONCE:
            self.chosen_proxy = random.choice(list(self.proxies.keys()))
    elif self.mode == Mode.SET_CUSTOM_PROXY:
        custom_proxy = settings.get('CUSTOM_PROXY')
        self.proxies = {}
        parts = re.match(r'(\w+://)([^:]+?:[^@]+?@)?(.+)', custom_proxy.strip())
        if not parts:
            raise ValueError('CUSTOM_PROXY is not well formatted')
        if parts.group(2):
            user_pass = parts.group(2)[:-1]
        else:
            user_pass = ''
        self.proxies[parts.group(1) + parts.group(3)] = user_pass
        self.chosen_proxy = parts.group(1) + parts.group(3)
def _retry(self, request, reason, spider):
    """Request a fresh Tor circuit, then hand off to the stock retry."""
    log.msg('Changing proxy')
    # Connect to the local Tor control port.
    # NOTE(review): telnetlib in Python 3 requires bytes for
    # read_until/write; this passes str, so the code looks Python 2
    # era — confirm the target interpreter.
    tn = telnetlib.Telnet('127.0.0.1', 9051)
    tn.read_until("Escape character is '^]'.", 2)
    # HACK: the control-port password is hard-coded here.
    tn.write('AUTHENTICATE "267765"\r\n')
    tn.read_until("250 OK", 2)
    # NEWNYM asks Tor to switch to a new circuit (new exit IP).
    tn.write("signal NEWNYM\r\n")
    tn.read_until("250 OK", 2)
    tn.write("quit\r\n")
    tn.close()
    # Give Tor a moment to build the new circuit before retrying.
    time.sleep(3)
    log.msg('Proxy changed')
    return RetryMiddleware._retry(self, request, reason, spider)
def _retry(self, request, reason, spider):
    """Rotate the Tor exit node before delegating to the parent retry."""
    log.msg('Changing proxy')
    # Talk to the local Tor control port.
    # NOTE(review): telnetlib on Python 3 expects bytes, not str — this
    # code appears to predate the 2→3 migration; confirm.
    tn = telnetlib.Telnet('127.0.0.1', 9051)
    tn.read_until("Escape character is '^]'.", 2)
    # HACK: hard-coded control-port password.
    tn.write('AUTHENTICATE "267765"\r\n')
    tn.read_until("250 OK", 2)
    # The NEWNYM signal makes Tor use a new circuit (changes exit IP).
    tn.write("signal NEWNYM\r\n")
    tn.read_until("250 OK", 2)
    tn.write("quit\r\n")
    tn.close()
    # Allow time for the new circuit to be established.
    time.sleep(3)
    log.msg('Proxy changed')
    return RetryMiddleware._retry(self, request, reason, spider)
def __init__(self, settings):
    """Delegate all retry configuration to the stock RetryMiddleware."""
    RetryMiddleware.__init__(self, settings)
def __init__(self, settings):
    """Initialize by deferring entirely to RetryMiddleware's setup."""
    RetryMiddleware.__init__(self, settings)
def _retry(self, request, reason, spider):
    """Rotate the proxy via an external script, then retry as usual."""
    print('start RetryChangeProxyMiddleware:')
    # External script that switches the proxy (presumably requests a
    # new Tor identity — confirm what nym.sh does on deploy hosts).
    os.system('/usr/local/bin/nym.sh')
    # Give the new proxy/circuit time to come up before retrying.
    time.sleep(3)
    # print 'ret RetryChangeProxyMiddleware:'
    return RetryMiddleware._retry(self, request, reason, spider)
def __init__(self, settings, crawler):
    """Initialize retries and the Redis connection backing the cookie pool."""
    RetryMiddleware.__init__(self, settings)
    # Use the injected 'RCONN' connection when present; otherwise talk
    # to a local Redis on the default port.
    fallback = redis.Redis("localhost", 6379)
    self.rconn = settings.get("RCONN", fallback)
    initCookie(self.rconn, crawler.spider.name)
def setUp(self):
    """Build a crawler/spider pair and a RetryMiddleware capped at 2 retries."""
    self.crawler = get_crawler(Spider)
    self.spider = self.crawler._create_spider('foo')
    self.mw = RetryMiddleware.from_crawler(self.crawler)
    # Keep tests fast: allow only two retry attempts.
    self.mw.max_retry_times = 2
    # URL the tests use to trigger download failures.
    self.invalid_url = 'http://www.scrapytest.org/invalid_url'
def __init__(self, settings):
    """Initialize both parent middlewares with the same settings.

    NOTE(review): explicit double-parent initialization — assumes the
    two bases do not rely on cooperative super() chains; confirm.
    """
    RetryMiddleware.__init__(self, settings)
    DownloaderBaseMiddleware.__init__(self, settings)
def __init__(self, settings, crawler):
    """Initialize retries and seed this spider's cookies in Redis db 1."""
    RetryMiddleware.__init__(self, settings)
    url = settings['REDIS_URL']
    # db 1 holds the cookies; decode_responses=True yields str values.
    self.rconn = redis.from_url(url, db=1, decode_responses=True)
    init_cookie(self.rconn, crawler.spider.name)
def setUp(self):
    """Create a spider and a RetryMiddleware limited to 2 retry attempts."""
    crawler = get_crawler(Spider)
    self.spider = crawler._create_spider("foo")
    self.mw = RetryMiddleware.from_crawler(crawler)
    # Cap retries so failure-path tests finish quickly.
    self.mw.max_retry_times = 2
def __init__(self, settings):
    """Initialize retries plus a configurable wait between attempts."""
    RetryMiddleware.__init__(self, settings)
    # Interval between retries from RETRY_TIME_INTERVAL — presumably
    # seconds; confirm against where self.retry_intervals is consumed.
    self.retry_intervals = settings.getint('RETRY_TIME_INTERVAL')
def setUp(self):
    """Set up a crawler, spider, and RetryMiddleware with 2 max retries."""
    self.crawler = get_crawler(Spider)
    self.spider = self.crawler._create_spider('foo')
    self.mw = RetryMiddleware.from_crawler(self.crawler)
    # Cap retries so the tests exercise the exhausted-retries path fast.
    self.mw.max_retry_times = 2
def setUp(self):
    """Prepare crawler, spider, middleware (2 retries) and a failing URL."""
    self.crawler = get_crawler(Spider)
    self.spider = self.crawler._create_spider('foo')
    self.mw = RetryMiddleware.from_crawler(self.crawler)
    # Two attempts keep the retry-exhaustion tests fast.
    self.mw.max_retry_times = 2
    # URL used to provoke download errors in the tests.
    self.invalid_url = 'http://www.scrapytest.org/invalid_url'
def _retry(self, request, reason, spider):
    """Switch proxies through an external helper script before retrying."""
    print( 'start RetryChangeProxyMiddleware:')
    # Shell out to the proxy-rotation script — presumably requests a
    # new identity; confirm nym.sh exists on the target machine.
    os.system('/usr/local/bin/nym.sh')
    # Wait for the new proxy to become usable.
    time.sleep(3)
    # print 'ret RetryChangeProxyMiddleware:'
    return RetryMiddleware._retry(self, request, reason, spider)
def __init__(self, settings):
    """Initialize retries and load every account's cookies into Redis."""
    RetryMiddleware.__init__(self, settings)
    # Bug fix: the fallback host was misspelled 'localhsot', so the
    # default connection pointed at a non-existent hostname.
    self.rconn = redis.Redis(settings.get('REDIS_HOST', 'localhost'),
                             settings.get('REDIS_PORT', 6379))
    self.cookiemanager = CookiesManager()
    # Populate the Redis cookie pool for all managed accounts.
    self.cookiemanager.init_all_cookies(self.rconn)
def __init__(self, crawler):
    """Initialize retry behavior and random user-agent rotation."""
    RetryMiddleware.__init__(self, crawler.settings)
    RandomUserAgentBase.__init__(self, crawler)
def __init__(self, settings):
    """Initialize retries and a Redis client from REDIS_HOST/REDIS_PORT."""
    RetryMiddleware.__init__(self, settings)
    # NOTE(review): no fallback defaults here — both settings must be
    # present, otherwise redis.Redis receives None; confirm intended.
    self.redis_conn = redis.Redis(host=settings.get('REDIS_HOST'), port=settings.get('REDIS_PORT'))
def __init__(self, settings, crawler):
    """Delegate to RetryMiddleware; `crawler` is accepted but unused here."""
    RetryMiddleware.__init__(self, settings)
def _retry(self, request, reason, spider):
    """Point the request at the configured HTTP proxy, then retry."""
    log.msg('Changing proxy')
    # NOTE(review): `settings` is not a parameter or local — presumably
    # the module-level Scrapy settings object; verify the file's imports.
    request.meta['proxy'] = settings.get('HTTP_PROXY')
    return RetryMiddleware._retry(self, request, reason, spider)
def __init__(self, settings, crawler):
    """Hook up retries and register this spider's cookies in Redis."""
    RetryMiddleware.__init__(self, settings)
    # Default to a local Redis instance unless 'RCONN' overrides it.
    self.rconn = settings.get("RCONN", redis.Redis("localhost", 6379))
    spider_name = crawler.spider.name
    initCookie(self.rconn, spider_name)
def get_spider_and_middleware(self, settings=None):
    """Return a (spider, RetryMiddleware) pair built from `settings`."""
    # An empty settings dict is used when the caller passes nothing.
    crawler = get_crawler(Spider, settings or {})
    return (
        crawler._create_spider('foo'),
        RetryMiddleware.from_crawler(crawler),
    )
def _retry(self, request, reason, spider):
    """Attach the configured HTTP proxy to the request before retrying."""
    # log.msg('Changing proxy')
    # NOTE(review): `settings` must be a module-level Scrapy settings
    # object — it is not defined in this method; confirm the import.
    request.meta['proxy'] = settings.get('HTTP_PROXY')
    return RetryMiddleware._retry(self, request, reason, spider)
def __init__(self, settings, crawler):
    """Initialize retries and the Redis connection used for cookies.

    Uses the 'RCONN' setting when a pre-built connection is injected,
    otherwise connects via REDIS_HOST/REDIS_PORT.
    """
    RetryMiddleware.__init__(self, settings)
    # Bug fix: the fallback host was misspelled 'localhsot'.
    self.rconn = settings.get(
        "RCONN",
        redis.Redis(crawler.settings.get('REDIS_HOST', 'localhost'),
                    crawler.settings.get('REDIS_PORT', 6379)))
    initCookie(self.rconn, crawler.spider.name)
def setUp(self):
    """Create the crawler/spider fixtures and a 2-retry RetryMiddleware."""
    self.crawler = get_crawler(Spider)
    self.spider = self.crawler._create_spider('foo')
    self.mw = RetryMiddleware.from_crawler(self.crawler)
    # Limit attempts so failure-path tests stay fast.
    self.mw.max_retry_times = 2
def __init__(self, settings):
    """Initialize retries and a Redis client on database 3.

    REDIS_HOST/REDIS_PORT are module-level constants here, not values
    read from `settings`.
    """
    RetryMiddleware.__init__(self, settings)
    # db=3 — presumably a dedicated cookie/proxy database; confirm
    # against the rest of the project.
    self.rconn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=3)
def __init__(self, settings):
    """Initialize retries and attach the middleware's logger.

    NOTE(review): `self.crawler` is not set by this method, and
    RetryMiddleware.__init__ takes only settings — unless a subclass
    or from_crawler hook assigns it first, this raises AttributeError;
    confirm how instances are constructed.
    """
    RetryMiddleware.__init__(self, settings)
    self.set_logger(self.crawler)
def _retry(self, request, reason, spider):
    """Route the retried request through the configured HTTP proxy."""
    log.msg("Changing proxy")
    # NOTE(review): `settings` is neither a parameter nor a local —
    # presumably the module-level Scrapy settings object; verify.
    request.meta["proxy"] = settings.get("HTTP_PROXY")
    return RetryMiddleware._retry(self, request, reason, spider)