def process_exception(self, request, exception, spider):
    # Drop the proxy that caused the exception so it is not reused.
    current_proxy = request.meta['proxy']
    try:
        self.del_proxy(current_proxy)
    except ValueError:
        pass
def load(request):
    """Seed the start URL into the redis queue."""
    Product.objects.all().delete()
    r = get_redis()
    r.lpush('marissa:start_urls',
            'https://www.marissacollections.com/shop/clothing.html?limit=9')
    return redirect('clothing:products_list')
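# The view above only seeds the queue; the spider that consumes
# 'marissa:start_urls' is not shown in this snippet.  A minimal sketch of the
# consuming side, assuming a scrapy_redis RedisSpider -- the spider name and
# the CSS selector in parse() are illustrative assumptions, not original code.
from scrapy_redis.spiders import RedisSpider


class MarissaSpider(RedisSpider):
    name = 'marissa'
    # scrapy_redis pops start URLs from this redis key instead of start_urls.
    redis_key = 'marissa:start_urls'

    def parse(self, response):
        # Placeholder extraction logic: follow product links found on the page.
        for href in response.css('a.product-item-link::attr(href)').getall():
            yield response.follow(href, callback=self.parse)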
def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('key', help="Redis key where items are stored") parser.add_argument('--host') parser.add_argument('--port') parser.add_argument('--timeout', type=int, default=5) parser.add_argument('--limit', type=int, default=0) parser.add_argument('--progress-every', type=int, default=100) parser.add_argument('-v', '--verbose', action='store_true') args = parser.parse_args() params = {} if args.host: params['host'] = args.host if args.port: params['port'] = args.port params['password'] = REDIS_PASS logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) r = get_redis(**params) # mongodb connection client = MongoClient(MONGO_URI) db = client[MONGO_DATABASE] host = r.connection_pool.get_connection('info').host logger.info("Waiting for items in '%s' (server: %s)", args.key, host) kwargs = { 'keys': [args.key], 'timeout': args.timeout, 'limit': args.limit, 'log_every': args.progress_every, } try: process_items(r, db, **kwargs) retcode = 0 # ok except KeyboardInterrupt: retcode = 0 # ok except Exception: logger.exception("Unhandled exception") retcode = 2 return retcode
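# process_items() is called in main() above but not defined in this snippet.
# A minimal sketch of what it could look like, assuming items were pushed to
# redis as JSON (for example by scrapy_redis's RedisPipeline) and should land
# in a MongoDB collection.  The 'items' collection name and the exact
# signature are assumptions, not the original implementation.
import json
import logging

logger = logging.getLogger(__name__)


def process_items(r, db, keys, timeout=5, limit=0, log_every=100):
    processed = 0
    while True:
        # BLPOP blocks until an item is available or the timeout expires.
        ret = r.blpop(keys, timeout)
        if ret is None:
            continue
        _, data = ret
        try:
            item = json.loads(data)
        except ValueError:
            logger.warning("Skipping malformed item: %r", data)
            continue
        db['items'].insert_one(item)
        processed += 1
        if processed % log_every == 0:
            logger.info("Processed %d items so far", processed)
        if limit and processed >= limit:
            break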
def parse(self, response):
    selector = Selector(response)
    proxy_ip_list = selector.xpath('//td[2]/text()').extract()
    proxy_port_list = selector.xpath('//td[3]/text()').extract()
    r = get_redis()
    r.flushdb()
    ip_test_url = 'http://ip.chinaz.com/getip.aspx'
    socket.setdefaulttimeout(3)
    for ip, port in zip(proxy_ip_list, proxy_port_list):
        try:
            proxy_host = "http://" + ip + ":" + port
            proxy_temp = {"http": proxy_host}
            # Verify the proxy actually works before pushing it to redis.
            urllib.urlopen(ip_test_url, proxies=proxy_temp).read()
            print proxy_host
            r.lpush('Proxy:host', proxy_host)
        except Exception:
            continue
def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('key', help="Redis key where items are stored") parser.add_argument('--host') parser.add_argument('--port') parser.add_argument('--timeout', type=int, default=5) parser.add_argument('--limit', type=int, default=0) parser.add_argument('--progress-every', type=int, default=100) parser.add_argument('-v', '--verbose', action='store_true') args = parser.parse_args() params = {} if args.host: params['host'] = args.host if args.port: params['port'] = args.port logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) r = get_redis(**params) host = r.connection_pool.get_connection('info').host logger.info("Waiting for items in '%s' (server: %s)", args.key, host) kwargs = { 'keys': [args.key], 'timeout': args.timeout, 'limit': args.limit, 'log_every': args.progress_every, } try: init_db() process_items(r, **kwargs) logger.info("Building spreadsheet") build_spreadsheet() retcode = 0 # ok except KeyboardInterrupt: retcode = 0 # ok except Exception: logger.exception("Unhandled exception") retcode = 2 return retcode
def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('key', help="Redis key where items are stored") parser.add_argument('--host') parser.add_argument('--port') parser.add_argument('--timeout', type=int, default=5) parser.add_argument('--limit', type=int, default=0) parser.add_argument('--progress-every', type=int, default=100) parser.add_argument('-v', '--verbose', action='store_true') args = parser.parse_args() params = {} if args.host: params['host'] = args.host if args.port: params['port'] = args.port logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) r = get_redis(**params) host = r.connection_pool.get_connection('info').host logger.info("Waiting for items in '%s' (server: %s)", args.key, host) kwargs = { 'keys': [args.key], 'timeout': args.timeout, 'limit': args.limit, 'log_every': args.progress_every, } try: process_items(r, **kwargs) retcode = 0 # ok except KeyboardInterrupt: retcode = 0 # ok except Exception: logger.exception("Unhandled exception") retcode = 2 return retcode
class ProxyMiddleware(object):
    # rand_index = random.randint(0, r.llen("Proxy:host") - 1)
    r = get_redis()
    # Rotate the list: pop a proxy from the tail and push it back onto the head.
    proxy_host = r.rpoplpush("Proxy:host", "Proxy:host")

    def process_request(self, request, spider):
        # Set the location of the proxy
        request.meta['proxy'] = self.proxy_host
        retry_times = request.meta.get('retry_times', 0)
        print "Current proxy: " + request.meta['proxy'] + \
              " Retry times: %d" % retry_times

        # Use the following lines if your proxy requires authentication
        proxy_user_pass = "******"
        # setup basic authentication for the proxy
        encoded_user_pass = base64.encodestring(proxy_user_pass)
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

    def process_exception(self, request, exception, spider):
        # Discard the proxy that caused the exception so it is not reused.
        current_proxy = request.meta['proxy']
        try:
            self.del_proxy(current_proxy)
        except ValueError:
            pass
        # return request

    def del_proxy(self, current_proxy):
        r = get_redis()
        print('Remove proxy: %s, %d proxies left'
              % (current_proxy, r.llen("Proxy:host") - 1))
        r.lrem("Proxy:host", 0, current_proxy)
        print "Successfully removed :" + current_proxy
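# To take effect, ProxyMiddleware has to be registered in the project settings.
# A minimal sketch, assuming the class lives in ProxyCrawler/middlewares.py;
# the module path and the priority numbers are assumptions, not values taken
# from the original project.
DOWNLOADER_MIDDLEWARES = {
    'ProxyCrawler.middlewares.ProxyMiddleware': 543,
    # Keep Scrapy's retry middleware enabled so failed requests are retried
    # after process_exception() has dropped the bad proxy.
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
}
RETRY_TIMES = 5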
def __init__(self, *args, **kwargs):
    super(AlistSpider, self).__init__(*args, **kwargs)
    self.conn = get_redis(url=settings.get('REDIS_URL'))
def del_proxy(self, current_proxy):
    r = get_redis()
    print('Remove proxy: %s, %d proxies left'
          % (current_proxy, r.llen("Proxy:host") - 1))
    r.lrem("Proxy:host", 0, current_proxy)
    print "Successfully removed :" + current_proxy
def __init__(self, **params):
    # Pass any keyword arguments straight through to get_redis();
    # the original re-bound params to an empty dict, which silently
    # discarded whatever the caller passed in.
    self.r = get_redis(**params)
def connect_redis(self):
    self.r = get_redis(host=self.params['host'], port=self.params['port'])
import json

import redis
from scrapy_redis import get_redis

from setting import REDIS_URL

redis_cli = get_redis(url=REDIS_URL)
# redis_cli = redis.StrictRedis(
#     host='127.0.0.1', port=6379)


class RedisTools:
    """Helpers that automatically handle the encoding of data read from redis."""

    @staticmethod
    def duplicate(key, data):
        """Return True if the data is already in the set."""
        v = redis_cli.sismember(key, data)
        if v:
            return True
        return False

    @staticmethod
    def insert_to_set_redis(key, *data):
        """Add one or more values to the set."""
        redis_cli.sadd(key, *data)

    @staticmethod
    def get_set_pop(key):
        """Pop and return a random member of the set."""
        if not RedisTools.get_set_number(key):
            return None
        data = redis_cli.spop(key)
        return data
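# Example usage of RedisTools; the 'seen:urls' key and the URL are only
# illustrative.  Note that get_set_pop() relies on RedisTools.get_set_number(),
# which is defined elsewhere in the project and not shown in this snippet.
if not RedisTools.duplicate('seen:urls', 'http://example.com/page/1'):
    RedisTools.insert_to_set_redis('seen:urls', 'http://example.com/page/1')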
# -*- coding: utf-8 -*-

# Scrapy settings for ProxyCrawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
from scrapy_redis import get_redis

r = get_redis()
PROXY_LENGTH = r.llen("Proxy:host")

BOT_NAME = 'ProxyCrawler'

SPIDER_MODULES = ['ProxyCrawler.spiders']
NEWSPIDER_MODULE = 'ProxyCrawler.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ProxyCrawler (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
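# Settings commonly paired with scrapy_redis so that the scheduler and the
# duplicate filter also live in redis.  This is a sketch of typical values,
# not settings taken from the original ProxyCrawler project; the REDIS_URL
# below is a placeholder.
#SCHEDULER = "scrapy_redis.scheduler.Scheduler"
#DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#SCHEDULER_PERSIST = True
#REDIS_URL = 'redis://localhost:6379'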