Example No. 1
 def process_exception(self, request, exception, spider):
     r = get_redis()
     current_proxy = request.meta['proxy']
     try:
         self.del_proxy(current_proxy)
     except ValueError:
         pass
Example No. 2
def load(request):
    """Set start url to redis queue"""

    Product.objects.all().delete()

    r = get_redis()
    r.lpush('marissa:start_urls',
            'https://www.marissacollections.com/shop/clothing.html?limit=9')

    return redirect('clothing:products_list')
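
This view only seeds the queue; on the scraping side a scrapy_redis spider reads from the same key. A minimal consumer sketch is shown below; the spider name, class name and CSS selector are assumptions for illustration, and only redis_key matches the code above.

# Sketch of a spider that consumes the URLs pushed by load() above.
from scrapy_redis.spiders import RedisSpider


class MarissaSpider(RedisSpider):
    name = 'marissa'                  # assumed spider name
    redis_key = 'marissa:start_urls'  # same key that load() pushes to

    def parse(self, response):
        # Hypothetical selector for product links on the listing page.
        for href in response.css('a.product-item-link::attr(href)').getall():
            yield response.follow(href, callback=self.parse)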
Example No. 3
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('key', help="Redis key where items are stored")
    parser.add_argument('--host')
    parser.add_argument('--port')
    parser.add_argument('--timeout', type=int, default=5)
    parser.add_argument('--limit', type=int, default=0)
    parser.add_argument('--progress-every', type=int, default=100)
    parser.add_argument('-v', '--verbose', action='store_true')

    args = parser.parse_args()

    params = {}
    if args.host:
        params['host'] = args.host
    if args.port:
        params['port'] = args.port

    params['password'] = REDIS_PASS

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    r = get_redis(**params)
    # mongodb connection
    client = MongoClient(MONGO_URI)
    db = client[MONGO_DATABASE]

    host = r.connection_pool.get_connection('info').host
    logger.info("Waiting for items in '%s' (server: %s)", args.key, host)
    kwargs = {
        'keys': [args.key],
        'timeout': args.timeout,
        'limit': args.limit,
        'log_every': args.progress_every,
    }
    try:
        process_items(r, db, **kwargs)
        retcode = 0  # ok
    except KeyboardInterrupt:
        retcode = 0  # ok
    except Exception:
        logger.exception("Unhandled exception")
        retcode = 2

    return retcode
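
The process_items helper called here is not part of the excerpt. A minimal sketch of what such a consumer loop could look like, assuming the spider serialized items to JSON and that they should land in a Mongo collection named 'items' (both assumptions), follows; it reuses the module-level logger from the example above.

# Hypothetical sketch of process_items(); the project's real helper may differ.
import json


def process_items(r, db, keys, timeout=5, limit=0, log_every=100):
    processed = 0
    while limit == 0 or processed < limit:
        # BLPOP blocks for up to `timeout` seconds on any of the given keys.
        ret = r.blpop(keys, timeout)
        if ret is None:
            continue  # timed out, keep waiting
        _key, data = ret
        db['items'].insert_one(json.loads(data))  # assumed collection name
        processed += 1
        if processed % log_every == 0:
            logger.info("Processed %d items so far", processed)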
Example No. 4
    def parse(self, response):
        selector = Selector(response)
        proxy_ip_list = selector.xpath('//td[2]/text()').extract()
        proxy_port_list = selector.xpath('//td[3]/text()').extract()

        r = get_redis()
        r.flushdb()
        ip_test_url = 'http://ip.chinaz.com/getip.aspx'
        socket.setdefaulttimeout(3)
        for ip, port in zip(proxy_ip_list, proxy_port_list):
            try:
                proxy_host = "http://" + ip + ":" + port
                # Verify the proxy works before storing it (Python 3
                # equivalent of urllib.urlopen(url, proxies=...); requires
                # `import urllib.request` at module level).
                opener = urllib.request.build_opener(
                    urllib.request.ProxyHandler({"http": proxy_host}))
                opener.open(ip_test_url).read()
                print(proxy_host)
                r.lpush('Proxy:host', proxy_host)
            except Exception:
                continue
Example No. 5
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('key', help="Redis key where items are stored")
    parser.add_argument('--host')
    parser.add_argument('--port')
    parser.add_argument('--timeout', type=int, default=5)
    parser.add_argument('--limit', type=int, default=0)
    parser.add_argument('--progress-every', type=int, default=100)
    parser.add_argument('-v', '--verbose', action='store_true')

    args = parser.parse_args()

    params = {}
    if args.host:
        params['host'] = args.host
    if args.port:
        params['port'] = args.port

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    r = get_redis(**params)
    host = r.connection_pool.get_connection('info').host
    logger.info("Waiting for items in '%s' (server: %s)", args.key, host)
    kwargs = {
        'keys': [args.key],
        'timeout': args.timeout,
        'limit': args.limit,
        'log_every': args.progress_every,
    }
    try:
        init_db()
        process_items(r, **kwargs)
        logger.info("Building spreadsheet")
        build_spreadsheet()
        retcode = 0  # ok
    except KeyboardInterrupt:
        retcode = 0  # ok
    except Exception:
        logger.exception("Unhandled exception")
        retcode = 2

    return retcode
Example No. 6
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('key', help="Redis key where items are stored")
    parser.add_argument('--host')
    parser.add_argument('--port')
    parser.add_argument('--timeout', type=int, default=5)
    parser.add_argument('--limit', type=int, default=0)
    parser.add_argument('--progress-every', type=int, default=100)
    parser.add_argument('-v', '--verbose', action='store_true')

    args = parser.parse_args()

    params = {}
    if args.host:
        params['host'] = args.host
    if args.port:
        params['port'] = args.port

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    r = get_redis(**params)
    host = r.connection_pool.get_connection('info').host
    logger.info("Waiting for items in '%s' (server: %s)", args.key, host)
    kwargs = {
        'keys': [args.key],
        'timeout': args.timeout,
        'limit': args.limit,
        'log_every': args.progress_every,
    }
    try:
        process_items(r, **kwargs)
        retcode = 0  # ok
    except KeyboardInterrupt:
        retcode = 0  # ok
    except Exception:
        logger.exception("Unhandled exception")
        retcode = 2

    return retcode
Example No. 7
class ProxyMiddleware(object):
    #rand_index = random.randint(0,r.llen("Proxy:host")-1)
    r = get_redis()
    proxy_host = r.rpoplpush("Proxy:host", "Proxy:host")

    def process_request(self, request, spider):
        # Set the location of the proxy
        request.meta['proxy'] = self.proxy_host
        retry_times = request.meta.get('retry_times', 0)
        print "Current proxy: " + request.meta[
            'proxy'] + " Retry times: %d" % retry_times

        # Use the following lines if your proxy requires authentication
        proxy_user_pass = "******"

        # setup basic authentication for the proxy
        encoded_user_pass = base64.b64encode(proxy_user_pass.encode()).decode()
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

    def process_exception(self, request, exception, spider):
        r = get_redis()
        current_proxy = request.meta['proxy']
        try:
            self.del_proxy(current_proxy)
        except ValueError:
            pass
        #return request

    def del_proxy(self, current_proxy):
        r = get_redis()
        print('Remove proxy: %s, %d proxies left' %
              (current_proxy, r.llen("Proxy:host") - 1))
        r.lrem("Proxy:host", 0, current_proxy)
        print "Successfully removed :" + current_proxy

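For this middleware to run, it has to be enabled in the project settings. A short sketch follows; the module path and priority value are assumptions (they depend on where the class actually lives in the project).

# settings.py sketch -- module path and priority value are assumptions.
DOWNLOADER_MIDDLEWARES = {
    'ProxyCrawler.middlewares.ProxyMiddleware': 543,
}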
Example No. 8
 def __init__(self, *args, **kwargs):
     super(AlistSpider, self).__init__(*args, **kwargs)
     self.conn = get_redis(url=settings.get('REDIS_URL'))
Example No. 9
 def del_proxy(self, current_proxy):
     r = get_redis()
     print('Remove proxy: %s, %d proxies left' %
           (current_proxy, r.llen("Proxy:host") - 1))
     r.lrem("Proxy:host", 0, current_proxy)
     print "Successfully removed :" + current_proxy
Example No. 10
 def __init__(self, **params):
     # Forward any connection parameters straight through to get_redis().
     self.r = get_redis(**params)
Example No. 11
 def connect_redis(self):
     self.r = get_redis(host=self.params['host'], port=self.params['port'])
Example No. 12
import json
import redis
from scrapy_redis import get_redis
from setting import REDIS_URL

redis_cli = get_redis(url=REDIS_URL)
# redis_cli = redis.StrictRedis(
#         host='127.0.0.1', port=6379)


class RedisTools:
    """Implements automatic conversion of the retrieved data's encoding."""

    @staticmethod
    def duplicate(key, data):
        """Return True if the data already exists in the set."""
        return bool(redis_cli.sismember(key, data))

    @staticmethod
    def insert_to_set_redis(key, *data):
        """Store one or more values in the set."""
        redis_cli.sadd(key, *data)

    @staticmethod
    def get_set_pop(key):
        """Pop and return a random item from the set."""
        # get_set_number() is defined later in the original module (not shown).
        if not RedisTools.get_set_number(key):
            return None
        data = redis_cli.spop(key)
        return data
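
A short usage sketch for these helpers; the key name and URL values below are invented for illustration.

# Illustrative use of RedisTools; 'seen_urls' and the URL value are made up.
RedisTools.insert_to_set_redis('seen_urls', 'https://example.com/a')
if RedisTools.duplicate('seen_urls', 'https://example.com/a'):
    print('already stored')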
Example No. 13
# -*- coding: utf-8 -*-

# Scrapy settings for ProxyCrawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

from scrapy_redis import get_redis
r = get_redis()
PROXY_LENGTH = r.llen("Proxy:host")

BOT_NAME = 'ProxyCrawler'

SPIDER_MODULES = ['ProxyCrawler.spiders']
NEWSPIDER_MODULE = 'ProxyCrawler.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ProxyCrawler (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
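
The PROXY_LENGTH value computed at the top of this settings file is presumably consumed elsewhere in the project's settings. One plausible use, shown here purely as an assumption rather than taken from the original project, is to bound Scrapy's retry budget by the number of proxies currently in the Redis list:

# Assumed usage of PROXY_LENGTH; not taken from the original project.
RETRY_TIMES = PROXY_LENGTH                    # try each proxy at most once
RETRY_HTTP_CODES = [500, 502, 503, 504, 408]  # status codes that trigger a retry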