Example #1
import unittest

from scrapy.utils.datatypes import LocalCache


class LocalCacheTest(unittest.TestCase):
    def test_cache_with_limit(self):
        cache = LocalCache(limit=2)
        cache['a'] = 1
        cache['b'] = 2
        cache['c'] = 3  # exceeds limit=2, so the oldest key ('a') is evicted
        self.assertEqual(len(cache), 2)
        self.assertNotIn('a', cache)
        self.assertIn('b', cache)
        self.assertIn('c', cache)
        self.assertEqual(cache['b'], 2)
        self.assertEqual(cache['c'], 3)
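The behaviour the test asserts is a fixed-capacity dict with oldest-first eviction. A minimal sketch of how such a cache can be built as an OrderedDict subclass (close in spirit to scrapy.utils.datatypes.LocalCache, not a verbatim copy; the name BoundedCache is ours):

from collections import OrderedDict


class BoundedCache(OrderedDict):
    """Dict holding at most `limit` keys; the oldest keys are evicted first."""

    def __init__(self, limit=None):
        super().__init__()
        self.limit = limit

    def __setitem__(self, key, value):
        if self.limit:
            # pop entries in insertion order until there is room for the new key
            while len(self) >= self.limit:
                self.popitem(last=False)
        super().__setitem__(key, value)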
Example #2
from twisted.internet import defer
from twisted.internet.base import ThreadedResolver

from scrapy.utils.datatypes import LocalCache

# TODO: cache misses

dnscache = LocalCache(10000)

# asynchronous resolution?
class CachingThreadedResolver(ThreadedResolver):
    def __init__(self, reactor, cache_size, timeout):
        super(CachingThreadedResolver, self).__init__(reactor)

        dnscache.limit = cache_size
        self.timeout = timeout

    def getHostByName(self, name, timeout=None):
        if name in dnscache:
            # cache hit: answer from the cache without touching the network
            return defer.succeed(dnscache[name])

        # in Twisted<=16.6, getHostByName() is always called with
        # a default timeout of 60s (actually passed as (1, 3, 11, 45) tuple),
        # so the input argument above is simply overridden
        # to enforce Scrapy's DNS_TIMEOUT setting's value
        # asynchronous resolution
        timeout = (self.timeout,)
        d = super(CachingThreadedResolver, self).getHostByName(name, timeout)
        if dnscache.limit:
            d.addCallback(self._cache_result, name)
        return d

    def _cache_result(self, result, name):
        # store the resolved address so the next lookup for this name hits the cache
        dnscache[name] = result
        return result
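For the cache to take effect, the resolver has to be installed on the reactor, which is what Scrapy does at crawler start-up. A wiring sketch (the cache_size and timeout values here are illustrative):

from twisted.internet import reactor

resolver = CachingThreadedResolver(reactor, cache_size=10000, timeout=60.0)
# all subsequent hostname lookups on this reactor go through dnscache
reactor.installResolver(resolver)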
Example #3
from twisted.internet import defer
from twisted.internet.base import ThreadedResolver
from twisted.internet.interfaces import IHostResolution, IHostnameResolver, IResolutionReceiver, IResolverSimple
from zope.interface.declarations import implementer, provider

from scrapy.utils.datatypes import LocalCache

# TODO: cache misses
dnscache = LocalCache(10000)  # essentially an OrderedDict with a size limit


# IResolverSimple is a zope interface with a single method, getHostByName,
# which resolves a hostname to an IP address
@implementer(IResolverSimple)
class CachingThreadedResolver(ThreadedResolver):
    """
    Default caching resolver. IPv4 only, supports setting a timeout value for DNS requests.
    """
    def __init__(self, reactor, cache_size, timeout):
        super().__init__(reactor)
        dnscache.limit = cache_size
        self.timeout = timeout

    @classmethod
    def from_crawler(cls, crawler, reactor):
        # DNSCACHE_ENABLED is worth turning on in settings: it cuts down on DNS lookups
        if crawler.settings.getbool('DNSCACHE_ENABLED'):
            cache_size = crawler.settings.getint('DNSCACHE_SIZE')
        else:
            cache_size = 0
        return cls(reactor, cache_size, crawler.settings.getfloat('DNS_TIMEOUT'))
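A sketch of driving from_crawler() by hand; the Mock crawler is a hypothetical stand-in, but DNSCACHE_ENABLED, DNSCACHE_SIZE and DNS_TIMEOUT are real Scrapy settings:

from unittest.mock import Mock

from scrapy.settings import Settings
from twisted.internet import reactor

crawler = Mock()
crawler.settings = Settings({
    'DNSCACHE_ENABLED': True,  # enable DNS caching
    'DNSCACHE_SIZE': 10000,    # becomes dnscache.limit
    'DNS_TIMEOUT': 60,         # per-lookup timeout, in seconds
})

resolver = CachingThreadedResolver.from_crawler(crawler, reactor)
assert dnscache.limit == 10000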