Example #1
    def __init__(self, poolSize=10, agent=None, stopWhenDone=False,
        delay=2, allowAll=False, use_lock=None, **kwargs):

        # First, call the parent constructor
        BaseFetcher.__init__(self, poolSize, agent, stopWhenDone)

        # Import DownpourLock only if use_lock specified, because it uses
        # *NIX-specific features. We use one lock for the pldQueue and one
        # for all the request queues collectively. The latter is a tad
        # overly restrictive, but is far easier than managing hundreds
        # of locks for hundreds of queues.
        if use_lock:
            import DownpourLock
            self.pld_lock = DownpourLock.DownpourLock("%s_pld.lock" % use_lock)
            self.req_lock = DownpourLock.DownpourLock("%s_req.lock" % use_lock)
        else:
            self.pld_lock = threading.RLock()
            self.req_lock = threading.RLock()
        self.twi_lock = threading.RLock()  # Twisted reactor lock

        # Include a priority queue of plds
        self.pldQueue = PLDQueue('plds', **kwargs)
        # Make sure that there is an entry in the plds for
        # each domain waiting to be fetched. Also, include
        # the number of urls from each domain in the count
        # of remaining urls to be fetched.
        self.r = redis.Redis(**kwargs)
        # Redis has a pipeline feature that batches commands; execute()
        # returns a list with the result of each individual command.
        # We only queue llen calls here, since remaining is just the
        # sum of the lengths of the domain queues.
        with self.r.pipeline() as p:
            for key in self.r.keys('domain:*'):
                with self.pld_lock:
                    self.pldQueue.push_init(key, 0)
                p.llen(key)
            self.remaining = sum(p.execute())
        # For reasons we haven't pinned down, pushing key names back
        # into the priority queue has been problematic. As such, we set
        # them aside as they fail and retry them later, e.g. when the
        # next request finishes.
        self.retries = []
        # Now make a queue for incoming requests
        self.requests = qr.Queue('request', **kwargs)
        self.delay = float(delay)
        # This is used when we have to impose a delay before
        # servicing the next available request.
        with self.twi_lock:
            self.timer = None
        # This is a way to ignore the robots.txt allow/disallow
        # directives, for example if you check permissions elsewhere.
        self.allowAll = allowAll
        self.userAgentString = reppy.getUserAgentString(self.agent)
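
DownpourLock itself is not shown in these snippets; per the comment above it relies on *NIX-specific features, and the "%s_pld.lock" filenames suggest a file-based lock that works across processes as well as threads. Below is a minimal sketch of that idea using fcntl; the FileLock class and the lock-file name are illustrative, not the real DownpourLock API. Note that, unlike threading.RLock, this sketch is not reentrant.

import os
import fcntl

class FileLock(object):
    """Advisory *NIX file lock usable across processes.
    A sketch of the DownpourLock idea, not its actual implementation."""

    def __init__(self, path):
        # O_CREAT so that first use creates the lock file.
        self.fd = os.open(path, os.O_CREAT | os.O_RDWR)

    def acquire(self):
        fcntl.flock(self.fd, fcntl.LOCK_EX)  # blocks until the lock is held

    def release(self):
        fcntl.flock(self.fd, fcntl.LOCK_UN)

    def __enter__(self):
        self.acquire()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.release()

# Two worker processes opening the same path serialize their
# critical sections, much like the pld_lock usage above:
with FileLock('example_pld.lock'):
    pass  # e.g. push to or pop from the pldQueue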
Example #2
    def __init__(self, poolSize=10, agent=None, stopWhenDone=False,
        delay=2, allowAll=False, **kwargs):

        # Call the parent constructor
        BaseFetcher.__init__(self, poolSize, agent, stopWhenDone)
        # Include a priority queue of plds
        self.pldQueue = qr.PriorityQueue('plds', **kwargs)
        # Make sure that there is an entry in the plds for
        # each domain waiting to be fetched. Also, include
        # the number of urls from each domain in the count
        # of remaining urls to be fetched.
        self.r = redis.Redis(**kwargs)
        # Redis has a pipeline feature that batches commands; execute()
        # returns a list with the result of each individual command.
        # We only queue llen calls here, since remaining is just the
        # sum of the lengths of the domain queues.
        with self.r.pipeline() as p:
            for key in self.r.keys('domain:*'):
                self.pldQueue.push(key, 0)
                p.llen(key)
            self.remaining = sum(p.execute())
        # For reasons we haven't pinned down, pushing key names back
        # into the priority queue has been problematic. As such, we set
        # them aside as they fail and retry them later, e.g. when the
        # next request finishes.
        self.retries = []
        # Now make a queue for incoming requests
        self.requests = qr.Queue('request', **kwargs)
        self.delay = float(delay)
        # This is used when we have to impose a delay before
        # servicing the next available request.
        self.timer = None
        # This is a way to ignore the robots.txt allow/disallow
        # directives, for example if you check permissions elsewhere.
        self.allowAll = allowAll
        self.userAgentString = reppy.getUserAgentString(self.agent)
        self.lock = threading.RLock()
        self.tlock = threading.RLock()
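
The Redis pipeline pattern used in both constructors is easy to demonstrate on its own. A minimal sketch, assuming a local Redis server whose domain:* keys are lists (the key names are illustrative):

import redis

r = redis.Redis()  # assumes a local Redis server; connection kwargs are illustrative

with r.pipeline() as p:
    # Commands are buffered locally; nothing is sent yet.
    for key in r.keys('domain:*'):
        p.llen(key)
    # execute() sends every buffered command in a single round trip
    # and returns their results as a list, in the order queued.
    remaining = sum(p.execute())

print('urls remaining across all domain queues: %d' % remaining)

Since KEYS scans the entire keyspace and blocks the server while doing so, newer versions of redis-py also offer scan_iter('domain:*') as a gentler alternative on large databases.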
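
Both constructors pair the pld priority queue with a delay parameter, and a natural way to combine the two is to use time as the priority: a domain's priority is the earliest moment it may be fetched again. The PolitenessQueue below is a hypothetical in-memory stand-in for that idea, not the qr or PLDQueue API:

import time
import heapq

class PolitenessQueue(object):
    """Hypothetical in-memory stand-in for a priority queue of domains,
    where the priority is the earliest time a domain may be fetched."""

    def __init__(self):
        self._heap = []

    def push(self, domain, when):
        heapq.heappush(self._heap, (when, domain))

    def pop_ready(self):
        # Return a domain whose delay has elapsed, or None if the
        # soonest domain is still cooling down.
        if self._heap and self._heap[0][0] <= time.time():
            return heapq.heappop(self._heap)[1]
        return None

q = PolitenessQueue()
q.push('domain:example.com', 0)                   # eligible immediately
q.push('domain:example.org', time.time() + 2.0)   # honor a 2-second delay
print(q.pop_ready())  # 'domain:example.com'
print(q.pop_ready())  # None until the delay elapses

Pushing every domain with priority 0 at startup, as both constructors do, makes each domain immediately eligible; re-pushing a domain with time.time() + self.delay after each fetch would then enforce the crawl delay per domain.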