def __init__(self, poolSize=10, agent=None, stopWhenDone=False, delay=2,
        allowAll=False, use_lock=None, **kwargs):
    # First, call the parent constructor
    BaseFetcher.__init__(self, poolSize, agent, stopWhenDone)
    # Import DownpourLock only if use_lock is specified, because it uses
    # *NIX-specific features. We use one lock for the pldQueue and one
    # for all the request queues collectively. The latter is a tad
    # overly restrictive, but is far easier than managing hundreds
    # of locks for hundreds of queues.
    if use_lock:
        import DownpourLock
        self.pld_lock = DownpourLock.DownpourLock("%s_pld.lock" % use_lock)
        self.req_lock = DownpourLock.DownpourLock("%s_req.lock" % use_lock)
    else:
        self.pld_lock = threading.RLock()
        self.req_lock = threading.RLock()
    self.twi_lock = threading.RLock()  # Twisted reactor lock
    # Include a priority queue of plds
    self.pldQueue = PLDQueue('plds', **kwargs)
    # Make sure that there is an entry in the plds for
    # each domain waiting to be fetched. Also, include
    # the number of urls from each domain in the count
    # of remaining urls to be fetched.
    self.r = redis.Redis(**kwargs)
    # Redis has a pipeline feature that allows for bulk
    # requests, the result of which is a list of the
    # results of each individual request. Thus, only get
    # the length of each of the queues in the pipeline,
    # as we're just going to set remaining to the sum
    # of the lengths of each of the domain queues.
    with self.r.pipeline() as p:
        for key in self.r.keys('domain:*'):
            with self.pld_lock:
                self.pldQueue.push_init(key, 0)
            p.llen(key)
        self.remaining = sum(p.execute())
    # For whatever reason, pushing key names back into the
    # priority queue has been problematic. As such, we'll
    # set them aside as they fail, and then retry them at
    # some point, like when the next request finishes.
    self.retries = []
    # Now make a queue for incoming requests
    self.requests = qr.Queue('request', **kwargs)
    self.delay = float(delay)
    # This is used when we have to impose a delay before
    # servicing the next available request.
    with self.twi_lock:
        self.timer = None
    # This is a way to ignore the allow/disallow directives,
    # for example, if you're checking for allow in other places.
    self.allowAll = allowAll
    self.userAgentString = reppy.getUserAgentString(self.agent)
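# A minimal, standalone sketch of the pipeline pattern used above: queue
# one LLEN per domain queue, then fetch all the lengths in a single round
# trip and sum them. This assumes the redis-py client and a reachable
# server; the connection kwargs and the helper's name are illustrative
# and not part of the original code.
import redis

def count_remaining(**kwargs):
    r = redis.Redis(**kwargs)
    with r.pipeline() as p:
        for key in r.keys('domain:*'):
            # Commands are only buffered here; nothing is sent yet.
            p.llen(key)
        # execute() ships the whole batch at once and returns one reply
        # per buffered llen(), which we sum into a single count.
        return sum(p.execute())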
def __init__(self, poolSize=10, agent=None, stopWhenDone=False, delay=2,
        allowAll=False, **kwargs):
    # Call the parent constructor
    BaseFetcher.__init__(self, poolSize, agent, stopWhenDone)
    # Include a priority queue of plds
    self.pldQueue = qr.PriorityQueue('plds', **kwargs)
    # Make sure that there is an entry in the plds for
    # each domain waiting to be fetched. Also, include
    # the number of urls from each domain in the count
    # of remaining urls to be fetched.
    self.r = redis.Redis(**kwargs)
    # Redis has a pipeline feature that allows for bulk
    # requests, the result of which is a list of the
    # results of each individual request. Thus, only get
    # the length of each of the queues in the pipeline,
    # as we're just going to set remaining to the sum
    # of the lengths of each of the domain queues.
    with self.r.pipeline() as p:
        for key in self.r.keys('domain:*'):
            self.pldQueue.push(key, 0)
            p.llen(key)
        self.remaining = sum(p.execute())
    # For whatever reason, pushing key names back into the
    # priority queue has been problematic. As such, we'll
    # set them aside as they fail, and then retry them at
    # some point, like when the next request finishes.
    self.retries = []
    # Now make a queue for incoming requests
    self.requests = qr.Queue('request', **kwargs)
    self.delay = float(delay)
    # This is used when we have to impose a delay before
    # servicing the next available request.
    self.timer = None
    # This is a way to ignore the allow/disallow directives,
    # for example, if you're checking for allow in other places.
    self.allowAll = allowAll
    self.userAgentString = reppy.getUserAgentString(self.agent)
    self.lock = threading.RLock()
    self.tlock = threading.RLock()
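# A hedged usage sketch for the constructors above. The enclosing class
# name (PoliteFetcher) is assumed for illustration and does not appear in
# this excerpt; host/port/db are examples of connection options that fall
# through **kwargs to redis.Redis and the qr queues, as seen above.
if __name__ == '__main__':
    fetcher = PoliteFetcher(poolSize=20, agent='my-crawler', delay=1.5,
        stopWhenDone=True, host='localhost', port=6379, db=0)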