def pop(self, polite=True):
    '''Get the next request'''
    while True:
        # First, we pop the next thing in pldQueue *if* it's not a
        # premature fetch (and a race condition is not detected).
        with self.pld_lock:
            # Get the next pld we might want to fetch from
            next, when = self.pldQueue.peek(withscores=True)
            if not next:
                # logger.debug('Nothing in pldQueue.')
                return None
            # If the next-fetchable is too soon, wait. If we're
            # already waiting, don't schedule a double callLater.
            now = time.time()
            if polite and when > now:
                with self.twi_lock:
                    if not (self.timer and self.timer.active()):
                        logger.debug('Waiting %f seconds on %s' % (when - now, next))
                        self.timer = reactor.callLater(when - now, self.serveNext)
                return None
            # If we get here, we don't need to wait. However, the
            # multithreaded nature of Twisted means that something
            # else might be waiting. Only clear the timer if it's not
            # holding some other pending call.
            with self.twi_lock:
                if not (self.timer and self.timer.active()):
                    self.timer = None
            # We know the time has passed (we peeked), so pop it.
            next = self.pldQueue.pop()
        # Get the queue pertaining to the PLD of interest and
        # acquire a request lock for it.
        q = qr.Queue(next)
        with self.req_lock:
            if len(q):
                # If we've already saturated our parallel requests, then we'll
                # wait some short amount of time before we make our next request.
                # There is logic elsewhere so that if one of these requests
                # completes before this small amount of time elapses, then it
                # will be advanced accordingly.
                if Counter.len(self.r, next) >= self.maxParallelRequests:
                    logger.debug('maxParallelRequests exceeded for %s' % next)
                    with self.pld_lock:
                        self.pldQueue.push_unique(next, time.time() + 20)
                    continue
                # If the robots.txt for this particular request has not been
                # fetched, or it has expired, then we'll have to make a
                # request for it.
                v = q.peek()
                domain = urlparse.urlparse(v.url).netloc
                robot = reppy.findRobot('http://' + domain)
                if not self.allowAll and (not robot or robot.expired):
                    logger.debug('Making robots request for %s' % next)
                    r = RobotsRequest('http://' + domain + '/robots.txt')
                    r._originalKey = next
                    # Increment the number of requests we currently have in flight
                    Counter.put(self.r, r)
                    return r
                else:
                    logger.debug('Popping next request from %s' % next)
                    v = q.pop()
                    # This was the source of a rather difficult-to-track bug,
                    # wherein the pld queue would slowly drain despite there
                    # being plenty of logical queues to draw from. The problem
                    # was introduced by calling urlparse.urljoin when invoking
                    # the request's onURL method. As a result, certain redirects
                    # were making changes to the url, saving it as an updated
                    # value, but we'd then try to pop off the queue for the new
                    # hostname, when in reality we should pop off the queue
                    # for the original hostname.
                    v._originalKey = next
                    # Increment the number of requests we currently have in flight
                    Counter.put(self.r, v)
                    # At this point, we should also schedule the next request
                    # to this domain.
                    with self.pld_lock:
                        self.pldQueue.push_unique(next, time.time() + self.crawlDelay(v))
                    return v
            else:
                try:
                    if Counter.len(self.r, next) == 0:
                        logger.debug('Calling onEmptyQueue for %s' % next)
                        self.onEmptyQueue(next)
                        try:
                            with self.pld_lock:
                                self.pldQueue.clear_ph(next)
                        except ValueError:
                            logger.error('pldQueue.clear_ph failed for %s' % next)
                    else:
                        # Otherwise, we should try again in a little bit, to
                        # see if the last request has finished.
                        with self.pld_lock:
                            self.pldQueue.push_unique(next, time.time() + 20)
                        logger.debug('Requests still in flight for %s. Waiting' % next)
                except Exception:
                    logger.exception('onEmptyQueue failed for %s' % next)
                continue
    logger.debug('Returning None (should not happen).')
    return None
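The method above relies on a simple contract with serveNext(): when the head of pldQueue is not yet due, pop() arranges at most one reactor.callLater wake-up and returns None; when it hands out a request, it re-queues the pld with a crawl delay so the next fetch waits politely. Below is a minimal, self-contained sketch of that pattern, with a plain heap standing in for the Redis-backed pldQueue; DomainScheduler and serve_next are hypothetical names for illustration, not part of this codebase.

import heapq
import time

from twisted.internet import reactor


class DomainScheduler(object):
    '''Hypothetical stand-in for the pldQueue + timer logic in pop().'''

    def __init__(self):
        self.heap = []     # entries are (next_allowed_time, domain)
        self.timer = None  # at most one pending IDelayedCall

    def push(self, domain, when):
        heapq.heappush(self.heap, (when, domain))

    def pop(self):
        '''Return a due domain, or schedule a wake-up and return None.'''
        if not self.heap:
            return None
        when, domain = self.heap[0]  # peek without popping
        now = time.time()
        if when > now:
            # Too soon: arrange exactly one deferred retry, like the
            # twi_lock-guarded callLater above.
            if not (self.timer and self.timer.active()):
                self.timer = reactor.callLater(when - now, self.serve_next)
            return None
        heapq.heappop(self.heap)  # due now, so actually remove it
        return domain

    def serve_next(self):
        domain = self.pop()
        if domain is not None:
            print('fetching from %s' % domain)
            # Re-queue with a crawl delay, as pop() does via push_unique,
            # then come back around once the "request" completes.
            self.push(domain, time.time() + 2.0)
            reactor.callLater(0, self.serve_next)


if __name__ == '__main__':
    s = DomainScheduler()
    s.push('example.com', time.time() + 1.0)
    s.serve_next()
    reactor.callLater(7, reactor.stop)
    reactor.run()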
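The in-flight accounting (Counter.put, Counter.len) only needs increment/read/decrement semantics keyed by the request's _originalKey. A hedged sketch of that shape over a Redis hash follows; it illustrates the semantics only, is not the actual Counter implementation, and assumes self.r is a redis-py connection.

import redis


class Counter(object):
    '''Illustrative in-flight request counter, keyed by _originalKey.'''

    @staticmethod
    def put(conn, request):
        # A request went out: count it against the pld it was popped from.
        conn.hincrby('in-flight', request._originalKey, 1)

    @staticmethod
    def remove(conn, request):
        # A request finished: release its slot.
        conn.hincrby('in-flight', request._originalKey, -1)

    @staticmethod
    def len(conn, key):
        # How many requests for this pld are still outstanding?
        return int(conn.hget('in-flight', key) or 0)


if __name__ == '__main__':
    class FakeRequest(object):
        _originalKey = 'example.com'

    conn = redis.Redis(decode_responses=True)  # assumes a local Redis
    Counter.put(conn, FakeRequest())
    print(Counter.len(conn, 'example.com'))    # 1
    Counter.remove(conn, FakeRequest())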
What appears to be an earlier variant of the same method, with a single coarse lock and plain push() scheduling, for comparison:

def pop(self, polite=True):
    '''Get the next request'''
    while True:
        # Recompute the clock on every pass; a single timestamp taken
        # before the loop would go stale on later iterations.
        now = time.time()
        # Get the next pld we might want to fetch from
        next, when = self.pldQueue.peek(withscores=True)
        if not next:
            return None
        # If the next-fetchable is not soon enough, then wait
        if polite and when > now:
            with self.tlock:
                if not (self.timer and self.timer.active()):
                    logger.debug('Waiting %f seconds on %s' % (when - now, next))
                    self.timer = reactor.callLater(when - now, self.serveNext)
            return None
        else:
            # Go ahead and pop this item
            last = next
            next = self.pldQueue.pop()
            # Unset the timer
            self.timer = None
            q = qr.Queue(next)
            with self.lock:
                if len(q):
                    # If we've already saturated our parallel requests, then we'll
                    # wait some short amount of time before we make our next request.
                    # There is logic elsewhere so that if one of these requests
                    # completes before this small amount of time elapses, then it
                    # will be advanced accordingly.
                    if Counter.len(self.r, next) >= self.maxParallelRequests:
                        self.pldQueue.push(next, time.time() + 20)
                        continue
                    # If the robots.txt for this particular request has not been
                    # fetched, or it has expired, then we'll have to make a
                    # request for it.
                    v = q.peek()
                    domain = urlparse.urlparse(v.url).netloc
                    robot = reppy.findRobot('http://' + domain)
                    if not self.allowAll and (not robot or robot.expired):
                        logger.debug('Making robots request for %s' % next)
                        r = RobotsRequest('http://' + domain + '/robots.txt')
                        r._originalKey = next
                        # Increment the number of requests we currently have in flight
                        Counter.put(self.r, r)
                        return r
                    else:
                        logger.debug('Popping next request from %s' % next)
                        v = q.pop()
                        # (See the redirect/_originalKey note in the newer
                        # version above for why the original hostname is kept.)
                        v._originalKey = next
                        # Increment the number of requests we currently have in flight
                        Counter.put(self.r, v)
                        # At this point, we should also schedule the next request
                        # to this domain.
                        self.pldQueue.push(next, time.time() + self.crawlDelay(v))
                        return v
                else:
                    try:
                        if Counter.len(self.r, next) == 0:
                            logger.debug('Calling onEmptyQueue for %s' % next)
                            self.onEmptyQueue(next)
                        else:
                            # Otherwise, we should try again in a little bit, to
                            # see if the last request has finished.
                            self.pldQueue.push(next, time.time() + 20)
                            logger.debug('Requests still in flight for %s. Waiting' % next)
                    except Exception:
                        logger.exception('onEmptyQueue failed for %s' % next)
                    continue
    return None
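A notable difference between the two revisions is push_unique() versus plain push(): the newer code refuses to double-enqueue a pld that is already scheduled, so a domain keeps its earliest fetch time. Assuming pldQueue is a timestamp-scored Redis sorted set, those semantics are roughly as follows; this is a sketch under that assumption, not qr's actual implementation.

import redis


class PLDQueue(object):
    '''Illustrative timestamp-scored queue of pay-level domains.'''

    def __init__(self, name, conn=None):
        self.name = name
        self.conn = conn or redis.Redis(decode_responses=True)

    def push(self, pld, when):
        # Plain push: overwrites any score the pld already has.
        self.conn.zadd(self.name, {pld: when})

    def push_unique(self, pld, when):
        # Only add if absent (ZADD NX), so an already-scheduled pld
        # keeps its earlier fetch time.
        self.conn.zadd(self.name, {pld: when}, nx=True)

    def peek(self, withscores=False):
        # Look at the earliest-due pld without removing it.
        results = self.conn.zrange(self.name, 0, 0, withscores=True)
        if not results:
            return (None, 0.0) if withscores else None
        pld, when = results[0]
        return (pld, when) if withscores else pld

    def pop(self):
        # Peek-then-remove; callers serialize with a lock, as pop()
        # above does with pld_lock, since this pair is not atomic.
        pld = self.peek()
        if pld is not None:
            self.conn.zrem(self.name, pld)
        return pld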