def get_analyzerchain(self, linkQ):
    """
    Creates an AnalyzerChain with these analyzers:

        LogInfo (1x)
        GetLinks (10x)
        SpeedDiagnostics (1x)
        CrawlStateAnalyzer (1x, with a handle to linkQ)

    This starts the AnalyzerChain and returns it.
    """
    try:
        ac = AnalyzerChain(self._go, self._debug)
        ac.append(LogInfo, 1)
        ac.append(GetLinks, 10)
        ac.append(SpeedDiagnostics, 1)
        ac.append(CrawlStateAnalyzer, 1, kwargs={'linkQ': linkQ})
        ac.start()
        return ac
    except Exception, exc:
        multi_syslog(exc, logger=self.logger.warning)
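# A minimal usage sketch (hypothetical, not part of the original module):
# assumes `csm` is the object exposing get_analyzerchain above, and that a
# plain multiprocessing.Queue is an acceptable linkQ.  The chain is already
# started when get_analyzerchain returns; Analyzable objects go into ac.inQ.
def _example_get_analyzerchain(csm):
    linkQ = multiprocessing.Queue()
    ac = csm.get_analyzerchain(linkQ)
    # ... put Analyzable objects into ac.inQ here ...
    return ac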
def run(self):
    """
    Moves records out of fetchInfoQ and into ...
    """
    self.prepare_process()
    try:
        while not self._stop.is_set():
            self.logger.debug('CrawlStateManager loop.')
            # do something to put hosts into packed_hostQ for the fetchers
            try:
                info = self.fetchInfoQ.get_nowait()
                self.logger.debug('Got %r' % info)
            except Queue.Empty:
                sleep(1)
                continue
            # must be a FetchInfo
            assert isinstance(info, FetchInfo)
            self.outQ.put(info)
    except Exception, exc:
        multi_syslog(exc, logger=self.logger.warning)
def cleanup(self):
    """Send a long diagnostic summary of fetch timing to the log."""
    stop = time()
    self.logger.info("doing cleanup")
    out = ""
    out += "fetcher finished, now analyzing times\n"
    out += "%d deltas to consider\n" % len(self.deltas)
    mean, median, success = self.stats()
    if success is None:
        return
    out += "%.4f mean, %.4f median, %.2f%% succeeded\n" % \
        (mean, median, 100. * success / float(len(self.deltas)))
    begin = 0
    self.logger.info("entering cleanup for loop")
    for i in range(1, 11):                  # consider ten bins
        frac = i / 10.
        end = int(len(self.deltas) * frac)  # index of the last item in this bin
        t = self.deltas[end - 1][0]         # slowest time in this bin
        num_per_frac = len(self.deltas[begin:end])
        if num_per_frac == 0:
            num_per_frac = 1
        rate = t > 0 and num_per_frac / t or 0  # slowest rate in this bin
        success = 0
        failure = 0
        for d in self.deltas[begin:end]:
            if d[1] <= 1:   # a state of one or less means success
                success += 1
            else:           # rejected for some reason
                failure += 1
        begin = end         # bottom index of the next bin
        out += "%.f%% %.4f sec %.4f per sec %d (%.2f%%) succeeded %d failed\n" % \
            (frac * 100, t, rate, success, 100. * success / num_per_frac, failure)
    # arrival time processing
    #arrivals.sort()
    window_sizes = [1., 10., 100., 1000.]   # seconds
    binned_arrivals = {}
    for win in window_sizes:
        binned_arrivals[win] = {}
        max_bin = int(win * (1 + int((stop - self.start_time) / win)))
        for i in range(0, max_bin, int(win)):
            binned_arrivals[win][i] = 0
        #out += binned_arrivals[win] + "\n"
        for a in self.arrivals:
            k = int(win * int(a / win))
            binned_arrivals[win][k] += 1
        #out += binned_arrivals[win] + "\n"
        avg_rate = 0.
        for k in binned_arrivals[win]:
            avg_rate += binned_arrivals[win][k] / win
        avg_rate /= len(binned_arrivals[win])
        out += "Averaging over fixed bins of size %d, we have a rate of %.4f completions/second\n" % (win, avg_rate)
        bins = binned_arrivals[win].items()
        bins.sort()
        tot = 0.
        for bin, count in bins:
            tot += count
        for bin, count in bins:
            out += "%s --> %s, %.1f\n" % (bin, count, count / tot)
    multi_syslog(out, logger=self.logger.debug)
    self.logger.info("SpeedDiagnostics finished")
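# Standalone sketch of the fixed-bin rate computation above (hypothetical
# helper, not part of the original module).  Given arrival timestamps in
# seconds from the start of the run, it counts completions per window of
# `win` seconds and returns the mean completions/second across the bins.
def _example_binned_rate(arrivals, duration, win):
    bins = {}
    max_bin = int(win * (1 + int(duration / win)))
    for i in range(0, max_bin, int(win)):
        bins[i] = 0
    for a in arrivals:
        bins[int(win * int(a / win))] += 1
    return sum(count / win for count in bins.itervalues()) / len(bins)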
class Analyzer(Process):
    """
    Super class for all content analyzers.  This creates Queues for
    passing document-processing jobs through a chain of Analyzer
    instances.

    Subclasses should implement .prepare(), .analyze(job), and
    .cleanup(), and set the 'name' attr.
    """
    name = "Analyzer Base Class"

    def __init__(self, inQ, outQ, debug=False, trace=False, in_flight=None,
                 queue_wait_sleep=1):
        """
        Sets up an inQ, outQ
        """
        Process.__init__(self, go=None, debug=debug)
        self.inQ = inQ
        self.outQ = outQ
        self.trace = trace
        self.queue_wait_sleep = queue_wait_sleep
        self.in_flight = in_flight
        assert float(queue_wait_sleep) > 0
        if in_flight is not None:
            assert hasattr(in_flight, 'value')
            assert hasattr(in_flight, 'acquire')
            assert hasattr(in_flight, 'release')

    def prepare_process(self):
        super(Analyzer, self).prepare_process()
        self.logger = logging.getLogger("PyCrawler.Analyzer")

    def run(self):
        """
        Gets yzable objects out of inQ, calls self.analyze(yzable)
        """
        try:
            self.prepare_process()
            self.prepare()
            self.logger.debug("Starting.")
            while not self._stop.is_set():
                try:
                    yzable = self.inQ.get(timeout=self.queue_wait_sleep)
                except Queue.Empty:
                    continue
                self._trace("Analyzer %s getting ready to process %s" % (
                        type(self).__name__, yzable))
                # Anti-duck-typing, to be sure we're sending things we
                # intended to send.
                if not isinstance(yzable, Analyzable):
                    self.logger.error('Expected instance of Analyzable(), got %r' % yzable)
                    yzable = None
                processed_yzable = None
                try:
                    if yzable is not None:
                        processed_yzable = self.analyze(yzable)
                except Exception, exc:
                    multi_syslog(
                        msg="analyze failed on: %s%s" % (
                            hasattr(yzable, "hostkey") and yzable.hostkey or "",
                            hasattr(yzable, "relurl") and yzable.relurl or ""),
                        exc=exc,
                        logger=self.logger.warning)
                # If there was an exception, drop the yzable.  Some analyzers
                # are intended to be filters: don't let things past if they
                # fail to work properly.
                yzable = processed_yzable
                # Drop yzable on request of analyzer.
                if yzable is None:
                    if self.in_flight is not None:
                        self.in_flight.acquire()
                        self.in_flight.value -= 1
                        self.in_flight.release()
                    self.logger.debug("Dropped yzable.")
                    continue
                # This blocks when processes later in the chain block.
                initial_block = time()
                last_blocked = initial_block
                while not self._stop.is_set():
                    try:
                        self.outQ.put(yzable, timeout=self.queue_wait_sleep)
                        break
                    except Queue.Full:
                        cur = time()
                        if (cur - last_blocked) > 10:
                            self.logger.warning("Chain blocked for %d seconds" % (cur - initial_block))
                            last_blocked = cur
                self._trace("Analyzer %s finished processing %s" % (
                        type(self).__name__, yzable))
            self.cleanup()
        except Exception, exc:
            multi_syslog(exc, logger=self.logger.warning)
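# A minimal subclass sketch (hypothetical, not part of the original module)
# showing the interface described in the docstring: implement prepare(),
# analyze(job), and cleanup(), and set the 'name' attr.  Returning the yzable
# passes it down the chain; returning None drops it, so analyzers can act as
# filters.  It would be registered with something like ac.append(ExampleFilter, 2).
class ExampleFilter(Analyzer):
    name = "ExampleFilter"
    def prepare(self):
        pass
    def analyze(self, yzable):
        # Drop anything without a hostkey; pass everything else through.
        if not getattr(yzable, "hostkey", None):
            return None
        return yzable
    def cleanup(self):
        pass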
def run(self):
    """
    Gets Analyzable objects from inQ and feeds them into the chain of
    analyzers, which are its child processes.
    """
    self.prepare_process()
    try:
        if not self._yzers:
            self.logger.warning("run called with no analyzers")
            # The next line stops coverage, so it does not get recorded as covered.
            return self.cleanup_process()  # pragma: no cover
        self.logger.debug("starting yzers with queues between")
        queues = [multiprocessing.Queue(self.qlen)]
        for pos in range(len(self._yzers)):
            queues.append(multiprocessing.Queue(self.qlen))
            (yzer, copies, args, kwargs) = self._yzers[pos]
            yzers = [yzer(queues[pos], queues[pos + 1],
                          *args,
                          debug=self._debug,
                          queue_wait_sleep=self.queue_wait_sleep,
                          in_flight=self.in_flight,
                          **kwargs)
                     for copy in range(copies)]
            for yzer in yzers:
                yzer.start()
            self._yzers[pos] = yzers
        self.logger.debug("Starting main loop")
        yzable = None

        def pop_queue(timeout=None):
            """
            Pop as many items as we can off of the queue of finished
            items, without waiting.  If timeout is a positive number,
            wait up to timeout seconds for the first item, then get
            the rest without waiting.
            """
            block = bool(timeout)
            try:
                while True:
                    my_yzable = queues[-1].get(block=block, timeout=timeout)
                    # After a single success, become non-blocking and only
                    # eat available items.
                    timeout = None
                    block = False
                    self.in_flight.acquire()
                    self.in_flight.value -= 1
                    self.in_flight.release()
                    self.total_processed += 1
            except Queue.Empty:
                pass

        self.last_in_flight = None
        last_in_flight_error_report = 0
        get_new = True
        while get_new or self.in_flight.value > 0:
            #syslog(
            #    LOG_DEBUG, "%d in_flight %d ever %d inQ.qsize %s" \
            #        % (in_flight, total_processed,
            #           self.inQ.qsize(),
            #           multiprocessing.active_children()))
            # We must always attempt to get objects from the queue if they
            # are available.  It is never acceptable to block on putting
            # things in the next queue, as there may be things in flight
            # that we need.
            if get_new:
                try:
                    yzable = self.inQ.get(timeout=self.queue_wait_sleep)
                except Queue.Empty:
                    yzable = None
                if isinstance(yzable, Canary):
                    get_new = False
                    self.logger.info("Got canary, stopping.")
                elif yzable is not None:
                    self.enqueue_yzable(queues[0], yzable, pop_queue)
            else:
                pop_queue(timeout=self.queue_wait_sleep)
            # if none are in_flight, then we can sleep here
            curtime = time()
            # FIXME: we may not want to sleep if things are in-flight and we
            # have successfully popped an item.  But we want to ensure we
            # don't spin here.
            if self.in_flight.value > 0 and self.timewarn and \
                    curtime - self.last_in_flight > self.timewarn:
                # report warnings if we've been waiting too long!
                if curtime - last_in_flight_error_report > self.timewarn:
                    self.logger.warning('Most recent in-flight packet over %d seconds old, '
                                        '%d outstanding!' % (curtime - self.last_in_flight,
                                                             self.in_flight.value))
                    last_in_flight_error_report = curtime
            if self.in_flight.value != 0 and not get_new and \
                    self.timeout and (curtime - self.last_in_flight) > self.timeout:
                break
        # go is clear and none in_flight, stop all analyzers
        self.logger.info("Finished main loop")
        try:
            self.inQ.close()
            self.inQ.cancel_join_thread()
        except Exception, exc:
            multi_syslog(exc, logger=self.logger.warning)
        # stop all yzers
        for pos in range(len(self._yzers)):
            for yzer in self._yzers[pos]:
                try:
                    yzer.stop()
                except Exception, exc:
                    multi_syslog("%s(%d)" % (yzer.name, yzer.pid), exc,
                                 logger=self.logger.warning)
        self.logger.debug("stopped all Analyzers")
    except Exception, exc:
        multi_syslog(exc, logger=self.logger.warning)
    try:
        self.cleanup_process()
    except Exception, exc:
        multi_syslog(exc, logger=self.logger.warning)

def list_analyzers(self):
    """
    Returns a list of tuples (analyzer.name, copies)
    """
    return [(copies[0].name, len(copies))
            for copies in self._yzers]
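# Shutdown sketch (hypothetical, not part of the original module).  The main
# loop in run() stops pulling new work once it sees a Canary, then drains
# whatever is still in flight before stopping its yzers.  This assumes Canary
# takes no constructor arguments and that the chain object exposes the usual
# Process join() interface.
def _example_shutdown(ac):
    ac.inQ.put(Canary())   # signals AnalyzerChain.run to stop accepting input
    ac.join()              # wait for the chain to drain and stop its yzers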