예제 #1
0
    def get_analyzerchain(self, linkQ):
        """
        Creates an AnalyzerChain with these analyzers:

            GetLinks (10x)
            CrawlStateAnalyzer (with a handle to self.fetchInfoQ)
            SpeedDiagnostics

        This starts the AnalyzerChain and returns it.
        """
        try:
            ac = AnalyzerChain(self._go, self._debug)
            ac.append(LogInfo, 1)
            ac.append(GetLinks, 10)
            ac.append(SpeedDiagnostics, 1)
            ac.append(CrawlStateAnalyzer, 1, kwargs={'linkQ': linkQ})
            ac.start()
            return ac
        except Exception, exc:
            multi_syslog(exc, logger=self.logger.warning)
예제 #2
0
    def run(self):
        """
        Moves records out of fetchInfoQ and into ...
        """
        self.prepare_process()

        try:
            while not self._stop.is_set():
                self.logger.debug('CrawlStateManager loop.')
                # do something to put hosts into packed_hostQ for the fetchers
                try:
                    info = self.fetchInfoQ.get_nowait()
                    self.logger.debug('Got %r' % info)
                except Queue.Empty:
                    sleep(1)
                    continue
                # must be a FetchInfo
                assert isinstance(info, FetchInfo)
                self.outQ.put(info)
        except Exception, exc:
            multi_syslog(exc, logger=self.logger.warning)
예제 #3
0
    def cleanup(self):
        """send a long message to the log"""
        stop = time()
        self.logger.info("doing cleanup")
        out = ""
        out += "fetcher finished, now analyzing times\n"
        out += "%d deltas to consider\n" % len(self.deltas)

        mean, median, success = self.stats()
        if success is None:
            return

        out += "%.4f mean, %.4f median, %.2f%% succeeded\n" % \
            (mean, median, 100. * success/float(len(self.deltas)))
        begin = 0
        self.logger.info("entering cleanup for loop")
        for i in range(1,11):                      # consider ten bins
            frac = i / 10.
            end = int(len(self.deltas) * frac)     # index number of the last item in this bin
            t = self.deltas[end-1][0]              # slowest time in this bin
            num_per_frac = len(self.deltas[begin:end])
            if num_per_frac == 0:
                num_per_frac = 1
            rate = t > 0 and num_per_frac / t or 0 # slowest rate in this bin
            success = 0
            failure = 0
            for d in self.deltas[begin:end]:
                if d[1] <= 1:  # means reject
                    success += 1
                else:          # means rejected for some reason
                    failure += 1
            begin = end        # bottom index of the next bin
            out += "%.f%%    %.4f sec    %.4f per sec    %d (%.2f%%) succeeded    %d failed\n" % \
                  (frac * 100, t, rate, success, 100.*success/num_per_frac, failure)
        # arrival time processing
        #arrivals.sort()
        window_sizes = [1., 10., 100., 1000.] # seconds
        binned_arrivals = {}
        for win in window_sizes:
            binned_arrivals[win] = {}
            max = int(win * (1 + int((stop-self.start_time)/win)))
            for i in range(0, max, int(win)):
                binned_arrivals[win][i] = 0
            #out += binned_arrivals[win] + "\n"
            for a in self.arrivals:
                k = int(win * int(a / win))
                binned_arrivals[win][k] += 1
            #out += binned_arrivals[win] + "\n"
            avg_rate = 0.
            for k in binned_arrivals[win]:
                avg_rate += binned_arrivals[win][k]/win
            avg_rate /= len(binned_arrivals[win])
            out += "Averaging over fixed bins of size %d, we have a rate of %.4f completions/second\n" % (win, avg_rate)
            bins = binned_arrivals[win].items()
            bins.sort()
            tot = 0.
            for bin, count in bins:
                tot += count
            for bin, count in bins:
                out += "%s --> %s, %.1f\n" % (bin, count, count / tot)
        multi_syslog(out, logger=self.logger.debug)
        self.logger.info("SpeedDiagnostics finished")
예제 #4
0
    def run(self):
        """
        Gets yzable objects out of inQ, calls self.analyze(yzable)
        """
        try:
            self.prepare_process()
            self.prepare()
            self.logger.debug("Starting.")
            while not self._stop.is_set():
                try:
                    yzable = self.inQ.get(timeout=self.queue_wait_sleep)
                except Queue.Empty:
                    continue
                self._trace("Analyzer %s getting ready to process %s" % (type(self).__name__,
                                                                         yzable))

                # We need to keep things sane.  This is anti-duck-typing to be sure
                # we're sending things we intended to send.
                if not isinstance(yzable, Analyzable):
                    self.logger.error('Expected instance of Analyzable(), got %r' % yzable)
                    yzable = None

                processed_yzable = None
                try:
                    if yzable is not None:
                        processed_yzable = self.analyze(yzable)
                except Exception, exc:
                    multi_syslog(
                        msg="analyze failed on: %s%s" % ( \
                            hasattr(yzable, "hostkey") and yzable.hostkey or "",
                            hasattr(yzable, "relurl") and yzable.relurl or ""),
                        exc=exc,
                        logger=self.logger.warning)

                # If there was an exception, drop it!
                # Some analyzers are intended to be filters. don't let things
                # past if they fail to work properly.
                yzable = processed_yzable

                # Drop yzable on request of analyzer.
                if yzable is None:
                    if self.in_flight is not None:
                        self.in_flight.acquire()
                        self.in_flight.value -= 1
                        self.in_flight.release()

                    self.logger.debug("Dropped yzable.")
                    continue

                # this blocks when processes later in the chain block
                initial_block = time()
                last_blocked = initial_block
                while not self._stop.is_set():
                    try:
                        self.outQ.put(yzable, timeout=self.queue_wait_sleep)
                        break
                    except Queue.Full:
                        cur = time()
                        if (cur - last_blocked) > 10:
                            self.logger.warning("Chain blocked for %d seconds" % (cur - initial_block))
                            last_blocked = cur
                self._trace("Analyzer %s finished processing %s" % (type(self).__name__,
                                                                    yzable))

            self.cleanup()
예제 #5
0
class Analyzer(Process):
    """
    Super class for all content analyzers.  This creates Queues for
    passing document-processing jobs through a chain of Analyzer
    instances.

    Subclasses should implement .prepare() and .analyze(job) and
    .cleanup() and set the 'name' attr.
    """
    name = "Analyzer Base Class"
    def __init__(self, inQ, outQ, debug=False, trace=False,
                 in_flight=None, queue_wait_sleep=1):
        """
        Sets up an inQ, outQ
        """
        Process.__init__(self, go=None, debug=debug)
        self.inQ  = inQ
        self.outQ = outQ
        self.trace = trace
        self.queue_wait_sleep = queue_wait_sleep
        self.in_flight = in_flight

        assert float(queue_wait_sleep) > 0

        if in_flight is not None:
            assert hasattr(in_flight, 'value')
            assert hasattr(in_flight, 'acquire')
            assert hasattr(in_flight, 'release')

    def prepare_process(self):
        super(Analyzer, self).prepare_process()
        self.logger = logging.getLogger("PyCrawler.Analyzer")

    def run(self):
        """
        Gets yzable objects out of inQ, calls self.analyze(yzable)
        """
        try:
            self.prepare_process()
            self.prepare()
            self.logger.debug("Starting.")
            while not self._stop.is_set():
                try:
                    yzable = self.inQ.get(timeout=self.queue_wait_sleep)
                except Queue.Empty:
                    continue
                self._trace("Analyzer %s getting ready to process %s" % (type(self).__name__,
                                                                         yzable))

                # We need to keep things sane.  This is anti-duck-typing to be sure
                # we're sending things we intended to send.
                if not isinstance(yzable, Analyzable):
                    self.logger.error('Expected instance of Analyzable(), got %r' % yzable)
                    yzable = None

                processed_yzable = None
                try:
                    if yzable is not None:
                        processed_yzable = self.analyze(yzable)
                except Exception, exc:
                    multi_syslog(
                        msg="analyze failed on: %s%s" % ( \
                            hasattr(yzable, "hostkey") and yzable.hostkey or "",
                            hasattr(yzable, "relurl") and yzable.relurl or ""),
                        exc=exc,
                        logger=self.logger.warning)

                # If there was an exception, drop it!
                # Some analyzers are intended to be filters. don't let things
                # past if they fail to work properly.
                yzable = processed_yzable

                # Drop yzable on request of analyzer.
                if yzable is None:
                    if self.in_flight is not None:
                        self.in_flight.acquire()
                        self.in_flight.value -= 1
                        self.in_flight.release()

                    self.logger.debug("Dropped yzable.")
                    continue

                # this blocks when processes later in the chain block
                initial_block = time()
                last_blocked = initial_block
                while not self._stop.is_set():
                    try:
                        self.outQ.put(yzable, timeout=self.queue_wait_sleep)
                        break
                    except Queue.Full:
                        cur = time()
                        if (cur - last_blocked) > 10:
                            self.logger.warning("Chain blocked for %d seconds" % (cur - initial_block))
                            last_blocked = cur
                self._trace("Analyzer %s finished processing %s" % (type(self).__name__,
                                                                    yzable))

            self.cleanup()
        except Exception, exc:
            multi_syslog(exc, logger=self.logger.warning)
예제 #6
0
    def run(self):
        """
        Gets an Analyzable object from inQ and feeds them into the
        chain of analyzers, which are its child processes.
        """
        self.prepare_process()
        try:
            if not self._yzers:
                self.logger.warning("run called with no analyzers")
                # The next lines stops coverage, so doesn't get recorded as covered.
                return self.cleanup_process() # pragma: no cover

            self.logger.debug("starting yzers with queues between")

            queues = [multiprocessing.Queue(self.qlen)]
            for pos in range(len(self._yzers)):
                queues.append(multiprocessing.Queue(self.qlen))
                (yzer, copies, args, kwargs) = self._yzers[pos]
                yzers = [yzer(queues[pos], queues[pos + 1], 
                              *args,
                              debug=self._debug,
                              queue_wait_sleep=self.queue_wait_sleep,
                              in_flight=self.in_flight,
                              **kwargs
                              )
                         for copy in range(copies)]
                for yzer in yzers:
                    yzer.start()
                self._yzers[pos] = yzers
            self.logger.debug("Starting main loop")
            yzable = None

            def pop_queue(timeout=None):
                """
                Pop as many items as we can off of the queue of finished items,
                without waiting.

                If timeout a positive integer, then wait up to timeout seconds
                for the first item, and get the rest without waiting.
                """
                
                block = bool(timeout)
                try:
                    while True:
                        my_yzable = queues[-1].get(block=block, timeout=timeout)

                        # After a single success, we become non-blocking and only
                        # eat available items.
                        timeout = None
                        block = False

                        self.in_flight.acquire()
                        self.in_flight.value -= 1
                        self.in_flight.release()

                        self.total_processed += 1
                except Queue.Empty:
                    pass

            self.last_in_flight = None
            last_in_flight_error_report = 0
            get_new = True

            while get_new or self.in_flight.value > 0:
                #syslog(
                #    LOG_DEBUG, "%d in_flight %d ever %d inQ.qsize %s" \
                #        % (in_flight, total_processed,
                #           self.inQ.qsize(),
                #           multiprocessing.active_children()))

                # We simply *must* always attempt to get objects from the queue if
                # they are available.  it is never acceptable to block on putting things
                # in the next queue, as there may be things in flight we need.

                if get_new:
                    try:
                        yzable = self.inQ.get(timeout=self.queue_wait_sleep)
                    except Queue.Empty:
                        yzable = None

                    if isinstance(yzable, Canary):
                        get_new = False
                        self.logger.info("Got canary, stopping.")
                    elif yzable is not None:
                        self.enqueue_yzable(queues[0], yzable, pop_queue)
                else:
                    pop_queue(timeout=self.queue_wait_sleep)

                # if none are in_flight, then we can sleep here
                curtime = time()
                # FIXME: we may not want to sleep if things are in-flight
                # and we have successfully popped an item.  But we want to
                # ensure we don't spin here.

                if self.in_flight.value > 0 and self.timewarn and \
                        curtime - self.last_in_flight > self.timewarn:
                    # report warnings if we've been waiting too long!
                    if curtime - last_in_flight_error_report > self.timewarn:
                        self.logger.warning('Most recent in-flight packet over %d seconds old, ' \
                                            '%d outstanding!' % (curtime - self.last_in_flight,
                                                                 self.in_flight.value))
                        last_in_flight_error_report = curtime

                if self.in_flight.value != 0 and not get_new and \
                        self.timeout and (curtime - self.last_in_flight) > self.timeout:
                    break

            # go is clear and none in_flight, stop all analyzers
            self.logger.info("Finished main loop")
            try:
                self.inQ.close()
                self.inQ.cancel_join_thread()
            except Exception, exc:
                multi_syslog(exc, logger=self.logger.warning)
            # stop all yzers
            for pos in range(len(self._yzers)):
                for yzer in self._yzers[pos]:
                    try:
                        yzer.stop()
                    except Exception, exc:
                        multi_syslog("%s(%d)" % (yzer.name, yzer.pid), exc,
                                     logger=self.logger.warning)
예제 #7
0
            try:
                self.inQ.close()
                self.inQ.cancel_join_thread()
            except Exception, exc:
                multi_syslog(exc, logger=self.logger.warning)
            # stop all yzers
            for pos in range(len(self._yzers)):
                for yzer in self._yzers[pos]:
                    try:
                        yzer.stop()
                    except Exception, exc:
                        multi_syslog("%s(%d)" % (yzer.name, yzer.pid), exc,
                                     logger=self.logger.warning)
            self.logger.debug("stopped all Analyzers")
        except Exception, exc:
            multi_syslog(exc, logger=self.logger.warning)
        try:
            self.cleanup_process()
        except Exception, exc:
            multi_syslog(exc, logger=self.logger.warning)

    def list_analyzers(self):
        """
        Returns a list of tuples (analyzer.name, copies)
        """
        return [(copies[0].name, len(copies)) 
                for copies in self._yzers]

class Analyzer(Process):
    """
    Super class for all content analyzers.  This creates Queues for