def _setup_stats_plugins(self, redis_conn):
    '''
    Sets up the plugin stats collectors

    @param redis_conn: the redis connection
    '''
    # one sub-dict of collectors per loaded plugin, keyed by class name
    self.stats_dict['plugins'] = {}
    for key in self.plugins_dict:
        plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
        base_key = 'stats:kafka-monitor:{p}'.format(p=plugin_name)
        plugin_stats = {}
        self.stats_dict['plugins'][plugin_name] = plugin_stats
        for item in self.settings['STATS_TIMES']:
            try:
                # translate the configured window name (e.g. SECONDS_1_HOUR)
                # into its numeric value on StatsCollector
                time = getattr(StatsCollector, item)
                plugin_stats[time] = StatsCollector.get_rolling_time_window(
                    redis_conn=redis_conn,
                    key='{k}:{t}'.format(k=base_key, t=time),
                    window=time,
                    cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up {p} plugin Stats Collector '{i}'"
                                  .format(p=plugin_name, i=item))
            except AttributeError:
                self.logger.warning("Unable to find Stats Time '{s}'"
                                    .format(s=item))
        # lifetime counter is a HyperLogLog that is never rolled over
        lifetime = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=base_key),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"
                          .format(p=plugin_name))
        plugin_stats['lifetime'] = lifetime
def _setup_stats_status_codes(self):
    '''
    Sets up the status code stats collectors

    Creates rolling-window collectors plus a lifetime HLL counter under
    the per-host/per-spider 504 stats key, mirroring what the log retry
    middleware writes.
    '''
    hostname = self._get_hostname()
    # we chose to handle 504's here as well as in the middleware
    # in case the middleware is disabled
    self.logger.debug("Setting up log retry middleware stats")
    status_code = 504
    temp_key = 'stats:crawler:{h}:{n}:{s}'.format(
        h=hostname, n=self.name, s=status_code)
    for item in self.settings['STATS_TIMES']:
        try:
            # resolve the configured window name to seconds
            time = getattr(StatsCollector, item)
            self.stats_dict[time] = StatsCollector \
                .get_rolling_time_window(
                    redis_conn=self.redis_conn,
                    key='{k}:{t}'.format(k=temp_key, t=time),
                    window=time,
                    cycle_time=self.settings['STATS_CYCLE'])
            self.logger.debug("Set up LRM status code {s}, {n} spider,"
                              " host {h} Stats Collector '{i}'"
                              .format(h=hostname, n=self.name,
                                      s=status_code, i=item))
        except AttributeError:
            # the configured window name does not exist on StatsCollector
            self.logger.warning("Unable to find Stats Time '{s}'"
                                .format(s=item))
    # lifetime count via HyperLogLog; never rolled over
    total = StatsCollector.get_hll_counter(
        redis_conn=self.redis_conn,
        key='{k}:lifetime'.format(k=temp_key),
        cycle_time=self.settings['STATS_CYCLE'],
        roll=False)
    # fixed: the concatenated message was missing the space between
    # "spider," and "host" (it logged "...spider,host ...")
    self.logger.debug("Set up status code {s}, {n} spider,"
                      " host {h} Stats Collector 'lifetime'"
                      .format(h=hostname, n=self.name, s=status_code))
    self.stats_dict['lifetime'] = total
def _setup_stats_plugins(self):
    '''
    Sets up the plugin stats collectors
    '''
    # collectors are grouped per plugin class name
    self.stats_dict['plugins'] = {}
    for key in self.plugins_dict:
        plugin_name = self.plugins_dict[key]['instance'].__class__.__name__
        root_key = 'stats:redis-monitor:{p}'.format(p=plugin_name)
        collectors = {}
        self.stats_dict['plugins'][plugin_name] = collectors
        for item in self.settings['STATS_TIMES']:
            try:
                # map the window name onto its numeric StatsCollector value
                time = getattr(StatsCollector, item)
                collectors[time] = StatsCollector.get_rolling_time_window(
                    redis_conn=self.redis_conn,
                    key='{k}:{t}'.format(k=root_key, t=time),
                    window=time,
                    cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up {p} plugin Stats Collector '{i}'"
                                  .format(p=plugin_name, i=item))
            except AttributeError:
                self.logger.warning("Unable to find Stats Time '{s}'"
                                    .format(s=item))
        # lifetime unique count, backed by a HyperLogLog, never rolled
        lifetime = StatsCollector.get_hll_counter(
            redis_conn=self.redis_conn,
            key='{k}:lifetime'.format(k=root_key),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        self.logger.debug("Set up {p} plugin Stats Collector 'lifetime'"
                          .format(p=plugin_name))
        collectors['lifetime'] = lifetime
def _setup_stats_total(self, redis_conn):
    '''
    Sets up the total stats collectors

    @param redis_conn: the redis connection
    '''
    # 'total' and 'fail' are tracked in parallel under separate redis keys
    buckets = (('total', 'stats:kafka-monitor:total'),
               ('fail', 'stats:kafka-monitor:fail'))
    for label, _ in buckets:
        self.stats_dict[label] = {}
    for item in self.settings['STATS_TIMES']:
        try:
            # resolve the configured window name to seconds
            time = getattr(StatsCollector, item)
            for label, base in buckets:
                self.stats_dict[label][time] = \
                    StatsCollector.get_rolling_time_window(
                        redis_conn=redis_conn,
                        key='{k}:{t}'.format(k=base, t=time),
                        window=time,
                        cycle_time=self.settings['STATS_CYCLE'])
            self.logger.debug("Set up total/fail Stats Collector '{i}'"
                              .format(i=item))
        except AttributeError:
            self.logger.warning("Unable to find Stats Time '{s}'"
                                .format(s=item))
    # lifetime counts use HyperLogLogs that are never rolled over
    for label, base in buckets:
        self.stats_dict[label]['lifetime'] = StatsCollector.get_hll_counter(
            redis_conn=redis_conn,
            key='{k}:lifetime'.format(k=base),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
    self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
def _setup_stats_total(self):
    '''
    Sets up the total stats collectors
    '''
    # paired outcome buckets, each with its own redis key prefix
    outcomes = (('total', 'stats:redis-monitor:total'),
                ('fail', 'stats:redis-monitor:fail'))
    for outcome, _ in outcomes:
        self.stats_dict[outcome] = {}
    for item in self.settings['STATS_TIMES']:
        try:
            # window name -> numeric window size on StatsCollector
            time = getattr(StatsCollector, item)
            for outcome, prefix in outcomes:
                self.stats_dict[outcome][time] = \
                    StatsCollector.get_rolling_time_window(
                        redis_conn=self.redis_conn,
                        key='{k}:{t}'.format(k=prefix, t=time),
                        window=time,
                        cycle_time=self.settings['STATS_CYCLE'])
            self.logger.debug("Set up total/fail Stats Collector '{i}'"
                              .format(i=item))
        except AttributeError:
            self.logger.warning("Unable to find Stats Time '{s}'"
                                .format(s=item))
    # lifetime counts: HyperLogLog counters, never rolled
    for outcome, prefix in outcomes:
        self.stats_dict[outcome]['lifetime'] = \
            StatsCollector.get_hll_counter(
                redis_conn=self.redis_conn,
                key='{k}:lifetime'.format(k=prefix),
                cycle_time=self.settings['STATS_CYCLE'],
                roll=False)
    self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
def _create_rule_counter(self, rule_id):
    """
    Create a rule counter

    :param rule_id: id of the rule to create a counter for
    :return: stats_collector: StatsCollector rolling time window
    """
    # window length (seconds) is env-tunable; defaults to 15 minutes
    window_seconds = int(os.getenv('STATS_COLLECTION_WINDOW', 900))
    redis_key = 'stats:{}:{}:{}'.format(
        self.traptor_type, self.traptor_id, rule_id)
    return StatsCollector.get_rolling_time_window(
        redis_conn=self.redis_conn,
        key=redis_key,
        window=window_seconds)
default=False)
# connection flags for the Redis instance that backs the counters
parser.add_argument('-r', '--redis-host', action='store', required=True,
                    help="The Redis host ip")
parser.add_argument('-p', '--redis-port', action='store', default='6379',
                    help="The Redis port")
parser.add_argument('-P', '--redis-password', action='store', default=None,
                    help="The Redis password")
args = vars(parser.parse_args())

# both counter flavors use a one-minute window
the_window = StatsCollector.SECONDS_1_MINUTE
if args['rolling_window']:
    # rolling window: continuously expires old data (cycle_time=1 second)
    counter = StatsCollector.get_rolling_time_window(
        host=args['redis_host'], port=args['redis_port'],
        password=args['redis_password'], window=the_window, cycle_time=1)
else:
    # fixed window: retains at most the 3 most recent windows
    counter = StatsCollector.get_time_window(
        host=args['redis_host'], port=args['redis_port'],
        password=args['redis_password'], window=the_window, keep_max=3)

print("Kill this program by pressing `ENTER` when done")

# align the start time down to the beginning of the current window
the_time = int(time())
floor_time = the_time % the_window
final_time = the_time - floor_time
description='Example key press stats collector.\n')
# choose between a RollingTimeWindow and a plain TimeWindow counter
parser.add_argument('-rw', '--rolling-window', action='store_true',
                    required=False,
                    help="Use a RollingTimeWindow counter", default=False)
# connection flags for the Redis instance that backs the counters
parser.add_argument('-r', '--redis-host', action='store', required=True,
                    help="The Redis host ip")
parser.add_argument('-p', '--redis-port', action='store', default='6379',
                    help="The Redis port")
args = vars(parser.parse_args())

# both counter flavors use a one-minute window
the_window = StatsCollector.SECONDS_1_MINUTE
if args['rolling_window']:
    # rolling window: continuously expires old data (cycle_time=1 second)
    counter = StatsCollector.get_rolling_time_window(
        host=args['redis_host'], port=args['redis_port'],
        window=the_window, cycle_time=1)
else:
    # fixed window: retains at most the 3 most recent windows
    counter = StatsCollector.get_time_window(
        host=args['redis_host'], port=args['redis_port'],
        window=the_window, keep_max=3)

# NOTE(review): Python 2 print statement — this example appears to target
# Python 2; confirm before porting
print "Kill this program by pressing `ENTER` when done"

# align the start time down to the beginning of the current window
the_time = int(time())
floor_time = the_time % the_window
final_time = the_time - floor_time
pressed_enter = False
while not pressed_enter:
def parse(self, response):
    '''
    Crawl callback: captures the raw response as a RawResponseItem and,
    while under the request's maxdepth, yields a new Request for every
    link extracted from the page.

    @param response: the scrapy Response being processed
    '''
    self._logger.info("crawled url {}".format(response.request.url))
    # seed requests may carry no curdepth; treat them as depth 0
    cur_depth = 0
    if 'curdepth' in response.meta:
        cur_depth = response.meta['curdepth']

    # capture raw response
    item = RawResponseItem()
    # populated from response.meta
    item['appid'] = response.meta['appid']
    item['crawlid'] = response.meta['crawlid']
    item['attrs'] = response.meta['attrs']
    # populated from raw HTTP response
    item["url"] = response.request.url
    item["response_url"] = response.url
    item["status_code"] = response.status
    item["status_msg"] = "OK"
    item["response_headers"] = self.reconstruct_headers(response)
    item["request_headers"] = response.request.headers
    item["body"] = response.body
    item["links"] = []

    # lazily create a redis-backed page counter per crawlid
    # NOTE(review): self.counter never evicts entries — one counter object
    # is retained per crawlid for the spider's lifetime; verify intended
    if item['crawlid'] not in self.counter:
        key = '{k}:{i}'.format(k=self.temp_key, i=item['crawlid'])
        self._logger.info("no counter for {crawlid} key {key} ".format(
            crawlid=item['crawlid'], key=key))
        self.counter[item['crawlid']] = StatsCollector.get_counter(
            redis_conn=self.redis_conn, key=key, roll=False)
    self._logger.info(
        "counter {crawlid} preincrement".format(crawlid=item['crawlid']))
    # count this crawled page against its crawlid
    self.counter[item['crawlid']].increment()
    self._logger.info(
        "counter {crawlid} postincrement".format(crawlid=item['crawlid']))

    # determine whether to continue spidering
    if cur_depth >= response.meta['maxdepth']:
        self._logger.info("Not spidering links in '{}' because" \
                          " cur_depth={} >= maxdepth={}".format(
                              response.url, cur_depth,
                              response.meta['maxdepth']))
    else:
        # we are spidering -- yield Request for each discovered link
        link_extractor = LinkExtractor(
            allow_domains=response.meta['allowed_domains'],
            allow=response.meta['allow_regex'],
            deny=response.meta['deny_regex'],
            deny_extensions=response.meta['deny_extensions'])
        for link in link_extractor.extract_links(response):
            # link that was discovered; strip embedded newlines
            the_url = link.url
            the_url = the_url.replace('\n', '')
            item["links"].append({"url": the_url, "text": link.text, })
            req = Request(the_url, callback=self.parse)
            # children sink 10 priority points and gain one depth level
            req.meta['priority'] = response.meta['priority'] - 10
            req.meta['curdepth'] = response.meta['curdepth'] + 1
            # propagate an explicit user agent if the parent carried one
            if 'useragent' in response.meta and \
                    response.meta['useragent'] is not None:
                req.headers['User-Agent'] = response.meta['useragent']
            self._logger.debug(
                "Trying to follow link '{}'".format(req.url))
            yield req

    # raw response has been processed, yield to item pipeline
    yield item