示例#1
0
    def _setup_stats_plugins(self, redis_conn):
        '''
        Builds the stats collectors for every loaded plugin and stores them
        in self.stats_dict['plugins'], keyed by the plugin's class name

        @param redis_conn: the redis connection
        '''
        self.stats_dict['plugins'] = {}
        for plugin_entry in self.plugins_dict.values():
            name = plugin_entry['instance'].__class__.__name__
            base_key = 'stats:kafka-monitor:{p}'.format(p=name)
            plugin_stats = {}
            self.stats_dict['plugins'][name] = plugin_stats

            # one rolling window per configured StatsCollector time constant
            for time_name in self.settings['STATS_TIMES']:
                try:
                    window = getattr(StatsCollector, time_name)
                    plugin_stats[window] = \
                        StatsCollector.get_rolling_time_window(
                            redis_conn=redis_conn,
                            key='{k}:{t}'.format(k=base_key, t=window),
                            window=window,
                            cycle_time=self.settings['STATS_CYCLE'])
                    self.logger.debug(
                        "Set up {p} plugin Stats Collector '{i}'".format(
                            p=name, i=time_name))
                except AttributeError:
                    # the configured name is not an attribute we can resolve
                    self.logger.warning(
                        "Unable to find Stats Time '{s}'".format(s=time_name))

            # non-rolling counter stored under the 'lifetime' key
            lifetime = StatsCollector.get_hll_counter(
                redis_conn=redis_conn,
                key='{k}:lifetime'.format(k=base_key),
                cycle_time=self.settings['STATS_CYCLE'],
                roll=False)
            self.logger.debug(
                "Set up {p} plugin Stats Collector 'lifetime'".format(p=name))
            plugin_stats['lifetime'] = lifetime
    def _setup_stats_status_codes(self):
        '''
        Sets up the status code stats collectors for 504 responses.

        For every name in settings['STATS_TIMES'] that resolves to a
        StatsCollector attribute, a rolling time window collector is stored
        in self.stats_dict under that attribute's value. A counter created
        with roll=False is stored under the 'lifetime' key.
        '''
        hostname = self._get_hostname()
        # we chose to handle 504's here as well as in the middleware
        # in case the middleware is disabled
        self.logger.debug("Setting up log retry middleware stats")
        status_code = 504
        temp_key = 'stats:crawler:{h}:{n}:{s}'.format(
            h=hostname, n=self.name, s=status_code)
        for item in self.settings['STATS_TIMES']:
            try:
                time = getattr(StatsCollector, item)

                self.stats_dict[time] = StatsCollector \
                        .get_rolling_time_window(
                                redis_conn=self.redis_conn,
                                key='{k}:{t}'.format(k=temp_key, t=time),
                                window=time,
                                cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug("Set up LRM status code {s}, {n} spider,"\
                    " host {h} Stats Collector '{i}'"\
                    .format(h=hostname, n=self.name, s=status_code, i=item))
            except AttributeError:
                # skip this entry and keep setting up the remaining ones
                self.logger.warning("Unable to find Stats Time '{s}'"\
                        .format(s=item))
        total = StatsCollector.get_hll_counter(redis_conn=self.redis_conn,
                        key='{k}:lifetime'.format(k=temp_key),
                        cycle_time=self.settings['STATS_CYCLE'],
                        roll=False)
        # the leading space before "host" keeps the two adjacent string
        # literals from running together as "spider,host"
        self.logger.debug("Set up status code {s}, {n} spider,"\
                    " host {h} Stats Collector 'lifetime'"\
                        .format(h=hostname, n=self.name, s=status_code))
        self.stats_dict['lifetime'] = total
示例#3
0
    def _setup_stats_plugins(self):
        '''
        Creates the stats collectors of each plugin, stored in
        self.stats_dict['plugins'] under the plugin's class name
        '''
        self.stats_dict['plugins'] = {}
        for plugin_key in self.plugins_dict:
            instance = self.plugins_dict[plugin_key]['instance']
            name = instance.__class__.__name__
            redis_prefix = 'stats:redis-monitor:{p}'.format(p=name)
            collectors = {}
            self.stats_dict['plugins'][name] = collectors

            for time_name in self.settings['STATS_TIMES']:
                try:
                    window = getattr(StatsCollector, time_name)
                    window_key = '{k}:{t}'.format(k=redis_prefix, t=window)
                    collectors[window] = \
                        StatsCollector.get_rolling_time_window(
                            redis_conn=self.redis_conn,
                            key=window_key,
                            window=window,
                            cycle_time=self.settings['STATS_CYCLE'])
                    message = "Set up {p} plugin Stats Collector '{i}'"
                    self.logger.debug(message.format(p=name, i=time_name))
                except AttributeError:
                    # attribute lookup failed; warn and move to the next one
                    message = "Unable to find Stats Time '{s}'"
                    self.logger.warning(message.format(s=time_name))

            # counter created with roll=False, stored as 'lifetime'
            lifetime_key = '{k}:lifetime'.format(k=redis_prefix)
            lifetime = StatsCollector.get_hll_counter(
                redis_conn=self.redis_conn,
                key=lifetime_key,
                cycle_time=self.settings['STATS_CYCLE'],
                roll=False)
            self.logger.debug(
                "Set up {p} plugin Stats Collector 'lifetime'".format(p=name))
            collectors['lifetime'] = lifetime
示例#4
0
    def _setup_stats_total(self, redis_conn):
        '''
        Creates the 'total' and 'fail' stats collectors and stores them
        in self.stats_dict['total'] and self.stats_dict['fail']

        @param redis_conn: the redis connection
        '''
        self.stats_dict['total'] = {}
        self.stats_dict['fail'] = {}
        # (stats_dict group, redis key prefix), processed in this order
        groups = (
            ('total', 'stats:kafka-monitor:total'),
            ('fail', 'stats:kafka-monitor:fail'),
        )

        for time_name in self.settings['STATS_TIMES']:
            try:
                window = getattr(StatsCollector, time_name)
                for group, prefix in groups:
                    self.stats_dict[group][window] = \
                        StatsCollector.get_rolling_time_window(
                            redis_conn=redis_conn,
                            key='{k}:{t}'.format(k=prefix, t=window),
                            window=window,
                            cycle_time=self.settings['STATS_CYCLE'])
                self.logger.debug(
                    "Set up total/fail Stats Collector '{i}'".format(
                        i=time_name))
            except AttributeError:
                self.logger.warning(
                    "Unable to find Stats Time '{s}'".format(s=time_name))

        # create both lifetime counters first, then publish them together
        lifetime_counters = [
            StatsCollector.get_hll_counter(
                redis_conn=redis_conn,
                key='{k}:lifetime'.format(k=prefix),
                cycle_time=self.settings['STATS_CYCLE'],
                roll=False)
            for _, prefix in groups
        ]
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        for (group, _), lifetime in zip(groups, lifetime_counters):
            self.stats_dict[group]['lifetime'] = lifetime
示例#5
0
    def _setup_stats_total(self):
        '''
        Builds the collectors that track total and failed counts, kept in
        self.stats_dict['total'] and self.stats_dict['fail']
        '''
        total_stats = {}
        fail_stats = {}
        self.stats_dict['total'] = total_stats
        self.stats_dict['fail'] = fail_stats
        total_prefix = 'stats:redis-monitor:total'
        fail_prefix = 'stats:redis-monitor:fail'

        for time_name in self.settings['STATS_TIMES']:
            try:
                window = getattr(StatsCollector, time_name)

                total_window = StatsCollector.get_rolling_time_window(
                    redis_conn=self.redis_conn,
                    key='{k}:{t}'.format(k=total_prefix, t=window),
                    window=window,
                    cycle_time=self.settings['STATS_CYCLE'])
                total_stats[window] = total_window

                fail_window = StatsCollector.get_rolling_time_window(
                    redis_conn=self.redis_conn,
                    key='{k}:{t}'.format(k=fail_prefix, t=window),
                    window=window,
                    cycle_time=self.settings['STATS_CYCLE'])
                fail_stats[window] = fail_window

                self.logger.debug(
                    "Set up total/fail Stats Collector '{i}'".format(
                        i=time_name))
            except AttributeError:
                self.logger.warning(
                    "Unable to find Stats Time '{s}'".format(s=time_name))

        # counters created with roll=False, stored under 'lifetime'
        total_lifetime = StatsCollector.get_hll_counter(
            redis_conn=self.redis_conn,
            key='{k}:lifetime'.format(k=total_prefix),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        fail_lifetime = StatsCollector.get_hll_counter(
            redis_conn=self.redis_conn,
            key='{k}:lifetime'.format(k=fail_prefix),
            cycle_time=self.settings['STATS_CYCLE'],
            roll=False)
        self.logger.debug("Set up total/fail Stats Collector 'lifetime'")
        total_stats['lifetime'] = total_lifetime
        fail_stats['lifetime'] = fail_lifetime
示例#6
0
    def _create_rule_counter(self, rule_id):
        """
        Build the rolling time window collector for a single rule.

        The window comes from the STATS_COLLECTION_WINDOW environment
        variable (900 when unset) and the key is built from the traptor
        type, the traptor id and the rule id.

        :param rule_id: id of the rule to create a counter for
        :return: StatsCollector rolling time window
        """
        window = int(os.getenv('STATS_COLLECTION_WINDOW', 900))
        key_parts = (self.traptor_type, self.traptor_id, rule_id)
        return StatsCollector.get_rolling_time_window(
            redis_conn=self.redis_conn,
            key='stats:{}:{}:{}'.format(*key_parts),
            window=window)
示例#7
0
                    default=False)
# connection options for the Redis instance backing the counter
parser.add_argument('-r', '--redis-host', required=True, action='store',
                    help="The Redis host ip")
parser.add_argument('-p', '--redis-port', default='6379', action='store',
                    help="The Redis port")
parser.add_argument('-P', '--redis-password', default=None, action='store',
                    help="The Redis password")

# plain dict view of the parsed command line options
args = vars(parser.parse_args())

# both counter flavours are built with this same window value
the_window = StatsCollector.SECONDS_1_MINUTE

# --rolling-window selects the rolling counter, otherwise a time window
counter = (
    StatsCollector.get_rolling_time_window(
        host=args['redis_host'],
        port=args['redis_port'],
        password=args['redis_password'],
        window=the_window,
        cycle_time=1)
    if args['rolling_window'] else
    StatsCollector.get_time_window(
        host=args['redis_host'],
        port=args['redis_port'],
        password=args['redis_password'],
        window=the_window,
        keep_max=3)
)

print("Kill this program by pressing `ENTER` when done")

# round the integer timestamp down to a multiple of the window
the_time = int(time())
floor_time = the_time % the_window
final_time = the_time - floor_time
示例#8
0
    description='Example key press stats collector.\n')
# command line options: counter flavour and Redis connection details
parser.add_argument('-rw', '--rolling-window', action='store_true',
                    required=False, help="Use a RollingTimeWindow counter",
                    default=False)
parser.add_argument('-r', '--redis-host', action='store', required=True,
                    help="The Redis host ip")
parser.add_argument('-p', '--redis-port', action='store', default='6379',
                    help="The Redis port")

# plain dict view of the parsed command line options
args = vars(parser.parse_args())

# both counter flavours are built with this same window value
the_window = StatsCollector.SECONDS_1_MINUTE

if args['rolling_window']:
    counter = StatsCollector.get_rolling_time_window(host=args['redis_host'],
                                                     port=args['redis_port'],
                                                     window=the_window,
                                                     cycle_time=1)
else:
    counter = StatsCollector.get_time_window(host=args['redis_host'],
                                             port=args['redis_port'],
                                             window=the_window,
                                             keep_max=3)

# call form of print: valid on Python 3 and, with a single string
# argument, prints the same text on Python 2
print("Kill this program by pressing `ENTER` when done")

# round the integer timestamp down to a multiple of the window
the_time = int(time())
floor_time = the_time % the_window
final_time = the_time - floor_time

pressed_enter = False
while not pressed_enter:
示例#9
0
    def parse(self, response):
        """
        Turn a crawled response into a RawResponseItem, bump the counter
        of its crawl id and, while the depth limit allows it, follow the
        links found in the response.

        This is a generator: it yields one Request per discovered link
        (only when the current depth is below response.meta['maxdepth'])
        and finally the populated item.

        :param response: the crawled response; its meta must contain
            'appid', 'crawlid', 'attrs' and 'maxdepth', and when links are
            followed also 'allowed_domains', 'allow_regex', 'deny_regex',
            'deny_extensions' and 'priority'
        """
        self._logger.info("crawled url {}".format(response.request.url))
        # a response without 'curdepth' in its meta is treated as depth 0
        cur_depth = 0
        if 'curdepth' in response.meta:
            cur_depth = response.meta['curdepth']

        # capture raw response
        item = RawResponseItem()
        # populated from response.meta
        item['appid'] = response.meta['appid']
        item['crawlid'] = response.meta['crawlid']
        item['attrs'] = response.meta['attrs']

        # populated from raw HTTP response
        item["url"] = response.request.url
        item["response_url"] = response.url
        item["status_code"] = response.status
        item["status_msg"] = "OK"
        item["response_headers"] = self.reconstruct_headers(response)
        item["request_headers"] = response.request.headers
        item["body"] = response.body
        item["links"] = []

        # lazily create one counter per crawl id the first time it is seen
        if item['crawlid'] not in self.counter:
            key = '{k}:{i}'.format(k=self.temp_key, i=item['crawlid'])
            self._logger.info("no counter for {crawlid} key {key} ".format(
                crawlid=item['crawlid'], key=key))
            self.counter[item['crawlid']] = StatsCollector.get_counter(
                redis_conn=self.redis_conn, key=key, roll=False)

        self._logger.info("counter {crawlid} preincrement".format(
            crawlid=item['crawlid']))
        self.counter[item['crawlid']].increment()
        self._logger.info("counter {crawlid} postincrement".format(
            crawlid=item['crawlid']))

        # determine whether to continue spidering
        if cur_depth >= response.meta['maxdepth']:
            self._logger.info("Not spidering links in '{}' because" \
                " cur_depth={} >= maxdepth={}".format(
                                                      response.url,
                                                      cur_depth,
                                                      response.meta['maxdepth']))
        else:
            # we are spidering -- yield Request for each discovered link
            link_extractor = LinkExtractor(
                            allow_domains=response.meta['allowed_domains'],
                            allow=response.meta['allow_regex'],
                            deny=response.meta['deny_regex'],
                            deny_extensions=response.meta['deny_extensions'])

            for link in link_extractor.extract_links(response):
                # link that was discovered
                the_url = link.url
                the_url = the_url.replace('\n', '')
                item["links"].append({"url": the_url, "text": link.text, })
                req = Request(the_url, callback=self.parse)

                req.meta['priority'] = response.meta['priority'] - 10
                # use the defaulted cur_depth rather than indexing
                # response.meta, so a response without 'curdepth' does not
                # raise KeyError here
                req.meta['curdepth'] = cur_depth + 1

                if 'useragent' in response.meta and \
                        response.meta['useragent'] is not None:
                    req.headers['User-Agent'] = response.meta['useragent']

                self._logger.debug("Trying to follow link '{}'".format(req.url))
                yield req

        # raw response has been processed, yield to item pipeline
        yield item