def setup(self, settings): ''' Does the actual setup of the middleware ''' # set up the default sc logger my_level = settings.get('SC_LOG_LEVEL', 'INFO') my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') my_output = settings.get('SC_LOG_STDOUT', True) my_json = settings.get('SC_LOG_JSON', False) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = settings.get('SC_LOG_FILE', 'main.log') my_backups = settings.get('SC_LOG_BACKUPS', 5) self.logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups) #self.logger.setLevel(logging.DEBUG) self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) # stats setup self.stats_dict = {} self.settings = settings self.name = self.settings['SPIDER_NAME'] if self.settings['STATS_STATUS_CODES']: self.redis_conn = redis.Redis(host=self.settings.get('REDIS_HOST'), port=self.settings.get('REDIS_PORT')) self._setup_stats_status_codes()
def _setup(self): """ Set up Traptor. Load everything up. Note that any arg here will override both default and custom settings. """ traptor_name = 'traptor-{}-{}'.format(os.getenv('TRAPTOR_TYPE', 'track'), os.getenv('TRAPTOR_ID', 0)) # Set up logging self.logger = LogFactory.get_instance(json=os.getenv('LOG_JSON', settings.LOG_JSON) == 'True', name=os.getenv('LOG_NAME', settings.LOG_NAME), stdout=os.getenv('LOG_STDOUT', settings.LOG_STDOUT) == 'True', level=os.getenv('LOG_LEVEL', settings.LOG_LEVEL), dir=os.getenv('LOG_DIR', settings.LOG_DIR), file=os.getenv('LOG_FILE', settings.LOG_FILE)) if settings.DW_ENABLED: dw_config(settings.DW_CONFIG) self.logger.register_callback('>=INFO', dw_callback) # Set the restart_flag to False self.restart_flag = False # Set up required connections self._setup_kafka() self._setup_birdy() # Create the locations_rule dict if this is a locations traptor self.locations_rule = {}
def _setup(self): """ Set up Traptor. Load everything up. Note that any arg here will override both default and custom settings. """ traptor_name = 'traptor-{}-{}'.format(os.getenv('TRAPTOR_TYPE', 'track'), os.getenv('TRAPTOR_ID', 0)) # Set up logging self.logger = LogFactory.get_instance(json=True, stdout=False, name=traptor_name, level=self.log_level, dir=self.log_dir, file=self.log_file_name) # Set the restart_flag to False self.restart_flag = False # Set up required connections self._setup_kafka() self._setup_birdy() # Create the locations_rule dict if this is a locations traptor self.locations_rule = {}
def from_settings(cls, settings): server = redis.Redis(host=settings.get('REDIS_HOST'), port=settings.get('REDIS_PORT')) persist = settings.get('SCHEDULER_PERSIST', True) up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10) hits = settings.get('QUEUE_HITS', 10) window = settings.get('QUEUE_WINDOW', 60) mod = settings.get('QUEUE_MODERATED', False) timeout = settings.get('DUPEFILTER_TIMEOUT', 600) ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60) add_type = settings.get('SCHEDULER_TYPE_ENABLED', False) add_ip = settings.get('SCHEDULER_IP_ENABLED', False) retries = settings.get('SCHEUDLER_ITEM_RETRIES', 3) ip_regex = settings.get('IP_ADDR_REGEX', '.*') my_level = settings.get('SC_LOG_LEVEL', 'INFO') my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') my_output = settings.get('SC_LOG_STDOUT', True) my_json = settings.get('SC_LOG_JSON', False) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = settings.get('SC_LOG_FILE', 'main.log') my_backups = settings.get('SC_LOG_BACKUPS', 5) logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups) return cls(server, persist, up_int, timeout, retries, logger, hits, window, mod, ip_refresh, add_type, add_ip, ip_regex)
def __init__(self): self.logger = LogFactory.get_instance( json=os.getenv('LOG_JSON', 'True') == 'True', name=os.getenv('LOG_NAME', 'traptor-healthcheck'), stdout=os.getenv('LOG_STDOUT', 'False') == 'True', level=os.getenv('LOG_LEVEL', 'INFO'), dir=os.getenv('LOG_DIR', 'logs'), file=os.getenv('LOG_FILE', 'traptor-healthcheck.log')) # Set the default timeout for all sockets socket.setdefaulttimeout(int(settings.HEALTHCHECK_TIMEOUT)) if settings.DW_ENABLED: self.logger.debug("Enabling dogwhistle") default_tags = { "tags": [ "traptor_type:{}".format(os.getenv('TRAPTOR_TYPE')), "traptor_id:{}".format(os.getenv('TRAPTOR_ID')) ] } dw_settings = dict(settings.DW_CONFIG.items() + default_tags.items()) dw_config(dw_settings) self.logger.register_callback('>=INFO', dw_callback) signal.signal(signal.SIGINT, self.close) signal.signal(signal.SIGTERM, self.close)
def from_settings(cls, settings): my_level = settings.get('SC_LOG_LEVEL', 'INFO') my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') my_output = settings.get('SC_LOG_STDOUT', True) my_json = settings.get('SC_LOG_JSON', False) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = settings.get('SC_LOG_FILE', 'main.log') my_backups = settings.get('SC_LOG_BACKUPS', 5) my_appids = settings.get('KAFKA_APPID_TOPICS', False) logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups) try: kafka = KafkaClient(settings['KAFKA_HOSTS']) producer = SimpleProducer(kafka) except KafkaUnavailableError: logger.error("Unable to connect to Kafka in Pipeline"\ ", raising exit flag.") # this is critical so we choose to exit sys.exit(1) topic_prefix = settings['KAFKA_TOPIC_PREFIX'] use_base64 = settings['KAFKA_BASE_64_ENCODE'] return cls(producer, topic_prefix, kafka, logger, appids=my_appids, use_base64=use_base64)
def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance( json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.validator = self.extend_with_default(Draft4Validator)
def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance(json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'], port=self.settings['REDIS_PORT']) try: self.redis_conn.info() self.logger.debug("Successfully connected to Redis") except ConnectionError: self.logger.error("Failed to connect to Redis") # essential to functionality sys.exit(1) self._load_plugins() self._setup_stats()
def _setup(self): """Create the logger and set up the rest of the configuration.""" self.logger = LogFactory.get_instance(json=True, stdout=False, name='pypi_scraper', level=self.log_level, dir=self.log_dir, file=self.log_file_name)
def setup(self): """从配置文件中加载配置信息""" self.settings = self.wrapper.load('settings.py') self.logger = LogFactory.get_instance(json=self.settings['LOG_JSON'], stdout=self.settings['LOG_STDOUT'], level=self.settings['LOG_LEVEL'], name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS'])
def from_settings(cls, settings): server = redis.Redis( host=settings.get('REDIS_HOST'), port=settings.get('REDIS_PORT'), db=settings.get('REDIS_DB'), password=settings.get('REDIS_PASSWORD'), decode_responses=True, socket_timeout=settings.get('REDIS_SOCKET_TIMEOUT'), socket_connect_timeout=settings.get('REDIS_SOCKET_TIMEOUT')) persist = settings.get('SCHEDULER_PERSIST', True) up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10) hits = settings.get('QUEUE_HITS', 10) window = settings.get('QUEUE_WINDOW', 60) mod = settings.get('QUEUE_MODERATED', False) timeout = settings.get('DUPEFILTER_TIMEOUT', 600) ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60) add_type = settings.get('SCHEDULER_TYPE_ENABLED', False) add_ip = settings.get('SCHEDULER_IP_ENABLED', False) retries = settings.get('SCHEUDLER_ITEM_RETRIES', 3) ip_regex = settings.get('IP_ADDR_REGEX', '.*') backlog_blacklist = settings.get('SCHEDULER_BACKLOG_BLACKLIST', True) queue_timeout = settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600) my_level = settings.get('SC_LOG_LEVEL', 'INFO') my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') my_output = settings.get('SC_LOG_STDOUT', True) my_json = settings.get('SC_LOG_JSON', False) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = settings.get('SC_LOG_FILE', 'main.log') my_backups = settings.get('SC_LOG_BACKUPS', 5) logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups) global_page_per_domain_limit = settings.get( 'GLOBAL_PAGE_PER_DOMAIN_LIMIT', None) global_page_per_domain_limit_timeout = settings.get( 'GLOBAL_PAGE_PER_DOMAIN_LIMIT_TIMEOUT', 600) domain_max_page_timeout = settings.get('DOMAIN_MAX_PAGE_TIMEOUT', 600) return cls(server, persist, up_int, timeout, retries, logger, hits, window, mod, ip_refresh, add_type, add_ip, ip_regex, backlog_blacklist, queue_timeout, global_page_per_domain_limit, global_page_per_domain_limit_timeout, domain_max_page_timeout)
def setup(self, settings): ''' Does the actual setup of the middleware ''' # set up the default sc logger my_level = settings.get('SC_LOG_LEVEL', 'INFO') my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') my_output = settings.get('SC_LOG_STDOUT', True) my_json = settings.get('SC_LOG_JSON', False) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = settings.get('SC_LOG_FILE', 'main.log') my_backups = settings.get('SC_LOG_BACKUPS', 5) self.logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups) self.retry_http_codes = set( int(x) for x in settings.getlist('RETRY_HTTP_CODES')) # stats setup self.stats_dict = {} self.settings = settings self.name = self.settings['SPIDER_NAME'] if self.settings['STATS_STATUS_CODES']: self.redis_conn = redis.Redis( host=self.settings.get('REDIS_HOST'), port=self.settings.get('REDIS_PORT'), db=settings.get('REDIS_DB'), password=self.settings.get('REDIS_PASSWORD'), decode_responses=True, socket_timeout=self.settings.get('REDIS_SOCKET_TIMEOUT'), socket_connect_timeout=self.settings.get( 'REDIS_SOCKET_TIMEOUT')) try: self.redis_conn.info() self.logger.debug("Connected to Redis in LogRetryMiddleware") except ConnectionError: self.logger.error( "Failed to connect to Redis in LogRetryMiddleware") # plugin is essential to functionality sys.exit(1) self._setup_stats_status_codes()
def _setup(self): """ Load everything up. Note that any arg here will override both default and custom settings. """ # Set up logging self.logger = LogFactory.get_instance(name='traptor', level=self.log_level) # Set the restart_flag to False self.restart_flag = False # Set up required connections self._setup_kafka() self._setup_birdy()
def setup(self, settings): ''' Does the actual setup of the middleware ''' # set up the default sc logger my_level = settings.get('SC_LOG_LEVEL', 'INFO') my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') my_output = settings.get('SC_LOG_STDOUT', True) my_json = settings.get('SC_LOG_JSON', False) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = settings.get('SC_LOG_FILE', 'main.log') my_backups = settings.get('SC_LOG_BACKUPS', 5) self.logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups) self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) # stats setup self.stats_dict = {} self.settings = settings self.name = self.settings['SPIDER_NAME'] if self.settings['STATS_STATUS_CODES']: self.redis_conn = redis.Redis(host=self.settings.get('REDIS_HOST'), port=self.settings.get('REDIS_PORT'), db=settings.get('REDIS_DB')) try: self.redis_conn.info() self.logger.debug("Connected to Redis in LogRetryMiddleware") except ConnectionError: self.logger.error("Failed to connect to Redis in LogRetryMiddleware") # plugin is essential to functionality sys.exit(1) self._setup_stats_status_codes()
def from_settings(cls, settings): my_level = settings.get('SC_LOG_LEVEL', 'INFO') my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') my_output = settings.get('SC_LOG_STDOUT', True) my_json = settings.get('SC_LOG_JSON', False) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = settings.get('SC_LOG_FILE', 'main.log') my_backups = settings.get('SC_LOG_BACKUPS', 5) my_appids = settings.get('KAFKA_APPID_TOPICS', False) logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups) try: producer = KafkaProducer( bootstrap_servers=settings['KAFKA_HOSTS'], retries=3, linger_ms=settings['KAFKA_PRODUCER_BATCH_LINGER_MS'], buffer_memory=settings['KAFKA_PRODUCER_BUFFER_BYTES'], value_serializer=lambda m: m.encode('utf-8'), max_request_size=settings['KAFKA_PRODUCER_MAX_REQUEST_SIZE']) except Exception as e: logger.error("Unable to connect to Kafka in Pipeline"\ ", raising exit flag.") # this is critical so we choose to exit. # exiting because this is a different thread from the crawlers # and we want to ensure we can connect to Kafka when we boot sys.exit(1) topic_prefix = settings['KAFKA_TOPIC_PREFIX'] use_base64 = settings['KAFKA_BASE_64_ENCODE'] return cls(producer, topic_prefix, logger, appids=my_appids, use_base64=use_base64)
def setup(self, level=None, log_file=None, json=None): """ Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json """ self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance( json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self._decorate_routes() self._spawn_redis_connection_thread() self._spawn_kafka_connection_thread() # spawn heartbeat processing loop self._heartbeat_thread = Thread(target=self._heartbeat_loop) self._heartbeat_thread.setDaemon(True) self._heartbeat_thread.start() self.start_time = self.get_time() # disable flask logger if self.settings['FLASK_LOGGING_ENABLED'] == False: log = logging.getLogger('werkzeug') log.disabled = True self._load_schemas()
def from_settings(cls, settings):
    my_level = settings.get('SC_LOG_LEVEL', 'INFO')
    my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
    my_output = settings.get('SC_LOG_STDOUT', True)
    my_json = settings.get('SC_LOG_JSON', False)
    my_dir = settings.get('SC_LOG_DIR', 'logs')
    my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
    my_file = settings.get('SC_LOG_FILE', 'main.log')
    my_backups = settings.get('SC_LOG_BACKUPS', 5)

    logger = LogFactory.get_instance(json=my_json,
                                     name=my_name,
                                     stdout=my_output,
                                     level=my_level,
                                     dir=my_dir,
                                     file=my_file,
                                     bytes=my_bytes,
                                     backups=my_backups)

    return cls(logger)
def from_crawler(cls, crawler): settings = crawler.settings my_level = settings.get('SC_LOG_LEVEL', 'DEBUG') my_name = "%s_%s" % (crawler.spidercls.name, get_raspberrypi_ip_address()) my_output = settings.get('SC_LOG_STDOUT', False) my_json = settings.get('SC_LOG_JSON', True) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = "%s_%s.log" % (crawler.spidercls.name, get_raspberrypi_ip_address()) my_backups = settings.get('SC_LOG_BACKUPS', 5) cls.logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups) return cls(crawler.settings)
def setup(self, level=None, log_file=None, json=None): """ Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json """ self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance(json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self._decorate_routes() self._spawn_redis_connection_thread() self._spawn_kafka_connection_thread() # spawn heartbeat processing loop self._heartbeat_thread = Thread(target=self._heartbeat_loop) self._heartbeat_thread.setDaemon(True) self._heartbeat_thread.start() self.start_time = self.get_time() # disable flask logger if self.settings['FLASK_LOGGING_ENABLED'] == False: log = logging.getLogger('werkzeug') log.disabled = True self._load_schemas()
def from_settings(cls, settings): server = redis.Redis(host=settings.get('REDIS_HOST'), port=settings.get('REDIS_PORT'), db=settings.get('REDIS_DB')) persist = settings.get('SCHEDULER_PERSIST', True) up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10) hits = settings.get('QUEUE_HITS', 10) window = settings.get('QUEUE_WINDOW', 60) mod = settings.get('QUEUE_MODERATED', False) timeout = settings.get('DUPEFILTER_TIMEOUT', 600) ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60) add_type = settings.get('SCHEDULER_TYPE_ENABLED', False) add_ip = settings.get('SCHEDULER_IP_ENABLED', False) retries = settings.get('SCHEUDLER_ITEM_RETRIES', 3) ip_regex = settings.get('IP_ADDR_REGEX', '.*') backlog_blacklist = settings.get('SCHEDULER_BACKLOG_BLACKLIST', True) queue_timeout = settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600) my_level = settings.get('SC_LOG_LEVEL', 'INFO') my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') my_output = settings.get('SC_LOG_STDOUT', True) my_json = settings.get('SC_LOG_JSON', False) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = settings.get('SC_LOG_FILE', 'main.log') my_backups = settings.get('SC_LOG_BACKUPS', 5) logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups) return cls(server, persist, up_int, timeout, retries, logger, hits, window, mod, ip_refresh, add_type, add_ip, ip_regex, backlog_blacklist, queue_timeout)
def setup(self, settings): ''' Does the actual setup of the middleware ''' # set up the default sc logger my_level = settings.get('SC_LOG_LEVEL', 'INFO') my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') my_output = settings.get('SC_LOG_STDOUT', True) my_json = settings.get('SC_LOG_JSON', False) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = settings.get('SC_LOG_FILE', 'main.log') my_backups = settings.get('SC_LOG_BACKUPS', 5) self.logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups)
def setup(self, settings):
    '''
    Does the actual setup of the middleware
    '''
    # set up the default sc logger
    my_level = settings.get('SC_LOG_LEVEL', 'INFO')
    my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
    my_output = settings.get('SC_LOG_STDOUT', True)
    my_json = settings.get('SC_LOG_JSON', False)
    my_dir = settings.get('SC_LOG_DIR', 'logs')
    my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
    my_file = settings.get('SC_LOG_FILE', 'main.log')
    my_backups = settings.get('SC_LOG_BACKUPS', 5)

    self.logger = LogFactory.get_instance(json=my_json,
                                          name=my_name,
                                          stdout=my_output,
                                          level=my_level,
                                          dir=my_dir,
                                          file=my_file,
                                          bytes=my_bytes,
                                          backups=my_backups)

    self.settings = settings
    self.stats_dict = {}

    # set up redis
    self.redis_conn = redis.Redis(host=settings.get('REDIS_HOST'),
                                  port=settings.get('REDIS_PORT'),
                                  db=settings.get('REDIS_DB'))

    try:
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ScraperHandler")
    except ConnectionError:
        self.logger.error("Failed to connect to Redis in Stats Middleware")
        # plugin is essential to functionality
        sys.exit(1)
def from_settings(cls, settings): my_level = settings.get('SC_LOG_LEVEL', 'INFO') my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') my_output = settings.get('SC_LOG_STDOUT', True) my_json = settings.get('SC_LOG_JSON', False) my_dir = settings.get('SC_LOG_DIR', 'logs') my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') my_file = settings.get('SC_LOG_FILE', 'main.log') my_backups = settings.get('SC_LOG_BACKUPS', 5) my_appids = settings.get('KAFKA_APPID_TOPICS', False) logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups) try: producer = KafkaProducer(bootstrap_servers=settings['KAFKA_HOSTS'], retries=3, linger_ms=settings['KAFKA_PRODUCER_BATCH_LINGER_MS'], buffer_memory=settings['KAFKA_PRODUCER_BUFFER_BYTES']) except Exception as e: logger.error("Unable to connect to Kafka in Pipeline"\ ", raising exit flag.") # this is critical so we choose to exit. # exiting because this is a different thread from the crawlers # and we want to ensure we can connect to Kafka when we boot sys.exit(1) topic_prefix = settings['KAFKA_TOPIC_PREFIX'] use_base64 = settings['KAFKA_BASE_64_ENCODE'] return cls(producer, topic_prefix, logger, appids=my_appids, use_base64=use_base64)
def from_settings(cls, settings): server = redis.Redis(host=settings.get("REDIS_HOST"), port=settings.get("REDIS_PORT")) persist = settings.get("SCHEDULER_PERSIST", True) up_int = settings.get("SCHEDULER_QUEUE_REFRESH", 10) hits = settings.get("QUEUE_HITS", 10) window = settings.get("QUEUE_WINDOW", 60) mod = settings.get("QUEUE_MODERATED", False) timeout = settings.get("DUPEFILTER_TIMEOUT", 600) ip_refresh = settings.get("SCHEDULER_IP_REFRESH", 60) add_type = settings.get("SCHEDULER_TYPE_ENABLED", False) add_ip = settings.get("SCHEDULER_IP_ENABLED", False) retries = settings.get("SCHEUDLER_ITEM_RETRIES", 3) ip_regex = settings.get("IP_ADDR_REGEX", ".*") my_level = settings.get("SC_LOG_LEVEL", "INFO") my_name = settings.get("SC_LOGGER_NAME", "sc-logger") my_output = settings.get("SC_LOG_STDOUT", True) my_json = settings.get("SC_LOG_JSON", False) my_dir = settings.get("SC_LOG_DIR", "logs") my_bytes = settings.get("SC_LOG_MAX_BYTES", "10MB") my_file = settings.get("SC_LOG_FILE", "main.log") my_backups = settings.get("SC_LOG_BACKUPS", 5) logger = LogFactory.get_instance( json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir, file=my_file, bytes=my_bytes, backups=my_backups, ) return cls( server, persist, up_int, timeout, retries, logger, hits, window, mod, ip_refresh, add_type, add_ip, ip_regex )
def run(self): """Do your thang!""" time.sleep(30) # Set up a logger self.logger = LogFactory.get_instance(json=True, stdout=False, name="zktesting", level=self.log_level, dir=self.log_dir, file=self.log_file) # Connect to Zookeeper self.logger.info( "Connecting to zookeeper and obtaining configuration info") self.zk = KazooClient(hosts=self.zk_hosts) self.zk.add_listener(self._my_listener) self.zk.start() # Wait until there is a config available, then grab the first one and start "processing" while True: self._wait_for_config() # Act like we're in business self.logger.info("Configuration complete. Starting main loop.") self._do_your_thang()
def setup(self, level=None, log_file=None, json=None): ''' Load everything up. Note that any arg here will override both default and custom settings @param level: the log level @param log_file: boolean t/f whether to log to a file, else stdout @param json: boolean t/f whether to write the logs in json ''' self.settings = self.wrapper.load(self.settings_name) my_level = level if level else self.settings['LOG_LEVEL'] # negate because logger wants True for std out my_output = not log_file if log_file else self.settings['LOG_STDOUT'] my_json = json if json else self.settings['LOG_JSON'] self.logger = LogFactory.get_instance(json=my_json, stdout=my_output, level=my_level, name=self.settings['LOGGER_NAME'], dir=self.settings['LOG_DIR'], file=self.settings['LOG_FILE'], bytes=self.settings['LOG_MAX_BYTES'], backups=self.settings['LOG_BACKUPS']) self.validator = self.extend_with_default(Draft4Validator)
def main(): """ Command line interface to run a traptor instance. Can pass it flags for debug levels and also --stdout mode, which means it will not write to kafka but stdout instread. """ # Redis connections redis_host = os.getenv('REDIS_HOST', 'localhost') redis_port = int(os.getenv('REDIS_PORT', 6379)) redis_db = int(os.getenv('REDIS_DB', 5)) redis_conn = StrictRedis(host=redis_host, port=redis_port, db=redis_db, decode_responses=True) # Redis pubsub connection pubsub_conn = StrictRedis(host=redis_host, port=redis_port, db=redis_db) # Redis heartbeat connection heartbeat_conn = StrictRedis(host=redis_host, port=redis_port, db=redis_db) # Logging log_level = os.getenv('LOG_LEVEL', 'INFO') log_dir = os.getenv('LOG_DIR', '/var/log/traptor') log_file_name = os.getenv('LOG_FILE_NAME', 'traptor.log') # Twitter api keys api_keys = { 'CONSUMER_KEY': os.getenv('CONSUMER_KEY'), 'CONSUMER_SECRET': os.getenv('CONSUMER_SECRET'), 'ACCESS_TOKEN': os.getenv('ACCESS_TOKEN'), 'ACCESS_TOKEN_SECRET': os.getenv('ACCESS_TOKEN_SECRET') } # Create the traptor instance traptor_instance = Traptor(redis_conn=redis_conn, pubsub_conn=pubsub_conn, heartbeat_conn=heartbeat_conn, traptor_notify_channel=os.getenv('REDIS_PUBSUB_CHANNEL', 'traptor-notify'), rule_check_interval=int(os.getenv('RULE_CHECK_INTERVAL', 60)), traptor_type=os.getenv('TRAPTOR_TYPE', 'track'), traptor_id=int(os.getenv('TRAPTOR_ID', 0)), apikeys=api_keys, kafka_enabled=os.getenv('KAFKA_ENABLED', 'true'), kafka_hosts=os.getenv('KAFKA_HOSTS', 'localhost:9092'), kafka_topic=os.getenv('KAFKA_TOPIC', 'traptor'), use_sentry=os.getenv('USE_SENTRY', 'false'), sentry_url=os.getenv('SENTRY_URL', None), log_level=log_level, log_dir=log_dir, log_file_name=log_file_name, test=False ) # Logger for this main function. The traptor has it's own logger traptor_name = 'traptor-{}-{}'.format(os.getenv('TRAPTOR_TYPE', 'track'), os.getenv('TRAPTOR_ID', 0)) logger = LogFactory.get_instance(json=True, stdout=False, name=traptor_name, level=log_level, dir=log_dir, file=log_file_name) # Wait until all the other containers are up and going... time.sleep(30) # Run the traptor instance try: logger.info('Starting Traptor') logger.debug("Traptor info: {}".format(traptor_instance.__repr__())) traptor_instance.run() except Exception as e: if os.getenv('USE_SENTRY') == 'true': client = Client(os.getenv('SENTRY_URL')) client.captureException() logger.error("Caught exception when starting Traptor", extra={ 'ex': traceback.format_exc() })
import dateutil.parser as parser import os import traceback import json from functools import wraps from dog_whistle import dw_config, dw_callback from scutils.log_factory import LogFactory from traptor import settings from __strings__ import * # Initialize Logging logger = LogFactory.get_instance( name=os.getenv('LOG_NAME', settings.LOG_NAME), json=os.getenv('LOG_JSON', settings.LOG_JSON) == 'True', stdout=os.getenv('LOG_STDOUT', settings.LOG_STDOUT) == 'True', level=os.getenv('LOG_LEVEL', settings.LOG_LEVEL), dir=os.getenv('LOG_DIR', settings.LOG_DIR), file=os.getenv('LOG_FILE', settings.LOG_FILE)) if settings.API_BACKEND == 'piscina': from backends.piscina import get_userid_for_username, get_recent_tweets_by_keyword else: from backends.local import get_userid_for_username, get_recent_tweets_by_keyword if settings.DW_ENABLED: dw_config(settings.DW_CONFIG) logger.register_callback('>=INFO', dw_callback) def validate(rule): status_code = 200
def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
                    'debugging.', add_help=False)
    parser.add_argument('-h', '--help', action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh', '--kafka-host', action='store',
                             required=False,
                             help="The override Kafka host")
    base_parser.add_argument('-s', '--settings', action='store',
                             required=False,
                             help="The settings file to read from",
                             default="localsettings.py")
    base_parser.add_argument(
        '-ll', '--log-level', action='store', required=False,
        help="The log level", default=None,
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list', help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t', '--topic', action='store', required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c', '--consumer', action='store',
                             required=False, default='default',
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b', '--from-beginning', action='store_const',
                             required=False, const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb', '--no-body', action='store_const',
                             required=False, const=True, default=False,
                             help="Do not include the raw html 'body' key in"
                                  " the json dump of the topic")
    dump_parser.add_argument('-p', '--pretty', action='store_const',
                             required=False, const=True, default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d', '--decode-base64', action='store_const',
                             required=False, const=True, default=False,
                             help="Decode the base64 encoded raw html body")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings[
        'KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings[
        'LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    logger.debug("Connecting to {0}...".format(kafka_host))
    try:
        kafka = KafkaClient(kafka_host)
        logger.info("Connected to {0}".format(kafka_host))
    except KafkaUnavailableError as ex:
        message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
            .format(type(ex).__name__, ex.args)
        logger.error(message)
        sys.exit(1)

    if args['command'] == 'list':
        logger.debug('Running list command')
        print "Topics:"
        for topic in kafka.topic_partitions.keys():
            print "-", topic
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        @MethodTimer.timeout(5, None)
        def _hidden():
            try:
                logger.debug("Ensuring topic {t} exists".format(t=topic))
                kafka.ensure_topic_exists(topic)

                logger.debug("Getting Kafka consumer")
                consumer = SimpleConsumer(kafka, consumer_id, topic,
                                          buffer_size=1024 * 100,
                                          fetch_size_bytes=1024 * 100,
                                          max_buffer_size=None)
                return consumer
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                logger.error(message)
                sys.exit(1)

        consumer = _hidden()

        if consumer is None:
            logger.error("Could not fully connect to Kafka within the timeout")
            sys.exit(1)

        if args["from_beginning"]:
            logger.debug("Seeking to beginning")
            consumer.seek(0, 0)
        else:
            logger.debug("Reading from the end")
            consumer.seek(0, 2)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer.get_messages():
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = base64.b64decode(item['body'])
                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except ValueError:
                        logger.info("Message is not a JSON object")
                        item = val
                    body_bytes = len(item)

                    if args['pretty']:
                        print json.dumps(item, indent=4)
                    else:
                        print item
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except:
                logger.error(traceback.print_exc())
                break

        total_mbs = float(total_bytes) / (1024 * 1024)
        if item is not None:
            print "Last item:"
            print json.dumps(item, indent=4)
        if num_records > 0:
            logger.info(
                "Num Records: {n}, Total MBs: {m}, kb per message: {kb}".
                format(n=num_records, m=total_mbs,
                       kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        kafka.close()
        return 0
def main(): """ Command line interface to run a traptor instance. Can pass it flags for debug levels and also --stdout mode, which means it will not write to kafka but stdout instread. """ # Redis connections redis_host = os.getenv('REDIS_HOST', 'localhost') redis_port = int(os.getenv('REDIS_PORT', 6379)) redis_db = int(os.getenv('REDIS_DB', 5)) redis_conn = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_db, decode_responses=True) # Redis pubsub connection pubsub_conn = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_db) # Redis heartbeat connection heartbeat_conn = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_db) # Twitter api keys api_keys = { 'CONSUMER_KEY': os.getenv('CONSUMER_KEY'), 'CONSUMER_SECRET': os.getenv('CONSUMER_SECRET'), 'ACCESS_TOKEN': os.getenv('ACCESS_TOKEN'), 'ACCESS_TOKEN_SECRET': os.getenv('ACCESS_TOKEN_SECRET') } # Create the traptor instance traptor_instance = Traptor(redis_conn=redis_conn, pubsub_conn=pubsub_conn, heartbeat_conn=heartbeat_conn, traptor_notify_channel=os.getenv('REDIS_PUBSUB_CHANNEL', 'traptor-notify'), rule_check_interval=int(os.getenv('RULE_CHECK_INTERVAL', 60)), traptor_type=os.getenv('TRAPTOR_TYPE', 'track'), traptor_id=int(os.getenv('TRAPTOR_ID', 0)), apikeys=api_keys, kafka_enabled=os.getenv('KAFKA_ENABLED', 'true'), kafka_hosts=os.getenv('KAFKA_HOSTS', 'localhost:9092'), kafka_topic=os.getenv('KAFKA_TOPIC', 'traptor'), use_sentry=os.getenv('USE_SENTRY', 'false'), sentry_url=os.getenv('SENTRY_URL', None), test=False, enable_stats_collection=os.getenv('ENABLE_STATS_COLLECTION', 'true') ) # Logger for this main function. The traptor has it's own logger traptor_name = 'traptor-{}-{}'.format(os.getenv('TRAPTOR_TYPE', 'track'), os.getenv('TRAPTOR_ID', 0)) logger = LogFactory.get_instance(name=traptor_name, json=os.getenv('LOG_JSON', settings.LOG_JSON) == 'True', stdout=os.getenv('LOG_STDOUT', settings.LOG_STDOUT) == 'True', level=os.getenv('LOG_LEVEL', settings.LOG_LEVEL), dir=os.getenv('LOG_DIR', settings.LOG_DIR), file=os.getenv('LOG_FILE', settings.LOG_FILE)) if settings.DW_ENABLED: dw_config(settings.DW_CONFIG) logger.register_callback('>=INFO', dw_callback) # Wait until all the other containers are up and going... time.sleep(30) # Run the traptor instance try: logger.info('Starting Traptor') logger.debug("Traptor info: {}".format(traptor_instance.__repr__())) traptor_instance.run() except Exception as e: if os.getenv('USE_SENTRY') == 'true': client = Client(os.getenv('SENTRY_URL')) client.captureException() logger.error("Caught exception when starting Traptor", extra={ 'ex': traceback.format_exc() }) dd_monitoring.increment('traptor_error_occurred', tags=['error_type:traptor_start']) raise e
def setUp(self):
    self.logger = LogFactory.get_instance(name='test',
                                          dir='./',
                                          level='DEBUG',
                                          propagate=True)
def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
                    'debugging.', add_help=False)
    parser.add_argument('-h', '--help', action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh', '--kafka-host', action='store',
                             required=False,
                             help="The override Kafka host")
    base_parser.add_argument('-s', '--settings', action='store',
                             required=False,
                             help="The settings file to read from",
                             default="localsettings.py")
    base_parser.add_argument(
        '-ll', '--log-level', action='store', required=False,
        help="The log level", default=None,
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list', help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t', '--topic', action='store', required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c', '--consumer', action='store',
                             required=False, default=None,
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b', '--from-beginning', action='store_const',
                             required=False, const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb', '--no-body', action='store_const',
                             required=False, const=True, default=False,
                             help="Do not include the raw html 'body' key in"
                                  " the json dump of the topic")
    dump_parser.add_argument('-p', '--pretty', action='store_const',
                             required=False, const=True, default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d', '--decode-base64', action='store_const',
                             required=False, const=True, default=False,
                             help="Decode the base64 encoded raw html body")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings[
        'KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings[
        'LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    if args['command'] == 'list':
        try:
            logger.debug("Connecting to {0}...".format(kafka_host))
            kafka = KafkaClient(kafka_host)
            logger.info("Connected to {0}".format(kafka_host))
        except KafkaUnavailableError as ex:
            message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
                .format(type(ex).__name__, ex.args)
            logger.error(message)
            sys.exit(1)

        logger.debug('Running list command')
        print("Topics:")
        for topic in list(kafka.topic_partitions.keys()):
            print("-", topic)
        kafka.close()
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        try:
            logger.debug("Getting Kafka consumer")

            offset = 'earliest' if args["from_beginning"] else 'latest'

            consumer = KafkaConsumer(
                # Consume messages from the demo.crawled_firehose topic
                topic,
                group_id=consumer_id,
                bootstrap_servers=kafka_host,
                consumer_timeout_ms=settings['KAFKA_CONSUMER_TIMEOUT'],
                auto_offset_reset=offset,
                auto_commit_interval_ms=settings[
                    'KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                enable_auto_commit=settings[
                    'KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                max_partition_fetch_bytes=settings[
                    'KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
        except NoBrokersAvailable as ex:
            logger.error('Unable to connect to Kafka')
            sys.exit(1)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer:
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = base64.b64decode(item['body'])
                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except ValueError:
                        logger.info("Message is not a JSON object")
                        item = val
                    body_bytes = len(item)

                    if args['pretty']:
                        print(json.dumps(item, indent=4))
                    else:
                        print(item)
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except:
                logger.error(traceback.print_exc())
                break

        total_mbs = old_div(float(total_bytes), (1024 * 1024))
        if item is not None:
            print("Last item:")
            print(json.dumps(item, indent=4))
        if num_records > 0:
            logger.info(
                "Num Records: {n}, Total MBs: {m}, kb per message: {kb}".
                format(n=num_records, m=total_mbs,
                       kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        try:
            consumer.close()
        except:
            # Exception is thrown when group_id is None.
            # See https://github.com/dpkp/kafka-python/issues/619
            pass
        return 0
import time import pymongo from settings_kafkadump import MONGODB_DB, MONGODB_PORT, MONGODB_SERVER, IMAGES_STORE from docopt import docopt from scutils.log_factory import LogFactory mongodb_server = MONGODB_SERVER mongodb_port = MONGODB_PORT mongodb_conn = pymongo.MongoClient(mongodb_server, mongodb_port) mongodb_db = mongodb_conn[MONGODB_DB] logger = LogFactory.get_instance(json=False, stdout=False, level='DEBUG', name='dump_to_mongodb', dir='logs', file='dump_to_mongodb.log', bytes='1000MB', backups=5) def _insert_item_to_monggodb(item): if 'meta' not in item: return try: collection = mongodb_db[item['meta']['collection_name']] collection.insert(item) print("item['meta']['collection_name']===", item['meta']['collection_name']) logger.info("item['meta']['collection_name']===%s" %
import time import datetime import importlib import uuid from docopt import docopt from kafka import KafkaClient, SimpleConsumer, SimpleProducer from os.path import splitext, basename, exists from urlparse import urlparse import hashlib from websocket import create_connection from scutils.log_factory import LogFactory logger = LogFactory.get_instance(json=False, stdout=False, level='DEBUG', name='aria2_dispatch', dir='logs', file='aria2_dispatch.log', bytes='1000MB', backups=5) def sha1(x): return hashlib.sha1(x).hexdigest() class Aria2Dispatcher: def __init__(self, host, topic, consumer_id, settings): self.host = host self.topic = topic self.consumer_id = consumer_id or "Aria2Dispatcher" self.settings = importlib.import_module(settings[:-3])
def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
                    'debugging.', add_help=False)
    parser.add_argument('-h', '--help', action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh', '--kafka-host', action='store',
                             required=False,
                             help="The override Kafka host")
    base_parser.add_argument('-s', '--settings', action='store',
                             required=False,
                             help="The settings file to read from",
                             default="localsettings.py")
    base_parser.add_argument('-ll', '--log-level', action='store',
                             required=False, help="The log level",
                             default=None,
                             choices=['DEBUG', 'INFO', 'WARNING', 'ERROR',
                                      'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list', help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t', '--topic', action='store', required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c', '--consumer', action='store',
                             required=False, default='default',
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b', '--from-beginning', action='store_const',
                             required=False, const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb', '--no-body', action='store_const',
                             required=False, const=True, default=False,
                             help="Do not include the raw html 'body' key in"
                                  " the json dump of the topic")
    dump_parser.add_argument('-p', '--pretty', action='store_const',
                             required=False, const=True, default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d', '--decode-base64', action='store_const',
                             required=False, const=True, default=False,
                             help="Decode the base64 encoded raw html body")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings['KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings['LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    logger.debug("Connecting to {0}...".format(kafka_host))
    try:
        kafka = KafkaClient(kafka_host)
        logger.info("Connected to {0}".format(kafka_host))
    except KafkaUnavailableError as ex:
        message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
            .format(type(ex).__name__, ex.args)
        logger.error(message)
        sys.exit(1)

    if args['command'] == 'list':
        logger.debug('Running list command')
        print "Topics:"
        for topic in kafka.topic_partitions.keys():
            print "-", topic
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        @MethodTimer.timeout(5, None)
        def _hidden():
            try:
                logger.debug("Ensuring topic {t} exists".format(t=topic))
                kafka.ensure_topic_exists(topic)

                logger.debug("Getting Kafka consumer")
                consumer = SimpleConsumer(kafka, consumer_id, topic,
                                          buffer_size=1024*100,
                                          fetch_size_bytes=1024*100,
                                          max_buffer_size=None)
                return consumer
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                logger.error(message)
                sys.exit(1)

        consumer = _hidden()

        if consumer is None:
            logger.error("Could not fully connect to Kafka within the timeout")
            sys.exit(1)

        if args["from_beginning"]:
            logger.debug("Seeking to beginning")
            consumer.seek(0, 0)
        else:
            logger.debug("Reading from the end")
            consumer.seek(0, 2)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer.get_messages():
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = base64.b64decode(item['body'])
                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except ValueError:
                        logger.info("Message is not a JSON object")
                        item = val
                    body_bytes = len(item)

                    if args['pretty']:
                        print json.dumps(item, indent=4)
                    else:
                        print item
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except:
                logger.error(traceback.print_exc())
                break

        total_mbs = float(total_bytes) / (1024*1024)
        if item is not None:
            print "Last item:"
            print json.dumps(item, indent=4)
        if num_records > 0:
            logger.info("Num Records: {n}, Total MBs: {m}, kb per message: {kb}"
                        .format(n=num_records, m=total_mbs,
                                kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        kafka.close()
        return 0
from settings_kafkadump import MONGODB_DB, MONGODB_PORT, MONGODB_SERVER, IMAGES_STORE from docopt import docopt from scutils.log_factory import LogFactory mongodb_server = MONGODB_SERVER mongodb_port = MONGODB_PORT mongodb_conn = pymongo.MongoClient(mongodb_server, mongodb_port) mongodb_db = mongodb_conn[MONGODB_DB] logger = LogFactory.get_instance( json=False, stdout=False, level='DEBUG', name='dump_to_mongodb', dir='logs', file='dump_to_mongodb.log', bytes='1000MB', backups=5 ) def _insert_item_to_monggodb(item): if 'meta' not in item: return try: collection = mongodb_db[item['meta']['collection_name']] collection.insert(item) print("item['meta']['collection_name']===", item['meta']['collection_name']) logger.info("item['meta']['collection_name']===%s" % item['meta']['collection_name'])
'--log-json', action='store_const', required=False, const=True, default=False, help="Log the data in JSON format") parser.add_argument('-ie', '--include-extra', action='store_const', const=True, default=False, help="Print the 'extra' dict if not logging" " to json") args = vars(parser.parse_args()) logger = LogFactory.get_instance(level=args['log_level'], stdout=args['log_file'], json=args['log_json'], include_extra=args['include_extra']) my_var = 1 def the_callback(log_message, log_extras): global my_var my_var += 5 def the_callback_2(log_message, log_extras): global my_var my_var *= 2
def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
                    'debugging.', add_help=False)
    parser.add_argument('-h', '--help', action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh', '--kafka-host', action='store',
                             required=False,
                             help="The override Kafka host")
    base_parser.add_argument('-s', '--settings', action='store',
                             required=False,
                             help="The settings file to read from",
                             default="localsettings.py")
    base_parser.add_argument('-ll', '--log-level', action='store',
                             required=False, help="The log level",
                             default=None,
                             choices=['DEBUG', 'INFO', 'WARNING', 'ERROR',
                                      'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list', help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t', '--topic', action='store', required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c', '--consumer', action='store',
                             required=False, default=None,
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b', '--from-beginning', action='store_const',
                             required=False, const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb', '--no-body', action='store_const',
                             required=False, const=True, default=False,
                             help="Do not include the raw html 'body' key in"
                                  " the json dump of the topic")
    dump_parser.add_argument('-p', '--pretty', action='store_const',
                             required=False, const=True, default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d', '--decode-base64', action='store_const',
                             required=False, const=True, default=False,
                             help="Decode the base64 encoded raw html body")
    dump_parser.add_argument('-m', '--mongodb', action="store",
                             help="Set mongodb to save webpages")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings['KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings['LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    if args['command'] == 'list':
        try:
            logger.debug("Connecting to {0}...".format(kafka_host))
            kafka = SimpleClient(kafka_host)
            logger.info("Connected to {0}".format(kafka_host))
        except KafkaUnavailableError as ex:
            message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
                .format(type(ex).__name__, ex.args)
            logger.error(message)
            sys.exit(1)

        logger.debug('Running list command')
        print("Topics:")
        for topic in list(kafka.topic_partitions.keys()):
            print("-", topic)
        kafka.close()
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        try:
            logger.debug("Getting Kafka consumer")

            offset = 'earliest' if args["from_beginning"] else 'latest'

            consumer = KafkaConsumer(
                topic,
                group_id=consumer_id,
                bootstrap_servers=kafka_host,
                consumer_timeout_ms=settings['KAFKA_CONSUMER_TIMEOUT'],
                auto_offset_reset=offset,
                auto_commit_interval_ms=settings['KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                enable_auto_commit=settings['KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                max_partition_fetch_bytes=settings['KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
        except NoBrokersAvailable as ex:
            logger.error('Unable to connect to Kafka')
            sys.exit(1)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer:
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = str(base64.b64decode(item['body']))
                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except BaseException as msg:
                        logger.info("Message is not a JSON object")
                        logger.info("base64 error: ", msg)
                        item = val
                    body_bytes = len(item)

                    if args['pretty']:
                        print(json.dumps(item, indent=4))
                    else:
                        print(item)
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except:
                logger.error(traceback.print_exc())
                break

        total_mbs = old_div(float(total_bytes), (1024*1024))
        if item is not None:
            print("Last item:")
            print(json.dumps(item, indent=4))
        if num_records > 0:
            logger.info("Num Records: {n}, Total MBs: {m}, kb per message: {kb}"
                        .format(n=num_records, m=total_mbs,
                                kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        try:
            consumer.close()
        except:
            # Exception is thrown when group_id is None.
            # See https://github.com/dpkp/kafka-python/issues/619
            pass
        return 0
import yaml from scutils.log_factory import LogFactory from kazoo.client import KazooClient, KazooState from kazoo.exceptions import NoNodeError # Set up logging logger = LogFactory.get_instance(json=True, stdout=True, name="zksetup", level='INFO', dir='logs', file='zksetup.log') # Create a zookeeper listener def _my_listener(state): if state == KazooState.LOST: # Register somewhere that the session was lost logger.warning("Zookeeper session lost: {}".format(state)) elif state == KazooState.SUSPENDED: # Handle being disconnected from Zookeeper logger.warning("Zookeeper session suspended: {}".format(state)) else: # Handle being connected/reconnected to Zookeeper logger.info("Connected to zookeeper: {}".format(state)) # Connect to Zookeeper try: logger.info("Connecting to zookeeper") zk = KazooClient(hosts='localhost:2181') zk.add_listener(_my_listener) zk.start()
'gauges': [("Too high!", [ ("too_high", "wrapper.value"), ("too_high_gauge2", "key2"), ])] }, 'options': { # use statsd for local testing, see docs 'statsd_host': 'localhost', 'statsd_port': 8125, 'local': True, } } dw_config(settings) logger = LogFactory.get_instance(level='INFO') logger.register_callback('*', dw_callback) for i in xrange(0, 10): logger.info("this is a test") val = 50 for i in xrange(0, 10): val += randrange(-1, 2, 1) logger.info("I have a guage and this message is really long and stuff", {'value': i}) #logger.warn("bad line " + str(val)) if val > 50: logger.error("Too high!", {'wrapper': {'value': val}, 'key2': 11})