def __init__(self, settings_name, unit_test=False):
    '''
    @param settings_name: the local settings file name
    @param unit_test: whether running unit tests or not
    '''
    self.settings_name = settings_name
    self.wrapper = SettingsWrapper()
    self.logger = None
    self.unit_test = unit_test
def __init__(self, tasks, redis_conn):
    self.tasks = tasks  # initial URL seed queue
    self.redis_conn = redis_conn
    self.wrapper = SettingsWrapper()
    self.spiders = []  # currently running spider nodes
    self.spiders_weights = None  # weights of the current spider nodes
    self.settings = None
    self.logger = None
def __init__(self, settings_name, unit_test=False):
    '''
    @param settings_name: the local settings file name
    @param unit_test: whether running unit tests or not
    '''
    self.settings_name = settings_name
    self.wrapper = SettingsWrapper()
    self.logger = None
    self.unit_test = unit_test
    self.my_uuid = str(uuid.uuid4()).split('-')[4]
def __init__(self, tasks, server):
    self.tasks = tasks  # initial URL seed queue
    self.server = server
    self.wrapper = SettingsWrapper()
    self.spiders = []  # currently running spider nodes
    self.spider_count = 0  # number of currently running spider nodes
    self.chose = None  # consistent-hash distribution of spiders
    self.settings = None
    self.logger = None
def __init__(self, settings_name):
    """
    @param settings_name: the local settings file name
    """
    self.settings_name = settings_name
    self.wrapper = SettingsWrapper()
    self.logger = None
    self.app = Flask(__name__)
    self.kafka_connected = False
    self.redis_connected = False
    self.my_uuid = str(uuid.uuid4()).split('-')[4]
    self.uuids = {}
    self.uuids_lock = threading.Lock()
    self.validator = self._extend_with_default(Draft4Validator)
    self.schemas = {}
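Across these constructors the pattern is the same: stash the settings file name, create a SettingsWrapper, and defer both the actual load and logger creation to a later setup step. A minimal sketch of that second step, assuming a hypothetical Monitor class and setup() method name, with the LogFactory call borrowed from the dump utilities below:

    from scutils.log_factory import LogFactory
    from scutils.settings_wrapper import SettingsWrapper


    class Monitor(object):
        def __init__(self, settings_name, unit_test=False):
            self.settings_name = settings_name
            self.wrapper = SettingsWrapper()
            self.logger = None
            self.unit_test = unit_test

        def setup(self):
            # load the local settings file named in the constructor
            self.settings = self.wrapper.load(self.settings_name)
            # only now is there a LOG_LEVEL to hand to the logger factory
            self.logger = LogFactory.get_instance(
                level=self.settings['LOG_LEVEL'], name='monitor')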
# imports inferred from the calls below (kafka-python, scutils, python-future)
import argparse
import base64
import json
import sys
import traceback

from kafka import KafkaClient, KafkaConsumer
from kafka.errors import KafkaUnavailableError, NoBrokersAvailable
from past.utils import old_div
from scutils.argparse_helper import ArgparseHelper
from scutils.log_factory import LogFactory
from scutils.settings_wrapper import SettingsWrapper


def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
                    'debugging.',
        add_help=False)
    parser.add_argument('-h', '--help', action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh', '--kafka-host', action='store',
                             required=False, help="The override Kafka host")
    base_parser.add_argument('-s', '--settings', action='store',
                             required=False,
                             help="The settings file to read from",
                             default="localsettings.py")
    base_parser.add_argument('-ll', '--log-level', action='store',
                             required=False, help="The log level",
                             default=None,
                             choices=['DEBUG', 'INFO', 'WARNING', 'ERROR',
                                      'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list', help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t', '--topic', action='store', required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c', '--consumer', action='store',
                             required=False, default=None,
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b', '--from-beginning', action='store_const',
                             required=False, const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb', '--no-body', action='store_const',
                             required=False, const=True, default=False,
                             help="Do not include the raw html 'body' key in"
                                  " the json dump of the topic")
    dump_parser.add_argument('-p', '--pretty', action='store_const',
                             required=False, const=True, default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d', '--decode-base64', action='store_const',
                             required=False, const=True, default=False,
                             help="Decode the base64 encoded raw html body")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] \
        else settings['KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] \
        else settings['LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    if args['command'] == 'list':
        try:
            logger.debug("Connecting to {0}...".format(kafka_host))
            kafka = KafkaClient(kafka_host)
            logger.info("Connected to {0}".format(kafka_host))
        except KafkaUnavailableError as ex:
            message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
                .format(type(ex).__name__, ex.args)
            logger.error(message)
            sys.exit(1)
        logger.debug('Running list command')
        print("Topics:")
        for topic in list(kafka.topic_partitions.keys()):
            print("-", topic)
        kafka.close()
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        try:
            logger.debug("Getting Kafka consumer")
            offset = 'earliest' if args["from_beginning"] else 'latest'
            # consume messages from the requested topic,
            # e.g. demo.crawled_firehose
            consumer = KafkaConsumer(
                topic,
                group_id=consumer_id,
                bootstrap_servers=kafka_host,
                consumer_timeout_ms=settings['KAFKA_CONSUMER_TIMEOUT'],
                auto_offset_reset=offset,
                auto_commit_interval_ms=settings[
                    'KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                enable_auto_commit=settings[
                    'KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                max_partition_fetch_bytes=settings[
                    'KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
        except NoBrokersAvailable:
            logger.error('Unable to connect to Kafka')
            sys.exit(1)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer:
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = base64.b64decode(item['body'])
                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except ValueError:
                        logger.info("Message is not a JSON object")
                        item = val
                    body_bytes = len(item)
                    if args['pretty']:
                        print(json.dumps(item, indent=4))
                    else:
                        print(item)
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except Exception:
                # format_exc() returns the traceback as a string so it can
                # actually be logged (print_exc() returns None)
                logger.error(traceback.format_exc())
                break

        total_mbs = old_div(float(total_bytes), (1024 * 1024))
        if item is not None:
            print("Last item:")
            print(json.dumps(item, indent=4))
        if num_records > 0:
            logger.info(
                "Num Records: {n}, Total MBs: {m}, kb per message: {kb}"
                .format(n=num_records, m=total_mbs,
                        kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        try:
            consumer.close()
        except Exception:
            # Exception is thrown when group_id is None.
            # See https://github.com/dpkp/kafka-python/issues/619
            pass
        return 0
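Assuming the module above is saved as kafkadump.py (the 'kafkadump' logger name suggests as much), typical invocations look like this; demo.crawled_firehose is the topic named in the consumer comment:

    python kafkadump.py list -s localsettings.py
    python kafkadump.py dump -t demo.crawled_firehose -p --from-beginning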
import argparse

from scutils.settings_wrapper import SettingsWrapper

# set up arg parser
parser = argparse.ArgumentParser(
    description='Example SettingsWrapper parser.\n')
parser.add_argument('-s', '--settings', action='store', required=False,
                    help="The default settings file",
                    default="settings.py")
parser.add_argument('-o', '--override-settings', action='store',
                    required=False,
                    help="The override settings file",
                    default="localsettings.py")
parser.add_argument('-v', '--variable', action='store', required=False,
                    help="The variable to print out", default=None)
args = vars(parser.parse_args())

# load up settings
wrapper = SettingsWrapper()
my_settings = wrapper.load(default=args['settings'],
                           local=args['override_settings'])

if args['variable'] is not None:
    if args['variable'] in my_settings:
        print(args['variable'], '=', my_settings[args['variable']])
    else:
        print(args['variable'], "not in loaded settings")
else:
    print("Full settings:", my_settings)
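To exercise this script, assuming it is saved as example.py alongside a settings.py containing a single line such as HOST = 'localhost' (the file name and variable are illustrative), a run might look like:

    $ python example.py -v HOST
    HOST = localhost
    $ python example.py
    Full settings: {'HOST': 'localhost'}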
def setUp(self):
    self.wrapper = SettingsWrapper()
def setUp(self):
    self.wrapper = SettingsWrapper()
    self.wrapper.my_settings = {}
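These two fixtures come from unit tests; a minimal self-contained sketch of a test built on the same pattern, using only the load() call and my_settings attribute seen elsewhere in these examples (the test name is illustrative):

    import unittest

    from scutils.settings_wrapper import SettingsWrapper


    class TestSettingsWrapper(unittest.TestCase):
        def setUp(self):
            self.wrapper = SettingsWrapper()
            # start each test from a clean, empty settings dict
            self.wrapper.my_settings = {}

        def test_starts_empty(self):
            # with no files loaded, the merged settings stay empty
            self.assertEqual(self.wrapper.my_settings, {})


    if __name__ == '__main__':
        unittest.main()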
# imports inferred from the calls below (older kafka-python API, scutils)
import argparse
import base64
import json
import sys
import traceback

from kafka import KafkaClient, SimpleConsumer
from kafka.common import KafkaUnavailableError
from scutils.argparse_helper import ArgparseHelper
from scutils.log_factory import LogFactory
from scutils.method_timer import MethodTimer
from scutils.settings_wrapper import SettingsWrapper


def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
                    'debugging.',
        add_help=False)
    parser.add_argument('-h', '--help', action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh', '--kafka-host', action='store',
                             required=False, help="The override Kafka host")
    base_parser.add_argument('-s', '--settings', action='store',
                             required=False,
                             help="The settings file to read from",
                             default="localsettings.py")
    base_parser.add_argument('-ll', '--log-level', action='store',
                             required=False, help="The log level",
                             default=None,
                             choices=['DEBUG', 'INFO', 'WARNING', 'ERROR',
                                      'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list', help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump', help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t', '--topic', action='store', required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c', '--consumer', action='store',
                             required=False, default='default',
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b', '--from-beginning', action='store_const',
                             required=False, const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb', '--no-body', action='store_const',
                             required=False, const=True, default=False,
                             help="Do not include the raw html 'body' key in"
                                  " the json dump of the topic")
    dump_parser.add_argument('-p', '--pretty', action='store_const',
                             required=False, const=True, default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d', '--decode-base64', action='store_const',
                             required=False, const=True, default=False,
                             help="Decode the base64 encoded raw html body")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] \
        else settings['KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] \
        else settings['LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    logger.debug("Connecting to {0}...".format(kafka_host))
    try:
        kafka = KafkaClient(kafka_host)
        logger.info("Connected to {0}".format(kafka_host))
    except KafkaUnavailableError as ex:
        message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
            .format(type(ex).__name__, ex.args)
        logger.error(message)
        sys.exit(1)

    if args['command'] == 'list':
        logger.debug('Running list command')
        print("Topics:")
        for topic in kafka.topic_partitions.keys():
            print("-", topic)
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        # return None (the decorator's second argument) if the connection
        # attempt does not finish within 5 seconds
        @MethodTimer.timeout(5, None)
        def _hidden():
            try:
                logger.debug("Ensuring topic {t} exists".format(t=topic))
                kafka.ensure_topic_exists(topic)
                logger.debug("Getting Kafka consumer")
                consumer = SimpleConsumer(kafka, consumer_id, topic,
                                          buffer_size=1024 * 100,
                                          fetch_size_bytes=1024 * 100,
                                          max_buffer_size=None)
                return consumer
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occurred. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                logger.error(message)
                sys.exit(1)

        consumer = _hidden()
        if consumer is None:
            logger.error("Could not fully connect to Kafka within the timeout")
            sys.exit(1)

        if args["from_beginning"]:
            logger.debug("Seeking to beginning")
            consumer.seek(0, 0)
        else:
            logger.debug("Reading from the end")
            consumer.seek(0, 2)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer.get_messages():
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = base64.b64decode(item['body'])
                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except ValueError:
                        logger.info("Message is not a JSON object")
                        item = val
                    body_bytes = len(item)
                    if args['pretty']:
                        print(json.dumps(item, indent=4))
                    else:
                        print(item)
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except Exception:
                # log the formatted traceback (print_exc() returns None)
                logger.error(traceback.format_exc())
                break

        total_mbs = float(total_bytes) / (1024 * 1024)
        if item is not None:
            print("Last item:")
            print(json.dumps(item, indent=4))
        if num_records > 0:
            logger.info(
                "Num Records: {n}, Total MBs: {m}, kb per message: {kb}"
                .format(n=num_records, m=total_mbs,
                        kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        kafka.close()
        return 0
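The @MethodTimer.timeout(5, None) decorator above makes _hidden() return None if the wrapped call does not finish within five seconds, which is why the caller checks for None afterward. A minimal sketch of that behavior in isolation, using only the scutils API seen above (the function name and sleep are illustrative stand-ins for a hanging connection attempt):

    import time

    from scutils.method_timer import MethodTimer


    @MethodTimer.timeout(2, None)
    def _slow_connect():
        # stand-in for a Kafka connection attempt that hangs
        time.sleep(5)
        return "connected"


    result = _slow_connect()
    print(result)  # None, because the call exceeded the 2 second limit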