Example #1
 def __init__(self, settings_name, unit_test=False):
     '''
     @param settings_name: the local settings file name
     @param unit_test: whether running unit tests or not
     '''
     self.settings_name = settings_name
     self.wrapper = SettingsWrapper()
     self.logger = None
     self.unit_test = unit_test
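Note that wrapper is constructed here but no settings are loaded yet, and logger starts as None. A minimal sketch of the follow-up step (a hypothetical setup() method; the real class performs this wiring elsewhere, in the style of Examples 6 and 7 below):

 def setup(self):
     # load the local settings file named in __init__ ...
     self.settings = self.wrapper.load(self.settings_name)
     # ... then build the logger from the loaded log level
     self.logger = LogFactory.get_instance(
         level=self.settings['LOG_LEVEL'], name='example')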
Example #2
    def __init__(self, tasks, redis_conn):

        self.tasks = tasks  # initial URL seed queue
        self.redis_conn = redis_conn
        self.wrapper = SettingsWrapper()

        self.spiders = []  # spider nodes currently running
        self.spiders_weights = None  # weights of the current spider nodes
        self.settings = None
        self.logger = None
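The spiders_weights attribute suggests work is assigned to spider nodes in proportion to a weight. A minimal weighted-choice sketch (a hypothetical helper; the actual selection logic is not shown in this snippet):

import random

def pick_spider(spiders, weights):
    # pick one spider with probability proportional to its weight
    total = sum(weights)
    r = random.uniform(0, total)
    upto = 0.0
    for spider, weight in zip(spiders, weights):
        upto += weight
        if r <= upto:
            return spider
    return spiders[-1]  # guard against floating-point drift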
Example #3
 def __init__(self, settings_name, unit_test=False):
     '''
     @param settings_name: the local settings file name
     @param unit_test: whether running unit tests or not
     '''
     self.settings_name = settings_name
     self.wrapper = SettingsWrapper()
     self.logger = None
     self.unit_test = unit_test
     self.my_uuid = str(uuid.uuid4()).split('-')[4]
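The my_uuid line keeps only the last segment of a random UUID as a short instance identifier:

import uuid

u = str(uuid.uuid4())       # e.g. '1f2e3d4c-5b6a-4789-8abc-0123456789ab'
short_id = u.split('-')[4]  # final segment: 12 hex characters, e.g. '0123456789ab'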
Example #4
    def __init__(self, tasks, server):

        self.tasks = tasks  # initial URL seed queue
        self.server = server
        self.wrapper = SettingsWrapper()

        self.spiders = []  # spider nodes currently running
        self.spider_count = 0  # number of spider nodes currently running
        self.chose = None  # consistent-hash distribution across the nodes
        self.settings = None
        self.logger = None
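The chose attribute is described as a consistent-hash distribution: tasks map onto spider nodes via a hash ring, so adding or removing a node only remaps a small share of keys. A minimal ring sketch (an assumption; the project may well use a library for this):

import bisect
import hashlib

class HashRing(object):
    def __init__(self, nodes, replicas=100):
        # place each node at several points (replicas) on the ring
        # to smooth out the distribution
        self.ring = {}
        self.sorted_keys = []
        for node in nodes:
            for i in range(replicas):
                key = self._hash('{0}:{1}'.format(node, i))
                self.ring[key] = node
                bisect.insort(self.sorted_keys, key)

    def _hash(self, value):
        return int(hashlib.md5(value.encode('utf-8')).hexdigest(), 16)

    def get_node(self, key):
        # walk clockwise to the first node at or after the key's position
        if not self.sorted_keys:
            return None
        idx = bisect.bisect(self.sorted_keys, self._hash(key)) % len(self.sorted_keys)
        return self.ring[self.sorted_keys[idx]]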
Example #5
 def __init__(self, settings_name):
     """
     @param settings_name: the local settings file name
     """
     self.settings_name = settings_name
     self.wrapper = SettingsWrapper()
     self.logger = None
     self.app = Flask(__name__)
     self.kafka_connected = False
     self.redis_connected = False
     self.my_uuid = str(uuid.uuid4()).split('-')[4]
     self.uuids = {}
     self.uuids_lock = threading.Lock()
     self.validator = self._extend_with_default(Draft4Validator)
     self.schemas = {}
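The _extend_with_default(Draft4Validator) call points at the standard jsonschema recipe for a validator class that also fills in schema defaults while validating. A sketch of that recipe (adapted from the jsonschema documentation; the class's actual helper may differ in detail):

from jsonschema import Draft4Validator, validators

def extend_with_default(validator_class):
    validate_properties = validator_class.VALIDATORS['properties']

    def set_defaults(validator, properties, instance, schema):
        # write schema defaults into the instance, then validate as usual
        for prop, subschema in properties.items():
            if 'default' in subschema:
                instance.setdefault(prop, subschema['default'])
        for error in validate_properties(validator, properties, instance, schema):
            yield error

    return validators.extend(validator_class, {'properties': set_defaults})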
Example #6
import argparse
import base64
import json
import sys
import traceback

from kafka import KafkaClient, KafkaConsumer
from kafka.errors import KafkaUnavailableError, NoBrokersAvailable
from past.utils import old_div

from scutils.log_factory import LogFactory
from scutils.settings_wrapper import SettingsWrapper

# ArgparseHelper (the custom -h action used below) is defined elsewhere in the
# project and is not reproduced here.


def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
        'debugging.',
        add_help=False)
    parser.add_argument('-h',
                        '--help',
                        action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh',
                             '--kafka-host',
                             action='store',
                             required=False,
                             help="The override Kafka host")
    base_parser.add_argument('-s',
                             '--settings',
                             action='store',
                             required=False,
                             help="The settings file to read from",
                             default="localsettings.py")
    base_parser.add_argument(
        '-ll',
        '--log-level',
        action='store',
        required=False,
        help="The log level",
        default=None,
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list',
                                        help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump',
                                        help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t',
                             '--topic',
                             action='store',
                             required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c',
                             '--consumer',
                             action='store',
                             required=False,
                             default=None,
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b',
                             '--from-beginning',
                             action='store_const',
                             required=False,
                             const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb',
                             '--no-body',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Do not include the raw html 'body' key in"
                             " the json dump of the topic")
    dump_parser.add_argument('-p',
                             '--pretty',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d',
                             '--decode-base64',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Decode the base64 encoded raw html body")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings[
        'KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings[
        'LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    if args['command'] == 'list':
        try:
            logger.debug("Connecting to {0}...".format(kafka_host))
            kafka = KafkaClient(kafka_host)
            logger.info("Connected to {0}".format(kafka_host))
        except KafkaUnavailableError as ex:
            message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                .format(type(ex).__name__, ex.args)
            logger.error(message)
            sys.exit(1)
        logger.debug('Running list command')
        print("Topics:")
        for topic in list(kafka.topic_partitions.keys()):
            print("-", topic)
        kafka.close()
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        try:
            logger.debug("Getting Kafka consumer")

            offset = 'earliest' if args["from_beginning"] else 'latest'

            consumer = KafkaConsumer(  # consume messages from the demo.crawled_firehose topic
                topic,
                group_id=consumer_id,
                bootstrap_servers=kafka_host,
                consumer_timeout_ms=settings['KAFKA_CONSUMER_TIMEOUT'],
                auto_offset_reset=offset,
                auto_commit_interval_ms=settings[
                    'KAFKA_CONSUMER_COMMIT_INTERVAL_MS'],
                enable_auto_commit=settings[
                    'KAFKA_CONSUMER_AUTO_COMMIT_ENABLE'],
                max_partition_fetch_bytes=settings[
                    'KAFKA_CONSUMER_FETCH_MESSAGE_MAX_BYTES'])
        except NoBrokersAvailable as ex:
            logger.error('Unable to connect to Kafka')
            sys.exit(1)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer:
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = base64.b64decode(item['body'])

                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except ValueError:
                        logger.info("Message is not a JSON object")
                        item = val
                    body_bytes = len(val)  # measure the raw message, not the parsed object

                    if args['pretty']:
                        print(json.dumps(item, indent=4))
                    else:
                        print(item)
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except Exception:
                logger.error(traceback.format_exc())
                break

        total_mbs = old_div(float(total_bytes), (1024 * 1024))
        if item is not None:
            print("Last item:")
            print(json.dumps(item, indent=4))
        if num_records > 0:
            logger.info(
                "Num Records: {n}, Total MBs: {m}, kb per message: {kb}".
                format(n=num_records,
                       m=total_mbs,
                       kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        try:
            consumer.close()
        except Exception:
            # Exception is thrown when group_id is None.
            # See https://github.com/dpkp/kafka-python/issues/619
            pass
        return 0
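Typical invocations, given the subcommands defined above (the script file name is an assumption): "python kafkadump.py list" prints all topics, and "python kafkadump.py dump -t demo.crawled_firehose -p --from-beginning" pretty-prints a topic from its first offset.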
Example #7
import argparse
from scutils.settings_wrapper import SettingsWrapper

# set up arg parser
parser = argparse.ArgumentParser(
    description='Example SettingsWrapper parser.\n')
parser.add_argument('-s', '--settings', action='store', required=False,
                    help="The default settings file",
                    default="settings.py")
parser.add_argument('-o', '--override-settings', action='store', required=False,
                    help="The override settings file",
                    default="localsettings.py")
parser.add_argument('-v', '--variable', action='store', required=False,
                    help="The variable to print out",
                    default=None)
args = vars(parser.parse_args())

# load up settings
wrapper = SettingsWrapper()
my_settings = wrapper.load(default=args['settings'],
                           local=args['override_settings'])

if args['variable'] is not None:
    if args['variable'] in my_settings:
        print(args['variable'], '=', my_settings[args['variable']])
    else:
        print(args['variable'], "not in loaded settings")
else:
    print("Full settings:", my_settings)
Example #8
 def setUp(self):
     self.wrapper = SettingsWrapper()
Example #9
 def setUp(self):
     self.wrapper = SettingsWrapper()
     self.wrapper.my_settings = {}
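A test body that could follow either setUp (a hedged sketch; load_from_string() is the SettingsWrapper method that parses settings from a raw source string):

 def test_load_from_string(self):
     settings = self.wrapper.load_from_string("MY_SETTING = 'value'")
     self.assertEqual(settings['MY_SETTING'], 'value')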
Example #10
import argparse
import base64
import json
import sys
import traceback

# This older variant targets Python 2 and the legacy kafka-python
# SimpleConsumer API, hence the print statements below.
from kafka import KafkaClient, SimpleConsumer
from kafka.common import KafkaUnavailableError

from scutils.log_factory import LogFactory
from scutils.method_timer import MethodTimer
from scutils.settings_wrapper import SettingsWrapper

# ArgparseHelper (the custom -h action used below) is defined elsewhere in the
# project and is not reproduced here.


def main():
    # initial main parser setup
    parser = argparse.ArgumentParser(
        description='Kafka Dump: Scrapy Cluster Kafka topic dump utility for '
        'debugging.',
        add_help=False)
    parser.add_argument('-h',
                        '--help',
                        action=ArgparseHelper,
                        help='show this help message and exit')

    subparsers = parser.add_subparsers(help='commands', dest='command')

    # args to use for all commands
    base_parser = argparse.ArgumentParser(add_help=False)
    base_parser.add_argument('-kh',
                             '--kafka-host',
                             action='store',
                             required=False,
                             help="The override Kafka host")
    base_parser.add_argument('-s',
                             '--settings',
                             action='store',
                             required=False,
                             help="The settings file to read from",
                             default="localsettings.py")
    base_parser.add_argument(
        '-ll',
        '--log-level',
        action='store',
        required=False,
        help="The log level",
        default=None,
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])

    # list command
    list_parser = subparsers.add_parser('list',
                                        help='List all Kafka topics',
                                        parents=[base_parser])

    # dump command
    dump_parser = subparsers.add_parser('dump',
                                        help='Dump a Kafka topic',
                                        parents=[base_parser])
    dump_parser.add_argument('-t',
                             '--topic',
                             action='store',
                             required=True,
                             help="The Kafka topic to read from")
    dump_parser.add_argument('-c',
                             '--consumer',
                             action='store',
                             required=False,
                             default='default',
                             help="The Kafka consumer id to use")
    dump_parser.add_argument('-b',
                             '--from-beginning',
                             action='store_const',
                             required=False,
                             const=True,
                             help="Read the topic from the beginning")
    dump_parser.add_argument('-nb',
                             '--no-body',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Do not include the raw html 'body' key in"
                             " the json dump of the topic")
    dump_parser.add_argument('-p',
                             '--pretty',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Pretty print the json objects consumed")
    dump_parser.add_argument('-d',
                             '--decode-base64',
                             action='store_const',
                             required=False,
                             const=True,
                             default=False,
                             help="Decode the base64 encoded raw html body")

    args = vars(parser.parse_args())

    wrapper = SettingsWrapper()
    settings = wrapper.load(args['settings'])

    kafka_host = args['kafka_host'] if args['kafka_host'] else settings[
        'KAFKA_HOSTS']
    log_level = args['log_level'] if args['log_level'] else settings[
        'LOG_LEVEL']
    logger = LogFactory.get_instance(level=log_level, name='kafkadump')

    logger.debug("Connecting to {0}...".format(kafka_host))
    try:
        kafka = KafkaClient(kafka_host)
        logger.info("Connected to {0}".format(kafka_host))
    except KafkaUnavailableError as ex:
        message = "An exception '{0}' occured. Arguments:\n{1!r}" \
            .format(type(ex).__name__, ex.args)
        logger.error(message)
        sys.exit(1)

    if args['command'] == 'list':
        logger.debug('Running list command')
        print "Topics:"
        for topic in kafka.topic_partitions.keys():
            print "-", topic
        return 0
    elif args['command'] == 'dump':
        logger.debug('Running dump command')
        topic = args["topic"]
        consumer_id = args["consumer"]

        @MethodTimer.timeout(5, None)
        def _hidden():
            try:
                logger.debug("Ensuring topic {t} exists".format(t=topic))
                kafka.ensure_topic_exists(topic)

                logger.debug("Getting Kafka consumer")
                consumer = SimpleConsumer(kafka,
                                          consumer_id,
                                          topic,
                                          buffer_size=1024 * 100,
                                          fetch_size_bytes=1024 * 100,
                                          max_buffer_size=None)
                return consumer
            except KafkaUnavailableError as ex:
                message = "An exception '{0}' occured. Arguments:\n{1!r}" \
                    .format(type(ex).__name__, ex.args)
                logger.error(message)
                sys.exit(1)

        consumer = _hidden()

        if consumer is None:
            logger.error("Could not fully connect to Kafka within the timeout")
            sys.exit(1)

        if args["from_beginning"]:
            logger.debug("Seeking to beginning")
            consumer.seek(0, 0)
        else:
            logger.debug("Reading from the end")
            consumer.seek(0, 2)

        num_records = 0
        total_bytes = 0
        item = None

        while True:
            try:
                for message in consumer.get_messages():
                    if message is None:
                        logger.debug("no message")
                        break
                    logger.debug("Received message")
                    val = message.message.value
                    try:
                        item = json.loads(val)
                        if args['decode_base64'] and 'body' in item:
                            item['body'] = base64.b64decode(item['body'])

                        if args['no_body'] and 'body' in item:
                            del item['body']
                    except ValueError:
                        logger.info("Message is not a JSON object")
                        item = val
                    body_bytes = len(val)  # measure the raw message, not the parsed object

                    if args['pretty']:
                        print json.dumps(item, indent=4)
                    else:
                        print item
                    num_records = num_records + 1
                    total_bytes = total_bytes + body_bytes
            except KeyboardInterrupt:
                logger.debug("Keyboard interrupt received")
                break
            except Exception:
                logger.error(traceback.format_exc())
                break

        total_mbs = float(total_bytes) / (1024 * 1024)
        if item is not None:
            print "Last item:"
            print json.dumps(item, indent=4)
        if num_records > 0:
            logger.info(
                "Num Records: {n}, Total MBs: {m}, kb per message: {kb}".
                format(n=num_records,
                       m=total_mbs,
                       kb=(float(total_bytes) / num_records / 1024)))
        else:
            logger.info("No records consumed")
            num_records = 0

        logger.info("Closing Kafka connection")
        kafka.close()
        return 0
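The @MethodTimer.timeout(5, None) decorator above comes from scutils.method_timer: it runs the wrapped call under a five-second limit and returns the supplied default (None) when the limit is exceeded, which is why the code checks whether consumer is None before continuing.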