def test_switch_leader_simple_consumer(self):
    producer = Producer(self.client, async=False)
    consumer = SimpleConsumer(self.client, None, self.topic, partitions=None, auto_commit=False, iter_timeout=10)
    self._send_random_messages(producer, self.topic, 0, 2)
    consumer.get_messages()
    self._kill_leader(self.topic, 0)
    consumer.get_messages()
Example #3
    def test_simple_consumer_failed_payloads(self):
        client = MagicMock()
        consumer = SimpleConsumer(client, group=None,
                                  topic='topic', partitions=[0, 1],
                                  auto_commit=False)

        def failed_payloads(payload):
            return FailedPayloadsError(payload)

        client.send_fetch_request.side_effect = self.fail_requests_factory(failed_payloads)

        # This should not raise an exception
        consumer.get_messages(5)
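
The fail_requests_factory helper used above lives on the test case's base class and is not shown in this listing. A rough sketch of what it could look like follows; it is written as a plain function for illustration, and the contents of the successful response are assumptions. The idea is that the first payload of each fetch gets a valid response while the remaining payloads are passed through the supplied error factory.

# Illustrative sketch only -- not the project's actual helper.  The import
# path of these payload classes varies across kafka-python releases
# (kafka.common in older versions, kafka.structs in newer ones).
from kafka.common import FetchResponsePayload, OffsetAndMessage, Message

def fail_requests_factory(error_factory):
    # Builds a side_effect for client.send_fetch_request: the first payload
    # gets a normal response, every other payload is mapped through
    # error_factory (e.g. FailedPayloadsError or an error FetchResponsePayload).
    def fail_requests(payloads, **kwargs):
        responses = [
            FetchResponsePayload(payloads[0].topic, payloads[0].partition, 0, 0,
                                 [OffsetAndMessage(payloads[0].offset + i,
                                                   Message(0, 0, None, "msg %d" % i))
                                  for i in range(10)]),
        ]
        for failure in payloads[1:]:
            responses.append(error_factory(failure))
        return responses
    return fail_requests
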
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consumer Loading topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka,
                                    group,
                                    topic,
                                    max_buffer_size=1310720000)

    #open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp,
                                                    batch_counter)
    tempfile = open(tempfile_path, "w")
    #log_has_at_least_one = False #did we log at least one entry?
    while True:
        # get 100 messages at a time, non blocking
        messages = kafka_consumer.get_messages(count=100, block=False)
        if not messages:
            #print "no messages to read"
            continue  # If no messages are received, wait until there are more
        for message in messages:
            #log_has_at_least_one = True
            print(message.message.value)
            #tempfile.write(message.message.value + "\n")    # lose the '\n'?
            tempfile.write(message.message.value)
            tempfile.write("\n")
        if tempfile.tell() > 120000000:  # file size > 120MB
            print "Note: file is large enough to write to hdfs. Writing now..."
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # inform zookeeper of position in the kafka queue
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)
    #get timestamp
    kafka_consumer = SimpleConsumer(kafka,
                                    group,
                                    topic,
                                    max_buffer_size=1310720000)

    while True:
        messages = kafka_consumer.get_messages(
            count=1000,
            block=False)  # get 1000 messages at a time, non blocking
        if not messages:
            os.system("sleep 30s")
            continue
            #break
        for message in messages:  #OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message'))
            print message
        kafka_consumer.commit()  #save position in the kafka queue
    #exit loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
    kafka_consumer.commit()  #save position in the kafka queue
    return 0
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)

    #get timestamp
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)
    
    #open file for writing
    tempfile_path = "/tmp/kafka_stockTwits_%s_%s_%s_%s.dat" % (topic, group, timestamp, batch_counter)
    tempfile = open(tempfile_path,"w")
    log_has_at_least_one = False #did we log at least one entry?
    while True:
        messages = kafka_consumer.get_messages(count=1000, block=False) # get 1000 messages at a time, non blocking
        if not messages:
            os.system("sleep 300s") # sleep 5 mins
            continue

        for message in messages: #OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message'))
            log_has_at_least_one = True
            #print(message.message.value)
            tempfile.write(message.message.value + "\n")
        if tempfile.tell() > 10000000: # file size > 10MB
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit() #save position in the kafka queue
    #exit loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
    kafka_consumer.commit() #save position in the kafka queue
    return 0
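
Neither flush_to_hdfs nor batch_counter is defined anywhere in these snippets. A hypothetical version of the helper, modeled on the "hdfs dfs -put" pattern the class-based examples below use, might look like this:

# Hypothetical helper -- not part of the original snippets.
import os

def flush_to_hdfs(output_dir, topic):
    """Close the current temp file, copy it to HDFS, then reopen it empty."""
    global tempfile, tempfile_path, batch_counter
    tempfile.close()
    hdfs_dir = "%s/%s" % (output_dir, topic)
    os.system("hdfs dfs -mkdir -p %s" % hdfs_dir)
    # name each batch after the counter so earlier batches are not overwritten
    os.system("hdfs dfs -put -f %s %s/batch_%d.dat" % (tempfile_path, hdfs_dir, batch_counter))
    batch_counter += 1
    tempfile = open(tempfile_path, "w")  # truncate the local file for the next batch
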
class KafkaConsumer(object):
    def __init__(self, conf):
        self.log = logging.getLogger(__name__)
        self.client = KafkaClient(conf["kafka_server"])
        self.total_inserts = 0
        self.inserts = 0
        self.listenstore = None


    def start_listens(self, listenstore):
        self.listenstore = listenstore
        return self.start(b"listen-group", b"listens")


    def start(self, group_name, topic_name):
        self.group_name = group_name
        self.topic_name = topic_name
        self.log.info("KafkaConsumer subscribed to %s -> %s" % (group_name, topic_name))
        self.consumer = SimpleConsumer(self.client, self.group_name, self.topic_name)

        t0 = 0
        last_offset = -1
        while True:
            listens = []
            if t0 == 0:
                t0 = time()

            messages = self.consumer.get_messages(count=CASSANDRA_BATCH_SIZE, block=True, timeout=KAFKA_READ_TIMEOUT)
            for message in messages:
                try:
                    data = ujson.loads(message.message.value)
                    listens.append(Listen.from_json(data))
                except ValueError as e:
                    self.log.error("Cannot parse JSON: %s\n'%s'" % (str(e), message.message.value))
                    continue

                last_offset = message.offset

            if listens:
                broken = True
                while broken:
                    try:
                        self.listenstore.insert_batch(listens)
                        broken = False
                    except ValueError as e:
                        self.log.error("Cannot insert listens: %s" % unicode(e))
                        broken = False
                    except NoHostAvailable as e:
                        self.log.error("Cannot insert listens: %s. Sleeping, trying again." % unicode(e))
                        sleep(5)


            self.inserts += len(messages)
            if self.inserts >= REPORT_FREQUENCY:
                t1 = time()
                self.total_inserts += self.inserts
                self.log.info("Inserted %d rows in %.1fs (%.2f listens/sec). Total %d rows. last offset: %d" % \
                    (self.inserts, t1 - t0, self.inserts / (t1 - t0), self.total_inserts, last_offset))
                self.inserts = 0
                t0 = 0
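
The example above has no entry point in this listing. A hypothetical way to wire it up, using a stand-in listen store (the real one only needs the insert_batch() method used above) and the kafka_server config key from __init__:

# Hypothetical wiring -- not part of the original example.
class DummyListenStore(object):
    """Stand-in for the real listenstore; only insert_batch() is required."""
    def insert_batch(self, listens):
        print("would insert %d listens" % len(listens))

conf = {"kafka_server": "localhost:9092"}              # assumed broker address
KafkaConsumer(conf).start_listens(DummyListenStore())  # blocks, consuming forever
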
Example #8
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (
        topic, group, output_dir)
    #get timestamp
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka,
                                    group,
                                    topic,
                                    max_buffer_size=1310720000)

    #open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.dat" % (topic, group, timestamp,
                                                    batch_counter)
    tempfile = open(tempfile_path, "w")
    log_has_at_least_one = False  #did we log at least one entry?
    while True:
        messages = kafka_consumer.get_messages(
            count=1000,
            block=False)  #get 1000 messages at a time, non blocking
        if not messages:
            break
        for message in messages:  #OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message'))
            log_has_at_least_one = True
            #print(message.message.value)
            tempfile.write(message.message.value + "\n")
        if tempfile.tell() > 10000000:  #file size > 10MB
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()
    #exit loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
    kafka_consumer.commit()  #save position in the kafka queue
    return 0
Example #9
class ScoringWorker(object):
    def __init__(self, settings, strategy_module):
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or not isinstance(partition_id, int):
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                       settings.get('SCORING_GROUP'),
                                       settings.get('INCOMING_TOPIC'),
                                       buffer_size=1048576,
                                       max_buffer_size=10485760,
                                       partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlingStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0


    def work(self):
        consumed = 0
        batch = []
        fingerprints = set()
        try:
            for m in self._in_consumer.get_messages(count=self.consumer_batch_size, block=True, timeout=1.0):
                try:
                    msg = self._decoder.decode(m.message.value)
                except (KeyError, TypeError), e:
                    logger.error("Decoding error: %s", e)
                    continue
                else:
                    type = msg[0]
                    batch.append(msg)
                    if type == 'add_seeds':
                        _, seeds = msg
                        fingerprints.update(map(lambda x: x.meta['fingerprint'], seeds))
                        continue

                    if type == 'page_crawled':
                        _, response, links = msg
                        fingerprints.add(response.meta['fingerprint'])
                        fingerprints.update(map(lambda x: x.meta['fingerprint'], links))
                        continue

                    if type == 'request_error':
                        _, request, error = msg
                        fingerprints.add(request.meta['fingerprint'])
                        continue

                    raise TypeError('Unknown message type %s' % type)
                finally:
                    consumed += 1
Example #10
    def test_simple_consumer_unknown_topic_partition(self):
        client = MagicMock()
        consumer = SimpleConsumer(client, group=None,
                                  topic='topic', partitions=[0, 1],
                                  auto_commit=False)

        # Mock so that only the first request gets a valid response
        def unknown_topic_partition(request):
            return FetchResponsePayload(request.topic, request.partition,
                                 UnknownTopicOrPartitionError.errno, -1, ())

        client.send_fetch_request.side_effect = self.fail_requests_factory(unknown_topic_partition)

        # This should raise an UnknownTopicOrPartitionError
        with self.assertRaises(UnknownTopicOrPartitionError):
            consumer.get_messages(20)
Example #11
    def test_ts(self):

        kafka = KafkaClient(config.get("kafka.host1") + "," + config.get("kafka.host2"))

        # consumer = SimpleConsumer(kafka, "my-group112", "test")
        consumer = SimpleConsumer(kafka, self.GROUP_NAME, self.KAFKA_TOPIC,
                                  fetch_size_bytes=3000000, buffer_size=2000000000, max_buffer_size=2000000000)

        while True:
            print("HELLO")
            # Prepare data for insert and copy to S3
            # data_str = StringIO()
            count = 0
            # last_offset = 2

            consumer.seek(2, 0)

            for message in consumer.get_messages(count=100, block=False, timeout=0.1):
                count += 1

                print(message.message.value)

            #     # Write tweets to StringIO
            #     self.write_to_data_str(message, data_str)

            # # Store batch tweets to S3
            # self.write_to_s3(data_str, last_offset)

            if count != 100:
                break
Example #12
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consumer Loading topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    #open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp, batch_counter)
    tempfile = open(tempfile_path, "w")
    #log_has_at_least_one = False #did we log at least one entry?
    while True:
        # get 100 messages at a time, non blocking
        messages = kafka_consumer.get_messages(count=100, block=False)
        if not messages:
            #print "no messages to read"
            continue   # If no messages are received, wait until there are more
        for message in messages:
            #log_has_at_least_one = True
            #print(message.message.value)
            #tempfile.write(message.message.value + "\n")    # lose the '\n'?
            tempfile.write(message.message.value)
        if tempfile.tell() > 120000000:  # file size > 120MB
            print "Note: file is large enough to write to hdfs. Writing now..."
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # inform zookeeper of position in the kafka queue
def consume_save(group,topic):
#	tmp_save=open(tmp_file_path,"w")
	while True:
		kafka_consumer=SimpleConsumer(kafka,group,topic)
		messages= kafka_consumer.get_messages(count=1000, block=False)
		if not messages:
			print "Consumer didn't read any messages"
		for message in messages:
	#		tmp_save.write( message.message.value+"\n")
			print message.message.value+"\n"
Example #14
    def test_simple_consumer_leader_change(self):
        client = MagicMock()
        consumer = SimpleConsumer(client, group=None,
                                  topic='topic', partitions=[0, 1],
                                  auto_commit=False)

        # Mock so that only the first request gets a valid response
        def not_leader(request):
            return FetchResponsePayload(request.topic, request.partition,
                                 NotLeaderForPartitionError.errno, -1, ())

        client.send_fetch_request.side_effect = self.fail_requests_factory(not_leader)

        # This should not raise an exception
        consumer.get_messages(20)

        # client should have updated metadata
        self.assertGreaterEqual(client.reset_topic_metadata.call_count, 1)
        self.assertGreaterEqual(client.load_metadata_for_topics.call_count, 1)
Example #15
    def consume_topic(self, topic, group, temp_dir):
        '''
        Receive messages from the Friendsquare topic, save them to a temporary
        file under temp_dir, then transfer the file to hdfs.
        '''
        # Create a kafka receiver to grab messages
        kafka_receiver = SimpleConsumer(kafka,
                                        group,
                                        topic,
                                        max_buffer_size=1310720000)

        # Create a temp file to store messages
        self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))

        temp_file = open(self.temp_file_path, 'w')

        hdfs_output_dir = "%s/%s" % (self.hdfs_dir, topic)

        # Create a hdfs directory to store output files
        os.system("hdfs dfs -mkdir -p %s" % hdfs_output_dir)

        while self.count < self.max_count:

            # Get 1000 messages each time
            messages = kafka_receiver.get_messages(count=1000, block=False)

            if not messages:
                continue

            # Write the messages to a file, one message per line
            for message in messages:
                temp_file.write(message.message.value + '\n')

            # Set each file size at 20 M
            if temp_file.tell() > 20000000:
                temp_file.close()

                # Put the file to hdfs
                hdfs_path = "%s/%s.txt" % (hdfs_output_dir, self.count)
                os.system("hdfs dfs -put -f %s %s" %
                          (self.temp_file_path, hdfs_path))

                #remove the old file
                os.remove(self.temp_file_path)

                #  Create a new temp file to store messages
                self.count += 1
                self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))
                temp_file = open(self.temp_file_path, 'w')

            # Inform zookeeper of position in the kafka queue
            kafka_receiver.commit()

        temp_file.close()
    def consume_topic(self, topic, group, temp_dir):
        '''
        Receive messages from Kafka, save them to a temporary file first,
        then transfer the file to hdfs.
        '''
        # Create a kafka receiver to grab messages
        kafka_receiver = SimpleConsumer(kafka,
                                        group,
                                        topic,
                                        max_buffer_size=1310720000)

        self.timestamp = self.getTimestamp()
        # Create a temp file to store messages
        self.temp_file_path = "%s/%s_%s.txt" % (temp_dir, self.timestamp,
                                                str(self.count))

        temp_file = open(self.temp_file_path, 'w')

        while self.count < self.max_count:
            # Get 100 messages each time
            messages = kafka_receiver.get_messages(count=100, block=False)
            if not messages:
                continue

            # Write the messages to a file, one message per line
            for message in messages:
                temp_file.write(message.message.value + '\n')

            # For structured streaming, files need to be small at this point, set the size at 2 M
            if temp_file.tell() > 2000000:
                temp_file.close()

                # Copy the file to hdfs
                output_dir = "%s/%s" % (self.hdfs_dir, topic)
                os.system("hdfs dfs -mkdir %s" % output_dir)
                hdfs_path = "%s/%s_%s.txt" % (output_dir, self.timestamp,
                                              self.count)
                os.system("hdfs dfs -put -f %s %s" %
                          (self.temp_file_path, hdfs_path))

                #remove the old file
                os.remove(self.temp_file_path)

                #  Create a new temp file to store messages
                self.count += 1
                self.timestamp = self.getTimestamp()
                self.temp_file_path = "%s/%s_%s.txt" % (
                    temp_dir, self.timestamp, str(self.count))
                temp_file = open(self.temp_file_path, 'w')

            # Inform zookeeper of position in the kafka queue
            kafka_receiver.commit()

        temp_file.close()
Example #17
def consume_save(group,topic):
	tmp_save=open(tmp_file_path,"w")
	kafka_consumer=SimpleConsumer(kafka,group,topic)
	messages= kafka_consumer.get_messages(count=1000, block=False)
	if not messages:
		print "Consumer didn't read any messages"
	for message in messages:
		tmp_save.write( message.message.value+"\n")
#		print message.message.value+"\n"
	kafka_consumer.commit() # inform zookeeper of position in the kafka queue
	print ".... ... .. .."
	print "Message from topic \"%s\" consumed \n" % topic
Example #18
    def run(self, options=None):

        # try:

        # Create table if it doesn't exist in the database
        if self.REDSHIFT.if_table_exists(self.TABLE_NAME) is False:
            self.REDSHIFT.execute(self.CREATE_TRACKING_TABLE)

        kafka = KafkaClient(config.get("kafka.host1") + "," + config.get("kafka.host2"))

        consumer = SimpleConsumer(kafka, self.GROUP_NAME, self.KAFKA_TOPIC, fetch_size_bytes=3000000,
                                  buffer_size=2000000000, max_buffer_size=2000000000)

        while True:

            # Prepare data for insert and copy to S3
            data_str = StringIO()
            csv_str = StringIO()
            count = 0

            # Get Offset from previous read
            s3_last_offset = self.get_s3_offset()

            last_offset = self.REDSHIFT.select(self.GET_OFFSET_QUERY)[0][0]
            last_offset = last_offset if last_offset else 0

            # Resolve difference in offset (s3 offset does not carry over from day to day)
            if s3_last_offset > last_offset:
                last_offset = s3_last_offset
                self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY % (self.GROUP_NAME, self.PARTITION, last_offset))

            print(last_offset)

            # Read from Offset
            consumer.seek(last_offset, 0)

            for message in consumer.get_messages(count=self.BATCH_SIZE, block=False, timeout=5):

                # Write tweets to StringIO
                self.write_to_data_str(message, data_str, csv_str)

                count += 1
                last_offset += 1

            # Store batch tweets to S3
            self.write_to_s3(data_str, csv_str, last_offset)

            # Track Kafka Offset
            self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY % (self.GROUP_NAME, self.PARTITION, last_offset))

            if count != self.BATCH_SIZE:
                break
Example #19
class Consumer(BaseStreamConsumer):
    """
    Used in DB and SW worker. SW consumes per partition.
    """
    def __init__(self, conn, topic, group, partition_id):
        self._conn = conn
        self._group = group
        self._topic = topic
        self._partition_ids = [partition_id] if partition_id is not None else None

        self._cons = None
        self._connect_consumer()

    def _connect_consumer(self):
        if self._cons is None:
            try:
                self._cons = SimpleConsumer(
                    self._conn,
                    self._group,
                    self._topic,
                    partitions=self._partition_ids,
                    buffer_size=1048576,
                    max_buffer_size=10485760)
            except BrokerResponseError:
                self._cons = None
                logger.warning("Could not connect consumer to Kafka server")
                return False
        return True

    def get_messages(self, timeout=0.1, count=1):
        if not self._connect_consumer():
            yield
        while True:
            try:
                for offmsg in self._cons.get_messages(
                        count,
                        timeout=timeout):
                    try:
                        yield offmsg.message.value
                    except ValueError:
                        logger.warning(
                            "Could not decode {0} message: {1}".format(
                                self._topic,
                                offmsg.message.value))
            except Exception as err:
                logger.warning("Error %s" % err)
            finally:
                break

    def get_offset(self):
        return 0
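
A short usage sketch for the wrapper above; the broker address, topic and group names are placeholders, and the topic is assumed to already exist with a partition 0.

# Usage sketch with placeholder broker/topic/group names.
from kafka import KafkaClient

conn = KafkaClient("localhost:9092")
consumer = Consumer(conn, topic="frontier-incoming", group="sw-worker", partition_id=0)
for value in consumer.get_messages(timeout=1.0, count=64):
    if value is not None:  # get_messages() yields None if the consumer could not connect
        print(value)
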
Example #20
File: smoke_tests.py  Project: kharus/samza
def validate_samza_job():
  """
  Validates that negate-number negated all messages, and sent the output to 
  samza-test-topic-output.
  """
  logger.info('Running validate_samza_job')
  kafka = _get_kafka_client()
  kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
  consumer = SimpleConsumer(kafka, 'samza-test-group', TEST_OUTPUT_TOPIC)
  messages = consumer.get_messages(count=NUM_MESSAGES, block=True, timeout=60)
  message_count = len(messages)
  assert NUM_MESSAGES == message_count, 'Expected {0} lines, but found {1}'.format(NUM_MESSAGES, message_count)
  for message in map(lambda m: m.message.value, messages):
    assert int(message) < 0 , 'Expected negative integer but received {0}'.format(message)
  kafka.close()
Example #21
def validate_samza_job():
    """
  Validates that negate-number negated all messages, and sent the output to 
  samza-test-topic-output.
  """
    logger.info("Running validate_samza_job")
    kafka = util.get_kafka_client()
    kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
    consumer = SimpleConsumer(kafka, "samza-test-group", TEST_OUTPUT_TOPIC)
    messages = consumer.get_messages(count=NUM_MESSAGES, block=True, timeout=300)
    message_count = len(messages)
    assert NUM_MESSAGES == message_count, "Expected {0} lines, but found {1}".format(NUM_MESSAGES, message_count)
    for message in map(lambda m: m.message.value, messages):
        assert int(message) < 0, "Expected negative integer but received {0}".format(message)
    kafka.close()
Example #22
def consume_save(group,topic):
	i=0
	tmp_save=open(tmp_file_path,"w")
	while True:
		kafka_consumer=SimpleConsumer(kafka,group,topic)
		messages= kafka_consumer.get_messages(count=1000, block=False)
#		if not messages:
#			print "Consumer didn't read any messages"
		for message in messages:
			tmp_save.write( message.message.value+"\n")
			print message.message.value+"\n"
		# file size > 20MB
		if tmp_save.tell() > 20000000:
			push_to_hdfs(tmp_file_path)
		kafka_consumer.commit() # inform zookeeper of position in the kafka queue
Example #23
class Consumer(Thread):
    def __init__(self, args=()):
        super(Consumer, self).__init__()
        self.host = args[0]
        self.port = args[1]
        self.topic = args[2]
        print '[KafkaConsumer] host: {0}, port: {1}, topic: {2}'.format(self.host, self.port, self.topic)
        self.consumer = None
        self.consumer_keep_run = True
        self.consumer_paused = False
        self.consumer_subscribers = []

    def run(self):
        client = kafka_client(self.host, self.port)
        self.consumer = SimpleConsumer(client, None, self.topic)
        self.consumer.seek(0, 1)

        while self.consumer_keep_run:
            print '[KafkaConsumer] looping..'
            if not self.consumer_paused:
                for message in self.consumer.get_messages(block=False):
                    offset = message.offset
                    value = message.message.value
                    j_encoded = json.dumps({'offset': offset, 'message': value})
                    print '[KafkaConsumer] {}'.format(j_encoded)

                    for subscriber in self.consumer_subscribers:
                        IOLoop.instance().add_callback(partial(subscriber.send_message, j_encoded))
            time.sleep(1)

    def pause_consumer(self, paused):
        self.consumer_paused = paused

    def stop_consumer(self):
        self.consumer_keep_run = False

    def add_subscriber(self, subscriber):
        self.consumer_subscribers.append(subscriber)

    def remove_subscriber(self, subscriber):
        self.consumer_subscribers.remove(subscriber)

    def get_subscribers_length(self):
        length = len(self.consumer_subscribers)
        return length

    def get_subscribers(self):
        return self.consumer_subscribers
Example #24
class Consumer(BaseStreamConsumer):
    """
    Used in DB and SW worker. SW consumes per partition.
    """
    def __init__(self, conn, topic, group, partition_id):
        self._conn = conn
        self._group = group
        self._topic = topic
        self._partition_ids = [partition_id] if partition_id is not None else None

        self._cons = None
        self._connect_consumer()

    def _connect_consumer(self):
        if self._cons is None:
            try:
                self._cons = SimpleConsumer(self._conn,
                                            self._group,
                                            self._topic,
                                            partitions=self._partition_ids,
                                            buffer_size=1048576,
                                            max_buffer_size=10485760)
            except BrokerResponseError:
                self._cons = None
                logger.warning("Could not connect consumer to Kafka server")
                return False
        return True

    def get_messages(self, timeout=0.1, count=1):
        if not self._connect_consumer():
            yield
        while True:
            try:
                for offmsg in self._cons.get_messages(count, timeout=timeout):
                    try:
                        yield offmsg.message.value
                    except ValueError:
                        logger.warning(
                            "Could not decode {0} message: {1}".format(
                                self._topic, offmsg.message.value))
            except Exception as err:
                logger.warning("Error %s" % err)
            finally:
                break

    def get_offset(self):
        return 0
Example #25
class KafkaSpout(Spout):

	def initialize(self, stormconf, context):
		# self.words = itertools.cycle(['dog', 'cat',
		# 								'zebra', 'elephant'])
		self.kafka = KafkaClient("cloud.soumet.com:9092")
		self.consumer = SimpleConsumer(self.kafka, "storm", "realtime", max_buffer_size=1310720000)
		



	def next_tuple(self):
		for message in self.consumer.get_messages(count=500, block=False):#, timeout=1):
			#transaction_data = TransactionFull()
			#transaction_data.ParseFromString(base64.b64decode(message.message.value))
			#self.emit([transaction_data])
			self.emit([message.message.value])
		self.consumer.commit()
Example #26
def validate_kafka_read_write_performance():
  """
  Validates that all messages were sent to the output topic.
  """
  logger.info('Running validate_kafka_read_write_performance')
  kafka = util.get_kafka_client()
  kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
  consumer = SimpleConsumer(
    kafka, 
    'samza-test-group', 
    TEST_OUTPUT_TOPIC,
    fetch_size_bytes=1000000,
    buffer_size=32768,
    max_buffer_size=None)
  # wait 5 minutes to get all million messages
  messages = consumer.get_messages(count=NUM_MESSAGES, block=True, timeout=300)
  message_count = len(messages)
  assert NUM_MESSAGES == message_count, 'Expected {0} lines, but found {1}'.format(NUM_MESSAGES, message_count)
  kafka.close()
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)
    #get timestamp
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)
    
    while True:
        messages = kafka_consumer.get_messages(count=1000, block=False) # get 1000 messages at a time, non blocking
        if not messages:
            os.system("sleep 30s")
            continue
            #break
        for message in messages: #OffsetAndMessage(offset=43, message=Message(magic=0, attributes=0, key=None, value='some message'))
            print message
        kafka_consumer.commit() #save position in the kafka queue
    #exit loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
    kafka_consumer.commit() #save position in the kafka queue
    return 0
Example #28
def CommitData():
    """API: 呼叫時提供topic, guid, group,API會依照guid查詢當初取資料時的last offset,並依last offset 對 topic 進行 Commit"""
    api_message = ExecuteResult()
    api_message.message = 'ok'
    api_statuscode = 200

    try:
        original_topic = request.json['topic'].encode('utf-8') + '_error_msg'
        topic = request.json['topic'].encode('utf-8') + '_error_msg_log'
        guid = request.json['guid'].encode('utf-8')
        group = request.json['group'].encode('utf-8')

        if not (CheckTopicExsited(topic)):
            return "Topic cannot be found! This may have not been created.", 200

        client3 = KafkaClient(tmpbootstrap_servers)
        simplecon = SimpleConsumer(client3, group, topic, auto_commit=False)
        simplecon_messages = simplecon.get_messages(count=500)

        ii = 0

        for msg in simplecon_messages:
            msgGuid = getMsgDataGuid(msg.message.value)
            ii += 1
            print(str(ii))
            if msgGuid == guid:
                msgInfos2 = get_last_offset_data(msg.message.value)
                msgInfos = json.loads(msgInfos2)
                for offset_data in msgInfos:
                    commitTopic(original_topic, group,
                                int(offset_data['partition_ID']),
                                int(offset_data['get_last_offset']))

    except Exception as e:
        api_message.message = str(e)
        print(str(e))
        api_statuscode = 500

    finally:
        return json.dumps(api_message,
                          default=encode_ExecuteResult), api_statuscode
Example #29
def validate_kafka_read_write_performance():
    """
  Validates that all messages were sent to the output topic.
  """
    logger.info('Running validate_kafka_read_write_performance')
    kafka = util.get_kafka_client()
    kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
    consumer = SimpleConsumer(kafka,
                              'samza-test-group',
                              TEST_OUTPUT_TOPIC,
                              fetch_size_bytes=1000000,
                              buffer_size=32768,
                              max_buffer_size=None)
    # wait 5 minutes to get all million messages
    messages = consumer.get_messages(count=NUM_MESSAGES,
                                     block=True,
                                     timeout=300)
    message_count = len(messages)
    assert NUM_MESSAGES == message_count, 'Expected {0} lines, but found {1}'.format(
        NUM_MESSAGES, message_count)
    kafka.close()
Example #30
    def test_ts(self):

        kafka = KafkaClient(
            config.get("kafka.host1") + "," + config.get("kafka.host2"))

        # consumer = SimpleConsumer(kafka, "my-group112", "test")
        consumer = SimpleConsumer(kafka,
                                  self.GROUP_NAME,
                                  self.KAFKA_TOPIC,
                                  fetch_size_bytes=3000000,
                                  buffer_size=2000000000,
                                  max_buffer_size=2000000000)

        while True:
            print("HELLO")
            # Prepare data for insert and copy to S3
            # data_str = StringIO()
            count = 0
            # last_offset = 2

            consumer.seek(2, 0)

            for message in consumer.get_messages(count=100,
                                                 block=False,
                                                 timeout=0.1):
                count += 1

                print(message.message.value)

            #     # Write tweets to StringIO
            #     self.write_to_data_str(message, data_str)

            # # Store batch tweets to S3
            # self.write_to_s3(data_str, last_offset)

            if count != 100:
                break
    def _run(self):
        pcount = 0
        while True:
            try:
                self._logger.info("New KafkaClient %d" % self._partition)
                self._kfk = KafkaClient(self._brokers ,str(os.getpid()))
                try:
                    consumer = SimpleConsumer(self._kfk, self._group, self._topic, buffer_size = 4096*4, max_buffer_size=4096*32)
                    #except:
                except Exception as ex:
                    template = "Consumer Failure {0} occured. Arguments:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.info("%s" % messag)
                    raise RuntimeError(messag)

                self._logger.info("Starting %d" % self._partition)

                # Find the offset of the last message that has been queued
                consumer.seek(0,2)
                try:
                    mi = consumer.get_message(timeout=0.1)
                    consumer.commit()
                except common.OffsetOutOfRangeError:
                    mi = None
                #import pdb; pdb.set_trace()
                self._logger.info("Last Queued for %d is %s" % \
                                  (self._partition,str(mi)))
                self.start_partition()

                # start reading from last previously processed message
                consumer.seek(0,1)

                if self._limit:
                    raise gevent.GreenletExit

                while True:
                    try:
                        mlist = consumer.get_messages(10)
                        for mm in mlist:
                            if mm is None:
                                continue
                            self._logger.debug("%d Reading offset %d" % \
                                    (self._partition, mm.offset))
                            consumer.commit()
                            pcount += 1
                            if not self.msg_handler(mm):
                                self._logger.info("%d could not handle %s" % (self._partition, str(mm)))
                                raise gevent.GreenletExit
                    except TypeError:
                        gevent.sleep(0.1)
                    except common.FailedPayloadsError as ex:
                        self._logger.info("Payload Error: %s" %  str(ex.args))
                        gevent.sleep(0.1)
            except gevent.GreenletExit:
                break
            except Exception as ex:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.info("%s : traceback %s" % \
                                  (messag, traceback.format_exc()))
                self.stop_partition()
                gevent.sleep(2)
        self._logger.info("Stopping %d pcount %d" % (self._partition, pcount))
        return self._partoffset, self._partdb
Example #32
class OneteraScheduler(FronteraScheduler):
    def __init__(self, crawler):
        super(OneteraScheduler, self).__init__(crawler)
        self.job_config = {}
        self.is_active = False
        self.results = []
        self.results_sent = 0
        self.last_result_iteration = None

        settings = self.frontier.manager.settings
        self.results_topic = settings.get("ONETERA_RESULTS_TOPIC")
        kafka = KafkaClient(settings.get("KAFKA_LOCATION"))
        self.consumer = SimpleConsumer(
            kafka,
            settings.get("ONETERA_GROUP"),
            settings.get("ONETERA_INCOMING_TOPIC"),
            buffer_size=262144,
            max_buffer_size=10485760,
            auto_commit_every_n=1,
        )
        self.producer = SimpleProducer(kafka)
        self.status_updates_topic = settings.get("ONETERA_STATUS_UPDATES_TOPIC")
        self.stats = {}

    def result_callback(self, result):
        self.results.append(result)

    def open(self, spider):
        super(OneteraScheduler, self).open(spider)
        spider.set_result_callback(self.result_callback)

    def has_pending_requests(self):
        if not self.is_active:
            return False
        return super(OneteraScheduler, self).has_pending_requests()

    def next_request(self):
        if not self.is_active:
            self._check_incoming()
        if self.is_active:
            return super(OneteraScheduler, self).next_request()
        return None

    def process_spider_output(self, response, result, spider):
        self._send_results()
        self._check_finished()
        return super(OneteraScheduler, self).process_spider_output(response, result, spider)

    def process_exception(self, request, exception, spider):
        super(OneteraScheduler, self).process_exception(request, exception, spider)
        self._send_results()
        self._check_finished()

    def _check_finished(self):
        if not self.is_active:
            return
        if self.results_sent > self.job_config["nResults"]:
            logger.info("Crawler reached the number of requested results. Crawling is stopping.")
            self.is_active = False
        if self.last_result_iteration and self.frontier.manager.iteration - self.last_result_iteration > 10:
            logger.info("It looks like crawler get stuck. Stopping crawling.")
            self.is_active = False

    def _check_incoming(self):
        consumed = 0
        try:
            for m in self.consumer.get_messages(count=1):
                try:
                    msg = loads(m.message.value)
                except ValueError, ve:
                    logger.error("Decoding error %s, message %s" % (ve, m.message.value))
                else:
                    logger.info("Got incoming message %s from incoming topic." % m.message.value)

                    self.frontier.manager.backend.cleanup()
                    self._pending_requests.clear()
                    self.results = []
                    self.results_sent = 0
                    self.last_result_iteration = None

                    self.job_config = {
                        "workspace": msg["workspace"],
                        "nResults": msg["nResults"],
                        "excluded": msg["excluded"],
                        "included": msg["included"],
                        "relevantUrl": msg["relevantUrl"],
                        "irrelevantUrl": msg["irrelevantUrl"],
                    }
                    requests = [Request(url, meta={"score": 1.0}) for url in msg["relevantUrl"]]
                    if not requests:
                        raise Exception("Empty seeds list, can't bootstrap crawler.")
                    self.frontier.add_seeds(requests)
                    self.frontier.spider.configure(self.job_config)
                    self.is_active = True
                finally:
                    consumed += 1
    def _run(self):
        pcount = 0
        pause = False
        while True:
            try:
                if pause:
                    gevent.sleep(2)
                    pause = False
                self._logger.error("New KafkaClient %s" % self._topic)
                self._kfk = KafkaClient(self._brokers , "kc-" + self._topic)
                try:
                    consumer = SimpleConsumer(self._kfk, self._group, self._topic, buffer_size = 4096*4, max_buffer_size=4096*32)
                    #except:
                except Exception as ex:
                    template = "Consumer Failure {0} occured. Arguments:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("Error: %s trace %s" % \
                        (messag, traceback.format_exc()))
                    raise RuntimeError(messag)

                self._logger.error("Starting %s" % self._topic)

                # Find the offset of the last message that has been queued
                consumer.seek(-1,2)
                try:
                    mi = consumer.get_message(timeout=0.1)
                    consumer.commit()
                except common.OffsetOutOfRangeError:
                    mi = None
                #import pdb; pdb.set_trace()
                self._logger.info("Last Queued for %s is %s" % \
                                  (self._topic,str(mi)))

                # start reading from last previously processed message
                if mi != None:
                    consumer.seek(-1,1)
                else:
                    consumer.seek(0,0)

                if self._limit:
                    raise gevent.GreenletExit

                while True:
                    try:
                        mlist = consumer.get_messages(10,timeout=0.5)
                        if not self.msg_handler(mlist):
                            raise gevent.GreenletExit
                        consumer.commit()
                        pcount += len(mlist) 
                    except TypeError as ex:
                        self._logger.error("Type Error: %s trace %s" % \
                                (str(ex.args), traceback.format_exc()))
                        gevent.sleep(0.1)
                    except common.FailedPayloadsError as ex:
                        self._logger.error("Payload Error: %s" %  str(ex.args))
                        gevent.sleep(0.1)
            except gevent.GreenletExit:
                break
            except AssertionError as ex:
                self._partoffset = ex
                break
            except Exception as ex:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s" % \
                                  (messag, traceback.format_exc()))
                self.stop_partition()
                pause = True

        self._logger.error("Stopping %s pcount %d" % (self._topic, pcount))
        partdb = self.stop_partition()
        return self._partoffset, partdb
Example #34
# more advanced consumer -- multiple topics w/ auto commit offset
# management

import sys

kclient = KafkaClient("52.24.239.65:9092")


consumer = SimpleConsumer(kclient, "bf-group", sys.argv[1], partitions=[0,1])
consumer.max_buffer_size=None


consumer.seek(0,1)

while True:
    for message in consumer.get_messages():
        print("OFFSET: "+str(message[0])+"\t MSG: "+str(message[1][3]) + "KEY: " + str(message.message.key) )


sys.exit(0)


client = KafkaClient(['52.24.239.65:9092'], client_id='bfleming')
consumer = SimpleConsumer(client, "bfleming", 'bfleming00615')

print consumer.get_messages(count=10)
                         # auto_offset_reset='smallest')
import ipdb
ipdb.set_trace()
# Infinite iteration
# for m in consumer:
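
The header comment above promises multiple topics with auto-committed offsets, but the script itself reads a single topic and seeks manually. A sketch of the multi-topic, auto-commit variant it alludes to (topic names are placeholders, broker address reused from the snippet):

# Sketch only -- not part of the original script.
from kafka import KafkaClient, SimpleConsumer

client = KafkaClient("52.24.239.65:9092")
consumers = [
    SimpleConsumer(client, "bf-group", topic,
                   auto_commit=True,
                   auto_commit_every_n=100,    # commit offsets every 100 messages
                   auto_commit_every_t=5000)   # ...or every 5 seconds, whichever comes first
    for topic in ("topic-a", "topic-b")
]

while True:
    for consumer in consumers:
        for message in consumer.get_messages(count=100, block=False):
            print("%s: %s" % (consumer.topic, message.message.value))
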
Example #35
class TestLinkSpider(TestCase):

    example_feed = "\x80\x02}q\x00(X\x0f\x00\x00\x00allowed_domainsq\x01NX"\
        "\x0b\x00\x00\x00allow_regexq\x02NX\a\x00\x00\x00crawlidq\x03X\x19"\
        "\x00\x00\x0001234567890abcdefghijklmnq\x04X\x03\x00\x00\x00urlq\x05X"\
        "\x13\x00\x00\x00www.istresearch.comq\x06X\a\x00\x00\x00expiresq\aK"\
        "\x00X\b\x00\x00\x00priorityq\bK\x01X\n\x00\x00\x00deny_regexq\tNX\b"\
        "\x00\x00\x00spideridq\nX\x0b\x00\x00\x00test-spiderq\x0bX\x05\x00"\
        "\x00\x00attrsq\x0cNX\x05\x00\x00\x00appidq\rX\a\x00\x00\x00testappq"\
        "\x0eX\x06\x00\x00\x00cookieq\x0fNX\t\x00\x00\x00useragentq\x10NX\x0f"\
        "\x00\x00\x00deny_extensionsq\x11NX\b\x00\x00\x00maxdepthq\x12K\x00u."

    def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print "Could not connect to Redis"
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up kafka to consumer potential result
        self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       "demo-id",
                                       "demo_test.crawled_firehose",
                                       buffer_size=1024 * 100,
                                       fetch_size_bytes=1024 * 100,
                                       max_buffer_size=None)
        # move cursor to end of kafka topic
        self.consumer.seek(0, 2)

    def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())

        # add crawl to redis
        key = "test-spider:istresearch.com:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()

        reactor.run()

        # ensure it was sent out to kafka
        message_count = 0
        for message in self.consumer.get_messages():
            if message is None:
                break
            else:
                the_dict = json.loads(message.message.value)
                if the_dict is not None and the_dict['appid'] == 'testapp' \
                        and the_dict['crawlid'] == '01234567890abcdefghijklmn':
                    message_count += 1

        self.assertEquals(message_count, 1)

    def tearDown(self):
        keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')
        keys = keys + self.redis_conn.keys('test-spider:*')
        for key in keys:
            self.redis_conn.delete(key)
Example #36
class KafkaDriver:

    def __init__(self, driver_args, event_loop):
        self.logger = logging.getLogger('KafkaDriver') # possible TODO: get logger from invoker
        self.logger.setLevel(logging.INFO)
        console_log_handler = logging.StreamHandler(sys.stdout)
        self.logger.addHandler(console_log_handler)

        self.logger.info("KafkaDriver initialized; driver_args=%s" % (driver_args))
        self.event_loop = event_loop
        if driver_args is "":
            kafka_server_addr =  "localhost:9092"
        else:
            kafka_server_addr = driver_args
        client_id = "KafkaDriver-%d-%d" % (time.time(), os.getpid()) # generate a unique client ID so that Kafka doesn't confuse us with a different instance
        self.kafka = KafkaClient(kafka_server_addr, client_id=client_id)

        self.queue_name = None
        ## APPEND direction
        self.get_message_stream = None
        # how frequently to add check for messages and (space permitting) to add them to the GET message stream, in seconds
        self.MESSAGE_CHECK_FREQ = 0.010
        # how many message we have sent from various queues
        self.get_message_count = 0
        self.producer = None
        ## GET direction
        self.consumer = None
        self.get_message_count = 0
        self.MAX_KAFKA_REQ_BATCH_MSGS = 200 # most number of messages that we will request from Kafka at a time

    ######## APPEND direction ########

    # called to tell driver of a new stream of appends than are going to come in; these should go to the end of the named queue
    def prepare_for_append_stream(self, queue_name):
        self.logger.info("KafkaDriver prepare_for_append_stream got: queue_name=%s" % (queue_name))
        self.queue_name = str(queue_name)
        self.producer = SimpleProducer(
            self.kafka,
            async=True,
            req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
            ack_timeout=5000,
            batch_send=True,
            batch_send_every_n= 100,
            batch_send_every_t=1000,
            random_start=True
        )

    def append(self, payload, ttl):
        ttl = int(ttl)
        self.logger.debug("KafkaDriver append got: ttl=%d, payload='%s'" % (ttl, payload))
        try:
            self.producer.send_messages(self.queue_name,payload)
        except UnknownTopicOrPartitionError:
            self.logger.warn("Kafka reports unknown topic or invalid partition number: " + str(sys.exc_info()))
            return 500
        except:
            self.logger.warn("Got exception from kafka-python SimpleProducer:" + str(sys.exc_info()))
            return 500

        # if random.uniform(0,1) < self.FRACTION_MSGS_TO_FAKE_APPEND_ERROR:
        #     self.logger.debug("faking error")
        #     return 400
        return 100

    def cancel_append_stream(self):
        self.logger.info("KafkaDriver cancel_append_stream got called")
        self.producer.stop()
        self.producer = None
        self.queue_name = None

    ######## GET direction ########

    # called to tell driver that a new stream of messages is needed for return to a client.   message_stream_queue is an instance of MessageStream to use to put messages the driver has available as a response to this request.  Other arguments have same meaning as in the Marconi API.
    def init_get_stream(self, get_message_stream, queue_name_spec, starting_marker, echo_requested, include_claimed):
        self.logger.info("KafkaDriver prepare_to_get_messages got: queue_name=%s, echo_requested=%s, include_claimed=%s, starting_marker=%s" % (queue_name_spec,str(echo_requested),str(include_claimed),starting_marker))
        self.logger.info("warning: KafkaDriver ignores echo_requested and include_claimed in GET requests")
        self.consume_group = "cg1" # default consume group
        if len(starting_marker) > 0:
            self.consume_group = starting_marker
        self.logger.info("consume group="+self.consume_group)

        # if the queue name ends with "/<n>", we interpret <n> as the partition to read from
        if "/" in queue_name_spec:
            queue_name, partition_part = queue_name_spec.split("/", 1)
            partition = int(partition_part)
            self.logger.info("limiting topic %s to partition %d" % (queue_name, partition))
        else:
            queue_name = queue_name_spec
            partition = None

        self.get_message_stream = get_message_stream
        self.queue_name = str(queue_name)
        self.consumer = SimpleConsumer(
            client=self.kafka,
            group=self.consume_group,
            topic=self.queue_name,
            partitions=[partition] if partition is not None else None,
            auto_commit=False, # it seems we cannot do any kind of commit when using kafka-python 0.9.1 with Kafka versions before 0.8.1 because kafka-python will send an OffsetFetchRequest (request type 9) or OffsetCommitRequest (request type 8) which is not supported
            fetch_size_bytes=self.MAX_KAFKA_REQ_BATCH_MSGS*4096, # in Marconi, messages can be up to 4k
            iter_timeout=None,
        )
        self.logger.debug("KafkaDriver: seeking to head of %s" % (self.queue_name))
        self.consumer.seek(0,0) # seek to head of topic; TODO: should get starting position from starting_marker param

        self.periodically_check_for_new_messages() # kick of periodic attainment of new messages (space permitting)

    def periodically_check_for_new_messages(self):
        #self.logger.debug("KafkaDriver.periodically_check_for_new_messages()")
        if self.get_message_stream is not None: # still providing messages
            self.check_for_new_messages()
            # TODO: call call_soon() rather than call_later() if we got some messages and there is still space available in the MessageStream
            self.new_msg_check_callback = self.event_loop.call_later(self.MESSAGE_CHECK_FREQ, self.periodically_check_for_new_messages) # schedules self to run again after MESSAGE_CHECK_FREQ seconds

    def check_for_new_messages(self):
        self.logger.debug("KafkaDriver.check_for_new_messages (start): space_used=%d, amount_of_space_avail=%d" % (self.get_message_stream.space_used(), self.get_message_stream.amount_of_space_avail()))
        max_number_of_messages = self.get_message_stream.amount_of_space_avail()
        if max_number_of_messages == 0:
            return # no space left to add message, so don't look for any

        # now try to get up to max_number_of_messages messages from the topic, but in a non-blocking manner
        messages = self.consumer.get_messages(count=max_number_of_messages, block=False)
        self.logger.debug("got %d messages from Kafka" % (len(messages)))
        assert len(messages) <= max_number_of_messages

        #add the messages to message stream
        for message_and_offset in messages:
            self.get_message_count += 1
            offset_str = "%016x" % (message_and_offset.offset) # make offset into 16 hex chars
            # construct a new message and add it to stream
            self.get_message_stream.add_message(
                payload = str(message_and_offset.message.value),
                marker = offset_str, # TODO: this is supposed to be a value usable as a start_marker, but it does not indicate the partition, so it is not unique across partitions
                id = offset_str,
                ttl = (2**31)-1, # we don't store the original TTL so (for now at least) just send max signed 32 bit int
                age = 0,
            )

        #self.logger.debug("KafkaDriver.check_for_new_messages (end): space_used=%d, amount_of_space_avail=%d" % (self.get_message_stream.space_used(), self.get_message_stream.amount_of_space_avail()))

    # called to let the driver know that no more messages are needed for the previously requested stream of messages and that it should free up any associated resources.
    def cancel_get_stream(self):
        self.new_msg_check_callback.cancel() # cancel call to periodically_check_for_new_messages()
        self.consumer.stop()
        self.consumer = None
        self.get_message_stream = None
        self.queue_name = None
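
# --- A standalone, hedged sketch (not part of KafkaDriver above) of the same
# periodic, non-blocking poll pattern described in the comments on
# init_get_stream() / periodically_check_for_new_messages(): seek to the head,
# fetch whatever fits without blocking, then reschedule the check on the event
# loop. The broker address, topic, group and both constants are assumptions.
import asyncio
from kafka import KafkaClient, SimpleConsumer

MESSAGE_CHECK_FREQ = 1.0   # seconds between polls (assumed value)
MAX_BATCH = 100            # stand-in for get_message_stream.amount_of_space_avail()

def check_for_new_messages(loop, consumer, sink):
    # non-blocking fetch, capped by the space the sink still has
    for m in consumer.get_messages(count=MAX_BATCH, block=False):
        sink.append(("%016x" % m.offset, m.message.value))
    # mirror event_loop.call_later() in the driver: schedule the next poll
    loop.call_later(MESSAGE_CHECK_FREQ, check_for_new_messages, loop, consumer, sink)

client = KafkaClient("localhost:9092")                      # assumed broker
consumer = SimpleConsumer(client, "cg1", "some-queue",      # assumed group/topic
                          auto_commit=False, iter_timeout=None)
consumer.seek(0, 0)                                         # head of topic, as in the driver
sink = []
loop = asyncio.get_event_loop()
loop.call_soon(check_for_new_messages, loop, consumer, sink)
loop.run_forever()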
Example #37
from __future__ import absolute_import, print_function  #, unicode_literals

import itertools
from streamparse.spout import Spout

import base64
import sys

from kafka import KafkaClient, SimpleProducer, SimpleConsumer
#from kafka.client import KafkaClient
#from kafka.consumer import SimpleConsumer

kafka = KafkaClient("cloud.soumet.com:9092")
kafka_consumer = SimpleConsumer(
    kafka, "storm", "realtime",
    max_buffer_size=1310720000)  #, max_buffer_size=1310720000)

for message in kafka_consumer.get_messages(
        count=5000, block=False):  #, block=True, timeout=4):
    print(message.message.value)

kafka_consumer.commit()
    def _run(self):
        pcount = 0
        pause = False
        while True:
            try:
                if pause:
                    gevent.sleep(5)
                    pause = False
                self._logger.error("New KafkaClient %s" % self._topic)
                self._kfk = KafkaClient(self._brokers,
                                        "kc-" + self._topic,
                                        timeout=5)
                self._failed = False
                try:
                    consumer = SimpleConsumer(self._kfk, self._group, self._topic,\
                            buffer_size = 4096*4*4, max_buffer_size=4096*32*4,\
                            auto_commit = False)
                except Exception as ex:
                    template = "Consumer Failure {0} occurred. Arguments:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("Error: %s trace %s" % \
                        (messag, traceback.format_exc()))
                    self._failed = True
                    raise RuntimeError(messag)

                self._logger.error("Starting %s" % self._topic)

                # Start consuming from the latest message
                consumer.seek(0, 2)

                if self._limit:
                    raise gevent.GreenletExit

                while True:
                    try:
                        mlist = consumer.get_messages(10, timeout=0.5)
                        if not self.msg_handler(mlist):
                            raise gevent.GreenletExit
                        pcount += len(mlist)
                    except TypeError as ex:
                        self._logger.error("Type Error: %s trace %s" % \
                                (str(ex.args), traceback.format_exc()))
                        gevent.sleep(0.1)
                    except common.FailedPayloadsError as ex:
                        self._logger.error("Payload Error: %s" % str(ex.args))
                        gevent.sleep(0.1)
            except gevent.GreenletExit:
                break
            except AssertionError as ex:
                self._partoffset = ex
                break
            except Exception as ex:
                template = "An exception of type {0} occurred. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s %s : traceback %s" % \
                                  (self._topic, messag, traceback.format_exc()))
                self.stop_partition()
                self._failed = True
                pause = True
                if hasattr(ex, 'errno'):
                    # This is an unrecoverable error
                    if ex.errno == errno.EMFILE:
                        raise SystemExit(1)

        self._logger.error("Stopping %s pcount %d" % (self._topic, pcount))
        partdb = self.stop_partition()
        return self._partoffset, partdb
Example #39
class TestLinkSpider(TestCase):

    example_feed = "\x80\x02}q\x00(X\x0f\x00\x00\x00allowed_domainsq\x01NX"\
        "\x0b\x00\x00\x00allow_regexq\x02NX\a\x00\x00\x00crawlidq\x03X\x19"\
        "\x00\x00\x0001234567890abcdefghijklmnq\x04X\x03\x00\x00\x00urlq\x05X"\
        "\x13\x00\x00\x00www.istresearch.comq\x06X\a\x00\x00\x00expiresq\aK"\
        "\x00X\b\x00\x00\x00priorityq\bK\x01X\n\x00\x00\x00deny_regexq\tNX\b"\
        "\x00\x00\x00spideridq\nX\x0b\x00\x00\x00test-spiderq\x0bX\x05\x00"\
        "\x00\x00attrsq\x0cNX\x05\x00\x00\x00appidq\rX\a\x00\x00\x00testappq"\
        "\x0eX\x06\x00\x00\x00cookieq\x0fNX\t\x00\x00\x00useragentq\x10NX\x0f"\
        "\x00\x00\x00deny_extensionsq\x11NX\b\x00\x00\x00maxdepthq\x12K\x00u."

    def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print "Could not connect to Redis"
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up kafka to consume potential results
        self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
        self.consumer = SimpleConsumer(
            self.kafka_conn,
            "demo-id",
            "demo_test.crawled_firehose",
            buffer_size=1024*100,
            fetch_size_bytes=1024*100,
            max_buffer_size=None
        )
        # move cursor to end of kafka topic
        self.consumer.seek(0, 2)

    def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())

        # add crawl to redis
        key = "test-spider:istresearch.com:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()

        reactor.run()

        # ensure it was sent out to kafka
        message_count = 0
        for message in self.consumer.get_messages():
            if message is None:
                break
            else:
                the_dict = json.loads(message.message.value)
                if the_dict is not None and the_dict['appid'] == 'testapp' \
                        and the_dict['crawlid'] == '01234567890abcdefghijklmn':
                    message_count += 1

        self.assertEquals(message_count, 1)

    def tearDown(self):
        keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')
        keys = keys + self.redis_conn.keys('test-spider:*')
        for key in keys:
            self.redis_conn.delete(key)
Example #40
from kafka import SimpleProducer, SimpleClient, SimpleConsumer

# To consume messages
client = SimpleClient('localhost:9092')
consumer = SimpleConsumer(client, "my-group", "my-topic")
for message in consumer:
    # message is raw byte string -- decode if necessary!
    # e.g., for unicode: `message.decode('utf-8')`
    print(message)

# Use multiprocessing for parallel consumers
from kafka import MultiProcessConsumer

# This will split the number of partitions among two processes
consumer = MultiProcessConsumer(client, "my-group", "my-topic", num_procs=2)

# This will spawn processes such that each handles 2 partitions max
consumer = MultiProcessConsumer(client,
                                "my-group",
                                "my-topic",
                                partitions_per_proc=2)

for message in consumer:
    print(message)

for message in consumer.get_messages(count=5, block=True, timeout=4):
    print(message)

client.close()
class KafkaSpiderMixin1(object):

    """
    Mixin class to implement reading urls from a kafka queue.

    :type kafka_topic: str
    """
    kafka_topic = None

    def process_kafka_message(self, message):
        """
        Tell this spider how to extract urls from a kafka message

        :param message: A Kafka message object
        :type message: kafka.common.Message
        :rtype: str or None
        """
        if not message:
            return None

        return message.value


    # override method
    def make_requests_from_url(self, url, id=None, attr=None):
        #request = Request(url,headers={"Accept-Encoding": "gzip,deflate,sdch","Accept-Language": "en-US,en;q=0.8" , "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36" , "Accept": "*/*" ,"Referer": "https://www.amazon.de" , "Connection": "keep-alive" }, dont_filter=True)
        request = Request(url,headers={'Origin': 'https://www.amazon.de', 'Referer':'https://www.amazon.de', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US,en;q=0.8', 'Upgrade-Insecure-Requests': '1' , 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36, LuminadBot/1.0 ([email protected])', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' ,'Cache-Control': 'max-age=0' }, dont_filter=True)
        request.cookies ={'s_pers':'%20s_fid%3D300B8810F7CDBDE1-10092DE00A8359D7%7C1558680220920%3B%20s_dl%3D1%7C1495610020921%3B%20gpv_page%3DDE%253AAZ%253ASOA-Landing%7C1495610020924%3B%20s_ev15%3D%255B%255B%2527AZDEGNOSellC%2527%252C%25271495608209183%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608216403%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608220916%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608220925%2527%255D%255D%7C1653374620925%3B%20s_eVar26%3DAmazon%2520Services%2520DE%7C1498200220927%3B', 'amznacsleftnav-656eac4a-695b-3a6a-946f-db61e4deb392':'1', 'amznacsleftnav-fdfd699f-c863-3b78-85b2-8a649c6b58f6':'1', 'x-amz-captcha-1':'1508482986892769', 'x-amz-captcha-2':'hw3RhTh0tvhX81cdMFkFgQ==', 'lc-acbde':'de_DE', 'session-id':'261-5677163-4561642', 'ubid-acbde':'259-8821950-7904223', 'a-ogbcbff':'1', 'x-acbde':'"V0z3CSC5jraR2B7OY6OiPR3wrDO7GbRjA9fTg2AJTorXXbAPToPEDvMAo8KTh@7M"', 'at-acbde':'Atza|IwEBIHwqc3CD45BqlJs_5aa-V8dGYqRemzUHaOJhdARXf-o6rlAp0DANlQO8ZPGB23Uek573IjBb2qkX4mlZWKna1Xn3pOzTpiUd0SQO7gh-uTZnxF5r2p22mMsR4_clEZvBBlZBMJYXD6HPxW7_sEYtklqCkY-Br197rDnz9KPza3y5u7XzgezJIBdXCaeq4vAqo9Wrl0uG0RGKSr41-4rKK9hpnGK1nN4UbO_qWxnLSwzA6LwgXczqe0C5EyH1HIp12IlKFB7OgxIEsH0QZAiT0eh0D7sFwlVG6eHfqPNWfix03SZ7apAC7C7jQ-vw1lmICAeJciD9QmumuCNEDDCT-GGWCkrAh-gxMRhKpm7Q5_gOtJijbqoLi3VfPO9QrCA7hYW8Atc-kFRIW3Y6vtRc8OZzZipCneewy-Rj_xYUMFVWMCmHs_ljfe2W6vxWgiRfmyw', 'sess-at-acbde':'"NbwPRqfG4oPuznYLUmFM5Y5JSvyizaA9ZJz6vTkNQL4="', 'sst-acbde':'Sst1|PQEs5smXCO43G8WIotdsANHyCEBZ9TkcZ_OdLYTgnk2mCfAy4Z5W77Y7zX74BQuxS7UKtfnUM6KkKhmcu01A2Fq7xshyjesDvnQDYp9QYcrFDvlceaVvpWqQfpEt2Q9XIM0VQFdd2EMpXc4C9QlehgHT0URfOlUmC47BkfeJr5dpb4Pv_dbnFASQli0k7Cln9sN_Vf4Wqz4km-6UTpsNlVJxJE48_RK6Zsk7bklH_cpJE8tfltiPzdhyhY2oDh7SieUx6CNKphxtIezjzr-0SbD8cg', 'x-wl-uid':'11PAl+O2T6FeY67SmgtWeMBtyZ538YMsy2Zcpov67B4kL2DVIv3Nx7rEprTLBkI4W3ZZ954YAADFuG1oAMSt9uIgNhk3yQfBCY6pDMJUcXUzK6rFTPF4tPnrWr3utKPzHqJATwvQOHKE=', 'session-token':'"tzfdQwuhV4SLJ9/PfV3QSfg2b3LxOcRlqovsFb3AsrqZSnkxHCjhgMsO3d7NbIS7rOee9CPoh7Lxo8LF7EdVopNDFYLMzzOtDGVhnY4czMEVNS5VHAxjtdaDvRNDJC0OloD0EvRMDfHeXG70D93/wWVNfqU0c6nKEv0yTLU7pFpIbTicUYQQFeDZYf9tPQEepQxbZ1pBOU+0FjTwWUj3SnNdDf/SVmmk+feDLRuqn+WcP6w6CPQ1G03W/TACUuIHBz9mSMRFPU0il4m+s0KyzA=="', 'csm-hit':'s-F8Q4HD9WHE8M6GMQKQT4|1519186540551', 'session-id-time':'2082754801l' }
        #request.cookies ={ 'x-wl-uid':'1yOwLjX2WnY9mLM7WsqYh6e6V1fXMd1ZMNtSL2K4PXEdSmASj6jCPPBezf56CZBu8dNd+B0dbGk6FSb6sv3/5Z2bObc/d7RBUn4jelvgzhpzxeiQQPCByKtKt+rFfaF6lordo7OBLv6I=', 's_vn':'1538041742354%26vn%3D1','s_fid':'7FA70D7094115718-2F7725F9CDA62241', 'regStatus':'pre-register', 's_nr':'1506673939908-Repeat', 's_vnum':'1938673939908%26vn%3D1', 's_dslv':'1506673939908', 'JSESSIONID':'7D8C49FEC5F5D74FBFB8C44B4582E920', 'skin':'noskin','session-token':'fMF7GsLbD9OFUtBEffIAbQYQ+k+oGY4qtqc4L+jpdCrQuiLu4c9Hm8YSsbtiO5c9mfQ3IRuuQojX/N/SOZ1vcQVF58RRX0RpMeXLEPvV50aTQq+f/s/rV8yGoETGydD/29yEVxxEqc4cWCblz5+V28+sOHeSSoUiYwysN7+jUIC+ICgHh8EJAM1aQiONRz31', 'ubid-main':'131-1502033-8002851', 'session-id-time':'2082787201l', 'session-id':'143-4281452-3926723', 'csm-hit':'%7B%22tb%22%3A%223FYTGMTG10SZNP3AYFTN%2Bs-TWA04Y4WMDA93A0N8PZQ%7C1507802966608%22%7D' }

        if id:request.meta['id']=id
        if attr:request.meta['attr']=attr
        # set the meta['item'] to use the item in the next call back
        return request

    def setup_kafka(self):
        """Set up the Kafka consumer/producer, Mongo connection and idle/scraped signals.

        This should be called after the spider has set its crawler object;
        configuration is read from self.settings.
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name
            self.topic ='general-starturls'

        _server=self.settings.get("KAFKA_LOCATION", 'localhost:9092')
        _partition_id = int(self.settings.get('SPIDER_PARTITION_ID', 0))
        _group = self.settings.get("GROUP","scrapy-crawler")
        _conn = KafkaClient(_server)
        self.topic1 = self.settings.get('TOPIC', 'frontier-todo')
        mongo_server = self.settings.get("MONGODB_SERVER", 'localhost')
        mongo_port = self.settings.get("MONGODB_PORT", 27017)
        self.mng_client = MongoClient(mongo_server, mongo_port)
        
        self.consumer = SimpleConsumer(_conn,_group,self.topic1, partitions=[_partition_id], buffer_size=131072, max_buffer_size=1048576) 
        self.producer = KafkaProducer(bootstrap_servers=[_server])
        self.MONGODB_DB = self.settings.get("MONGODB_DB")
        self.MONGODB_COLLECTION = "shop"
        self.SPIDER_NAME = self.settings.get("SPIDER_NAME")
        self.JOB_NAME = self.settings.get("JOB_NAME")
        self.LOCALE = self.settings.get("LOCALE",'us')
        self.MONGODB_DB_INPUT = self.settings.get("MONGODB_DB_INPUT", "scr")
        self.NUM_REPETE = self.settings.get("NUMBER_REPETE_SCRAPE", 7)
        self.JOB_INPUT_COLLECTION =  self.settings.get("JOB_INPUT_COLLECTION", "job_input3")
        self.ITEM_INPUT_COLLECTION =  self.settings.get("ITEM_INPUT_COLLECTION" ,'scrap_input4')
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.topic1)

    def next_request(self):
        """
        Returns a request to be scheduled.

        :rtype: str or None
        """
        message = self.consumer.get_messages(1)
        print "got kafka message:", message
        if message :
            url = self.process_kafka_message(message[0].message).split(",")[0].replace(",",' ').replace("#", ' ').replace("&", ' ').replace('"','').replace("'",'').replace("(",' ').replace(")",' ')
            #url = 'https://www.amazon.de/gp/offer-listing/' + url + '/ref=dp_olp_new?ie=UTF8&condition=new'
            url = 'https://www.amazon.de/dp/' + url +'/?th=1&psc=1'
            id = self.process_kafka_message(message[0].message).split(",")[1]
            attr =",".join( self.process_kafka_message(message[0].message).split(",")[2:])
            db = self.mng_client['shop_url_out']
            result1 = db.shop.find_one({'product_id':id ,'spider':self.SPIDER_NAME})
            print "existing result for product:", result1
            if result1: return None

        else: url=None

        if not url:
            time_data =int(time.time())-98000            
            mng_db = self.mng_client[self.MONGODB_DB_INPUT] 
            db_cm = mng_db[self.JOB_INPUT_COLLECTION]
            try:
                result = list(db_cm.find({"spider_name": self.JOB_NAME, 'locale': self.LOCALE}).sort('start_time', -1))[0]
            except:
                result = list(db_cm.find({"repeate": {"$lt": self.NUM_REPETE}}).sort('start_time', -1))[0]
            print "current job:", result
            
            if result:
                job_id = result['job_id']
                db = self.mng_client[self.MONGODB_DB] 
                #result1 = db.shop_url_out.find({'job_id':job_id,'seller_info':[]})
                result2 = db.scrap_input_mapping.find({'job_id':job_id, 'price': {"$ne": '' }  })
                #result2 = db.scrap_input_mapping.find({'job_id':job_id})
                time_data =int(time.time())-3600
                #print db.scrap_input_mapping.find({'job_id':job_id,'spider':self.SPIDER_NAME, 'price': {"$ne": '' }  }).count(),"ggggggggg"
                #return
                scraped_data = []  
                #scraped_data = [res["product_id"] for res in result1 ]
                scraped_data.extend([res.get("product_id",'') for res in result2  ])
                
                #print scraped_data, "rassssssssssssssssssssss"
                collection_name = self.ITEM_INPUT_COLLECTION
                db_cm = mng_db['scrap_input']
                #print db_cm,  list(db_cm.find({'job_id':job_id, "title": { "$exists": False}})), "ddddddddddddddddddddddd"
                #print "ddddddddddddddddddddddddddddddddd",list(db_cm.find({'job_id':job_id}))
                count = 0
                for document in list(db_cm.find({'job_id':job_id})):
                    try:
                        product_id = str(document[u'_id'])
                        #print product_id
                        if product_id not in scraped_data:
                            product_name = str(document[u'DPID'])
                            if product_name == '': continue                            
                            upc = str(document[u'UPC']).replace("'","").replace('"','')
                            msg = json.dumps({'job_id':str(job_id), 'DPID':product_name, 'UPC':upc})
                            future = self.producer.send(self.topic1 , product_name+ "," +product_id+ "," + msg)
                            record_metadata = future.get(timeout=10)
                            count = count+1
                            print record_metadata, count
                    except:
                        pass
            return None
        return self.make_requests_from_url(url,id,attr)

    def schedule_next_request(self):
        """Schedules a request if available"""
        req = self.next_request()
        if req:
            print dir(self)
            #req.headers={"Accept-Encoding": "gzip,deflate,sdch","Accept-Language": "en-US,en;q=0.8" , "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36" , "Accept": "*/*" ,"Referer": "https://www.amazon.de" , "Connection": "keep-alive" }
            self.crawler.engine.crawl(req, spider=self)
        else:
            print "no request available to schedule"

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        try:
            self.schedule_next_request()
        except:
            pass
        raise DontCloseSpider

    def item_scraped(self, *args, **kwargs):
        """Avoid waiting for the spider to go idle before scheduling the next request."""
        self.schedule_next_request()
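
# --- A minimal, hypothetical sketch of a Scrapy spider using the mixin above.
# The spider name and parse() body are assumptions; the only contract taken
# from KafkaSpiderMixin1 is: optionally set self.topic, then call setup_kafka()
# once the crawler object has been attached to the spider.
from scrapy import Spider

class KafkaFedSpider(KafkaSpiderMixin1, Spider):
    name = "kafka-fed-spider"   # assumed name

    def _set_crawler(self, crawler):
        super(KafkaFedSpider, self)._set_crawler(crawler)
        self.setup_kafka()      # wires up consumer, producer, Mongo and idle signals

    def parse(self, response):
        # project-specific item extraction would go here
        self.log("crawled %s" % response.url)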
    def _run(self):
        pcount = 0
        pause = False
        while True:
            try:
                if pause:
                    gevent.sleep(2)
                    pause = False
                self._logger.error("New KafkaClient %s" % self._topic)
                self._kfk = KafkaClient(self._brokers , "kc-" + self._topic)
                self._failed = False
                try:
                    consumer = SimpleConsumer(self._kfk, self._group, self._topic, buffer_size = 4096*4, max_buffer_size=4096*32)
                    #except:
                except Exception as ex:
                    template = "Consumer Failure {0} occurred. Arguments:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("Error: %s trace %s" % \
                        (messag, traceback.format_exc()))
                    self._failed = True
                    raise RuntimeError(messag)

                self._logger.error("Starting %s" % self._topic)

                # Find the offset of the last message that has been queued
                consumer.seek(-1,2)
                try:
                    mi = consumer.get_message(timeout=0.1)
                    consumer.commit()
                except common.OffsetOutOfRangeError:
                    mi = None
                #import pdb; pdb.set_trace()
                self._logger.info("Last Queued for %s is %s" % \
                                  (self._topic,str(mi)))

                # start reading from last previously processed message
                if mi != None:
                    consumer.seek(-1,1)
                else:
                    consumer.seek(0,0)

                if self._limit:
                    raise gevent.GreenletExit

                while True:
                    try:
                        mlist = consumer.get_messages(10,timeout=0.5)
                        if not self.msg_handler(mlist):
                            raise gevent.GreenletExit
                        consumer.commit()
                        pcount += len(mlist) 
                    except TypeError as ex:
                        self._logger.error("Type Error: %s trace %s" % \
                                (str(ex.args), traceback.format_exc()))
                        gevent.sleep(0.1)
                    except common.FailedPayloadsError as ex:
                        self._logger.error("Payload Error: %s" %  str(ex.args))
                        gevent.sleep(0.1)
            except gevent.GreenletExit:
                break
            except AssertionError as ex:
                self._partoffset = ex
                break
            except Exception as ex:
                template = "An exception of type {0} occurred. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s" % \
                                  (messag, traceback.format_exc()))
                self.stop_partition()
                self._failed = True
                pause = True

        self._logger.error("Stopping %s pcount %d" % (self._topic, pcount))
        partdb = self.stop_partition()
        return self._partoffset, partdb
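
# --- Isolated, hedged sketch of the seek()/commit() dance used in _run() above.
# SimpleConsumer.seek(offset, whence) is relative: whence=0 -> head of the log,
# whence=1 -> current position, whence=2 -> tail. Broker, group and topic names
# are assumptions.
from kafka import KafkaClient, SimpleConsumer
from kafka.common import OffsetOutOfRangeError

client = KafkaClient("localhost:9092")
consumer = SimpleConsumer(client, "uve-group", "uve-topic")

# peek at the last message that has been queued (one back from the tail)
consumer.seek(-1, 2)
try:
    last_queued = consumer.get_message(timeout=0.1)
    consumer.commit()
except OffsetOutOfRangeError:
    last_queued = None

if last_queued is not None:
    consumer.seek(-1, 1)   # step back one from the current position and resume there
else:
    consumer.seek(0, 0)    # nothing usable: start from the head of the log

for m in consumer.get_messages(10, timeout=0.5):
    print("%d %s" % (m.offset, m.message.value))
consumer.commit()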
Example #43
    def run(self, options=None):

        # try:

        # Create table if it doesn't exist in the database
        if self.REDSHIFT.if_table_exists(self.TABLE_NAME) is False:
            self.REDSHIFT.execute(self.CREATE_TRACKING_TABLE)

        kafka = KafkaClient(
            config.get("kafka.host1") + "," + config.get("kafka.host2"))

        consumer = SimpleConsumer(kafka,
                                  self.GROUP_NAME,
                                  self.KAFKA_TOPIC,
                                  fetch_size_bytes=3000000,
                                  buffer_size=2000000000,
                                  max_buffer_size=2000000000)

        while True:

            # Prepare data for insert and copy to S3
            data_str = StringIO()
            csv_str = StringIO()
            count = 0

            # Get Offset from previous read
            s3_last_offset = self.get_s3_offset()

            last_offset = self.REDSHIFT.select(self.GET_OFFSET_QUERY)[0][0]
            last_offset = last_offset if last_offset else 0

            # Resolve difference in offset (s3 offset does not carry over from day to day)
            if s3_last_offset > last_offset:
                last_offset = s3_last_offset
                self.REDSHIFT.execute(
                    self.UPDATE_OFFSET_QUERY %
                    (self.GROUP_NAME, self.PARTITION, last_offset))

            print(last_offset)

            # Read from Offset
            consumer.seek(last_offset, 0)

            for message in consumer.get_messages(count=self.BATCH_SIZE,
                                                 block=False,
                                                 timeout=5):

                # Write tweets to StringIO
                self.write_to_data_str(message, data_str, csv_str)

                count += 1
                last_offset += 1

            # Store batch tweets to S3
            self.write_to_s3(data_str, csv_str, last_offset)

            # Track Kafka Offset
            self.REDSHIFT.execute(
                self.UPDATE_OFFSET_QUERY %
                (self.GROUP_NAME, self.PARTITION, last_offset))

            if count != self.BATCH_SIZE:
                break
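
# --- Stripped-down, hedged sketch of the manual offset-checkpoint loop in run()
# above, with a local file standing in for the Redshift/S3 offset store. Broker,
# topic, group, file path and batch size are assumptions.
import os
from kafka import KafkaClient, SimpleConsumer

OFFSET_FILE = "/tmp/tweet_consumer.offset"   # assumed checkpoint location
BATCH_SIZE = 500                             # assumed batch size

def load_offset():
    if os.path.exists(OFFSET_FILE):
        return int(open(OFFSET_FILE).read().strip() or 0)
    return 0

def save_offset(offset):
    with open(OFFSET_FILE, "w") as f:
        f.write(str(offset))

client = KafkaClient("localhost:9092")       # assumed broker
consumer = SimpleConsumer(client, "tweet-group", "tweets")

while True:
    last_offset = load_offset()
    consumer.seek(last_offset, 0)            # re-position relative to the head, as in run()
    count = 0
    for message in consumer.get_messages(count=BATCH_SIZE, block=False, timeout=5):
        # persist message.message.value to durable storage here before checkpointing
        count += 1
        last_offset += 1
    save_offset(last_offset)                 # checkpoint only after the batch is stored
    if count != BATCH_SIZE:                  # short batch: caught up, stop like run() does
        break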
Example #44
class OneteraScheduler(FronteraScheduler):
    def __init__(self, crawler):
        super(OneteraScheduler, self).__init__(crawler)
        self.job_config = {}
        self.is_active = False
        self.results = []
        self.results_sent = 0
        self.last_result_iteration = None

        settings = self.frontier.manager.settings
        self.results_topic = settings.get('ONETERA_RESULTS_TOPIC')
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self.consumer = SimpleConsumer(kafka,
                                       settings.get('ONETERA_GROUP'),
                                       settings.get('ONETERA_INCOMING_TOPIC'),
                                       buffer_size=262144,
                                       max_buffer_size=10485760,
                                       auto_commit_every_n=1)
        self.producer = SimpleProducer(kafka)
        self.status_updates_topic = settings.get(
            'ONETERA_STATUS_UPDATES_TOPIC')
        self.stats = {}

    def result_callback(self, result):
        self.results.append(result)

    def open(self, spider):
        super(OneteraScheduler, self).open(spider)
        spider.set_result_callback(self.result_callback)

    def has_pending_requests(self):
        if not self.is_active:
            return False
        return super(OneteraScheduler, self).has_pending_requests()

    def next_request(self):
        if not self.is_active:
            self._check_incoming()
        if self.is_active:
            return super(OneteraScheduler, self).next_request()
        return None

    def process_spider_output(self, response, result, spider):
        self._send_results()
        self._check_finished()
        return super(OneteraScheduler,
                     self).process_spider_output(response, result, spider)

    def process_exception(self, request, exception, spider):
        super(OneteraScheduler,
              self).process_exception(request, exception, spider)
        self._send_results()
        self._check_finished()

    def _check_finished(self):
        if not self.is_active:
            return
        if self.results_sent > self.job_config['nResults']:
            logger.info(
                "Crawler reached the number of requested results. Crawling is stopping."
            )
            self.is_active = False
        if self.last_result_iteration and self.frontier.manager.iteration - self.last_result_iteration > 10:
            logger.info("It looks like the crawler got stuck. Stopping crawling.")
            self.is_active = False

    def _check_incoming(self):
        consumed = 0
        try:
            for m in self.consumer.get_messages(count=1):
                try:
                    msg = loads(m.message.value)
                except ValueError, ve:
                    logger.error("Decoding error %s, message %s" %
                                 (ve, m.message.value))
                else:
                    logger.info(
                        "Got incoming message %s from incoming topic." %
                        m.message.value)

                    self.frontier.manager.backend.cleanup()
                    self._pending_requests.clear()
                    self.results = []
                    self.results_sent = 0
                    self.last_result_iteration = None

                    self.job_config = {
                        'workspace': msg['workspace'],
                        'nResults': msg['nResults'],
                        'excluded': msg['excluded'],
                        'included': msg['included'],
                        'relevantUrl': msg['relevantUrl'],
                        'irrelevantUrl': msg['irrelevantUrl'],
                    }
                    requests = [
                        Request(url, meta={'score': 1.0})
                        for url in msg['relevantUrl']
                    ]
                    if not requests:
                        raise Exception(
                            "Empty seeds list, can't bootstrap crawler.")
                    self.frontier.add_seeds(requests)
                    self.frontier.spider.configure(self.job_config)
                    self.is_active = True
                finally:
                    consumed += 1
Example #45
class TestRedisMonitor(TestCase):

    maxDiff = None
    queue_key = "link:istresearch.com:queue"

    def setUp(self):
        self.redis_monitor = RedisMonitor("localsettings.py")
        self.redis_monitor.settings = self.redis_monitor.wrapper.load(
            "localsettings.py")
        self.redis_monitor.logger = MagicMock()
        self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test"
        self.redis_monitor.settings['STATS_TOTAL'] = False
        self.redis_monitor.settings['STATS_PLUGINS'] = False
        self.redis_monitor.settings['PLUGINS'] = {
            'plugins.info_monitor.InfoMonitor': None,
            'plugins.stop_monitor.StopMonitor': None,
            'plugins.expire_monitor.ExpireMonitor': None,
            'tests.tests_online.CustomMonitor': 100,
        }
        self.redis_monitor.redis_conn = redis.Redis(
            host=self.redis_monitor.settings['REDIS_HOST'],
            port=self.redis_monitor.settings['REDIS_PORT'])

        self.redis_monitor._load_plugins()
        self.redis_monitor.stats_dict = {}

        self.kafka_conn = KafkaClient(
            self.redis_monitor.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose")

        self.consumer = SimpleConsumer(self.kafka_conn, "demo-id",
                                       "demo_test.outbound_firehose")

    def test_process_item(self):
        # we only want to go to the end now, not after this test is ran
        self.consumer.seek(0, 2)

        # set the info flag
        key = "info-test:blah"
        value = "ABC123"
        self.redis_monitor.redis_conn.set(key, value)

        # process the request
        plugin = self.redis_monitor.plugins_dict.items()[0][1]
        self.redis_monitor._process_plugin(plugin)

        # ensure the key is gone
        self.assertEquals(self.redis_monitor.redis_conn.get(key), None)

    def test_sent_to_kafka(self):
        success = {u'info-test': "ABC123", u"appid": u"someapp"}

        # ensure it was sent out to kafka
        message_count = 0
        for message in self.consumer.get_messages():
            if message is None:
                break
            else:
                the_dict = json.loads(message.message.value)
                self.assertEquals(success, the_dict)
                message_count += 1

        self.assertEquals(message_count, 1)
Example #46
class ScoringWorker(object):
    def __init__(self, settings, strategy_module):
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or not isinstance(partition_id, int):
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                           settings.get('SCORING_GROUP'),
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760,
                                           partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0

    def work(self):
        consumed = 0
        batch = []
        fingerprints = set()
        try:
            for m in self._in_consumer.get_messages(
                    count=self.consumer_batch_size, block=True, timeout=1.0):
                try:
                    msg = self._decoder.decode(m.message.value)
                except (KeyError, TypeError) as e:
                    logger.error("Decoding error: %s", e)
                    continue
                else:
                    type = msg[0]
                    batch.append(msg)
                    if type == 'add_seeds':
                        _, seeds = msg
                        fingerprints.update(
                            map(lambda x: x.meta['fingerprint'], seeds))
                        continue

                    if type == 'page_crawled':
                        _, response, links = msg
                        fingerprints.add(response.meta['fingerprint'])
                        fingerprints.update(
                            map(lambda x: x.meta['fingerprint'], links))
                        continue

                    if type == 'request_error':
                        _, request, error = msg
                        fingerprints.add(request.meta['fingerprint'])
                        continue

                    raise TypeError('Unknown message type %s' % type)
                finally:
                    consumed += 1
        except OffsetOutOfRangeError as e:
            # https://github.com/mumrah/kafka-python/issues/263
            self._in_consumer.seek(0, 2)  # moving to the tail of the log
            logger.info(
                "Caught OffsetOutOfRangeError, moving to the tail of the log.")

        self.backend.fetch_states(list(fingerprints))
        fingerprints.clear()
        results = []
        for msg in batch:
            if len(results) > 1024:
                self._producer.send_messages(self.outgoing_topic, *results)
                results = []

            type = msg[0]
            if type == 'add_seeds':
                _, seeds = msg
                for seed in seeds:
                    seed.meta['jid'] = self.job_id
                results.extend(self.on_add_seeds(seeds))
                continue

            if type == 'page_crawled':
                _, response, links = msg
                if response.meta['jid'] != self.job_id:
                    continue
                results.extend(self.on_page_crawled(response, links))
                continue

            if type == 'request_error':
                _, request, error = msg
                if request.meta['jid'] != self.job_id:
                    continue
                results.extend(self.on_request_error(request, error))
                continue
        if len(results):
            self._producer.send_messages(self.outgoing_topic, *results)

        if self.cache_flush_counter == 30:
            logger.info("Flushing states")
            self.backend.flush_states(is_clear=False)
            logger.info("Flushing states finished")
            self.cache_flush_counter = 0

        self.cache_flush_counter += 1

        if self.strategy.finished():
            logger.info("Successfully reached the crawling goal. Exiting.")
            exit(0)

        logger.info("Consumed %d items.", consumed)
        self.stats['last_consumed'] = consumed
        self.stats['last_consumption_run'] = asctime()

    def run(self):
        while True:
            self.work()

    def on_add_seeds(self, seeds):
        logger.info('Adding %i seeds', len(seeds))
        seed_map = dict(
            map(lambda seed: (seed.meta['fingerprint'], seed), seeds))
        self.backend.update_states(seeds, False)
        scores = self.strategy.add_seeds(seeds)
        self.backend.update_states(seeds, True)

        output = []
        for fingerprint, score in scores.iteritems():
            seed = seed_map[fingerprint]
            logger.debug('URL: %s', seed.url)
            if score is not None:
                encoded = self._encoder.encode_update_score(
                    seed.meta['fingerprint'], score, seed.url, True)
                output.append(encoded)
        return output

    def on_page_crawled(self, response, links):
        logger.debug("Page crawled %s", response.url)
        objs_list = [response]
        objs_list.extend(links)
        objs = dict(map(lambda obj: (obj.meta['fingerprint'], obj), objs_list))
        self.backend.update_states(objs_list, False)
        scores = self.strategy.page_crawled(response, links)
        self.backend.update_states(objs_list, True)

        output = []
        for fingerprint, score in scores.iteritems():
            obj = objs[fingerprint]
            if score is not None:
                encoded = self._encoder.encode_update_score(
                    obj.meta['fingerprint'], score, obj.url, True)
                output.append(encoded)
        return output

    def on_request_error(self, request, error):
        self.backend.update_states(request, False)
        scores = self.strategy.page_error(request, error)
        self.backend.update_states(request, True)
        assert len(scores) == 1
        fingerprint, score = scores.popitem()
        if score is not None:
            encoded = self._encoder.encode_update_score(
                request.meta['fingerprint'], score, request.url, False)
            return [encoded]
        return []
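
# --- Isolated, hedged sketch of the OffsetOutOfRangeError guard used in work()
# above (the workaround for https://github.com/mumrah/kafka-python/issues/263):
# if the stored offset has fallen off the log, seek to the tail and carry on.
# Broker, group and topic names are assumptions.
from kafka import KafkaClient, SimpleConsumer
from kafka.common import OffsetOutOfRangeError

client = KafkaClient("localhost:9092")
consumer = SimpleConsumer(client, "scoring", "frontier-incoming",
                          buffer_size=1048576, max_buffer_size=10485760)

try:
    batch = consumer.get_messages(count=128, block=True, timeout=1.0)
except OffsetOutOfRangeError:
    consumer.seek(0, 2)   # move to the tail of the log, as work() does
    batch = []

for m in batch:
    print(m.message.value)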
Example #47
# In[ ]:

import json


# In[1]:

from kafka import KafkaClient, SimpleConsumer
kafka = KafkaClient("ec2-52-26-15-148.us-west-2.compute.amazonaws.com:9092")
consumer = SimpleConsumer(kafka, "my-group", "moviereview9")


# In[10]:

#time.sleep(6)
messages = consumer.get_messages(100)
print messages


# In[41]:

jsonList = [json.loads(message.message.value) for message in messages]
print jsonList


# In[42]:

print json.dumps(jsonList)


# In[43]:
Example #49
import sys
import json
import ConfigParser
from functools import partial

from kafka import KafkaClient, SimpleConsumer

import storage    # project-local module providing the VerdictDB implementations
import utils      # project-local helper providing load_class()

if len(sys.argv) < 2:
    print 'Usage: {} <config>'.format(sys.argv[0])
    sys.exit(-1)

config = ConfigParser.SafeConfigParser()
config.read(sys.argv[1])

# initialize data_feed
data_feed = KafkaClient(config.get('data_feed', 'host'))
consumer = SimpleConsumer(data_feed,
                          group=config.get('data_feed', 'group'),
                          topic=config.get('data_feed', 'topic'))
fetch_num_messages = config.getint('data_feed', 'fetch_num_messages')
#initialize data_sink
sink_config = dict(config.items('data_sink'))
class_name = sink_config.pop('class')
classobject = utils.load_class(class_name, storage.VerdictDB)
db = classobject(None, **sink_config)
removed = added = 0
for msg in consumer.get_messages(count=fetch_num_messages, block=False):
    try:
        msg = json.loads(msg.message.value)
        blacklist_type = msg['type']
        map(partial(db.insert, blacklist_type), msg['add'])
        added += len(msg['add'])
        map(partial(db.delete, blacklist_type), msg['remove'])
        removed += len(msg['remove'])
    except Exception as e:
        print '[{}] Error ingesting msg {}'.format(e, msg)
print 'added %d urls removed %d urls' % (added, removed)
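
# --- The script above is driven entirely by an INI-style config: [data_feed]
# configures the Kafka consumer and [data_sink] names a VerdictDB class plus
# its keyword arguments. A hypothetical example config (every value here is an
# assumption), written out from Python so it can be generated for a quick test run:
example_config = """
[data_feed]
host = localhost:9092
group = blacklist-ingest
topic = blacklist-updates
fetch_num_messages = 500

[data_sink]
class = storage.redis_sink.RedisVerdictDB
host = localhost
port = 6379
"""

with open("/tmp/ingest_example.cfg", "w") as f:
    f.write(example_config)
# then run:  python ingest_blacklist.py /tmp/ingest_example.cfg   (script name assumed)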
Example #50
import os
import sys
from kafka import KafkaClient, SimpleConsumer

#kafka=KafkaClient("ec2-52-8-194-192.us-west-1.compute.amazonaws.com:9092")
kafka=KafkaClient("ec2-52-8-194-192.us-west-1.compute.amazonaws.com")

kafka_consumer=SimpleConsumer(kafka,"my_group","filmon-topic2")

messages= kafka_consumer.get_messages(count=1000, block=False)
if not messages:
    print "no messages to read"
for message in messages:
    print message.message.value
Example #51
# stdlib
from collections import defaultdict

# 3p
from kafka import SimpleClient, SimpleConsumer

kafka_conn = SimpleClient("192.168.208.2:9092")
consumer = SimpleConsumer(kafka_conn,
                          "sample_check",
                          "test-topic",
                          auto_commit=True)

for message in consumer.get_messages(count=10):
    print message.offset
    consumer.commit()
Example #52
    def _run(self):
        pcount = 0
        while True:
            try:
                self._logger.error("New KafkaClient %d" % self._partition)
                self._kfk = KafkaClient(self._brokers, str(os.getpid()))
                try:
                    consumer = SimpleConsumer(self._kfk,
                                              self._group,
                                              self._topic,
                                              buffer_size=4096 * 4,
                                              max_buffer_size=4096 * 32)
                    #except:
                except Exception as ex:
                    template = "Consumer Failure {0} occurred. Arguments:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.info("%s" % messag)
                    raise RuntimeError(messag)

                self._logger.error("Starting %d" % self._partition)

                # Find the offset of the last message that has been queued
                consumer.seek(0, 2)
                try:
                    mi = consumer.get_message(timeout=0.1)
                    consumer.commit()
                except common.OffsetOutOfRangeError:
                    mi = None
                #import pdb; pdb.set_trace()
                self._logger.info("Last Queued for %d is %s" % \
                                  (self._partition,str(mi)))

                # start reading from last previously processed message
                if mi != None:
                    consumer.seek(0, 1)
                else:
                    consumer.seek(0, 0)

                if self._limit:
                    raise gevent.GreenletExit

                while True:
                    try:
                        self.resource_check()
                        mlist = consumer.get_messages(10, timeout=0.2)
                        for mm in mlist:
                            if mm is None:
                                continue
                            self._logger.debug("%d Reading offset %d" % \
                                    (self._partition, mm.offset))
                            consumer.commit()
                            pcount += 1
                            if not self.msg_handler(mm):
                                self._logger.info("%d could not handle %s" %
                                                  (self._partition, str(mm)))
                                raise gevent.GreenletExit
                    except TypeError as ex:
                        self._logger.error("Type Error: %s trace %s" % \
                                (str(ex.args), traceback.format_exc()))
                        gevent.sleep(0.1)
                    except common.FailedPayloadsError as ex:
                        self._logger.error("Payload Error: %s" % str(ex.args))
                        gevent.sleep(0.1)
            except gevent.GreenletExit:
                break
            except AssertionError as ex:
                self._partoffset = ex
                break
            except Exception as ex:
                template = "An exception of type {0} occurred. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s" % \
                                  (messag, traceback.format_exc()))
                self.stop_partition()
                gevent.sleep(2)

        partdb = {}
        for coll in self._uvedb.keys():
            partdb[coll] = {}
            for gen in self._uvedb[coll].keys():
                partdb[coll][gen] = {}
                for tab in self._uvedb[coll][gen].keys():
                    for rkey in self._uvedb[coll][gen][tab].keys():
                        uk = tab + ":" + rkey
                        partdb[coll][gen][uk] = \
                            set(self._uvedb[coll][gen][tab][rkey].keys())

        self._logger.error("Stopping %d pcount %d" % (self._partition, pcount))
        self.stop_partition()
        return self._partoffset, partdb
Example #53
class ScoringWorker(object):
    def __init__(self, settings, strategy_module):
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or not isinstance(partition_id, int):
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                           settings.get('SCORING_GROUP'),
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760,
                                           partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model,
                                self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0

    def work(self):
        consumed = 0
        batch = []
        fingerprints = set()
        try:
            for m in self._in_consumer.get_messages(
                    count=self.consumer_batch_size, block=True, timeout=1.0):
                try:
                    msg = self._decoder.decode(m.message.value)
                except (KeyError, TypeError), e:
                    logger.error("Decoding error: %s", e)
                    continue
                else:
                    type = msg[0]
                    batch.append(msg)
                    if type == 'add_seeds':
                        _, seeds = msg
                        fingerprints.update(
                            map(lambda x: x.meta['fingerprint'], seeds))
                        continue

                    if type == 'page_crawled':
                        _, response, links = msg
                        fingerprints.add(response.meta['fingerprint'])
                        fingerprints.update(
                            map(lambda x: x.meta['fingerprint'], links))
                        continue

                    if type == 'request_error':
                        _, request, error = msg
                        fingerprints.add(request.meta['fingerprint'])
                        continue

                    raise TypeError('Unknown message type %s' % type)
                finally:
                    consumed += 1
Example #54
class TestRedisMonitor(TestCase):

    maxDiff = None
    queue_key = "link:istresearch.com:queue"

    def setUp(self):
        self.redis_monitor = RedisMonitor("localsettings.py")
        self.redis_monitor.settings = self.redis_monitor.wrapper.load("localsettings.py")
        self.redis_monitor.logger = MagicMock()
        self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test"
        self.redis_monitor.settings['STATS_TOTAL'] = False
        self.redis_monitor.settings['STATS_PLUGINS'] = False
        self.redis_monitor.settings['PLUGINS'] = {
            'plugins.info_monitor.InfoMonitor': None,
            'plugins.stop_monitor.StopMonitor': None,
            'plugins.expire_monitor.ExpireMonitor': None,
            'tests.tests_online.CustomMonitor': 100,
        }
        self.redis_monitor.redis_conn = redis.Redis(
            host=self.redis_monitor.settings['REDIS_HOST'],
            port=self.redis_monitor.settings['REDIS_PORT'])

        self.redis_monitor._load_plugins()
        self.redis_monitor.stats_dict = {}

        self.kafka_conn = KafkaClient(self.redis_monitor.settings[
                                      'KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose")

        self.consumer = SimpleConsumer(
            self.kafka_conn,
            "demo-id",
            "demo_test.outbound_firehose"
        )

    def test_process_item(self):
        # we only want to go to the end now, not after this test is ran
        self.consumer.seek(0, 2)

        # set the info flag
        key = "info-test:blah"
        value = "ABC123"
        self.redis_monitor.redis_conn.set(key, value)

        # process the request
        plugin = self.redis_monitor.plugins_dict.items()[0][1]
        self.redis_monitor._process_plugin(plugin)

        # ensure the key is gone
        self.assertEquals(self.redis_monitor.redis_conn.get(key), None)

    def test_sent_to_kafka(self):
        success = {
            u'info-test': "ABC123",
            u"appid": u"someapp"
        }

        # ensure it was sent out to kafka
        message_count = 0
        for message in self.consumer.get_messages():
            if message is None:
                break
            else:
                the_dict = json.loads(message.message.value)
                self.assertEquals(success, the_dict)
                message_count += 1

        self.assertEquals(message_count, 1)
Example #55
class HHStrategyWorker(ScoringWorker):

    worker_prefix = 'hh-strategy-worker'

    def __init__(self, settings):
        super(HHStrategyWorker, self).__init__(settings, topic)
        self.slot = Slot(log_processing=self.work,
                         incoming=self.incoming,
                         outgoing=self.outgoing,
                         is_master=settings.get("FRONTERA_MASTER"))
        kafka_hh = KafkaClient(settings.get('KAFKA_LOCATION_HH'))
        self.consumer_hh = SimpleConsumer(
            kafka_hh,
            settings.get('FRONTERA_GROUP'),
            settings.get('FRONTERA_INCOMING_TOPIC'),
            buffer_size=262144,
            max_buffer_size=10485760,
            auto_commit_every_n=1)
        self.producer_hh = SimpleProducer(kafka_hh)
        self.results_topic = settings.get("FRONTERA_RESULTS_TOPIC")
        self.job_config = {}
        self.zookeeper = ZookeeperSession(settings.get('ZOOKEEPER_LOCATION'),
                                          name_prefix=self.worker_prefix)

        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self.partitions_count = len(
            kafka.get_partition_ids_for_topic(settings.get('INCOMING_TOPIC')))
        self.null_cycles = 0

    def set_process_info(self, process_info):
        self.process_info = process_info
        self.zookeeper.set(process_info)

    def run(self):
        def onStart():
            self.reset()
            self.slot.schedule()

        if self.slot.is_master:
            reset = CallLaterOnce(onStart)
            reset.setErrback(self.slot.error)
            reset.schedule(5.0)
        else:
            self.slot.schedule()
        reactor.run()

    def work(self):
        super(HHStrategyWorker, self).work()
        if self.stats['last_consumed'] == 0:
            self.null_cycles += 1
        else:
            self.null_cycles = 0
        if self.null_cycles == 500:
            logger.info("It seems the crawler got stuck, performing self reset.")
            self.reset()
            self.null_cycles = 0

    def incoming(self):
        if self.slot.is_active:
            return

        if not self.slot.is_master:
            logger.warn(
                "Incoming topic shouldn't be consumed on slave instances.")

        consumed = 0
        try:
            for m in self.consumer_hh.get_messages(count=1):
                try:
                    msg = loads(m.message.value)
                except ValueError, ve:
                    logger.error("Decoding error %s, message %s" %
                                 (ve, m.message.value))
                else:
                    logger.info(
                        "Got incoming message %s from incoming topic." %
                        m.message.value)
                    job_config = {
                        'workspace': msg['workspace'],
                        'nResults':
                        msg.get('nResults', 0) / self.partitions_count,
                        'excluded': msg['excluded'],
                        'included': msg['included'],
                        'relevantUrl': msg['relevantUrl'],
                        'irrelevantUrl': msg['irrelevantUrl'],
                    }
                    self.reset()
                    self.setup(job_config['relevantUrl'], job_config)
                finally:
                    consumed += 1