def test_switch_leader_simple_consumer(self):
    producer = Producer(self.client, async=False)
    consumer = SimpleConsumer(self.client, None, self.topic,
                              partitions=None, auto_commit=False,
                              iter_timeout=10)
    self._send_random_messages(producer, self.topic, 0, 2)
    consumer.get_messages()
    self._kill_leader(self.topic, 0)
    consumer.get_messages()
def test_simple_consumer_failed_payloads(self):
    client = MagicMock()
    consumer = SimpleConsumer(client, group=None, topic='topic',
                              partitions=[0, 1], auto_commit=False)

    def failed_payloads(payload):
        return FailedPayloadsError(payload)

    client.send_fetch_request.side_effect = self.fail_requests_factory(failed_payloads)

    # This should not raise an exception
    consumer.get_messages(5)
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consumer loading topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    # open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp, batch_counter)
    tempfile = open(tempfile_path, "w")

    while True:
        # get up to 100 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=100, block=False)
        if not messages:
            continue  # if no messages were received, poll again
        for message in messages:
            print(message.message.value)
            tempfile.write(message.message.value)
            tempfile.write("\n")
        if tempfile.tell() > 12000:  # flush once the file grows past ~12 KB
            print "Note: file is large enough to write to hdfs. Writing now..."
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # inform zookeeper of position in the kafka queue
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)

    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    while True:
        # get 1000 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=1000, block=False)
        if not messages:
            os.system("sleep 30s")
            continue
        # each item is an OffsetAndMessage(offset=..., message=Message(magic=0, attributes=0, key=None, value='...'))
        for message in messages:
            print message
        kafka_consumer.commit()  # save position in the kafka queue

    # exit loop (note: unreachable while the loop above never breaks)
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # save position in the kafka queue
    return 0
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)

    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    # open file for writing
    tempfile_path = "/tmp/kafka_stockTwits_%s_%s_%s_%s.dat" % (topic, group, timestamp, batch_counter)
    tempfile = open(tempfile_path, "w")
    log_has_at_least_one = False  # did we log at least one entry?

    while True:
        # get 1000 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=1000, block=False)
        if not messages:
            os.system("sleep 300s")  # sleep 5 minutes
            continue
        # each item is an OffsetAndMessage(offset=..., message=Message(magic=0, attributes=0, key=None, value='...'))
        for message in messages:
            log_has_at_least_one = True
            tempfile.write(message.message.value + "\n")
        if tempfile.tell() > 10000000:  # file size > 10MB
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # save position in the kafka queue

    # exit loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # save position in the kafka queue
    return 0
class KafkaConsumer(object):
    def __init__(self, conf):
        self.log = logging.getLogger(__name__)
        self.client = KafkaClient(conf["kafka_server"])
        self.total_inserts = 0
        self.inserts = 0
        self.listenstore = None

    def start_listens(self, listenstore):
        self.listenstore = listenstore
        return self.start(b"listen-group", b"listens")

    def start(self, group_name, topic_name):
        self.group_name = group_name
        self.topic_name = topic_name
        self.log.info("KafkaConsumer subscribed to %s -> %s" % (group_name, topic_name))
        self.consumer = SimpleConsumer(self.client, self.group_name, self.topic_name)

        t0 = 0
        last_offset = -1
        while True:
            listens = []
            if t0 == 0:
                t0 = time()
            messages = self.consumer.get_messages(count=CASSANDRA_BATCH_SIZE, block=True, timeout=KAFKA_READ_TIMEOUT)
            for message in messages:
                try:
                    data = ujson.loads(message.message.value)
                    listens.append(Listen.from_json(data))
                except ValueError as e:
                    self.log.error("Cannot parse JSON: %s\n'%s'" % (str(e), message.message.value))
                    continue
                last_offset = message.offset

            if listens:
                broken = True
                while broken:
                    try:
                        self.listenstore.insert_batch(listens)
                        broken = False
                    except ValueError as e:
                        self.log.error("Cannot insert listens: %s" % unicode(e))
                        broken = False
                    except NoHostAvailable as e:
                        self.log.error("Cannot insert listens: %s. Sleeping, trying again." % unicode(e))
                        sleep(5)

            self.inserts += len(messages)
            if self.inserts >= REPORT_FREQUENCY:
                t1 = time()
                self.total_inserts += self.inserts
                self.log.info("Inserted %d rows in %.1fs (%.2f listens/sec). Total %d rows. last offset: %d" %
                              (self.inserts, t1 - t0, self.inserts / (t1 - t0), self.total_inserts, last_offset))
                self.inserts = 0
                t0 = 0
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consuming from topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)

    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    # open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.dat" % (topic, group, timestamp, batch_counter)
    tempfile = open(tempfile_path, "w")
    log_has_at_least_one = False  # did we log at least one entry?

    while True:
        # get 1000 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=1000, block=False)
        if not messages:
            break
        # each item is an OffsetAndMessage(offset=..., message=Message(magic=0, attributes=0, key=None, value='...'))
        for message in messages:
            log_has_at_least_one = True
            tempfile.write(message.message.value + "\n")
        if tempfile.tell() > 10000000:  # file size > 10MB
            flush_to_hdfs(output_dir, topic)
            kafka_consumer.commit()

    # exit loop
    if log_has_at_least_one:
        flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # save position in the kafka queue
    return 0
class ScoringWorker(object):
    def __init__(self, settings, strategy_module):
        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY)
        partition_id = settings.get('SCORING_PARTITION_ID')
        if partition_id is None or type(partition_id) != int:
            raise AttributeError("Scoring worker partition id isn't set.")
        self._in_consumer = SimpleConsumer(kafka,
                                           settings.get('SCORING_GROUP'),
                                           settings.get('INCOMING_TOPIC'),
                                           buffer_size=1048576,
                                           max_buffer_size=10485760,
                                           partitions=[partition_id])

        self._manager = FrontierManager.from_settings(settings)
        self._decoder = Decoder(self._manager.request_model, self._manager.response_model)
        self._encoder = Encoder(self._manager.request_model)

        self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128)
        self.outgoing_topic = settings.get('SCORING_TOPIC')
        self.strategy = strategy_module.CrawlingStrategy()
        self.backend = self._manager.backend
        self.stats = {}
        self.cache_flush_counter = 0
        self.job_id = 0

    def work(self):
        consumed = 0
        batch = []
        fingerprints = set()
        try:
            for m in self._in_consumer.get_messages(count=self.consumer_batch_size, block=True, timeout=1.0):
                try:
                    msg = self._decoder.decode(m.message.value)
                except (KeyError, TypeError) as e:
                    logger.error("Decoding error: %s", e)
                    continue
                else:
                    type = msg[0]
                    batch.append(msg)
                    if type == 'add_seeds':
                        _, seeds = msg
                        fingerprints.update(map(lambda x: x.meta['fingerprint'], seeds))
                        continue
                    if type == 'page_crawled':
                        _, response, links = msg
                        fingerprints.add(response.meta['fingerprint'])
                        fingerprints.update(map(lambda x: x.meta['fingerprint'], links))
                        continue
                    if type == 'request_error':
                        _, request, error = msg
                        fingerprints.add(request.meta['fingerprint'])
                        continue
                    raise TypeError('Unknown message type %s' % type)
                finally:
                    consumed += 1
def test_simple_consumer_unknown_topic_partition(self):
    client = MagicMock()
    consumer = SimpleConsumer(client, group=None, topic='topic',
                              partitions=[0, 1], auto_commit=False)

    # Mock so that only the first request gets a valid response
    def unknown_topic_partition(request):
        return FetchResponsePayload(request.topic, request.partition,
                                    UnknownTopicOrPartitionError.errno, -1, ())

    client.send_fetch_request.side_effect = self.fail_requests_factory(unknown_topic_partition)

    # Getting messages should surface the UnknownTopicOrPartitionError
    with self.assertRaises(UnknownTopicOrPartitionError):
        consumer.get_messages(20)
def test_ts(self):
    kafka = KafkaClient(config.get("kafka.host1") + "," + config.get("kafka.host2"))
    consumer = SimpleConsumer(kafka, self.GROUP_NAME, self.KAFKA_TOPIC,
                              fetch_size_bytes=3000000,
                              buffer_size=2000000000,
                              max_buffer_size=2000000000)
    while True:
        print("HELLO")
        # Prepare data for insert and copy to S3 (StringIO/S3 steps left disabled)
        count = 0
        consumer.seek(2, 0)
        for message in consumer.get_messages(count=100, block=False, timeout=0.1):
            count += 1
            print(message.message.value)
            # self.write_to_data_str(message, data_str)
        # self.write_to_s3(data_str, last_offset)
        if count != 100:
            break
def consume_topic(topic, group, output_dir, frequency):
    global timestamp, tempfile_path, tempfile
    print "Consumer loading topic '%s' in consumer group %s into %s..." % (topic, group, output_dir)
    timestamp = standardized_timestamp(frequency)
    kafka_consumer = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    # open file for writing
    tempfile_path = "/tmp/kafka_%s_%s_%s_%s.txt" % (topic, group, timestamp, batch_counter)
    tempfile = open(tempfile_path, "w")

    while True:
        # get up to 100 messages at a time, non-blocking
        messages = kafka_consumer.get_messages(count=100, block=False)
        if not messages:
            continue  # if no messages were received, poll again
        for message in messages:
            # the trailing '\n' is intentionally dropped here
            tempfile.write(message.message.value)
        if tempfile.tell() > 120000000:  # file size > 120MB
            print "Note: file is large enough to write to hdfs. Writing now..."
            flush_to_hdfs(output_dir, topic)
        kafka_consumer.commit()  # inform zookeeper of position in the kafka queue
def consume_save(group, topic):
    # tmp_save = open(tmp_file_path, "w")
    # create the consumer once, outside the polling loop
    kafka_consumer = SimpleConsumer(kafka, group, topic)
    while True:
        messages = kafka_consumer.get_messages(count=1000, block=False)
        if not messages:
            print "Consumer didn't read any messages"
        for message in messages:
            # tmp_save.write(message.message.value + "\n")
            print message.message.value + "\n"
def test_simple_consumer_leader_change(self):
    client = MagicMock()
    consumer = SimpleConsumer(client, group=None, topic='topic',
                              partitions=[0, 1], auto_commit=False)

    # Mock so that only the first request gets a valid response
    def not_leader(request):
        return FetchResponsePayload(request.topic, request.partition,
                                    NotLeaderForPartitionError.errno, -1, ())

    client.send_fetch_request.side_effect = self.fail_requests_factory(not_leader)

    # This should not raise an exception
    consumer.get_messages(20)

    # client should have updated metadata
    self.assertGreaterEqual(client.reset_topic_metadata.call_count, 1)
    self.assertGreaterEqual(client.load_metadata_for_topics.call_count, 1)
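# The three mocked-client tests above (failed payloads, unknown topic/partition, leader
# change) rely on a fail_requests_factory helper that is not shown in this collection.
# Below is a minimal sketch of what such a helper could look like, assuming the
# FetchResponsePayload and OffsetAndMessage structs from kafka-python; the real helper
# in the original test suite may differ in detail.
def fail_requests_factory(self, error_factory):
    # Only the first fetch request gets a valid response containing one message;
    # every subsequent request receives the injected error payload, so the
    # consumer's retry / error-handling path is exercised.
    def fail_requests(payloads, **kwargs):
        responses = [
            FetchResponsePayload(payloads[0].topic, payloads[0].partition, 0, 0,
                                 [OffsetAndMessage(0, "msg")]),
        ]
        for failure in payloads[1:]:
            responses.append(error_factory(failure))
        return responses
    return fail_requests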
def consume_topic(self, topic, group, temp_dir):
    '''
    Receive messages from the Friendsquare topic, save them to a temporary
    file under temp_dir, then transfer the file to hdfs.
    '''
    # Create a kafka receiver to grab messages
    kafka_receiver = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)

    # Create a temp file to store messages
    self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))
    temp_file = open(self.temp_file_path, 'w')

    # Create a hdfs directory to store output files
    hdfs_output_dir = "%s/%s" % (self.hdfs_dir, topic)
    os.system("hdfs dfs -mkdir -p %s" % hdfs_output_dir)

    while self.count < self.max_count:
        # Get 1000 messages each time
        messages = kafka_receiver.get_messages(count=1000, block=False)
        if not messages:
            continue

        # Write the messages to the file, one message per line
        for message in messages:
            temp_file.write(message.message.value + '\n')

        # Cap each file at roughly 20 MB
        if temp_file.tell() > 20000000:
            temp_file.close()

            # Put the file to hdfs
            hdfs_path = "%s/%s.txt" % (hdfs_output_dir, self.count)
            os.system("hdfs dfs -put -f %s %s" % (self.temp_file_path, hdfs_path))

            # Remove the old file
            os.remove(self.temp_file_path)

            # Create a new temp file to store messages
            self.count += 1
            self.temp_file_path = "%s/%s.txt" % (temp_dir, str(self.count))
            temp_file = open(self.temp_file_path, 'w')

        # Inform zookeeper of position in the kafka queue
        kafka_receiver.commit()

    temp_file.close()
def consume_topic(self, topic, group, temp_dir):
    '''
    Receive messages from Kafka, save them to a temporary file first,
    then transfer the file to hdfs.
    '''
    # Create a kafka receiver to grab messages
    kafka_receiver = SimpleConsumer(kafka, group, topic, max_buffer_size=1310720000)
    self.timestamp = self.getTimestamp()

    # Create a temp file to store messages
    self.temp_file_path = "%s/%s_%s.txt" % (temp_dir, self.timestamp, str(self.count))
    temp_file = open(self.temp_file_path, 'w')

    while self.count < self.max_count:
        # Get 100 messages each time
        messages = kafka_receiver.get_messages(count=100, block=False)
        if not messages:
            continue

        # Write the messages to the file, one message per line
        for message in messages:
            temp_file.write(message.message.value + '\n')

        # For structured streaming, files need to be small at this point: cap at ~2 MB
        if temp_file.tell() > 2000000:
            temp_file.close()

            # Copy the file to hdfs
            output_dir = "%s/%s" % (self.hdfs_dir, topic)
            os.system("hdfs dfs -mkdir %s" % output_dir)
            hdfs_path = "%s/%s_%s.txt" % (output_dir, self.timestamp, self.count)
            os.system("hdfs dfs -put -f %s %s" % (self.temp_file_path, hdfs_path))

            # Remove the old file
            os.remove(self.temp_file_path)

            # Create a new temp file to store messages
            self.count += 1
            self.timestamp = self.getTimestamp()
            self.temp_file_path = "%s/%s_%s.txt" % (temp_dir, self.timestamp, str(self.count))
            temp_file = open(self.temp_file_path, 'w')

        # Inform zookeeper of position in the kafka queue
        kafka_receiver.commit()

    temp_file.close()
def consume_save(group, topic):
    tmp_save = open(tmp_file_path, "w")
    kafka_consumer = SimpleConsumer(kafka, group, topic)
    messages = kafka_consumer.get_messages(count=1000, block=False)
    if not messages:
        print "Consumer didn't read any messages"
    for message in messages:
        tmp_save.write(message.message.value + "\n")
        # print message.message.value + "\n"
    kafka_consumer.commit()  # inform zookeeper of position in the kafka queue
    print ".... ... .. .."
    print "Message from topic \"%s\" consumed \n" % topic
def run(self, options=None):
    # Create table if it doesn't exist in the database
    if self.REDSHIFT.if_table_exists(self.TABLE_NAME) is False:
        self.REDSHIFT.execute(self.CREATE_TRACKING_TABLE)

    kafka = KafkaClient(config.get("kafka.host1") + "," + config.get("kafka.host2"))
    consumer = SimpleConsumer(kafka, self.GROUP_NAME, self.KAFKA_TOPIC,
                              fetch_size_bytes=3000000,
                              buffer_size=2000000000,
                              max_buffer_size=2000000000)

    while True:
        # Prepare data for insert and copy to S3
        data_str = StringIO()
        csv_str = StringIO()
        count = 0

        # Get offset from previous read
        s3_last_offset = self.get_s3_offset()
        last_offset = self.REDSHIFT.select(self.GET_OFFSET_QUERY)[0][0]
        last_offset = last_offset if last_offset else 0

        # Resolve difference in offset (s3 offset does not carry over from day to day)
        if s3_last_offset > last_offset:
            last_offset = s3_last_offset
            self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY % (self.GROUP_NAME, self.PARTITION, last_offset))
        print(last_offset)

        # Read from offset
        consumer.seek(last_offset, 0)
        for message in consumer.get_messages(count=self.BATCH_SIZE, block=False, timeout=5):
            # Write tweets to StringIO
            self.write_to_data_str(message, data_str, csv_str)
            count += 1
            last_offset += 1

        # Store batch tweets to S3
        self.write_to_s3(data_str, csv_str, last_offset)

        # Track Kafka offset
        self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY % (self.GROUP_NAME, self.PARTITION, last_offset))

        if count != self.BATCH_SIZE:
            break
class Consumer(BaseStreamConsumer):
    """
    Used in DB and SW worker. SW consumes per partition.
    """
    def __init__(self, conn, topic, group, partition_id):
        self._conn = conn
        self._group = group
        self._topic = topic
        self._partition_ids = [partition_id] if partition_id is not None else None
        self._cons = None
        self._connect_consumer()

    def _connect_consumer(self):
        if self._cons is None:
            try:
                self._cons = SimpleConsumer(
                    self._conn,
                    self._group,
                    self._topic,
                    partitions=self._partition_ids,
                    buffer_size=1048576,
                    max_buffer_size=10485760)
            except BrokerResponseError:
                self._cons = None
                logger.warning("Could not connect consumer to Kafka server")
                return False
        return True

    def get_messages(self, timeout=0.1, count=1):
        if not self._connect_consumer():
            yield
        while True:
            try:
                for offmsg in self._cons.get_messages(count, timeout=timeout):
                    try:
                        yield offmsg.message.value
                    except ValueError:
                        logger.warning(
                            "Could not decode {0} message: {1}".format(
                                self._topic, offmsg.message.value))
            except Exception as err:
                logger.warning("Error %s" % err)
            finally:
                break

    def get_offset(self):
        return 0
def validate_samza_job():
    """
    Validates that negate-number negated all messages, and sent the output to
    samza-test-topic-output.
    """
    logger.info('Running validate_samza_job')
    kafka = _get_kafka_client()
    kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
    consumer = SimpleConsumer(kafka, 'samza-test-group', TEST_OUTPUT_TOPIC)
    messages = consumer.get_messages(count=NUM_MESSAGES, block=True, timeout=60)
    message_count = len(messages)
    assert NUM_MESSAGES == message_count, 'Expected {0} lines, but found {1}'.format(NUM_MESSAGES, message_count)
    for message in map(lambda m: m.message.value, messages):
        assert int(message) < 0, 'Expected negative integer but received {0}'.format(message)
    kafka.close()
def validate_samza_job():
    """
    Validates that negate-number negated all messages, and sent the output to
    samza-test-topic-output.
    """
    logger.info("Running validate_samza_job")
    kafka = util.get_kafka_client()
    kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
    consumer = SimpleConsumer(kafka, "samza-test-group", TEST_OUTPUT_TOPIC)
    messages = consumer.get_messages(count=NUM_MESSAGES, block=True, timeout=300)
    message_count = len(messages)
    assert NUM_MESSAGES == message_count, "Expected {0} lines, but found {1}".format(NUM_MESSAGES, message_count)
    for message in map(lambda m: m.message.value, messages):
        assert int(message) < 0, "Expected negative integer but received {0}".format(message)
    kafka.close()
def consume_save(group, topic):
    tmp_save = open(tmp_file_path, "w")
    # create the consumer once, outside the polling loop
    kafka_consumer = SimpleConsumer(kafka, group, topic)
    while True:
        messages = kafka_consumer.get_messages(count=1000, block=False)
        for message in messages:
            tmp_save.write(message.message.value + "\n")
            print message.message.value + "\n"
        # file size > 20MB
        if tmp_save.tell() > 20000000:
            push_to_hdfs(tmp_file_path)
        kafka_consumer.commit()  # inform zookeeper of position in the kafka queue
class Consumer(Thread):
    def __init__(self, args=()):
        super(Consumer, self).__init__()
        self.host = args[0]
        self.port = args[1]
        self.topic = args[2]
        print '[KafkaConsumer] host: {0}, port: {1}, topic: {2}'.format(self.host, self.port, self.topic)
        self.consumer = None
        self.consumer_keep_run = True
        self.consumer_paused = False
        self.consumer_subscribers = []

    def run(self):
        client = kafka_client(self.host, self.port)
        self.consumer = SimpleConsumer(client, None, self.topic)
        self.consumer.seek(0, 1)
        while self.consumer_keep_run:
            print '[KafkaConsumer] looping..'
            if not self.consumer_paused:
                for message in self.consumer.get_messages(block=False):
                    offset = message.offset
                    value = message.message.value
                    j_encoded = json.dumps({'offset': offset, 'message': value})
                    print '[KafkaConsumer] {}'.format(j_encoded)
                    for subscriber in self.consumer_subscribers:
                        IOLoop.instance().add_callback(partial(subscriber.send_message, j_encoded))
            time.sleep(1)

    def pause_consumer(self, paused):
        self.consumer_paused = paused

    def stop_consumer(self):
        self.consumer_keep_run = False

    def add_subscriber(self, subscriber):
        self.consumer_subscribers.append(subscriber)

    def remove_subscriber(self, subscriber):
        self.consumer_subscribers.remove(subscriber)

    def get_subscribers_length(self):
        return len(self.consumer_subscribers)

    def get_subscribers(self):
        return self.consumer_subscribers
class KafkaSpout(Spout):
    def initialize(self, stormconf, context):
        self.kafka = KafkaClient("cloud.soumet.com:9092")
        self.consumer = SimpleConsumer(self.kafka, "storm", "realtime", max_buffer_size=1310720000)

    def next_tuple(self):
        for message in self.consumer.get_messages(count=500, block=False):
            # transaction_data = TransactionFull()
            # transaction_data.ParseFromString(base64.b64decode(message.message.value))
            # self.emit([transaction_data])
            self.emit([message.message.value])
        self.consumer.commit()
def validate_kafka_read_write_performance():
    """
    Validates that all messages were sent to the output topic.
    """
    logger.info('Running validate_kafka_read_write_performance')
    kafka = util.get_kafka_client()
    kafka.ensure_topic_exists(TEST_OUTPUT_TOPIC)
    consumer = SimpleConsumer(kafka, 'samza-test-group', TEST_OUTPUT_TOPIC,
                              fetch_size_bytes=1000000,
                              buffer_size=32768,
                              max_buffer_size=None)
    # wait up to 5 minutes to get all million messages
    messages = consumer.get_messages(count=NUM_MESSAGES, block=True, timeout=300)
    message_count = len(messages)
    assert NUM_MESSAGES == message_count, 'Expected {0} lines, but found {1}'.format(NUM_MESSAGES, message_count)
    kafka.close()
def CommitData():
    """API: the caller supplies topic, guid and group. The API looks up the last offset
    recorded for that guid when the data was originally fetched, and commits the topic
    at that last offset."""
    api_message = ExecuteResult()
    api_message.message = 'ok'
    api_statuscode = 200
    try:
        original_topic = request.json['topic'].encode('utf-8') + '_error_msg'
        topic = request.json['topic'].encode('utf-8') + '_error_msg_log'
        guid = request.json['guid'].encode('utf-8')
        group = request.json['group'].encode('utf-8')
        if not CheckTopicExsited(topic):
            return "Topic cannot be found! It may not have been created.", 200

        client3 = KafkaClient(tmpbootstrap_servers)
        simplecon = SimpleConsumer(client3, group, topic, auto_commit=False)
        simplecon_messages = simplecon.get_messages(count=500)

        ii = 0
        for msg in simplecon_messages:
            msgGuid = getMsgDataGuid(msg.message.value)
            ii += 1
            print(str(ii))
            if msgGuid == guid:
                msgInfos2 = get_last_offset_data(msg.message.value)
                msgInfos = json.loads(msgInfos2)
                for offset_data in msgInfos:
                    commitTopic(original_topic, group,
                                int(offset_data['partition_ID']),
                                int(offset_data['get_last_offset']))
    except Exception as e:
        api_message.message = str(e)
        print(str(e))
        api_statuscode = 500
    finally:
        return json.dumps(api_message, default=encode_ExecuteResult), api_statuscode
def _run(self):
    pcount = 0
    while True:
        try:
            self._logger.info("New KafkaClient %d" % self._partition)
            self._kfk = KafkaClient(self._brokers, str(os.getpid()))
            try:
                consumer = SimpleConsumer(self._kfk, self._group, self._topic,
                                          buffer_size=4096*4, max_buffer_size=4096*32)
            except Exception as ex:
                template = "Consumer Failure {0} occurred. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.info("%s" % messag)
                raise RuntimeError(messag)

            self._logger.info("Starting %d" % self._partition)

            # Find the offset of the last message that has been queued
            consumer.seek(0, 2)
            try:
                mi = consumer.get_message(timeout=0.1)
                consumer.commit()
            except common.OffsetOutOfRangeError:
                mi = None
            self._logger.info("Last Queued for %d is %s" % (self._partition, str(mi)))

            self.start_partition()

            # start reading from last previously processed message
            consumer.seek(0, 1)

            if self._limit:
                raise gevent.GreenletExit

            while True:
                try:
                    mlist = consumer.get_messages(10)
                    for mm in mlist:
                        if mm is None:
                            continue
                        self._logger.debug("%d Reading offset %d" % (self._partition, mm.offset))
                        consumer.commit()
                        pcount += 1
                        if not self.msg_handler(mm):
                            self._logger.info("%d could not handle %s" % (self._partition, str(mm)))
                            raise gevent.GreenletExit
                except TypeError:
                    gevent.sleep(0.1)
                except common.FailedPayloadsError as ex:
                    self._logger.info("Payload Error: %s" % str(ex.args))
                    gevent.sleep(0.1)
        except gevent.GreenletExit:
            break
        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.info("%s : traceback %s" % (messag, traceback.format_exc()))
            self.stop_partition()
            gevent.sleep(2)

    self._logger.info("Stopping %d pcount %d" % (self._partition, pcount))
    return self._partoffset, self._partdb
class OneteraScheduler(FronteraScheduler):
    def __init__(self, crawler):
        super(OneteraScheduler, self).__init__(crawler)
        self.job_config = {}
        self.is_active = False
        self.results = []
        self.results_sent = 0
        self.last_result_iteration = None

        settings = self.frontier.manager.settings
        self.results_topic = settings.get("ONETERA_RESULTS_TOPIC")
        kafka = KafkaClient(settings.get("KAFKA_LOCATION"))
        self.consumer = SimpleConsumer(
            kafka,
            settings.get("ONETERA_GROUP"),
            settings.get("ONETERA_INCOMING_TOPIC"),
            buffer_size=262144,
            max_buffer_size=10485760,
            auto_commit_every_n=1,
        )
        self.producer = SimpleProducer(kafka)
        self.status_updates_topic = settings.get("ONETERA_STATUS_UPDATES_TOPIC")
        self.stats = {}

    def result_callback(self, result):
        self.results.append(result)

    def open(self, spider):
        super(OneteraScheduler, self).open(spider)
        spider.set_result_callback(self.result_callback)

    def has_pending_requests(self):
        if not self.is_active:
            return False
        return super(OneteraScheduler, self).has_pending_requests()

    def next_request(self):
        if not self.is_active:
            self._check_incoming()
        if self.is_active:
            return super(OneteraScheduler, self).next_request()
        return None

    def process_spider_output(self, response, result, spider):
        self._send_results()
        self._check_finished()
        return super(OneteraScheduler, self).process_spider_output(response, result, spider)

    def process_exception(self, request, exception, spider):
        super(OneteraScheduler, self).process_exception(request, exception, spider)
        self._send_results()
        self._check_finished()

    def _check_finished(self):
        if not self.is_active:
            return
        if self.results_sent > self.job_config["nResults"]:
            logger.info("Crawler reached the number of requested results. Crawling is stopping.")
            self.is_active = False
        if self.last_result_iteration and self.frontier.manager.iteration - self.last_result_iteration > 10:
            logger.info("It looks like the crawler got stuck. Stopping crawling.")
            self.is_active = False

    def _check_incoming(self):
        consumed = 0
        try:
            for m in self.consumer.get_messages(count=1):
                try:
                    msg = loads(m.message.value)
                except ValueError as ve:
                    logger.error("Decoding error %s, message %s" % (ve, m.message.value))
                else:
                    logger.info("Got incoming message %s from incoming topic." % m.message.value)
                    self.frontier.manager.backend.cleanup()
                    self._pending_requests.clear()
                    self.results = []
                    self.results_sent = 0
                    self.last_result_iteration = None

                    self.job_config = {
                        "workspace": msg["workspace"],
                        "nResults": msg["nResults"],
                        "excluded": msg["excluded"],
                        "included": msg["included"],
                        "relevantUrl": msg["relevantUrl"],
                        "irrelevantUrl": msg["irrelevantUrl"],
                    }
                    requests = [Request(url, meta={"score": 1.0}) for url in msg["relevantUrl"]]
                    if not requests:
                        raise Exception("Empty seeds list, can't bootstrap crawler.")
                    self.frontier.add_seeds(requests)
                    self.frontier.spider.configure(self.job_config)
                    self.is_active = True
        finally:
            consumed += 1
def _run(self):
    pcount = 0
    pause = False
    while True:
        try:
            if pause:
                gevent.sleep(2)
                pause = False
            self._logger.error("New KafkaClient %s" % self._topic)
            self._kfk = KafkaClient(self._brokers, "kc-" + self._topic)
            try:
                consumer = SimpleConsumer(self._kfk, self._group, self._topic,
                                          buffer_size=4096*4, max_buffer_size=4096*32)
            except Exception as ex:
                template = "Consumer Failure {0} occurred. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("Error: %s trace %s" % (messag, traceback.format_exc()))
                raise RuntimeError(messag)

            self._logger.error("Starting %s" % self._topic)

            # Find the offset of the last message that has been queued
            consumer.seek(-1, 2)
            try:
                mi = consumer.get_message(timeout=0.1)
                consumer.commit()
            except common.OffsetOutOfRangeError:
                mi = None
            self._logger.info("Last Queued for %s is %s" % (self._topic, str(mi)))

            # start reading from the last previously processed message
            if mi is not None:
                consumer.seek(-1, 1)
            else:
                consumer.seek(0, 0)

            if self._limit:
                raise gevent.GreenletExit

            while True:
                try:
                    mlist = consumer.get_messages(10, timeout=0.5)
                    if not self.msg_handler(mlist):
                        raise gevent.GreenletExit
                    consumer.commit()
                    pcount += len(mlist)
                except TypeError as ex:
                    self._logger.error("Type Error: %s trace %s" % (str(ex.args), traceback.format_exc()))
                    gevent.sleep(0.1)
                except common.FailedPayloadsError as ex:
                    self._logger.error("Payload Error: %s" % str(ex.args))
                    gevent.sleep(0.1)
        except gevent.GreenletExit:
            break
        except AssertionError as ex:
            self._partoffset = ex
            break
        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s" % (messag, traceback.format_exc()))
            self.stop_partition()
            pause = True

    self._logger.error("Stopping %s pcount %d" % (self._topic, pcount))
    partdb = self.stop_partition()
    return self._partoffset, partdb
# more advanced consumer -- multiple topics w/ auto commit offset management
import sys

kclient = KafkaClient("52.24.239.65:9092")
consumer = SimpleConsumer(kclient, "bf-group", sys.argv[1], partitions=[0, 1])
consumer.max_buffer_size = None
consumer.seek(0, 1)

while True:
    for message in consumer.get_messages():
        print("OFFSET: " + str(message[0]) + "\t MSG: " + str(message[1][3]) + "KEY: " + str(message.message.key))
    sys.exit(0)

client = KafkaClient(['52.24.239.65:9092'], client_id='bfleming')
consumer = SimpleConsumer(client, "bfleming", 'bfleming00615')
print consumer.get_messages(count=10)

import ipdb
ipdb.set_trace()

# Infinite iteration
# for m in consumer:
class TestLinkSpider(TestCase):

    example_feed = "\x80\x02}q\x00(X\x0f\x00\x00\x00allowed_domainsq\x01NX"\
        "\x0b\x00\x00\x00allow_regexq\x02NX\a\x00\x00\x00crawlidq\x03X\x19"\
        "\x00\x00\x0001234567890abcdefghijklmnq\x04X\x03\x00\x00\x00urlq\x05X"\
        "\x13\x00\x00\x00www.istresearch.comq\x06X\a\x00\x00\x00expiresq\aK"\
        "\x00X\b\x00\x00\x00priorityq\bK\x01X\n\x00\x00\x00deny_regexq\tNX\b"\
        "\x00\x00\x00spideridq\nX\x0b\x00\x00\x00test-spiderq\x0bX\x05\x00"\
        "\x00\x00attrsq\x0cNX\x05\x00\x00\x00appidq\rX\a\x00\x00\x00testappq"\
        "\x0eX\x06\x00\x00\x00cookieq\x0fNX\t\x00\x00\x00useragentq\x10NX\x0f"\
        "\x00\x00\x00deny_extensionsq\x11NX\b\x00\x00\x00maxdepthq\x12K\x00u."

    def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print "Could not connect to Redis"
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up kafka to consume the potential result
        self.kafka_conn = KafkaClient(self.settings['KAFKA_HOSTS'])
        self.kafka_conn.ensure_topic_exists("demo_test.crawled_firehose")
        self.consumer = SimpleConsumer(self.kafka_conn,
                                       "demo-id",
                                       "demo_test.crawled_firehose",
                                       buffer_size=1024 * 100,
                                       fetch_size_bytes=1024 * 100,
                                       max_buffer_size=None)
        # move cursor to end of kafka topic
        self.consumer.seek(0, 2)

    def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())

        # add crawl to redis
        key = "test-spider:istresearch.com:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()

        reactor.run()

        # ensure it was sent out to kafka
        message_count = 0
        for message in self.consumer.get_messages():
            if message is None:
                break
            else:
                the_dict = json.loads(message.message.value)
                if the_dict is not None and the_dict['appid'] == 'testapp' \
                        and the_dict['crawlid'] == '01234567890abcdefghijklmn':
                    message_count += 1

        self.assertEquals(message_count, 1)

    def tearDown(self):
        keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')
        keys = keys + self.redis_conn.keys('test-spider:*')
        for key in keys:
            self.redis_conn.delete(key)
class KafkaDriver:
    def __init__(self, driver_args, event_loop):
        # possible TODO: get logger from invoker
        self.logger = logging.getLogger('KafkaDriver')
        self.logger.setLevel(logging.INFO)
        console_log_handler = logging.StreamHandler(sys.stdout)
        self.logger.addHandler(console_log_handler)
        self.logger.info("KafkaDriver initialized; driver_args=%s" % (driver_args))
        self.event_loop = event_loop

        if driver_args == "":
            kafka_server_addr = "localhost:9092"
        else:
            kafka_server_addr = driver_args

        # generate a unique client ID so that Kafka doesn't confuse us with a different instance
        client_id = "KafkaDriver-%d-%d" % (time.time(), os.getpid())
        self.kafka = KafkaClient(kafka_server_addr, client_id=client_id)
        self.queue_name = None

        ## APPEND direction
        self.get_message_stream = None
        # how frequently to check for messages and (space permitting) add them to the
        # GET message stream, in seconds
        self.MESSAGE_CHECK_FREQ = 0.010
        # how many messages we have sent from various queues
        self.get_message_count = 0
        self.producer = None

        ## GET direction
        self.consumer = None
        self.get_message_count = 0
        self.MAX_KAFKA_REQ_BATCH_MSGS = 200  # most messages that we will request from Kafka at a time

    ######## APPEND direction ########

    # called to tell the driver of a new stream of appends that are going to come in;
    # these should go to the end of the named queue
    def prepare_for_append_stream(self, queue_name):
        self.logger.info("KafkaDriver prepare_for_append_stream got: queue_name=%s" % (queue_name))
        self.queue_name = str(queue_name)
        self.producer = SimpleProducer(
            self.kafka,
            async=True,
            req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
            ack_timeout=5000,
            batch_send=True,
            batch_send_every_n=100,
            batch_send_every_t=1000,
            random_start=True)

    def append(self, payload, ttl):
        ttl = int(ttl)
        self.logger.debug("KafkaDriver append got: ttl=%d, payload='%s'" % (ttl, payload))
        try:
            self.producer.send_messages(self.queue_name, payload)
        except UnknownTopicOrPartitionError:
            self.logger.warn("Kafka reports unknown topic or invalid partition number: " + str(sys.exc_info()))
            return 500
        except:
            self.logger.warn("Got exception from kafka-python SimpleProducer:" + str(sys.exc_info()))
            return 500
        return 100

    def cancel_append_stream(self):
        self.logger.info("KafkaDriver cancel_append_stream got called")
        self.producer.stop()
        self.producer = None
        self.queue_name = None

    ######## GET direction ########

    # called to tell the driver that a new stream of messages is needed for return to a client.
    # message_stream_queue is an instance of MessageStream used to hold messages the driver has
    # available as a response to this request. Other arguments have the same meaning as in the
    # Marconi API.
    def init_get_stream(self, get_message_stream, queue_name_spec, starting_marker, echo_requested, include_claimed):
        self.logger.info("KafkaDriver prepare_to_get_messages got: queue_name=%s, echo_requested=%s, include_claimed=%s, starting_marker=%s"
                         % (queue_name_spec, str(echo_requested), str(include_claimed), starting_marker))
        self.logger.info("warning: KafkaDriver ignores echo_requested and include_claimed in GET requests")

        self.consume_group = "cg1"  # default consume group
        if len(starting_marker) > 0:
            self.consume_group = starting_marker
        self.logger.info("consume group=" + self.consume_group)

        # if the queue name contains "/n" at the end, we interpret that as the partition to read from
        queue_name, partition_part = queue_name_spec.split("/", 2)
        if partition_part is None:
            partition = None
        else:
            partition = int(partition_part)
            self.logger.info("limiting topic %s to partition %d" % (queue_name, partition))

        self.get_message_stream = get_message_stream
        self.queue_name = str(queue_name)
        self.consumer = SimpleConsumer(
            client=self.kafka,
            group=self.consume_group,
            topic=self.queue_name,
            partitions=[partition],
            # it seems we cannot do any kind of commit when using kafka-python 0.9.1 with Kafka
            # versions before 0.8.1, because kafka-python will send an OffsetFetchRequest (request
            # type 9) or OffsetCommitRequest (request type 8) which is not supported
            auto_commit=False,
            fetch_size_bytes=self.MAX_KAFKA_REQ_BATCH_MSGS * 4096,  # in Marconi, messages can be up to 4k
            iter_timeout=None,
        )
        self.logger.debug("KafkaDriver: seeking to head of %s" % (self.queue_name))
        # seek to head of topic; TODO: should get starting position from starting_marker param
        self.consumer.seek(0, 0)
        # kick off periodic attainment of new messages (space permitting)
        self.periodically_check_for_new_messages()

    def periodically_check_for_new_messages(self):
        if self.get_message_stream is not None:  # still providing messages
            self.check_for_new_messages()
            # TODO: call call_soon() rather than call_later() if we got some messages and there is
            # still space available in the MessageStream.
            # schedules self to run again after MESSAGE_CHECK_FREQ seconds
            self.new_msg_check_callback = self.event_loop.call_later(self.MESSAGE_CHECK_FREQ,
                                                                     self.periodically_check_for_new_messages)

    def check_for_new_messages(self):
        self.logger.debug("KafkaDriver.check_for_new_messages (start): space_used=%d, amount_of_space_avail=%d"
                          % (self.get_message_stream.space_used(), self.get_message_stream.amount_of_space_avail()))
        max_number_of_messages = self.get_message_stream.amount_of_space_avail()
        if max_number_of_messages == 0:
            return  # no space left to add messages, so don't look for any

        # now try to get up to max_number_of_messages messages from the topic, in a non-blocking manner
        messages = self.consumer.get_messages(count=max_number_of_messages, block=False)
        self.logger.debug("got %d messages from Kafka" % (len(messages)))
        assert len(messages) <= max_number_of_messages

        # add the messages to the message stream
        for message_and_offset in messages:
            self.get_message_count += 1
            offset_str = "%016x" % (message_and_offset.offset)  # make offset into 16 hex chars
            # construct a new message and add it to the stream
            self.get_message_stream.add_message(
                payload=str(message_and_offset.message.value),
                # TODO: this is supposed to be a value usable as a start_marker, but it doesn't
                # indicate the partition, so it is not unique
                marker=offset_str,
                id=offset_str,
                # we don't store the original TTL, so (for now at least) just send max signed 32 bit int
                ttl=(2**31) - 1,
                age=0,
            )

    # called to let the driver know that no more messages are needed for the previously requested
    # stream of messages and that it should free up any associated resources.
    def cancel_get_stream(self):
        self.new_msg_check_callback.cancel()  # cancel call to periodically_check_for_new_messages()
        self.consumer.stop()
        self.consumer = None
        self.get_message_stream = None
        self.queue_name = None
from __future__ import absolute_import, print_function  # , unicode_literals

import itertools
import base64
import sys

from streamparse.spout import Spout
from kafka import KafkaClient, SimpleProducer, SimpleConsumer

kafka = KafkaClient("cloud.soumet.com:9092")
kafka_consumer = SimpleConsumer(kafka, "storm", "realtime", max_buffer_size=1310720000)

for message in kafka_consumer.get_messages(count=5000, block=False):
    print(message.message.value)
kafka_consumer.commit()
def _run(self):
    pcount = 0
    pause = False
    while True:
        try:
            if pause:
                gevent.sleep(5)
                pause = False
            self._logger.error("New KafkaClient %s" % self._topic)
            self._kfk = KafkaClient(self._brokers, "kc-" + self._topic, timeout=5)
            self._failed = False
            try:
                consumer = SimpleConsumer(self._kfk, self._group, self._topic,
                                          buffer_size=4096*4*4, max_buffer_size=4096*32*4,
                                          auto_commit=False)
            except Exception as ex:
                template = "Consumer Failure {0} occurred. Arguments:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("Error: %s trace %s" % (messag, traceback.format_exc()))
                self._failed = True
                raise RuntimeError(messag)

            self._logger.error("Starting %s" % self._topic)

            # Start consuming from the latest message
            consumer.seek(0, 2)

            if self._limit:
                raise gevent.GreenletExit

            while True:
                try:
                    mlist = consumer.get_messages(10, timeout=0.5)
                    if not self.msg_handler(mlist):
                        raise gevent.GreenletExit
                    pcount += len(mlist)
                except TypeError as ex:
                    self._logger.error("Type Error: %s trace %s" % (str(ex.args), traceback.format_exc()))
                    gevent.sleep(0.1)
                except common.FailedPayloadsError as ex:
                    self._logger.error("Payload Error: %s" % str(ex.args))
                    gevent.sleep(0.1)
        except gevent.GreenletExit:
            break
        except AssertionError as ex:
            self._partoffset = ex
            break
        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s %s : traceback %s" % (self._topic, messag, traceback.format_exc()))
            self.stop_partition()
            self._failed = True
            pause = True
            if hasattr(ex, 'errno'):
                # This is an unrecoverable error
                if ex.errno == errno.EMFILE:
                    raise SystemExit(1)

    self._logger.error("Stopping %s pcount %d" % (self._topic, pcount))
    partdb = self.stop_partition()
    return self._partoffset, partdb
from kafka import SimpleProducer, SimpleClient, SimpleConsumer

# To consume messages
client = SimpleClient('localhost:9092')
consumer = SimpleConsumer(client, "my-group", "my-topic")
for message in consumer:
    # message is raw byte string -- decode if necessary!
    # e.g., for unicode: `message.decode('utf-8')`
    print(message)

# Use multiprocessing for parallel consumers
from kafka import MultiProcessConsumer

# This will split the number of partitions among two processes
consumer = MultiProcessConsumer(client, "my-group", "my-topic", num_procs=2)

# This will spawn processes such that each handles 2 partitions max
consumer = MultiProcessConsumer(client, "my-group", "my-topic", partitions_per_proc=2)

for message in consumer:
    print(message)

for message in consumer.get_messages(count=5, block=True, timeout=4):
    print(message)

client.close()
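# Several snippets in this collection call consumer.seek() before reading. This short
# example is not part of any original snippet; it is a sketch of the old
# SimpleConsumer.seek() semantics as commonly used above: seek(offset, whence), where
# whence=0 is an absolute offset, whence=1 is relative to the current offset, and
# whence=2 is relative to the tail of the topic.
consumer = SimpleConsumer(client, "my-group", "my-topic")
consumer.seek(0, 0)     # rewind to the beginning of the topic
consumer.seek(-10, 2)   # position 10 messages before the current end
for message in consumer.get_messages(count=10, block=True, timeout=5):
    print(message)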
class KafkaSpiderMixin1(object):
    """
    Mixin class to implement reading urls from a kafka queue.

    :type kafka_topic: str
    """
    kafka_topic = None

    def process_kafka_message(self, message):
        """
        Tell this spider how to extract urls from a kafka message.

        :param message: A Kafka message object
        :type message: kafka.common.OffsetAndMessage
        :rtype: str or None
        """
        if not message:
            return None
        return message.value

    # override method
    def make_requests_from_url(self, url, id=None, attr=None):
        request = Request(url, headers={
            'Origin': 'https://www.amazon.de',
            'Referer': 'https://www.amazon.de',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.8',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36, LuminadBot/1.0 ([email protected])',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cache-Control': 'max-age=0',
        }, dont_filter=True)
        request.cookies = {
            's_pers': '%20s_fid%3D300B8810F7CDBDE1-10092DE00A8359D7%7C1558680220920%3B%20s_dl%3D1%7C1495610020921%3B%20gpv_page%3DDE%253AAZ%253ASOA-Landing%7C1495610020924%3B%20s_ev15%3D%255B%255B%2527AZDEGNOSellC%2527%252C%25271495608209183%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608216403%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608220916%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608220925%2527%255D%255D%7C1653374620925%3B%20s_eVar26%3DAmazon%2520Services%2520DE%7C1498200220927%3B',
            'amznacsleftnav-656eac4a-695b-3a6a-946f-db61e4deb392': '1',
            'amznacsleftnav-fdfd699f-c863-3b78-85b2-8a649c6b58f6': '1',
            'x-amz-captcha-1': '1508482986892769',
            'x-amz-captcha-2': 'hw3RhTh0tvhX81cdMFkFgQ==',
            'lc-acbde': 'de_DE',
            'session-id': '261-5677163-4561642',
            'ubid-acbde': '259-8821950-7904223',
            'a-ogbcbff': '1',
            'x-acbde': '"V0z3CSC5jraR2B7OY6OiPR3wrDO7GbRjA9fTg2AJTorXXbAPToPEDvMAo8KTh@7M"',
            'at-acbde': 'Atza|IwEBIHwqc3CD45BqlJs_5aa-V8dGYqRemzUHaOJhdARXf-o6rlAp0DANlQO8ZPGB23Uek573IjBb2qkX4mlZWKna1Xn3pOzTpiUd0SQO7gh-uTZnxF5r2p22mMsR4_clEZvBBlZBMJYXD6HPxW7_sEYtklqCkY-Br197rDnz9KPza3y5u7XzgezJIBdXCaeq4vAqo9Wrl0uG0RGKSr41-4rKK9hpnGK1nN4UbO_qWxnLSwzA6LwgXczqe0C5EyH1HIp12IlKFB7OgxIEsH0QZAiT0eh0D7sFwlVG6eHfqPNWfix03SZ7apAC7C7jQ-vw1lmICAeJciD9QmumuCNEDDCT-GGWCkrAh-gxMRhKpm7Q5_gOtJijbqoLi3VfPO9QrCA7hYW8Atc-kFRIW3Y6vtRc8OZzZipCneewy-Rj_xYUMFVWMCmHs_ljfe2W6vxWgiRfmyw',
            'sess-at-acbde': '"NbwPRqfG4oPuznYLUmFM5Y5JSvyizaA9ZJz6vTkNQL4="',
            'sst-acbde': 'Sst1|PQEs5smXCO43G8WIotdsANHyCEBZ9TkcZ_OdLYTgnk2mCfAy4Z5W77Y7zX74BQuxS7UKtfnUM6KkKhmcu01A2Fq7xshyjesDvnQDYp9QYcrFDvlceaVvpWqQfpEt2Q9XIM0VQFdd2EMpXc4C9QlehgHT0URfOlUmC47BkfeJr5dpb4Pv_dbnFASQli0k7Cln9sN_Vf4Wqz4km-6UTpsNlVJxJE48_RK6Zsk7bklH_cpJE8tfltiPzdhyhY2oDh7SieUx6CNKphxtIezjzr-0SbD8cg',
            'x-wl-uid': '11PAl+O2T6FeY67SmgtWeMBtyZ538YMsy2Zcpov67B4kL2DVIv3Nx7rEprTLBkI4W3ZZ954YAADFuG1oAMSt9uIgNhk3yQfBCY6pDMJUcXUzK6rFTPF4tPnrWr3utKPzHqJATwvQOHKE=',
            'session-token': '"tzfdQwuhV4SLJ9/PfV3QSfg2b3LxOcRlqovsFb3AsrqZSnkxHCjhgMsO3d7NbIS7rOee9CPoh7Lxo8LF7EdVopNDFYLMzzOtDGVhnY4czMEVNS5VHAxjtdaDvRNDJC0OloD0EvRMDfHeXG70D93/wWVNfqU0c6nKEv0yTLU7pFpIbTicUYQQFeDZYf9tPQEepQxbZ1pBOU+0FjTwWUj3SnNdDf/SVmmk+feDLRuqn+WcP6w6CPQ1G03W/TACUuIHBz9mSMRFPU0il4m+s0KyzA=="',
            'csm-hit': 's-F8Q4HD9WHE8M6GMQKQT4|1519186540551',
            'session-id-time': '2082754801l',
        }
        if id:
            request.meta['id'] = id
        if attr:
            request.meta['attr'] = attr
        # set the meta['item'] to use the item in the next call back
        return request

    def setup_kafka(self):
        """Setup kafka connection and idle signal.

        This should be called after the spider has set its crawler object.

        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name
        self.topic = 'general-starturls'

        _server = self.settings.get("KAFKA_LOCATION", 'localhost:9092')
        _partition_id = int(self.settings.get('SPIDER_PARTITION_ID', 0))
        _group = self.settings.get("GROUP", "scrapy-crawler")
        _conn = KafkaClient(_server)
        self.topic1 = self.settings.get('TOPIC', 'frontier-todo')

        mongo_server = self.settings.get("MONGODB_SERVER", 'localhost')
        mongo_port = self.settings.get("MONGODB_PORT", 'MONGODB_PORT')
        self.mng_client = MongoClient(mongo_server, mongo_port)

        self.consumer = SimpleConsumer(_conn, _group, self.topic1,
                                       partitions=[_partition_id],
                                       buffer_size=131072,
                                       max_buffer_size=1048576)
        self.producer = KafkaProducer(bootstrap_servers=[_server])

        self.MONGODB_DB = self.settings.get("MONGODB_DB")
        self.MONGODB_COLLECTION = "shop"
        self.SPIDER_NAME = self.settings.get("SPIDER_NAME")
        self.JOB_NAME = self.settings.get("JOB_NAME")
        self.LOCALE = self.settings.get("LOCALE", 'us')
        self.MONGODB_DB_INPUT = self.settings.get("MONGODB_DB_INPUT", "scr")
        self.NUM_REPETE = self.settings.get("NUMBER_REPETE_SCRAPE", 7)
        self.JOB_INPUT_COLLECTION = self.settings.get("JOB_INPUT_COLLECTION", "job_input3")
        self.ITEM_INPUT_COLLECTION = self.settings.get("ITEM_INPUT_COLLECTION", 'scrap_input4')

        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic)

    def next_request(self):
        """
        Returns a request to be scheduled.

        :rtype: str or None
        """
        message = self.consumer.get_messages(1)
        print "messsssssssssssssssssss", message
        if message:
            url = self.process_kafka_message(message[0].message).split(",")[0].replace(",", ' ').replace("#", ' ').replace("&", ' ').replace('"', '').replace("'", '').replace("(", ' ').replace(")", ' ')
            url = 'https://www.amazon.de/dp/' + url + '/?th=1&psc=1'
            id = self.process_kafka_message(message[0].message).split(",")[1]
            attr = ",".join(self.process_kafka_message(message[0].message).split(",")[2:])
            db = self.mng_client['shop_url_out']
            result1 = db.shop.find_one({'product_id': id, 'spider': self.SPIDER_NAME})
            print "asfdsdfasdfafsdfsdf", result1
            if result1:
                return None
        else:
            url = None

        if not url:
            time_data = int(time.time()) - 98000
            mng_db = self.mng_client[self.MONGODB_DB_INPUT]
            db_cm = mng_db[self.JOB_INPUT_COLLECTION]
            try:
                result = list(db_cm.find({"spider_name": self.JOB_NAME, 'locale': self.LOCALE}).sort('start_time', -1))[0]
            except:
                result = list(db_cm.find({"repeate": {"$lt": self.NUM_REPETE}}).sort('start_time', -1))[0]
            result = list(db_cm.find({"spider_name": self.JOB_NAME, 'locale': self.LOCALE}).sort('start_time', -1))[0]
            print result
            if result:
                job_id = result['job_id']
                db = self.mng_client[self.MONGODB_DB]
                result2 = db.scrap_input_mapping.find({'job_id': job_id, 'price': {"$ne": ''}})
                time_data = int(time.time()) - 3600
                scraped_data = []
                scraped_data.extend([res.get("product_id", '') for res in result2])
                collection_name = self.ITEM_INPUT_COLLECTION
                db_cm = mng_db['scrap_input']
                count = 0
                for document in list(db_cm.find({'job_id': job_id})):
                    try:
                        product_id = str(document[u'_id'])
                        if product_id not in scraped_data:
                            product_name = str(document[u'DPID'])
                            if product_name == '':
                                continue
                            upc = str(document[u'UPC']).replace("'", "").replace('"', '')
                            msg = json.dumps({'job_id': str(job_id), 'DPID': product_name, 'UPC': upc})
                            future = self.producer.send(self.topic1, product_name + "," + product_id + "," + msg)
                            record_metadata = future.get(timeout=10)
                            count = count + 1
                            print record_metadata, count
                    except:
                        pass
            return None
        return self.make_requests_from_url(url, id, attr)

    def schedule_next_request(self):
        """Schedules a request if available"""
        req = self.next_request()
        if req:
            print dir(self)
            self.crawler.engine.crawl(req, spider=self)
        else:
            print "Rahulllllllllllllllll"

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        try:
            self.schedule_next_request()
        except:
            pass
        raise DontCloseSpider

    def item_scraped(self, *args, **kwargs):
        """Avoids waiting for the spider to idle before scheduling the next request"""
        self.schedule_next_request()
def _run(self): pcount = 0 pause = False while True: try: if pause: gevent.sleep(2) pause = False self._logger.error("New KafkaClient %s" % self._topic) self._kfk = KafkaClient(self._brokers , "kc-" + self._topic) self._failed = False try: consumer = SimpleConsumer(self._kfk, self._group, self._topic, buffer_size = 4096*4, max_buffer_size=4096*32) #except: except Exception as ex: template = "Consumer Failure {0} occured. Arguments:\n{1!r}" messag = template.format(type(ex).__name__, ex.args) self._logger.error("Error: %s trace %s" % \ (messag, traceback.format_exc())) self._failed = True raise RuntimeError(messag) self._logger.error("Starting %s" % self._topic) # Find the offset of the last message that has been queued consumer.seek(-1,2) try: mi = consumer.get_message(timeout=0.1) consumer.commit() except common.OffsetOutOfRangeError: mi = None #import pdb; pdb.set_trace() self._logger.info("Last Queued for %s is %s" % \ (self._topic,str(mi))) # start reading from last previously processed message if mi != None: consumer.seek(-1,1) else: consumer.seek(0,0) if self._limit: raise gevent.GreenletExit while True: try: mlist = consumer.get_messages(10,timeout=0.5) if not self.msg_handler(mlist): raise gevent.GreenletExit consumer.commit() pcount += len(mlist) except TypeError as ex: self._logger.error("Type Error: %s trace %s" % \ (str(ex.args), traceback.format_exc())) gevent.sleep(0.1) except common.FailedPayloadsError as ex: self._logger.error("Payload Error: %s" % str(ex.args)) gevent.sleep(0.1) except gevent.GreenletExit: break except AssertionError as ex: self._partoffset = ex break except Exception as ex: template = "An exception of type {0} occured. Arguments:\n{1!r}" messag = template.format(type(ex).__name__, ex.args) self._logger.error("%s : traceback %s" % \ (messag, traceback.format_exc())) self.stop_partition() self._failed = True pause = True self._logger.error("Stopping %s pcount %d" % (self._topic, pcount)) partdb = self.stop_partition() return self._partoffset, partdb
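The seek() calls in the snippet above rely on the old kafka-python SimpleConsumer.seek(offset, whence) semantics, where whence=0 is an absolute offset, whence=1 is relative to the current position, and whence=2 is relative to the end of the partition. A minimal sketch of the resume logic used there, under that assumption:

# Peek at the newest queued message, then resume just behind it,
# or start from the beginning if nothing has been queued yet.
consumer.seek(-1, 2)                       # position one message before the tail
last = consumer.get_message(timeout=0.1)   # peek at the last queued message
consumer.commit()
if last is not None:
    consumer.seek(-1, 1)                   # step back one message, relative to current
else:
    consumer.seek(0, 0)                    # empty partition: start from offset 0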
def run(self, options=None):
    # Create the tracking table if it doesn't exist in the database
    if self.REDSHIFT.if_table_exists(self.TABLE_NAME) is False:
        self.REDSHIFT.execute(self.CREATE_TRACKING_TABLE)

    kafka = KafkaClient(config.get("kafka.host1") + "," + config.get("kafka.host2"))
    consumer = SimpleConsumer(kafka, self.GROUP_NAME, self.KAFKA_TOPIC,
                              fetch_size_bytes=3000000,
                              buffer_size=2000000000,
                              max_buffer_size=2000000000)

    while True:
        # Prepare buffers for the insert and the copy to S3
        data_str = StringIO()
        csv_str = StringIO()
        count = 0

        # Get the offset from the previous read
        s3_last_offset = self.get_s3_offset()
        last_offset = self.REDSHIFT.select(self.GET_OFFSET_QUERY)[0][0]
        last_offset = last_offset if last_offset else 0

        # Resolve the difference in offsets (the S3 offset does not carry over from day to day)
        if s3_last_offset > last_offset:
            last_offset = s3_last_offset
            self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY %
                                  (self.GROUP_NAME, self.PARTITION, last_offset))
        print(last_offset)

        # Read from the stored offset
        consumer.seek(last_offset, 0)
        for message in consumer.get_messages(count=self.BATCH_SIZE, block=False, timeout=5):
            # Write tweets to the StringIO buffers
            self.write_to_data_str(message, data_str, csv_str)
            count += 1
            last_offset += 1

        # Store the batch of tweets to S3
        self.write_to_s3(data_str, csv_str, last_offset)

        # Track the Kafka offset
        self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY %
                              (self.GROUP_NAME, self.PARTITION, last_offset))

        if count != self.BATCH_SIZE:
            break
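The loop above checkpoints the last consumed offset in Redshift (and S3) and resumes with an absolute seek. A stripped-down sketch of that checkpointing pattern, with load_offset(), save_offset() and handle() as hypothetical placeholders for the Redshift/S3 calls:

last_offset = load_offset() or 0   # hypothetical helper: read the stored checkpoint
consumer.seek(last_offset, 0)      # absolute seek back to the checkpoint
for message in consumer.get_messages(count=BATCH_SIZE, block=False, timeout=5):
    handle(message)                # hypothetical helper: buffer/write the message
    last_offset += 1
save_offset(last_offset)           # hypothetical helper: persist the new checkpoint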
class OneteraScheduler(FronteraScheduler): def __init__(self, crawler): super(OneteraScheduler, self).__init__(crawler) self.job_config = {} self.is_active = False self.results = [] self.results_sent = 0 self.last_result_iteration = None settings = self.frontier.manager.settings self.results_topic = settings.get('ONETERA_RESULTS_TOPIC') kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self.consumer = SimpleConsumer(kafka, settings.get('ONETERA_GROUP'), settings.get('ONETERA_INCOMING_TOPIC'), buffer_size=262144, max_buffer_size=10485760, auto_commit_every_n=1) self.producer = SimpleProducer(kafka) self.status_updates_topic = settings.get( 'ONETERA_STATUS_UPDATES_TOPIC') self.stats = {} def result_callback(self, result): self.results.append(result) def open(self, spider): super(OneteraScheduler, self).open(spider) spider.set_result_callback(self.result_callback) def has_pending_requests(self): if not self.is_active: return False return super(OneteraScheduler, self).has_pending_requests() def next_request(self): if not self.is_active: self._check_incoming() if self.is_active: return super(OneteraScheduler, self).next_request() return None def process_spider_output(self, response, result, spider): self._send_results() self._check_finished() return super(OneteraScheduler, self).process_spider_output(response, result, spider) def process_exception(self, request, exception, spider): super(OneteraScheduler, self).process_exception(request, exception, spider) self._send_results() self._check_finished() def _check_finished(self): if not self.is_active: return if self.results_sent > self.job_config['nResults']: logger.info( "Crawler reached the number of requested results. Crawling is stopping." ) self.is_active = False if self.last_result_iteration and self.frontier.manager.iteration - self.last_result_iteration > 10: logger.info("It looks like crawler get stuck. Stopping crawling.") self.is_active = False def _check_incoming(self): consumed = 0 try: for m in self.consumer.get_messages(count=1): try: msg = loads(m.message.value) except ValueError, ve: logger.error("Decoding error %s, message %s" % (ve, m.message.value)) else: logger.info( "Got incoming message %s from incoming topic." % m.message.value) self.frontier.manager.backend.cleanup() self._pending_requests.clear() self.results = [] self.results_sent = 0 self.last_result_iteration = None self.job_config = { 'workspace': msg['workspace'], 'nResults': msg['nResults'], 'excluded': msg['excluded'], 'included': msg['included'], 'relevantUrl': msg['relevantUrl'], 'irrelevantUrl': msg['irrelevantUrl'], } requests = [ Request(url, meta={'score': 1.0}) for url in msg['relevantUrl'] ] if not requests: raise Exception( "Empty seeds list, can't bootstrap crawler.") self.frontier.add_seeds(requests) self.frontier.spider.configure(self.job_config) self.is_active = True finally: consumed += 1
class TestRedisMonitor(TestCase): maxDiff = None queue_key = "link:istresearch.com:queue" def setUp(self): self.redis_monitor = RedisMonitor("localsettings.py") self.redis_monitor.settings = self.redis_monitor.wrapper.load( "localsettings.py") self.redis_monitor.logger = MagicMock() self.redis_monitor.settings['KAFKA_TOPIC_PREFIX'] = "demo_test" self.redis_monitor.settings['STATS_TOTAL'] = False self.redis_monitor.settings['STATS_PLUGINS'] = False self.redis_monitor.settings['PLUGINS'] = { 'plugins.info_monitor.InfoMonitor': None, 'plugins.stop_monitor.StopMonitor': None, 'plugins.expire_monitor.ExpireMonitor': None, 'tests.tests_online.CustomMonitor': 100, } self.redis_monitor.redis_conn = redis.Redis( host=self.redis_monitor.settings['REDIS_HOST'], port=self.redis_monitor.settings['REDIS_PORT']) self.redis_monitor._load_plugins() self.redis_monitor.stats_dict = {} self.kafka_conn = KafkaClient( self.redis_monitor.settings['KAFKA_HOSTS']) self.kafka_conn.ensure_topic_exists("demo_test.outbound_firehose") self.consumer = SimpleConsumer(self.kafka_conn, "demo-id", "demo_test.outbound_firehose") def test_process_item(self): # we only want to go to the end now, not after this test is ran self.consumer.seek(0, 2) # set the info flag key = "info-test:blah" value = "ABC123" self.redis_monitor.redis_conn.set(key, value) # process the request plugin = self.redis_monitor.plugins_dict.items()[0][1] self.redis_monitor._process_plugin(plugin) # ensure the key is gone self.assertEquals(self.redis_monitor.redis_conn.get(key), None) def test_sent_to_kafka(self): success = {u'info-test': "ABC123", u"appid": u"someapp"} # ensure it was sent out to kafka message_count = 0 for message in self.consumer.get_messages(): if message is None: break else: the_dict = json.loads(message.message.value) self.assertEquals(success, the_dict) message_count += 1 self.assertEquals(message_count, 1)
class ScoringWorker(object): def __init__(self, settings, strategy_module): kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY) partition_id = settings.get('SCORING_PARTITION_ID') if partition_id == None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") self._in_consumer = SimpleConsumer(kafka, settings.get('SCORING_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760, partitions=[partition_id]) self._manager = FrontierManager.from_settings(settings) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('SCORING_TOPIC') self.strategy = strategy_module.CrawlStrategy() self.backend = self._manager.backend self.stats = {} self.cache_flush_counter = 0 self.job_id = 0 def work(self): consumed = 0 batch = [] fingerprints = set() try: for m in self._in_consumer.get_messages( count=self.consumer_batch_size, block=True, timeout=1.0): try: msg = self._decoder.decode(m.message.value) except (KeyError, TypeError) as e: logger.error("Decoding error: %s", e) continue else: type = msg[0] batch.append(msg) if type == 'add_seeds': _, seeds = msg fingerprints.update( map(lambda x: x.meta['fingerprint'], seeds)) continue if type == 'page_crawled': _, response, links = msg fingerprints.add(response.meta['fingerprint']) fingerprints.update( map(lambda x: x.meta['fingerprint'], links)) continue if type == 'request_error': _, request, error = msg fingerprints.add(request.meta['fingerprint']) continue raise TypeError('Unknown message type %s' % type) finally: consumed += 1 except OffsetOutOfRangeError as e: # https://github.com/mumrah/kafka-python/issues/263 self._in_consumer.seek(0, 2) # moving to the tail of the log logger.info( "Caught OffsetOutOfRangeError, moving to the tail of the log.") self.backend.fetch_states(list(fingerprints)) fingerprints.clear() results = [] for msg in batch: if len(results) > 1024: self._producer.send_messages(self.outgoing_topic, *results) results = [] type = msg[0] if type == 'add_seeds': _, seeds = msg for seed in seeds: seed.meta['jid'] = self.job_id results.extend(self.on_add_seeds(seeds)) continue if type == 'page_crawled': _, response, links = msg if response.meta['jid'] != self.job_id: continue results.extend(self.on_page_crawled(response, links)) continue if type == 'request_error': _, request, error = msg if request.meta['jid'] != self.job_id: continue results.extend(self.on_request_error(request, error)) continue if len(results): self._producer.send_messages(self.outgoing_topic, *results) if self.cache_flush_counter == 30: logger.info("Flushing states") self.backend.flush_states(is_clear=False) logger.info("Flushing states finished") self.cache_flush_counter = 0 self.cache_flush_counter += 1 if self.strategy.finished(): logger.info("Succesfully reached the crawling goal. 
Exiting.") exit(0) logger.info("Consumed %d items.", consumed) self.stats['last_consumed'] = consumed self.stats['last_consumption_run'] = asctime() def run(self): while True: self.work() def on_add_seeds(self, seeds): logger.info('Adding %i seeds', len(seeds)) seed_map = dict( map(lambda seed: (seed.meta['fingerprint'], seed), seeds)) self.backend.update_states(seeds, False) scores = self.strategy.add_seeds(seeds) self.backend.update_states(seeds, True) output = [] for fingerprint, score in scores.iteritems(): seed = seed_map[fingerprint] logger.debug('URL: %s', seed.url) if score is not None: encoded = self._encoder.encode_update_score( seed.meta['fingerprint'], score, seed.url, True) output.append(encoded) return output def on_page_crawled(self, response, links): logger.debug("Page crawled %s", response.url) objs_list = [response] objs_list.extend(links) objs = dict(map(lambda obj: (obj.meta['fingerprint'], obj), objs_list)) self.backend.update_states(objs_list, False) scores = self.strategy.page_crawled(response, links) self.backend.update_states(objs_list, True) output = [] for fingerprint, score in scores.iteritems(): obj = objs[fingerprint] if score is not None: encoded = self._encoder.encode_update_score( obj.meta['fingerprint'], score, obj.url, True) output.append(encoded) return output def on_request_error(self, request, error): self.backend.update_states(request, False) scores = self.strategy.page_error(request, error) self.backend.update_states(request, True) assert len(scores) == 1 fingerprint, score = scores.popitem() if score is not None: encoded = self._encoder.encode_update_score( request.meta['fingerprint'], score, request.url, False) return [encoded] return []
# In[ ]:

import json

# In[1]:

from kafka import KafkaClient, SimpleConsumer

kafka = KafkaClient("ec2-52-26-15-148.us-west-2.compute.amazonaws.com:9092")
consumer = SimpleConsumer(kafka, "my-group", "moviereview9")

# In[10]:

#time.sleep(6)
messages = consumer.get_messages(100)
print messages

# In[41]:

jsonList = [json.loads(message.message.value) for message in messages]
print jsonList

# In[42]:

print json.dumps(jsonList)

# In[43]:
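The list comprehension above assumes every message body is valid JSON; a slightly more defensive variant (illustrative only) would skip bodies that fail to decode:

jsonList = []
for message in messages:
    try:
        jsonList.append(json.loads(message.message.value))
    except ValueError:
        # ignore message bodies that are not valid JSON
        pass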
from __future__ import absolute_import, print_function  # , unicode_literals

import itertools
import base64
import sys

from streamparse.spout import Spout
from kafka import KafkaClient, SimpleProducer, SimpleConsumer
#from kafka.client import KafkaClient
#from kafka.consumer import SimpleConsumer

kafka = KafkaClient("cloud.soumet.com:9092")
kafka_consumer = SimpleConsumer(kafka, "storm", "realtime", max_buffer_size=1310720000)

for message in kafka_consumer.get_messages(count=5000, block=False):  # block=True, timeout=4
    print(message.message.value)

kafka_consumer.commit()
import sys
import json
import ConfigParser
from functools import partial

from kafka import KafkaClient, SimpleConsumer

import utils    # project-local helper providing load_class()
import storage  # project-local sink implementations (VerdictDB)

if len(sys.argv) < 2:
    print 'Usage: {} <config>'.format(sys.argv[0])
    sys.exit(-1)

config = ConfigParser.SafeConfigParser()
config.read(sys.argv[1])

# initialize data_feed
data_feed = KafkaClient(config.get('data_feed', 'host'))
consumer = SimpleConsumer(data_feed,
                          group=config.get('data_feed', 'group'),
                          topic=config.get('data_feed', 'topic'))
fetch_num_messages = config.getint('data_feed', 'fetch_num_messages')

# initialize data_sink
sink_config = dict(config.items('data_sink'))
class_name = sink_config.pop('class')
classobject = utils.load_class(class_name, storage.VerdictDB)
db = classobject(None, **sink_config)

removed = added = 0
for msg in consumer.get_messages(count=fetch_num_messages, block=False):
    try:
        msg = json.loads(msg.message.value)
        blacklist_type = msg['type']
        map(partial(db.insert, blacklist_type), msg['add'])
        added += len(msg['add'])
        map(partial(db.delete, blacklist_type), msg['remove'])
        removed += len(msg['remove'])
    except Exception as e:
        print '[{}] Error ingesting msg {}'.format(e, msg)

print 'added %d urls removed %d urls' % (added, removed)
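The script above is driven entirely by an INI-style config file. Something along these lines would satisfy the config.get()/config.getint() calls it makes (the host, group, topic and sink class below are placeholders, not values from the original project):

[data_feed]
host = localhost:9092
group = blacklist-ingest
topic = blacklist-updates
fetch_num_messages = 500

[data_sink]
class = storage.VerdictDB
; every remaining key in this section is passed to the sink class as a keyword argument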
import os
import sys

from kafka import KafkaClient, SimpleConsumer

#kafka = KafkaClient("ec2-52-8-194-192.us-west-1.compute.amazonaws.com:9092")
kafka = KafkaClient("ec2-52-8-194-192.us-west-1.compute.amazonaws.com")
kafka_consumer = SimpleConsumer(kafka, "my_group", "filmon-topic2")

messages = kafka_consumer.get_messages(count=1000, block=False)
if not messages:
    print "no messages to read"

for message in messages:
    print message.message.value
# stdlib
from collections import defaultdict

# 3p
from kafka import SimpleClient, SimpleConsumer

kafka_conn = SimpleClient("192.168.208.2:9092")
consumer = SimpleConsumer(kafka_conn, "sample_check", "test-topic", auto_commit=True)

for message in consumer.get_messages(count=10):
    print message.offset

consumer.commit()
def _run(self): pcount = 0 while True: try: self._logger.error("New KafkaClient %d" % self._partition) self._kfk = KafkaClient(self._brokers, str(os.getpid())) try: consumer = SimpleConsumer(self._kfk, self._group, self._topic, buffer_size=4096 * 4, max_buffer_size=4096 * 32) #except: except Exception as ex: template = "Consumer Failure {0} occured. Arguments:\n{1!r}" messag = template.format(type(ex).__name__, ex.args) self._logger.info("%s" % messag) raise RuntimeError(messag) self._logger.error("Starting %d" % self._partition) # Find the offset of the last message that has been queued consumer.seek(0, 2) try: mi = consumer.get_message(timeout=0.1) consumer.commit() except common.OffsetOutOfRangeError: mi = None #import pdb; pdb.set_trace() self._logger.info("Last Queued for %d is %s" % \ (self._partition,str(mi))) # start reading from last previously processed message if mi != None: consumer.seek(0, 1) else: consumer.seek(0, 0) if self._limit: raise gevent.GreenletExit while True: try: self.resource_check() mlist = consumer.get_messages(10, timeout=0.2) for mm in mlist: if mm is None: continue self._logger.debug("%d Reading offset %d" % \ (self._partition, mm.offset)) consumer.commit() pcount += 1 if not self.msg_handler(mm): self._logger.info("%d could not handle %s" % (self._partition, str(mm))) raise gevent.GreenletExit except TypeError as ex: self._logger.error("Type Error: %s trace %s" % \ (str(ex.args), traceback.format_exc())) gevent.sleep(0.1) except common.FailedPayloadsError as ex: self._logger.error("Payload Error: %s" % str(ex.args)) gevent.sleep(0.1) except gevent.GreenletExit: break except AssertionError as ex: self._partoffset = ex break except Exception as ex: template = "An exception of type {0} occured. Arguments:\n{1!r}" messag = template.format(type(ex).__name__, ex.args) self._logger.error("%s : traceback %s" % \ (messag, traceback.format_exc())) self.stop_partition() gevent.sleep(2) partdb = {} for coll in self._uvedb.keys(): partdb[coll] = {} for gen in self._uvedb[coll].keys(): partdb[coll][gen] = {} for tab in self._uvedb[coll][gen].keys(): for rkey in self._uvedb[coll][gen][tab].keys(): uk = tab + ":" + rkey partdb[coll][gen][uk] = \ set(self._uvedb[coll][gen][tab][rkey].keys()) self._logger.error("Stopping %d pcount %d" % (self._partition, pcount)) self.stop_partition() return self._partoffset, partdb
class ScoringWorker(object): def __init__(self, settings, strategy_module): kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self._producer = SimpleProducer(kafka, codec=CODEC_SNAPPY) partition_id = settings.get('SCORING_PARTITION_ID') if partition_id == None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") self._in_consumer = SimpleConsumer(kafka, settings.get('SCORING_GROUP'), settings.get('INCOMING_TOPIC'), buffer_size=1048576, max_buffer_size=10485760, partitions=[partition_id]) self._manager = FrontierManager.from_settings(settings) self._decoder = Decoder(self._manager.request_model, self._manager.response_model) self._encoder = Encoder(self._manager.request_model) self.consumer_batch_size = settings.get('CONSUMER_BATCH_SIZE', 128) self.outgoing_topic = settings.get('SCORING_TOPIC') self.strategy = strategy_module.CrawlStrategy() self.backend = self._manager.backend self.stats = {} self.cache_flush_counter = 0 self.job_id = 0 def work(self): consumed = 0 batch = [] fingerprints = set() try: for m in self._in_consumer.get_messages( count=self.consumer_batch_size, block=True, timeout=1.0): try: msg = self._decoder.decode(m.message.value) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: type = msg[0] batch.append(msg) if type == 'add_seeds': _, seeds = msg fingerprints.update( map(lambda x: x.meta['fingerprint'], seeds)) continue if type == 'page_crawled': _, response, links = msg fingerprints.add(response.meta['fingerprint']) fingerprints.update( map(lambda x: x.meta['fingerprint'], links)) continue if type == 'request_error': _, request, error = msg fingerprints.add(request.meta['fingerprint']) continue raise TypeError('Unknown message type %s' % type) finally: consumed += 1
class HHStrategyWorker(ScoringWorker): worker_prefix = 'hh-strategy-worker' def __init__(self, settings): super(HHStrategyWorker, self).__init__(settings, topic) self.slot = Slot(log_processing=self.work, incoming=self.incoming, outgoing=self.outgoing, is_master=settings.get("FRONTERA_MASTER")) kafka_hh = KafkaClient(settings.get('KAFKA_LOCATION_HH')) self.consumer_hh = SimpleConsumer( kafka_hh, settings.get('FRONTERA_GROUP'), settings.get('FRONTERA_INCOMING_TOPIC'), buffer_size=262144, max_buffer_size=10485760, auto_commit_every_n=1) self.producer_hh = SimpleProducer(kafka_hh) self.results_topic = settings.get("FRONTERA_RESULTS_TOPIC") self.job_config = {} self.zookeeper = ZookeeperSession(settings.get('ZOOKEEPER_LOCATION'), name_prefix=self.worker_prefix) kafka = KafkaClient(settings.get('KAFKA_LOCATION')) self.partitions_count = len( kafka.get_partition_ids_for_topic(settings.get('INCOMING_TOPIC'))) self.null_cycles = 0 def set_process_info(self, process_info): self.process_info = process_info self.zookeeper.set(process_info) def run(self): def onStart(): self.reset() self.slot.schedule() if self.slot.is_master: reset = CallLaterOnce(onStart) reset.setErrback(self.slot.error) reset.schedule(5.0) else: self.slot.schedule() reactor.run() def work(self): super(HHStrategyWorker, self).work() if self.stats['last_consumed'] == 0: self.null_cycles += 1 else: self.null_cycles = 0 if self.null_cycles == 500: logger.info("It seems crawler got stuck, performing self reset.") self.reset() self.null_cycles = 0 def incoming(self): if self.slot.is_active: return if not self.slot.is_master: logger.warn( "Incoming topic shouldn't be consumed on slave instances.") consumed = 0 try: for m in self.consumer_hh.get_messages(count=1): try: msg = loads(m.message.value) except ValueError, ve: logger.error("Decoding error %s, message %s" % (ve, m.message.value)) else: logger.info( "Got incoming message %s from incoming topic." % m.message.value) job_config = { 'workspace': msg['workspace'], 'nResults': msg.get('nResults', 0) / self.partitions_count, 'excluded': msg['excluded'], 'included': msg['included'], 'relevantUrl': msg['relevantUrl'], 'irrelevantUrl': msg['irrelevantUrl'], } self.reset() self.setup(job_config['relevantUrl'], job_config) finally: consumed += 1