class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts_list, topic, key=None):
        logging.Handler.__init__(self)
        self.kafka_client = KafkaClient(hosts_list)
        self.key = key
        self.kafka_topic_name = topic
        if not key:
            self.producer = SimpleProducer(self.kafka_client)
        else:
            self.producer = KeyedProducer(self.kafka_client)

    def emit(self, record):
        # drop kafka logging to avoid infinite recursion
        if record.name == 'kafka':
            return
        try:
            # use default formatting
            msg = self.format(record)
            if isinstance(msg, unicode):
                msg = msg.encode("utf-8")
            # produce message; send_messages is the non-deprecated keyed call
            if not self.key:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send_messages(self.kafka_topic_name, self.key, msg)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            self.handleError(record)

    def close(self):
        self.producer.stop()
        logging.Handler.close(self)
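How the handler is attached is not shown above; a minimal sketch of wiring it into the standard logging module, where the broker address, topic name, and format string are all assumptions:

# Hypothetical wiring, not part of the original snippet: broker address,
# topic name, and format string are placeholders.
import logging

logger = logging.getLogger("myapp")
logger.setLevel(logging.INFO)
handler = KafkaLoggingHandler("localhost:9092", "app_logs")
handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s"))
logger.addHandler(handler)
logger.info("this record is shipped to the app_logs topic")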
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        datagenerator = DataGenerator()
        # dispatch table: pick one of the synthetic event generators at random
        function_options = {
            0: datagenerator.click_event,
            1: datagenerator.view_event,
            2: datagenerator.bid_event,
            3: datagenerator.hover_event,
            4: datagenerator.load_event
        }
        while True:
            num = random.randint(0, 4)
            message_info = function_options[num]()
            print json.dumps(message_info)
            self.producer.send_messages('test_adability', source_symbol, message_info)
            msg_cnt += 1
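How this producer is launched is not shown; a minimal driver sketch, where the argument order, broker address, and the availability of DataGenerator on the path are assumptions:

# Hypothetical driver: argument order and broker address are assumptions.
import sys

if __name__ == "__main__":
    args = sys.argv
    ip_addr = str(args[1])        # Kafka broker, e.g. "localhost:9092"
    partition_key = str(args[2])  # used as the message key
    prod = Producer(ip_addr)
    prod.produce_msgs(partition_key)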
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)
        self.artist_id = []
        self.artwork_id = []

    def load_ids(self):
        artwork_path = "/home/ubuntu/Insight/dataset/Artsy/artwork_id.txt"
        artist_path = "/home/ubuntu/Insight/dataset/Artsy/artist_id.txt"
        # the with-blocks close the files; no explicit close() needed
        with open(artwork_path) as f1:
            for line in f1:
                if line != "":
                    self.artwork_id.append(line.strip())
        with open(artist_path) as f2:
            for line in f2:
                if line != "":
                    self.artist_id.append(line.strip())

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            user_field = random.choice(self.artist_id)
            art_field = random.choice(self.artwork_id)
            str_fmt = "{};{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field, user_field, "pin", art_field)
            self.producer.send_messages('pin_activity', source_symbol, message_info)
            msg_cnt += 1
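Note that produce_msgs assumes load_ids has already populated both ID lists; a minimal driver sketch (argument order and broker address are assumptions):

# Hypothetical driver: load_ids() must run before produce_msgs(),
# otherwise random.choice() is called on empty lists.
import sys

if __name__ == "__main__":
    prod = Producer(sys.argv[1])  # e.g. "localhost:9092"
    prod.load_ids()
    prod.produce_msgs(sys.argv[2])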
def process(time, lines):
    """match user with bidder
    Input:
        lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[0], str(time)), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): [(x, y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value])\
        .flatMap(lambda x: x)\
        .filter(lambda (x, y, s): s > .97)
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1]))
    print("========= %d =========" % rowRDD.count())
    if rowRDD.count() > 0:
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "tick" :"' + str(time.isoformat()) + '",'
            line += ' "uid" :"' + str(row['uid']) + '",'
            line += ' "score":"' + str(row['score']) + '",'
            line += ' "pid":' + str(row['pid']) + '}'
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)
        self.zipcode = []
        self.complaint = []

    def load_ids(self):
        zipcode_path = "/home/ubuntu/repos/project311/kafka/zipcodes.txt"
        complaint_path = "/home/ubuntu/repos/project311/kafka/complaint_type.txt"
        with open(zipcode_path, 'r') as f1:
            for line in f1:
                if line != "":
                    self.zipcode.append(line.strip())
        with open(complaint_path) as f2:
            for line in f2:
                if line != "":
                    self.complaint.append(line.strip())

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d%H%M%S")
            zipcode_field = random.choice(self.zipcode)
            complaint_field = random.choice(self.complaint)
            str_fmt = "{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field, zipcode_field, complaint_field)
            print message_info
            self.producer.send_messages('complaints', source_symbol, message_info)
            msg_cnt += 1
def process(time, lines):
    """match user with bidder
    Input:
        lines: (ts string, uid string, {pid:score} dict)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    rowRDD = lines.map(lambda x: (x['uid'], matchBids(x['score'])))\
        .map(lambda x: Row(uid=x[0], pid=x[1][0], price=x[1][1]))
    print("========= %d =========" % rowRDD.count())
    if rowRDD.count() > 0:
        # send to kafka
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "pid" :"' + str(row['pid']) + '",'
            line += ' "uid" :"' + str(row['uid']) + '",'
            line += ' "price":"' + str(row['price']) + '",'
            # quote the timestamp so the payload stays valid JSON
            line += ' "ts":"' + str(time) + '"}'
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
        # save to cassandra
        rowRDD.map(lambda x: Row(pid=x['pid'], ts=str(time), price=x['price']))\
            .toDF().write\
            .format("org.apache.spark.sql.cassandra")\
            .options(table='winningbid10s', keyspace='ad_flow')\
            .save(mode="append")
def process(time, lines):
    """1. select user to push ads
    2. save user-product corr table to cassandra
    3. match user with bidder
    4. save bid winner to cassandra
    Input:
        lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[0], time.isoformat()), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): [(x, y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value])\
        .flatMap(lambda x: x)\
        .filter(lambda (x, y, s): s > .97)
    rowRDD = runningWindow.map(lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1]))
    print("========= %d =========" % rowRDD.count())
    # push qualifying user-product pairs to kafka
    if rowRDD.count() > 0:
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "tick" :"' + str(time.isoformat()) + '",'
            line += ' "uid" :"' + str(row['uid']) + '",'
            line += ' "score":"' + str(row['score']) + '",'
            line += ' "pid":' + str(row['pid']) + '}'
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts="", topic="", partition=0):
        logging.Handler.__init__(self)
        self.kafkaClient = KafkaClient(hosts)
        self.topic = topic
        self.partition = partition
        self.producer = KeyedProducer(
            self.kafkaClient,
            async=False,
            req_acks=KeyedProducer.ACK_AFTER_LOCAL_WRITE,
            ack_timeout=200)

    def emit(self, record):
        # drop kafka logging to avoid infinite recursion
        if record.name == "kafka":
            return
        try:
            # use default formatting
            msg = self.format(record)
            # produce message; the logger name is appended to the base topic
            self.producer.send_messages(self.topic + record.name, self.partition, msg)
        except Exception:
            import traceback
            ei = sys.exc_info()
            traceback.print_exception(ei[0], ei[1], ei[2], None, sys.stderr)
            del ei

    def close(self):
        self.producer.stop()
        logging.Handler.close(self)
def process(time, lines):
    """Calculate user-product corr table and select ad-push events
    Input:
        lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[1], time.isoformat()), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): (x, [(pid, score) for (pid, score) in (
            (y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value) if score > .90]))\
        .filter(lambda (k, v): v != [])
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], score=x[1], ts=x[0][1]))
    print("========= %d =========" % rowRDD.count())
    # push the per-user score table to kafka
    if rowRDD.count() > 0:
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timeStamp" :"' + str(time) + '",'
            line += ' "uid" :"' + str(row['uid']) + '",'
            # json.dumps already supplies the quoting for the score dict
            line += ' "score":' + json.dumps(dict(row['score'])) + '}'
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_deal_urls(self, api_url=''):
        '''
        Constantly produce deal urls for consumers to crawl
        '''
        # TODO - Find total deals per category
        # TODO - Calculate number of pages to crawl
        # TODO - Produce categories and page range for consumers
        # {category_slug; start_page; end_page}

    def produce_msgs(self, source_symbol):
        price_field = random.randint(800, 1400)
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            price_field += random.randint(-10, 10) / 10.0
            volume_field = random.randint(1, 1000)
            str_fmt = "{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field, price_field, volume_field)
            print message_info
            self.producer.send_messages('price_data_part4', source_symbol, message_info)
            msg_cnt += 1
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_messages(self, data):
        while True:
            rows = np.random.randint(0, len(data) - 1, size=num_plays_persec)
            sampled_data = data.iloc[rows]
            curr_time = datetime.datetime.now()
            # create timestamp for camus to partition
            timestamp = datetime.datetime.strftime(curr_time, '%Y-%m-%d_%H:%M:%S')
            # create epoch timestamp for custom partitioning
            raw_timestamp = convert_datetime_to_est(curr_time)
            epoch = int(time.mktime(raw_timestamp.timetuple()))
            for idx, row in sampled_data.iterrows():
                json_data = {
                    'timestamp': timestamp,
                    'epoch_timestamp': epoch,
                    'player_id': row.player_id,
                    'player_name': row.player_name,
                    'position': row.position,
                    'yards': row.yards,
                    'touchdown': row.touchdown
                }
                message_info = json.dumps(json_data)
                # quarterbacks are keyed as 'QA'; all other positions key on themselves
                keystring = 'QA' if row.position == 'QB' else row.position
                key = b'{}'.format(keystring)
                self.producer.send_messages('nfl_plays', key, message_info)
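produce_messages relies on a pandas DataFrame and a module-level num_plays_persec that are not shown; a sketch of how it might be fed, where the CSV path, its columns, and the rate are all assumptions:

# Hypothetical driver: the CSV path, its columns, and num_plays_persec
# are assumptions; the real play-by-play source is not shown above.
import pandas as pd

num_plays_persec = 50

if __name__ == "__main__":
    # expects columns: player_id, player_name, position, yards, touchdown
    data = pd.read_csv("nfl_plays.csv")
    prod = Producer("localhost:9092")
    prod.produce_messages(data)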
def process(time, lines):
    """Processing tweets
    Input:
        lines: (ts string, uid string, state string, tweet vector)
    Output:
        Json: (ts string, uid string, topicVec vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    rowRDD = lines.map(lambda x: [((x['timeStamp'], x['userId']), word2vec(item))
                                  for item in x['tweet'] if isInVolcabulary(item)])\
        .flatMap(lambda x: x)\
        .filter(lambda (k, vec): vec != [])\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda x: Row(timestamp=x[0][0], uid=x[0][1], topicVec=x[1]))
    print("========= %d =========" % rowRDD.count())
    # push per-user topic vectors to kafka
    if rowRDD.count() > 0:
        client = SimpleClient(kafkaNodeBC.value)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timestamp" :"' + str(row['timestamp']) + '",'
            line += ' "uid" :"' + str(row['uid']) + '",'
            line += ' "topicVec":' + json.dumps(
                [float(i) for i in row['topicVec']]) + '}'
            producer.send_messages(outgoingTopic, str(hash(line)), line)
def unfollow_producer(users: List[Tuple[str]],
                      photos: Deque[Tuple[str, str]],
                      tags: List[Tuple[str]],
                      locations: List[Tuple[str, str]],
                      producer: KeyedProducer) -> Dict[str, str]:
    """
    Produce unfollow events to Kafka
    Arguments:
        users: List of users who can produce an event
        photos: Queue of recent photos and their usernames
        tags: List of company names
        locations: List of possible global lat/long coordinates
        producer: Kafka producer object to post messages
    Returns:
        Kafka message
    """
    followee, follower = random.choice(users)[0], random.choice(users)[0]
    created_time, partition_date = get_datetime()
    record = {
        "follower_username": follower,
        "followed_username": followee,
        "created_time": created_time,
        "partition_date": partition_date,
        "event": "unfollow"
    }
    producer.send_messages("unfollow", bytes(followee, 'utf-8'),
                           json.dumps(record).encode('utf-8'))
    return record
class KafkaLfProducer(object):

    def __init__(self, addr, conf_file, start_house_id, end_house_id, house_status):
        self.parser = SafeConfigParser()
        self.parser.read(conf_file)
        install_dir = self.parser.get('smw_tool', 'INSTALL_DIR')
        zipdb_file = self.parser.get('smw_tool', 'ZIP_DB_FILE')
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client, async=True,
                                      batch_send_every_n=500, batch_send=True)
        self.meterReader = MeterLfReader(start_house_id,
                                         end_house_id,
                                         house_status,
                                         install_dir + "/data/low_freq/",
                                         install_dir + "/" + zipdb_file)

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while not self.meterReader.houseSentDone():
            (isLf, msg) = self.meterReader.getRecord()
            if msg_cnt % 500000 == 0:
                print "Sent " + str(msg_cnt) + " messages to Kafka"
            if isLf:
                self.producer.send_messages('smw_batch_lf2', source_symbol, msg)
            else:
                self.producer.send_messages('smw_batch_hf2', source_symbol, msg)
            msg_cnt += 1
        print "Sent Total " + str(msg_cnt) + " messages to Kafka"
        self.meterReader.writeHouseStatus()
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.sess = requests.Session()
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        self.sess.mount('http://', adapter)
        self.sess.mount('https://', adapter)

    def produce_msgs(self, topic, source_symbol, last_record_set):
        self.record_set = set()
        count = 0
        try:
            for item in self.r["data"]:
                self.record_set.add(item["payment_id"])
                count += 1
                # only forward payments not seen in the previous poll
                if not item["payment_id"] in last_record_set:
                    message_info = "{}\n".format(json.dumps(item))
                    self.producer.send_messages(topic, source_symbol, message_info)
        except (KeyError, TypeError):
            # the feed was empty or malformed; skip this poll
            pass

    def get_venmo(self, limit=300, page="https://venmo.com/api/v5/public?"):
        try:
            self.r = self.sess.get(page + "&limit={}".format(limit)).json()
        except (requests.exceptions.RequestException, ValueError):
            self.r = ""
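get_venmo and produce_msgs are meant to run in a polling loop that carries record_set forward as the next poll's last_record_set; a sketch where the topic, key, and poll interval are assumptions:

# Hypothetical polling loop: topic name, key, and sleep interval are assumptions.
import time

if __name__ == "__main__":
    prod = Producer("localhost:9092")
    last_seen = set()
    while True:
        prod.get_venmo()
        prod.produce_msgs("venmo", "0", last_seen)
        last_seen = prod.record_set  # de-duplicate against the next poll
        time.sleep(1)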
def create_user_producer(users: List[Tuple[str]],
                         photos: Deque[Tuple[str, str]],
                         tags: List[Tuple[str]],
                         locations: List[Tuple[str, str]],
                         producer: KeyedProducer) -> Dict[str, str]:
    """
    Produce create-user events to Kafka
    Arguments:
        users: List of users who can produce an event
        photos: Queue of recent photos and their usernames
        tags: List of company names
        locations: List of possible global lat/long coordinates
        producer: Kafka producer object to post messages
    Returns:
        Kafka message
    """
    username, full_name = fake_user()
    created_time, partition_date = get_datetime()
    record = {
        "username": username,
        "full_name": full_name,
        "created_time": created_time,
        "partition_date": partition_date,
        "event": "create-user"
    }
    producer.send_messages("create-user", bytes(username, 'utf-8'),
                           json.dumps(record).encode('utf-8'))
    users.append((username,))
    return record
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client, async=True,
                                      batch_send_every_n=500, batch_send=False)
        self.min_steps = 1
        self.max_steps = 3
        self.max_users_each_thread = 12000

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while True:
            # each worker owns a disjoint block of user ids
            start_uuid = (int(source_symbol) - 1) * self.max_users_each_thread
            stop_uuid = (int(source_symbol) * self.max_users_each_thread) - 1
            uuid = random.sample(range(start_uuid, stop_uuid), 9)
            for uid in uuid:
                timestamp = datetime.now(timezone('US/Pacific')).strftime('%Y-%m-%d %H:%M:%S')
                steps = random.randint(1, 10)
                json_msg = {'source': source_symbol, 'uuid': uid,
                            'timestamp': timestamp, 'steps': steps}
                json_encoded = json.dumps(json_msg)
                self.producer.send_messages('steps_data_part4', source_symbol, json_encoded)
                print json_encoded
                msg_cnt += 1
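Here source_symbol doubles as a 1-based worker index selecting a disjoint block of 12000 user ids; a sketch of fanning out several workers, where the thread count and broker address are assumptions:

# Hypothetical fan-out: thread count and broker address are assumptions.
from threading import Thread

if __name__ == "__main__":
    threads = []
    for i in range(1, 5):  # source_symbol "1".."4", each owning 12000 uuids
        prod = Producer("localhost:9092")
        t = Thread(target=prod.produce_msgs, args=(str(i),))
        t.daemon = True
        t.start()
        threads.append(t)
    for t in threads:
        t.join()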
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol, file_source):
        # a with-block closes the source file when the replay finishes
        with open(file_source) as hd:
            for line in hd:
                print line
                self.producer.send_messages('datatest', source_symbol, line)
def run(self, delay=0.1):
    client = KafkaClient("localhost:9092")
    producer = KeyedProducer(client)
    import numpy as np
    for photoid in TESTPHOTOIDS:
        producer.send_messages('flickr-photoid', '%d' % np.random.randint(0, 20), photoid)
        print "Sending PhotoID: %s" % photoid
        time.sleep(delay)
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol, file_to_use):
        file_obj = open(file_to_use, 'r')
        msg_cnt = 0
        while True:
            try:
                message_info = file_obj.next()
            except StopIteration:
                # stop cleanly at end of file instead of crashing the loop
                break
            print message_info
            self.producer.send_messages('venmo2', source_symbol, message_info)
            msg_cnt += 1
class KafkaLoggingHandler(logging.Handler):
    """
    Use kafka to send msg to elk platform
    """

    def __init__(self, hosts_list, topic,
                 timeout_secs=DEFAULT_SOCKET_TIMEOUT_SECONDS, **kwargs):
        logging.Handler.__init__(self)
        self.kafka_client = KafkaClient(hosts_list, timeout=timeout_secs)
        # pop so the producer below does not receive an unexpected 'key' kwarg
        self.key = kwargs.pop("key", None)
        self.kafka_topic_name = topic
        if not self.key:
            self.producer = SimpleProducer(self.kafka_client, **kwargs)
        else:
            self.producer = KeyedProducer(self.kafka_client, **kwargs)

    def emit(self, record):
        """
        emit record
        :param record:
        :return:
        """
        # drop kafka logging to avoid infinite recursion
        if record.name == 'kafka':
            return
        try:
            # use default formatting
            msg = self.format(record)
            msg = msg.encode("utf-8")
            # produce message
            if not self.key:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send_messages(
                    self.kafka_topic_name, self.key, msg)
        except (KeyboardInterrupt, SystemExit):
            raise
        except BaseException:
            self.handleError(record)

    def close(self):
        """
        close the client
        :return:
        """
        if self.producer is not None:
            self.producer.stop()
        logging.Handler.close(self)
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def stream_science_posts(self, key):
        r = requests.session()
        header = {"User-Agent": "anisotropix Science"}
        s = r.get('https://www.reddit.com/r/science/new/.json?limit=100',
                  stream=True, headers=header)
        for post in s.iter_lines():
            if post:
                self.producer.send_messages('Science_posts', key, post)
                print(post)
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while True:
            artwork_path = "loc.txt"
            with open(artwork_path) as f1:
                for line in f1:
                    if line.strip():
                        print line.strip()
                        self.producer.send_messages('post_geo_activity', source_symbol, line.strip())
                        msg_cnt += 1
def write():
    k_client = KafkaClient(KAFKA_URL)
    p = KeyedProducer(k_client,
                      async=False,
                      req_acks=KeyedProducer.ACK_AFTER_LOCAL_WRITE,
                      ack_timeout=2000)
    messages = []
    for i in xrange(NUM_MESSAGES):
        message = json.dumps({'msg': 'X' * SIZE_MSG})
        messages.append(message)
        # flush in batches of 500, keyed by the current time in ms
        if len(messages) >= 500:
            key = int(time.time() * 1000)
            p.send_messages(KAFKA_TOPIC, str(key), *messages)
            messages = []
    # flush whatever is left over
    key = int(time.time() * 1000)
    p.send_messages(KAFKA_TOPIC, str(key), *messages)
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        # The 1999 KDDCup network traffic dataset
        self.data_file = open('/home/ubuntu/opt/realtimeAnomalies/src/main/test/kddcup.testdata.unlabeled', 'r')
        self.mem_data = []
        for record in self.data_file:
            self.mem_data.append(record)

    def produce_msgs(self, source_symbol):
        random.seed()
        while True:
            idx = random.randint(0, len(self.mem_data) - 1)
            str_fmt = "{}"
            message_content = str_fmt.format(self.mem_data[idx])
            self.producer.send_messages('traffic_data', source_symbol, message_content)
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.timezone = timezone('EST')

    def name_generator(self):
        return ''.join(
            random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVWZ')
            for i in range(random.randint(3, 9)))

    def item_generator(self):
        global item_lists
        return random.choice(item_lists)

    def produce_msgs(self):
        msg_cnt = 0
        auction_id = 0
        while True:
            auction_id += 1
            # Creation time (EST)
            create_time = datetime.now(self.timezone).strftime("%Y-%m-%d %H:%M:%S")
            # Auctioneer ID
            auctioner_id = random.randint(0, 100000)
            # Expiry: 2 hours to 4 days
            auction_type = random.randint(2, 96)
            # Starting price: 1 cent to $100
            starting_price = random.uniform(0.01, 100.0)
            # Auctioneer name
            auctioner_name = self.name_generator()
            # Item
            item = self.item_generator()
            str_fmt = "{};{};{};{};{};{};{}"
            message_info = str_fmt.format(auction_id, create_time, auctioner_id,
                                          auction_type, round(starting_price, 2),
                                          auctioner_name, item)
            print message_info
            self.producer.send_messages('auctions', str(random.randint(0, 4)), message_info)
            msg_cnt += 1
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def open_save(self, fileName):
        log_file = open(fileName, "w")
        log_file.close()
        return log_file

    def create_topic(self, topic):
        script = "/usr/local/kafka/bin/kafka-topics.sh"
        os.system("{} --create --zookeeper localhost:2181 --topic {} --partitions {} --replication-factor 2".format(script, topic, "4"))
        return "topic {} created".format(topic)

    def produce_msgs(self, source_symbol, topic):
        server_topics = self.client.topic_partitions
        if topic not in server_topics:
            self.create_topic(topic)
        price_field = random.randint(800, 1400)
        cities = ["Barcelona", "Philadelphia", "Honolulu", "Atlanta", "Miami",
                  "Chicago", "SF", "LA", "NYC", "Houston", "Paris", "London", "Tokyo"]
        msg_cnt = 0
        log_file = open("input1/{}.csv".format(topic), "a")
        while True:
            time_field = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')
            location_field = random.choice(cities)
            price_field += random.randint(-10, 10) / 10.0
            str_fmt = "{},{},{},{}"
            message_info = str_fmt.format(source_symbol, time_field, location_field, price_field)
            print message_info
            log_file.write("{}\n".format(message_info))
            self.producer.send_messages(topic, source_symbol, message_info)
            msg_cnt += 1
            if msg_cnt > 200000:
                log_file.close()
                self.producer.stop()
                break
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        price_field = random.randint(800, 1400)
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            price_field += random.randint(-10, 10) / 10.0
            volume_field = random.randint(1, 1000)
            str_fmt = "{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field, price_field, volume_field)
            print message_info
            self.producer.send_messages('price_data_part4', source_symbol, message_info)
            msg_cnt += 1
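The producer emits semicolon-delimited records; a sketch of how a consumer might decode one, with field names taken from the format string above (the sample values are illustrative only):

# Hypothetical decode of one 'price_data_part4' record produced above.
def parse_price_record(message_info):
    # fields follow str_fmt = "{};{};{};{}"
    source_symbol, time_field, price_field, volume_field = message_info.split(";")
    return {"source": source_symbol,
            "time": time_field,
            "price": float(price_field),
            "volume": int(volume_field)}

print parse_price_record("AAPL;20160101 120000;812.3;500")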
class Producer(object):

    # Initialization for the class with address
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.topic = 'ajay_test_topic'

    # Main method for simulation
    def produce_msgs(self, source_symbol):
        # Generate some random data
        price_field = random.randint(800, 1400)
        # Count the messages in the tunnel
        msg_cnt = 0
        # Loop over the fields
        while True:
            # Get a random time value
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            # Get a random price value
            price_field += random.randint(-10, 10) / 10.0
            # Get a random volume field
            volume_field = random.randint(1, 1000)
            # Format your string
            str_fmt = "{};{};{};{}"
            # Create the message
            message_info = str_fmt.format(source_symbol, time_field, price_field, volume_field)
            # Print for debug
            print message_info
            # Send the message
            self.producer.send_messages(self.topic, source_symbol, message_info)
            # Message count
            msg_cnt += 1
class Producer(object):

    def __init__(self, addr=None):
        self.isNone = True
        if addr is not None:
            self.client = SimpleClient(addr)
            self.producer = KeyedProducer(self.client)
            self.isNone = False

    def produce_msgs(self, source_symbol):
        random = Random(0)
        msg_cnt = 0
        start = 50
        for i in range(100):  # observation groups `start` through `start + 99`
            # time.sleep(10)  # optional wait between observation groups
            for x in range(3000):  # 3000 here means about 1000 per obs because there are 4 producers
                time.sleep(0.00001)  # throttle: wait before producing the next message
                self.observationgroup_field = random.randint(start + i, start + i)
                self.observationorder_field = random.randint(1, 6)
                self.frequency_field = random.random() * 10000
                self.snr_field = random.random() * 100
                self.driftrate_field = random.random() - random.random()
                self.uncorrectedfrequency_field = random.random() - random.random() + self.frequency_field
                str_fmt = "{};{};{};{};{};{};{}"
                message_info = str_fmt.format(source_symbol,
                                              self.observationgroup_field,
                                              self.observationorder_field,
                                              self.frequency_field,
                                              self.snr_field,
                                              self.driftrate_field,
                                              self.uncorrectedfrequency_field)
                if not self.isNone:
                    self.producer.send_messages('gbthits', source_symbol, message_info)
                else:
                    break
                msg_cnt += 1
            if self.isNone:
                break
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts_list, topic, **kwargs):
        logging.Handler.__init__(self)
        self.kafka_client = SimpleClient(hosts_list)
        # pop so the producer below does not receive an unexpected 'key' kwarg
        self.key = kwargs.pop("key", None)
        self.kafka_topic_name = topic
        if not self.key:
            self.producer = SimpleProducer(self.kafka_client, **kwargs)
        else:
            self.producer = KeyedProducer(self.kafka_client, **kwargs)

    def emit(self, record):
        # drop kafka logging to avoid infinite recursion
        if record.name == 'kafka':
            return
        try:
            # use default formatting
            msg = self.format(record)
            if isinstance(msg, unicode):
                msg = msg.encode("utf-8")
            # produce message
            if not self.key:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send_messages(self.kafka_topic_name, self.key, msg)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            self.handleError(record)

    def close(self):
        if self.producer is not None:
            self.producer.stop()
        logging.Handler.close(self)
class Producer(object):

    def __init__(self, addr, group_id):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.group_id = group_id

    def produce_msgs(self, source_file):
        with open(source_file, 'r') as f:
            lines = f.readlines()
        start_time = datetime.now()
        num_lines = 0
        line_inx = 0
        max_inx = len(lines)
        while line_inx < max_inx:
            token = lines[line_inx].strip().split()
            line_inx = line_inx % max_inx
            if token[2] != 'NaN':
                for num in range(1000):
                    user_id = "user_%s_%s" % (self.group_id, num)
                    event_time = (start_time + timedelta(0, num_lines)).strftime('%Y-%m-%d %H:%M:%S')
                    hr = int(token[2]) + randint(0, 4) - 2
                    msg = {'id': user_id, 'time': event_time, 'hr': hr}
                    json_msg = json.dumps(msg)
                    print json_msg
                    self.producer.send_messages('sensor', str(self.group_id), json_msg)
                line_inx += 1
                num_lines += 1
                line_inx = line_inx % max_inx
                time.sleep(2)
            line_inx += 1
            num_lines += 1
            line_inx = line_inx % max_inx
class Producer(object):
    '''
    Messages are sent to a single kafka topic "Friendsquare" as a json formatted string
    '''

    def __init__(self, addr, userslist, venueslist):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.userslist = userslist[0:500000]
        self.venueslist = venueslist[0:250000]

    def produce_msgs(self, partitionkey):
        new_time = datetime.now()
        msg_cnt = 0
        while True:
            if (msg_cnt % 4000) != 0:
                userid = int(random.choice(self.userslist))
                venueid = int(random.choice(self.venueslist))
            else:
                userid = int(random.choice(self.userslist[0:500000]))
                venueid = int(random.choice(self.venueslist[0:250000]))
            rating = random.randint(0, 5)
            randomdelta = np.random.normal(3, 3, 1)[0]
            new_time += timedelta(seconds=randomdelta)
            created_time = new_time.strftime("%Y-%m-%d %H:%M:%S")
            message_info = {
                'partitionkey': partitionkey,
                'userid': userid,
                'venueid': venueid,
                'created_at': created_time,
                'rating': rating
            }
            msg_info = json.dumps(message_info)
            print message_info
            self.producer.send_messages('Friendsquare1', partitionkey, msg_info)
            msg_cnt += 1
            time.sleep(0.01)
def create_photo_producer(users: List[Tuple[str]],
                          photos: Deque[Tuple[str, str]],
                          tags: List[Tuple[str, str]],
                          locations: List[Tuple[str, str]],
                          producer: KeyedProducer) -> Dict[str, str]:
    """
    Produce photo-upload events to Kafka
    Arguments:
        users: List of users who can produce an event
        photos: Queue of recent photos and their usernames
        tags: List of company names
        locations: List of possible global lat/long coordinates
        producer: Kafka producer object to post messages
    Returns:
        Kafka message
    """
    user = random.choice(users)[0]
    tag, link = random.choice(tags)
    latitude, longitude = random.choice(locations)
    created_time, partition_date = get_datetime()
    record = {
        "username": user,
        "tags": tag,
        "photo_link": link,
        "created_time": created_time,
        "partition_date": partition_date,
        "latitude": latitude,
        "longitude": longitude,
        "event": "photo-upload"
    }
    producer.send_messages('photo-upload', bytes(user, 'utf-8'),
                           json.dumps(record).encode('utf-8'))
    photos.append((created_time, user))
    return record
def comment_producer(users: List[Tuple[str]],
                     photos: Deque[Tuple[str, str]],
                     tags: List[Tuple[str]],
                     locations: List[Tuple[str, str]],
                     producer: KeyedProducer) -> Optional[Dict[str, str]]:
    """
    Produce comment events to Kafka
    Arguments:
        users: List of users who can produce an event
        photos: Queue of recent photos and their usernames
        tags: List of company names
        locations: List of possible global lat/long coordinates
        producer: Kafka producer object to post messages
    Returns:
        Kafka message
    """
    if not photos:
        return None
    follower = random.choice(users)[0]
    photo, followee = random.choice(photos)
    text = get_text()
    created_time, partition_date = get_datetime()
    if not all([photo, follower, followee]):
        return None
    record = {
        "follower_username": follower,
        "followed_username": followee,
        "photo_id": photo,
        "text": text,
        "created_time": created_time,
        "partition_date": partition_date,
        "event": "comment"
    }
    producer.send_messages("comment", bytes(followee, 'utf-8'),
                           json.dumps(record).encode('utf-8'))
    return record
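The event functions above (unfollow, create-user, photo-upload, comment) share one signature, so a driver can dispatch among them at random; a minimal sketch, in which the broker address, seed data, and uniform event mix are all assumptions:

# Hypothetical driver: broker address, seed data, and the uniform event mix
# are assumptions; the original shared state is built elsewhere.
import random
from collections import deque

from kafka import SimpleClient, KeyedProducer

client = SimpleClient("localhost:9092")
producer = KeyedProducer(client)

users = [("alice",), ("bob",)]
photos = deque(maxlen=1000)                        # (created_time, username)
tags = [("acme", "https://example.com/acme.jpg")]  # (company, photo link)
locations = [("40.71", "-74.00")]

event_fns = [create_user_producer, create_photo_producer,
             comment_producer, unfollow_producer]
while True:
    fn = random.choice(event_fns)
    fn(users, photos, tags, locations, producer)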
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        price_field = random.randint(800, 1400)
        msg_cnt = 0
        category_product = [('furniture', 'cat bed'),
                            ('dog food', 'purina dog biscuits'),
                            ('cat food', 'fancy feast 8oz'),
                            ('cleaning', 'roomba')]
        while True:
            rand_datetime = radar.random_datetime(
                start=datetime(year=2016, month=5, day=24),
                stop=datetime(year=2017, month=2, day=1))
            time_field = rand_datetime.strftime("%Y%m%d %H%M%S")
            price_field += random.randint(-10, 10) / 10.0
            # cover all four categories; the original randint(0, 2) never picked 'cleaning'
            product_cat_listid = random.randint(0, len(category_product) - 1)
            customer_field = random.randint(1, 10000)
            product_field = category_product[product_cat_listid][1]
            category_field = category_product[product_cat_listid][0]
            volume_field = random.randint(1, 10)
            str_fmt = "{};{};{};{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field, price_field,
                                          volume_field, customer_field,
                                          product_field, category_field)
            print message_info
            self.producer.send_messages('transactiondata', source_symbol, message_info)
            msg_cnt += 1
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts_list, topic,
                 timeout_secs=DEFAULT_SOCKET_TIMEOUT_SECONDS, **kwargs):
        logging.Handler.__init__(self)
        self.kafka_client = KafkaClient(hosts_list, timeout=timeout_secs)
        # pop so the producer below does not receive an unexpected 'key' kwarg
        self.key = kwargs.pop("key", None)
        self.kafka_topic_name = topic
        if not self.key:
            self.producer = SimpleProducer(self.kafka_client, **kwargs)
        else:
            self.producer = KeyedProducer(self.kafka_client, **kwargs)
        self.addFilter(KafkaLoggingFilter())

    def emit(self, record):
        try:
            # use default formatting
            msg = self.format(record)
            if isinstance(msg, unicode):
                msg = msg.encode("utf-8")
            # produce message
            if not self.key:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send_messages(self.kafka_topic_name, self.key, msg)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            self.handleError(record)

    def close(self):
        if self.producer is not None:
            self.producer.stop()
        logging.Handler.close(self)
class KafkaHandler(logging.Handler):
    """
    publish message to kafka
    """

    def __init__(self, topic, producer_type=ProducerType.SIMPLE,
                 host_port="127.0.0.1:9092", **producer_opts):
        # the base-class init was missing; without it the handler has no
        # lock or level and fails at emit time
        logging.Handler.__init__(self)
        self.topic = topic
        self.host_port = host_port
        if producer_type == ProducerType.SIMPLE:
            self.producer = SimpleProducer(KafkaClient(host_port),
                                           **producer_opts)
        else:
            self.producer = KeyedProducer(KafkaClient(host_port),
                                          **producer_opts)

    def emit(self, record):
        try:
            response = self.producer.send_messages(self.topic,
                                                   self.format(record))
        except Exception:
            raise
def test_switch_leader_keyed_producer(self):
    topic = self.topic
    producer = KeyedProducer(self.client, async=False)

    # Send 10 random messages
    for _ in range(10):
        key = random_string(3)
        msg = random_string(10)
        producer.send_messages(topic, key, msg)

    # kill leader for partition 0
    self._kill_leader(topic, 0)

    recovered = False
    started = time.time()
    timeout = 60
    while not recovered and (time.time() - started) < timeout:
        try:
            key = random_string(3)
            msg = random_string(10)
            producer.send_messages(topic, key, msg)
            if producer.partitioners[kafka_bytestring(topic)].partition(key) == 0:
                recovered = True
        except (FailedPayloadsError, ConnectionError):
            logging.debug("caught exception sending message -- will retry")
            continue

    # Verify we successfully sent the message
    self.assertTrue(recovered)

    # send some more messages just to make sure no more exceptions
    for _ in range(10):
        key = random_string(3)
        msg = random_string(10)
        producer.send_messages(topic, key, msg)
from kafka.client import SimpleClient as KafkaClient
from kafka.producer import KeyedProducer
import json
import time

KAFKA_URL = '192.168.10.6:9092'
KAFKA_GROUP = 'kafka_python_perf'
KAFKA_TOPIC = 'raw-events'
NUM_MESSAGES = 10
SIZE_MSG = 369

k_client = KafkaClient(KAFKA_URL)
p = KeyedProducer(k_client,
                  async=False,
                  req_acks=KeyedProducer.ACK_AFTER_LOCAL_WRITE,
                  ack_timeout=2000)
messages = []
while 1:
    for i in xrange(NUM_MESSAGES):
        message = json.dumps({'msg': 'X' * SIZE_MSG})
        messages.append(message)
        if len(messages) >= 500:
            key = int(time.time() * 1000)
            p.send_messages(KAFKA_TOPIC, str(key), *messages)
            messages = []
            print("wrote 500")
    time.sleep(1)
# select a random piece of news from the collected set of tweets and send it as a Kafka message
DATADIR = "/home/ubuntu/synthetic_twitter/"
KAFKA_NODE = "ec2-54-215-247-116.us-west-1.compute.amazonaws.com"
KAFKA_TOPIC = "twitter"

os.chdir(DATADIR)
files = glob.glob("*.archive")

# select a random company
datafile = random.choice(files)
datafile = DATADIR + datafile

# select a random line in the data file for news
# R(3.4.2) (Waterman's "Reservoir Algorithm")
data = open(datafile, "r")
line = next(data)
for num, nextline in enumerate(data):
    if random.randrange(num + 2):
        continue
    line = nextline
data.close()

# add "Synthetic Twitter" as the news outlet
line = line.rstrip().replace('}', ', "newsoutlet":"Synthetic Twitter"}')
# add a timestamp
line = line.replace('}', ',"newstime":"' + time.strftime("%c") + '"}')

# create the producer and send the message
client = KafkaClient(KAFKA_NODE)
producer = KeyedProducer(client)
producer.send_messages(KAFKA_TOPIC, str(hash(line) % 2), line)
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def open_save(self, fileName):
        log_file = open(fileName, "w")
        log_file.close()
        return log_file

    def produce_msgs(self, source_symbol, topic, items):
        print datetime.now()
        sample = []
        saved = checkCassandra()
        for item in items:
            listing = []
            log_file = open("justsoldonebay.csv", "a")
            if item.sellingStatus.sellingState == "EndedWithSales":
                gender = get_category(item.primaryCategory.categoryId)
                shoe = get_shoe(item.title.lower())
                if item.itemId in saved:
                    print item.itemId, item.title
                if item.itemId not in saved:
                    zip_code2 = ""
                    try:
                        zip_code = item.postalCode
                        try:
                            int(zip_code)
                        except (ValueError, TypeError):
                            zip_code = 0
                        # save it elsewhere
                        zip_code2 = item.postalCode
                    except AttributeError:
                        zip_code = 0
                    try:
                        location = item.location
                    except AttributeError:
                        location = "0"
                    try:
                        gallery = item.galleryURL
                    except AttributeError:
                        gallery = "NA"
                    price = item.sellingStatus.convertedCurrentPrice.value
                    pprice = "${}{}".format(price, item.sellingStatus.convertedCurrentPrice._currencyId)
                    start = str(item.listingInfo.startTime)
                    end = str(item.listingInfo.endTime)
                    listing = [start, item.listingInfo.endTime, item.viewItemURL,
                               item.itemId, shoe, price, zip_code, gender,
                               gallery, location, item.title, zip_code2]
                    sample.append(listing)
        print "done building q"
        msg_cnt = 0
        # guard on the sample length so an empty batch does not raise IndexError
        while msg_cnt < len(sample):
            str_fmt = "{};{};{};{};{};{};{};{};{};{};{};{}"
            x = sample[msg_cnt]
            message_info = str_fmt.format(x[0], x[1], x[2], x[3], x[4], x[5],
                                          x[6], x[7], x[8], x[9], x[10], x[11])
            print message_info
            log_file.write(str(len(x)) + " " + message_info + "\n")
            self.producer.send_messages('justsoldonebay', "1", message_info)
            msg_cnt += 1
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_house_centric_msgs(self, source_symbol, topic):
        # Declare variables
        timing_list = []
        # Read throughput timing distribution
        with open('data/throughput/throughput_timing.txt', 'rU') as infile:
            for line in infile:
                timing_list.append(line.strip())
        # Run forever
        while True:
            # Pick random historical data - there are 86 time points
            random_historical_data = random.randint(0, 85)
            zipcode_sales_dict = parse_prefilter_data('data/tri_zipcode_sales/%s.txt' % random_historical_data)
            zipcode_price_dict = parse_prefilter_data('data/tri_zipcode_price/%s.txt' % random_historical_data)
            random_zipcode_sales_list = build_random_zipcode_list(zipcode_sales_dict)
            distribution_zipcode_sales_list = build_distribution_based_zipcode_list(zipcode_sales_dict)
            t_end = time.time() + 60 * 60  # every hour
            while time.time() < t_end:
                time_field = datetime.now().strftime("%Y%m%d-%H%M%S")
                user_id_field = 0
                random_number = random.randint(0, (len(random_zipcode_sales_list) - 1))
                user_zipcode = random_zipcode_sales_list[random_number]
                house_field = emit_random_zipcode_price(distribution_zipcode_sales_list, zipcode_price_dict)
                str_fmt = """{{"timestamp":"{}","user":{{"id":"{}","zipcode":"{}"}},"house":{}}}"""
                message_info = str_fmt.format(time_field, user_id_field, user_zipcode, house_field)
                print message_info
                self.producer.send_messages(topic, source_symbol, message_info)
                if float(timing_list[random_historical_data]) != 0:
                    time.sleep(float(timing_list[random_historical_data]))

    def produce_user_centric_msgs(self, source_symbol, topic):
        user_id_list = ('1', '2')
        user_zipcode_list = ('10461', '07304')
        house_zipcode_list = ('10545', '07304')
        price_list = ('315789', '299679')
        # Run forever
        while True:
            i = random.randint(0, 1)  # Pick a random user
            t_end = time.time() + 60 * 360  # for 6 hours
            while time.time() < t_end:
                time_field = datetime.now().strftime("%Y%m%d-%H%M%S")
                user_id_field = user_id_list[i]
                user_zipcode = user_zipcode_list[i]
                another_random_number = random.randint(
                    int(float(price_list[i]) - (float(price_list[i]) * 0.3)),
                    int(float(price_list[i]) + (float(price_list[i]) * 0.5)))
                house_field = '{"zipcode":"{%s}","price":"{%s}"}' % (house_zipcode_list[i], another_random_number)
                str_fmt = """{{"timestamp":"{}","user":{{"id":"{}","zipcode":"{}"}},"house":{}}}"""
                message_info = str_fmt.format(time_field, user_id_field, user_zipcode, house_field)
                print message_info
                self.producer.send_messages(topic, source_symbol, message_info)
                time.sleep(30)  # send a message every 30 seconds
class Producer(object):
    _msg_cnt = 0

    def __init__(self, addr):
        print "Trying connection..."
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)
        print "Made connection with host: {}".format(addr)
        self._last_update = datetime.utcnow()  # For latest deals
        self._more_pages = 10
        self._chunk_size = 10

    def produce_deal_urls(self, url, topic, partition_key,
                          max_deals_per_page=100, initial_visit=True):
        '''
        Constantly produce deal urls for consumers to crawl
        '''
        if not initial_visit:
            # Get the right URL to crawl
            # Search the UTC time delta since last visit for this category
            checked_last = self._last_update.strftime("%Y-%m-%dT%H:%M:%S%Z")
            url = '{};updated_after={}'.format(url, checked_last)
        req = self.fetch_request(url)
        if req.ok:
            # Calculate number of pages to crawl
            # Max 100 per page, crawl total//100
            try:
                total_category = req.json()['query']['total']
                if total_category > 0:
                    num_pages_to_fetch = ((total_category / max_deals_per_page) + 1)
                    '''
                    Produce categories and page range for consumers.
                    Crawl extra pages to account for changing api.
                    Pages with no deals will be filtered out by consumer.
                    Recommended approaches to partitioning:
                    1. max(t/p, t/c) partitions.
                        - t: required throughput
                        - p: production speed
                        - c: consumption speed
                    2. Rule of thumb - 100 * b * r
                        - b: # of brokers in cluster
                        - r: replication factor
                    {category_slug; start_page; end_page}
                    '''
                    total_pages = range(1, num_pages_to_fetch + self._more_pages)
                    page_chunks = list(self.yield_chunks(total_pages, self._chunk_size))
                    for chunk in page_chunks:
                        time_stamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S%Z")
                        msg = '{} => {} => {}'.format(time_stamp, url, chunk)
                        print msg
                        self.producer.send_messages(topic, str(partition_key), msg)
                        self.__class__._msg_cnt += 1
                    self._last_update = datetime.utcnow()
            except simplejson.scanner.JSONDecodeError:
                pass

    def produce_deal_full_data(self):
        time_stamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S%Z")
        pass

    def get_total_msg_prod(self):
        '''
        Returns how many messages all instances of producer sent
        '''
        return self.__class__._msg_cnt

    def fetch_request(self, url):
        '''
        Return url to endpoint
        '''
        return rq.get(url)

    def yield_chunks(self, int_list, num):
        '''
        Yield successive chunks of size num from lists
        '''
        for idx in xrange(0, len(int_list), num):
            yield int_list[idx:idx + num]
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)
        # bounding box for the random walk (Manhattan)
        self.minLat = 40.708751
        self.maxLat = 40.802895
        self.minLong = -74.025879
        self.maxLong = -73.930435
        self.counter_start = 0
        # hours of the day during which each simulated user is available
        self.available = [[8, 9, 10, 17, 18, 19, 20, 21],
                          [8, 9, 10, 11, 16, 17, 18, 19, 20],
                          [8, 9, 15, 16, 17, 18],
                          [14, 15, 16, 17, 18, 19, 20, 21, 22],
                          [10, 11, 12, 22, 23, 00],
                          [12, 13, 14, 15, 16],
                          [19, 20, 21, 22, 23, 00],
                          [00, 01, 02],
                          [8, 11, 13, 15, 17, 19, 21, 22, 23],
                          [8, 2],
                          [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12],
                          [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]

    def produce_msgs(self, name):
        # start each user at a random point inside the bounding box
        lat_frac = random.random()
        long_frac = random.random()
        latitude = lat_frac * self.minLat + (1 - lat_frac) * self.maxLat
        longitude = long_frac * self.minLong + (1 - long_frac) * self.maxLong
        schedule = self.available[random.randint(0, len(self.available) - 1)]
        steps = self.counter_start
        while True:
            direction = random.randint(0, 4)
            t = datetime.now().strftime("%Y%m%d %H%M%S")
            hr = datetime.now().hour
            if hr == 0:
                steps = self.counter_start
            availability = hr in schedule
            if direction == 0:
                steps += 1
                if latitude >= self.maxLat:
                    latitude = latitude - 0.00001124152
                else:
                    latitude = latitude + 0.00001124152
            elif direction == 1:
                steps += 1
                if longitude >= self.maxLong:
                    longitude = longitude - 0.00001124152
                else:
                    longitude = longitude + 0.00001124152
            elif direction == 2:
                steps += 1
                if latitude <= self.minLat:
                    latitude = latitude + 0.00001124152
                else:
                    latitude = latitude - 0.00001124152
            elif direction == 3:
                steps += 1
                # the original compared latitude against minLong; this branch moves longitude
                if longitude <= self.minLong:
                    longitude = longitude + 0.00001124152
                else:
                    longitude = longitude - 0.00001124152
            else:
                pass
            str_fmt = "{};{};{};{};{};{}"
            message_info = str_fmt.format("user_" + str(name), t, latitude,
                                          longitude, availability, steps)
            print message_info
            self.producer.send_messages('b', name, message_info)
            time.sleep(0.1)
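Each simulated user walks independently, so one process per user name is a natural driver; a sketch where the user count and broker address are assumptions:

# Hypothetical fan-out: number of simulated users and broker address are assumptions.
from multiprocessing import Process

if __name__ == "__main__":
    procs = []
    for name in range(10):  # ten simulated users: user_0 .. user_9
        prod = Producer("localhost:9092")
        p = Process(target=prod.produce_msgs, args=(str(name),))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()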