def process(self, tup):  # renamed from "tuple" to avoid shadowing the builtin
    val = tup.values[0]
    # split on commas that sit outside double-quoted fields (CSV-safe split)
    line = re.compile(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)").split(val)
    cct = ChicagoCrimeObject()
    ccl = ChicagoCrimeLocation()
    ccb = ChicagoCrimeBeat()
    cct.id = str(line[0])
    cct.case_number = str(line[1])
    cct.date = str(line[2])
    cct.block = str(line[3])
    cct.iucr = str(line[4])
    cct.primary_type = str(line[5])
    cct.description = str(line[6])
    ccl.location_description = str(line[7])
    ccl.location = str(line[21])
    ccl.longitude = str(line[19])
    ccl.latitude = str(line[20])
    ccl.x_coordinate = str(line[15])
    ccl.y_coordinate = str(line[16])
    cct.location = ccl.toJSON()
    cct.arrest = str(line[8])
    cct.domestic = str(line[9])
    ccb.beat = str(line[10])
    ccb.community_area = str(line[13])
    ccb.district = str(line[11])
    ccb.ward = str(line[12])
    cct.beat = ccb.toJSON()
    cct.fbi_code = str(line[14])
    cct.year = str(line[17])
    cct.updated_on = str(line[18])
    log.info(cct.toJSON())
    storm.emit([cct.toJSON()])
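# The ChicagoCrime* record classes used above are not shown in this
# collection. A minimal sketch of what their toJSON() might look like,
# assuming it simply serializes whatever attributes the bolt assigned
# (hypothetical helper classes, not the originals):
import json

class ChicagoCrimeLocation(object):
    """Plain record object; attributes are assigned by the bolt."""

    def toJSON(self):
        # Dump every attribute set on this instance as a JSON object.
        return json.dumps(self.__dict__)

# ChicagoCrimeObject and ChicagoCrimeBeat would follow the same pattern.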
def nextTuple(self): """ 從kafka batch 讀取資料處理 messages (m) are namedtuples with attributes: m.offset: message offset on topic-partition log (int) m.value: message (output of deserializer_class - default is raw bytes) """ if self.consumer is None: log.debug("self.consumer is not ready yet.") return # log.debug("ExpSpout.nextTuple()") # time.sleep(3) # prototype減速觀察 try: for message in self.consumer: if message is not None: # log.warning("offset: %s \t value: %s \t at %s", message.offset, message.value, time.time()) if self.counter == 0: log.warning( "start process 1000000 records at {0} (timestamp)". format(time.time())) self.counter += 1 # self.emit_thread.append(message.value) storm.emit([message.value]) if self.counter % 10000 == 0: log.warning( "finish process {0} records at {1} (timestamp@{2})". format(self.counter, time.time(), socket.gethostname())) except Exception as inst: log.debug("Exception Type: %s ; Args: %s", type(inst), inst.args)
def nextTuple(self): """ 從kafka batch 讀取資料處理 messages (m) are namedtuples with attributes: m.offset: message offset on topic-partition log (int) m.value: message (output of deserializer_class - default is raw bytes) """ if self.consumer is None: log.debug("self.consumer is not ready yet.") return # log.debug("ExpSpout.nextTuple()") # time.sleep(3) # prototype減速觀察 try: for message in self.consumer: if message is not None: # log.warning("offset: %s \t value: %s \t at %s", message.offset, message.value, time.time()) if self.counter == 0: log.warning("start process 1000000 records at {0} (timestamp)".format(time.time())) self.counter += 1 # self.emit_thread.append(message.value) storm.emit([message.value]) if self.counter % 10000 == 0: log.warning("finish process {0} records at {1} (timestamp@{2})".format(self.counter, time.time(), socket.gethostname())) except Exception as inst: log.debug("Exception Type: %s ; Args: %s", type(inst), inst.args)
def process(self, tup):
    # Deliberately fail; everything below this raise is unreachable and only
    # mirrors the working WordCountBolt.process() for comparison.
    raise ValueError('abc')
    log.debug('WordCountBolt.process() called with: %s', tup)
    word = tup.values[0]
    self._count[word] += 1
    log.debug('WordCountBolt.process() emitting: %s', [word, self._count[word]])
    storm.emit([word, self._count[word]])
def process(self, tup):
    log.debug('WordCountBolt.process() called with: %s', tup)
    word = tup.values[0]
    self._count[word] += 1
    log.debug('WordCountBolt.process() emitting: %s', [word, self._count[word]])
    storm.emit([word, self._count[word]])
def process(self, tup): """ 將接收到的csv line, 切分成row, 並只傳有興趣的欄位給下個bolt """ if tup.is_tick_tuple(): log.debug("tuple is tick") else: # log.warning("get tuple whose id is {0}".format(tup.id)) line = tup.values[0] line = line.strip()[8:] # remove "message:" added by fluentd # log.warning("SplitBolt process: %s", line.strip()) raw_row = line.split(",") if len(raw_row) == 47: storm.emit([raw_row[6], raw_row[4], raw_row[17], raw_row[18]]) if self.counter == 0: log.warning( "start process 1000000 records at {0} (timestamp@{1})".format(time.time(), socket.gethostname()) ) self.counter += 1 if self.counter == 1000000: # this won't work since more than on instance log.warning( "finish process 1000000 records at {0} (timestamp@{1})".format( time.time(), socket.gethostname() ) )
def nextTuple(self): """ 從kafka batch 讀取資料處理 messages (m) are namedtuples with attributes: m.offset: message offset on topic-partition log (int) m.value: message (output of deserializer_class - default is raw bytes) """ if self.consumer is None: log.debug("self.consumer is not ready yet.") return log.debug("ExpSpout.nextTuple()") time.sleep(3) # prototype減速觀察 cursor = 0 try: for message in self.consumer: cursor += 1 if message is not None: log.debug("offset: %s \t value: %s", message.offset, message.value) storm.emit([message.value]) if cursor > 10000: # prototype減量觀察 break except NoPartitionsForConsumerException: log.debug("NoPartitionsForConsumerException")
def process(self, tup):
    count = 0
    if len(tup.values[0]) > 1 and (len(tup.values[0]) % 2) == 0:
        for word in self.get_words(tup.values[0]):
            if count == 0:
                helpcount = word  # remember the first word of the pair
                count += 1
            else:
                # emit the current word paired with the remembered one
                word2 = word.encode('utf-8') + ' ' + helpcount.encode('utf-8')
                count = 0
                storm.emit([word2])
def process(self, tup):
    if tup.is_tick_tuple():
        for t in self.rankedItems.itervalues():
            storm.emit(t.values)
    else:
        self.rankedItems[tup.values[0]] = tup
        if len(self.rankedItems) > self.maxSize:
            for t in sorted(self.rankedItems.itervalues(), key=tup_sort_key):
                del self.rankedItems[t.values[0]]
                break
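# tup_sort_key is referenced by the ranking bolts but not defined in this
# collection. A plausible sketch, assuming each cached tuple holds
# [item, count] and eviction should drop the lowest count first (an
# assumption, not the original helper):
def tup_sort_key(t):
    return t.values[1]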
def process(self, tup):
    if tup.is_tick_tuple():
        now = time.time()
        now_floor = int(math.floor(now / self.window_duration) * self.window_duration)
        first_window = int(now_floor - self.num_windows * self.window_duration)
        self.conn.zunionstore(
            'twitter_word_count',
            ['twitter_word_count:%s' % t for t in xrange(first_window, now_floor)])
        for t in self.conn.zrevrange('twitter_word_count', 0, self.maxSize,
                                     withscores=True):
            log.info('Emitting: %s', repr(t))
            storm.emit(t)
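# This bolt only reads the per-window sorted sets; it assumes a companion
# counting bolt populates keys named twitter_word_count:<window_start>. A
# sketch of that writer under the same windowing scheme (an assumption --
# the counting side is not shown here; zincrby argument order follows
# redis-py 2.x):
import math
import time

def count_word(conn, word, window_duration):
    window = int(math.floor(time.time() / window_duration) * window_duration)
    conn.zincrby('twitter_word_count:%s' % window, word, 1)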
def fail(self, msg_id):
    """
    Emit the message again, with an id composed of a prefix and the failed
    tuple's id.

    :param msg_id: id of the failed tuple
    """
    # log.warning("fail of message #{0}".format(msg_id))
    fail_id = "fail_{0}".format(msg_id)
    fail_message = self.message_pool[msg_id]
    self.message_pool[fail_id] = fail_message
    storm.emit([fail_message], id=fail_id)  # emit message again
    del self.message_pool[msg_id]
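# A fail() that replays from a message pool normally pairs with an ack()
# that evicts delivered messages; otherwise the pool grows without bound.
# A minimal sketch under that assumption (hypothetical; the matching ack()
# is not shown in this collection):
def ack(self, msg_id):
    self.message_pool.pop(msg_id, None)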
def nextTuple(self):
    for message in self.consumer:
        algo = message.value
        if len(algo) > 4:
            user = algo[:1]
            if user.isdigit():
                aux = 'INSTANT' + user
                algo = algo[2:len(algo)]
                if algo[0] == ' ':
                    algo = algo[1:len(algo)]
                self.db[aux].tweet.insert_one({'tweet': algo})
                storm.emit([algo, user])
def run(self):
    while True:
        if len(self.messages) != 0:
            # log.warning(self.messages.pop(0))
            storm.emit([self.messages.pop(0)])
            self.counter += 1
            if self.counter % 10000 == 0:
                log.warning("emit process {0} records at {1} (timestamp@{2})".format(
                    self.counter, time.time(), socket.gethostname()))
        else:
            time.sleep(0.01)
def run(self):
    while True:
        if len(self.messages) != 0:
            # log.warning(self.messages.pop(0))
            storm.emit([self.messages.pop(0)])
            self.counter += 1
            if self.counter % 10000 == 0:
                log.warning("#{0}".format(self.counter))
            if self.counter == 1000000:  # mark time
                log.warning("emit process 1000000 records at {0} (timestamp)".format(time.time()))
        else:
            time.sleep(0.01)
def nextTuple(self):
    for message in self.consumer:
        algo = message.value
        if len(algo) > 4:
            user = algo[:1]
            if user.isdigit():
                aux = 'BOARD' + user
                algo = algo[2:len(algo)]
                if algo[0] == ' ':
                    algo = algo[1:len(algo)]
                self.db[aux].bad.insert_one({'tweet': algo})
                algo = algo.encode('utf-8', 'ignore')
                storm.emit([algo, user])
def nextTuple(self):
    # if self._index == len(self.sentences):
    #     # This is just a demo; keep sleeping and returning None after we run
    #     # out of data. We can't just sleep forever or Storm will hang.
    #     time.sleep(1)
    #     return None
    time.sleep(0.25)
    sentence = self.sentences[random.randint(0, len(self.sentences) - 1)]
    # sentence = self.sentences[self._index]
    # self._index += 1
    log.debug('randomsentence emitting: %s', sentence)
    storm.emit([sentence])
def nextTuple(self):
    for message in self.consumer:
        algo = message.value
        if len(algo) > 4:
            user = algo[:1]
            if user.isdigit():
                aux = 'BOARD' + user
                algo = algo[2:len(algo)]
                if algo[0] == ' ':
                    algo = algo[1:len(algo)]
                self.db[aux].good.insert_one({'tweet': algo})
                algo = algo.encode('utf-8', 'replace')
                storm.emit([algo, user])
def nextTuple(self):
    # file = open('/home/pipe/twitterintel/topology/text.txt', 'a')
    for message in self.consumer:
        algo = message.value
        if len(algo) > 4:
            user = algo[:1]
            if user.isdigit():
                aux = 'BOARD' + user
                algo = algo[2:len(algo)]
                if algo[0] == ' ':
                    algo = algo[1:len(algo)]
                self.db[aux].spam.insert_one({'tweet': algo})
                algo = algo.encode('utf-8', 'replace')
                storm.emit([algo, user])
def process(self, tup): log.debug("HashtagCountBolt.process() started with: %s", tup) tag = tup.values[0] if tag != "None": self._count[tag] += 1 d = parse(tup.values[1]) date = calendar.timegm(d.timetuple()) db = MySQLdb.connect("localhost","root","password","twitter") cursor = db.cursor() sql = """INSERT INTO hashtags (hashtag, datetime, count) values ('%s', '%s', %d) on duplicate key update count=%d""" % (tag, date, self._count[tag], self._count[tag]) cursor.execute(sql) db.commit() storm.emit([tag, self._count[tag], date]) else: storm.emit(["None", "None", "None"])
def nextTuple(self): if self.consumer is None: print("self.consumer is not ready yet.") return try: for message in self.consumer: if message is not None: msg_id = str(self.counter) #log.info(">>> MESSAGE: " + message.value.decode('ascii')) storm.emit([message.value.decode('ascii')]) self.counter += 1 #log.info(">>>> COUNTER: " + self.counter) except Exception as inst: log.debug("Exception Type: %s ; Args: %s", type(inst), inst.args)
def toNextBolt(self): """ 將累計的同個msisdn的uplink & downlink emit給下個bolt 並重新累計 """ for msisdn in self.total_uplink: # see if we could pass list in storm tuple: False, emit members needs to be hashable # so ... merge list to str merged = ",".join(self.total_records[msisdn]) # log.debug("%s", [msisdn, merged]) storm.emit([msisdn, self.total_uplink[msisdn], self.total_downlink[msisdn], merged]) # clear accumulator self.total_uplink.clear() self.total_downlink.clear() self.total_records.clear() self.counter = 0
def nextTuple(self): """ consume message from kafka messages (m) are named tuples with attributes: m.offset: message offset on topic-partition log (int) m.value: message (output of deserializer_class - default is raw bytes) """ if self.consumer is None: log.debug("self.consumer is not ready yet.") return if self.counter >= 1000000: return # log.debug("ExpSpout.nextTuple()") # time.sleep(3) # prototype減速觀察 try: # message = self.consumer.consume() for message in self.consumer: if message is not None: # log.warning("offset: %s \t value: %s \t at %s", message.offset, message.value, time.time()) if self.counter == 0: self.start_time = time.time() log.warning( "start process 1000000 records at {0} (timestamp@{1})".format( time.time(), socket.gethostname() ) ) msg_id = str(self.counter) self.message_pool[msg_id] = message.value # message cache for fail over storm.emit([message.value], id=msg_id) self.counter += 1 if self.counter % 10000 == 0: log.warning("mark @ #{0}".format(self.counter)) if self.counter == 1000000: # mark time self.end_time = time.time() log.warning( "finish process 1000000 records at {0} (timestamp@{1})".format( time.time(), socket.gethostname() ) ) log.warning("spend {0} seconds processing 1000000 records".format(self.end_time - self.start_time)) if self.counter % 100 == 0: break except Exception as inst: log.debug("Exception Type: %s ; Args: %s", type(inst), inst.args)
def nextTuple(self): """ consume message from kafka messages (m) are named tuples with attributes: m.offset: message offset on topic-partition log (int) m.value: message (output of deserializer_class - default is raw bytes) """ if self.consumer is None: log.debug("self.consumer is not ready yet.") return if self.counter >= 1000000: return # log.debug("ExpSpout.nextTuple()") # time.sleep(3) # prototype減速觀察 try: # message = self.consumer.consume() for message in self.consumer: if message is not None: # log.warning("offset: %s \t value: %s \t at %s", message.offset, message.value, time.time()) if self.counter == 0: self.start_time = time.time() log.warning( "start process 1000000 records at {0} (timestamp@{1})" .format(time.time(), socket.gethostname())) msg_id = str(self.counter) self.message_pool[ msg_id] = message.value # message cache for fail over storm.emit([message.value], id=msg_id) self.counter += 1 if self.counter % 10000 == 0: log.warning("mark @ #{0}".format(self.counter)) if self.counter == 1000000: # mark time self.end_time = time.time() log.warning( "finish process 1000000 records at {0} (timestamp@{1})" .format(time.time(), socket.gethostname())) log.warning( "spend {0} seconds processing 1000000 records".format( self.end_time - self.start_time)) if self.counter % 100 == 0: break except Exception as inst: log.debug("Exception Type: %s ; Args: %s", type(inst), inst.args)
def process(self, tup):
    log.debug('MsgLogStorageBolt.process() called with: %s', tup)
    content_type = tup.values[0]
    delivery_tag = tup.values[1]
    msg_body = tup.values[2]
    msg = message.Message().createFromJSON(msg_body)
    try:
        self.storage.saveMsg(msg)
    except Exception as e:
        log.debug('Message failed to be stored')
        log.debug('Error: %s', e)
    else:
        # channel.basic_ack(delivery_tag=method.delivery_tag)
        pass
    # Emit the same message to the next bolt
    storm.emit([content_type, delivery_tag, msg_body])
def toNextBolt(self): """ 將累計的同個msisdn的uplink & downlink emit給下個bolt 並重新累計 """ for msisdn in self.total_uplink: # see if we could pass list in storm tuple: False, emit members needs to be hashable # so ... merge list to str merged = ",".join(self.total_records[msisdn]) # log.debug("%s", [msisdn, merged]) storm.emit([ msisdn, self.total_uplink[msisdn], self.total_downlink[msisdn], merged ]) # clear accumulator self.total_uplink.clear() self.total_downlink.clear() self.total_records.clear() self.counter = 0
def nextTuple(self):
    # if self._index == len(self.sentences):
    #     # This is just a demo; keep sleeping and returning None after we run
    #     # out of data. We can't just sleep forever or Storm will hang.
    #     time.sleep(1)
    #     return None
    # time.sleep(0.25)
    # sentence = self.sentences[random.randint(0, len(self.sentences) - 1)]
    # sentence = self.sentences[self._index]
    # self._index += 1
    # log.debug('rabbitmq_spout emitting: %s', sentence)
    # for word in sentence.split(' '):
    #     storm.emit([word])

    # Loop until external influence stops us
    if self.conn_broker.is_open:
        # Call basic_get, which returns the 3 frame types
        method, header, body = self.channel.basic_get(queue=self.QUEUE_NAME)
        print method, header, body
        # It can be empty if the queue is empty, so don't do anything
        if not method:
            time.sleep(self.SLEEP_TIME)
            return None
        if method.NAME == "Basic.GetEmpty":
            # No need to pound rabbit; sleep for a while. If you want messages
            # as fast as you can get them, use Basic.Consume
            time.sleep(self.SLEEP_TIME)
            return None
            ##storm.emit(['content_type', 'delivery_tag', 'msg_body'])
        # We have data
        else:
            # print "Basic.GetOk %s delivery-tag %i: %s" % (header.content_type,
            #                                               method.delivery_tag,
            #                                               body)
            storm.emit([header.content_type, method.delivery_tag, body])
            # Acknowledge the receipt of the data
            # TODO: do it at the end of processing; here it is done right after
            # delivering the data to the following bolt
            self.channel.basic_ack(delivery_tag=method.delivery_tag)
def nextTuple(self): if self.consumer is None: print("self.consumer is not ready yet.") return while True: msg = self.consumer.poll(timeout=1.0) if msg is None: continue if msg.error(): if msg.error().code() == KafkaError._PARTITION_EOF: log.info('%% %s [%d] reached end of offset %d\n' % (msg.topic(), msg.partition(), msg.offset())) elif msg.error(): raise KafkaException(msg.error()) else: print('%% %s [%d] at offset %d with key %s:\n' % (msg.topic(), msg.partition(), msg.offset(), str(msg.key()))) storm.emit(msg.value())
def process(self, tup):
    if tup.is_tick_tuple():
        for t in sorted(self.rankedItems.itervalues(), key=tup_sort_key,
                        reverse=True):
            log.info('Emitting: %s', repr(t.values))
            storm.emit(t.values)
    else:
        self.rankedItems[tup.values[0]] = tup
        if len(self.rankedItems) > self.maxSize:
            for t in sorted(self.rankedItems.itervalues(), key=tup_sort_key):
                del self.rankedItems[t.values[0]]
                break
        zero_keys = set(k for k, v in self.rankedItems.iteritems()
                        if v.values[1] == 0)
        for k in zero_keys:
            del self.rankedItems[k]
def process(self, tup):
    if len(tup.values[0]) > 1:
        count = 0
        words = self.get_words(tup.values[0].encode('utf-8', 'ignore'))
        if len(words) >= 2 and (len(words) % 2) == 0:
            for index in range(len(words)):
                if count == 0:
                    helpcount = words[index]
                    count += 1
                else:
                    word2 = helpcount + ' ' + words[index]
                    count = 0
                    storm.emit([word2, tup.values[1]])
        elif len(words) > 2:
            for index in range(len(words)):
                if words[len(words) - 1] == words[index]:
                    word2 = words[len(words) - 2] + ' ' + words[index]
                    storm.emit([word2, tup.values[1]])
                if count == 0:
                    helpcount = words[index]
                    count += 1
                else:
                    word2 = helpcount + ' ' + words[index]
                    count = 0
                    storm.emit([word2, tup.values[1]])
def process(self, tup): """ 將接收到的csv line, 切分成row, 並只傳有興趣的欄位給下個bolt """ if tup.is_tick_tuple(): log.debug("tuple is tick") else: line = tup.values[0] line = line.strip()[8:] # remove "message:" added by fluentd # log.warning("SplitBolt process: %s", line.strip()) raw_row = line.split(",") if len(raw_row) == 47: storm.emit([raw_row[6], raw_row[4], raw_row[17], raw_row[18]]) if self.counter == 0: log.warning( "start process 1000000 records at {0} (timestamp)". format(time.time())) self.counter += 1 if self.counter == 1000000: # this won't work since more than on instance log.warning( "finish process 1000000 records at {0} (timestamp)". format(time.time()))
def process(self, tup):
    log.debug('SplitHashtagBolt.process() started with: %s', tup)
    t = tup.values[0]
    if t.has_key('entities'):
        if t['entities']['hashtags']:
            for i in t['entities']['hashtags']:
                try:
                    tag = str(i['text'].decode("ascii"))
                    date = t['created_at']
                    storm.emit([tag, date])
                except:
                    tag = "None"
                    date = "None"
                    storm.emit([tag, date])
        else:
            tag = "None"
            date = "None"
            storm.emit([tag, date])
    else:
        tag = "None"
        date = "None"
        storm.emit([tag, date])
def nextTuple(self):
    for message in self.consumer:
        algo = message.value
        self.db.BOARD.bad.insert_one({'tweet': algo})
        storm.emit([algo])
def process(self, tup):
    for word in self.get_words(tup.values[0]):
        word = word.encode('utf-8')
        storm.emit([word])
def emitCurrentWindowCounts(self):
    counts = self.counter.getCountsThenAdvanceWindow()
    for k, v in counts.iteritems():
        storm.emit([k, v])
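# getCountsThenAdvanceWindow() implies a slot-based sliding-window counter.
# A minimal sketch of one possible implementation (an assumption -- the
# original counter class is not shown): keep one dict per slot, report the
# sum across all slots, then rotate to expire the oldest slot.
from collections import defaultdict

class SlidingWindowCounter(object):
    def __init__(self, num_slots):
        self.slots = [defaultdict(int) for _ in range(num_slots)]
        self.head = 0  # index of the slot currently being filled

    def incrementCount(self, obj):
        self.slots[self.head][obj] += 1

    def getCountsThenAdvanceWindow(self):
        totals = defaultdict(int)
        for slot in self.slots:
            for obj, count in slot.items():
                totals[obj] += count
        self.head = (self.head + 1) % len(self.slots)
        self.slots[self.head] = defaultdict(int)  # drop the oldest slot
        return dict(totals)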
def nextTuple(self):
    time.sleep(0.25)
    sentences = self.sentences
    sentence = sentences[random.randint(0, len(sentences) - 1)]
    self.log.debug('randomsentence emitting: %s', sentence)
    storm.emit([sentence])
def process(self, tup): words = tup.values[0].split(" ") for word in words: storm.emit([word])
def process(self, tup):
    log.debug('SplitSentenceBolt.process() called with: %s', tup)
    words = tup.values[0].split(" ")
    for word in words:
        log.debug('SplitSentenceBolt.process() emitting: %s', word)
        storm.emit([word])
def process(self, tup):
    for word in self.get_words(tup.values[0]):
        word = word.encode('utf-8', 'replace')
        storm.emit([word, tup.values[1]])
def process(self, tup):
    for word in self._get_words(tup.values[0]):
        storm.emit([word])
def process(self, tup):
    for word in self.get_words(tup.values[0]):
        storm.emit([word, tup.values[1]])
def process(self, tup):
    word = tup.values[0]
    self._count[word] += 1
    storm.emit([word, self._count[word]])
def nextTuple(self):
    tweet = self.queue.get()
    storm.emit([tweet])
    self.queue.task_done()
def nextTuple(self):
    time.sleep(0.25)
    sentence = self.sentences[random.randint(0, len(self.sentences) - 1)]
    storm.emit([sentence])
def nextTuple(self):
    time.sleep(0.25)
    sentence = self.sentences[random.randint(0, len(self.sentences) - 1)]
    log.debug("RandomSentence nextTuple emitting %s", sentence)
    storm.emit([sentence])
def emitCurrentWindowCounts(self):
    counts = self.counter.getCountsThenAdvanceWindow()
    for k, v in counts.iteritems():
        word2 = k.encode('utf-8') + ' ' + str(v)
        self.producer.send(self.topic, word2)
        storm.emit([k, v])
def nextTuple(self):
    tag = self.t[random.randint(0, len(self.t) - 1)]
    date = datetime.datetime.now()
    log.debug('hashtagspout emitting: %s', tag)
    storm.emit([tag, date])