def createInputBufferScript(self):
    # For each input script type (text, or image, when we get to it...)
    for _bufferscript in self.input_scripts:
        # Get the input streams for this script...
        for _inputsource in self.inverted_buffer_index[_bufferscript]:
            bufferscriptname = self.config[_bufferscript]["script"]
            bufferlogname = self.config["configuration"]["input-streams"][_inputsource]["name"]
            exportkey = self.config[_bufferscript]["export-key"]
            dataprocessor = self.config["configuration"]["input-streams"][_inputsource]["processor_script"]
            importkey = self.config["configuration"]["input-streams"][_inputsource]["import-key"]
            bufferStr = \
'''#!/bin/sh
cd {homedir}
if ps up `cat {logdir}/{bufferlogname}.pid ` > /dev/null
then
    printf "{bufferscriptname}.py is already running\\n" >> {logdir}/{bufferlogname}.out
else
    printf "{bufferlogname} is no longer running. Deleting PID file.\\n" >> {logdir}/{bufferlogname}.out
    rm {logdir}/{bufferlogname}.pid >> {logdir}/{bufferlogname}.out
    printf "Deleted file\\n" >> {logdir}/{bufferlogname}.out
    printf "Starting {bufferscriptname}.py\\n" >> {logdir}/{bufferlogname}.out
    nohup ./assed_env/bin/python {assedscript}/{bufferscriptname}.py {logdir} {importkey} {exportkey} {dataprocessor} {dataprocessorscriptdir} {pidname} >> {logdir}/{bufferlogname}.log 2>&1 &
fi'''.format(homedir=self.home_dir, logdir=self.log_dir, bufferscriptname=bufferscriptname,
             bufferlogname=bufferlogname, assedscript=self.assed_sript_dir, importkey=importkey,
             exportkey=exportkey, dataprocessor=dataprocessor,
             dataprocessorscriptdir=self.script_dir_importname, pidname=bufferlogname)
            self.inputBufferScriptFile = os.path.join(self.sh_dir, bufferlogname + ".sh")
            self.writeScript(self.inputBufferScriptFile, bufferStr)
            helper_utils.std_flush("Generated script for Input Buffer at %s" % self.inputBufferScriptFile)
def createProcessScripts(self):
    for _processscript in self.process_scripts:
        scriptname = self.config[_processscript]["script"]
        processname = self.config[_processscript]["name"]
        importkey = self.config[_processscript]["import-key"]
        exportkey = self.config[_processscript]["export-key"]
        bufferStr = \
'''#!/bin/sh
cd {homedir}
if ps up `cat {logdir}/{processname}.pid ` > /dev/null
then
    printf "{processscriptname}.py is already running\\n" >> {logdir}/{processname}.out
else
    printf "{processname} is no longer running. Deleting PID file.\\n" >> {logdir}/{processname}.out
    rm {logdir}/{processname}.pid >> {logdir}/{processname}.out
    printf "Deleted file\\n" >> {logdir}/{processname}.out
    printf "Starting {processname}.py\\n" >> {logdir}/{processname}.out
    nohup ./assed_env/bin/python {assedscript}/assed_process.py {logdir} {importkey} {exportkey} {processscriptname} {processscriptdir} {pidname} >> {logdir}/{processname}.log 2>&1 &
fi'''.format(homedir=self.home_dir, logdir=self.log_dir, processscriptname=scriptname,
             processname=processname, assedscript=self.assed_sript_dir, exportkey=exportkey,
             importkey=importkey, processscriptdir=self.script_dir_importname, pidname=processname)
        self.inputBufferScriptFile = os.path.join(self.sh_dir, scriptname + ".sh")
        self.writeScript(self.inputBufferScriptFile, bufferStr)
        helper_utils.std_flush("Generated script for %s at %s" % (_processscript, self.inputBufferScriptFile))
def initializeKafka(self):
    admin = kafka.admin.KafkaAdminClient()
    for _scriptref in self.input_scripts + self.output_scripts + self.process_scripts:
        kafka_key = self.config[_scriptref]["export-key"].replace(":", "_")
        try:
            admin.create_topics(new_topics=[
                kafka.admin.NewTopic(name=kafka_key, num_partitions=1, replication_factor=1)
            ], validate_only=False)
            helper_utils.std_flush("Created %s export key in kafka broker" % kafka_key)
        except kafka.errors.TopicAlreadyExistsError:
            helper_utils.std_flush("%s exportkey already exists in Kafka broker" % kafka_key)
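# A minimal sketch of the export-key naming convention used above, with an illustrative
# (assumed) config entry: Redis-style keys such as "assed:landslide:social" are mapped to
# Kafka topic names by replacing ":" with "_", since colons do not belong in topic names.
example_config = {"social-process": {"export-key": "assed:landslide:social"}}  # hypothetical entry
example_topic = example_config["social-process"]["export-key"].replace(":", "_")
assert example_topic == "assed_landslide_social"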
def main(logdir, importkey, exportkey, dataprocessor, dataprocessorscriptdir, pidname):
    TOP_OF_FILE_START = True
    pid_name = pidname
    helper_utils.setup_pid(pid_name, logdir=logdir)

    # Import the data-processor script
    helper_utils.std_flush("[%s] -- Initializing ASSED-input-buffer %s" % (helper_utils.readable_time(), pidname))
    moduleImport = __import__("pipelines.%s.%s" % (dataprocessorscriptdir, dataprocessor), fromlist=[dataprocessor])
    DataProcessor = getattr(moduleImport, dataprocessor)
    DataProcessor = DataProcessor()
    helper_utils.std_flush("[%s] -- Imported Data processor %s" % (helper_utils.readable_time(), dataprocessor))

    # Set up connections
    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.Redis(connection_pool=pool)
    kafka_key = exportkey.replace(":", "_")
    kafka_producer = kafka.KafkaProducer()

    message_refresh = 7200
    skip_count = 0
    process_count = 0
    time_slept = 0
    message_timer = time.time()

    # Get the earliest file to parse...
    helper_utils.std_flush("[%s] -- Searching for files" % helper_utils.readable_time())
    finishedUpToTime = r.get(importkey)
    granularTime = 0
    if finishedUpToTime is None:
        finishedUpToTime = 0
    else:
        finishedUpToTime = int(finishedUpToTime.decode())

    if finishedUpToTime == 0:
        # TODO CHANGE TO 7 days after setup is complete...
        helper_utils.std_flush("[%s] -- No value for previous stop. Starting from 7 days prior" % helper_utils.readable_time())
        currentTime = datetime.now() - timedelta(days=7)
        foundFlag = 0
        while foundFlag == 0:
            filePath = DataProcessor.getInputPath(currentTime)
            if os.path.exists(filePath):
                # We found the most recent file, and increment our counter
                finishedUpToTime = currentTime
                foundFlag = 1
            else:
                # If our search is too broad - i.e. we are a month behind, ignore
                currentTime += TIME_DELTA_MINIMAL
                timeDeltaOutputStream = (datetime.now() - currentTime)
                if timeDeltaOutputStream.days == 0 and timeDeltaOutputStream.seconds <= 1:
                    foundFlag = -1
    else:
        # We already have a timestamp from a previous execution, so start reading files a
        # minute behind it and catch up to the granular time
        helper_utils.std_flush("[%s] -- Starting File tracking at %s" %
                               (helper_utils.readable_time(), str(datetime.fromtimestamp(finishedUpToTime / 1000.0))))
        granularTime = finishedUpToTime
        finishedUpToTime = datetime.fromtimestamp(granularTime / 1000.0) - timedelta(seconds=60)
        TOP_OF_FILE_START = False

    if TOP_OF_FILE_START:
        # Otherwise, we start from the beginning of the 'first' file...
        finishedUpToTime -= timedelta(seconds=finishedUpToTime.second)
        granularTime = 0

    prevGranular = granularTime
    helper_utils.std_flush("[%s] -- Starting Stream Tracking for %s" % (helper_utils.readable_time(), importkey))

    while True:
        if time.time() - message_timer > message_refresh:
            message_timer = time.time()
            helper_utils.std_flush(
                "[%s] -- Processed %i items, with %i items skipped and %i seconds slept in the last %i seconds" %
                (helper_utils.readable_time(), process_count, skip_count, time_slept, message_refresh))
            process_count, skip_count, time_slept = 0, 0, 0

        if (datetime.now() - finishedUpToTime).total_seconds() < 60:
            waitTime = 120 - (datetime.now() - finishedUpToTime).seconds
            time.sleep(waitTime)
            time_slept += waitTime
        else:
            filePath = DataProcessor.getInputPath(finishedUpToTime)
            if not os.path.exists(filePath):
                waitTime = (datetime.now() - finishedUpToTime).total_seconds()
                if waitTime < 120:
                    # Difference is less than two minutes -- wait out the remainder
                    waitTime = 120 - waitTime
                    time.sleep(waitTime)
                    time_slept += waitTime
                else:
                    # Difference is more than two minutes -- increment by one minute for the next file
                    finishedUpToTime += TIME_DELTA_MINIMAL
            else:
                # Now we have the file
                with open(filePath, 'r') as fileRead:
                    for line in fileRead:
                        try:
                            jsonVersion = json.loads(line)
                        except ValueError as e:
                            helper_utils.std_flush(
                                "[%s] -- WARNING -- Possible warning for %s file for %s with error %s" %
                                (helper_utils.readable_time(), filePath, importkey, str(e)))
                            continue
                        if "timestamp_ms" not in jsonVersion:
                            jsonVersion["timestamp_ms"] = int(jsonVersion["timestamp"])
                        if granularTime > int(jsonVersion["timestamp_ms"]):
                            # Skip -- we already finished this item...
                            skip_count += 1
                            continue
                        else:
                            # Have not done this item yet... process it and push it to Kafka
                            processed_data = DataProcessor.process(jsonVersion)
                            byted = bytes(json.dumps(processed_data), encoding="utf-8")
                            kafka_producer.send(kafka_key, byted)
                            kafka_producer.flush()
                            granularTime = int(jsonVersion["timestamp_ms"])
                            r.set(importkey, granularTime)
                            process_count += 1
                if granularTime - prevGranular > 86400000:
                    helper_utils.std_flush(
                        "[%s] -- Finished with %s" %
                        (helper_utils.readable_time(), str(datetime.fromtimestamp(granularTime / 1000.0))))
                    prevGranular = granularTime
                finishedUpToTime += TIME_DELTA_MINIMAL
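# A minimal sketch of the data-processor contract the input buffer above relies on: the
# dynamically imported class must provide getInputPath(datetime) -> str (path of the
# minute-granularity raw file) and process(dict) -> dict (the record serialized to Kafka).
# The class name, directory layout, and fields below are illustrative assumptions, not
# one of ASSED's actual pipeline processors.
import os


class ExampleDataProcessor(object):
    def getInputPath(self, fileTime):
        # Assumed layout: one raw JSON-lines file per minute, e.g. downloads/2019/05/17/14/35.json
        return os.path.join("downloads", fileTime.strftime("%Y/%m/%d/%H/%M") + ".json")

    def process(self, record):
        # Keep timestamp_ms (used for skip/seek bookkeeping) and pass through the payload
        return {"timestamp_ms": record["timestamp_ms"], "text": record.get("text", "")}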
# Now we have a StreamerManager with empty instances for each streamer we are going to launch.
# We will launch all of them, and go on from there...
for _streamer_ in StreamerManager:
    if StreamerManager[_streamer_]["type"] == "unstructured":
        # Launch a single unstructured streamer...
        StreamerManager[_streamer_]["apikey"] = StreamerManager[_streamer_]["keyserver"].get_key()
        StreamerManager[_streamer_]["instance"] = StreamerManager[_streamer_]["executor"](
            StreamerManager[_streamer_]["keywords"],
            StreamerManager[_streamer_]["apikey"][1],
            errorQueue, messageQueue)
        StreamerManager[_streamer_]["instance"].start()
        std_flush("Deployed unstructured streamer : %s\tat %s\twith key %s" %
                  (StreamerManager[_streamer_]["name"], readable_time(), StreamerManager[_streamer_]["apikey"][0]))
    elif StreamerManager[_streamer_]["type"] == "structured":
        # Launch each instance (one per event-language tuple)...
        for _instance_ in StreamerManager[_streamer_]["instances"]:
            StreamerManager[_streamer_]["instances"][_instance_]["apikey"] = \
                StreamerManager[_streamer_]["keyserver"].get_key()
            StreamerManager[_streamer_]["instances"][_instance_]["instance"] = StreamerManager[_streamer_]["executor"](
                _instance_[0], _instance_[1],
                StreamerManager[_streamer_]["instances"][_instance_]["keywords"],
                StreamerManager[_streamer_]["instances"][_instance_]["apikey"][1],
                errorQueue, messageQueue)
            StreamerManager[_streamer_]["instances"][_instance_]["instance"].start()
def main(logdir, importkey, exportkey, processscript, processscriptdir, pidname, debug, seekval):
    if debug is None:
        debug = 0
    if debug:
        helper_utils.std_flush("[%s] -- DEBUG_MODE -- Active" % helper_utils.readable_time())
    pid_name = pidname
    if not debug:
        helper_utils.setup_pid(pid_name, logdir=logdir)

    # Import processscript
    helper_utils.std_flush("[%s] -- Initializing ASSED-Process %s" % (helper_utils.readable_time(), pidname))
    moduleImport = __import__("pipelines.%s.%s" % (processscriptdir, processscript), fromlist=[processscript])
    MessageProcessor = getattr(moduleImport, processscript)
    if debug:
        MessageProcessor = MessageProcessor(debug=True)
    else:
        MessageProcessor = MessageProcessor()
    helper_utils.std_flush("[%s] -- Imported Module %s" % (helper_utils.readable_time(), processscript))

    kafka_import = importkey.replace(":", "_")
    helper_utils.std_flush("[%s] -- Generated kafka import key %s" % (helper_utils.readable_time(), kafka_import))
    kafka_export = exportkey.replace(":", "_")
    helper_utils.std_flush("[%s] -- Generated kafka export key %s" % (helper_utils.readable_time(), kafka_export))

    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.Redis(connection_pool=pool)
    helper_utils.std_flush("[%s] -- Connected to redis with ConnectionPool on port 6379" % helper_utils.readable_time())

    seek_partition = r.get(exportkey + ":partition")
    seek_offset = r.get(exportkey + ":offset")
    seek_partition = 0 if seek_partition is None else int(seek_partition)
    seek_offset = 0 if seek_offset is None else int(seek_offset) + 1
    helper_utils.std_flush(
        "[%s] -- Obtained seek partition for kafka at Partition %i -- Offset %i" %
        (helper_utils.readable_time(), seek_partition, seek_offset))

    # Replace the seek value in debug mode
    if debug:
        if seekval is not None:
            seek_offset = seekval
            helper_utils.std_flush(
                "[%s] -- DEBUG -- Replaced seek offset for kafka at Partition %i -- Offset %i" %
                (helper_utils.readable_time(), seek_partition, seek_offset))

    kafka_producer = kafka.KafkaProducer()
    helper_utils.std_flush("[%s] -- Generated kafka producer" % helper_utils.readable_time())
    kafka_consumer = kafka.KafkaConsumer()
    helper_utils.std_flush("[%s] -- Generated kafka consumer" % helper_utils.readable_time())

    TopicPartition = kafka.TopicPartition(kafka_import, seek_partition)
    kafka_consumer.assign([TopicPartition])
    kafka_consumer.seek(TopicPartition, seek_offset)
    helper_utils.std_flush("[%s] -- Set kafka consumer seek" % helper_utils.readable_time())

    message_correct_counter = 0
    message_fail_counter = 0
    message_counter = 0
    for message in kafka_consumer:
        item = json.loads(message.value.decode())
        processedMessage = MessageProcessor.process(item)
        # Push the message to kafka... if the processor returned True
        if type(processedMessage) != type(tuple()):
            raise ValueError(
                "[%s] -- ERROR -- Invalid type %s for processedMessage. MessageProcessor.process() must return a tuple of (bool, message)."
                % (helper_utils.readable_time(), str(type(processedMessage))))
        if not processedMessage[0]:
            message_fail_counter += 1
        else:
            if not debug:
                byted = bytes(json.dumps(processedMessage[1]), encoding="utf-8")
                kafka_producer.send(kafka_export, byted)
                kafka_producer.flush()
            message_correct_counter += 1
        message_counter += 1
        if not debug:
            r.set(exportkey + ":partition", message.partition)
            r.set(exportkey + ":offset", message.offset)
            r.set(exportkey + ":timestamp", message.timestamp)
        if message_counter % 1000 == 0:
            helper_utils.std_flush(
                "[%s] -- Processed %i messages with %i failures and %i successes" %
                (helper_utils.readable_time(), message_counter, message_fail_counter, message_correct_counter))
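# A minimal sketch of the MessageProcessor contract enforced above: process(dict) must
# return a (bool, dict) tuple, where the bool marks whether the (possibly modified)
# message should be forwarded to the export topic. The class below is an illustrative
# assumption, not one of ASSED's pipeline processors.
class ExampleMessageProcessor(object):
    def __init__(self, debug=False):
        self.debug = debug

    def process(self, message):
        # Forward only messages that carry text; annotate them with a length field
        if not message.get("text"):
            return (False, message)
        message["text_length"] = len(message["text"])
        return (True, message)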
def main(importkey, exportkey, seekval):
    kafka_import = importkey.replace(":", "_")
    helper_utils.std_flush("Generated kafka import key %s" % kafka_import)
    kafka_export = exportkey.replace(":", "_")
    helper_utils.std_flush("Generated kafka export key %s" % kafka_export)

    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.Redis(connection_pool=pool)
    helper_utils.std_flush("Connected to redis")

    seek_partition = r.get(exportkey + ":partition")
    seek_offset = r.get(exportkey + ":offset")
    seek_partition = 0 if seek_partition is None else int(seek_partition)
    seek_offset = 0 if seek_offset is None else int(seek_offset) + 1
    helper_utils.std_flush(
        "Obtained seek partition for kafka at Partition %i -- Offset %i" % (seek_partition, seek_offset))
    if seekval is not None:
        seek_offset = seekval
        helper_utils.std_flush(
            "Replaced seek offset for kafka at Partition %i -- Offset %i" % (seek_partition, seek_offset))
    helper_utils.std_flush("\n\n")

    kafka_consumer = kafka.KafkaConsumer()
    helper_utils.std_flush("Generated kafka consumer")
    TopicPartition = kafka.TopicPartition(kafka_import, seek_partition)
    kafka_consumer.assign([TopicPartition])
    kafka_consumer.seek(TopicPartition, seek_offset)
    helper_utils.std_flush("Set kafka consumer seek")

    count = 0
    for message in kafka_consumer:
        #pdb.set_trace()
        count += 1
        jsval = json.loads(message.value.decode())
        helper_utils.std_flush(jsval["streamtype"], str(count))
def __init__(self, startTime, keywords, rootName, errorQueue, messageQueue):
    multiprocessing.Process.__init__(self)
    ''' Message queue for passing back errors and current times '''
    self.errorQueue = errorQueue
    self.messageQueue = messageQueue

    ''' Set up relevant details '''
    self.keywords = keywords
    self.rootName = rootName
    self.DOWNLOAD_PREPEND = './downloads/'
    self.STREAM_FILES_PROCESSOR_MAX_SECOND_DELAY = CONSTANTS.STREAM_FILES_PROCESSOR_MAX_SECOND_DELAY
    self.BACK_CHECK_FILES_DAYS = 10
    self.timeDelta = timedelta(seconds=CONSTANTS.STREAMING_GRANULARITY_SECONDS)

    ''' Set up the time counter.
    Note that finishedUpToTime MUST be a datetime object '''
    if startTime is None:
        self.finishedUpToTime = None
        # First attempt to get the most recent output file
        currentTime = datetime.now()
        foundFlag = 0
        while foundFlag == 0:
            filePath = self.getOutputPath(currentTime)
            if os.path.exists(filePath):
                # We found the most recent file, and increment our counter
                self.finishedUpToTime = currentTime + self.timeDelta
                std_flush(" ".join([self.rootName, "Found output-stream file at", str(filePath)]))
                foundFlag = 1
            else:
                # If our search is too broad - i.e. we are a month behind, ignore
                currentTime -= self.timeDelta
                if (datetime.now() - currentTime).days > self.BACK_CHECK_FILES_DAYS:
                    foundFlag = -1
        # If no output file exists, attempt to get the earliest download file
        if foundFlag == -1:
            std_flush(" ".join([self.rootName, "Did not find any output-stream files."]))
            currentTime = datetime.now() - timedelta(days=self.BACK_CHECK_FILES_DAYS)
            foundFlag = 0
            while foundFlag == 0:
                filePath = self.getInputPath(currentTime)
                if os.path.exists(filePath):
                    # We found the most recent file, and increment our counter
                    self.finishedUpToTime = currentTime
                    std_flush(" ".join([self.rootName, "Found input-stream file at", str(filePath)]))
                    foundFlag = 1
                else:
                    # If our search is too broad - i.e. we are a month behind, ignore
                    currentTime += self.timeDelta
                    timeDeltaOutputStream = (datetime.now() - currentTime)
                    if timeDeltaOutputStream.days == 0 and timeDeltaOutputStream.seconds <= 1:
                        foundFlag = -1
            if foundFlag == -1:
                # So nothing is there
                std_flush(" ".join([self.rootName, "Did not find any input-stream files."]))
                #raise(self.NoStartTimeGivenAndNoFilesExist)
                raise RuntimeError()  # If not, crash???????
    else:
        self.finishedUpToTime = startTime

    # Reset seconds to 0
    self.finishedUpToTime -= timedelta(seconds=self.finishedUpToTime.second)
    self.previousMessageTime = self.finishedUpToTime
# Load the keywords
keywordConfig = load_config(CONSTANTS.TOPIC_CONFIG_PATH)

errorQueue = multiprocessing.Queue()
messageQueue = multiprocessing.Queue()
keyStreamConfig = {}

# For each keyword-language pair, launch a StreamFilesProcessor
for physicalEvent in keywordConfig['topic_names'].keys():
    for language in keywordConfig['topic_names'][physicalEvent]["languages"]:
        eventLangTuple = (physicalEvent, language)
        keyStreamConfig[eventLangTuple] = {}
        keyStreamConfig[eventLangTuple]['name'] = physicalEvent
        keyStreamConfig[eventLangTuple]['lang'] = language
        keyStreamConfig[eventLangTuple]['keywords'] = keywordConfig['topic_names'][physicalEvent]["languages"][language]
        keyStreamConfig[eventLangTuple]['postpone'] = False
        std_flush(" ".join(["Deploying", str(eventLangTuple), "at", readable_time()]))
        try:
            keyStreamConfig[eventLangTuple]['processor'] = StreamFilesProcessor(
                None,
                keyStreamConfig[eventLangTuple]['keywords'],
                "_".join([eventLangTuple[0], eventLangTuple[1]]),
                errorQueue,
                messageQueue)
        except RuntimeError:
            std_flush(" ".join([str(eventLangTuple), "does not have files to start. Postponing launch 2 hr at", readable_time()]))
            keyStreamConfig[eventLangTuple]['postpone'] = True
            keyStreamConfig[eventLangTuple]['launchTime'] = datetime.now()
        if not keyStreamConfig[eventLangTuple]['postpone']:
            keyStreamConfig[eventLangTuple]['processor'].start()

configCheckTimer = time.time()
def main():
    local_timer = 0
    refresh_timer = 7200
    sleep_timer = 300
    while True:
        if time.time() - local_timer > refresh_timer:
            local_timer = time.time()
            helper_utils.std_flush("[%s] -- Initializing EventDetection" % helper_utils.readable_time())
            cell_cache = {}

            assed_config = file_utils.load_config("./config/assed_config.json")
            helper_utils.std_flush("[%s] -- Obtained DB Connection" % helper_utils.readable_time())
            DB_CONN = db_utils.get_db_connection(assed_config)
            cursor = DB_CONN.cursor()

            available_streamers = [item for item in assed_config["SocialStreamers"]]
            streamer_results = {}
            helper_utils.std_flush("[%s] -- Available streamers: %s" %
                                   (helper_utils.readable_time(), str(available_streamers)))

            for _streamer_ in available_streamers:
                helper_utils.std_flush("[%s] -- Generating query for: %s" % (helper_utils.readable_time(), _streamer_))
                _query_ = generate_social_query(_streamer_=_streamer_, _topic_="landslide")
                cursor.execute(_query_)
                streamer_results[_streamer_] = cursor.fetchall()
                helper_utils.std_flush("[%s] -- Obtained results for: %s" % (helper_utils.readable_time(), _streamer_))

            helper_utils.std_flush("[%s] -- Generating query for: %s" % (helper_utils.readable_time(), "TRMM"))
            _query_ = generate_trmm_query()
            cursor.execute(_query_)
            trmm_results = cursor.fetchall()
            helper_utils.std_flush("[%s] -- Obtained results for: %s" % (helper_utils.readable_time(), "TRMM"))

            helper_utils.std_flush("[%s] -- Generating query for: %s" % (helper_utils.readable_time(), "USGS"))
            _query_ = generate_usgs_query()
            cursor.execute(_query_)
            usgs_results = cursor.fetchall()
            helper_utils.std_flush("[%s] -- Obtained results for: %s" % (helper_utils.readable_time(), "USGS"))

            helper_utils.std_flush("[%s] -- Generating query for: %s" % (helper_utils.readable_time(), "News"))
            _query_ = generate_news_query()
            cursor.execute(_query_)
            news_results = cursor.fetchall()
            helper_utils.std_flush("[%s] -- Obtained results for: %s" % (helper_utils.readable_time(), "News"))
            cursor.close()

            helper_utils.std_flush(
                "[%s] -- Generating local cache with scoring:\tSocial-ML - 0.3\tSocial-HDI - 1\tNews - 3\tUSGS - 5\tTRMM - 1" %
                helper_utils.readable_time())
            # Scoring -- Twitter-Social: 0.3   Twitter-HDI: 1   News: 3   USGS: 5   TRMM: 1
            for _streamer_ in streamer_results:
                helper_utils.std_flush("[%s] -- Local caching for %s" % (helper_utils.readable_time(), _streamer_))
                for tuple_cell_ in streamer_results[_streamer_]:
                    _cell_ = tuple_cell_[0]
                    if _cell_ not in cell_cache:
                        cell_cache[_cell_] = {}
                    if int(float(tuple_cell_[1])) > 0:
                        cell_cache[_cell_][_streamer_ + "-hdi"] = (int(float(tuple_cell_[1])), float(tuple_cell_[1]))
                    if int(float(tuple_cell_[2]) / 0.34) > 0:
                        cell_cache[_cell_][_streamer_ + "-ml"] = (int(float(tuple_cell_[2]) / 0.34), float(tuple_cell_[2]))

            helper_utils.std_flush("[%s] -- Local caching for %s" % (helper_utils.readable_time(), "TRMM"))
            for tuple_cell_ in trmm_results:
                _cell_ = tuple_cell_[0]
                if _cell_ not in cell_cache:
                    cell_cache[_cell_] = {}
                cell_cache[_cell_]["TRMM"] = (float(tuple_cell_[1]), float(tuple_cell_[1] * 1))  # 1 <-- TRMM score

            helper_utils.std_flush("[%s] -- Local caching for %s" % (helper_utils.readable_time(), "USGS"))
            for tuple_cell_ in usgs_results:
                _cell_ = tuple_cell_[0]
                if _cell_ not in cell_cache:
                    cell_cache[_cell_] = {}
                cell_cache[_cell_]["USGS"] = (float(tuple_cell_[1]), float(tuple_cell_[1] * 5))

            helper_utils.std_flush("[%s] -- Local caching for %s" % (helper_utils.readable_time(), "News"))
            for tuple_cell_ in news_results:
                _cell_ = tuple_cell_[0]
                if _cell_ not in cell_cache:
                    cell_cache[_cell_] = {}
                cell_cache[_cell_]["News"] = (float(tuple_cell_[1]), float(tuple_cell_[1] * 3))

            helper_utils.std_flush("[%s] -- Local cache score total generation" % helper_utils.readable_time())
            for _cell_ in cell_cache:
                cell_cache[_cell_]["total"] = sum([cell_cache[_cell_][item][1] for item in cell_cache[_cell_]])

            pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
            r = redis.Redis(connection_pool=pool)
            helper_utils.std_flush("[%s] -- Connected to Redis" % helper_utils.readable_time())

            # Key push uses double-buffered versions (v1 or v2):
            #   list_tracker_key tracks which version currently holds the data (v1 or v2)
            #   list_push_key contains the list of cells
            #   list_info_key prefixes the per-cell score details
            list_tracker_key = "assed:event:detection:multisource:listkey"
            list_push_key = "assed:event:detection:multisource:list"
            list_info_key = "assed:event:detection:multisource:info"

            key_version = r.get(list_tracker_key)
            if key_version is None:
                key_version = "v2"
            else:
                key_version = key_version.decode()
            push_key = 'v1'
            if key_version == 'v1':
                helper_utils.std_flush("[%s] -- v1 key already in effect. Pushing to v2" % helper_utils.readable_time())
                push_key = 'v2'
            else:
                helper_utils.std_flush("[%s] -- v2 key already in effect. Pushing to v1" % helper_utils.readable_time())

            cell_list = [item for item in cell_cache]
            true_list_push_key = list_push_key + ":" + push_key
            helper_utils.std_flush("[%s] -- Deleting existing %s, if any" %
                                   (helper_utils.readable_time(), true_list_push_key))
            r.delete(true_list_push_key)
            r.lpush(true_list_push_key, *cell_list)
            helper_utils.std_flush("[%s] -- Pushed cell list to %s" %
                                   (helper_utils.readable_time(), true_list_push_key))

            helper_utils.std_flush("[%s] -- Pushing individual cell results" % helper_utils.readable_time())
            cell_counter = 0
            for _cell_ in cell_cache:
                cell_push_contents = json.dumps(cell_cache[_cell_])
                cell_specific_suffix = ":".join(_cell_.split("_"))
                cell_push_key = ":".join([list_info_key, cell_specific_suffix, push_key])
                r.set(cell_push_key, cell_push_contents)
                if cell_counter == 0:
                    helper_utils.std_flush("[%s] -- First push: %s --- %s" %
                                           (helper_utils.readable_time(), cell_push_key, cell_push_contents))
                cell_counter += 1
            helper_utils.std_flush("[%s] -- Completed individual cell pushes with %s cells" %
                                   (helper_utils.readable_time(), str(cell_counter)))

            r.set(list_tracker_key, push_key)
            helper_utils.std_flush("[%s] -- Setting versioning in %s to %s" %
                                   (helper_utils.readable_time(), list_tracker_key, push_key))
            helper_utils.std_flush("-------- COMPLETE AT %s ----------\n" % helper_utils.readable_time())
        else:
            #helper_utils.std_flush("Sleeping for %s"%sleep_timer)
            time.sleep(sleep_timer)
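# A minimal sketch (an assumed reader, not part of ASSED) showing how a consumer would
# resolve the double-buffered keys written above: read the live version from the tracker
# key, then fetch the cell list and per-cell scores stored under that version.
import json
import redis

r = redis.Redis(host='localhost', port=6379, db=0)
live_version = r.get("assed:event:detection:multisource:listkey")
if live_version is not None:
    live_version = live_version.decode()
    cells = r.lrange("assed:event:detection:multisource:list:" + live_version, 0, -1)
    for raw_cell in cells:
        cell = raw_cell.decode()
        # Cells are stored underscore-delimited in the list, colon-delimited in the info key
        info_key = ":".join(["assed:event:detection:multisource:info", ":".join(cell.split("_")), live_version])
        raw_scores = r.get(info_key)
        if raw_scores is None:
            continue
        scores = json.loads(raw_scores.decode())
        print(cell, scores.get("total"))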
def process(self, message):
    if message["streamtype"] not in self.stream_tracker:
        self.stream_tracker[message["streamtype"]] = {}
        self.stream_tracker[message["streamtype"]]["hdi"] = 0
        self.stream_tracker[message["streamtype"]]["non_hdi"] = 0
        self.stream_tracker[message["streamtype"]]["totalcounter"] = 0
    self.stream_tracker[message["streamtype"]]["totalcounter"] += 1

    if time.time() - self.cursor_timer > self.cursor_refresh:
        self.cursor.close()
        self.cursor = self.DB_CONN.cursor()
        self.cursor_timer = time.time()
        for _streamtype in self.stream_tracker:
            utils.helper_utils.std_flush(
                "[%s] -- Processed %i elements from %s with %i HDI and %i NONHDI" %
                (helper_utils.readable_time(), self.stream_tracker[_streamtype]["totalcounter"], _streamtype,
                 self.stream_tracker[_streamtype]["hdi"], self.stream_tracker[_streamtype]["non_hdi"]))
            self.stream_tracker[_streamtype]["totalcounter"] = 0
            self.stream_tracker[_streamtype]["non_hdi"] = 0
            self.stream_tracker[_streamtype]["hdi"] = 0

    if self.debug:
        utils.helper_utils.std_flush(
            "Processed %i elements from %s with %i HDI and %i NONHDI" %
            (self.stream_tracker[message["streamtype"]]["totalcounter"], message["streamtype"],
             self.stream_tracker[message["streamtype"]]["hdi"], self.stream_tracker[message["streamtype"]]["non_hdi"]))

    # Check the item, then attach its grid cell
    self.verify_message(message)
    message["cell"] = utils.helper_utils.generate_cell(float(message["latitude"]), float(message["longitude"]))

    _time_ = int(int(message["timestamp"]) / 1000)
    _time_minus = self.time_convert(_time_ - 6 * self.MS_IN_DAYS)
    _time_plus = self.time_convert(_time_ + 3 * self.MS_IN_DAYS)
    select_s = 'SELECT location from HCS_News where cell = %s and timestamp > %s and timestamp < %s'
    params = (message["cell"], _time_minus, _time_plus)
    self.cursor.execute(select_s, params)
    results = self.cursor.fetchall()

    if len(results) > 0:
        #helper_utils.std_flush("True Event found for %s"%str(message["text"].encode("utf-8"))[2:-2])
        self.true_counter += 1
        # Push into landslide events...
        insert = 'INSERT INTO ASSED_Social_Events (social_id, cell, latitude, longitude, timestamp, link, ' \
                 'text, location, topic_name, source, valid, streamtype) ' \
                 'VALUES (%s,%s,%s,%s,%s,%s, %s, %s,%s, %s, %s, %s)'
        params = (str(message["id_str"]), message["cell"], str(message['latitude']),
                  str(message['longitude']), self.ms_time_convert(message['timestamp']), message["link"],
                  str(message["text"].encode("utf-8"))[2:-2], message["location"], "landslide", "hdi", "1",
                  message["streamtype"])
        #helper_utils.std_flush(insert%params)
        try:
            if not self.debug:
                self.cursor.execute(insert, params)
                self.DB_CONN.commit()
            else:
                #helper_utils.std_flush(insert%params)
                pass
            helper_utils.std_flush(
                "[%s] -- Possible landslide event at %s detected at time %s using HDI (current time: %s)" %
                (helper_utils.readable_time(), message["location"], self.ms_time_convert(message["timestamp"]),
                 self.time_convert(time.time())))
            self.stream_tracker[message["streamtype"]]["hdi"] += 1
            return (False, message)
        except mdb._exceptions.Error as mdb_error:
            traceback.print_exc()
            true_mdb_error = eval(str(mdb_error))
            if true_mdb_error[0] == 2013 or true_mdb_error[0] == 2006:
                # This is a database connection error
                raise RuntimeError(
                    "[%s] -- ERROR -- Cannot connect to MySQL Database. Shutting down."
                    % helper_utils.readable_time())
            helper_utils.std_flush(
                '[%s] -- ERROR -- Failed to insert %s with error %s' %
                (helper_utils.readable_time(), message["id_str"], repr(mdb_error)))
    else:
        # No matching HDI news event
        pass

    """ TODO: also perform event detection on other data (news-only data (already exists),
    combination of earthquake AND TRMM (???)) """
    if self.debug:
        #helper_utils.std_flush("No HDI detected for %s - %s - %s"%(str(message["id_str"]),str(message["text"].encode("utf-8"))[2:-2], message["cell"] ))
        pass
    self.stream_tracker[message["streamtype"]]["non_hdi"] += 1
    return (True, message)
def createIfNotExists(self, dir_):
    if not os.path.exists(dir_):
        helper_utils.std_flush("%s directory does not exist. Creating it" % dir_)
        os.makedirs(dir_)
    helper_utils.std_flush("Finished verifying directory %s" % dir_)
# Perform the import, then execute
moduleImport = __import__("HighConfidenceStreamerSrc.%s" % _cfg["source_file"], fromlist=[_cfg["source_file"]])
Executor = getattr(moduleImport, _cfg["source_file"])
try:
    HCS_configuration[hcs_type]['processor'] = Executor(
        assed_config,
        root_name=hcs_type,
        errorQueue=errorQueue,
        messageQueue=messageQueue,
        **kwargs)
except Exception as e:
    traceback.print_exc()
    std_flush("Failed to launch %s with error %s" % (hcs_type, repr(e)))
std_flush("Launch complete for", hcs_type, "HighConfidenceStreamer at", readable_time())
HCS_configuration[hcs_type]['processor'].start()
HCS_configuration[hcs_type]['timestamp'] = time.time()

configCheckTimer = time.time()
while True:
    if time.time() - configCheckTimer > CONSTANTS.HCS_CONFIG_TIME_CHECK:
        configCheckTimer = time.time()
        std_flush(" ".join(["Checking configuration at", readable_time()]))
        configReload = load_config(CONSTANTS.HIGH_CONFIDENCE_CONFIG_PATH)
        configCheckTimer = time.time()
        # TODO handle config changes...
        pass
def process(self, message):
    if message["streamtype"] not in self.stream_tracker:
        self.stream_tracker[message["streamtype"]] = {}
        self.stream_tracker[message["streamtype"]]["positive"] = 0
        self.stream_tracker[message["streamtype"]]["negative"] = 0
        self.stream_tracker[message["streamtype"]]["totalcounter"] = 0
    self.stream_tracker[message["streamtype"]]["totalcounter"] += 1

    if time.time() - self.cursor_timer > self.cursor_refresh:
        self.cursor.close()
        self.cursor = self.DB_CONN.cursor()
        self.cursor_timer = time.time()
        #helper_utils.std_flush("TRUE: %i\t\tFALSE: %i out of total of %i"%(self.true_counter, self.false_counter, self.total_counter))
        self.total_counter, self.true_counter, self.false_counter = 0, 0, 0
        for _streamtype in self.stream_tracker:
            utils.helper_utils.std_flush(
                "[%s] -- Processed %i elements from %s with %i positive and %i negative" %
                (helper_utils.readable_time(), self.stream_tracker[_streamtype]["totalcounter"], _streamtype,
                 self.stream_tracker[_streamtype]["positive"], self.stream_tracker[_streamtype]["negative"]))
            self.stream_tracker[_streamtype]["totalcounter"] = 0
            self.stream_tracker[_streamtype]["positive"] = 0
            self.stream_tracker[_streamtype]["negative"] = 0

    if self.debug:
        utils.helper_utils.std_flush(
            "Processed %i elements from %s with %i positive and %i negative" %
            (self.stream_tracker[message["streamtype"]]["totalcounter"], message["streamtype"],
             self.stream_tracker[message["streamtype"]]["positive"], self.stream_tracker[message["streamtype"]]["negative"]))

    # Get the message text, encode it, and classify it
    cleaned_message = str(message["text"].encode("utf-8"))[2:-2]
    encoded_message = self.encode(cleaned_message)
    prediction = np.argmax(self.model.predict(np.array([encoded_message]))[0])

    params = None
    if prediction == 1:
        # Push to db as a positive (ml-validated) detection
        self.true_counter += 1
        params = (message["id_str"], message["cell"], str(message['latitude']),
                  str(message['longitude']), self.ms_time_convert(message['timestamp']), message["link"],
                  str(message["text"].encode("utf-8"))[2:-2], message["location"], "landslide", "ml", "1",
                  message["streamtype"])
        self.stream_tracker[message["streamtype"]]["positive"] += 1
    elif prediction == 0:
        # Push to db with valid flag set to 0? Push to a different db?
        self.false_counter += 1
        params = (message["id_str"], message["cell"], str(message['latitude']),
                  str(message['longitude']), self.ms_time_convert(message['timestamp']), message["link"],
                  str(message["text"].encode("utf-8"))[2:-2], message["location"], "landslide", "ml", "0",
                  message["streamtype"])
        self.stream_tracker[message["streamtype"]]["negative"] += 1
    else:
        warnings.warn(
            "[%s] -- WARNING -- Prediction value of %i is not one of valid predictions [0, 1]" %
            (helper_utils.readable_time(), prediction))

    try:
        if not self.debug:
            self.cursor.execute(self.db_insert, params)
            self.DB_CONN.commit()
        else:
            #helper_utils.std_flush(self.db_insert%params)
            pass
    except mdb._exceptions.Error as mdb_error:
        traceback.print_exc()
        true_mdb_error = eval(str(mdb_error))
        if true_mdb_error[0] == 2013 or true_mdb_error[0] == 2006:
            # This is a database connection error
            raise RuntimeError(
                "[%s] -- ERROR -- Cannot connect to MySQL Database. Shutting down" % helper_utils.readable_time())
        helper_utils.std_flush(
            '[%s] -- ERROR -- Failed to insert %s with error %s' %
            (helper_utils.readable_time(), message["id_str"], repr(mdb_error)))
        return (False, message)

    self.total_counter += 1
    return (False, message)