def keepalive_fn(scheduler: sched.scheduler, params: inputs.Inputs,
                 context: LambdaContext, keepalive_state: KeepaliveState,
                 cache: Cache):
    ''' Each iteration of keepalive_thread runs this code. Schedule the next iteration
        before exiting to continue the keepalive thread; otherwise keepalives stop '''
    try:
        update_keepalive(params, keepalive_state, cache)
        keepalive_fn.num_keepalives += 1
        if keepalive_fn.num_keepalives % defaults.KEEPALIVE_PRINT_EVERY == 0:
            print("keepalive_fn: keepalive #{}: state={}".format(
                keepalive_fn.num_keepalives, keepalive_state))

        if context.invoked_function_arn and \
                context.get_remaining_time_in_millis() < defaults.RETRIGGER_BEFORE_EXPIRY_MS:
            # If invoked as a lambda (not CLI), re-trigger the backing job when this
            # instance of it is about to expire
            cache_keys = keepalive_state.cache_keys
            lastaccess_ms = int(cache.get(cache_keys.lastaccess))
            lastaccess_age_ms = utils.millitime() - lastaccess_ms
            if lastaccess_age_ms > (defaults.BACKING_JOB_LIFETIME_MS * 0.9):
                # There were no recent calls to fetch the data produced by this backing
                # job, so there is no need to re-issue it
                print(
                    "Exiting backing job by ending keepalive thread. lastaccess_age_ms =",
                    lastaccess_age_ms)
                return False

            if not params.is_streaming():
                # Fixed time-range jobs need not be reissued
                print(
                    "keepalive_fn: backing job won't be restarted because it is not a streaming job",
                    params)
                return False

            # Restart this job in another lambda invocation. Before doing that, skip
            # keepalives for a while so the current keepalive goes stale. Otherwise the
            # new invocation will assume another backing job is already running and
            # will auto-exit.
            print(
                "keepalive_fn: backing job needs to be restarted. lastaccess_age_ms =",
                lastaccess_age_ms)
            time.sleep(defaults.KEEPALIVE_INTERVAL_SEC *
                       defaults.KEEPALIVE_EXPIRY_MULTIPLE)
            start_backing_job_if_necessary(params, context, cache)
            print("keepalive_fn: exiting current backing job after re-issuing a new one")
            return False
    except Exception as e:
        print("keepalive_fn: exception", e, traceback.format_exc())

    # Schedule the next iteration of the keepalive thread
    scheduler.enter(defaults.KEEPALIVE_INTERVAL_SEC, 1, keepalive_fn,
                    argument=(scheduler, params, context, keepalive_state, cache))
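# Sketch (not from the source) of how the keepalive loop above is presumably bootstrapped:
# keepalive_fn reschedules itself via scheduler.enter(), so it only needs one initial entry.
# The helper name, the daemon-thread choice, and initializing the num_keepalives function
# attribute here are assumptions.
import sched
import threading
import time


def start_keepalive_loop(params, context, keepalive_state, cache):
    keepalive_fn.num_keepalives = 0  # function attribute used as a counter above
    scheduler = sched.scheduler(time.time, time.sleep)
    scheduler.enter(defaults.KEEPALIVE_INTERVAL_SEC, 1, keepalive_fn,
                    argument=(scheduler, params, context, keepalive_state, cache))
    # scheduler.run() blocks until its queue is empty, so run it on its own thread
    threading.Thread(target=scheduler.run, daemon=True).start()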
def update_keepalive(params: inputs.Inputs, keepalive_state: KeepaliveState,
                     cache: Cache):
    ''' Update the keepalive state in cache. Also check whether the current backing job
        owns the keepalive; if not, exit '''
    try:
        cache_keys = keepalive_state.cache_keys
        exit_if_necessary(keepalive_state, cache)
        keepalive_state.last_keepalive_ms = utils.millitime()
        cache.set(cache_keys.keepalive, pickle.dumps(keepalive_state))
    except Exception as e:
        print("update_keepalive: exception", e, traceback.format_exc())
def __init__(self):
    self.invoke_time_ms = utils.millitime()
    self.cache_location = os.environ.get("cache_url", None)
    self.api_token = None
    self.api_endpoint = os.environ.get(
        "api_endpoint", "https://api.signalfx.com")  # SignalFx API endpoint
    self.program = None  # signalflow program
    self.start_time_ms = None
    self.end_time_ms = 0
    self.resolution_hint_ms = 0
    self.is_daemon = False
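# Hypothetical helper (not in the source) showing how the fields above, presumably the
# constructor of inputs.Inputs, map onto the lambda event keys used elsewhere in this
# module ("program", "api_token", "cache_url", "daemon", ...). The helper name and the
# .get() defaults are assumptions.
def inputs_from_event(event: dict) -> "inputs.Inputs":
    params = inputs.Inputs()
    params.program = event.get("program")
    params.api_token = event.get("api_token")
    params.api_endpoint = event.get("api_endpoint", params.api_endpoint)
    params.start_time_ms = event.get("start_time_ms", params.start_time_ms)
    params.end_time_ms = event.get("end_time_ms", params.end_time_ms)
    params.resolution_hint_ms = event.get("resolution_hint_ms", params.resolution_hint_ms)
    params.cache_location = event.get("cache_url", params.cache_location)
    params.is_daemon = event.get("daemon", False)
    return params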
def exit_if_necessary(keepalive_state: KeepaliveState, cache: Cache):
    ''' If the backing job ever discovers that another instance of the same job is
        currently running and owns the keepalive key in cache, it exits '''
    cache_keys = keepalive_state.cache_keys
    try:
        cached_state: KeepaliveState = pickle.loads(
            cache.get(cache_keys.keepalive))
        if cached_state.id != keepalive_state.id:
            expiry_ms = defaults.KEEPALIVE_EXPIRY_MULTIPLE * defaults.KEEPALIVE_INTERVAL_SEC * 1000
            if utils.millitime() - cached_state.last_keepalive_ms < expiry_ms:
                # Another backing job is running, and it has published a keepalive recently
                print(
                    "exit_if_necessary: exiting because another instance is already running",
                    cached_state.id,
                    time.ctime(cached_state.last_keepalive_ms / 1000))
                os._exit(1)
    except Exception as e:
        print("exit_if_necessary: failed to read keepalive from cache", e)
def wait_for_backing_job_to_exit_batch_phase(keepalive_state: KeepaliveState,
                                             cache: Cache, cache_keys: CacheKeys,
                                             wait_until_ms: int):
    print("wait_for_backing_job_to_exit_batch_phase: started", cache_keys.keepalive)
    while not keepalive_state or not keepalive_state.in_streaming_phase:
        # Wait for the backing job to be running and to advance to the streaming state
        if utils.millitime() > wait_until_ms:
            raise Exception("wait_for_backing_job_to_exit_batch_phase: timed out")
        print(
            "wait_for_backing_job_to_exit_batch_phase: waiting for batch phase to end. keepalive_state=",
            keepalive_state)
        time.sleep(1)
        try:
            keepalive_state: KeepaliveState = pickle.loads(
                cache.get(cache_keys.keepalive))
        except Exception as e:
            print(
                "wait_for_backing_job_to_exit_batch_phase: failed to read keepalive from cache",
                cache_keys.keepalive, e)

    print("wait_for_backing_job_to_exit_batch_phase: backing job is ready",
          keepalive_state)
    return keepalive_state
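# Sketch (assumption) of a reader-side call: give the backing job a bounded window to
# finish its batch phase before serving data from cache. The one-minute deadline and the
# helper name are placeholders, not part of the source.
def wait_for_ready(cache: Cache, cache_keys: CacheKeys):
    deadline_ms = utils.millitime() + 60 * 1000
    state = None
    try:
        state = pickle.loads(cache.get(cache_keys.keepalive))
    except Exception:
        pass  # no keepalive yet; the wait helper polls until one appears
    return wait_for_backing_job_to_exit_batch_phase(state, cache, cache_keys, deadline_ms)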
def start_backing_job_if_necessary(params: inputs.Inputs, context: LambdaContext,
                                   cache: Cache):
    ''' If no backing job is running for a given signalflow program and duration, start one.
        Returns keepalive_state from cache if an active backing job is found (to prevent a
        duplicate cache read by callers) '''

    def start_backing_job_as_lambda(params: inputs.Inputs, tstart, tend,
                                    context: LambdaContext):
        # Start a new backing job that runs as a lambda function
        print("start_backing_job_as_lambda: started")
        import boto3
        lambda_client = boto3.client('lambda')
        lambda_client.invoke(FunctionName=context.invoked_function_arn,
                             InvocationType='Event',
                             Payload=json.dumps({
                                 "program": params.program,
                                 "start_time_ms": tstart,
                                 "end_time_ms": tend,
                                 "resolution_hint_ms": params.resolution_hint_ms,
                                 "api_token": params.api_token,
                                 "api_endpoint": params.api_endpoint,
                                 "daemon": True
                             }))

    def start_backing_job_as_process(params: inputs.Inputs, tstart, tend):
        # Start a new backing job that runs as a python process
        print("start_backing_job_as_process: started")
        cmd: str = "nohup python3 {script} --program=\"{program}\" --token={token} \
            --start_time_ms={tstart} --end_time_ms={tend} --resolution_hint_ms={res} --endpoint={endpoint}".format(
            script=__file__,
            program=params.program,
            tstart=tstart,
            tend=tend,
            res=params.resolution_hint_ms,
            token=params.api_token,
            endpoint=params.api_endpoint)
        cmd += " --daemon > /tmp/{}.log 2>&1 &".format(params.cache_key_prefix())
        print("start_backing_job_as_process:", cmd)
        os.system(cmd)

    # begin code for start_backing_job_if_necessary()
    try:
        cache_keys = CacheKeys(params.cache_key_prefix())
        print("start_backing_job_if_necessary: started", cache_keys)
        now_ms = utils.millitime()
        cached_state: KeepaliveState = pickle.loads(cache.get(cache_keys.keepalive))
        keepalive_age_ms = now_ms - cached_state.last_keepalive_ms
        expiry_ms = defaults.KEEPALIVE_EXPIRY_MULTIPLE * defaults.KEEPALIVE_INTERVAL_SEC * 1000
        if keepalive_age_ms < expiry_ms:
            print(
                "start_backing_job_if_necessary: found active backing job already running. keepalive_age_ms =",
                keepalive_age_ms)
            return cached_state
        print("start_backing_job_if_necessary: found expired keepalive_age_ms =",
              keepalive_age_ms)
        cache.set(cache_keys.keepalive, None)
    except Exception as e:
        print("start_backing_job_if_necessary: no keepalive found in cache", e)

    tstart = params.start_time_ms
    tend = params.end_time_ms
    if not params.is_streaming():
        tstart = params.absolute_ms(tstart)
        tend = params.absolute_ms(tend)

    if context.invoked_function_arn:
        # This backing job was invoked as a lambda, so invoke a new lambda
        start_backing_job_as_lambda(params, tstart, tend, context)
    else:
        start_backing_job_as_process(params, tstart, tend)
    return None
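# Rough sketch (assumption) of the CacheKeys layout implied by the attributes used in this
# module: a keepalive key, a last-access key, and a per-timestamp data key prefix, all
# derived from the program/time-range prefix. The key suffixes here are illustrative only.
class CacheKeysSketch:
    def __init__(self, prefix: str):
        self.keepalive = prefix + ":keepalive"
        self.lastaccess = prefix + ":lastaccess"
        self.data_prefix = prefix + ":data:"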
def data_consumer_thread_fn(params: inputs.Inputs, context: LambdaContext,
                            data_queue: Queue, keepalive_state: KeepaliveState,
                            cache: Cache):
    ''' Thread that consumes data messages from the analytics job and writes each one
        individually into cache. Also detects when the job moves from batch to stream
        phase. Unfortunately that requires 'auto-detection': the job is considered
        streaming once no data has arrived for close to a second :-( '''
    print("data_consumer_thread_fn: started")
    try:
        cache_keys = keepalive_state.cache_keys
        data_to_encache = {}
        last_datamsg_walltime_ms = 0
        while True:
            now_ms = utils.millitime()
            try:
                if params.is_streaming():
                    # Remove trailing data keys that are beyond the scope of the
                    # current 'window' of a streaming job
                    valid_timestamps = [
                        ts for ts in keepalive_state.data_timestamps
                        if ts >= (now_ms - params.job_duration_ms() -
                                  keepalive_state.resolution_ms)
                    ]
                    keepalive_state.data_timestamps = set(valid_timestamps)

                msg = data_queue.get(False)
                last_datamsg_walltime_ms = utils.millitime()
                data_to_encache.setdefault(msg.logical_timestamp_ms, {})
                data_to_encache[msg.logical_timestamp_ms].update(msg.data)
            except Exception:
                # No data found in the queue. However there may be pending data from
                # previous messages that still needs caching
                timestamps_encached = set()
                for timestamp, values in data_to_encache.items():
                    try:
                        cache.set(cache_keys.data_prefix + str(timestamp), values)
                        timestamps_encached.add(timestamp)
                    except Exception:
                        pass  # Failed to set data in cache; retry on the next pass
                for timestamp_encached in timestamps_encached:
                    data_to_encache.pop(timestamp_encached)
                    keepalive_state.data_timestamps.add(timestamp_encached)

                if data_to_encache:
                    print(
                        "data_consumer_thread_fn: will retry writing {} data keys to cache {}"
                        .format(len(data_to_encache), list(data_to_encache)))
                elif not keepalive_state.in_streaming_phase:
                    # Now that all data is successfully published, 'auto-detect' whether
                    # we have completed the batch phase and entered the stream phase.
                    # If so, update keepalive_state
                    if last_datamsg_walltime_ms > 0 and (
                            now_ms - last_datamsg_walltime_ms
                    ) > defaults.STREAM_PHASE_DETECTION_INTERVAL_MS:
                        keepalive_state.in_streaming_phase = True
                        print(
                            "data_consumer_thread_fn: backing job entered stream phase after {} datapoints. now={}, last={}"
                            .format(len(keepalive_state.data_timestamps), now_ms,
                                    last_datamsg_walltime_ms))
                        # Start the healthcheck thread now that data is flowing in
                        threading.Thread(target=healthcheck_thread_fn,
                                         args=(params, context, keepalive_state,
                                               cache)).start()
                time.sleep(defaults.STREAM_PHASE_DETECTION_INTERVAL_MS / 1000 / 5)
    except Exception as e:
        print("data_consumer_thread_fn exception", e, traceback.format_exc())
    finally:
        print("data_consumer_thread_fn: ended")
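# Minimal sketch (assumption) of the message shape data_consumer_thread_fn expects on the
# queue: a logical timestamp plus a dict of values for that timestamp. The DataMessage
# name and the producer helper are illustrative, not part of the source.
from collections import namedtuple
from queue import Queue

DataMessage = namedtuple("DataMessage", ["logical_timestamp_ms", "data"])


def enqueue_datapoints(data_queue: Queue, timestamp_ms: int, values: dict):
    # One message per logical timestamp; the consumer merges repeated timestamps with
    # dict.update() before writing them to cache
    data_queue.put(DataMessage(logical_timestamp_ms=timestamp_ms, data=values))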
    parser.add_argument(
        "--cache_url",
        required=False,
        default="",
        help="memcache configuration url in host:port form")
    parser.add_argument("--daemon", action="store_true")
    args = parser.parse_args()

    test_event = {
        "program": args.program,
        "api_token": args.token,
        "cache_url": args.cache_url,
        "daemon": args.daemon
    }
    if args.endpoint:
        test_event["api_endpoint"] = args.endpoint
    if args.start_time_ms:
        test_event["start_time_ms"] = args.start_time_ms
    if args.end_time_ms:
        test_event["end_time_ms"] = args.end_time_ms
    if args.resolution_hint_ms:
        test_event["resolution_hint_ms"] = args.resolution_hint_ms

    context = LambdaContext()
    context.aws_request_id = utils.millitime()
    context.invoked_function_arn = None
    lambda_handler(test_event, context)
except Exception as e:
    print("Exception", e, traceback.format_exc())
    os._exit(1)