# Module-level imports assumed by this section. Project-local modules and
# types (inputs, utils, defaults, Cache, CacheKeys, KeepaliveState,
# LambdaContext) are defined elsewhere in this project.
import json
import os
import pickle
import sched
import time
import traceback


def keepalive_fn(scheduler: sched.scheduler, params: inputs.Inputs,
                 context: LambdaContext, keepalive_state: KeepaliveState,
                 cache: Cache):
    ''' Each iteration of keepalive_thread runs this code. It must schedule
        the next iteration before returning to continue the keepalive thread;
        otherwise keepalives stop. '''
    try:
        update_keepalive(params, keepalive_state, cache)
        keepalive_fn.num_keepalives += 1
        if keepalive_fn.num_keepalives % defaults.KEEPALIVE_PRINT_EVERY == 0:
            print("keepalive_fn: keepalive #{}: state={}".format(
                keepalive_fn.num_keepalives, keepalive_state))

        if context.invoked_function_arn and \
                context.get_remaining_time_in_millis() < defaults.RETRIGGER_BEFORE_EXPIRY_MS:
            # Invoked as a lambda (not CLI): retrigger the backing job if this
            # instance of it will expire soon.
            cache_keys = keepalive_state.cache_keys
            lastaccess_ms = int(cache.get(cache_keys.lastaccess))
            lastaccess_age_ms = utils.millitime() - lastaccess_ms
            if lastaccess_age_ms > (defaults.BACKING_JOB_LIFETIME_MS * 0.9):
                # There were no recent calls to fetch the data produced by
                # this backing job, so there is no need to re-issue it.
                print("Exiting backing job by ending keepalive thread. lastaccess_age_ms =",
                      lastaccess_age_ms)
                return False
            if not params.is_streaming():
                # Fixed time-range jobs need not be reissued.
                print("keepalive_fn: backing job won't be restarted because it is not a streaming job",
                      params)
                return False

            # Restart this job in another lambda invocation. Before doing so,
            # stop publishing keepalives long enough for ours to go stale;
            # otherwise the new invocation will assume another backing job is
            # already running and will auto-exit.
            print("keepalive_fn: backing job needs to be restarted. lastaccess_age_ms =",
                  lastaccess_age_ms)
            time.sleep(defaults.KEEPALIVE_INTERVAL_SEC * defaults.KEEPALIVE_EXPIRY_MULTIPLE)
            start_backing_job_if_necessary(params, context, cache)
            print("keepalive_fn: exiting current backing job after re-issuing a new one")
            return False
    except Exception as e:
        print("keepalive_fn: exception", e, traceback.format_exc())

    # Schedule the next iteration of the keepalive thread.
    scheduler.enter(defaults.KEEPALIVE_INTERVAL_SEC, 1, keepalive_fn,
                    argument=(scheduler, params, context, keepalive_state, cache))


# The counter attribute must exist before the first call increments it.
keepalive_fn.num_keepalives = 0
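
# Illustrative sketch (an assumption, not part of the original module) of how
# keepalive_fn is typically driven: a sched.scheduler is primed with the first
# keepalive_fn event and run on a daemon thread; each iteration then re-enters
# itself until it returns without rescheduling, which ends the thread.
def start_keepalive_thread_sketch(params, context, keepalive_state, cache):
    import threading
    scheduler = sched.scheduler(time.time, time.sleep)
    scheduler.enter(defaults.KEEPALIVE_INTERVAL_SEC, 1, keepalive_fn,
                    argument=(scheduler, params, context, keepalive_state, cache))
    # scheduler.run() blocks until the event queue is empty, so run it off-thread.
    threading.Thread(target=scheduler.run, daemon=True).start()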
def exit_if_necessary(keepalive_state: KeepaliveState, cache: Cache):
    ''' If this backing job ever discovers that another instance of the same
        job is currently running and owns the keepalive key in the cache,
        then it exits. '''
    cache_keys = keepalive_state.cache_keys
    try:
        cached_state: KeepaliveState = pickle.loads(cache.get(cache_keys.keepalive))
        if cached_state.id != keepalive_state.id:
            expiry_ms = defaults.KEEPALIVE_EXPIRY_MULTIPLE * defaults.KEEPALIVE_INTERVAL_SEC * 1000
            if utils.millitime() - cached_state.last_keepalive_ms < expiry_ms:
                # Another backing job is running and has published a keepalive recently.
                print("exit_if_necessary: exiting because another instance is already running",
                      cached_state.id, time.ctime(cached_state.last_keepalive_ms / 1000))
                os._exit(1)
    except Exception as e:
        print("exit_if_necessary: failed to read keepalive from cache", e)
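
# Illustrative sketch (an assumption, inferred from usage in this section; the
# real definitions live elsewhere in this project) of the cache-key layout and
# keepalive state this code depends on. The key suffixes are hypothetical.
class CacheKeysSketch:
    def __init__(self, prefix: str):
        self.keepalive = prefix + ".keepalive"    # pickled KeepaliveState
        self.lastaccess = prefix + ".lastaccess"  # millitime of last query
        self.metadata = prefix + ".metadata"      # dict: tsid -> metadata
        self.data_prefix = prefix + ".data."      # one key per data timestamp


class KeepaliveStateSketch:
    id: str                     # unique id of this backing job instance
    cache_keys: CacheKeysSketch
    last_keepalive_ms: int      # updated by update_keepalive each interval
    in_streaming_phase: bool    # False while the job backfills its batch window
    data_timestamps: list       # timestamps for which data keys exist in cache
    resolution_ms: int          # data resolution of the underlying job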
def get_cached_result(params: inputs.Inputs, context: LambdaContext, cache: Cache):
    ''' Query the data cached by the backing job and return it as a result
        struct, starting a new backing job first if none is running. '''

    def wait_for_backing_job_to_exit_batch_phase(
            keepalive_state: KeepaliveState, cache: Cache,
            cache_keys: CacheKeys, wait_until_ms: int):
        # Wait for the backing job to be running and advance to the streaming state.
        print("wait_for_backing_job_to_exit_batch_phase: started", cache_keys.keepalive)
        while not keepalive_state or not keepalive_state.in_streaming_phase:
            if utils.millitime() > wait_until_ms:
                raise Exception("wait_for_backing_job_to_exit_batch_phase: timed out")
            print("wait_for_backing_job_to_exit_batch_phase: waiting for batch phase to end. keepalive_state=",
                  keepalive_state)
            time.sleep(1)
            try:
                keepalive_state = pickle.loads(cache.get(cache_keys.keepalive))
            except Exception as e:
                print("wait_for_backing_job_to_exit_batch_phase: failed to read keepalive from cache",
                      cache_keys.keepalive, e)
        print("wait_for_backing_job_to_exit_batch_phase: backing job is ready",
              keepalive_state)
        return keepalive_state

    print("get_cached_result: started")

    # Update the 'lastaccess' timestamp in memcache to indicate that the
    # corresponding backing job's data was recently queried.
    cache_keys: CacheKeys = CacheKeys(params.cache_key_prefix())
    now_ms = params.invoke_time_ms
    try:
        cache.set(cache_keys.lastaccess, now_ms)
    except Exception as e:
        print("get_cached_result: failed to set lastaccess cache key {}={}, {}".format(
            cache_keys.lastaccess, now_ms, e))

    # Start the backing job if none is running, or if the running job's
    # keepalive timestamp is stale.
    keepalive_state: KeepaliveState = start_backing_job_if_necessary(params, context, cache)

    # Now that the backing job is surely running, wait for it to become
    # 'ready', i.e. go from batch to streaming phase.
    keepalive_state = wait_for_backing_job_to_exit_batch_phase(
        keepalive_state, cache, cache_keys, now_ms + defaults.API_TIMEOUT_MS)

    # Compute which cache keys need to be fetched.
    if not params.is_streaming():
        tstart = params.absolute_ms(params.start_time_ms)
        tend = params.absolute_ms(params.end_time_ms)
    else:
        tend = now_ms
        tstart = tend - params.duration_ms()
    timestamps = sorted(ts for ts in keepalive_state.data_timestamps
                        if tstart <= ts <= tend)
    data_keys = [cache_keys.data_prefix + str(ts) for ts in timestamps]

    # Retrieve metadata and data from the cache; retry missing keys once.
    metadata = cache.get(cache_keys.metadata)
    if timestamps:
        print("get_cached_result: fetching {} timestamps {} - {} @ {}ms".format(
            len(timestamps), time.ctime(timestamps[0] / 1000),
            time.ctime(timestamps[-1] / 1000), keepalive_state.resolution_ms))
    data = cache.multiget(data_keys)
    missing_keys = set(data_keys) - set(data.keys())
    if missing_keys:
        print("get_cached_result: retrying fetch of {}/{} keys: {}".format(
            len(missing_keys), len(data_keys), sorted(missing_keys)))
        data.update(cache.multiget(list(missing_keys)))

    # Fill in the results struct.
    result = {
        "start_time_ms": tstart,
        "end_time_ms": tend,
        "earliest_result_ms": 0,
        "latest_result_ms": 0,
        "resolution_ms": keepalive_state.resolution_ms,
        "metadata": metadata,
        "data": {},
        "missing_timestamps_ms": []
    }

    # First, fill in the retrieved data.
    tsids = set()
    missing_timestamps = []
    for timestamp in timestamps:
        k = cache_keys.data_prefix + str(timestamp)
        if k in data:
            for tsid, value in data[k].items():
                if not result["earliest_result_ms"]:
                    result["earliest_result_ms"] = timestamp
                if timestamp > result["latest_result_ms"]:
                    result["latest_result_ms"] = timestamp
                tsids.add(tsid)
                result["data"].setdefault(tsid, []).append([timestamp, value])
        else:
            missing_timestamps.append(timestamp)

    # Second, keep metadata only for the relevant mts that actually have data.
    remove_metadata_ids = set(metadata.keys()).difference(tsids)
    for tsid in remove_metadata_ids:
        metadata.pop(tsid)

    result["missing_timestamps_ms"] = missing_timestamps
    return result
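
# Illustrative usage sketch (an assumption, not part of the original module):
# calling get_cached_result and walking the returned struct. Construction of
# `params`, `context`, and `cache` is project-specific.
def print_cached_result_sketch(params, context, cache):
    result = get_cached_result(params, context, cache)
    print("resolution:", result["resolution_ms"], "ms;",
          "gaps:", len(result["missing_timestamps_ms"]))
    for tsid, points in result["data"].items():
        # points is a list of [timestamp_ms, value] pairs in ascending order.
        print(tsid, result["metadata"].get(tsid), "->", len(points), "points")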
def start_backing_job_if_necessary(params: inputs.Inputs, context: LambdaContext, cache: Cache):
    ''' If no backing job is running for a given signalflow program and
        duration, start one. Returns the keepalive_state from the cache if an
        active backing job is found (to spare callers a duplicate cache read). '''

    def start_backing_job_as_lambda(params: inputs.Inputs, tstart, tend,
                                    context: LambdaContext):
        # Start a new backing job that runs as a lambda function.
        print("start_backing_job_as_lambda: started")
        import boto3
        lambda_client = boto3.client('lambda')
        lambda_client.invoke(FunctionName=context.invoked_function_arn,
                             InvocationType='Event',
                             Payload=json.dumps({
                                 "program": params.program,
                                 "start_time_ms": tstart,
                                 "end_time_ms": tend,
                                 "resolution_hint_ms": params.resolution_hint_ms,
                                 "api_token": params.api_token,
                                 "api_endpoint": params.api_endpoint,
                                 "daemon": True
                             }))

    def start_backing_job_as_process(params: inputs.Inputs, tstart, tend):
        # Start a new backing job that runs as a python process.
        print("start_backing_job_as_process: started")
        cmd: str = "nohup python3 {script} --program=\"{program}\" --token={token} \
            --start_time_ms={tstart} --end_time_ms={tend} --resolution_hint_ms={res} --endpoint={endpoint}".format(
            script=__file__, program=params.program, tstart=tstart, tend=tend,
            res=params.resolution_hint_ms, token=params.api_token,
            endpoint=params.api_endpoint)
        cmd += " --daemon > /tmp/{}.log 2>&1 &".format(params.cache_key_prefix())
        print("start_backing_job_as_process:", cmd)
        os.system(cmd)

    # Begin code for start_backing_job_if_necessary().
    try:
        cache_keys = CacheKeys(params.cache_key_prefix())
        print("start_backing_job_if_necessary: started", cache_keys)
        now_ms = utils.millitime()
        cached_state: KeepaliveState = pickle.loads(cache.get(cache_keys.keepalive))
        keepalive_age_ms = now_ms - cached_state.last_keepalive_ms
        expiry_ms = defaults.KEEPALIVE_EXPIRY_MULTIPLE * defaults.KEEPALIVE_INTERVAL_SEC * 1000
        if keepalive_age_ms < expiry_ms:
            print("start_backing_job_if_necessary: found active backing job already running. keepalive_age_ms =",
                  keepalive_age_ms)
            return cached_state
        print("start_backing_job_if_necessary: found expired keepalive_age_ms =",
              keepalive_age_ms)
        cache.set(cache_keys.keepalive, None)
    except Exception as e:
        print("start_backing_job_if_necessary: no keepalive found in cache", e)

    tstart = params.start_time_ms
    tend = params.end_time_ms
    if not params.is_streaming():
        tstart = params.absolute_ms(tstart)
        tend = params.absolute_ms(tend)

    if context.invoked_function_arn:
        # This backing job was invoked as a lambda, so invoke a new lambda.
        start_backing_job_as_lambda(params, tstart, tend, context)
    else:
        start_backing_job_as_process(params, tstart, tend)
    return None
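
# Illustrative sketch (an assumption, not part of the original module) of the
# CLI entrypoint that start_backing_job_as_process relies on. The flag names
# mirror the command string built above; the real argument handling lives
# elsewhere in this project.
def parse_backing_job_args_sketch():
    import argparse
    p = argparse.ArgumentParser(description="backing job daemon (sketch)")
    p.add_argument("--program", required=True)
    p.add_argument("--token", required=True)
    p.add_argument("--start_time_ms", type=int)
    p.add_argument("--end_time_ms", type=int)
    p.add_argument("--resolution_hint_ms", type=int)
    p.add_argument("--endpoint")
    p.add_argument("--daemon", action="store_true")
    return p.parse_args()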