Example #1
def keepalive_fn(scheduler: sched.scheduler, params: inputs.Inputs,
                 context: LambdaContext, keepalive_state: KeepaliveState,
                 cache: Cache):
    ''' Each iteration of the keepalive thread runs this function. Schedule the next iteration before returning
    to keep the keepalive thread going; otherwise keepalives stop '''
    try:
        update_keepalive(params, keepalive_state, cache)
        keepalive_fn.num_keepalives += 1
        if keepalive_fn.num_keepalives % defaults.KEEPALIVE_PRINT_EVERY == 0:
            print("keepalive_fn: keepalive #{}: state={}".format(
                keepalive_fn.num_keepalives, keepalive_state))

        if context.invoked_function_arn and context.get_remaining_time_in_millis(
        ) < defaults.RETRIGGER_BEFORE_EXPIRY_MS:
            # if invoked as lambda (not CLI), then retrigger backing job if this instance of it will expire soon
            cache_keys = keepalive_state.cache_keys
            lastaccess_ms = int(cache.get(cache_keys.lastaccess))
            lastaccess_age_ms = utils.millitime() - lastaccess_ms

            if lastaccess_age_ms > (defaults.BACKING_JOB_LIFETIME_MS * 0.9):
                # There were no recent calls to fetch the data produced by this backing job. No need to re-issue
                print(
                    "Exiting backing job by ending keepalive thread. lastaccess_age_ms = ",
                    lastaccess_age_ms)
                return False

            if not params.is_streaming():
                # Fixed time-range jobs need not be reissued
                print(
                    "keepalive_fn: backing job won't be restarted because it is not a streaming job",
                    params)
                return False

            # Restart this job in another lambda invocation.
            # Before doing that, pause keepalives long enough for the current keepalive to go stale. Otherwise the
            # new invocation will see an apparently active backing job and auto-exit.
            print(
                "keepalive_fn: backing job needs to be restarted. lastaccess_age_ms =",
                lastaccess_age_ms)
            time.sleep(defaults.KEEPALIVE_INTERVAL_SEC *
                       defaults.KEEPALIVE_EXPIRY_MULTIPLE)
            start_backing_job_if_necessary(params, context, keepalive_state,
                                           cache)
            print(
                "keepalive_fn: exiting current backing job after re-issuing a new one"
            )
            return False
    except Exception as e:
        print("keepalive_fn: exception", e, traceback.format_exc())

    # schedule the next iteration of keepalive thread
    scheduler.enter(defaults.KEEPALIVE_INTERVAL_SEC,
                    1,
                    keepalive_fn,
                    argument=(scheduler, params, context, keepalive_state,
                              cache))
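For context, a minimal sketch of how this keepalive loop could be bootstrapped. The start_keepalive_thread name and the daemon-thread choice are assumptions and not shown in the source; only keepalive_fn, defaults.KEEPALIVE_INTERVAL_SEC and the scheduler.enter() arguments come from the example above.

import sched
import threading
import time

def start_keepalive_thread(params, context, keepalive_state, cache):
    # Hypothetical bootstrap: initialize the function-attribute counter, schedule
    # the first iteration, and let the scheduler run on its own thread. keepalive_fn
    # keeps re-entering itself until it returns without calling scheduler.enter().
    scheduler = sched.scheduler(time.time, time.sleep)
    keepalive_fn.num_keepalives = 0  # counter incremented inside keepalive_fn
    scheduler.enter(defaults.KEEPALIVE_INTERVAL_SEC, 1, keepalive_fn,
                    argument=(scheduler, params, context, keepalive_state, cache))
    threading.Thread(target=scheduler.run, daemon=True).start()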
Example #2
def update_keepalive(params: inputs.Inputs, keepalive_state: KeepaliveState,
                     cache: Cache):
    ''' Update the keepalive state in cache. Also check if the current backing job owns the keepalive. If not, exit '''
    try:
        cache_keys = keepalive_state.cache_keys
        exit_if_necessary(keepalive_state, cache)
        keepalive_state.last_keepalive_ms = utils.millitime()
        cache.set(cache_keys.keepalive, pickle.dumps(keepalive_state))
    except Exception as e:
        print("update_keepalive: exception", e, traceback.format_exc())
Example #3
 def __init__(self):
     self.invoke_time_ms = utils.millitime()
     self.cache_location = os.environ.get("cache_url", None)
     self.api_token = None
     self.api_endpoint = os.environ.get(
         "api_endpoint",
         "https://api.signalfx.com")  # SignalFx API endpoint
     self.program = None  # signalflow program
     self.start_time_ms = None
     self.end_time_ms = 0
     self.resolution_hint_ms = 0
     self.is_daemon = False
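The lambda event assembled in the CLI snippet of Example #8 uses the same field names, so a hypothetical helper (inputs_from_event is not part of the source; the real Inputs class may parse events differently) could populate these attributes from an incoming event:

def inputs_from_event(event: dict):
    # Hypothetical sketch mapping event keys onto the attributes set in __init__ above.
    params = Inputs()
    params.program = event.get("program")
    params.api_token = event.get("api_token")
    params.api_endpoint = event.get("api_endpoint", params.api_endpoint)
    params.start_time_ms = event.get("start_time_ms", params.start_time_ms)
    params.end_time_ms = event.get("end_time_ms", params.end_time_ms)
    params.resolution_hint_ms = event.get("resolution_hint_ms", params.resolution_hint_ms)
    params.cache_location = event.get("cache_url", params.cache_location)
    params.is_daemon = event.get("daemon", False)
    return params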
Example #4
def exit_if_necessary(keepalive_state: KeepaliveState, cache: Cache):
    ''' If this backing job ever discovers that another instance of the same job is currently running and owns the
    keepalive key in cache, it exits '''
    cache_keys = keepalive_state.cache_keys
    try:
        cached_state: KeepaliveState = pickle.loads(
            cache.get(cache_keys.keepalive))
        if cached_state.id != keepalive_state.id:
            expiry_ms = defaults.KEEPALIVE_EXPIRY_MULTIPLE * defaults.KEEPALIVE_INTERVAL_SEC * 1000
            if utils.millitime() - cached_state.last_keepalive_ms < expiry_ms:
                # Another backing job is running, and it has published a keepalive recently
                print(
                    "exit_if_necessary: exiting because another instance already running",
                    cached_state.id,
                    time.ctime(cached_state.last_keepalive_ms / 1000))
                os._exit(1)
    except Exception as e:
        print("exit_if_necessary: failed to read keepalive from cache", e)
Example #5
 def wait_for_backing_job_to_exit_batch_phase(
         keepalive_state: KeepaliveState, cache: Cache,
         cache_keys: CacheKeys, wait_until_ms: int):
     print("wait_for_backing_job_to_exit_batch_phase: started",
           cache_keys.keepalive)
     while not keepalive_state or not keepalive_state.in_streaming_phase:
         # wait for backing job to be running and advance to streaming state
         if utils.millitime() > wait_until_ms:
             raise Exception(
                 "wait_for_backing_job_to_exit_batch_phase: timed out")
         print(
             "get_cached_result: waiting for batch phase to end. keepalive_state=",
             keepalive_state)
         time.sleep(1)
         try:
             keepalive_state: KeepaliveState = pickle.loads(
                 cache.get(cache_keys.keepalive))
         except Exception as e:
             print(
                 "wait_for_backing_job_to_exit_batch_phase: failed to read keepalive from cache",
                 cache_keys.keepalive, e)
     print("wait_for_backing_job_to_exit_batch_phase: backing job is ready",
           keepalive_state)
     return keepalive_state
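Usage might look like the following sketch, where a reader gives the backing job a fixed budget to leave its batch phase; the 30-second budget is an assumption, not a value from the source.

cache_keys = CacheKeys(params.cache_key_prefix())
wait_until_ms = utils.millitime() + 30 * 1000  # assumed 30 s budget
# Passing None forces the loop to poll the cache until a keepalive with
# in_streaming_phase=True appears, or the deadline is hit.
keepalive_state = wait_for_backing_job_to_exit_batch_phase(
    None, cache, cache_keys, wait_until_ms)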
Example #6
def start_backing_job_if_necessary(params: inputs.Inputs,
                                   context: LambdaContext, cache: Cache):
    ''' If no backing job is running for a given signalflow program and duration, start one.
    Returns keepalive_state from cache if an active backing job is found (to prevent a duplicate cache read by callers) '''
    def start_backing_job_as_lambda(params: inputs.Inputs, tstart, tend,
                                    context: LambdaContext):
        # Start new backing job that runs as a lambda function
        print("start_backing_job_as_lambda: started")
        import boto3
        lambda_client = boto3.client('lambda')
        lambda_client.invoke(FunctionName=context.invoked_function_arn,
                             InvocationType='Event',
                             Payload=json.dumps({
                                 "program": params.program,
                                 "start_time_ms": tstart,
                                 "end_time_ms": tend,
                                 "resolution_hint_ms":
                                 params.resolution_hint_ms,
                                 "api_token": params.api_token,
                                 "api_endpoint": params.api_endpoint,
                                 "daemon": True
                             }))

    def start_backing_job_as_process(params: inputs.Inputs, tstart, tend):
        # Start new backing job that runs as a python process
        print("start_backing_job_as_process: started")
        cmd: str = "nohup python3 {script} --program=\"{program}\" --token={token} \
                    --start_time_ms={tstart} --end_time_ms={tend} --resolution_hint_ms={res} --endpoint={endpoint}".format(
            script=__file__,
            program=params.program,
            tstart=tstart,
            tend=tend,
            res=params.resolution_hint_ms,
            token=params.api_token,
            endpoint=params.api_endpoint)
        cmd += " --daemon > /tmp/{}.log 2>&1 &".format(
            params.cache_key_prefix())
        print("start_backing_job_as_process:", cmd)
        os.system(cmd)

    # begin code for start_backing_job_if_necessary()
    try:
        cache_keys = CacheKeys(params.cache_key_prefix())
        print("start_backing_job_if_necessary: started", cache_keys)
        now_ms = utils.millitime()
        cached_state: KeepaliveState = pickle.loads(
            cache.get(cache_keys.keepalive))
        keepalive_age_ms = now_ms - cached_state.last_keepalive_ms
        expiry_ms = defaults.KEEPALIVE_EXPIRY_MULTIPLE * defaults.KEEPALIVE_INTERVAL_SEC * 1000

        if keepalive_age_ms < expiry_ms:
            print(
                "start_backing_job_if_necessary: found active backing job already running. keepalive_age_ms =",
                keepalive_age_ms)
            return cached_state

        print(
            "start_backing_job_if_necessary: found expired keepalive_age_ms =",
            keepalive_age_ms)
        cache.set(cache_keys.keepalive, None)
    except Exception as e:
        print("start_backing_job_if_necessary: no keeplive found in cache", e)

    tstart = params.start_time_ms
    tend = params.end_time_ms
    if not params.is_streaming():
        tstart = params.absolute_ms(tstart)
        tend = params.absolute_ms(tend)

    if context.invoked_function_arn:
        # This backing job was invoked as a lambda. So invoke a new lambda
        start_backing_job_as_lambda(params, tstart, tend, context)
    else:
        start_backing_job_as_process(params, tstart, tend)

    return None
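The expiry arithmetic here (and in Examples #1, #4 and #7) relies on a defaults module that is not shown. The names below are taken from the examples; the values are illustrative placeholders, not the project's actual settings:

# Hypothetical defaults; only the constant names come from the examples above.
KEEPALIVE_INTERVAL_SEC = 5                 # how often keepalive_fn runs
KEEPALIVE_EXPIRY_MULTIPLE = 3              # keepalive considered stale after N missed intervals
KEEPALIVE_PRINT_EVERY = 10                 # log every Nth keepalive
RETRIGGER_BEFORE_EXPIRY_MS = 60 * 1000     # re-invoke the lambda this long before it expires
BACKING_JOB_LIFETIME_MS = 15 * 60 * 1000   # exit if nobody fetched the data for about this long
STREAM_PHASE_DETECTION_INTERVAL_MS = 900   # quiet period that signals the end of the batch phase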
Example #7
def data_consumer_thread_fn(params: inputs.Inputs, context: LambdaContext,
                            data_queue: Queue, keepalive_state: KeepaliveState,
                            cache: Cache):
    ''' Thread that consumes data messages from the analytics job and writes each one individually into cache.
     Also detects when the job moves from batch to stream phase. Unfortunately that requires 'auto-detection':
     the phase change is assumed once data has not arrived for close to a second :-( '''
    print("data_consumer_thread_fn: started")
    try:
        cache_keys = keepalive_state.cache_keys
        data_to_encache = {}
        last_datamsg_walltime_ms = 0
        while True:
            now_ms = utils.millitime()
            try:
                if params.is_streaming():
                    # remove trailing data keys that are beyond the scope of the current 'window' of a streaming job
                    valid_timestamps = [
                        ts for ts in keepalive_state.data_timestamps
                        if ts >= (now_ms - params.job_duration_ms() -
                                  keepalive_state.resolution_ms)
                    ]
                    keepalive_state.data_timestamps = set(valid_timestamps)

                msg = data_queue.get(False)
                last_datamsg_walltime_ms = utils.millitime()
                data_to_encache.setdefault(msg.logical_timestamp_ms, {})
                data_to_encache[msg.logical_timestamp_ms].update(msg.data)
            except Exception as e:
                # No data found in queue. However there may be pending data from previous messages that need caching
                timestamps_encached = set()
                for timestamp, values in data_to_encache.items():
                    try:
                        cache.set(cache_keys.data_prefix + str(timestamp),
                                  values)
                        timestamps_encached.add(timestamp)
                    except Exception:
                        # Failed to set data in cache; keep it in data_to_encache and retry next iteration
                        pass
                for timestamp_encached in timestamps_encached:
                    data_to_encache.pop(timestamp_encached)
                    keepalive_state.data_timestamps.add(timestamp_encached)
                if data_to_encache:
                    print(
                        "data_consumer_thread_fn: will retry writing {} data keys to cache {}"
                        .format(len(data_to_encache), list(data_to_encache)))
                elif not keepalive_state.in_streaming_phase:
                    # Now that all data is successfully published, 'Auto-detect' whether we have completed batch phase
                    # and entered stream phase. If so, update keepalive_state
                    if last_datamsg_walltime_ms > 0 and (
                            now_ms - last_datamsg_walltime_ms
                    ) > defaults.STREAM_PHASE_DETECTION_INTERVAL_MS:
                        keepalive_state.in_streaming_phase = True
                        print(
                            "data_consumer_thread_fn: backing job entered stream phase after {} datapoints. now={}, last={}"
                            .format(len(keepalive_state.data_timestamps),
                                    now_ms, last_datamsg_walltime_ms))
                        # start healthcheck thread now that data is flowing in
                        threading.Thread(target=healthcheck_thread_fn,
                                         args=(params, context,
                                               keepalive_state,
                                               cache)).start()

                time.sleep(defaults.STREAM_PHASE_DETECTION_INTERVAL_MS / 1000 /
                           5)
    except Exception as e:
        print("data_consumer_thread_fn exception", e, traceback.format_exc())
    finally:
        print("data_consumer_thread_fn: ended")
Example #8
        parser.add_argument(
            "--cache_url",
            required=False,
            default="",
            help="memcache configuration url is host:port form")
        parser.add_argument("--daemon", action="store_true")
        args = parser.parse_args()

        test_event = {
            "program": args.program,
            "api_token": args.token,
            "cache_url": args.cache_url,
            "daemon": args.daemon
        }
        if args.endpoint:
            test_event["api_endpoint"] = args.endpoint
        if args.start_time_ms:
            test_event["start_time_ms"] = args.start_time_ms
        if args.end_time_ms:
            test_event["end_time_ms"] = args.end_time_ms
        if args.resolution_hint_ms:
            test_event["resolution_hint_ms"] = args.resolution_hint_ms
        context = LambdaContext()
        context.aws_request_id = utils.millitime()
        context.invoked_function_arn = None
        lambda_handler(test_event, context)

    except Exception as e:
        print("Exception", e, traceback.format_exc())
        os._exit(1)
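The examples call only cache.get() and cache.set(), with --cache_url given in host:port form. A minimal Cache wrapper consistent with that interface, assuming pymemcache; the real class is not shown and the examples store both raw values and pickled blobs, so an actual implementation would also need a serializer for non-bytes values:

from pymemcache.client.base import Client

class Cache:
    # Hypothetical wrapper around a memcache client, keyed by "host:port".
    def __init__(self, cache_url: str):
        host, port = cache_url.split(":")
        self.client = Client((host, int(port)))

    def get(self, key):
        return self.client.get(key)

    def set(self, key, value):
        return self.client.set(key, value)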