示例#1
0
async def test_locmem_cache_uses_net_loc_for_separaing_namespaces():
    """A key stored under one ``locmem://`` net location is not visible under another."""
    default_ns = Cache("locmem://")
    other_ns = Cache("locmem://other")

    for backend in (default_ns, other_ns):
        await backend.connect()

    await default_ns.set("test", "Ok!")

    # The second namespace must not see the key written to the first one.
    seen_elsewhere = await other_ns.get("test")
    assert seen_elsewhere is None

    seen_locally = await default_ns.get("test")
    assert seen_locally == "Ok!"
示例#2
0
def update_keepalive(params: inputs.Inputs, keepalive_state: KeepaliveState,
                     cache: Cache):
    ''' Refresh this job's keepalive record in the cache.

    Runs exit_if_necessary() for the given state first, then stamps the state with the current
    time and stores the pickled state under the keepalive cache key. Any exception is printed
    together with its traceback instead of being propagated. `params` is not used here. '''
    try:
        keys = keepalive_state.cache_keys
        exit_if_necessary(keepalive_state, cache)
        keepalive_state.last_keepalive_ms = utils.millitime()
        cache.set(keys.keepalive, pickle.dumps(keepalive_state))
    except Exception as exc:
        print("update_keepalive: exception", exc, traceback.format_exc())
async def test_cache_response() -> None:
    """A cachable response gets caching headers and later requests are served from cache."""
    cache = Cache("locmem://null", ttl=2 * 60)
    spy = CacheSpy(PlainTextResponse("Hello, world!"))
    middleware = CacheMiddleware(spy, cache=cache)
    client = httpx.AsyncClient(app=middleware, base_url="http://testserver")

    async with cache, client:
        assert spy.misses == 0

        # First request: has to go through to the wrapped app.
        first = await client.get("/")
        assert first.status_code == 200
        assert first.text == "Hello, world!"
        assert spy.misses == 1

        # The expiry headers must reflect the configured 120-second TTL.
        assert "Expires" in first.headers
        expires = dt.datetime.strptime(
            first.headers["Expires"], "%a, %d %b %Y %H:%M:%S GMT"
        )
        delta: dt.timedelta = expires - dt.datetime.utcnow()
        assert delta.total_seconds() == pytest.approx(120, rel=1e-2)
        assert "Cache-Control" in first.headers
        assert first.headers["Cache-Control"] == "max-age=120"

        # Later requests must not add misses and must compare equal to the first response.
        for _ in range(2):
            repeat = await client.get("/")
            assert spy.misses == 1
            assert ComparableHTTPXResponse(repeat) == first
async def test_cookies_in_response_and_cookieless_request() -> None:
    """
    A response that sets cookies must not be cached when the request
    itself carries no cookies.
    """
    cache = Cache("locmem://null")

    async def cookie_setting_app(scope: Scope, receive: Receive, send: Send) -> None:
        response = PlainTextResponse("Hello, world!")
        response.set_cookie("session_id", "1234")
        await response(scope, receive, send)

    spy = CacheSpy(cookie_setting_app)
    middleware = CacheMiddleware(spy, cache=cache)
    client = httpx.AsyncClient(app=middleware, base_url="http://testserver")

    async with cache, client:
        # Every request must reach the app, so the miss counter grows each time.
        for expected_misses in (1, 2):
            r = await client.get("/")
            assert r.status_code == 200
            assert r.text == "Hello, world!"
            assert spy.misses == expected_misses
示例#5
0
def generate_cache_key(
    request: Request,
    method: str,
    varying_headers: typing.List[str],
    cache: Cache,
) -> str:
    """
    Return a cache key generated from the request full URL and varying
    response headers.

    The key has the form
    ``cache_page.<method>.<md5 of URL>.<md5 of varying header values>`` and is
    passed through ``cache.make_key()`` before being returned.

    Note that the given `method` may be different from that of the request, e.g.
    because we're trying to find a response cached from a previous GET request
    while this one is a HEAD request. (This is OK because web servers will strip content
    from responses to a HEAD request before sending them on the wire.)
    """
    assert method in CACHABLE_METHODS

    # Digest of the values of every varying header present on the request,
    # fed in the order given by `varying_headers`. Absent headers are skipped.
    ctx = hashlib.md5()
    for header in varying_headers:
        value = request.headers.get(header)
        if value is not None:
            ctx.update(value.encode())

    # Encode as UTF-8 rather than ASCII so that a URL containing non-ASCII
    # characters does not raise UnicodeEncodeError. For pure-ASCII URLs the
    # bytes (and therefore the resulting key) are unchanged.
    absolute_url = str(request.url)
    url = hashlib.md5(absolute_url.encode("utf-8"))

    return cache.make_key(
        f"cache_page.{method}.{url.hexdigest()}.{ctx.hexdigest()}")
示例#6
0
async def cache():
    """Yield a connected, emptied cache at ``redis://localhost:6379/1``.

    The cache is cleared before being handed out and again afterwards. The
    connection is always closed, even if clearing fails or the generator is
    closed early, so a failure does not leak the connection.
    """
    obj = Cache("redis://localhost:6379/1")
    await obj.connect()
    try:
        await obj.clear()
        yield obj
        await obj.clear()
    finally:
        await obj.disconnect()
async def test_streaming_response() -> None:
    """Streaming responses should never be served from the cache."""
    cache = Cache("locmem://null")

    async def chunks() -> typing.AsyncIterator[str]:
        for piece in ("Hello, ", "world!"):
            yield piece

    async def streaming_app(scope: Scope, receive: Receive, send: Send) -> None:
        response = StreamingResponse(chunks())
        await response(scope, receive, send)

    spy = CacheSpy(streaming_app)
    middleware = CacheMiddleware(spy, cache=cache)
    client = httpx.AsyncClient(app=middleware, base_url="http://testserver")

    async with cache, client:
        assert spy.misses == 0

        # Both requests must hit the app: the miss counter increases every time.
        for expected_misses in (1, 2):
            r = await client.get("/")
            assert r.status_code == 200
            assert r.text == "Hello, world!"
            assert spy.misses == expected_misses
示例#8
0
async def test_decorator_raw_asgi() -> None:
    """``@cached`` applied to a raw ASGI callable caches its responses."""
    cache = Cache("locmem://null", ttl=2 * 60)

    @cached(cache)
    async def app(scope: Scope, receive: Receive, send: Send) -> None:
        response = PlainTextResponse("Hello, world!")
        await response(scope, receive, send)

    # Swap the decorated app's inner callable for a spy so misses can be counted.
    spy = CacheSpy(app.app)
    app.app = spy
    client = httpx.AsyncClient(app=app, base_url="http://testserver")

    async with cache, client:
        assert spy.misses == 0

        # Both responses carry caching headers; the miss counter stays at 1.
        for _ in range(2):
            r = await client.get("/")
            assert r.status_code == 200
            assert r.text == "Hello, world!"
            assert "Expires" in r.headers
            assert "Cache-Control" in r.headers
            assert spy.misses == 1
async def test_use_cached_head_response_on_get() -> None:
    """
    Making a HEAD request should use the cached response for future GET requests.
    """
    cache = Cache("locmem://null")
    spy = CacheSpy(PlainTextResponse("Hello, world!"))
    app = CacheMiddleware(spy, cache=cache)
    client = httpx.AsyncClient(app=app, base_url="http://testserver")

    async with cache, client:
        assert spy.misses == 0

        # The HEAD request has no body but still goes through the app once.
        r = await client.head("/")
        assert not r.text
        assert r.status_code == 200
        assert "Expires" in r.headers
        assert "Cache-Control" in r.headers
        assert spy.misses == 1

        # The GET request must not add a miss. Check the caching headers on
        # the GET response itself (`r1`), not on the earlier HEAD response.
        r1 = await client.get("/")
        assert r1.text == "Hello, world!"
        assert r1.status_code == 200
        assert "Expires" in r1.headers
        assert "Cache-Control" in r1.headers
        assert spy.misses == 1
async def test_not_http() -> None:
    """A non-HTTP scope ("lifespan") is handed to the wrapped app by the middleware."""
    async def lifespan_only(scope: Scope, receive: Receive, send: Send) -> None:
        assert scope["type"] == "lifespan"

    cache = Cache("locmem://null")
    middleware = CacheMiddleware(lifespan_only, cache=cache)
    lifespan_scope = {"type": "lifespan"}
    await middleware(lifespan_scope, mock_receive, mock_send)
示例#11
0
async def test_decorate_starlette_view() -> None:
    """Applying ``cached`` to a Starlette-style view function raises ``ValueError``."""
    cache = Cache("locmem://null", ttl=2 * 60)

    async def home(request: Request) -> Response:
        ...  # pragma: no cover

    with pytest.raises(ValueError):
        cached(cache)(home)
示例#12
0
async def test_cache_can_be_used_as_context_manager():
    """Inside ``async with Cache(...)``, set/get, ``get_or_set`` and calling the cache all work."""
    async with Cache("locmem://") as cache:
        await cache.set("test", "Ok!")
        stored = await cache.get("test")
        assert stored == "Ok!"

        computed = await cache.get_or_set('test2', _testing_coroutine('arg', test1='kwarg'))
        assert computed == "Ok!"

        called = await cache(_testing_coroutine('arg', test1='kwarg'))
        assert called == 'Ok!'
示例#13
0
def generate_varying_headers_cache_key(request: Request, cache: Cache) -> str:
    """
    Return a cache key generated from the path of the requested URL, suitable for
    associating varying headers to a requested URL.

    The key has the form ``varying_headers.<md5 of path>`` and is passed
    through ``cache.make_key()`` before being returned.
    """
    url = request.url.path
    # Encode as UTF-8 rather than ASCII so that a path containing non-ASCII
    # characters does not raise UnicodeEncodeError. Keys for pure-ASCII paths
    # are unchanged.
    url_hash = hashlib.md5(url.encode("utf-8"))
    return cache.make_key(f"varying_headers.{url_hash.hexdigest()}")
async def test_vary() -> None:
    """
    Sending different values for request headers registered as varying should
    result in different cache entries.
    """
    cache = Cache("locmem://null")

    async def gzippable_app(scope: Scope, receive: Receive,
                            send: Send) -> None:
        # Respond with a gzip-compressed body only when the client asks for it,
        # and always declare that the response varies on "Accept-Encoding".
        headers = Headers(scope=scope)

        if "gzip" in headers.getlist("accept-encoding"):
            body = gzip.compress(b"Hello, world!")
            response = PlainTextResponse(
                content=body,
                headers={
                    "Content-Encoding": "gzip",
                    "Content-Length": str(len(body))
                },
            )
        else:
            response = PlainTextResponse("Hello, world!")

        response.headers["Vary"] = "Accept-Encoding"
        await response(scope, receive, send)

    spy = CacheSpy(gzippable_app)
    app = CacheMiddleware(spy, cache=cache)
    client = httpx.AsyncClient(app=app, base_url="http://testserver")

    async with cache, client:
        r = await client.get("/", headers={"accept-encoding": "gzip"})
        assert spy.misses == 1
        assert r.status_code == 200
        assert r.text == "Hello, world!"
        assert r.headers["content-encoding"] == "gzip"
        assert "Expires" in r.headers
        assert "Cache-Control" in r.headers

        # Different "Accept-Encoding" header => the cached result
        # for "Accept-Encoding: gzip" should not be used.
        # The assertions below check this response (`r1`), not the first one.
        r1 = await client.get("/", headers={"accept-encoding": "identity"})
        assert spy.misses == 2
        assert r1.status_code == 200
        assert r1.text == "Hello, world!"
        assert "Expires" in r1.headers
        assert "Cache-Control" in r1.headers

        # This "Accept-Encoding" header has already been seen => we should
        # get a cached response.
        # The assertions below check this response (`r2`), not the first one.
        r2 = await client.get("/", headers={"accept-encoding": "gzip"})
        assert spy.misses == 2
        assert r2.status_code == 200
        assert r2.text == "Hello, world!"
        assert r2.headers["Content-Encoding"] == "gzip"
        assert "Expires" in r2.headers
        assert "Cache-Control" in r2.headers
示例#15
0
def keepalive_fn(scheduler: sched.scheduler, params: inputs.Inputs,
                 context: LambdaContext, keepalive_state: KeepaliveState,
                 cache: Cache):
    ''' Each iteration of keepalive_thread runs this code. Add the next iteration of keepalive before exiting to
    continue the keepalive thread. Otherwise keepalives will stop.

    Returns False, without scheduling another iteration, when this lambda invocation is close to expiring and the
    backing job is either idle, not a streaming job, or has just been re-issued. In every other case (including
    when an exception was printed) the function re-enters itself on `scheduler` after
    defaults.KEEPALIVE_INTERVAL_SEC seconds and returns None. '''
    try:
        update_keepalive(params, keepalive_state, cache)
        # Iteration counter kept as an attribute on the function object itself.
        # NOTE(review): `num_keepalives` is not initialised in this block - presumably set by the caller; confirm.
        keepalive_fn.num_keepalives += 1
        if keepalive_fn.num_keepalives % defaults.KEEPALIVE_PRINT_EVERY == 0:
            print("keepalive_fn: keepalive #{}: state={}".format(
                keepalive_fn.num_keepalives, keepalive_state))

        if context.invoked_function_arn and context.get_remaining_time_in_millis(
        ) < defaults.RETRIGGER_BEFORE_EXPIRY_MS:
            # if invoked as lambda (not CLI), then retrigger backing job if this instance of it will expire soon
            cache_keys = keepalive_state.cache_keys
            # int() raises if the lastaccess key yields a non-numeric value; that lands in the except below.
            lastaccess_ms = int(cache.get(cache_keys.lastaccess))
            lastaccess_age_ms = utils.millitime() - lastaccess_ms

            if lastaccess_age_ms > (defaults.BACKING_JOB_LIFETIME_MS * 0.9):
                # There were no recent calls to fetch the data produced by this backing job. No need to re-issue
                print(
                    "Exiting backing job by ending keepalive thread. lastaccess_age_ms = ",
                    lastaccess_age_ms)
                return False

            if not params.is_streaming():
                ''' Fixed time-range jobs need not be reissued '''
                print(
                    "keepalive_fn: backing job won't be restarted because it is not a streaming job",
                    params)
                return False

            # Restart this job again in another lambda invocation.
            # Before doing that, don't keepalive for a while to make it stale. Otherwise the new invocation
            # will assume there is another backing job already running and will auto-exit
            print(
                "keepalive_fn: backing job needs to be restarted. lastaccess_age_ms =",
                lastaccess_age_ms)
            time.sleep(defaults.KEEPALIVE_INTERVAL_SEC *
                       defaults.KEEPALIVE_EXPIRY_MULTIPLE)
            # NOTE(review): four arguments are passed here; confirm this matches the signature of
            # start_backing_job_if_necessary in this module. A TypeError would be swallowed by the except below
            # and the keepalive would simply be rescheduled.
            start_backing_job_if_necessary(params, context, keepalive_state,
                                           cache)
            print(
                "keepalive_fn: exiting current backing job after re-issuing a new one"
            )
            return False
    except Exception as e:
        # Best effort: report the failure and fall through so the keepalive keeps running.
        print("keepalive_fn: exception", e, traceback.format_exc())

    # schedule the next iteration of keepalive thread
    scheduler.enter(defaults.KEEPALIVE_INTERVAL_SEC,
                    1,
                    keepalive_fn,
                    argument=(scheduler, params, context, keepalive_state,
                              cache))
async def test_duplicate_caching() -> None:
    """A route wrapped in its own cache middleware under an app-wide one raises ``DuplicateCaching``."""
    cache = Cache("locmem://default")
    special_cache = Cache("locmem://special")

    class DuplicateCache(HTTPEndpoint):
        pass

    # The endpoint is wrapped by its own middleware *and* by the app-wide one.
    doubly_cached = CacheMiddleware(DuplicateCache, cache=special_cache)
    routes = [Route("/duplicate_cache", doubly_cached)]
    app_middleware = [Middleware(CacheMiddleware, cache=cache)]
    app = Starlette(routes=routes, middleware=app_middleware)

    client = httpx.AsyncClient(app=app, base_url="http://testserver")

    async with cache, special_cache, client:
        with pytest.raises(DuplicateCaching):
            await client.get("/duplicate_cache")
async def test_cache_not_connected() -> None:
    """Serving a request through a cache that was never connected raises ``CacheNotConnected``."""
    cache = Cache("locmem://null", ttl=2 * 60)
    middleware = CacheMiddleware(PlainTextResponse("Hello, world!"), cache=cache)
    client = httpx.AsyncClient(app=middleware, base_url="http://testserver")

    # Only the client is entered here; the cache is deliberately left unconnected.
    async with client:
        with pytest.raises(CacheNotConnected) as ctx:
            await client.get("/")

    error = ctx.value
    assert error.cache is cache
    assert str(cache.url) in str(error)
示例#18
0
async def test_logs_trace(capsys: typing.Any) -> None:
    """At "trace" log level, both the lookup result and the cache read are written to stderr."""
    cache = Cache("locmem://null", ttl=2 * 60)
    middleware = CacheMiddleware(PlainTextResponse("Hello, world!"), cache=cache)
    client = httpx.AsyncClient(app=middleware, base_url="http://testserver")

    async with cache, client:
        with override_log_level("trace"):
            await client.get("/")

    captured = capsys.readouterr()
    stderr = captured.err
    assert "cache_lookup MISS" in stderr
    assert "get_from_cache request.url='http://testserver/" in stderr
示例#19
0
def metadata_consumer_thread_fn(metadata_queue: Queue,
                                keepalive_state: KeepaliveState, cache: Cache):
    ''' Thread body that drains metadata messages and publishes the accumulated metadata to the cache.

    Loops forever: once per second it empties `metadata_queue`, records each message's properties under its
    tsid, and - if anything was received - writes the whole mapping under the metadata cache key.
    Exceptions inside one pass are printed and the loop carries on. '''
    print("metadata_consumer_thread_fn: started")
    try:
        keys = keepalive_state.cache_keys
        known_metadata = {}
        while True:
            try:
                dirty = False
                while not metadata_queue.empty():
                    message = metadata_queue.get()
                    known_metadata[message.tsid] = message.properties
                    dirty = True
                # Only touch the cache when this pass actually received something.
                if dirty:
                    cache.set(keys.metadata, known_metadata)
            except Exception as exc:
                print("metadata_consumer_thread_fn: exception", exc,
                      traceback.format_exc())
            time.sleep(1.0)
    finally:
        print("metadata_consumer_thread_fn: ended")
示例#20
0
async def test_decorator_starlette_endpoint() -> None:
    """``@cached`` on an ``HTTPEndpoint`` caches that endpoint only; others stay uncached."""
    cache = Cache("locmem://null", ttl=2 * 60)

    @cached(cache)
    class CachedHome(HTTPEndpoint):
        async def get(self, request: Request) -> Response:
            return PlainTextResponse("Hello, world!")

    class UncachedUsers(HTTPEndpoint):
        async def get(self, request: Request) -> Response:
            return PlainTextResponse("Hello, users!")

    assert isinstance(CachedHome, CacheMiddleware)

    # Swap the decorated endpoint's inner app for a spy so misses can be counted.
    spy = CacheSpy(CachedHome.app)
    CachedHome.app = spy
    users_spy = CacheSpy(UncachedUsers)

    routes = [Route("/", CachedHome), Route("/users", users_spy)]
    app = Starlette(routes=routes)
    client = httpx.AsyncClient(app=app, base_url="http://testserver")

    async with cache, client:
        assert spy.misses == 0

        # Cached endpoint: caching headers are present and the miss counter stays at 1.
        for _ in range(2):
            r = await client.get("/")
            assert r.status_code == 200
            assert r.text == "Hello, world!"
            assert "Expires" in r.headers
            assert "Cache-Control" in r.headers
            assert spy.misses == 1

        assert users_spy.misses == 0

        # Uncached endpoint: no caching headers and every request reaches the app.
        for expected_misses in (1, 2):
            r = await client.get("/users")
            assert r.status_code == 200
            assert r.text == "Hello, users!"
            assert "Expires" not in r.headers
            assert "Cache-Control" not in r.headers
            assert users_spy.misses == expected_misses
示例#21
0
async def test_non_cachable_zero_ttl(cache: Cache) -> None:
    """
    Storing a response must be refused with ``ResponseNotCachable`` when the cache TTL is zero.
    """
    cache.ttl = 0
    request = Request(
        {
            "type": "http",
            "method": "GET",
            "path": "/path",
            "headers": [],
        }
    )
    response = PlainTextResponse("Hello, world!")
    with pytest.raises(ResponseNotCachable):
        await store_in_cache(response, request=request, cache=cache)
示例#22
0
async def test_logs_debug(capsys: typing.Any) -> None:
    """At "debug" log level, miss/store/hit lines are logged but trace-only details are not."""
    cache = Cache("locmem://null", ttl=2 * 60)
    middleware = CacheMiddleware(PlainTextResponse("Hello, world!"), cache=cache)
    client = httpx.AsyncClient(app=middleware, base_url="http://testserver")

    async with cache, client:
        with override_log_level("debug"):
            # Two requests: the log is expected to show a miss and a store, then a hit.
            for _ in range(2):
                await client.get("/")

    stderr = capsys.readouterr().err
    log_lines = stderr.split("\n")
    miss_line, store_line, hit_line = log_lines[:3]
    assert "cache_lookup MISS" in miss_line
    assert "store_in_cache max_age=120" in store_line
    assert "cache_lookup HIT" in hit_line
    assert "get_from_cache request.url='http://testserver/" not in stderr
async def test_not_200_ok(status_code: int) -> None:
    """A response with the given non-200 status code must not be cached."""
    cache = Cache("locmem://null")
    uncachable = PlainTextResponse("Hello, world!", status_code=status_code)
    spy = CacheSpy(uncachable)
    middleware = CacheMiddleware(spy, cache=cache)
    client = httpx.AsyncClient(app=middleware, base_url="http://testserver")

    async with cache, client:
        first = await client.get("/")
        assert first.status_code == status_code
        assert first.text == "Hello, world!"
        for header in ("Expires", "Cache-Control"):
            assert header not in first.headers
        assert spy.misses == 1

        # The second request reaches the app again and yields an equal response.
        second = await client.get("/")
        assert ComparableHTTPXResponse(second) == first
        assert spy.misses == 2
示例#24
0
def exit_if_necessary(keepalive_state: KeepaliveState, cache: Cache):
    ''' Terminate this process if a different backing job currently owns the keepalive key in cache.

    The cached keepalive state is unpickled and compared with `keepalive_state`. If its id differs and its last
    keepalive is younger than KEEPALIVE_EXPIRY_MULTIPLE * KEEPALIVE_INTERVAL_SEC seconds, the process ends
    immediately via os._exit(1). Failures while reading the cached state are printed and otherwise ignored. '''
    keys = keepalive_state.cache_keys
    try:
        owner: KeepaliveState = pickle.loads(cache.get(keys.keepalive))
        if owner.id == keepalive_state.id:
            # The keepalive in cache is our own; nothing to do.
            return

        expiry_ms = defaults.KEEPALIVE_EXPIRY_MULTIPLE * defaults.KEEPALIVE_INTERVAL_SEC * 1000
        if utils.millitime() - owner.last_keepalive_ms < expiry_ms:
            # Another backing job is running, and it has published a keepalive recently
            last_seen = time.ctime(owner.last_keepalive_ms / 1000)
            print(
                "exit_if_necessary: exiting because another instance already running",
                owner.id, last_seen)
            os._exit(1)
    except Exception as exc:
        print("exit_if_necessary: failed to read keepalive from cache", exc)
async def test_non_cachable_request() -> None:
    """POST requests are not cached: no caching headers, and each request reaches the app."""
    cache = Cache("locmem://null")
    spy = CacheSpy(PlainTextResponse("Hello, world!"))
    middleware = CacheMiddleware(spy, cache=cache)
    client = httpx.AsyncClient(app=middleware, base_url="http://testserver")

    async with cache, client:
        assert spy.misses == 0

        first = await client.post("/")
        assert first.status_code == 200
        assert first.text == "Hello, world!"
        for header in ("Expires", "Cache-Control"):
            assert header not in first.headers
        assert spy.misses == 1

        # The second POST yields an equal response but counts as another miss.
        second = await client.post("/")
        assert ComparableHTTPXResponse(second) == first
        assert spy.misses == 2
示例#26
0
def setup(project_settings: str = None, database: bool = False) -> GraphQL:
    """Load Turbulette applications and return the GraphQL route.

    When `project_settings` is given it is imported as a module path; otherwise the
    settings module is obtained from `get_project_settings_by_env()`. When `database`
    is true, `get_gino_instance()` is called before the registry is set up.
    """
    if project_settings:
        project_settings_module = import_module(project_settings)
    else:
        project_settings_module = get_project_settings_by_env()

    # The database connection must be initialised before the LazySettings object
    # is set up, i.e. before the registry is set up.
    if database:
        get_gino_instance()

    registry = Registry(project_settings_module=project_settings_module)
    conf.registry.__setup__(registry)
    schema = registry.setup()

    # From here on, settings are available through `settings` in `turbulette.conf`.
    settings = conf.settings

    cache.__setup__(Cache(settings.CACHE))

    # PolicyExtension always comes first, followed by the extensions named
    # (as dotted "module.Class" paths) in settings.ARIADNE_EXTENSIONS.
    extensions: List[Type[Extension]] = [PolicyExtension]
    for dotted_path in settings.ARIADNE_EXTENSIONS:
        parts = dotted_path.rsplit(".", 1)
        extension_module = import_module(parts[0])
        extensions.append(getattr(extension_module, parts[1]))

    return GraphQL(
        schema,
        debug=settings.DEBUG,
        extensions=extensions,
        error_formatter=error_formatter,
    )
示例#27
0
 def wait_for_backing_job_to_exit_batch_phase(
         keepalive_state: KeepaliveState, cache: Cache,
         cache_keys: CacheKeys, wait_until_ms: int):
     ''' Poll the cache once per second until the backing job reports it is in its streaming phase.

     Starts from the given `keepalive_state` (which may be falsy) and re-reads the pickled state stored under
     cache_keys.keepalive after each one-second sleep. Returns the first state whose `in_streaming_phase` is truthy.
     Raises Exception once utils.millitime() exceeds `wait_until_ms`. Cache read failures are printed and retried. '''
     print("wait_for_backing_job_to_exit_batch_phase: started",
           cache_keys.keepalive)
     while not keepalive_state or not keepalive_state.in_streaming_phase:
         # wait for backing job to be running and advance to streaming state
         if utils.millitime() > wait_until_ms:
             raise Exception(
                 "wait_for_backing_job_to_exit_batch_phase: timed out")
         print(
             "get_cached_result: waiting for batch phase to end. keepalive_state=",
             keepalive_state)
         time.sleep(1)
         try:
             # On failure the previous keepalive_state is kept and the loop simply tries again.
             keepalive_state: KeepaliveState = pickle.loads(
                 cache.get(cache_keys.keepalive))
         except Exception as e:
             print(
                 "wait_for_backing_job_to_exit_batch_phase: failed to read keepalive from cache",
                 cache_keys.keepalive, e)
     print("wait_for_backing_job_to_exit_batch_phase: backing job is ready",
           keepalive_state)
     return keepalive_state
示例#28
0
def get_cached_result(params: inputs.Inputs, context: LambdaContext,
                      cache: Cache):
    ''' Query the data cached by the backing job and return it as a result dict.

    Steps: record the access time under the lastaccess key, start a backing job if none is active, wait until
    the backing job is in its streaming phase, then fetch the metadata and the data points whose timestamps
    fall inside the requested time range.

    Returns a dict with keys start_time_ms, end_time_ms, earliest_result_ms, latest_result_ms, resolution_ms,
    metadata, data (tsid -> list of [timestamp, value]) and missing_timestamps_ms.
    Raises Exception if the backing job is not ready within defaults.API_TIMEOUT_MS of the invoke time. '''
    def wait_for_backing_job_to_exit_batch_phase(
            keepalive_state: KeepaliveState, cache: Cache,
            cache_keys: CacheKeys, wait_until_ms: int):
        # Poll the keepalive key once per second until the state reports the streaming phase.
        print("wait_for_backing_job_to_exit_batch_phase: started",
              cache_keys.keepalive)
        while not keepalive_state or not keepalive_state.in_streaming_phase:
            # wait for backing job to be running and advance to streaming state
            if utils.millitime() > wait_until_ms:
                raise Exception(
                    "wait_for_backing_job_to_exit_batch_phase: timed out")
            print(
                "get_cached_result: waiting for batch phase to end. keepalive_state=",
                keepalive_state)
            time.sleep(1)
            try:
                keepalive_state: KeepaliveState = pickle.loads(
                    cache.get(cache_keys.keepalive))
            except Exception as e:
                print(
                    "wait_for_backing_job_to_exit_batch_phase: failed to read keepalive from cache",
                    cache_keys.keepalive, e)
        print("wait_for_backing_job_to_exit_batch_phase: backing job is ready",
              keepalive_state)
        return keepalive_state

    print("get_cached_result: started")

    # Update 'lastaccess' timestamp in memcache to indicate the corresponding backing job's data was recently queried
    cache_keys: CacheKeys = CacheKeys(params.cache_key_prefix())
    now_ms = params.invoke_time_ms
    try:
        cache.set(cache_keys.lastaccess, now_ms)
    except Exception as e:
        print(
            "get_cached_result: failed to set lastaccess cache key {}={}, {}".
            format(cache_keys.lastaccess, now_ms, e))

    # start the backing job if one is not running, or if the backing job's keepalive timestamp is stale
    keepalive_state: KeepaliveState = start_backing_job_if_necessary(
        params, context, cache)

    # now that backing job is surely running, wait for it to become 'ready' - i.e. go from batch to streaming phase
    keepalive_state = wait_for_backing_job_to_exit_batch_phase(
        keepalive_state, cache, cache_keys, now_ms + defaults.API_TIMEOUT_MS)

    # compute which cache keys need to be fetched
    if not params.is_streaming():
        tstart = params.absolute_ms(params.start_time_ms)
        tend = params.absolute_ms(params.end_time_ms)
    else:
        tend = now_ms
        tstart = tend - params.duration_ms()

    timestamps = sorted(ts for ts in keepalive_state.data_timestamps
                        if tstart <= ts <= tend)
    data_keys = [cache_keys.data_prefix + str(ts) for ts in timestamps]

    # retrieve metadata and data from cache. retry if necessary
    metadata = cache.get(cache_keys.metadata)
    if metadata is None:
        # Nothing stored under the metadata key (e.g. not published yet): treat it as empty
        # instead of failing on metadata.keys() further down.
        metadata = {}
    if timestamps:
        print(
            "get_cached_result: fetching {} timestamps {} - {} @ {}ms".format(
                len(timestamps), time.ctime(timestamps[0] / 1000),
                time.ctime(timestamps[-1] / 1000),
                keepalive_state.resolution_ms))
    data = cache.multiget(data_keys)
    missing_keys = set(data_keys) - set(data)
    if missing_keys:
        # One retry for the keys that the first multiget did not return.
        print("get_cached_result: retrying fetch of {}/{} keys: {}".format(
            len(missing_keys), len(data_keys), sorted(missing_keys)))
        data.update(cache.multiget(list(missing_keys)))

    # Fill in results in results struct
    result = {
        "start_time_ms": tstart,
        "end_time_ms": tend,
        "earliest_result_ms": 0,
        "latest_result_ms": 0,
        "resolution_ms": keepalive_state.resolution_ms,
        "metadata": metadata,
        "data": {},
        "missing_timestamps_ms": []
    }

    # First fill in retrieved data. `timestamps` and `data_keys` are parallel lists.
    tsids = set()
    missing_timestamps = []
    for timestamp, k in zip(timestamps, data_keys):
        if k not in data:
            missing_timestamps.append(timestamp)
            continue
        for tsid, value in data[k].items():
            if not result["earliest_result_ms"]:
                result["earliest_result_ms"] = timestamp
            if timestamp > result["latest_result_ms"]:
                result["latest_result_ms"] = timestamp
            tsids.add(tsid)
            result["data"].setdefault(tsid, []).append([timestamp, value])

    # Second, fill in metadata of only the relevant mts that have data.
    # `metadata` is the same object as result["metadata"], so popping here trims the result too.
    for tsid in set(metadata) - tsids:
        metadata.pop(tsid)

    result["missing_timestamps_ms"] = missing_timestamps
    return result
示例#29
0
def start_backing_job_if_necessary(params: inputs.Inputs,
                                   context: LambdaContext, cache: Cache):
    ''' If no backing job is running for a given signalflow program and duration, start one.

    Returns the keepalive state read from cache if an active backing job is found (to prevent a duplicate cache
    read by callers). Returns None after starting a new backing job, which happens when no keepalive could be
    read from cache or the cached keepalive has expired. '''
    def start_backing_job_as_lambda(params: inputs.Inputs, tstart, tend,
                                    context: LambdaContext):
        # Start new backing job that runs as a lambda function
        print("start_backing_job_as_lambda: started")
        import boto3
        lambda_client = boto3.client('lambda')
        lambda_client.invoke(FunctionName=context.invoked_function_arn,
                             InvocationType='Event',
                             Payload=json.dumps({
                                 "program": params.program,
                                 "start_time_ms": tstart,
                                 "end_time_ms": tend,
                                 "resolution_hint_ms":
                                 params.resolution_hint_ms,
                                 "api_token": params.api_token,
                                 "api_endpoint": params.api_endpoint,
                                 "daemon": True
                             }))

    def start_backing_job_as_process(params: inputs.Inputs, tstart, tend):
        # Start new backing job that runs as a python process
        print("start_backing_job_as_process: started")
        import shlex
        # Every argument is shell-quoted: the program text, token and endpoint come from the
        # request parameters and must not be interpreted by the shell.
        args = [
            "nohup",
            "python3",
            __file__,
            "--program={}".format(params.program),
            "--token={}".format(params.api_token),
            "--start_time_ms={}".format(tstart),
            "--end_time_ms={}".format(tend),
            "--resolution_hint_ms={}".format(params.resolution_hint_ms),
            "--endpoint={}".format(params.api_endpoint),
            "--daemon",
        ]
        log_path = "/tmp/{}.log".format(params.cache_key_prefix())
        # Run in the background with stdout and stderr redirected to the log file.
        cmd: str = "{} > {} 2>&1 &".format(
            " ".join(shlex.quote(arg) for arg in args), shlex.quote(log_path))
        # NOTE(review): this prints the full command line, including the API token.
        print("start_backing_job_as_process:", cmd)
        os.system(cmd)

    # begin code for start_backing_job_if_necessary()
    try:
        cache_keys = CacheKeys(params.cache_key_prefix())
        print("start_backing_job_if_necessary: started", cache_keys)
        now_ms = utils.millitime()
        cached_state: KeepaliveState = pickle.loads(
            cache.get(cache_keys.keepalive))
        keepalive_age_ms = now_ms - cached_state.last_keepalive_ms
        expiry_ms = defaults.KEEPALIVE_EXPIRY_MULTIPLE * defaults.KEEPALIVE_INTERVAL_SEC * 1000

        if keepalive_age_ms < expiry_ms:
            print(
                "start_backing_job_if_necessary: found active backing job already running. keepalive_age_ms =",
                keepalive_age_ms)
            return cached_state

        print(
            "start_backing_job_if_necessary: found expired keepalive_age_ms =",
            keepalive_age_ms)
        # Overwrite the stale keepalive before a new backing job is started.
        cache.set(cache_keys.keepalive, None)
    except Exception as e:
        # Any failure to read or unpickle the keepalive is treated as "no backing job running".
        print("start_backing_job_if_necessary: no keeplive found in cache", e)

    tstart = params.start_time_ms
    tend = params.end_time_ms
    if not params.is_streaming():
        tstart = params.absolute_ms(tstart)
        tend = params.absolute_ms(tend)

    if context.invoked_function_arn:
        # This backing job was invoked as a lambda. So invoke a new lambda
        start_backing_job_as_lambda(params, tstart, tend, context)
    else:
        start_backing_job_as_process(params, tstart, tend)

    return None
示例#30
0
def data_consumer_thread_fn(params: inputs.Inputs, context: LambdaContext,
                            data_queue: Queue, keepalive_state: KeepaliveState,
                            cache: Cache):
    ''' Thread that consumes data messages from analytics job, and sticks each one individually into cache.
     Also detects when job moves from batch to stream phase. Unfortunately that requires 'auto-detection' where
     data does not arrive for close to a second :-(

     Messages are drained from data_queue without blocking and merged per logical timestamp into a pending
     dict. Whenever the queue has nothing to offer, the pending entries are written to cache under
     cache_keys.data_prefix + <timestamp>; entries that fail to write stay pending and are retried on the
     next idle pass.

     params: job inputs; is_streaming() and job_duration_ms() are consulted here
     context: passed through unchanged to healthcheck_thread_fn
     data_queue: polled with get(False); each message must expose logical_timestamp_ms and data
     keepalive_state: cache_keys and resolution_ms are read; data_timestamps and in_streaming_phase are
                      mutated in place
     cache: destination for the data; only set() is called here

     The loop has no exit condition: the function only returns after an exception escapes the loop, and
     that exception is printed rather than re-raised. '''
    # Imported locally so this block does not rely on the file's import section.
    from queue import Empty

    print("data_consumer_thread_fn: started")
    try:
        cache_keys = keepalive_state.cache_keys
        data_to_encache = {}
        last_datamsg_walltime_ms = 0
        while True:
            now_ms = utils.millitime()
            try:
                if params.is_streaming():
                    # remove trailing data keys that are beyond the scope of the current 'window' of a streaming job
                    oldest_valid_ms = (now_ms - params.job_duration_ms() -
                                       keepalive_state.resolution_ms)
                    keepalive_state.data_timestamps = {
                        ts for ts in keepalive_state.data_timestamps
                        if ts >= oldest_valid_ms
                    }

                msg = data_queue.get(False)
                last_datamsg_walltime_ms = utils.millitime()
                # Several messages may carry the same logical timestamp; merge their data
                data_to_encache.setdefault(msg.logical_timestamp_ms,
                                           {}).update(msg.data)
            except Exception as consume_error:
                # Normally this means no data was found in the queue. Anything else is still handled
                # best-effort (flush, then sleep) but is reported instead of being silently masked.
                if not isinstance(consume_error, Empty):
                    print(
                        "data_consumer_thread_fn: unexpected error while consuming data message",
                        consume_error)

                # There may be pending data from previous messages that need caching
                timestamps_encached = set()
                for timestamp, values in data_to_encache.items():
                    try:
                        cache.set(cache_keys.data_prefix + str(timestamp),
                                  values)
                    except Exception as cache_error:
                        # Failed to set data in cache. Keep the entry pending so it is retried.
                        print(
                            "data_consumer_thread_fn: failed to write data key for timestamp {} to cache: {}"
                            .format(timestamp, cache_error))
                    else:
                        timestamps_encached.add(timestamp)
                # Removal happens after the loop so the dict is not modified while being iterated
                for timestamp_encached in timestamps_encached:
                    data_to_encache.pop(timestamp_encached)
                    keepalive_state.data_timestamps.add(timestamp_encached)
                if data_to_encache:
                    print(
                        "data_consumer_thread_fn: will retry writing {} data keys to cache {}"
                        .format(len(data_to_encache), list(data_to_encache)))
                elif not keepalive_state.in_streaming_phase:
                    # Now that all data is successfully published, 'Auto-detect' whether we have completed batch phase
                    # and entered stream phase. If so, update keepalive_state
                    if last_datamsg_walltime_ms > 0 and (
                            now_ms - last_datamsg_walltime_ms
                    ) > defaults.STREAM_PHASE_DETECTION_INTERVAL_MS:
                        keepalive_state.in_streaming_phase = True
                        print(
                            "data_consumer_thread_fn: backing job entered stream phase after {} datapoints. now={}, last={}"
                            .format(len(keepalive_state.data_timestamps),
                                    now_ms, last_datamsg_walltime_ms))
                        # start healthcheck thread now that data is flowing in
                        threading.Thread(target=healthcheck_thread_fn,
                                         args=(params, context,
                                               keepalive_state,
                                               cache)).start()

                # Sleep only on the idle path, so a backlog of queued messages is drained without delay
                time.sleep(defaults.STREAM_PHASE_DETECTION_INTERVAL_MS / 1000 /
                           5)
    except Exception as e:
        print("data_consumer_thread_fn exception", e, traceback.format_exc())
    finally:
        print("data_consumer_thread_fn: ended")