async def write_s3_obj(app, obj_id, bucket=None):
    """ writes the given object to s3 """
    s3key = getS3Key(obj_id)
    log.info(f"write_s3_obj for obj_id: {obj_id} / s3_key: {s3key} bucket: {bucket}")
    pending_s3_write = app["pending_s3_write"]
    pending_s3_write_tasks = app["pending_s3_write_tasks"]
    dirty_ids = app["dirty_ids"]
    chunk_cache = app['chunk_cache']
    meta_cache = app['meta_cache']
    deflate_map = app['deflate_map']
    shuffle_map = app['shuffle_map']
    notify_objs = app["root_notify_ids"]
    deleted_ids = app['deleted_ids']
    success = False

    if isValidDomain(obj_id):
        domain_bucket = getBucketForDomain(obj_id)
        if bucket and bucket != domain_bucket:
            log.error(f"expected bucket for domain: {obj_id} to match what was passed to write_s3_obj")
        else:
            bucket = domain_bucket

    if s3key in pending_s3_write:
        msg = f"write_s3_obj - not expected for key {s3key} to be in pending_s3_write map"
        log.error(msg)
        raise KeyError(msg)

    if obj_id not in pending_s3_write_tasks:
        # don't allow reentrant write
        log.debug(f"write_s3_obj for {obj_id} not s3sync task")

    if obj_id in deleted_ids and isValidUuid(obj_id):
        # if this obj_id has been deleted (and it's unique since this is not a domain id),
        # cancel any pending task and return
        log.warn(f"Canceling write for {obj_id} since it has been deleted")
        if obj_id in pending_s3_write_tasks:
            log.info(f"removing pending s3 write task for {obj_id}")
            task = pending_s3_write_tasks[obj_id]
            task.cancel()
            del pending_s3_write_tasks[obj_id]
        return None

    now = time.time()
    last_update_time = now
    if obj_id in dirty_ids:
        # timestamp is first element of two-tuple
        last_update_time = dirty_ids[obj_id][0]
    if last_update_time > now:
        msg = f"last_update time {last_update_time} is in the future for obj_id: {obj_id}"
        log.error(msg)
        raise ValueError(msg)

    pending_s3_write[s3key] = now
    # do the following in the try block so we can always remove the pending_s3_write at the end
    try:
        if isValidChunkId(obj_id):
            if obj_id not in chunk_cache:
                log.error(f"expected to find obj_id: {obj_id} in chunk cache")
                raise KeyError(f"{obj_id} not found in chunk cache")
            if not chunk_cache.isDirty(obj_id):
                log.error(f"expected chunk cache obj {obj_id} to be dirty")
                raise ValueError("bad dirty state for obj")
            chunk_arr = chunk_cache[obj_id]
            chunk_bytes = arrayToBytes(chunk_arr)
            dset_id = getDatasetId(obj_id)
            deflate_level = None
            shuffle = 0
            if dset_id in deflate_map:
                deflate_level = deflate_map[dset_id]
                log.debug(f"got deflate_level: {deflate_level} for dset: {dset_id}")
            if dset_id in shuffle_map:
                shuffle = shuffle_map[dset_id]
                log.debug(f"got shuffle size: {shuffle} for dset: {dset_id}")

            await putS3Bytes(app, s3key, chunk_bytes, shuffle=shuffle,
                             deflate_level=deflate_level, bucket=bucket)
            success = True

            # if chunk has been evicted from cache something has gone wrong
            if obj_id not in chunk_cache:
                msg = f"expected to find {obj_id} in chunk_cache"
                log.error(msg)
            elif obj_id in dirty_ids and dirty_ids[obj_id][0] > last_update_time:
                log.info(f"write_s3_obj {obj_id} got updated while s3 write was in progress")
            else:
                # no new write, can clear dirty
                chunk_cache.clearDirty(obj_id)  # allow eviction from cache
                log.debug("putS3Bytes Chunk cache utilization: {} per, dirty_count: {}".format(
                    chunk_cache.cacheUtilizationPercent, chunk_cache.dirtyCount))
        else:
            # metadata update - check for object in meta cache
            if obj_id not in meta_cache:
                log.error(f"expected to find obj_id: {obj_id} in meta cache")
                raise KeyError(f"{obj_id} not found in meta cache")
            if not meta_cache.isDirty(obj_id):
                log.error(f"expected meta cache obj {obj_id} to be dirty")
                raise ValueError("bad dirty state for obj")
            obj_json = meta_cache[obj_id]

            await putS3JSONObj(app, s3key, obj_json, bucket=bucket)
            success = True

            # should still be in meta_cache...
            if obj_id in deleted_ids:
                log.info(f"obj {obj_id} has been deleted while write was in progress")
            elif obj_id not in meta_cache:
                msg = f"expected to find {obj_id} in meta_cache"
                log.error(msg)
            elif obj_id in dirty_ids and dirty_ids[obj_id][0] > last_update_time:
                log.info(f"write_s3_obj {obj_id} got updated while s3 write was in progress")
            else:
                meta_cache.clearDirty(obj_id)  # allow eviction from cache
    finally:
        # clear pending_s3_write item
        log.debug(f"write_s3_obj finally block, success={success}")
        if s3key not in pending_s3_write:
            msg = f"write s3 obj: Expected to find {s3key} in pending_s3_write map"
            log.error(msg)
        else:
            if pending_s3_write[s3key] != now:
                msg = f"pending_s3_write timestamp got updated unexpectedly for {s3key}"
                log.error(msg)
            del pending_s3_write[s3key]
        # clear task
        if obj_id not in pending_s3_write_tasks:
            log.debug(f"no pending s3 write task for {obj_id}")
        else:
            log.debug(f"removing pending s3 write task for {obj_id}")
            del pending_s3_write_tasks[obj_id]
        # clear dirty flag
        if obj_id in dirty_ids and dirty_ids[obj_id][0] == last_update_time:
            log.debug(f"clearing dirty flag for {obj_id}")
            del dirty_ids[obj_id]

    # add to map so that root can be notified about changed objects
    if isValidUuid(obj_id) and isSchema2Id(obj_id):
        root_id = getRootObjId(obj_id)
        notify_objs[root_id] = bucket

    # calculate time to do the write
    elapsed_time = time.time() - now
    log.info(f"s3 write for {s3key} took {elapsed_time:.3f}s")
    return obj_id
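# A minimal sketch (not part of the service) of the timestamp bookkeeping that
# write_s3_obj relies on. It assumes dirty_ids maps obj_id to a tuple whose first
# element is the last-update time, as the code above indicates, and shows why the
# finally block only clears the dirty flag when no newer update arrived while the
# S3 write was in flight. The function name is illustrative only.
def _sketch_dirty_flag_decision(dirty_ids, obj_id, write_start_time):
    """Return True if it is safe to drop the dirty flag after a completed write."""
    if obj_id not in dirty_ids:
        return False  # nothing to clear
    last_update_time = dirty_ids[obj_id][0]  # timestamp is first element of the tuple
    # safe only if the object was not re-written after the S3 write began;
    # otherwise the newer update still needs to be flushed
    return last_update_time <= write_start_time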
async def POST_Chunk(request):
    log.request(request)
    app = request.app
    params = request.rel_url.query

    put_points = False
    if "count" not in params:
        log.warn("expected count param")
        raise HTTPBadRequest()
    num_points = int(params["count"])

    if "action" in params and params["action"] == "put":
        log.info(f"POST Chunk put points - num_points: {num_points}")
        put_points = True
    else:
        log.info(f"POST Chunk get points - num_points: {num_points}")

    s3path = None
    s3offset = 0
    s3size = 0
    if "s3path" in params:
        if put_points:
            log.error("s3path cannot be used with put points POST request")
            raise HTTPBadRequest()
        s3path = params["s3path"]
        log.debug(f"POST_Chunk - using s3path: {s3path}")
        bucket = None
    elif "bucket" in params:
        bucket = params["bucket"]
    else:
        bucket = None

    if "s3offset" in params:
        try:
            s3offset = int(params["s3offset"])
        except ValueError:
            log.error(f"invalid s3offset params: {params['s3offset']}")
            raise HTTPBadRequest()
    if "s3size" in params:
        try:
            s3size = int(params["s3size"])
        except ValueError:
            log.error(f"invalid s3size params: {params['s3size']}")
            raise HTTPBadRequest()

    chunk_id = request.match_info.get('id')
    if not chunk_id:
        msg = "Missing chunk id"
        log.error(msg)
        raise HTTPBadRequest(reason=msg)
    log.info(f"POST chunk_id: {chunk_id}")
    chunk_index = getChunkIndex(chunk_id)
    log.debug(f"chunk_index: {chunk_index}")

    if not isValidUuid(chunk_id, "Chunk"):
        msg = f"Invalid chunk id: {chunk_id}"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    validateInPartition(app, chunk_id)
    log.debug(f"request params: {list(params.keys())}")
    if "dset" in params:
        msg = "Unexpected dset in POST request"
        log.error(msg)
        raise HTTPBadRequest(reason=msg)

    if not request.has_body:
        msg = "POST Value with no body"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    content_type = "application/octet-stream"
    if "Content-Type" in request.headers:
        # client should use "application/octet-stream" for binary transfer
        content_type = request.headers["Content-Type"]
    if content_type != "application/octet-stream":
        msg = f"Unexpected content_type: {content_type}"
        log.error(msg)
        raise HTTPBadRequest(reason=msg)

    dset_id = getDatasetId(chunk_id)

    dset_json = await get_metadata_obj(app, dset_id, bucket=bucket)
    dims = getChunkLayout(dset_json)
    rank = len(dims)

    type_json = dset_json["type"]
    dset_dtype = createDataType(type_json)

    # create a numpy array for incoming points
    input_bytes = await request_read(request)
    if len(input_bytes) != request.content_length:
        msg = f"Read {len(input_bytes)} bytes, expecting: {request.content_length}"
        log.error(msg)
        raise HTTPInternalServerError()

    if rank == 1:
        coord_type_str = "uint64"
    else:
        coord_type_str = f"({rank},)uint64"

    if put_points:
        # create a numpy array with the following type:
        # (coord1, coord2, ...) | dset_dtype
        point_dt = np.dtype([("coord", np.dtype(coord_type_str)), ("value", dset_dtype)])
        point_shape = (num_points,)
        chunk_init = True
    else:
        point_dt = np.dtype('uint64')
        point_shape = (num_points, rank)
        chunk_init = False

    point_arr = bytesToArray(input_bytes, point_dt, point_shape)

    chunk_arr = await get_chunk(app, chunk_id, dset_json, bucket=bucket,
                                s3path=s3path, s3offset=s3offset, s3size=s3size,
                                chunk_init=chunk_init)
    if chunk_arr is None:
        log.warn(f"chunk {chunk_id} not found")
        raise HTTPNotFound()

    if put_points:
        # writing point data
        try:
            chunkWritePoints(chunk_id=chunk_id, chunk_layout=dims,
                             chunk_arr=chunk_arr, point_arr=point_arr)
        except ValueError as ve:
            log.warn(f"got value error from chunkWritePoints: {ve}")
            raise HTTPBadRequest()
        # write empty response
        resp = json_response({})
        save_chunk(app, chunk_id, bucket=bucket)  # lazily write chunk to storage
    else:
        # read points
        try:
            output_arr = chunkReadPoints(chunk_id=chunk_id, chunk_layout=dims,
                                         chunk_arr=chunk_arr, point_arr=point_arr)
        except ValueError as ve:
            log.warn(f"got value error from chunkReadPoints: {ve}")
            raise HTTPBadRequest()
        output_data = arrayToBytes(output_arr)
        # write response
        try:
            resp = StreamResponse()
            resp.headers['Content-Type'] = "application/octet-stream"
            resp.content_length = len(output_data)
            await resp.prepare(request)
            await resp.write(output_data)
        except Exception as e:
            log.error(f"Exception during binary data write: {e}")
            raise HTTPInternalServerError()
        finally:
            await resp.write_eof()

    return resp
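# A minimal client-side sketch (not part of the handler) of how the binary body
# parsed by POST_Chunk could be packed with numpy. It mirrors point_dt and
# point_shape above; the rank, dataset dtype, and coordinates are assumed example
# values for illustration only.
def _sketch_pack_point_bodies():
    import numpy as np
    rank = 2                      # example rank (assumed)
    dset_dtype = np.dtype("f4")   # example dataset dtype (assumed)
    num_points = 3

    # put-points body: one record per point - coordinates followed by the value
    put_dt = np.dtype([("coord", f"({rank},)uint64"), ("value", dset_dtype)])
    put_arr = np.zeros((num_points,), dtype=put_dt)
    put_arr["coord"] = [[0, 0], [1, 2], [3, 4]]
    put_arr["value"] = [1.0, 2.0, 3.0]
    put_body = put_arr.tobytes()

    # get-points body: just the coordinates, uint64, shape (num_points, rank)
    get_arr = np.array([[0, 0], [1, 2], [3, 4]], dtype="uint64")
    get_body = get_arr.tobytes()

    return put_body, get_body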
async def GET_Chunk(request):
    log.request(request)
    app = request.app
    params = request.rel_url.query
    chunk_id = request.match_info.get('id')
    if not chunk_id:
        msg = "Missing chunk id"
        log.error(msg)
        raise HTTPBadRequest(reason=msg)
    if not isValidUuid(chunk_id, "Chunk"):
        msg = "Invalid chunk id: {}".format(chunk_id)
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    validateInPartition(app, chunk_id)
    log.debug("request params: {}".format(list(params.keys())))
    if "dset" not in params:
        msg = "Missing dset in GET request"
        log.error(msg)
        raise HTTPBadRequest(reason=msg)
    dset_json = json.loads(params["dset"])
    log.debug("dset_json: {}".format(dset_json))
    type_json = dset_json["type"]

    dims = getChunkLayout(dset_json)
    log.debug("got dims: {}".format(dims))
    rank = len(dims)

    # get chunk selection from query params
    if "select" in params:
        log.debug("select: {}".format(params["select"]))
    selection = []
    for i in range(rank):
        dim_slice = getSliceQueryParam(request, i, dims[i])
        selection.append(dim_slice)
    selection = tuple(selection)
    log.debug("got selection: {}".format(selection))

    dt = createDataType(type_json)
    log.debug("dtype: {}".format(dt))

    if rank == 0:
        msg = "No dimension passed to GET chunk request"
        log.error(msg)
        raise HTTPBadRequest(reason=msg)
    if len(selection) != rank:
        msg = "Selection rank does not match shape rank"
        log.error(msg)
        raise HTTPBadRequest(reason=msg)
    for i in range(rank):
        s = selection[i]
        log.debug("selection[{}]: {}".format(i, s))

    s3path = None
    s3offset = 0
    s3size = 0
    if "s3path" in params:
        s3path = params["s3path"]
        log.debug(f"GET_Chunk - using s3path: {s3path}")
    if "s3offset" in params:
        try:
            s3offset = int(params["s3offset"])
        except ValueError:
            log.error(f"invalid s3offset params: {params['s3offset']}")
            raise HTTPBadRequest()
    if "s3size" in params:
        try:
            s3size = int(params["s3size"])
        except ValueError:
            log.error(f"invalid s3size params: {params['s3size']}")
            raise HTTPBadRequest()

    chunk_arr = await getChunk(app, chunk_id, dset_json, s3path=s3path,
                               s3offset=s3offset, s3size=s3size)

    if chunk_arr is None:
        # return a 404
        msg = "Chunk {} does not exist".format(chunk_id)
        log.info(msg)
        raise HTTPNotFound()

    resp = None

    if "query" in params:
        # do query selection
        query = params["query"]
        log.info("query: {}".format(query))
        if rank != 1:
            msg = "Query selection only supported for one dimensional arrays"
            log.warn(msg)
            raise HTTPBadRequest(reason=msg)

        limit = 0
        if "Limit" in params:
            limit = int(params["Limit"])

        values = []
        indices = []
        field_names = []
        if dt.fields:
            field_names = list(dt.fields.keys())

        x = chunk_arr[selection]
        log.debug("x: {}".format(x))
        eval_str = getEvalStr(query, "x", field_names)
        log.debug("eval_str: {}".format(eval_str))
        where_result = np.where(eval(eval_str))
        log.debug("where_result: {}".format(where_result))
        where_result_index = where_result[0]
        log.debug("where_result index: {}".format(where_result_index))
        log.debug("boolean selection: {}".format(x[where_result_index]))
        s = selection[0]
        count = 0
        for index in where_result_index:
            log.debug("index: {}".format(index))
            value = x[index].tolist()
            log.debug("value: {}".format(value))
            json_val = bytesArrayToList(value)
            log.debug("json_value: {}".format(json_val))
            json_index = index.tolist() * s.step + s.start  # adjust for selection
            indices.append(json_index)
            values.append(json_val)
            count += 1
            if limit > 0 and count >= limit:
                log.info("got limit items")
                break

        query_result = {}
        query_result["index"] = indices
        query_result["value"] = values
        log.info(f"query_result returning: {len(indices)} rows")
        log.debug(f"query_result: {query_result}")
        resp = json_response(query_result)
    else:
        # get requested data
        output_arr = chunk_arr[selection]
        output_data = arrayToBytes(output_arr)

        # write response
        try:
            resp = StreamResponse()
            resp.headers['Content-Type'] = "application/octet-stream"
            resp.content_length = len(output_data)
            await resp.prepare(request)
            await resp.write(output_data)
        except Exception as e:
            log.error(f"Exception during binary data write: {e}")
            raise HTTPInternalServerError()
        finally:
            await resp.write_eof()

    return resp
async def GET_Chunk(request):
    log.request(request)
    app = request.app
    params = request.rel_url.query
    chunk_id = request.match_info.get('id')
    if not chunk_id:
        msg = "Missing chunk id"
        log.error(msg)
        raise HTTPBadRequest(reason=msg)
    if not isValidUuid(chunk_id, "Chunk"):
        msg = f"Invalid chunk id: {chunk_id}"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    validateInPartition(app, chunk_id)
    log.debug(f"request params: {list(params.keys())}")

    bucket = None
    s3path = None
    s3offset = None
    s3size = None
    query = None
    limit = 0
    if "s3path" in params:
        s3path = params["s3path"]
        log.debug(f"GET_Chunk - using s3path: {s3path}")
    elif "bucket" in params:
        bucket = params["bucket"]
    if "s3offset" in params:
        try:
            s3offset = int(params["s3offset"])
        except ValueError:
            log.error(f"invalid s3offset params: {params['s3offset']}")
            raise HTTPBadRequest()
    if "s3size" in params:
        try:
            s3size = int(params["s3size"])
        except ValueError:
            log.error(f"invalid s3size params: {params['s3size']}")
            raise HTTPBadRequest()
    if "query" in params:
        query = params["query"]
    if "Limit" in params:
        limit = int(params["Limit"])

    dset_id = getDatasetId(chunk_id)

    dset_json = await get_metadata_obj(app, dset_id, bucket=bucket)
    dims = getChunkLayout(dset_json)
    log.debug(f"got dims: {dims}")
    rank = len(dims)

    log.debug(f"dset_json: {dset_json}")

    # get chunk selection from query params
    selection = []
    for i in range(rank):
        dim_slice = getSliceQueryParam(request, i, dims[i])
        selection.append(dim_slice)
    selection = tuple(selection)
    log.debug(f"got selection: {selection}")

    chunk_arr = await get_chunk(app, chunk_id, dset_json, bucket=bucket,
                                s3path=s3path, s3offset=s3offset, s3size=s3size,
                                chunk_init=False)
    if chunk_arr is None:
        msg = f"chunk {chunk_id} not found"
        log.warn(msg)
        raise HTTPNotFound()

    if query:
        # run given query
        try:
            read_resp = chunkQuery(chunk_id=chunk_id, chunk_layout=dims,
                                   chunk_arr=chunk_arr, slices=selection,
                                   query=query, limit=limit, return_json=True)
        except TypeError as te:
            log.warn(f"chunkQuery - TypeError: {te}")
            raise HTTPBadRequest()
        except ValueError as ve:
            log.warn(f"chunkQuery - ValueError: {ve}")
            raise HTTPBadRequest()
    else:
        # read selected data from chunk
        output_arr = chunkReadSelection(chunk_arr, slices=selection)
        read_resp = arrayToBytes(output_arr)

    # write response
    if isinstance(read_resp, bytes):
        try:
            resp = StreamResponse()
            resp.headers['Content-Type'] = "application/octet-stream"
            resp.content_length = len(read_resp)
            await resp.prepare(request)
            await resp.write(read_resp)
        except Exception as e:
            log.error(f"Exception during binary data write: {e}")
            raise HTTPInternalServerError()
        finally:
            await resp.write_eof()
    else:
        # JSON response
        resp = json_response(read_resp)

    return resp
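# A minimal client-side sketch (illustrative only) of consuming GET_Chunk's binary
# response: the body is the raw bytes of the selected region, so it can be rebuilt
# with numpy given the dataset dtype and the selection extents. The dtype and shape
# below are assumed example values, and this only holds for fixed-length dtypes
# (variable-length types are serialized differently by arrayToBytes).
def _sketch_decode_chunk_response(body: bytes):
    import numpy as np
    dset_dtype = np.dtype("f4")     # example dataset dtype (assumed)
    selection_shape = (10, 20)      # extents of the requested selection (assumed)
    arr = np.frombuffer(body, dtype=dset_dtype).reshape(selection_shape)
    return arr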