def pull_external_data_encodings_only(project_id, dp_id, object_info, credentials, receipt_token, parent_span=None):
    """
    Pull encoding data (JSON or binary) from the object store using the service's own
    MinIO credentials, and load it into the system as binary encoding data.
    """
    log = logger.bind(pid=project_id, dp_id=dp_id)

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.info("Project deleted, stopping immediately")
            return

    bucket_name = object_info['bucket']
    object_name = object_info['path']

    log.info("Pulling encoding data from an object store")
    env_credentials = parse_minio_credentials({
        'AccessKeyId': config.MINIO_ACCESS_KEY,
        'SecretAccessKey': config.MINIO_SECRET_KEY
    })
    stat, stream = stat_and_stream_object(bucket_name, object_name, env_credentials)
    count = int(stat.metadata['X-Amz-Meta-Hash-Count'])
    size = int(stat.metadata['X-Amz-Meta-Hash-Size'])

    if object_name.endswith('.json'):
        encodings_stream = ijson.items(io.BytesIO(stream.data), 'clks.item')
        converted_stream = include_encoding_id_in_json_stream(encodings_stream, size, count)
    else:
        converted_stream = include_encoding_id_in_binary_stream(stream, size, count)

    upload_clk_data_binary(project_id, dp_id, converted_stream, receipt_token, count, size, parent_span=parent_span)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))
def pull_external_data_encodings_only(project_id, dp_id, object_info, credentials, receipt_token, parent_span=None):
    """
    Pull binary encoding data from the object store using the caller supplied credentials,
    and load it into the system.
    """
    log = logger.bind(pid=project_id, dp_id=dp_id)

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.info("Project deleted, stopping immediately")
            return

    bucket_name = object_info['bucket']
    object_name = object_info['path']

    log.info("Pulling encoding data from an object store")
    mc_credentials = parse_minio_credentials(credentials)
    stat, stream = stat_and_stream_object(bucket_name, object_name, mc_credentials)
    count = int(stat.metadata['X-Amz-Meta-Hash-Count'])
    size = int(stat.metadata['X-Amz-Meta-Hash-Size'])

    converted_stream = include_encoding_id_in_binary_stream(stream, size, count)

    upload_clk_data_binary(project_id, dp_id, converted_stream, receipt_token, count, size)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))
def upload_clk_data_binary(project_id, dp_id, raw_stream, count, size=128):
    """
    Save the user provided raw CLK data.
    """
    receipt_token = generate_code()
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    # Set the state to 'pending' in the bloomingdata table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn, filename, dp_id, receipt_token, count)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)

    logger.info(f"Storing supplied binary clks of individual size {size} in file: {filename}")

    num_bytes = count * (size + 6)

    logger.debug("Directly storing binary file with index, base64 encoded CLK, popcount")

    # Upload to object store
    logger.info(f"Uploading {count} binary encodings to object store. Total size: {fmt_bytes(num_bytes)}")
    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('save-to-minio', child_of=parent_span):
        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, filename, data=raw_stream, length=num_bytes)
        except (minio.error.InvalidSizeError, minio.error.InvalidArgumentError, minio.error.ResponseError):
            logger.info("Mismatch between expected stream length and header info")
            raise ValueError("Mismatch between expected stream length and header info")

    with opentracing.tracer.start_span('update-database', child_of=parent_span):
        with DBConn() as conn:
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
            db.set_dataprovider_upload_state(conn, dp_id, True)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))

    return receipt_token
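# Illustrative sketch only, not part of the service code: the arithmetic above
# (num_bytes = count * (size + 6)) assumes 6 bytes of per-record framing around each
# encoding. One layout consistent with that total is a 4-byte big-endian index, the
# encoding bytes, and a 2-byte popcount; the exact fields packed by the real binary
# format may differ.
import struct


def example_binary_record(encoding_size=128):
    # index (4 bytes) + encoding (encoding_size bytes) + popcount (2 bytes)
    fmt = struct.Struct(f'!I{encoding_size}sH')
    assert fmt.size == encoding_size + 6
    return fmt

# e.g. total upload size for 1000 encodings of 128 bytes:
#   1000 * example_binary_record(128).size  ==  1000 * 134 bytes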
def project_delete(project_id):
    log, parent_span = bind_log_and_span(project_id)
    log.info('Request to delete project')
    # Check the resource exists and hasn't already been marked for deletion
    abort_if_project_doesnt_exist(project_id)

    # Check the caller has a valid results token. Yes it should be renamed.
    abort_if_invalid_results_token(project_id, request.headers.get('Authorization'))
    log.info("Marking project for deletion")

    with DBConn() as db_conn:
        db.mark_project_deleted(db_conn, project_id)

    log.info("Queuing authorized request to delete project resources")
    remove_project.delay(project_id, serialize_span(parent_span))

    return '', 204
def post(project_id, run):
    log, span = bind_log_and_span(project_id)
    log.debug("Processing request to add a new run", run=run)
    # Check the resource exists
    abort_if_project_doesnt_exist(project_id)

    # Check the caller has a valid results token. Yes it should be renamed.
    abort_if_invalid_results_token(project_id, request.headers.get('Authorization'))

    abort_if_project_in_error_state(project_id)

    run_model = Run.from_json(run, project_id)

    log.debug("Saving run")

    with db.DBConn() as db_conn:
        run_model.save(db_conn)

    check_for_executable_runs.delay(project_id, serialize_span(span))
    return RunDescription().dump(run_model), 201
def post(project_id, run):
    log = logger.bind(pid=project_id)
    log.debug("Processing request to add a new run", run=run)
    # Check the resource exists
    abort_if_project_doesnt_exist(project_id)

    # Check the caller has a valid results token. Yes it should be renamed.
    abort_if_invalid_results_token(project_id, request.headers.get('Authorization'))

    abort_if_project_in_error_state(project_id)

    run_model = Run.from_json(run, project_id)

    log.debug("Saving run")

    with db.DBConn() as db_conn:
        run_model.save(db_conn)
        project_object = db.get_project(db_conn, project_id)
        parties_contributed = db.get_number_parties_uploaded(db_conn, project_id)
        ready_to_run = parties_contributed == project_object['parties']
        log.debug("Expecting {} parties to upload data. Have received {}".format(
            project_object['parties'], parties_contributed))
        if ready_to_run:
            log.info("Scheduling task to carry out all runs for project {} now".format(project_id))
            update_run_mark_queued(db_conn, run_model.run_id)
        else:
            log.info("Task queued but won't start until CLKs are all uploaded")

    if ready_to_run:
        span = g.flask_tracer.get_span()
        span.set_tag("run_id", run_model.run_id)
        span.set_tag("project_id", run_model.project_id)
        check_for_executable_runs.delay(project_id, serialize_span(span))

    return RunDescription().dump(run_model), 201
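# Hypothetical client-side usage of the run-creation endpoint handled by post() above.
# The base URL, the '/projects/{project_id}/runs' path and the JSON body fields are
# assumptions for illustration; only the Authorization header and the project_id path
# element come directly from the handlers above.
import requests


def example_create_run(base_url, project_id, results_token, threshold=0.9):
    response = requests.post(
        f"{base_url}/projects/{project_id}/runs",
        headers={'Authorization': results_token},
        json={'threshold': threshold, 'notes': 'example run'},
    )
    response.raise_for_status()
    # Expected to contain the new run's description (RunDescription above)
    return response.json()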
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    log = logger.bind(pid=project_id)
    headers = request.headers

    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('check-auth', child_of=parent_span):
        abort_if_project_doesnt_exist(project_id)
        if headers is None or 'Authorization' not in headers:
            safe_fail_request(401, message="Authentication token required")

        token = headers['Authorization']

        # Check the caller has a valid token, otherwise 403
        abort_if_invalid_dataprovider_token(token)

    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(get_db(), project_id)

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")
    receipt_token = None

    with opentracing.tracer.start_span('upload-data', child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        if headers['Content-Type'] == "application/json":
            span.set_tag("content-type", 'json')
            # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing
            # the json in one hit. This enables running the web frontend with less memory.
            # However, as connexion is very, very strict about input validation when it comes to
            # json, it will always consume the stream first to validate it against the spec.
            # Thus the backflip to fully reading the CLKs as json into memory. -> issue #184
            receipt_token, raw_file = upload_json_clk_data(dp_id, get_json(), span)
            # Schedule a task to deserialize the hashes, and carry out a pop count.
            handle_raw_upload.delay(project_id, dp_id, receipt_token, parent_span=serialize_span(span))
            log.info("Job scheduled to handle user uploaded hashes")
        elif headers['Content-Type'] == "application/octet-stream":
            span.set_tag("content-type", 'binary')
            log.info("Handling binary CLK upload")
            try:
                count, size = check_binary_upload_headers(headers)
                log.info(f"Headers tell us to expect {count} encodings of {size} bytes")
                span.log_kv({'count': count, 'size': size})
            except Exception:
                log.warning("Upload failed due to problem with headers in binary upload")
                raise
            # Check against project level encoding size (if it has been set)
            if project_encoding_size is not None and size != project_encoding_size:
                # fail fast - we haven't stored the encoded data yet
                return safe_fail_request(400, "Upload 'Hash-Size' doesn't match project settings")

            # TODO actually stream the upload data straight to Minio. Currently we can't because
            # connexion has already read the data before our handler is called!
            # https://github.com/zalando/connexion/issues/592
            # stream = get_stream()
            stream = BytesIO(request.data)
            log.debug(f"Stream size is {len(request.data)} B, and we expect {(6 + size) * count} B")
            if len(request.data) != (6 + size) * count:
                safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct")
            try:
                receipt_token = upload_clk_data_binary(project_id, dp_id, stream, count, size)
            except ValueError:
                safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct.")
        else:
            safe_fail_request(400, "Content Type not supported")

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
def pull_external_data(project_id, dp_id,
                       encoding_object_info,
                       blocks_object_info,
                       receipt_token, parent_span=None):
    """
    Load encoding and blocking data from the object store.

    - Pull the blocking map into memory and create the blocks in the database.
    - Stream the encodings into the database, attaching blocks from the in-memory map.

    :param project_id: identifier for the project
    :param dp_id: identifier for the data provider
    :param encoding_object_info: a dictionary containing the bucket and path of the uploaded encodings
    :param blocks_object_info: a dictionary containing the bucket and path of the uploaded blocks
    :param receipt_token: token used to insert into the database
    """
    env_credentials = parse_minio_credentials({
        'AccessKeyId': config.MINIO_ACCESS_KEY,
        'SecretAccessKey': config.MINIO_SECRET_KEY
    })
    log = logger.bind(pid=project_id, dp_id=dp_id)
    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.info("Project deleted, stopping immediately")
            return

    mc = connect_to_object_store(env_credentials)
    log.debug("Pulling blocking information from object store")
    response = mc.get_object(bucket_name=blocks_object_info['bucket'], object_name=blocks_object_info['path'])
    encoding_to_block_map = json.load(response)['blocks']

    log.debug("Counting the blocks")
    block_sizes = {}
    for encoding_id in encoding_to_block_map:
        _blocks = encoding_to_block_map[encoding_id]
        for block_id in _blocks:
            block_id = str(block_id)
            block_sizes[block_id] = block_sizes.setdefault(block_id, 0) + 1

    block_count = len(block_sizes)
    log.debug(f"Processing {block_count} blocks")

    # stream the encodings
    bucket_name = encoding_object_info['bucket']
    object_name = encoding_object_info['path']

    stat, encodings_stream = stat_and_stream_object(bucket_name, object_name, env_credentials)
    count = int(stat.metadata['X-Amz-Meta-Hash-Count'])
    size = int(stat.metadata['X-Amz-Meta-Hash-Size'])
    log.debug(f"Processing {count} encodings of size {size}")
    assert count == len(encoding_to_block_map), f"Expected {count} encodings in blocks got {len(encoding_to_block_map)}"

    with DBConn() as conn:
        with opentracing.tracer.start_span('update-metadata-db', child_of=parent_span):
            insert_encoding_metadata(conn, None, dp_id, receipt_token, encoding_count=count, block_count=block_count)
            update_encoding_metadata_set_encoding_size(conn, dp_id, size)

        with opentracing.tracer.start_span('create-block-entries-in-db', child_of=parent_span):
            log.debug("Adding blocks to db")
            insert_blocking_metadata(conn, dp_id, block_sizes)

        def ijson_encoding_iterator(encoding_stream):
            binary_formatter = binary_format(size)
            for encoding_id, encoding in zip(range(count), encoding_stream):
                yield (
                    str(encoding_id),
                    binary_formatter.pack(encoding_id, deserialize_bytes(encoding)),
                    encoding_to_block_map[str(encoding_id)]
                )

        def encoding_iterator(encoding_stream):
            binary_formatter = binary_format(size)
            for encoding_id in range(count):
                yield (
                    str(encoding_id),
                    binary_formatter.pack(encoding_id, encoding_stream.read(size)),
                    encoding_to_block_map[str(encoding_id)]
                )

        if object_name.endswith('.json'):
            encodings_stream = ijson.items(io.BytesIO(encodings_stream.data), 'clks.item')
            encoding_generator = ijson_encoding_iterator(encodings_stream)
        else:
            encoding_generator = encoding_iterator(encodings_stream)

        with opentracing.tracer.start_span('upload-encodings-to-db', child_of=parent_span):
            log.debug("Adding encodings and associated blocks to db")
            try:
                store_encodings_in_db(conn, dp_id, encoding_generator, size)
            except Exception as e:
                update_dataprovider_uploaded_state(conn, project_id, dp_id, 'error')
                log.warning(e)

        with opentracing.tracer.start_span('update-encoding-metadata', child_of=parent_span):
            update_encoding_metadata(conn, None, dp_id, 'ready')
            update_blocks_state(conn, dp_id, block_sizes.keys(), 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))
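# Hypothetical client-side sketch showing how an externally uploaded encoding object could
# carry the 'X-Amz-Meta-Hash-Count' / 'X-Amz-Meta-Hash-Size' metadata that the pull tasks
# above read via stat_and_stream_object(). The function name, bucket and object names are
# placeholders; the MinIO SDK prefixes user metadata keys with 'X-Amz-Meta-'.
import io
from minio import Minio


def example_upload_external_encodings(mc: Minio, bucket, object_name, raw_encoding_bytes, count, size):
    mc.put_object(
        bucket_name=bucket,
        object_name=object_name,
        data=io.BytesIO(raw_encoding_bytes),
        length=len(raw_encoding_bytes),
        content_type='application/octet-stream',
        metadata={'Hash-Count': str(count), 'Hash-Size': str(size)},
    )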
def handle_encoding_upload_json(project_id, dp_id, clk_json, receipt_token, uses_blocking, parent_span):
    """
    Take user provided upload information - accepting multiple formats - and eventually
    ingest it into the database.

    Encodings uploaded directly in the JSON are first quarantined in the object store,
    and a background task deserializes them.

    Encodings that are in an object store are streamed directly into the database by
    a background task.
    """
    log = logger.bind(pid=project_id)
    log.info("Checking json is consistent")
    try:
        abort_if_inconsistent_upload(uses_blocking, clk_json)
    except ValueError as e:
        safe_fail_request(403, e.args[0])

    if "encodings" in clk_json and 'file' in clk_json['encodings']:
        # external encodings
        log.info("External encodings uploaded")
        encoding_object_info = clk_json['encodings']['file']
        object_name = encoding_object_info['path']
        _check_object_path_allowed(project_id, dp_id, object_name, log)

        encoding_credentials = clk_json['encodings'].get('credentials')
        # Schedule a background task to pull the encodings from the object store.
        # This background task updates the database with encoding metadata assuming
        # that there are no blocks.
        if 'blocks' not in clk_json:
            log.info("scheduling task to pull encodings from object store")
            pull_external_data_encodings_only.delay(
                project_id,
                dp_id,
                encoding_object_info,
                encoding_credentials,
                receipt_token,
                parent_span=serialize_span(parent_span))
        else:
            # Need to deal with both encodings and blocks
            if 'file' in clk_json['blocks']:
                object_name = clk_json['blocks']['file']['path']
                _check_object_path_allowed(project_id, dp_id, object_name, log)
                # Blocks are in an external file
                blocks_object_info = clk_json['blocks']['file']
                blocks_credentials = clk_json['blocks'].get('credentials')
                log.info("scheduling task to pull both encodings and blocking data from object store")
                pull_external_data.delay(
                    project_id,
                    dp_id,
                    encoding_object_info, encoding_credentials,
                    blocks_object_info, blocks_credentials,
                    receipt_token,
                    parent_span=serialize_span(parent_span))
            else:
                raise NotImplementedError("Don't currently handle combination of external encodings and blocks")

        return

    # Convert uploaded JSON to common schema.
    #
    # The original JSON API simply accepted "clks", then came a combined encoding and
    # blocking API expecting the top level element "clknblocks". Finally an API that
    # specifies both "encodings" and "blocks" independently at the top level.
    #
    # We rewrite all into the "clknblocks" format.
    if "encodings" in clk_json:
        logger.debug("converting from 'encodings' & 'blocks' format to 'clknblocks'")
        clk_json = convert_encoding_upload_to_clknblock(clk_json)

    is_valid_clks = not uses_blocking and 'clks' in clk_json
    element = 'clks' if is_valid_clks else 'clknblocks'

    if len(clk_json[element]) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied {} from json".format(dp_id, element))

    with opentracing.tracer.start_span('splitting-json-clks', child_of=parent_span) as span:
        encoding_count = len(clk_json[element])
        span.set_tag(element, encoding_count)
        logger.debug(f"Received {encoding_count} {element}")

    if element == 'clks':
        logger.info("Rewriting provided json into clknblocks format")
        clk_json = convert_clks_to_clknblocks(clk_json)
        element = 'clknblocks'

    logger.info("Counting block sizes and number of blocks")
    # e.g. {'clknblocks': [['UG9vcA==', '001', '211'], [...]]}
    block_sizes = {}
    for _, *elements_blocks in clk_json[element]:
        for el_block in elements_blocks:
            block_sizes[el_block] = block_sizes.setdefault(el_block, 0) + 1
    block_count = len(block_sizes)

    logger.info(f"Received {encoding_count} encodings in {block_count} blocks")
    for block in block_sizes:
        logger.info(f"Block {block} has {block_sizes[block]} elements")

    # write clk_json into a temp file
    tmp = tempfile.NamedTemporaryFile(mode='w')
    json.dump(clk_json, tmp)
    tmp.flush()
    with opentracing.tracer.start_span('save-clk-file-to-quarantine', child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.fput_object(Config.MINIO_BUCKET, filename, tmp.name, content_type='application/json')
    logger.info('Saved uploaded {} JSON to file {} in object store.'.format(element.upper(), filename))

    with opentracing.tracer.start_span('update-encoding-metadata', child_of=parent_span):
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token, encoding_count, block_count)
            db.insert_blocking_metadata(conn, dp_id, block_sizes)

    # Schedule a task to deserialize the encodings
    handle_raw_upload.delay(project_id, dp_id, receipt_token, parent_span=serialize_span(parent_span))
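# Minimal sketch of the 'clks' -> 'clknblocks' rewrite referred to above, assuming that
# when no blocking is used every encoding is assigned to a single default block. The real
# convert_clks_to_clknblocks() may choose a different default block id or structure.
def example_convert_clks_to_clknblocks(clk_json, default_block='1'):
    # {'clks': ['UG9vcA==', ...]}  ->  {'clknblocks': [['UG9vcA==', '1'], ...]}
    return {'clknblocks': [[clk, default_block] for clk in clk_json['clks']]}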
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    headers = request.headers
    log, parent_span = bind_log_and_span(project_id)
    log.debug("Starting data upload request")
    token = precheck_upload_token(project_id, headers, parent_span)
    receipt_token = generate_code()
    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(conn, dp_id)
        # get flag use_blocking from table projects
        uses_blocking = get_project_column(conn, project_id, 'uses_blocking')

    if not upload_state_updated:
        return safe_fail_request(403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")

    with opentracing.tracer.start_span('upload-clk-data', child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/json":
                span.set_tag("content-type", 'json')
                # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing
                # the json in one hit. This enables running the web frontend with less memory.
                # However, as connexion is very, very strict about input validation when it comes to
                # json, it will always consume the stream first to validate it against the spec.
                # Thus the backflip to fully reading the CLKs as json into memory. -> issue #184
                handle_encoding_upload_json(project_id, dp_id, get_json(), receipt_token, uses_blocking,
                                            parent_span=span)
                log.info("Job scheduled to handle user's upload")
            elif headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(f"Headers tell us to expect {count} encodings of {size} bytes")
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning("Upload failed due to problem with headers in binary upload")
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(400, "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)
                expected_bytes = binary_format(size).size * count
                log.debug(f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B")
                if len(request.data) != expected_bytes:
                    safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct")
                try:
                    upload_clk_data_binary(project_id, dp_id, stream, receipt_token, count, size)
                except ValueError:
                    safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct.")
            else:
                safe_fail_request(400, "Content Type not supported")
        except ProblemException as e:
            # The exception is safe to show the user. We reset the upload state to
            # allow the user to try the upload again.
            log.info(f"Problem occurred, returning status={e.status} - {e.detail}")
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='not_started')
            raise
        except Exception as e:
            log.warning("Unhandled error occurred during data upload")
            log.exception(e)
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            safe_fail_request(500, "Sorry, the server couldn't handle that request")

    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
def project_binaryclks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    log, parent_span = bind_log_and_span(project_id)
    headers = request.headers
    token = precheck_upload_token(project_id, headers, parent_span)

    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(conn, dp_id)

    if not upload_state_updated:
        return safe_fail_request(403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")
    receipt_token = generate_code()

    with opentracing.tracer.start_span('upload-clk-data', child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(f"Headers tell us to expect {count} encodings of {size} bytes")
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning("Upload failed due to problem with headers in binary upload")
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(400, "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)
                converted_stream = include_encoding_id_in_binary_stream(stream, size, count)
                expected_bytes = size * count
                log.debug(f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B")
                if len(request.data) != expected_bytes:
                    safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct")
                try:
                    upload_clk_data_binary(project_id, dp_id, converted_stream, receipt_token, count, size)
                except ValueError:
                    safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct.")
            else:
                safe_fail_request(400, "Content Type not supported")
        except Exception:
            log.warning("The dataprovider was not able to upload their clks,"
                        " re-enable the corresponding upload token to be used.")
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            raise

    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
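# Hypothetical client-side call against the binary upload handler above. The header name
# 'Hash-Size' appears in the handler's error message; 'Hash-Count', the base URL and the
# '/projects/{project_id}/binaryclks' path are assumptions for illustration, as is the
# exact behaviour of check_binary_upload_headers().
import requests


def example_upload_binary_clks(base_url, project_id, upload_token, encodings: bytes, count, size):
    response = requests.post(
        f"{base_url}/projects/{project_id}/binaryclks",
        headers={
            'Authorization': upload_token,
            'Content-Type': 'application/octet-stream',
            'Hash-Count': str(count),
            'Hash-Size': str(size),
        },
        data=encodings,  # expected to be exactly count * size bytes
    )
    response.raise_for_status()
    return response.json()['receipt_token']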