def pull_external_data_encodings_only(project_id, dp_id, object_info, credentials, receipt_token, parent_span=None):
    """

    """
    log = logger.bind(pid=project_id, dp_id=dp_id)

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.info("Project deleted, stopping immediately")
            return

        bucket_name = object_info['bucket']
        object_name = object_info['path']

    log.info("Pulling encoding data from an object store")
    env_credentials = parse_minio_credentials({
        'AccessKeyId': config.MINIO_ACCESS_KEY,
        'SecretAccessKey': config.MINIO_SECRET_KEY
    })
    stat, stream = stat_and_stream_object(bucket_name, object_name, env_credentials)

    count = int(stat.metadata['X-Amz-Meta-Hash-Count'])
    size = int(stat.metadata['X-Amz-Meta-Hash-Size'])

    if object_name.endswith('.json'):
        encodings_stream = ijson.items(io.BytesIO(stream.data), 'clks.item')
        converted_stream = include_encoding_id_in_json_stream(encodings_stream, size, count)
    else:
        converted_stream = include_encoding_id_in_binary_stream(stream, size, count)
    upload_clk_data_binary(project_id, dp_id, converted_stream, receipt_token, count, size, parent_span=parent_span)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))
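
# The conversion helpers used above are not shown in this listing. Below is a
# minimal, hypothetical sketch of what include_encoding_id_in_binary_stream
# could look like, assuming each output record is the encoding id packed as a
# big-endian uint32 followed by the raw encoding bytes; the real helper may use
# a different record layout (for example, it may also append a popcount).
import io
import struct


def include_encoding_id_in_binary_stream_sketch(stream, size, count):
    """Yield records of (uint32 encoding id + raw encoding bytes) -- illustration only."""
    header = struct.Struct('!I')
    for encoding_id in range(count):
        encoding = stream.read(size)
        if len(encoding) != size:
            raise ValueError("Stream ended before all encodings were read")
        yield header.pack(encoding_id) + encoding


# Example usage: three fake encodings of 8 bytes each.
fake_stream = io.BytesIO(bytes(range(24)))
records = list(include_encoding_id_in_binary_stream_sketch(fake_stream, size=8, count=3))
assert len(records) == 3 and all(len(r) == 4 + 8 for r in records)
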
Example #2
def pull_external_data_encodings_only(project_id, dp_id, object_info, credentials, receipt_token, parent_span=None):
    """

    """
    log = logger.bind(pid=project_id, dp_id=dp_id)

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.info("Project deleted, stopping immediately")
            return

        bucket_name = object_info['bucket']
        object_name = object_info['path']

    log.info("Pulling encoding data from an object store")
    mc_credentials = parse_minio_credentials(credentials)
    stat, stream = stat_and_stream_object(bucket_name, object_name, mc_credentials)

    count = int(stat.metadata['X-Amz-Meta-Hash-Count'])
    size = int(stat.metadata['X-Amz-Meta-Hash-Size'])
    converted_stream = include_encoding_id_in_binary_stream(stream, size, count)
    upload_clk_data_binary(project_id, dp_id, converted_stream, receipt_token, count, size)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))
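
# Both pull tasks above read the encoding count and size from the object's user
# metadata (X-Amz-Meta-Hash-Count / X-Amz-Meta-Hash-Size). This is a minimal
# sketch of how an uploader could attach that metadata with the MinIO client;
# the bucket and object names are placeholders, not the service's actual
# upload path.
import io

from minio import Minio


def upload_encodings_with_metadata_sketch(client: Minio, bucket: str, name: str,
                                          data: bytes, count: int, size: int):
    # User metadata keys are surfaced as 'X-Amz-Meta-<Key>' by stat_object.
    client.put_object(
        bucket,
        name,
        io.BytesIO(data),
        length=len(data),
        metadata={'Hash-Count': str(count), 'Hash-Size': str(size)},
    )
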
Example #3
def upload_clk_data_binary(project_id, dp_id, raw_stream, count, size=128):
    """
    Save the user provided raw CLK data.

    """
    receipt_token = generate_code()
    filename = Config.BIN_FILENAME_FMT.format(receipt_token)
    # Set the state to 'pending' in the bloomingdata table
    with DBConn() as conn:
        db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                    count)
        db.update_encoding_metadata_set_encoding_size(conn, dp_id, size)
    logger.info(
        f"Storing supplied binary clks of individual size {size} in file: {filename}"
    )

    num_bytes = count * (size + 6)

    logger.debug(
        "Directly storing binary file with index, base64 encoded CLK, popcount"
    )

    # Upload to object store
    logger.info(
        f"Uploading {count} binary encodings to object store. Total size: {fmt_bytes(num_bytes)}"
    )
    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('save-to-minio',
                                       child_of=parent_span) as span:
        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET,
                          filename,
                          data=raw_stream,
                          length=num_bytes)
        except (minio.error.InvalidSizeError, minio.error.InvalidArgumentError,
                minio.error.ResponseError):
            logger.info(
                "Mismatch between expected stream length and header info")
            raise ValueError(
                "Mismatch between expected stream length and header info")

    with opentracing.tracer.start_span('update-database',
                                       child_of=parent_span) as span:
        with DBConn() as conn:
            db.update_encoding_metadata(conn, filename, dp_id, 'ready')
            db.set_dataprovider_upload_state(conn, dp_id, True)

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return receipt_token
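
# The arithmetic above (num_bytes = count * (size + 6)) is consistent with a
# per-record layout of a 4-byte index, the size-byte encoding, and a 2-byte
# popcount. A hypothetical sketch of that record format follows; the actual
# binary_format used by the service may differ.
import struct


def record_format_sketch(encoding_size: int) -> struct.Struct:
    # '!I{n}sH' = big-endian uint32 index + n raw bytes + uint16 popcount.
    return struct.Struct(f'!I{encoding_size}sH')


assert record_format_sketch(128).size == 128 + 6  # matches count * (size + 6)
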
Example #4
def project_delete(project_id):
    log, parent_span = bind_log_and_span(project_id)
    log.info('Request to delete project')
    # Check the resource exists and hasn't already been marked for deletion
    abort_if_project_doesnt_exist(project_id)

    # Check the caller has a valid results token. Yes it should be renamed.
    abort_if_invalid_results_token(project_id,
                                   request.headers.get('Authorization'))
    log.info("Marking project for deletion")

    with DBConn() as db_conn:
        db.mark_project_deleted(db_conn, project_id)

    log.info("Queuing authorized request to delete project resources")
    remove_project.delay(project_id, serialize_span(parent_span))

    return '', 204
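
# A hypothetical client-side call to the delete handler above, assuming the
# endpoint is exposed as DELETE /projects/<project_id> and authenticated with
# the project's results token (the exact route is an assumption here).
import requests


def delete_project_sketch(base_url: str, project_id: str, results_token: str) -> None:
    response = requests.delete(
        f"{base_url}/projects/{project_id}",
        headers={'Authorization': results_token},
    )
    # The handler returns 204 No Content once the project is marked for deletion.
    response.raise_for_status()
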
Example #5
def post(project_id, run):
    log, span = bind_log_and_span(project_id)
    log.debug("Processing request to add a new run", run=run)
    # Check the resource exists
    abort_if_project_doesnt_exist(project_id)

    # Check the caller has a valid results token. Yes it should be renamed.
    abort_if_invalid_results_token(project_id,
                                   request.headers.get('Authorization'))

    abort_if_project_in_error_state(project_id)

    run_model = Run.from_json(run, project_id)

    log.debug("Saving run")

    with db.DBConn() as db_conn:
        run_model.save(db_conn)

    check_for_executable_runs.delay(project_id, serialize_span(span))
    return RunDescription().dump(run_model), 201
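
# A hypothetical client-side request to the run-creation handler above,
# assuming it is exposed as POST /projects/<project_id>/runs and that a run is
# described by a JSON body such as {'threshold': ...}; both the route and the
# body shape are assumptions for illustration.
import requests


def create_run_sketch(base_url: str, project_id: str, results_token: str,
                      threshold: float = 0.9) -> dict:
    response = requests.post(
        f"{base_url}/projects/{project_id}/runs",
        json={'threshold': threshold},
        headers={'Authorization': results_token},
    )
    response.raise_for_status()
    return response.json()  # the serialized RunDescription, returned with 201
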
Example #6
def post(project_id, run):
    log = logger.bind(pid=project_id)
    log.debug("Processing request to add a new run", run=run)
    # Check the resource exists
    abort_if_project_doesnt_exist(project_id)

    # Check the caller has a valid results token. Yes it should be renamed.
    abort_if_invalid_results_token(project_id,
                                   request.headers.get('Authorization'))

    abort_if_project_in_error_state(project_id)

    run_model = Run.from_json(run, project_id)

    log.debug("Saving run")

    with db.DBConn() as db_conn:
        run_model.save(db_conn)
        project_object = db.get_project(db_conn, project_id)
        parties_contributed = db.get_number_parties_uploaded(
            db_conn, project_id)
        ready_to_run = parties_contributed == project_object['parties']
        log.debug(
            "Expecting {} parties to upload data. Have received {}".format(
                project_object['parties'], parties_contributed))
        if ready_to_run:
            log.info(
                "Scheduling task to carry out all runs for project {} now".
                format(project_id))
            update_run_mark_queued(db_conn, run_model.run_id)
        else:
            log.info("Task queued but won't start until CLKs are all uploaded")

    if ready_to_run:
        span = g.flask_tracer.get_span()
        span.set_tag("run_id", run_model.run_id)
        span.set_tag("project_id", run_model.project_id)
        check_for_executable_runs.delay(project_id, serialize_span(span))
    return RunDescription().dump(run_model), 201
Example #7
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    log = logger.bind(pid=project_id)
    headers = request.headers

    parent_span = g.flask_tracer.get_span()

    with opentracing.tracer.start_span('check-auth',
                                       child_of=parent_span) as span:
        abort_if_project_doesnt_exist(project_id)
        if headers is None or 'Authorization' not in headers:
            safe_fail_request(401, message="Authentication token required")

        token = headers['Authorization']

        # Check the caller has valid token -> otherwise 403
        abort_if_invalid_dataprovider_token(token)

    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")
    receipt_token = None

    with opentracing.tracer.start_span('upload-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        if headers['Content-Type'] == "application/json":
            span.set_tag("content-type", 'json')
            # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing the json in one hit. This
            #       enables running the web frontend with less memory.
            #       However, as connexion is very, very strict about input validation when it comes to json, it will always
            #       consume the stream first to validate it against the spec. Thus the backflip to fully reading the CLKs as
            #       json into memory. -> issue #184

            receipt_token, raw_file = upload_json_clk_data(
                dp_id, get_json(), span)
            # Schedule a task to deserialize the hashes, and carry
            # out a pop count.
            handle_raw_upload.delay(project_id,
                                    dp_id,
                                    receipt_token,
                                    parent_span=serialize_span(span))
            log.info("Job scheduled to handle user uploaded hashes")
        elif headers['Content-Type'] == "application/octet-stream":
            span.set_tag("content-type", 'binary')
            log.info("Handling binary CLK upload")
            try:
                count, size = check_binary_upload_headers(headers)
                log.info(
                    f"Headers tell us to expect {count} encodings of {size} bytes"
                )
                span.log_kv({'count': count, 'size': size})
            except Exception:
                log.warning(
                    "Upload failed due to problem with headers in binary upload"
                )
                raise
            # Check against project level encoding size (if it has been set)
            if project_encoding_size is not None and size != project_encoding_size:
                # fail fast - we haven't stored the encoded data yet
                return safe_fail_request(
                    400, "Upload 'Hash-Size' doesn't match project settings")

            # TODO actually stream the upload data straight to Minio. Currently we can't because
            # connexion has already read the data before our handler is called!
            # https://github.com/zalando/connexion/issues/592
            # stream = get_stream()
            stream = BytesIO(request.data)
            log.debug(
                f"Stream size is {len(request.data)} B, and we expect {(6 + size)* count} B"
            )
            if len(request.data) != (6 + size) * count:
                safe_fail_request(
                    400,
                    "Uploaded data did not match the expected size. Check request headers are correct"
                )
            try:
                receipt_token = upload_clk_data_binary(project_id, dp_id,
                                                       stream, count, size)
            except ValueError:
                safe_fail_request(
                    400,
                    "Uploaded data did not match the expected size. Check request headers are correct."
                )
        else:
            safe_fail_request(400, "Content Type not supported")

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
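
# A hypothetical client-side binary upload against the handler above. The
# handler expects len(body) == (6 + size) * count, i.e. each record already
# carries the 6 bytes of index/popcount overhead, and it reads the expected
# count and encoding size from upload headers. The header names 'Hash-Count'
# and 'Hash-Size' are inferred from check_binary_upload_headers and the error
# messages; the route is an assumption.
import requests


def upload_binary_clks_sketch(base_url: str, project_id: str, upload_token: str,
                              body: bytes, count: int, size: int) -> str:
    assert len(body) == (6 + size) * count, "body must already be in record format"
    response = requests.post(
        f"{base_url}/projects/{project_id}/clks",
        data=body,
        headers={
            'Authorization': upload_token,
            'Content-Type': 'application/octet-stream',
            'Hash-Count': str(count),
            'Hash-Size': str(size),
        },
    )
    response.raise_for_status()
    return response.json()['receipt_token']
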
def pull_external_data(project_id, dp_id,
                       encoding_object_info,
                       blocks_object_info,
                       receipt_token, parent_span=None):
    """
    Load encoding and blocking data from the object store.

    - pull the blocking map into memory and create the block entries in the db
    - stream the encodings into the db, attaching the block ids from the in-memory map

    :param project_id: identifier for the project
    :param dp_id: identifier for the data provider
    :param encoding_object_info: a dictionary containing the bucket and path of the uploaded encodings
    :param blocks_object_info: a dictionary containing the bucket and path of the uploaded blocks
    :param receipt_token: token used to insert into the database

    """
    env_credentials = parse_minio_credentials({
        'AccessKeyId': config.MINIO_ACCESS_KEY,
        'SecretAccessKey': config.MINIO_SECRET_KEY
    })
    log = logger.bind(pid=project_id, dp_id=dp_id)
    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.info("Project deleted, stopping immediately")
            return

        mc = connect_to_object_store(env_credentials)

    log.debug("Pulling blocking information from object store")
    response = mc.get_object(bucket_name=blocks_object_info['bucket'], object_name=blocks_object_info['path'])
    encoding_to_block_map = json.load(response)['blocks']

    log.debug("Counting the blocks")
    block_sizes = {}
    for encoding_id in encoding_to_block_map:
        _blocks = encoding_to_block_map[encoding_id]
        for block_id in _blocks:
            block_id = str(block_id)
            block_sizes[block_id] = block_sizes.get(block_id, 0) + 1

    block_count = len(block_sizes)
    log.debug(f"Processing {block_count} blocks")

    # stream the encodings
    bucket_name = encoding_object_info['bucket']
    object_name = encoding_object_info['path']

    stat, encodings_stream = stat_and_stream_object(bucket_name, object_name, env_credentials)
    count = int(stat.metadata['X-Amz-Meta-Hash-Count'])
    size = int(stat.metadata['X-Amz-Meta-Hash-Size'])
    log.debug(f"Processing {count} encodings of size {size}")
    assert count == len(encoding_to_block_map), f"Expected {count} encodings in blocks got {len(encoding_to_block_map)}"

    with DBConn() as conn:
        with opentracing.tracer.start_span('update-metadata-db', child_of=parent_span):
            insert_encoding_metadata(conn, None, dp_id, receipt_token, encoding_count=count, block_count=block_count)
            update_encoding_metadata_set_encoding_size(conn, dp_id, size)
        with opentracing.tracer.start_span('create-block-entries-in-db', child_of=parent_span):
            log.debug("Adding blocks to db")
            insert_blocking_metadata(conn, dp_id, block_sizes)

        def ijson_encoding_iterator(encoding_stream):
            binary_formatter = binary_format(size)
            for encoding_id, encoding in zip(range(count), encoding_stream):
                yield (
                    str(encoding_id),
                    binary_formatter.pack(encoding_id, deserialize_bytes(encoding)),
                    encoding_to_block_map[str(encoding_id)]
                    )

        def encoding_iterator(encoding_stream):
            binary_formatter = binary_format(size)
            for encoding_id in range(count):
                yield (
                    str(encoding_id),
                    binary_formatter.pack(encoding_id, encoding_stream.read(size)),
                    encoding_to_block_map[str(encoding_id)]
                    )

        if object_name.endswith('.json'):
            encodings_stream = ijson.items(io.BytesIO(encodings_stream.data), 'clks.item')
            encoding_generator = ijson_encoding_iterator(encodings_stream)
        else:
            encoding_generator = encoding_iterator(encodings_stream)

        with opentracing.tracer.start_span('upload-encodings-to-db', child_of=parent_span):
            log.debug("Adding encodings and associated blocks to db")
            try:
                store_encodings_in_db(conn, dp_id, encoding_generator, size)
            except Exception as e:
                update_dataprovider_uploaded_state(conn, project_id, dp_id, 'error')
                log.warning(e)

        with opentracing.tracer.start_span('update-encoding-metadata', child_of=parent_span):
            update_encoding_metadata(conn, None, dp_id, 'ready')
            update_blocks_state(conn, dp_id, block_sizes.keys(), 'ready')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id, serialize_span(parent_span))
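
# The blocking object pulled above maps each encoding id to the list of block
# ids it belongs to, e.g. {"blocks": {"0": ["b1"], "1": ["b1", "b2"], ...}}.
# A tiny, self-contained illustration of the block-size counting performed in
# pull_external_data:
import json

blocks_json = json.loads('{"blocks": {"0": ["b1"], "1": ["b1", "b2"], "2": ["b2"]}}')
encoding_to_block_map = blocks_json['blocks']

block_sizes = {}
for encoding_id, block_ids in encoding_to_block_map.items():
    for block_id in map(str, block_ids):
        block_sizes[block_id] = block_sizes.get(block_id, 0) + 1

assert block_sizes == {'b1': 2, 'b2': 2}
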
Example #9
def handle_encoding_upload_json(project_id, dp_id, clk_json, receipt_token,
                                uses_blocking, parent_span):
    """
    Take user-provided upload information, accepting multiple formats, and eventually
    ingest it into the database.

    Encodings uploaded directly in the JSON are first quarantined in the object store,
    and a background task deserializes them.

    Encodings that are in an object store are streamed directly into the database by
    a background task.
    """
    log = logger.bind(pid=project_id)
    log.info("Checking json is consistent")
    try:
        abort_if_inconsistent_upload(uses_blocking, clk_json)
    except ValueError as e:
        safe_fail_request(403, e.args[0])

    if "encodings" in clk_json and 'file' in clk_json['encodings']:
        # external encodings
        log.info("External encodings uploaded")
        encoding_object_info = clk_json['encodings']['file']
        object_name = encoding_object_info['path']
        _check_object_path_allowed(project_id, dp_id, object_name, log)

        encoding_credentials = clk_json['encodings'].get('credentials')
        # Schedule a background task to pull the encodings from the object store
        # This background task updates the database with encoding metadata assuming
        # that there are no blocks.
        if 'blocks' not in clk_json:
            log.info("scheduling task to pull encodings from object store")
            pull_external_data_encodings_only.delay(
                project_id,
                dp_id,
                encoding_object_info,
                encoding_credentials,
                receipt_token,
                parent_span=serialize_span(parent_span))
        else:
            # Need to deal with both encodings and blocks
            if 'file' in clk_json['blocks']:
                object_name = clk_json['blocks']['file']['path']
                _check_object_path_allowed(project_id, dp_id, object_name, log)
                # Blocks are in an external file
                blocks_object_info = clk_json['blocks']['file']
                blocks_credentials = clk_json['blocks'].get('credentials')
                log.info(
                    "scheduling task to pull both encodings and blocking data from object store"
                )
                pull_external_data.delay(
                    project_id,
                    dp_id,
                    encoding_object_info,
                    encoding_credentials,
                    blocks_object_info,
                    blocks_credentials,
                    receipt_token,
                    parent_span=serialize_span(parent_span))
            else:
                raise NotImplementedError(
                    "Don't currently handle combination of external encodings and blocks"
                )

        return

    # Convert uploaded JSON to common schema.
    #
    # The original JSON API simply accepted "clks", then came a combined encoding and
    # blocking API expecting the top level element "clknblocks". Finally an API that
    # specifies both "encodings" and "blocks" independently at the top level.
    #
    # We rewrite all into the "clknblocks" format.
    if "encodings" in clk_json:
        logger.debug(
            "converting from 'encodings' & 'blocks' format to 'clknblocks'")
        clk_json = convert_encoding_upload_to_clknblock(clk_json)

    is_valid_clks = not uses_blocking and 'clks' in clk_json
    element = 'clks' if is_valid_clks else 'clknblocks'

    if len(clk_json[element]) < 1:
        safe_fail_request(400, message="Missing CLKs information")

    filename = Config.RAW_FILENAME_FMT.format(receipt_token)
    logger.info("Storing user {} supplied {} from json".format(dp_id, element))

    with opentracing.tracer.start_span('splitting-json-clks',
                                       child_of=parent_span) as span:
        encoding_count = len(clk_json[element])
        span.set_tag(element, encoding_count)
        logger.debug(f"Received {encoding_count} {element}")

    if element == 'clks':
        logger.info("Rewriting provided json into clknsblocks format")
        clk_json = convert_clks_to_clknblocks(clk_json)
        element = 'clknblocks'

    logger.info("Counting block sizes and number of blocks")
    # {'clknblocks': [['UG9vcA==', '001', '211'], [...]]}
    block_sizes = {}
    for _, *elements_blocks in clk_json[element]:
        for el_block in elements_blocks:
            block_sizes[el_block] = block_sizes.get(el_block, 0) + 1
    block_count = len(block_sizes)

    logger.info(f"Received {encoding_count} encodings in {block_count} blocks")
    for block in block_sizes:
        logger.info(f"Block {block} has {block_sizes[block]} elements")

    # write clk_json into a temp file
    tmp = tempfile.NamedTemporaryFile(mode='w')
    json.dump(clk_json, tmp)
    tmp.flush()
    with opentracing.tracer.start_span('save-clk-file-to-quarantine',
                                       child_of=parent_span) as span:
        span.set_tag('filename', filename)
        mc = connect_to_object_store()
        mc.fput_object(Config.MINIO_BUCKET,
                       filename,
                       tmp.name,
                       content_type='application/json')
    logger.info('Saved uploaded {} JSON to file {} in object store.'.format(
        element.upper(), filename))

    with opentracing.tracer.start_span('update-encoding-metadata',
                                       child_of=parent_span):
        with DBConn() as conn:
            db.insert_encoding_metadata(conn, filename, dp_id, receipt_token,
                                        encoding_count, block_count)
            db.insert_blocking_metadata(conn, dp_id, block_sizes)

    # Schedule a task to deserialize the encodings
    handle_raw_upload.delay(project_id,
                            dp_id,
                            receipt_token,
                            parent_span=serialize_span(parent_span))
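
# Sketches of the three JSON upload shapes this handler distinguishes, inferred
# from the branches above. The field names come from the code; the concrete
# values are illustrative placeholders only.

# 1. External encodings held in an object store (optionally with a 'blocks' entry):
external_upload = {
    'encodings': {
        'file': {'bucket': 'upload-bucket', 'path': 'some/object/path'},
        'credentials': {'AccessKeyId': '...', 'SecretAccessKey': '...'},
    },
}

# 2. The original "clks" format (no blocking):
clks_upload = {'clks': ['UG9vcA==', 'UG9vcA==']}

# 3. The combined "clknblocks" format: each entry is [encoding, block_id, ...].
clknblocks_upload = {'clknblocks': [['UG9vcA==', '001', '211'], ['UG9vcA==', '001']]}
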
Example #10
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """

    headers = request.headers

    log, parent_span = bind_log_and_span(project_id)
    log.debug("Starting data upload request")
    token = precheck_upload_token(project_id, headers, parent_span)
    receipt_token = generate_code()
    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(
            conn, dp_id)
        # get flag use_blocking from table projects
        uses_blocking = get_project_column(conn, project_id, 'uses_blocking')

    if not upload_state_updated:
        return safe_fail_request(
            403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")

    with opentracing.tracer.start_span('upload-clk-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/json":
                span.set_tag("content-type", 'json')
                # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing the json in one hit. This
                #       enables running the web frontend with less memory.
                #       However, as connexion is very, very strict about input validation when it comes to json, it will always
                #       consume the stream first to validate it against the spec. Thus the backflip to fully reading the CLKs as
                #       json into memory. -> issue #184
                handle_encoding_upload_json(project_id,
                                            dp_id,
                                            get_json(),
                                            receipt_token,
                                            uses_blocking,
                                            parent_span=span)

                log.info("Job scheduled to handle users upload")
            elif headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(
                        f"Headers tell us to expect {count} encodings of {size} bytes"
                    )
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning(
                        "Upload failed due to problem with headers in binary upload"
                    )
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(
                        400,
                        "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)
                expected_bytes = binary_format(size).size * count
                log.debug(
                    f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B"
                )
                if len(request.data) != expected_bytes:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct"
                    )
                try:
                    upload_clk_data_binary(project_id, dp_id, stream,
                                           receipt_token, count, size)
                except ValueError:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct."
                    )
            else:
                safe_fail_request(400, "Content Type not supported")
        except ProblemException as e:
            # Have an exception that is safe for the user. We reset the upload state to
            # allow the user to try upload again.
            log.info(
                f"Problem occurred, returning status={e.status} - {e.detail}")
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn,
                                                 dp_id,
                                                 state='not_started')
            raise
        except Exception as e:
            log.warning("Unhandled error occurred during data upload")
            log.exception(e)
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            safe_fail_request(
                500, "Sorry, the server couldn't handle that request")

    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
Example #11
def project_binaryclks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """
    log, parent_span = bind_log_and_span(project_id)
    headers = request.headers
    token = precheck_upload_token(project_id, headers, parent_span)

    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(
            conn, dp_id)

    if not upload_state_updated:
        return safe_fail_request(
            403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")
    receipt_token = generate_code()

    with opentracing.tracer.start_span('upload-clk-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(
                        f"Headers tell us to expect {count} encodings of {size} bytes"
                    )
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning(
                        "Upload failed due to problem with headers in binary upload"
                    )
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(
                        400,
                        "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)

                converted_stream = include_encoding_id_in_binary_stream(
                    stream, size, count)

                expected_bytes = size * count
                log.debug(
                    f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B"
                )
                if len(request.data) != expected_bytes:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct"
                    )
                try:
                    upload_clk_data_binary(project_id, dp_id, converted_stream,
                                           receipt_token, count, size)
                except ValueError:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct."
                    )
            else:
                safe_fail_request(400, "Content Type not supported")
        except Exception:
            log.warning(
                "The dataprovider was not able to upload their clks,"
                " re-enable the corresponding upload token to be used.")

            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            raise
    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
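
# Note the different size checks in the two binary handlers above:
# project_clks_post expects the client to send full records
# (binary_format(size).size * count bytes), while project_binaryclks_post
# expects raw encodings only (size * count bytes) and prepends the encoding ids
# itself via include_encoding_id_in_binary_stream. A small arithmetic check
# under the assumed '!I{n}sH' record layout (index + encoding + popcount):
import struct

size, count = 128, 1000
record_size = struct.Struct(f'!I{size}sH').size  # 4 + size + 2 == size + 6
assert record_size * count == (size + 6) * count   # project_clks_post expectation
assert size * count < record_size * count          # project_binaryclks_post receives raw encodings
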