Example #1
def get_result(dbinstance, project_id, run_id, token):
    result_type = db.get_project_column(dbinstance, project_id, 'result_type')
    auth_token_type = get_authorization_token_type_or_abort(project_id, token)

    if result_type == 'mapping':
        logger.info("Mapping result being returned")
        result = db.get_run_result(dbinstance, run_id)
        return {"mapping": result}

    elif result_type == 'groups':
        logger.info("Groups result being returned")
        result = db.get_run_result(dbinstance, run_id)
        return {"groups": result}

    elif result_type == 'similarity_scores':
        logger.info("Similarity result being returned")
        return get_similarity_score_result(dbinstance, run_id)

    elif result_type == 'permutations':
        logger.info("Permutation result being returned")
        return get_permutations_result(project_id, run_id, dbinstance, token,
                                       auth_token_type)
    else:
        logger.warning("Unimplemented result type")
        safe_fail_request(500, message='Project has unknown result type')
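
Every branch that cannot produce a result aborts through safe_fail_request. That helper is not part of these excerpts; the following is a minimal sketch of one way it could be written, assuming it raises connexion's ProblemException (the exception type Example #10 catches) so the framework turns the abort into a JSON error response. The exact signature is an assumption for illustration.

# Hedged sketch only: the real safe_fail_request is not shown in these examples.
from connexion.exceptions import ProblemException


def safe_fail_request(status_code, message):
    # connexion converts the raised ProblemException into an RFC 7807 "problem"
    # JSON response with the requested status code, which also unwinds the
    # request handler at the call site.
    raise ProblemException(status=status_code, title='Error', detail=message)
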
Example #2
def get_authorization_token_type_or_abort(project_id, token):
    """
    In case of a permutation with an unencrypted mask, we are using both the result token and the receipt tokens.
    The result token reveals the mask. The receipt tokens are used by the dataproviders to get their permutations.
    However, we do not know the type of token we have before checking.
    """
    logger.debug("checking if provided authorization is a results_token")
    # If the token is not a valid result token, it should be a receipt token.
    if not is_results_token_valid(project_id, token):
        logger.debug("checking if provided authorization is receipt_token")
        # If the token is not a valid receipt token, we abort.
        if not is_receipt_token_valid(project_id, token):
            safe_fail_request(403, message=INVALID_ACCESS_MSG)
        token_type = 'receipt_token'
    else:
        token_type = 'result_token'

    # Note that at this stage we have EITHER a receipt or result token, and depending on the result_type
    # that might mean the caller is not authorized.
    with DBConn() as conn:
        result_type = get_project_column(conn, project_id, 'result_type')
    if result_type in {'groups', 'similarity_scores'
                       } and token_type == 'receipt_token':
        logger.info("Caller provided receipt token to get results")
        safe_fail_request(403, message=INVALID_ACCESS_MSG)
    return token_type
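
The two validity checks above, is_results_token_valid and is_receipt_token_valid, are not shown either. One plausible shape for the first one, assuming the project's result token is stored in a database column (the column name 'access_token' is an assumption) and reusing the DBConn and get_project_column helpers that appear throughout these examples:

# Hypothetical sketch of a token check; the stored-column name is an assumption.
import hmac


def is_results_token_valid(project_id, token):
    if not token:
        return False
    with DBConn() as conn:
        stored_token = get_project_column(conn, project_id, 'access_token')
    # compare_digest avoids leaking information about the stored token via timing.
    return stored_token is not None and hmac.compare_digest(stored_token, token)
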
Example #3
def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    mapping = similarity_result['mapping']

    # Note Postgres requires JSON object keys to be strings
    # Celery actually converts the json arguments in the same way

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Just save the raw "mapping"
        log.debug("Saving the resulting map data to the db")
        result_id = insert_mapping_result(db, run_id, mapping)
        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Mapping result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        permute_mapping_data.apply_async(
            (project_id, run_id,
             similarity_result['lenf1'], similarity_result['lenf2'],
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark mapping job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")

    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()
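
A note on the two dispatch styles used above: in Celery, task.delay(a, b) is shorthand for task.apply_async((a, b)); the explicit apply_async form also accepts execution options. A minimal, self-contained sketch with a hypothetical task (not one of the project's real tasks):

# Minimal Celery sketch; the task and broker URL are assumptions for illustration.
from celery import Celery

app = Celery('example', broker='redis://localhost:6379/0')


@app.task
def add(x, y):
    return x + y


# These two calls enqueue the same work:
add.delay(2, 3)
add.apply_async((2, 3))

# apply_async additionally accepts options such as a countdown or a queue:
add.apply_async((2, 3), countdown=10)
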
Example #4
    def __init__(self, project_id, threshold, name, notes):
        self.project_id = project_id
        self.name = name
        self.notes = notes
        self.threshold = threshold
        self.run_id = generate_code()
        logger.info("Created run id", rid=self.run_id)

        self.type = 'no_mapping' \
            if db.get_project_column(db.get_db(), project_id, 'result_type') == 'similarity_scores' \
            else 'default'
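
generate_code() above is referenced but not defined in these excerpts. Run identifiers only need to be unique and URL-safe, so a sketch along the following lines would do; this is an assumption, not the project's actual helper:

# Hypothetical sketch of generate_code.
import secrets


def generate_code(n_bytes=24):
    # token_hex(n_bytes) returns 2 * n_bytes hexadecimal characters,
    # giving a collision-resistant, URL-safe identifier.
    return secrets.token_hex(n_bytes)
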
Example #5
def clks_uploaded_to_project(project_id, check_data_ready=False):
    """ See if the given project has had all parties contribute data.
    """
    logger.info("Counting contributing parties")
    conn = connect_db()
    if check_data_ready:
        parties_contributed = get_number_parties_ready(conn, project_id)
        logger.info("Parties where data is ready: {}".format(parties_contributed))
    else:
        parties_contributed = get_number_parties_uploaded(conn, project_id)
        logger.info("Parties where data is uploaded: {}".format(parties_contributed))
    number_parties = get_project_column(conn, project_id, 'parties')
    logger.info("{}/{} parties have contributed clks".format(parties_contributed, number_parties))
    return parties_contributed == number_parties
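
A brief usage note: Example #10 below calls this with the default check straight after an upload to decide whether queued runs can be scheduled. A stricter caller could pass check_data_ready=True to wait until every party's data has also been validated; the task name in this sketch is made up for illustration:

# Hypothetical caller of the stricter check; compute_run is not one of the real tasks.
if clks_uploaded_to_project(project_id, check_data_ready=True):
    compute_run.delay(project_id, run_id)
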
Example #6
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        return
    mc = connect_to_object_store()
    files = []
    data_size = 0

    for num, filename in similarity_result_files:
        if num > 0:
            files.append(filename)
            data_size += mc.stat_object(Config.MINIO_BUCKET, filename).size

    log.debug("Aggregating result chunks from {} files, total size: {}".format(
        len(files), fmt_bytes(data_size)))

    result_file_stream_generator = (mc.get_object(Config.MINIO_BUCKET, result_filename) for result_filename in files)

    log.info("Similarity score results are {}".format(fmt_bytes(data_size)))
    result_stream = chain_streams(result_file_stream_generator)

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Note: Storing the similarity scores for all result types
        result_filename = store_similarity_scores(result_stream, run_id, data_size, db)

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)

        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            lenf1, lenf2 = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.info("Deleting intermediate similarity score files from object store")
        mc.remove_objects(Config.MINIO_BUCKET, files)
        log.debug("Removing clk filters from redis cache")
        remove_from_cache(dp_ids[0])
        remove_from_cache(dp_ids[1])

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(result_filename, project_id, run_id, lenf1, lenf2, aggregate_comparisons.get_serialized_span())
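
chain_streams above turns the generator of object-store responses into a single readable stream so the combined result can be stored in one pass. The project's implementation is not shown here; the following is a self-contained sketch of the same idea, concatenating any sequence of file-like objects:

# Hedged sketch: expose several file-like objects as one read()-able stream.
import io


class _ChainedStream(io.RawIOBase):
    def __init__(self, streams):
        self._streams = iter(streams)
        self._current = None

    def readable(self):
        return True

    def readinto(self, buffer):
        while True:
            if self._current is None:
                self._current = next(self._streams, None)
                if self._current is None:
                    return 0  # every underlying stream is exhausted
            chunk = self._current.read(len(buffer))
            if chunk:
                buffer[:len(chunk)] = chunk
                return len(chunk)
            # current stream is done; move on to the next one
            self._current.close()
            self._current = None


def chain_streams(streams):
    # BufferedReader gives callers an efficient read()/readline() interface.
    return io.BufferedReader(_ChainedStream(streams))
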
Example #7
def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    groups = similarity_result['groups']

    # Note Postgres requires JSON object keys to be strings
    # Celery actually converts the json arguments in the same way

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        if result_type == "groups":
            # Save the raw groups
            log.debug("Saving the groups in the DB")
            result_id = insert_mapping_result(db, run_id, groups)
        else:
            # Turn groups into mapping and save that
            log.debug("Turning groups into mapping")
            mapping = groups_to_mapping(groups)
            log.debug("Saving mappuing in the DB")
            result_id = insert_mapping_result(db, run_id, mapping)

        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        dataset0_size, dataset1_size = similarity_result['datasetSizes']
        permute_mapping_data.apply_async(
            (project_id, run_id, dataset0_size, dataset1_size,
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")

    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()
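
groups_to_mapping above converts the matched groups into the older mapping format. A plausible sketch, assuming each group is a collection of (dataset_index, record_index) pairs and that a mapping is only meaningful for the two-party case; both assumptions are for illustration only:

# Hypothetical sketch of groups_to_mapping.
def groups_to_mapping(groups):
    mapping = {}
    for group in groups:
        by_dataset = dict(group)  # dataset_index -> record_index
        if 0 in by_dataset and 1 in by_dataset:
            # Postgres requires JSON object keys to be strings (see the note above).
            mapping[str(by_dataset[0])] = by_dataset[1]
    return mapping
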
Example #8
def aggregate_comparisons(similarity_result_files,
                          project_id,
                          run_id,
                          parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        raise TypeError("Inappropriate argument type - missing results files.")

    files = []
    for res in similarity_result_files:
        if res is None:
            log.warning(
                "Missing results during aggregation. Stopping processing.")
            raise TypeError(
                "Inappropriate argument type - results missing at aggregation step."
            )
        num, filesize, filename = res
        if num:
            assert filesize is not None
            assert filename is not None
            files.append((num, filesize, filename))
        else:
            assert filesize is None
            assert filename is None
    heapq.heapify(files)

    log.debug(f"Aggregating result chunks from {len(files)} files, "
              f"total size: {sum(map(operator.itemgetter(1), files))}")

    mc = connect_to_object_store()
    while len(files) > 1:
        file0 = heapq.heappop(files)
        file1 = heapq.heappop(files)
        merged_file = _merge_files(mc, log, file0, file1)
        heapq.heappush(files, merged_file)

    if not files:
        # No results. Let's chuck in an empty file.
        empty_file = _put_placeholder_empty_file(mc, log)
        files.append(empty_file)

    (merged_num, merged_filesize, merged_filename), = files
    log.info(f"Similarity score results in {merged_filename} in bucket "
             f"{Config.MINIO_BUCKET} take up {merged_filesize} bytes.")

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')
        result_id = insert_similarity_score_file(db, run_id, merged_filename)
        log.debug(f"Saved path to similarity scores file to db with id "
                  f"{result_id}")

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)

        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            dataset_sizes = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.debug("Removing clk filters from redis cache")
        for dp_id in dp_ids:
            remove_from_cache(dp_id)

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id,
                                aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(merged_filename, project_id, run_id, dataset_sizes,
                          aggregate_comparisons.get_serialized_span())
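
The heap above always pops and merges the two smallest remaining chunks first (tuples compare by their first element, here the result count), which keeps large intermediate files from being rewritten more often than necessary. A standalone sketch of the same smallest-first pattern on in-memory byte strings, ordered by size instead (hypothetical, not the project's _merge_files):

# Illustrative sketch of smallest-first pairwise merging with a heap.
import heapq


def merge_smallest_first(chunks):
    """chunks: list of (size, name, data) tuples; returns the single merged tuple."""
    heapq.heapify(chunks)
    while len(chunks) > 1:
        size0, name0, data0 = heapq.heappop(chunks)
        size1, name1, data1 = heapq.heappop(chunks)
        heapq.heappush(chunks, (size0 + size1, name0 + '+' + name1, data0 + data1))
    return chunks[0]


# Example: three similarity-score chunks of different sizes.
merged = merge_smallest_first([(5, 'a', b'aaaaa'), (2, 'b', b'bb'), (3, 'c', b'ccc')])
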
Example #9
def get(project_id, run_id):
    log = logger.bind(pid=project_id, rid=run_id)
    parent_span = g.flask_tracer.get_span()
    log.debug("request run status")
    with opentracing.tracer.start_span('check-auth',
                                       child_of=parent_span) as span:
        # Check the project and run resources exist
        abort_if_run_doesnt_exist(project_id, run_id)

        # Check the caller has a valid results token. Yes it should be renamed.
        auth_token_type = get_authorization_token_type_or_abort(
            project_id, request.headers.get('Authorization'))
        log.debug(
            "Run status authorized using {} token".format(auth_token_type))

    with opentracing.tracer.start_span('get-status-from-db',
                                       child_of=parent_span) as span:
        dbinstance = get_db()
        run_status = db.get_run_status(dbinstance, run_id)
        project_in_error = db.get_encoding_error_count(dbinstance,
                                                       project_id) > 0
        span.set_tag('stage', run_status['stage'])

    run_type = RUN_TYPES[run_status['type']]
    state = 'error' if project_in_error else run_status['state']
    stage = run_status['stage']
    status = {
        "state": state,
        "time_added": run_status['time_added'],
        "stages": run_type['stages'],
        "current_stage": {
            "number":
            stage,
            "description":
            run_type['stage_descriptions'].get(
                stage, "there is no description for this stage")
        }
    }
    # trying to get progress if available
    if stage == 1:
        # waiting for CLKs
        abs_val = db.get_number_parties_uploaded(dbinstance, project_id)
        max_val = db.get_project_column(dbinstance, project_id, 'parties')
    elif stage == 2:
        # Computing similarity
        abs_val = cache.get_progress(run_id)
        if abs_val is not None:
            max_val = db.get_total_comparisons_for_project(
                dbinstance, project_id)
    else:
        # Solving for mapping (no progress)
        abs_val = None
    if abs_val is not None:
        progress = {
            'absolute': abs_val,
            'relative': (abs_val / max_val) if max_val != 0 else 0,
        }
        if progress['relative'] > 1.0:
            log.warning('Relative progress exceeded 100%: abs: {}, max: {}'.format(
                abs_val, max_val))
        if run_status['stage'] in run_type['stage_progress_descriptions']:
            progress['description'] = run_type['stage_progress_descriptions'][
                run_status['stage']]
        status["current_stage"]["progress"] = progress
    if state == 'completed':
        status["time_started"] = run_status['time_started']
        status["time_completed"] = run_status['time_completed']
        return completed().dump(status)
    elif state == 'running' or state == 'queued' or state == 'created':
        status["time_started"] = run_status['time_started']
        return running().dump(status)
    elif state == 'error':
        log.warning(
            'handling the run status for state "error" is not implemented')
        return error().dump(status)
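
Example #9 reads all of its stage metadata from a RUN_TYPES constant. The real table is not included in these excerpts; from how the dict is accessed (and the 'no_mapping'/'default' types created in Example #4), a plausible shape is the following, where the stage counts and descriptions are assumptions:

# Hypothetical reconstruction of RUN_TYPES, inferred from how it is accessed above.
RUN_TYPES = {
    'default': {
        'stages': 3,
        'stage_descriptions': {
            1: 'waiting for CLKs',
            2: 'computing similarity scores',
            3: 'solving the linkage',
        },
        'stage_progress_descriptions': {
            1: 'number of parties that have uploaded data',
            2: 'number of comparisons computed',
        },
    },
    'no_mapping': {
        'stages': 2,
        'stage_descriptions': {
            1: 'waiting for CLKs',
            2: 'computing similarity scores',
        },
        'stage_progress_descriptions': {
            1: 'number of parties that have uploaded data',
            2: 'number of comparisons computed',
        },
    },
}
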
Example #10
def project_clks_post(project_id):
    """
    Update a project to provide encoded PII data.
    """

    headers = request.headers

    log, parent_span = bind_log_and_span(project_id)
    log.debug("Starting data upload request")
    token = precheck_upload_token(project_id, headers, parent_span)
    receipt_token = generate_code()
    with DBConn() as conn:
        dp_id = db.get_dataprovider_id(conn, token)
        project_encoding_size = db.get_project_schema_encoding_size(
            conn, project_id)
        upload_state_updated = db.is_dataprovider_allowed_to_upload_and_lock(
            conn, dp_id)
        # get the uses_blocking flag from the projects table
        uses_blocking = get_project_column(conn, project_id, 'uses_blocking')

    if not upload_state_updated:
        return safe_fail_request(
            403, "This token has already been used to upload clks.")

    log = log.bind(dp_id=dp_id)
    log.info("Receiving CLK data.")

    with opentracing.tracer.start_span('upload-clk-data',
                                       child_of=parent_span) as span:
        span.set_tag("project_id", project_id)
        try:
            if headers['Content-Type'] == "application/json":
                span.set_tag("content-type", 'json')
                # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing the json in one hit. This
                #       enables running the web frontend with less memory.
                #       However, as connexion is very, very strict about input validation when it comes to json, it will always
                #       consume the stream first to validate it against the spec. Thus the backflip to fully reading the CLKs as
                #       json into memory. -> issue #184
                handle_encoding_upload_json(project_id,
                                            dp_id,
                                            get_json(),
                                            receipt_token,
                                            uses_blocking,
                                            parent_span=span)

                log.info("Job scheduled to handle users upload")
            elif headers['Content-Type'] == "application/octet-stream":
                span.set_tag("content-type", 'binary')
                log.info("Handling binary CLK upload")
                try:
                    count, size = check_binary_upload_headers(headers)
                    log.info(
                        f"Headers tell us to expect {count} encodings of {size} bytes"
                    )
                    span.log_kv({'count': count, 'size': size})
                except Exception:
                    log.warning(
                        "Upload failed due to problem with headers in binary upload"
                    )
                    raise
                # Check against project level encoding size (if it has been set)
                if project_encoding_size is not None and size != project_encoding_size:
                    # fail fast - we haven't stored the encoded data yet
                    return safe_fail_request(
                        400,
                        "Upload 'Hash-Size' doesn't match project settings")

                # TODO actually stream the upload data straight to Minio. Currently we can't because
                # connexion has already read the data before our handler is called!
                # https://github.com/zalando/connexion/issues/592
                # stream = get_stream()
                stream = BytesIO(request.data)
                expected_bytes = binary_format(size).size * count
                log.debug(
                    f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B"
                )
                if len(request.data) != expected_bytes:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct"
                    )
                try:
                    upload_clk_data_binary(project_id, dp_id, stream,
                                           receipt_token, count, size)
                except ValueError:
                    safe_fail_request(
                        400,
                        "Uploaded data did not match the expected size. Check request headers are correct."
                    )
            else:
                safe_fail_request(400, "Content Type not supported")
        except ProblemException as e:
            # Have an exception that is safe for the user. We reset the upload state to
            # allow the user to try upload again.
            log.info(
                f"Problem occurred, returning status={e.status} - {e.detail}")
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn,
                                                 dp_id,
                                                 state='not_started')
            raise
        except Exception as e:
            log.warning("Unhandled error occurred during data upload")
            log.exception(e)
            with DBConn() as conn:
                db.set_dataprovider_upload_state(conn, dp_id, state='error')
            safe_fail_request(
                500, "Sorry, the server couldn't handle that request")

    with DBConn() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')

    # Now work out if all parties have added their data
    if clks_uploaded_to_project(project_id):
        logger.info("All parties data present. Scheduling any queued runs")
        check_for_executable_runs.delay(project_id,
                                        serialize_span(parent_span))

    return {'message': 'Updated', 'receipt_token': receipt_token}, 201
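
check_binary_upload_headers is referenced above but not part of the excerpt. Given that the error path mentions a 'Hash-Size' header, a hedged sketch could look like the following; the 'Hash-Count' header name is an assumption, and safe_fail_request is the same abort helper used throughout these examples:

# Hypothetical sketch of check_binary_upload_headers.
def check_binary_upload_headers(headers):
    if 'Hash-Count' not in headers or 'Hash-Size' not in headers:
        safe_fail_request(400, "Binary uploads require 'Hash-Count' and 'Hash-Size' headers")
    try:
        count = int(headers['Hash-Count'])
        size = int(headers['Hash-Size'])
    except ValueError:
        safe_fail_request(400, "'Hash-Count' and 'Hash-Size' must be integers")
    if count < 0 or size <= 0:
        safe_fail_request(400, "'Hash-Count' and 'Hash-Size' must be positive")
    return count, size
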