Example #1
def _retrieve_blocked_dataset_sizes_and_lookup(conn, project_id, dp_ids):
    """Fetch encoding counts for each dataset by block. And create lookup table from block_name to block_id

    :param dp_ids: Iterable of dataprovider database identifiers.
    :returns:
        A 3-tuple of:

        - dataset sizes: a tuple of the number of encodings in each dataset.

        - dp_block_sizes: A map from dataprovider id to a dict mapping
          block name to the number of encodings from the dataprovider in
          the block.

          {dp_id -> {block_name -> block_size}}
          e.g. {33: {'1': 100}, 34: {'1': 100}, 35: {'1': 100}}

        - dp_block_lookups: A map from dataprovider id to a dict mapping
          block name to its block id in the database.
    """
    dataset_sizes = get_project_dataset_sizes(conn, project_id)

    dp_block_sizes = {}
    dp_block_lookups = {}
    for dp_id in dp_ids:
        block_sizes = {}
        lookup = {}
        for block_name, block_id, count in get_block_metadata(conn, dp_id):
            block_sizes[block_name] = count
            lookup[block_name] = block_id
        dp_block_sizes[dp_id] = block_sizes
        dp_block_lookups[dp_id] = lookup
    return dataset_sizes, dp_block_sizes, dp_block_lookups
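
Below is a brief usage sketch, not from the service itself, showing how the two block maps returned above could be combined downstream, for example to resolve the block ids that a pair of dataproviders share. The helper name and sample data are hypothetical; only the dict shapes follow the docstring.

# Hypothetical usage sketch. Assumes the shapes documented above:
#   dp_block_sizes:   {dp_id: {block_name: count}}
#   dp_block_lookups: {dp_id: {block_name: block_id}}
def common_block_ids(dp_block_sizes, dp_block_lookups, dp_a, dp_b):
    """Map each block name both dataproviders share to its pair of block ids."""
    shared_names = dp_block_sizes[dp_a].keys() & dp_block_sizes[dp_b].keys()
    return {name: (dp_block_lookups[dp_a][name], dp_block_lookups[dp_b][name])
            for name in shared_names}

# Example with made-up data shaped like the docstring's example:
sizes = {33: {'1': 100}, 34: {'1': 100}}
lookups = {33: {'1': 7}, 34: {'1': 9}}
assert common_block_ids(sizes, lookups, 33, 34) == {'1': (7, 9)}
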
Example #2
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        return
    mc = connect_to_object_store()
    files = []
    data_size = 0

    for num, filename in similarity_result_files:
        if num > 0:
            files.append(filename)
            data_size += mc.stat_object(Config.MINIO_BUCKET, filename).size

    log.debug("Aggregating result chunks from {} files, total size: {}".format(
        len(files), fmt_bytes(data_size)))

    result_file_stream_generator = (mc.get_object(Config.MINIO_BUCKET, result_filename) for result_filename in files)

    log.info("Similarity score results are {}".format(fmt_bytes(data_size)))
    result_stream = chain_streams(result_file_stream_generator)

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Note: Storing the similarity scores for all result types
        result_filename = store_similarity_scores(result_stream, run_id, data_size, db)

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)

        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            lenf1, lenf2 = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.info("Deleting intermediate similarity score files from object store")
        # minio's remove_objects is evaluated lazily and yields any delete
        # errors; consume the iterator so the deletions are actually issued.
        for delete_error in mc.remove_objects(Config.MINIO_BUCKET, files):
            log.warning("Failed to delete {}".format(delete_error))
        log.debug("Removing clk filters from redis cache")
        remove_from_cache(dp_ids[0])
        remove_from_cache(dp_ids[1])

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(result_filename, project_id, run_id, lenf1, lenf2, aggregate_comparisons.get_serialized_span())
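
The chain_streams call above joins many object-store responses into a single readable stream. That helper is not shown here; purely as an illustration of the technique, a minimal stand-in (assuming ordinary binary file-like parts) could look like the following.

import io

class ChainedReader(io.RawIOBase):
    """Illustrative stand-in: present several readable streams as one."""

    def __init__(self, streams):
        self._streams = iter(streams)
        self._current = next(self._streams, None)

    def readable(self):
        return True

    def read(self, size=-1):
        if size < 0:
            # Drain every remaining stream.
            return b"".join(iter(lambda: self.read(io.DEFAULT_BUFFER_SIZE), b""))
        while self._current is not None:
            chunk = self._current.read(size)
            if chunk:
                return chunk
            # Current part exhausted; move on to the next one.
            self._current = next(self._streams, None)
        return b""

parts = [io.BytesIO(b"alpha,"), io.BytesIO(b"beta,"), io.BytesIO(b"gamma")]
assert ChainedReader(parts).read() == b"alpha,beta,gamma"
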
Example #3
def _retrieve_blocked_dataset_sizes(conn, project_id, dp_ids):
    """Fetch encoding counts for each dataset by block.

    :param dp_ids: Iterable of dataprovider database identifiers.
    :returns:
        A 2-tuple of:

        - dataset sizes: a tuple of the number of encodings in each dataset.

        - dp_block_sizes: A map from dataprovider id to a dict mapping
          block id to the number of encodings from the dataprovider in the
          block.

          {dp_id -> {block_id -> block_size}}
          e.g. {33: {'1': 100}, 34: {'1': 100}, 35: {'1': 100}}
    """
    dataset_sizes = get_project_dataset_sizes(conn, project_id)

    dp_block_sizes = {}
    for dp_id in dp_ids:
        dp_block_sizes[dp_id] = dict(get_block_metadata(conn, dp_id))
    return dataset_sizes, dp_block_sizes
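
As a side note, the {dp_id -> {block_id -> block_size}} map lends itself to estimating how many candidate comparisons blocking will generate between two dataproviders, since only blocks present in both datasets are compared. A small illustrative helper, not part of this module:

# Illustrative only: estimate candidate pairs between two dataproviders
# from the {dp_id: {block_id: block_size}} map returned above.
def estimate_comparisons(dp_block_sizes, dp_a, dp_b):
    blocks_a = dp_block_sizes[dp_a]
    blocks_b = dp_block_sizes[dp_b]
    return sum(blocks_a[block] * blocks_b[block]
               for block in blocks_a.keys() & blocks_b.keys())

# With the docstring's example data, a single shared block of 100 x 100:
assert estimate_comparisons({33: {'1': 100}, 34: {'1': 100}}, 33, 34) == 10000
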
Example #4
def aggregate_comparisons(similarity_result_files,
                          project_id,
                          run_id,
                          parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        raise TypeError("Inappropriate argument type - missing results files.")

    files = []
    for res in similarity_result_files:
        if res is None:
            log.warning(
                "Missing results during aggregation. Stopping processing.")
            raise TypeError(
                "Inappropriate argument type - results missing at aggregation step."
            )
        num, filesize, filename = res
        if num:
            assert filesize is not None
            assert filename is not None
            files.append((num, filesize, filename))
        else:
            assert filesize is None
            assert filename is None
    heapq.heapify(files)

    log.debug(f"Aggregating result chunks from {len(files)} files, "
              f"total size: {sum(map(operator.itemgetter(1), files))}")

    mc = connect_to_object_store()
    while len(files) > 1:
        file0 = heapq.heappop(files)
        file1 = heapq.heappop(files)
        merged_file = _merge_files(mc, log, file0, file1)
        heapq.heappush(files, merged_file)

    if not files:
        # No results. Let's chuck in an empty file.
        empty_file = _put_placeholder_empty_file(mc, log)
        files.append(empty_file)

    (merged_num, merged_filesize, merged_filename), = files
    log.info(f"Similarity score results in {merged_filename} in bucket "
             f"{Config.MINIO_BUCKET} take up {merged_filesize} bytes.")

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')
        result_id = insert_similarity_score_file(db, run_id, merged_filename)
        log.debug(f"Saved path to similarity scores file to db with id "
                  f"{result_id}")

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)

        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            dataset_sizes = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.debug("Removing clk filters from redis cache")
        for dp_id in dp_ids:
            remove_from_cache(dp_id)

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id,
                                aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(merged_filename, project_id, run_id, dataset_sizes,
                          aggregate_comparisons.get_serialized_span())
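
The merge loop above repeatedly pops the two smallest results off a heap, merges them, and pushes the merged result back until only one file remains; taking the smallest pair first keeps the total data re-read and re-written low, the classic optimal-merge idea. A self-contained sketch of that pattern, with (size, payload) tuples standing in for the (num, filesize, filename) entries and plain concatenation in place of _merge_files:

import heapq

def merge_all(items):
    # items: non-empty list of (size, payload) tuples, analogous to the
    # result-file entries kept on the heap above.
    heapq.heapify(items)
    while len(items) > 1:
        size0, data0 = heapq.heappop(items)
        size1, data1 = heapq.heappop(items)
        # Stand-in for _merge_files: combine the two smallest pieces.
        heapq.heappush(items, (size0 + size1, data0 + data1))
    return items[0]

size, data = merge_all([(3, [7, 8, 9]), (1, [1]), (2, [5, 6])])
assert size == 6 and sorted(data) == [1, 5, 6, 7, 8, 9]
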
Example #5
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:

        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"
        log.info(f"Starting comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(
                conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return

        encoding_size = get_project_encoding_size(conn, project_id)

        log.info(f"Computing similarity for "
                 f"{' x '.join(map(str, dataset_sizes))} entities")
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters_object_filenames = tuple(
            get_filter_metadata(conn, dp_id) for dp_id in dp_ids)
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")

    chunk_infos = tuple(
        anonlink.concurrency.split_to_chunks(Config.CHUNK_SIZE_AIM,
                                             dataset_sizes=dataset_sizes))

    # Save filenames with chunk information.
    for chunk_info in chunk_infos:
        for chunk_dp_info in chunk_info:
            chunk_dp_index = chunk_dp_info['datasetIndex']
            chunk_dp_store_filename = filters_object_filenames[chunk_dp_index]
            chunk_dp_info['storeFilename'] = chunk_dp_store_filename

    log.info(f"Chunking into {len(chunk_infos)} computation tasks")
    current_span.log_kv({"event": "chunking", 'num_chunks': len(chunk_infos)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(chunk_info, project_id, run_id, threshold,
                                     encoding_size, span_serialized)
        for chunk_info in chunk_infos
    ]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(
        project_id, run_id,
        parent_span=span_serialized).on_error(on_chord_error.s(run_id=run_id))
    future = chord(scoring_tasks)(callback_task)
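
The chord at the end fans the scoring tasks out in parallel and hands the full list of their results to the aggregation callback in a single call. A minimal, standalone illustration of that Celery primitive; the app, broker, backend and task bodies here are placeholders, not the service's own:

from celery import Celery, chord

app = Celery('sketch', broker='memory://', backend='cache+memory://')

@app.task
def score_chunk(i):
    # Placeholder standing in for compute_filter_similarity.
    return i * 10

@app.task
def aggregate(results):
    # Placeholder standing in for aggregate_comparisons; receives the
    # list of all score_chunk results once every header task finishes.
    return sum(results)

# chord(header)(callback): schedule the header group, then the callback.
# A running worker is still required for the tasks to actually execute.
async_result = chord(score_chunk.si(i) for i in range(4))(aggregate.s())
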
Example #6
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:

        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"
        log.info("Starting comparison of CLKs from data provider ids: {}, {}".format(dp_ids[0], dp_ids[1]))
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return
        else:
            lenf1, lenf2 = dataset_sizes

        encoding_size = get_project_encoding_size(conn, project_id)

        size = lenf1 * lenf2

        log.info("Computing similarity for {} x {} entities".format(lenf1, lenf2))
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters1_object_filename = get_filter_metadata(conn, dp_ids[0])
        filters2_object_filename = get_filter_metadata(conn, dp_ids[1])
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")
        chunk_size = Config.get_task_chunk_size(size, threshold)
        if chunk_size is None:
            chunk_size = max(lenf1, lenf2)
        log.info("Chunks will contain {} entities per task".format(chunk_size))
        update_run_chunk(conn, project_id, chunk_size)
    job_chunks = []

    dp1_chunks = []
    dp2_chunks = []

    for chunk_start_index_dp1 in range(0, lenf1, chunk_size):
        dp1_chunks.append(
            (filters1_object_filename, chunk_start_index_dp1, min(chunk_start_index_dp1 + chunk_size, lenf1))
        )
    for chunk_start_index_dp2 in range(0, lenf2, chunk_size):
        dp2_chunks.append(
            (filters2_object_filename, chunk_start_index_dp2, min(chunk_start_index_dp2 + chunk_size, lenf2))
        )

    # Every chunk in dp1 has to be run against every chunk in dp2
    for dp1_chunk in dp1_chunks:
        for dp2_chunk in dp2_chunks:
            job_chunks.append((dp1_chunk, dp2_chunk, ))

    log.info("Chunking into {} computation tasks each with (at most) {} entities.".format(
        len(job_chunks), chunk_size))
    current_span.log_kv({"event": "chunking", "chunksize": chunk_size, 'num_chunks': len(job_chunks)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [compute_filter_similarity.si(
        chunk_dp1,
        chunk_dp2,
        project_id,
        run_id,
        threshold,
        encoding_size,
        span_serialized
    ) for chunk_dp1, chunk_dp2 in job_chunks]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(project_id, run_id, parent_span=span_serialized).on_error(
        on_chord_error.s(run_id=run_id))
    future = chord(scoring_tasks)(callback_task)
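
This older version chunks by raw index ranges: each dataset is cut into [start, end) windows of at most chunk_size entities, and every window from dataset 1 is paired with every window from dataset 2. A standalone sketch of that range arithmetic, with hypothetical helper names:

def chunk_ranges(length, chunk_size):
    # Split [0, length) into consecutive windows of at most chunk_size.
    return [(start, min(start + chunk_size, length))
            for start in range(0, length, chunk_size)]

def pair_chunks(lenf1, lenf2, chunk_size):
    # Cross product: every window of dataset 1 against every window of dataset 2.
    return [(c1, c2)
            for c1 in chunk_ranges(lenf1, chunk_size)
            for c2 in chunk_ranges(lenf2, chunk_size)]

# e.g. 250 x 100 entities with chunk_size 100 gives 3 x 1 = 3 tasks:
assert pair_chunks(250, 100, 100) == [((0, 100), (0, 100)),
                                      ((100, 200), (0, 100)),
                                      ((200, 250), (0, 100))]
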