示例#1
0
def check_and_set_project_encoding_size(project_id, conn):
    """Validate the uploaded encoding sizes and commit the project's encoding size.

    The encoding size is chosen, in order of preference, from the size already
    stored for the project, the size given in the linkage schema, and the size
    of the first upload. Every upload must have exactly that size, and the
    size must lie between ``config.MIN_ENCODING_SIZE`` and
    ``config.MAX_ENCODING_SIZE`` and be a multiple of 8.

    The chosen size is written to the project only after all checks have
    passed, and only if the project did not already have a size stored.

    :param project_id: identifier of the project to check.
    :param conn: database connection handed to the query helpers.
    :raises ValueError: if no encoding size can be determined, if an upload's
        size differs from the chosen size, if the size is out of the
        configured bounds, or if it is not a multiple of 8.
    """
    log = logger.bind(pid=project_id)
    uploaded_encoding_sizes = get_uploaded_encoding_sizes(conn, project_id)
    # A project without uploads has no "first" size; fall back to None
    # rather than failing with an IndexError.
    first_uploaded_size = uploaded_encoding_sizes[0][1] if uploaded_encoding_sizes else None
    schema_encoding_size = get_project_schema_encoding_size(conn, project_id)
    project_encoding_size = get_project_encoding_size(conn, project_id)
    # In order of preference:
    encoding_size = project_encoding_size or schema_encoding_size or first_uploaded_size
    log.debug(f"Uploaded encoding sizes: {uploaded_encoding_sizes}")
    log.debug(f"Encoding size set in schema: {schema_encoding_size}")
    log.debug(f"Project encoding size: {project_encoding_size}")

    if encoding_size is None:
        # Nothing to compare against: report it with the same exception type
        # as the other validation failures instead of a TypeError further down.
        raise ValueError("Unable to determine an encoding size for the project")

    log.info(f"Verifying uploads all have encoding size of {encoding_size} bytes.")
    for dp_id, enc_size in uploaded_encoding_sizes:
        if enc_size != encoding_size:
            log.warning(f"Set the encodings' upload state to error for dp={dp_id} and aborting processing")
            handle_invalid_encoding_data(project_id, dp_id)
            raise ValueError("Mismatch in encoding sizes. Stopping")

    if not config.MIN_ENCODING_SIZE <= encoding_size <= config.MAX_ENCODING_SIZE:
        # Set all uploads to error state
        for dp_id, _ in uploaded_encoding_sizes:
            handle_invalid_encoding_data(project_id, dp_id)
        raise ValueError("Encoding size out of configured bounds")

    if encoding_size % 8:
        raise ValueError("Encoding size must be multiple of 8 bytes (64 bits)")

    # Persist last, so that a size rejected above is never committed.
    if project_encoding_size is None:
        set_project_encoding_size(conn, project_id, encoding_size)
示例#2
0
def create_comparison_jobs(project_id, run_id, parent_span=None):
    """Fan out a run's entity comparisons as Celery sub tasks.

    In outline, this task:
    - aborts if the project or run is no longer active (``check_run_active``).
    - loads the metadata it needs: the data provider ids, the dataset and
      block sizes, the blocks the data providers have in common, the project
      encoding size and the run's threshold.
    - breaks the work into independent "chunks", one comparison task each.
    - submits those tasks as a chord whose callback aggregates the results
      once every comparison has been computed.

    ``parent_span`` is not read in this body.
    NOTE(review): presumably consumed by the task's tracing wrapper - confirm.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    current_span = create_comparison_jobs.span

    with DBConn() as conn:
        check_run_active(conn, project_id, run_id)

        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"
        dp_id_listing = ', '.join(str(dp_id) for dp_id in dp_ids)
        log.info(f"Scheduling comparison of CLKs from data provider ids: {dp_id_listing}")

        # Metadata describing how the uploaded data has been blocked.
        dataset_sizes, dp_block_sizes = _retrieve_blocked_dataset_sizes(
            conn, project_id, dp_ids)

        log.info("Finding blocks in common between dataproviders")
        common_blocks = _get_common_blocks(dp_block_sizes, dp_ids)

        # Looked up once here and handed to every comparison task, so the
        # tasks themselves need fewer database queries.
        encoding_size = get_project_encoding_size(conn, project_id)
        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

    log.debug("Chunking computation task")
    chunks = _create_work_chunks(common_blocks, dp_block_sizes, dp_ids, log)
    chunk_count = len(chunks)

    log.info(f"Chunking into {chunk_count} computation tasks")
    chunking_event = {
        "event": "chunking",
        'num_chunks': chunk_count,
        'dataset-sizes': dataset_sizes,
    }
    current_span.log_kv(chunking_event)
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Header of the Celery chord: one immutable signature per chunk.
    scoring_tasks = []
    for chunk_info in chunks:
        scoring_tasks.append(
            compute_filter_similarity.si(chunk_info, project_id, run_id,
                                         threshold, encoding_size,
                                         span_serialized))

    # A header with a single task gets a second, placeholder task added.
    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    # Body of the chord, with an error handler attached.
    aggregation = aggregate_comparisons.s(
        project_id=project_id, run_id=run_id, parent_span=span_serialized)
    callback_task = aggregation.on_error(run_failed_handler.s(run_id=run_id))

    log.info("Scheduling comparison tasks")
    chord(scoring_tasks)(callback_task)
def create_comparison_jobs(project_id, run_id, parent_span=None):
    """Schedule the similarity computation of a run as a Celery chord.

    The task:
    - returns early if the project or run is no longer in the database.
    - loads the data provider ids, the run's threshold, the dataset sizes,
      the project encoding size and the filter metadata of each data provider.
    - marks the run as failed and returns if fewer than two dataset sizes
      are found.
    - splits the work into chunks with ``anonlink.concurrency.split_to_chunks``
      and stores the matching filter metadata on every chunk entry.
    - submits one comparison task per chunk as a chord whose callback
      aggregates the results.

    ``parent_span`` is not read in this body.
    NOTE(review): presumably consumed by the task's tracing wrapper - confirm.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    current_span = create_comparison_jobs.span

    with DBConn() as conn:
        # Check existence before touching any other project data, so the
        # data provider assertion below is never evaluated for a project or
        # run that has been deleted.
        if not check_project_exists(conn, project_id) or not check_run_exists(
                conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"
        log.info(f"Starting comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return

        encoding_size = get_project_encoding_size(conn, project_id)

        log.info(f"Computing similarity for "
                 f"{' x '.join(map(str, dataset_sizes))} entities")
        current_span.log_kv({"event": 'get-dataset-sizes'})

        # One entry per data provider, in the same order as dp_ids.
        filters_object_filenames = tuple(
            get_filter_metadata(conn, dp_id) for dp_id in dp_ids)
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")

    chunk_infos = tuple(
        anonlink.concurrency.split_to_chunks(Config.CHUNK_SIZE_AIM,
                                             dataset_sizes=dataset_sizes))

    # Save filenames with chunk information.
    for chunk_info in chunk_infos:
        for chunk_dp_info in chunk_info:
            chunk_dp_index = chunk_dp_info['datasetIndex']
            chunk_dp_info['storeFilename'] = filters_object_filenames[chunk_dp_index]

    log.info(f"Chunking into {len(chunk_infos)} computation tasks")
    current_span.log_kv({"event": "chunking", 'num_chunks': len(chunk_infos)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(chunk_info, project_id, run_id, threshold,
                                     encoding_size, span_serialized)
        for chunk_info in chunk_infos
    ]

    # A header with a single task gets a second, placeholder task added.
    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(
        project_id, run_id,
        parent_span=span_serialized).on_error(on_chord_error.s(run_id=run_id))
    chord(scoring_tasks)(callback_task)
示例#4
0
def create_comparison_jobs(project_id, run_id, parent_span=None):
    """Schedule the pairwise similarity computation of a two-party run.

    The task:
    - returns early if the project or run is no longer in the database.
    - loads the data provider ids, the run's threshold, the two dataset
      sizes, the project encoding size and the filter metadata of the first
      two data providers.
    - marks the run as failed and returns if fewer than two dataset sizes
      are found.
    - picks a chunk size (``Config.get_task_chunk_size``, falling back to the
      larger dataset size), records it with ``update_run_chunk`` and cuts
      each dataset into ``(filename, start, stop)`` ranges of that size.
    - submits one comparison task per pair of ranges as a Celery chord whose
      callback aggregates the results.

    Exactly two dataset sizes are unpacked, so this version handles two data
    providers only.

    ``parent_span`` is not read in this body.
    NOTE(review): presumably consumed by the task's tracing wrapper - confirm.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    current_span = create_comparison_jobs.span

    with DBConn() as conn:
        # Check existence before touching any other project data, so the
        # data provider assertion below is never evaluated for a project or
        # run that has been deleted.
        if not check_project_exists(conn, project_id) or not check_run_exists(conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"
        log.info("Starting comparison of CLKs from data provider ids: {}, {}".format(dp_ids[0], dp_ids[1]))

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return
        lenf1, lenf2 = dataset_sizes

        encoding_size = get_project_encoding_size(conn, project_id)

        # Total number of pairwise comparisons.
        size = lenf1 * lenf2

        log.info("Computing similarity for {} x {} entities".format(lenf1, lenf2))
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters1_object_filename = get_filter_metadata(conn, dp_ids[0])
        filters2_object_filename = get_filter_metadata(conn, dp_ids[1])
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")
        chunk_size = Config.get_task_chunk_size(size, threshold)
        if chunk_size is None:
            # No configured chunk size: use the larger dataset size, so that
            # each dataset fits in a single range.
            chunk_size = max(lenf1, lenf2)
        log.info("Chunks will contain {} entities per task".format(chunk_size))
        update_run_chunk(conn, project_id, chunk_size)

    # Half-open (filename, start, stop) ranges covering each dataset.
    dp1_chunks = [
        (filters1_object_filename, start, min(start + chunk_size, lenf1))
        for start in range(0, lenf1, chunk_size)
    ]
    dp2_chunks = [
        (filters2_object_filename, start, min(start + chunk_size, lenf2))
        for start in range(0, lenf2, chunk_size)
    ]

    # Every chunk in dp1 has to be run against every chunk in dp2
    job_chunks = [
        (dp1_chunk, dp2_chunk)
        for dp1_chunk in dp1_chunks
        for dp2_chunk in dp2_chunks
    ]

    log.info("Chunking into {} computation tasks each with (at most) {} entities.".format(
        len(job_chunks), chunk_size))
    current_span.log_kv({"event": "chunking", "chunksize": chunk_size, 'num_chunks': len(job_chunks)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(
            chunk_dp1,
            chunk_dp2,
            project_id,
            run_id,
            threshold,
            encoding_size,
            span_serialized,
        )
        for chunk_dp1, chunk_dp2 in job_chunks
    ]

    # A header with a single task gets a second, placeholder task added.
    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(project_id, run_id, parent_span=span_serialized).on_error(
        on_chord_error.s(run_id=run_id))
    chord(scoring_tasks)(callback_task)