def _retrieve_blocked_dataset_sizes_and_lookup(conn, project_id, dp_ids):
    """Fetch encoding counts for each dataset by block, and create a lookup
    table from block_name to block_id.

    :param dp_ids: Iterable of dataprovider database identifiers.
    :returns: A 3-tuple of:
        - dataset_sizes: a tuple of the number of encodings in each dataset.
        - dp_block_sizes: A map from dataprovider id to a dict mapping
          block name to the number of encodings from the dataprovider in
          that block. {dp_id -> {block_name -> block_size}}
          e.g. {33: {'1': 100}, 34: {'1': 100}, 35: {'1': 100}}
        - dp_block_lookups: A map from dataprovider id to a dict mapping
          block name to block id. {dp_id -> {block_name -> block_id}}
    """
    dataset_sizes = get_project_dataset_sizes(conn, project_id)
    dp_block_sizes = {}
    dp_block_lookups = {}
    for dp_id in dp_ids:
        block_sizes = {}
        lookup = {}
        for block_name, block_id, count in get_block_metadata(conn, dp_id):
            block_sizes[block_name] = count
            lookup[block_name] = block_id
        dp_block_sizes[dp_id] = block_sizes
        dp_block_lookups[dp_id] = lookup
    return dataset_sizes, dp_block_sizes, dp_block_lookups
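# Illustrative sketch only (the dataprovider ids, block names, block ids and
# counts below are made up): for two data providers that each contribute 100
# encodings to a single shared block, the three structures returned above
# would look roughly like this.
def _sketch_blocked_structures():
    dataset_sizes = (100, 100)                                       # one entry per dataset
    dp_block_sizes = {33: {'block-a': 100}, 34: {'block-a': 100}}    # dp_id -> {block_name -> count}
    dp_block_lookups = {33: {'block-a': 7}, 34: {'block-a': 8}}      # dp_id -> {block_name -> block_id}
    return dataset_sizes, dp_block_sizes, dp_block_lookups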
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        return
    mc = connect_to_object_store()
    files = []
    data_size = 0

    for num, filename in similarity_result_files:
        if num > 0:
            files.append(filename)
            data_size += mc.stat_object(Config.MINIO_BUCKET, filename).size

    log.debug("Aggregating result chunks from {} files, total size: {}".format(
        len(files), fmt_bytes(data_size)))

    result_file_stream_generator = (mc.get_object(Config.MINIO_BUCKET, result_filename)
                                    for result_filename in files)

    log.info("Similarity score results are {}".format(fmt_bytes(data_size)))
    result_stream = chain_streams(result_file_stream_generator)

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Note: Storing the similarity scores for all result types
        result_filename = store_similarity_scores(result_stream, run_id, data_size, db)

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)
        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            lenf1, lenf2 = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.info("Deleting intermediate similarity score files from object store")
        mc.remove_objects(Config.MINIO_BUCKET, files)

        log.debug("Removing clk filters from redis cache")
        remove_from_cache(dp_ids[0])
        remove_from_cache(dp_ids[1])

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(result_filename, project_id, run_id, lenf1, lenf2,
                          aggregate_comparisons.get_serialized_span())
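# Minimal sketch of the streaming concatenation idea used above. It assumes
# only what the call site shows: chain_streams accepts an iterable of
# readable file-like objects and returns a single readable stream. The
# BytesIO objects here stand in for the MinIO object streams.
def _sketch_chain_streams_usage():
    import io
    parts = (io.BytesIO(payload) for payload in (b"scores-1\n", b"scores-2\n"))
    combined = chain_streams(parts)
    return combined.read()  # expected: b"scores-1\nscores-2\n"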
def _retrieve_blocked_dataset_sizes(conn, project_id, dp_ids):
    """Fetch encoding counts for each dataset by block.

    :param dp_ids: Iterable of dataprovider database identifiers.
    :returns: A 2-tuple of:
        - dataset_sizes: a tuple of the number of encodings in each dataset.
        - dp_block_sizes: A map from dataprovider id to a dict mapping
          block id to the number of encodings from the dataprovider in
          that block. {dp_id -> {block_id -> block_size}}
          e.g. {33: {'1': 100}, 34: {'1': 100}, 35: {'1': 100}}
    """
    dataset_sizes = get_project_dataset_sizes(conn, project_id)
    dp_block_sizes = {}
    for dp_id in dp_ids:
        dp_block_sizes[dp_id] = dict(get_block_metadata(conn, dp_id))
    return dataset_sizes, dp_block_sizes
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        raise TypeError("Inappropriate argument type - missing results files.")

    files = []
    for res in similarity_result_files:
        if res is None:
            log.warning("Missing results during aggregation. Stopping processing.")
            raise TypeError("Inappropriate argument type - results missing at aggregation step.")
        num, filesize, filename = res
        if num:
            assert filesize is not None
            assert filename is not None
            files.append((num, filesize, filename))
        else:
            assert filesize is None
            assert filename is None
    heapq.heapify(files)

    log.debug(f"Aggregating result chunks from {len(files)} files, "
              f"total size: {sum(map(operator.itemgetter(1), files))}")

    mc = connect_to_object_store()
    while len(files) > 1:
        file0 = heapq.heappop(files)
        file1 = heapq.heappop(files)
        merged_file = _merge_files(mc, log, file0, file1)
        heapq.heappush(files, merged_file)

    if not files:
        # No results. Let's chuck in an empty file.
        empty_file = _put_placeholder_empty_file(mc, log)
        files.append(empty_file)

    (merged_num, merged_filesize, merged_filename), = files
    log.info(f"Similarity score results in {merged_filename} in bucket "
             f"{Config.MINIO_BUCKET} take up {merged_filesize} bytes.")

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')
        result_id = insert_similarity_score_file(db, run_id, merged_filename)
        log.debug(f"Saved path to similarity scores file to db with id {result_id}")

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)
        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            dataset_sizes = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.debug("Removing clk filters from redis cache")
        for dp_id in dp_ids:
            remove_from_cache(dp_id)

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(merged_filename, project_id, run_id, dataset_sizes,
                          aggregate_comparisons.get_serialized_span())
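# Sketch of the smallest-first pairwise merge used above, with a toy merge in
# place of _merge_files (which also copies data in the object store). The
# tuples are ordered by their first element, so heapq always hands back the
# two result files with the fewest similarity scores.
def _sketch_pairwise_merge(files):
    # files: iterable of (num_scores, filesize, filename) tuples
    files = list(files)
    heapq.heapify(files)
    while len(files) > 1:
        num0, size0, name0 = heapq.heappop(files)
        num1, size1, name1 = heapq.heappop(files)
        merged = (num0 + num1, size0 + size1, f"merged({name0},{name1})")
        heapq.heappush(files, merged)
    return files[0] if files else None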
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:
        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"
        log.info(f"Starting comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(
                conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return

        encoding_size = get_project_encoding_size(conn, project_id)

        log.info(f"Computing similarity for "
                 f"{' x '.join(map(str, dataset_sizes))} entities")
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters_object_filenames = tuple(
            get_filter_metadata(conn, dp_id) for dp_id in dp_ids)
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")

        chunk_infos = tuple(anonlink.concurrency.split_to_chunks(
            Config.CHUNK_SIZE_AIM,
            dataset_sizes=dataset_sizes))

        # Save filenames with chunk information.
        for chunk_info in chunk_infos:
            for chunk_dp_info in chunk_info:
                chunk_dp_index = chunk_dp_info['datasetIndex']
                chunk_dp_store_filename = filters_object_filenames[chunk_dp_index]
                chunk_dp_info['storeFilename'] = chunk_dp_store_filename

        log.info(f"Chunking into {len(chunk_infos)} computation tasks")
        current_span.log_kv({"event": "chunking", 'num_chunks': len(chunk_infos)})
        span_serialized = create_comparison_jobs.get_serialized_span()

        # Prepare the Celery Chord that will compute all the similarity scores:
        scoring_tasks = [compute_filter_similarity.si(
            chunk_info,
            project_id,
            run_id,
            threshold,
            encoding_size,
            span_serialized) for chunk_info in chunk_infos]

        if len(scoring_tasks) == 1:
            scoring_tasks.append(celery_bug_fix.si())

        callback_task = aggregate_comparisons.s(
            project_id, run_id, parent_span=span_serialized).on_error(
            on_chord_error.s(run_id=run_id))
        future = chord(scoring_tasks)(callback_task)
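# Minimal sketch of the fan-out/fan-in wiring used above, with made-up task
# names and an assumed in-memory broker (not part of this service): a Celery
# chord runs the header tasks in parallel and passes the list of their
# results to a single callback, just as the scoring tasks above feed
# aggregate_comparisons.
from celery import Celery, chord as celery_chord

_sketch_app = Celery('sketch', broker='memory://', backend='cache+memory://')


@_sketch_app.task
def _sketch_score_chunk(chunk_id):
    return chunk_id * 10


@_sketch_app.task
def _sketch_aggregate(results):
    return sum(results)


def _sketch_chord_wiring(num_chunks=3):
    header = [_sketch_score_chunk.si(i) for i in range(num_chunks)]
    callback = _sketch_aggregate.s()
    # chord(header)(callback) would dispatch the header tasks and invoke the
    # callback with their collected results once all of them have finished.
    return celery_chord(header), callback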
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:
        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"
        log.info("Starting comparison of CLKs from data provider ids: {}, {}".format(dp_ids[0], dp_ids[1]))
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return
        else:
            lenf1, lenf2 = dataset_sizes

        encoding_size = get_project_encoding_size(conn, project_id)

        size = lenf1 * lenf2
        log.info("Computing similarity for {} x {} entities".format(lenf1, lenf2))
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters1_object_filename = get_filter_metadata(conn, dp_ids[0])
        filters2_object_filename = get_filter_metadata(conn, dp_ids[1])
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")
        chunk_size = Config.get_task_chunk_size(size, threshold)
        if chunk_size is None:
            chunk_size = max(lenf1, lenf2)
        log.info("Chunks will contain {} entities per task".format(chunk_size))
        update_run_chunk(conn, project_id, chunk_size)

    job_chunks = []

    dp1_chunks = []
    dp2_chunks = []

    for chunk_start_index_dp1 in range(0, lenf1, chunk_size):
        dp1_chunks.append(
            (filters1_object_filename, chunk_start_index_dp1, min(chunk_start_index_dp1 + chunk_size, lenf1))
        )
    for chunk_start_index_dp2 in range(0, lenf2, chunk_size):
        dp2_chunks.append(
            (filters2_object_filename, chunk_start_index_dp2, min(chunk_start_index_dp2 + chunk_size, lenf2))
        )

    # Every chunk in dp1 has to be run against every chunk in dp2
    for dp1_chunk in dp1_chunks:
        for dp2_chunk in dp2_chunks:
            job_chunks.append((dp1_chunk, dp2_chunk, ))

    log.info("Chunking into {} computation tasks each with (at most) {} entities.".format(
        len(job_chunks), chunk_size))
    current_span.log_kv({"event": "chunking", "chunksize": chunk_size, 'num_chunks': len(job_chunks)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [compute_filter_similarity.si(
        chunk_dp1,
        chunk_dp2,
        project_id,
        run_id,
        threshold,
        encoding_size,
        span_serialized
    ) for chunk_dp1, chunk_dp2 in job_chunks]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(project_id, run_id, parent_span=span_serialized).on_error(
        on_chord_error.s(run_id=run_id))
    future = chord(scoring_tasks)(callback_task)
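# Illustrative sketch of the chunking scheme above (store filenames omitted):
# each dataset is cut into ranges of at most chunk_size entities, and every
# range from dataset 1 is paired with every range from dataset 2.
def _sketch_chunk_pairs(lenf1, lenf2, chunk_size):
    ranges1 = [(start, min(start + chunk_size, lenf1))
               for start in range(0, lenf1, chunk_size)]
    ranges2 = [(start, min(start + chunk_size, lenf2))
               for start in range(0, lenf2, chunk_size)]
    return [(r1, r2) for r1 in ranges1 for r2 in ranges2]

# e.g. _sketch_chunk_pairs(5, 4, 2) yields 3 x 2 = 6 chunk pairs:
# [((0, 2), (0, 2)), ((0, 2), (2, 4)), ((2, 4), (0, 2)),
#  ((2, 4), (2, 4)), ((4, 5), (0, 2)), ((4, 5), (2, 4))]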