def prerun_check(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Sanity check that we need to compute run")

    with DBConn() as conn:
        if not check_project_exists(conn, project_id):
            log.debug("Project not found. Skipping")
            raise ProjectDeleted(project_id)

        res = get_run(conn, run_id)
        if res is None:
            log.debug("Run not found. Skipping")
            raise RunDeleted(run_id)

        try:
            state = get_run_state_for_update(conn, run_id)
        except psycopg2.OperationalError:
            log.warning("Run started in another task. Skipping this race.")
            return

        if state in {'running', 'completed', 'error'}:
            log.warning("Run already started. Skipping")
            return

        log.debug("Setting run as in progress")
        update_run_set_started(conn, run_id)

        log.debug("Getting dp ids for compute similarity task")
        dp_ids = get_dataprovider_ids(conn, project_id)
        log.debug("Data providers: {}".format(dp_ids))

    create_comparison_jobs.delay(project_id, run_id, prerun_check.get_serialized_span())
    log.info("CLK similarity computation scheduled")
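
# ---------------------------------------------------------------------------
# The psycopg2.OperationalError branch in prerun_check relies on the database
# to arbitrate races between workers. A minimal sketch of how
# get_run_state_for_update could behave (an assumption; the real helper and
# the `runs` table layout are not shown here): FOR UPDATE NOWAIT raises
# LockNotAvailable, a subclass of psycopg2.OperationalError, when another
# transaction already holds the row lock.
def _get_run_state_for_update_sketch(conn, run_id):
    with conn.cursor() as cur:
        cur.execute(
            "SELECT state FROM runs WHERE run_id = %s FOR UPDATE NOWAIT",
            (run_id,))
        row = cur.fetchone()
        return row[0] if row is not None else None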
def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    mapping = similarity_result['mapping']

    # Note Postgres requires JSON object keys to be strings
    # Celery actually converts the json arguments in the same way
    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Just save the raw "mapping"
        log.debug("Saving the resulting map data to the db")
        result_id = insert_mapping_result(db, run_id, mapping)
        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Mapping result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        permute_mapping_data.apply_async(
            (project_id,
             run_id,
             similarity_result['lenf1'],
             similarity_result['lenf2'],
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark mapping job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")
    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()
def create_comparison_jobs(project_id, run_id, parent_span=None):
    """Schedule all the entity comparisons as sub tasks for a run.

    At a high level this task:
    - checks if the project and run have been deleted and if so aborts.
    - retrieves metadata: the number and size of the datasets, the encoding
      size, and the number and size of blocks.
    - splits the work into independent "chunks" and schedules them to run in
      celery.
    - schedules the follow up task to run after all the comparisons have been
      computed.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    current_span = create_comparison_jobs.span

    with DBConn() as conn:
        check_run_active(conn, project_id, run_id)

        dp_ids = get_dataprovider_ids(conn, project_id)
        number_of_datasets = len(dp_ids)
        assert number_of_datasets >= 2, "Expected at least 2 data providers"
        log.info(f"Scheduling comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")

        # Retrieve required metadata
        dataset_sizes, dp_block_sizes = _retrieve_blocked_dataset_sizes(
            conn, project_id, dp_ids)

        log.info("Finding blocks in common between dataproviders")
        common_blocks = _get_common_blocks(dp_block_sizes, dp_ids)

        # We pass the encoding_size and threshold to the comparison tasks to
        # minimize their db lookups
        encoding_size = get_project_encoding_size(conn, project_id)
        threshold = get_run(conn, run_id)['threshold']

    log.debug("Chunking computation task")
    # Create "chunks" of comparisons
    chunks = _create_work_chunks(common_blocks, dp_block_sizes, dp_ids, log)

    log.info(f"Chunking into {len(chunks)} computation tasks")
    current_span.log_kv({
        "event": "chunking",
        'num_chunks': len(chunks),
        'dataset-sizes': dataset_sizes})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(
            chunk_info, project_id, run_id, threshold, encoding_size,
            span_serialized)
        for chunk_info in chunks]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(
        project_id=project_id,
        run_id=run_id,
        parent_span=span_serialized).on_error(
            run_failed_handler.s(run_id=run_id))
    log.info("Scheduling comparison tasks")
    future = chord(scoring_tasks)(callback_task)
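
# ---------------------------------------------------------------------------
# A self-contained sketch of the Celery chord pattern used above: a header
# of independent tasks whose results feed a single callback, with an error
# handler attached via on_error. The app, broker URL and task names here
# are illustrative only, not part of the service.
from celery import Celery, chord

app = Celery('sketch', broker='redis://localhost:6379/0')

@app.task
def add(x, y):
    return x + y

@app.task
def total(results):
    # Runs once, after every header task has succeeded.
    return sum(results)

@app.task
def handle_failure(request, exc, traceback):
    print(f"chord failed: {exc!r}")

# chord(header)(callback): schedule the header tasks, then the callback.
future = chord([add.si(i, i) for i in range(10)])(
    total.s().on_error(handle_failure.s()))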
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        return
    mc = connect_to_object_store()
    files = []
    data_size = 0

    for num, filename in similarity_result_files:
        if num > 0:
            files.append(filename)
            data_size += mc.stat_object(Config.MINIO_BUCKET, filename).size

    log.debug("Aggregating result chunks from {} files, total size: {}".format(
        len(files), fmt_bytes(data_size)))

    result_file_stream_generator = (
        mc.get_object(Config.MINIO_BUCKET, result_filename)
        for result_filename in files)

    log.info("Similarity score results are {}".format(fmt_bytes(data_size)))
    result_stream = chain_streams(result_file_stream_generator)

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        # Note: Storing the similarity scores for all result types
        result_filename = store_similarity_scores(result_stream, run_id, data_size, db)

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)
        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            lenf1, lenf2 = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.info("Deleting intermediate similarity score files from object store")
        # minio's remove_objects is lazy: the returned error iterator must be
        # consumed for the deletes to actually happen.
        for deletion_error in mc.remove_objects(Config.MINIO_BUCKET, files):
            log.warning("Failed to delete a result file: {}".format(deletion_error))
        log.debug("Removing clk filters from redis cache")
        remove_from_cache(dp_ids[0])
        remove_from_cache(dp_ids[1])

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(result_filename, project_id, run_id, lenf1, lenf2,
                          aggregate_comparisons.get_serialized_span())
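
# ---------------------------------------------------------------------------
# A minimal sketch of what chaining object-store streams means above (an
# assumption about the helper's intent, not the service's chain_streams):
# read each file-like object to exhaustion, in order, yielding one
# continuous sequence of blocks.
import io

def chain_streams_sketch(streams, block_size=64 * 1024):
    for stream in streams:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            yield block

combined = b''.join(chain_streams_sketch(
    [io.BytesIO(b'scores-part-1,'), io.BytesIO(b'scores-part-2')]))
print(combined)  # b'scores-part-1,scores-part-2'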
def save_and_permute(similarity_result, project_id, run_id, parent_span):
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Saving and possibly permuting data")
    groups = similarity_result['groups']

    # Note Postgres requires JSON object keys to be strings
    # Celery actually converts the json arguments in the same way
    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')

        if result_type == "groups":
            # Save the raw groups
            log.debug("Saving the groups in the DB")
            result_id = insert_mapping_result(db, run_id, groups)
        else:
            # Turn groups into mapping and save that
            log.debug("Turning groups into mapping")
            mapping = groups_to_mapping(groups)
            log.debug("Saving mapping in the DB")
            result_id = insert_mapping_result(db, run_id, mapping)

        dp_ids = get_dataprovider_ids(db, project_id)

    log.info("Result saved to db with result id {}".format(result_id))

    if result_type == "permutations":
        log.debug("Submitting job to permute mapping")
        dataset0_size, dataset1_size = similarity_result['datasetSizes']
        permute_mapping_data.apply_async(
            (project_id,
             run_id,
             dataset0_size,
             dataset1_size,
             save_and_permute.get_serialized_span()))
    else:
        log.debug("Mark job as complete")
        mark_run_complete.delay(run_id, save_and_permute.get_serialized_span())

    # Post similarity computation cleanup
    log.debug("Removing clk filters from redis cache")
    for dp_id in dp_ids:
        cache.remove_from_cache(dp_id)
    calculate_comparison_rate.delay()
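
# ---------------------------------------------------------------------------
# A minimal sketch of what turning groups into a mapping can look like (an
# assumption, not the service's groups_to_mapping): each group is taken to
# be a collection of (dataset_index, record_index) pairs, as produced by
# anonlink's greedy solver, and two-party groups become
# {left_record: right_record} entries.
def groups_to_mapping_sketch(groups):
    mapping = {}
    for group in groups:
        by_dataset = dict(group)  # {dataset_index: record_index}
        if len(by_dataset) == 2:  # keep only pairs spanning both datasets
            mapping[by_dataset[0]] = by_dataset[1]
    return mapping

print(groups_to_mapping_sketch([{(0, 3), (1, 7)}, {(0, 5), (1, 1)}]))
# {3: 7, 5: 1}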
def aggregate_comparisons(similarity_result_files, project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    if similarity_result_files is None:
        raise TypeError("Inappropriate argument type - missing results files.")

    files = []
    for res in similarity_result_files:
        if res is None:
            log.warning("Missing results during aggregation. Stopping processing.")
            raise TypeError("Inappropriate argument type - results missing at aggregation step.")
        num, filesize, filename = res
        if num:
            assert filesize is not None
            assert filename is not None
            files.append((num, filesize, filename))
        else:
            assert filesize is None
            assert filename is None

    heapq.heapify(files)

    log.debug(f"Aggregating result chunks from {len(files)} files, "
              f"total size: {sum(map(operator.itemgetter(1), files))}")

    mc = connect_to_object_store()

    while len(files) > 1:
        file0 = heapq.heappop(files)
        file1 = heapq.heappop(files)
        merged_file = _merge_files(mc, log, file0, file1)
        heapq.heappush(files, merged_file)

    if not files:
        # No results. Let's chuck in an empty file.
        empty_file = _put_placeholder_empty_file(mc, log)
        files.append(empty_file)

    (merged_num, merged_filesize, merged_filename), = files
    log.info(f"Similarity score results in {merged_filename} in bucket "
             f"{Config.MINIO_BUCKET} take up {merged_filesize} bytes.")

    with DBConn() as db:
        result_type = get_project_column(db, project_id, 'result_type')
        result_id = insert_similarity_score_file(db, run_id, merged_filename)
        log.debug(f"Saved path to similarity scores file to db with id {result_id}")

        if result_type == "similarity_scores":
            # Post similarity computation cleanup
            dp_ids = get_dataprovider_ids(db, project_id)
        else:
            # we promote the run to the next stage
            progress_stage(db, run_id)
            dataset_sizes = get_project_dataset_sizes(db, project_id)

    # DB now committed, we can fire off tasks that depend on the new db state
    if result_type == "similarity_scores":
        log.debug("Removing clk filters from redis cache")
        for dp_id in dp_ids:
            remove_from_cache(dp_id)

        # Complete the run
        log.info("Marking run as complete")
        mark_run_complete.delay(run_id, aggregate_comparisons.get_serialized_span())
    else:
        solver_task.delay(merged_filename, project_id, run_id, dataset_sizes,
                          aggregate_comparisons.get_serialized_span())
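
# ---------------------------------------------------------------------------
# The merge loop above always combines the two smallest remaining files,
# which keeps the total bytes copied low (the same idea as Huffman merging).
# A self-contained toy version tracking sizes only, with _merge_files stood
# in for by addition:
import heapq

def merge_smallest_first(sizes):
    heap = list(sizes)
    heapq.heapify(heap)
    copied = 0
    while len(heap) > 1:
        a = heapq.heappop(heap)
        b = heapq.heappop(heap)
        copied += a + b            # cost of writing the merged file
        heapq.heappush(heap, a + b)
    return copied

print(merge_smallest_first([1, 2, 3, 10]))  # 25: merges of 3, then 6, then 16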
def permute_mapping_data(project_id, run_id, len_filters1, len_filters2, parent_span):
    """Task which will create a permutation after a mapping has been completed.

    :param project_id: The project resource id
    :param run_id: The run id
    :param len_filters1: Size of dataset A
    :param len_filters2: Size of dataset B
    """
    log = logger.bind(pid=project_id, run_id=run_id)

    with DBConn() as conn:
        mapping_str = get_run_result(conn, run_id)

        # Convert the {str: str} JSON mapping to {int: int}
        mapping = {int(k): int(mapping_str[k]) for k in mapping_str}

        log.info("Creating random permutations")
        log.debug("Entities in dataset A: {}, Entities in dataset B: {}".format(
            len_filters1, len_filters2))

        """
        Pack all the entities that match in the **same** random locations
        in both permutations. Then fill in all the gaps!

        Dictionaries first, then converted to lists.
        """
        smaller_dataset_size = min(len_filters1, len_filters2)
        log.debug("Smaller dataset size is {}".format(smaller_dataset_size))
        number_in_common = len(mapping)
        a_permutation = {}  # Should be length of filters1
        b_permutation = {}  # length of filters2

        # By default mark all rows as NOT included in the mask
        mask = {i: False for i in range(smaller_dataset_size)}

        # start with all the possible indexes
        remaining_new_indexes = list(range(smaller_dataset_size))
        log.info("Shuffling indices for matched entities")
        random.shuffle(remaining_new_indexes)
        log.info("Assigning random indexes for {} matched entities".format(number_in_common))

        for mapping_number, a_index in enumerate(mapping):
            b_index = mapping[a_index]

            # Choose the index in the new mapping (randomly)
            mapping_index = remaining_new_indexes[mapping_number]

            a_permutation[a_index] = mapping_index
            b_permutation[b_index] = mapping_index

            # Mark the row included in the mask
            mask[mapping_index] = True

        remaining_new_indexes = set(remaining_new_indexes[number_in_common:])
        log.info("Randomly adding all non matched entities")

        # Note the a and b datasets could be of different size.
        # At this point, both still have to use the remaining_new_indexes, and
        # any indexes that go over the number_in_common
        remaining_a_values = list(
            set(range(smaller_dataset_size, len_filters1)).union(remaining_new_indexes))
        remaining_b_values = list(
            set(range(smaller_dataset_size, len_filters2)).union(remaining_new_indexes))

        log.debug("Shuffle the remaining indices")
        random.shuffle(remaining_a_values)
        random.shuffle(remaining_b_values)

        # For every element in a's permutation
        for a_index in range(len_filters1):
            # Check if it is not already present
            if a_index not in a_permutation:
                # This index isn't yet mapped

                # choose and remove a random index from the extended list of
                # those that remain. Note this "could" be the same row
                # (a NOP 1-1 permutation)
                mapping_index = remaining_a_values.pop()
                a_permutation[a_index] = mapping_index

        # For every element in b's permutation
        for b_index in range(len_filters2):
            # Check if it is not already present
            if b_index not in b_permutation:
                # This index isn't yet mapped

                # choose and remove a random index from the extended list of
                # those that remain. Note this "could" be the same row
                # (a NOP 1-1 permutation)
                mapping_index = remaining_b_values.pop()
                b_permutation[b_index] = mapping_index

        log.debug("Completed creating new permutations for each party")

        dp_ids = get_dataprovider_ids(conn, project_id)

        for i, permutation in enumerate([a_permutation, b_permutation]):
            # We convert here because celery and dicts with int keys don't play nice
            perm_list = convert_mapping_to_list(permutation)
            log.debug("Saving a permutation")
            insert_permutation(conn, dp_ids[i], run_id, perm_list)

        log.debug("Raw permutation data saved. Now saving raw mask")

        # Convert the mask dict to a list of 0/1 ints
        mask_list = convert_mapping_to_list(
            {int(key): 1 if value else 0 for key, value in mask.items()})

        log.debug("Saving the mask")
        insert_permutation_mask(conn, project_id, run_id, mask_list)
        log.info("Mask saved")
        log.info("Committing database transaction")

    mark_run_complete.delay(run_id, permute_mapping_data.get_serialized_span())
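
# ---------------------------------------------------------------------------
# A tiny deterministic walk-through of the permutation/mask construction
# above (illustrative values only). With datasets of size 4 and 3 and the
# mapping {0: 2, 3: 1}, the two matched pairs are packed into the same
# shuffled slots of both permutations and those slots are marked True in
# the mask; all other rows would then fill the remaining slots.
import random

random.seed(0)
mapping = {0: 2, 3: 1}
len_a, len_b = 4, 3
smaller = min(len_a, len_b)

slots = list(range(smaller))
random.shuffle(slots)

a_perm, b_perm = {}, {}
mask = {i: False for i in range(smaller)}
for n, a_idx in enumerate(mapping):
    a_perm[a_idx] = slots[n]
    b_perm[mapping[a_idx]] = slots[n]
    mask[slots[n]] = True

print(a_perm, b_perm, mask)
# Matched rows share output positions; the mask marks which positions match.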
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:
        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"

        log.info(f"Starting comparison of CLKs from data provider ids: "
                 f"{', '.join(map(str, dp_ids))}")
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(
                conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return

        encoding_size = get_project_encoding_size(conn, project_id)

        log.info(f"Computing similarity for "
                 f"{' x '.join(map(str, dataset_sizes))} entities")
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters_object_filenames = tuple(
            get_filter_metadata(conn, dp_id) for dp_id in dp_ids)
        current_span.log_kv({"event": 'get-metadata'})

    log.debug("Chunking computation task")
    chunk_infos = tuple(anonlink.concurrency.split_to_chunks(
        Config.CHUNK_SIZE_AIM, dataset_sizes=dataset_sizes))

    # Save filenames with chunk information.
    for chunk_info in chunk_infos:
        for chunk_dp_info in chunk_info:
            chunk_dp_index = chunk_dp_info['datasetIndex']
            chunk_dp_store_filename = filters_object_filenames[chunk_dp_index]
            chunk_dp_info['storeFilename'] = chunk_dp_store_filename

    log.info(f"Chunking into {len(chunk_infos)} computation tasks")
    current_span.log_kv({"event": "chunking", 'num_chunks': len(chunk_infos)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(
            chunk_info, project_id, run_id, threshold, encoding_size,
            span_serialized)
        for chunk_info in chunk_infos]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(
        project_id, run_id, parent_span=span_serialized).on_error(
            on_chord_error.s(run_id=run_id))

    future = chord(scoring_tasks)(callback_task)
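
# ---------------------------------------------------------------------------
# A small usage sketch of the anonlink chunking helper used above. Each
# chunk describes one sub-rectangle of the full comparison space as one
# dict per dataset; 'datasetIndex' is the key the code above relies on. The
# chunk size aim of 100_000_000 below is illustrative, standing in for
# Config.CHUNK_SIZE_AIM.
import anonlink

chunks = tuple(anonlink.concurrency.split_to_chunks(
    100_000_000, dataset_sizes=(200_000, 150_000)))
print(len(chunks))   # number of independent comparison tasks
print(chunks[0])     # a pair of per-dataset chunk descriptions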
def create_comparison_jobs(project_id, run_id, parent_span=None):
    log = logger.bind(pid=project_id, run_id=run_id)
    with DBConn() as conn:
        dp_ids = get_dataprovider_ids(conn, project_id)
        assert len(dp_ids) >= 2, "Expected at least 2 data providers"

        log.info("Starting comparison of CLKs from data provider ids: {}, {}".format(
            dp_ids[0], dp_ids[1]))
        current_span = create_comparison_jobs.span

        if not check_project_exists(conn, project_id) or not check_run_exists(
                conn, project_id, run_id):
            log.info("Skipping as project or run not found in database.")
            return

        run_info = get_run(conn, run_id)
        threshold = run_info['threshold']

        dataset_sizes = get_project_dataset_sizes(conn, project_id)

        if len(dataset_sizes) < 2:
            log.warning("Unexpected number of dataset sizes in db. Stopping")
            update_run_mark_failure(conn, run_id)
            return
        else:
            lenf1, lenf2 = dataset_sizes

        encoding_size = get_project_encoding_size(conn, project_id)

        size = lenf1 * lenf2

        log.info("Computing similarity for {} x {} entities".format(lenf1, lenf2))
        current_span.log_kv({"event": 'get-dataset-sizes'})

        filters1_object_filename = get_filter_metadata(conn, dp_ids[0])
        filters2_object_filename = get_filter_metadata(conn, dp_ids[1])
        current_span.log_kv({"event": 'get-metadata'})

        log.debug("Chunking computation task")
        chunk_size = Config.get_task_chunk_size(size, threshold)
        if chunk_size is None:
            chunk_size = max(lenf1, lenf2)
        log.info("Chunks will contain {} entities per task".format(chunk_size))
        update_run_chunk(conn, project_id, chunk_size)

    job_chunks = []

    dp1_chunks = []
    dp2_chunks = []

    for chunk_start_index_dp1 in range(0, lenf1, chunk_size):
        dp1_chunks.append(
            (filters1_object_filename,
             chunk_start_index_dp1,
             min(chunk_start_index_dp1 + chunk_size, lenf1)))
    for chunk_start_index_dp2 in range(0, lenf2, chunk_size):
        dp2_chunks.append(
            (filters2_object_filename,
             chunk_start_index_dp2,
             min(chunk_start_index_dp2 + chunk_size, lenf2)))

    # Every chunk in dp1 has to be run against every chunk in dp2
    for dp1_chunk in dp1_chunks:
        for dp2_chunk in dp2_chunks:
            job_chunks.append((dp1_chunk, dp2_chunk))

    log.info("Chunking into {} computation tasks each with (at most) {} entities.".format(
        len(job_chunks), chunk_size))
    current_span.log_kv({"event": "chunking",
                         "chunksize": chunk_size,
                         'num_chunks': len(job_chunks)})
    span_serialized = create_comparison_jobs.get_serialized_span()

    # Prepare the Celery Chord that will compute all the similarity scores:
    scoring_tasks = [
        compute_filter_similarity.si(
            chunk_dp1, chunk_dp2, project_id, run_id, threshold,
            encoding_size, span_serialized)
        for chunk_dp1, chunk_dp2 in job_chunks]

    if len(scoring_tasks) == 1:
        scoring_tasks.append(celery_bug_fix.si())

    callback_task = aggregate_comparisons.s(
        project_id, run_id, parent_span=span_serialized).on_error(
            on_chord_error.s(run_id=run_id))

    future = chord(scoring_tasks)(callback_task)
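
# ---------------------------------------------------------------------------
# The cross-product chunking above means the number of comparison tasks is
# the product of the per-dataset chunk counts. A quick worked example with
# illustrative sizes:
from math import ceil

lenf1, lenf2, chunk_size = 1000, 800, 300
num_tasks = ceil(lenf1 / chunk_size) * ceil(lenf2 / chunk_size)
print(num_tasks)  # 4 * 3 = 12 tasks, each comparing at most 300 x 300 entities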