def test_clear_progress(self):
     config.CACHE_EXPIRY = datetime.timedelta(seconds=1)
     runid = 'runtest_clear_progress'
     save_current_progress(1, runid, config)
     assert 1 == get_progress(runid)
     clear_progress(runid)
     assert get_progress(runid) is None
    def test_progress_increments(self):
        config.CACHE_EXPIRY = datetime.timedelta(seconds=1)
        runid = 'test_progress_increments'
        save_current_progress(1, runid, config)
        cached_progress = get_progress(runid)
        assert cached_progress == 1
        for i in range(99):
            save_current_progress(1, runid, config)

        assert 100 == get_progress(runid)
 def test_progress_expires(self):
     # Uses the minimum expiry of 1 second
     config.CACHE_EXPIRY = datetime.timedelta(seconds=1)
     runid = 'test_progress_expires'
     save_current_progress(42, runid, config)
     cached_progress = get_progress(runid)
     assert cached_progress == 42
     time.sleep(1)
     # After expiry the progress should be reset to None
     assert get_progress(runid) is None
 def test_storing_wrong_type(self):
     config.CACHE_EXPIRY = datetime.timedelta(seconds=1)
     runid = 'test_storing_wrong_type'
     with pytest.raises(redis.exceptions.ResponseError):
         save_current_progress(1.5, runid, config)
def compute_filter_similarity(chunk_info,
                              project_id,
                              run_id,
                              threshold,
                              encoding_size,
                              parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info:
        Chunk info returned by ``anonlink.concurrency.split_to_chunks``.
        Additionally, "storeFilename" is added to each dataset chunk.
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    @returns A 2-tuple: (num_results, results_filename_in_object_store)
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug(
        "Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(
                db, project_id, run_id):
            log.info("Failing task as project or run not found in database.")
            raise DBResourceMissing("project or run not found in database")

    chunk_info_dp1, chunk_info_dp2 = chunk_info

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(
        chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(
        chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.concurrency.process_chunk(
        chunk_info, (chunk_dp1, chunk_dp2),
        anonlink.similarities.dice_coefficient_accelerated,
        threshold,
        k=min(chunk_dp1_size, chunk_dp2_size))
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    sims, _, _ = chunk_results
    num_results = len(sims)

    if num_results:
        result_filename = Config.SIMILARITY_SCORES_FILENAME_FMT.format(
            generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(
            num_results, result_filename))

        bytes_iter, file_size \
            = anonlink.serialization.dump_candidate_pairs_iter(chunk_results)
        iter_stream = iterable_to_stream(bytes_iter)

        mc = connect_to_object_store()
        try:
            mc.put_object(Config.MINIO_BUCKET, result_filename, iter_stream,
                          file_size)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise
    else:
        result_filename = None
        file_size = None
    t5 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(
        run_id, comparisons_computed, len(chunk_results)))
    log.info(
        "Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Save: {:.3f}, Total: {:.3f}"
        .format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t5 - t0))
    return num_results, file_size, result_filename
示例#6
0
def compute_filter_similarity(chunk_info_dp1, chunk_info_dp2, project_id, run_id, threshold, encoding_size, parent_span=None):
    """Compute filter similarity between a chunk of filters in dataprovider 1,
    and a chunk of filters in dataprovider 2.

    :param chunk_info_dp1:
        A tuple containing:
            - object store filename
            - Chunk start index
            - Chunk stop index
    :param chunk_info_dp2:
    :param project_id:
    :param threshold:
    :param encoding_size: The size in bytes of each encoded entry
    :param parent_span: A serialized opentracing span context.
    """
    log = logger.bind(pid=project_id, run_id=run_id)
    log.debug("Computing similarity for a chunk of filters")
    span = compute_filter_similarity.span
    log.debug("Checking that the resource exists (in case of job being canceled)")
    with DBConn() as db:
        if not check_project_exists(db, project_id) or not check_run_exists(db, project_id, run_id):
            log.info("Stopping as project or run not found in database.")
            return None

    t0 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 1")
    chunk_dp1, chunk_dp1_size = get_chunk_from_object_store(chunk_info_dp1, encoding_size)

    t1 = time.time()
    log.debug("Fetching and deserializing chunk of filters for dataprovider 2")
    chunk_dp2, chunk_dp2_size = get_chunk_from_object_store(chunk_info_dp2, encoding_size)
    t2 = time.time()
    span.log_kv({'event': 'chunks are fetched and deserialized'})
    log.debug("Calculating filter similarity")
    span.log_kv({'size1': chunk_dp1_size, 'size2': chunk_dp2_size})
    chunk_results = anonlink.entitymatch.calculate_filter_similarity(chunk_dp1, chunk_dp2,
                                                                     threshold=threshold,
                                                                     k=min(chunk_dp1_size, chunk_dp2_size),
                                                                     use_python=False)
    t3 = time.time()
    span.log_kv({'event': 'similarities calculated'})

    # Update the number of comparisons completed
    comparisons_computed = chunk_dp1_size * chunk_dp2_size
    save_current_progress(comparisons_computed, run_id)

    t4 = time.time()

    partial_sparse_result = []
    # offset chunk's index
    offset_dp1 = chunk_info_dp1[1]
    offset_dp2 = chunk_info_dp2[1]

    log.debug("Offset DP1 by: {}, DP2 by: {}".format(offset_dp1, offset_dp2))
    for (ia, score, ib) in chunk_results:
        partial_sparse_result.append((ia + offset_dp1, ib + offset_dp2, score))

    t5 = time.time()

    num_results = len(partial_sparse_result)
    if num_results > 0:
        result_filename = 'chunk-res-{}.csv'.format(generate_code(12))
        log.info("Writing {} intermediate results to file: {}".format(num_results, result_filename))

        with open(result_filename, 'wt') as f:
            csvwriter = csv.writer(f)
            csvwriter.writerows(partial_sparse_result)

        # Now write these to the object store. and return the filename and summary
        # Will write a csv file for now
        mc = connect_to_object_store()
        try:
            mc.fput_object(Config.MINIO_BUCKET, result_filename, result_filename)
        except minio.ResponseError as err:
            log.warning("Failed to store result in minio")
            raise

        # If we don't delete the file we *do* run out of space
        os.remove(result_filename)
    else:
        result_filename = None
    t6 = time.time()

    log.info("run={} Comparisons: {}, Links above threshold: {}".format(run_id, comparisons_computed, len(chunk_results)))
    log.info("Prep: {:.3f} + {:.3f}, Solve: {:.3f}, Progress: {:.3f}, Offset: {:.3f}, Save: {:.3f}, Total: {:.3f}".format(
        t1 - t0,
        t2 - t1,
        t3 - t2,
        t4 - t3,
        t4 - t4,
        t6 - t5,
        t6 - t0)
    )
    return num_results, result_filename