Пример #1
0
def test_get_id_range_for_partition_with_one_over():
    """Verify partition bounds when the ID range spills one ID past the last full partition.

    IDs 1..101 with a partition size of 20 leave exactly one ID (101) for the final
    partition. This layout previously triggered a bug.
    """
    first_id, last_id = 1, 101
    chunk = 20
    expected_id_count = last_id - first_id + 1  # IDs in the continuous range
    assert expected_id_count % chunk == 1  # exactly one ID over a multiple of the partition size

    ctrl = Controller({"partition_size": chunk})
    ctrl.min_id = first_id
    ctrl.max_id = last_id
    ctrl.record_count = expected_id_count  # pretend a record exists for every ID in the range
    ctrl.config["partitions"] = ctrl.determine_partitions()
    assert ctrl.config["partitions"] == ceil(expected_id_count / chunk)
    partition_indexes = range(ctrl.config["partitions"])

    # First partition: starts at the minimum ID and is completely full
    low, high = ctrl.get_id_range_for_partition(partition_indexes[0])
    assert low == first_id
    assert high == low + (chunk - 1)

    # Second partition: starts one full partition after the minimum ID
    low, high = ctrl.get_id_range_for_partition(partition_indexes[1])
    assert low == first_id + chunk
    assert high == low + (chunk - 1)

    # Final partition: must reach max_id, and here it holds only that single ID
    low, high = ctrl.get_id_range_for_partition(partition_indexes[-1])
    expected_low = first_id + (chunk * partition_indexes[-1])
    assert low == expected_low == 101
    assert high == last_id == 101

    # NOTE(review): presumably _remove_seen_ids strips every ID covered by some partition,
    # so an empty result means nothing was missed -- confirm against the helper.
    every_id = set(range(first_id, last_id + 1))
    assert _remove_seen_ids(ctrl, every_id) == set()
Пример #2
0
def test_get_id_range_for_partition_with_evenly_divisible():
    """Verify partition bounds when the ID range divides evenly into full partitions.

    IDs 1..100 with a partition size of 20 fill every partition completely, so the
    last partition must span exactly the final 20 IDs.
    """
    first_id, last_id = 1, 100
    chunk = 20
    expected_id_count = last_id - first_id + 1  # IDs in the continuous range
    assert expected_id_count % chunk == 0  # no remainder: every partition is full

    ctrl = Controller({"partition_size": chunk})
    ctrl.min_id = first_id
    ctrl.max_id = last_id
    ctrl.record_count = expected_id_count  # pretend a record exists for every ID in the range
    ctrl.config["partitions"] = ctrl.determine_partitions()
    assert ctrl.config["partitions"] == ceil(expected_id_count / chunk)
    partition_indexes = range(ctrl.config["partitions"])

    # First partition: starts at the minimum ID and is completely full
    low, high = ctrl.get_id_range_for_partition(partition_indexes[0])
    assert low == first_id
    assert high == low + (chunk - 1)

    # Second partition: starts one full partition after the minimum ID
    low, high = ctrl.get_id_range_for_partition(partition_indexes[1])
    assert low == first_id + chunk
    assert high == low + (chunk - 1)

    # Final partition: its lower bound must agree whether counted down from max_id
    # or counted up from min_id, and its upper bound must be max_id itself
    low, high = ctrl.get_id_range_for_partition(partition_indexes[-1])
    low_from_top = last_id - chunk + 1
    low_from_bottom = first_id + (chunk * partition_indexes[-1])
    assert low == low_from_top == low_from_bottom
    assert high == last_id

    # NOTE(review): presumably _remove_seen_ids strips every ID covered by some partition,
    # so an empty result means nothing was missed -- confirm against the helper.
    every_id = set(range(first_id, last_id + 1))
    assert _remove_seen_ids(ctrl, every_id) == set()
Пример #3
0
def test_get_id_range_for_partition_with_empty_partitions():
    """Verify partition bounds when records are sparse across an evenly divisible ID range.

    The ID range is 1..100 with a partition size of 20, but only eight IDs have records,
    so the 21..40 and 61..80 slices contain no records at all. The partition count is
    still expected to follow the ID range rather than the record count. There was a bug
    here before.
    """
    first_id, last_id = 1, 100
    chunk = 20
    expected_id_count = last_id - first_id + 1  # IDs in the continuous range
    sparse_ids = {1, 5, 7, 15, 19, 20, 41, 100}

    ctrl = Controller({"partition_size": chunk})
    ctrl.min_id = first_id
    ctrl.max_id = last_id
    ctrl.record_count = len(sparse_ids)  # far fewer records than IDs in the range
    ctrl.config["partitions"] = ctrl.determine_partitions()
    assert ctrl.config["partitions"] == ceil(expected_id_count / chunk)
    partition_indexes = range(ctrl.config["partitions"])

    # First partition: starts at the minimum ID and spans a full partition of IDs
    low, high = ctrl.get_id_range_for_partition(partition_indexes[0])
    assert low == first_id
    assert high == low + (chunk - 1)

    # Second partition: bounds are still a full slice even though no record falls in it
    low, high = ctrl.get_id_range_for_partition(partition_indexes[1])
    assert low == first_id + chunk
    assert high == low + (chunk - 1)

    # Final partition: must reach all the way up to max_id
    low, high = ctrl.get_id_range_for_partition(partition_indexes[-1])
    expected_low = first_id + (chunk * partition_indexes[-1])
    assert low == expected_low
    assert high == last_id

    # NOTE(review): presumably _remove_seen_ids strips every ID covered by some partition,
    # so an empty result means no record ID was missed -- confirm against the helper.
    assert _remove_seen_ids(ctrl, sparse_ids) == set()
Пример #4
0
def test_get_id_range_for_partition_one_records():
    """Verify partition bounds when the whole ID range is a single ID.

    With min_id == max_id == 1 and a partition size of 10000, both the first and the
    last partition index are expected to yield the bounds (1, 1).
    """
    first_id = last_id = 1
    expected_id_count = last_id - first_id + 1  # IDs in the continuous range

    ctrl = Controller({"partition_size": 10000})
    ctrl.min_id = first_id
    ctrl.max_id = last_id
    ctrl.record_count = expected_id_count  # pretend a record exists for every ID in the range
    ctrl.config["partitions"] = ctrl.determine_partitions()
    partition_indexes = range(ctrl.config["partitions"])

    # Looked up via the first partition index
    low, high = ctrl.get_id_range_for_partition(partition_indexes[0])
    assert low == first_id
    assert high == last_id

    # Looked up via the last partition index: same expected bounds
    low, high = ctrl.get_id_range_for_partition(partition_indexes[-1])
    assert low == first_id
    assert high == last_id

    # NOTE(review): presumably _remove_seen_ids strips every ID covered by some partition,
    # so an empty result means nothing was missed -- confirm against the helper.
    every_id = set(range(first_id, last_id + 1))
    assert _remove_seen_ids(ctrl, every_id) == set()