    def test_iterate(self):
        test_dict = {1 : {2 : 3,
                          4 : 5},
                     6 : {7 : 8},
                     9 : 10}

        expected_keys = [(1,2), (1,4), (6,7), (9,)]
        self.assertEqual(expected_keys, list(flattened_keys(test_dict)))

    def test_sort_order(self):
        test_dict = {1 : {2 : 3,
                          4 : 5},
                     6 : {7 : 8},
                     9 : 10}

        key_order = [6,9,7,2,4,1]

        def my_sort_function(x):
            return key_order.index(x[0])

        expected_keys = [(6,7), (9,), (1,2), (1,4),]

        self.assertEqual(expected_keys, list(flattened_keys(
            test_dict, sort_function=my_sort_function)))

    def test_multi_job_scan_share(self):
        job_ids = [1, 2]
        phase_zero_prefix_size = 4242

        worker_inputs = []
        worker_inputs.append({
                "host_A" : {
                    0 : [("file_A_1", 1000), ("file_A_2", 3000)],
                    1 : [("file_A_3", 1000)],
                    2 : [("file_A_4", 500), ("file_A_5", 6000)],
                    3 : [("file_A_6", 1000), ("file_A_7", 2000),
                         ("file_A_8", 1000)]
                    },
                "host_B" : {
                    0 : [("file_A_2", 3000)],
                    1 : [("file_A_3", 1000)],
                    2 : [("file_A_4", 500)],
                    3 : [("file_A_6", 1000)]
                    }
                })
        worker_inputs.append({
                "host_A" : {
                    0 : [("file_A_1", 1000), ("file_A_2", 3000)],
                    1 : [("file_A_3", 1000)],
                    2 : [("file_A_4", 500), ("file_A_5", 6000)],
                    3 : [("file_A_6", 1000), ("file_A_7", 2000)]
                    },
                "host_B" : {
                    0 : [("file_A_2", 3000)],
                    1 : [("file_A_3", 1000)],
                    2 : [("file_A_4", 500)],
                    3 : [("file_A_7", 2000)]
                    }
                })

        read_requests = generate_read_requests(
            worker_inputs, phase_zero_prefix_size, job_ids)

        # Expected file assignments after scan-sharing merge
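        # (each file is tagged with the list of job IDs that scan it, e.g.
        # file_A_8 appears only in job 1's inputs, so it is tagged [1])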

        expected_assignments = {
            "host_A" : {
                0 : [("file_A_1", 1000, [1,2]), ("file_A_2", 3000, [1,2])],
                1 : [("file_A_3", 1000, [1,2])],
                2 : [("file_A_4", 500, [1,2]), ("file_A_5", 6000, [1,2])],
                3 : [("file_A_6", 1000, [1,2]), ("file_A_7", 2000, [1,2]),
                     ("file_A_8", 1000, [1])]
                },
            "host_B" : {
                0 : [("file_A_2", 3000, [1,2])],
                1 : [("file_A_3", 1000, [1,2])],
                2 : [("file_A_4", 500, [1,2])],
                3 : [("file_A_6", 1000, [1]), ("file_A_7", 2000, [2])]
                }
            }

        for host, worker in utils.flattened_keys(expected_assignments):
            assignments = expected_assignments[host][worker]
            reqs = read_requests[host][worker]

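            # Two phase zero passes (one per job), each issuing a prefix read
            # for every file and ending with a halt request, plus one phase
            # one pass give 3 * (number of files + 1) requests per worker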
            self.assertEqual((len(assignments) + 1) * 3, len(reqs))

            req_index = 0

            # Should first have a phase zero prefix for each file
            for job_id in job_ids:
                for assignment in assignments:
                    read_request = reqs[req_index]

                    self.assertEqual(assignment[0], read_request["path"])
                    self.assertEqual(
                        phase_zero_prefix_size, read_request["length"])
                    self.assertEqual([job_id], read_request["job_ids"])
                    self.assertEqual(0, read_request["offset"])
                    self.assertEqual(0, read_request["type"])

                    req_index += 1

                # These read requests should be followed by a halt request
                self.assertEqual(1, reqs[req_index]["type"])
                self.assertEqual([job_id], reqs[req_index]["job_ids"])
                req_index += 1

            # Next, should have a full phase one read request for each file
            for assignment in assignments:
                read_request = reqs[req_index]

                self.assertEqual(assignment[0], read_request["path"])
                self.assertEqual(assignment[1], read_request["length"])
                self.assertEqual(assignment[2], read_request["job_ids"])
                self.assertEqual(0, read_request["offset"])
                self.assertEqual(0, read_request["type"])

                req_index += 1

            # These read requests should be followed by a halt request
            self.assertEqual(1, reqs[req_index]["type"])
            self.assertEqual(job_ids, reqs[req_index]["job_ids"])

    def test_single_job(self):
        worker_inputs = {
            "host_A" : {
                0 : [("file_A_1", 1000), ("file_A_2", 3000)],
                1 : [("file_A_3", 1000)],
                2 : [("file_A_4", 500), ("file_A_5", 6000)],
                3 : [("file_A_6", 1000), ("file_A_7", 2000)]
                },
            "host_B" : {
                0 : [("file_A_2", 3000)],
                1 : [("file_A_3", 1000)],
                2 : [("file_A_4", 500)],
                3 : [("file_A_6", 1000), ("file_A_7", 2000)]
                }
            }

        phase_zero_prefix_size = 4242
        job_ids = [1]

        read_requests = generate_read_requests(
            [worker_inputs], phase_zero_prefix_size, job_ids)

        for host, worker in utils.flattened_keys(worker_inputs):
            worker_reqs = read_requests[host][worker]

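            # One phase zero pass and one phase one pass, each ending with a
            # halt request, give 2 * (number of files + 1) requests per worker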
            self.assertEqual(
                2 * (len(worker_inputs[host][worker]) + 1), len(worker_reqs))

            req_index = 0

            # Should first have a phase zero prefix for each file
            for filename, length in worker_inputs[host][worker]:
                req = worker_reqs[req_index]

                self.assertEqual(job_ids, req["job_ids"])
                self.assertEqual(filename, req["path"])
                self.assertEqual(0, req["offset"])
                self.assertEqual(0, req["type"])
                self.assertEqual(phase_zero_prefix_size, req["length"])

                req_index += 1

            # Halt request for phase zero should come after that
            self.assertEqual(1, worker_reqs[req_index]["type"])
            self.assertEqual(job_ids, worker_reqs[req_index]["job_ids"])

            req_index += 1

            # Next, should have a full phase one read request for each file
            for filename, length in worker_inputs[host][worker]:
                req = worker_reqs[req_index]

                self.assertEqual(job_ids, req["job_ids"])
                self.assertEqual(filename, req["path"])
                self.assertEqual(0, req["offset"])
                self.assertEqual(0, req["type"])
                self.assertEqual(length, req["length"])

                req_index += 1

            # These read requests should be followed by a halt request
            self.assertEqual(1, worker_reqs[req_index]["type"])
            self.assertEqual(job_ids, worker_reqs[req_index]["job_ids"])

            req_index += 1


import itertools
import random

import utils


def generate_read_requests(
    job_inputs, phase_zero_sample_rate, phase_zero_sample_points_per_file,
    tuple_start_offset, job_ids, phases=(0, 1)):

    assert phase_zero_sample_rate <= 1.0,\
        "Cannot have a sample rate greater than 1. Got %f" % (
        phase_zero_sample_rate)

    scan_shared_inputs = utils.NestedDict(3, list)

    for (job_id, worker_inputs) in itertools.izip(job_ids, job_inputs):
        for host, worker in utils.flattened_keys(worker_inputs):
            for file_info in worker_inputs[host][worker]:
                scan_shared_inputs[host][worker][file_info].append(job_id)

    read_requests = utils.NestedDict(2, list)
    phase_one_read_requests = utils.NestedDict(2, list)

    for phase in sorted(phases):
        if phase == 0:
            # Read file prefixes for phase zero sampling for each job
            for job_id in job_ids:
                for host, worker, file_info in utils.flattened_keys(
                    scan_shared_inputs):

                    file_url, file_length = file_info
                    # Compute the number of sampled bytes from the sample rate
                    sample_length = file_length * phase_zero_sample_rate

                    # Compute the number of bytes per sample point from the
                    # number of sample points within a file
                    if phase_zero_sample_points_per_file > 1:
                        assert tuple_start_offset != 0, "Cannot sample " \
                            "multiple points per file without specifying a " \
                            "tuple start offset."
                    sample_length_per_sample_point = \
                        sample_length / phase_zero_sample_points_per_file
                    sample_point_offset = \
                        file_length / phase_zero_sample_points_per_file

                    # If we know tuple boundary offsets, then force whole tuples
                    if tuple_start_offset > 0:
                        sample_length_per_sample_point -= \
                            sample_length_per_sample_point % tuple_start_offset
                        sample_point_offset -= \
                            sample_point_offset % tuple_start_offset
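                    # For example (hypothetical numbers): file_length = 6000,
                    # a 0.25 sample rate, 3 sample points per file and
                    # tuple_start_offset = 100 give 500 sampled bytes at each
                    # of the offsets 0, 2000 and 4000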

                    # At this point the per-point sample length is at most the
                    # sample point offset, and both are multiples of the tuple
                    # length when tuples are fixed-size

                    for i in xrange(phase_zero_sample_points_per_file):
                        # Chunk up the sample data into fixed size samples
                        # spread evenly across the file
                        read_requests[host][worker].append(
                            generate_read_request(
                                [job_id], file_url, i * sample_point_offset,
                                sample_length_per_sample_point))

                # After this job's samples, append a halt request to every
                # worker's request queue
                for host, worker in utils.flattened_keys(read_requests):
                    read_requests[host][worker].append(generate_halt_request(
                            [job_id]))

        elif phase == 1:
            for host, worker, file_info in utils.flattened_keys(
                scan_shared_inputs):

                file_url, file_length = file_info
                file_jobs = scan_shared_inputs[host][worker][file_info]

                phase_one_read_requests[host][worker].append(
                    generate_read_request(file_jobs, file_url, 0, file_length))

            for host, worker in utils.flattened_keys(phase_one_read_requests):
                # Randomly permute input files in phase one.
                requests = list(phase_one_read_requests[host][worker])
                random.shuffle(requests)
                for request in requests:
                    read_requests[host][worker].append(request)

                read_requests[host][worker].append(generate_halt_request(
                        job_ids))

    return read_requests
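

# The tests and generate_read_requests() above depend on several helpers that
# are not part of this excerpt: utils.flattened_keys, utils.NestedDict,
# generate_read_request and generate_halt_request. The definitions below are
# minimal sketches inferred from how that code uses them (test_iterate and
# test_sort_order pin down flattened_keys; the request dicts mirror the fields
# the other tests assert on, with type 0 meaning "read" and type 1 meaning
# "halt"). They are illustrations, not the real implementations.

from collections import defaultdict


def flattened_keys(nested_dict, sort_function=lambda item: item[0]):
    # Yield the tuple of keys leading to each non-dict leaf, visiting the
    # items at every level in the order imposed by sort_function (plain key
    # order by default)
    for key, value in sorted(nested_dict.items(), key=sort_function):
        if isinstance(value, dict):
            for sub_keys in flattened_keys(value, sort_function):
                yield (key,) + sub_keys
        else:
            yield (key,)


class NestedDict(defaultdict):
    # NestedDict(depth, leaf_type) behaves like a dict of dicts ... of
    # leaf_type nested `depth` levels deep, creating intermediate levels on
    # first access
    def __init__(self, depth, leaf_type):
        if depth == 1:
            super(NestedDict, self).__init__(leaf_type)
        else:
            super(NestedDict, self).__init__(
                lambda: NestedDict(depth - 1, leaf_type))


def generate_read_request(job_ids, path, offset, length):
    # A type 0 ("read") request covering [offset, offset + length) of a file
    return {"type": 0, "job_ids": job_ids, "path": path, "offset": offset,
            "length": length}


def generate_halt_request(job_ids):
    # A type 1 ("halt") request marking the end of a batch of read requests
    return {"type": 1, "job_ids": job_ids}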