Пример #1
0
    def test_portal_worker(self):
        """Run one streaming map task, then one reduce task per partition.

        The map task is fed every input file and partitions on the
        ``example_id`` field.  After the partitioner output is checked, a
        reduce task is run for each output partition and the counts
        returned by ``_check_merge`` are summed; the sum must equal
        ``_partition_item_num * _input_partition_num``.
        """
        self._prepare_test()
        # Clean up in ``finally`` so that a failed assertion or a worker
        # exception does not leave test artifacts behind for later runs.
        try:
            map_task = dp_pb.MapTask()
            map_task.output_base_dir = self._partition_output_dir
            map_task.output_partition_num = self._output_partition_num
            map_task.partition_id = 0
            map_task.task_name = 'map_part_{}'.format(map_task.partition_id)
            map_task.part_field = 'example_id'
            map_task.data_portal_type = dp_pb.DataPortalType.Streaming
            for partition_id in range(self._input_partition_num):
                map_task.fpaths.append(self._get_input_fpath(partition_id))

            # partitioner: hand the map task to the worker wrapped in a
            # NewTaskResponse message.
            task = dp_pb.NewTaskResponse()
            task.map_task.CopyFrom(map_task)
            self._portal_worker._run_map_task(task.map_task)

            self._check_partitioner(task.map_task)

            # merge: one reduce task per output partition.
            total_cnt = 0
            for partition_id in range(self._output_partition_num):
                reduce_task = dp_pb.ReduceTask(
                    map_base_dir=self._partition_output_dir,
                    reduce_base_dir=self._merge_output_dir,
                    partition_id=partition_id,
                    task_name='reduce_part_{}'.format(partition_id))
                self._portal_worker._run_reduce_task(reduce_task)
                total_cnt += self._check_merge(reduce_task)

            self.assertEqual(
                total_cnt,
                self._partition_item_num * self._input_partition_num)
        finally:
            self._clean_up()
    def test_portal_worker(self):
        """Run one map task, then one reduce task per output partition.

        The map task is fed every input file.  After the partitioner
        output is checked, a reduce task is run for each output partition
        and the counts returned by ``_check_merge`` are summed; the sum
        must equal ``_partition_item_num * _input_partition_num``.
        """
        self._prepare_test()
        # Clean up in ``finally`` so that a failed assertion or a worker
        # exception does not leave test artifacts behind for later runs.
        try:
            map_task = dp_pb.MapTask()
            map_task.output_base_dir = self._partition_output_dir
            map_task.output_partition_num = self._output_partition_num
            map_task.partition_id = 0
            for partition_id in range(self._input_partition_num):
                map_task.fpaths.append(self._get_input_fpath(partition_id))

            # partitioner: hand the map task to the worker wrapped in a
            # NewTaskResponse message.
            task = dp_pb.NewTaskResponse()
            task.map_task.CopyFrom(map_task)
            self._portal_worker._run_map_task(task.map_task)

            self._check_partitioner(task.map_task)

            # merge: one reduce task per output partition.
            total_cnt = 0
            for partition_id in range(self._output_partition_num):
                reduce_task = dp_pb.ReduceTask(
                    map_base_dir=self._partition_output_dir,
                    reduce_base_dir=self._merge_output_dir,
                    partition_id=partition_id)
                self._portal_worker._run_reduce_task(reduce_task)
                total_cnt += self._check_merge(reduce_task)

            self.assertEqual(
                total_cnt,
                self._partition_item_num * self._input_partition_num)
        finally:
            self._clean_up()
Пример #3
0
 def _create_reduce_task(self, partition_id):
     """Build the ReduceTask message for ``partition_id`` of the current job.

     A job must currently be in processing (asserted).  The map and
     reduce base directories are derived from that job's id.
     """
     assert self._processing_job is not None
     current_job_id = self._processing_job.job_id
     map_dir = self._map_output_dir(current_job_id)
     reduce_dir = self._reduce_output_dir(current_job_id)
     return dp_pb.ReduceTask(map_base_dir=map_dir,
                             reduce_base_dir=reduce_dir,
                             partition_id=partition_id)
Пример #4
0
 def _create_reduce_task(self, rank_id, partition_id):
     """Build and log the ReduceTask for ``partition_id`` of the current job.

     A job must currently be in processing (asserted).  The task name is
     composed from the portal manifest name, the job id and the partition
     id.  ``rank_id`` is only used in the log message identifying the
     worker that receives the task.
     """
     assert self._processing_job is not None
     current_job_id = self._processing_job.job_id
     name_parts = (self._portal_manifest.name, current_job_id, partition_id)
     reduce_task_name = \
         '{}-dp_portal_job_{:08}-part-{:04}-reduce'.format(*name_parts)
     log_template = ("Data portal worker-%d is allocated reduce task %s for "
                     "partition %d of job %d. the reduce base dir %s"
                     "-----------------\n")
     logging.info(log_template, rank_id, reduce_task_name, partition_id,
                  current_job_id, self._reduce_output_dir(current_job_id))
     task_fields = dict(
         task_name=reduce_task_name,
         map_base_dir=self._map_output_dir(current_job_id),
         reduce_base_dir=self._reduce_output_dir(current_job_id),
         partition_id=partition_id)
     return dp_pb.ReduceTask(**task_fields)
Пример #5
0
    def test_portal_worker(self):
        """Run the map task, then one reduce task per output partition.

        The count returned by ``_check_partitioner`` and the summed counts
        returned by ``_check_merge`` must each equal
        ``_partition_item_num * _input_partition_num``.
        """
        self._prepare_test()
        expected_cnt = self._partition_item_num * self._input_partition_num
        # Clean up in ``finally`` so that a failed assertion or a worker
        # exception does not leave test artifacts behind for later runs.
        try:
            task = self._run_map_task()
            total_cnt = self._check_partitioner(task.map_task)
            self.assertEqual(total_cnt, expected_cnt)

            # merge: one reduce task per output partition.
            total_cnt = 0
            for partition_id in range(self._output_partition_num):
                reduce_task = dp_pb.ReduceTask(
                    map_base_dir=self._partition_output_dir,
                    reduce_base_dir=self._merge_output_dir,
                    partition_id=partition_id,
                    task_name='reduce_part_{}'.format(partition_id))
                self._portal_worker._run_reduce_task(reduce_task)
                total_cnt += self._check_merge(reduce_task)

            self.assertEqual(total_cnt, expected_cnt)
        finally:
            self._clean_up()