def test_portal_worker(self):
    """End-to-end worker test: partition the inputs, then reduce every
    output partition and verify the total item count."""
    self._prepare_test()

    # Build the map (partitioner) task over all input partitions.
    map_task = dp_pb.MapTask()
    map_task.output_base_dir = self._partition_output_dir
    map_task.output_partition_num = self._output_partition_num
    map_task.partition_id = 0
    map_task.task_name = 'map_part_{}'.format(map_task.partition_id)
    map_task.part_field = 'example_id'
    map_task.data_portal_type = dp_pb.DataPortalType.Streaming
    map_task.fpaths.extend(
        self._get_input_fpath(pid)
        for pid in range(self._input_partition_num))

    # Run the partitioner and verify its output layout.
    task = dp_pb.NewTaskResponse()
    task.map_task.CopyFrom(map_task)
    self._portal_worker._run_map_task(task.map_task)
    self._check_partitioner(task.map_task)

    # Reduce (merge) each output partition and accumulate item counts.
    total_cnt = 0
    for pid in range(self._output_partition_num):
        reduce_task = dp_pb.ReduceTask()
        reduce_task.map_base_dir = self._partition_output_dir
        reduce_task.reduce_base_dir = self._merge_output_dir
        reduce_task.partition_id = pid
        reduce_task.task_name = 'reduce_part_{}'.format(pid)
        self._portal_worker._run_reduce_task(reduce_task)
        total_cnt += self._check_merge(reduce_task)

    # Every input item must survive the partition+merge round trip.
    self.assertEqual(total_cnt,
                     self._partition_item_num * self._input_partition_num)
    self._clean_up()
def test_portal_worker(self):
    """Exercise the worker's map (partition) and reduce (merge) paths
    and check that no item is lost across the round trip."""
    self._prepare_test()

    # Assemble the map task covering every input partition file.
    map_task = dp_pb.MapTask()
    map_task.output_base_dir = self._partition_output_dir
    map_task.output_partition_num = self._output_partition_num
    map_task.partition_id = 0
    map_task.fpaths.extend(
        self._get_input_fpath(pid)
        for pid in range(self._input_partition_num))

    # Run the partitioner and validate its output.
    task = dp_pb.NewTaskResponse()
    task.map_task.CopyFrom(map_task)
    self._portal_worker._run_map_task(task.map_task)
    self._check_partitioner(task.map_task)

    # Merge every output partition, summing the merged item counts.
    total_cnt = 0
    for pid in range(self._output_partition_num):
        reduce_task = dp_pb.ReduceTask()
        reduce_task.map_base_dir = self._partition_output_dir
        reduce_task.reduce_base_dir = self._merge_output_dir
        reduce_task.partition_id = pid
        self._portal_worker._run_reduce_task(reduce_task)
        total_cnt += self._check_merge(reduce_task)

    # Total merged items must equal total generated items.
    self.assertEqual(total_cnt,
                     self._partition_item_num * self._input_partition_num)
    self._clean_up()
def _create_reduce_task(self, partition_id):
    """Build a ReduceTask for *partition_id* of the job currently being
    processed; requires an active processing job."""
    assert self._processing_job is not None
    job_id = self._processing_job.job_id
    map_dir = self._map_output_dir(job_id)
    reduce_dir = self._reduce_output_dir(job_id)
    return dp_pb.ReduceTask(map_base_dir=map_dir,
                            reduce_base_dir=reduce_dir,
                            partition_id=partition_id)
def _create_reduce_task(self, rank_id, partition_id):
    """Build a named ReduceTask for *partition_id* of the active job and
    log its allocation to worker *rank_id*."""
    assert self._processing_job is not None
    job_id = self._processing_job.job_id
    reduce_dir = self._reduce_output_dir(job_id)
    task_name = '{}-dp_portal_job_{:08}-part-{:04}-reduce'.format(
        self._portal_manifest.name, job_id, partition_id)
    logging.info("Data portal worker-%d is allocated reduce task %s for "
                 "partition %d of job %d. the reduce base dir %s"
                 "-----------------\n",
                 rank_id, task_name, partition_id, job_id, reduce_dir)
    return dp_pb.ReduceTask(task_name=task_name,
                            map_base_dir=self._map_output_dir(job_id),
                            reduce_base_dir=reduce_dir,
                            partition_id=partition_id)
def test_portal_worker(self):
    """Run the map phase via the helper, then reduce each partition,
    asserting the item count is preserved at both stages."""
    self._prepare_test()

    # Map phase: run the partitioner and verify all items were emitted.
    task = self._run_map_task()
    expected_total = self._partition_item_num * self._input_partition_num
    total_cnt = self._check_partitioner(task.map_task)
    self.assertEqual(total_cnt, expected_total)

    # Reduce phase: merge every output partition and recount the items.
    total_cnt = 0
    for pid in range(self._output_partition_num):
        reduce_task = dp_pb.ReduceTask()
        reduce_task.map_base_dir = self._partition_output_dir
        reduce_task.reduce_base_dir = self._merge_output_dir
        reduce_task.partition_id = pid
        reduce_task.task_name = 'reduce_part_{}'.format(pid)
        self._portal_worker._run_reduce_task(reduce_task)
        total_cnt += self._check_merge(reduce_task)
    self.assertEqual(total_cnt, expected_total)

    self._clean_up()