def test_portal_worker(self):
    """Run one map task and every reduce task through the portal worker.

    A single streaming map task (partition 0, partitioned on
    ``example_id``) is given one input file per input partition. One
    reduce task is then run for each output partition, and the counts
    returned by ``_check_merge`` are summed. The sum must equal
    ``_partition_item_num * _input_partition_num``.
    """
    self._prepare_test()
    try:
        map_task = dp_pb.MapTask()
        map_task.output_base_dir = self._partition_output_dir
        map_task.output_partition_num = self._output_partition_num
        map_task.partition_id = 0
        map_task.task_name = 'map_part_{}'.format(map_task.partition_id)
        map_task.part_field = 'example_id'
        map_task.data_portal_type = dp_pb.DataPortalType.Streaming
        for input_id in range(self._input_partition_num):
            map_task.fpaths.append(self._get_input_fpath(input_id))

        # partitioner
        task = dp_pb.NewTaskResponse()
        task.map_task.CopyFrom(map_task)
        self._portal_worker._run_map_task(task.map_task)
        self._check_partitioner(task.map_task)

        # merge
        total_cnt = 0
        for partition_id in range(self._output_partition_num):
            reduce_task = dp_pb.ReduceTask()
            reduce_task.map_base_dir = self._partition_output_dir
            reduce_task.reduce_base_dir = self._merge_output_dir
            reduce_task.partition_id = partition_id
            reduce_task.task_name = 'reduce_part_{}'.format(partition_id)
            self._portal_worker._run_reduce_task(reduce_task)
            total_cnt += self._check_merge(reduce_task)

        self.assertEqual(
            total_cnt,
            self._partition_item_num * self._input_partition_num,
        )
    finally:
        # Keep _clean_up from being skipped when a task or an assertion
        # above raises; the original exception still propagates.
        self._clean_up()
def _create_map_task(self, rank_id, partition_id):
    """Build the map task for one partition of the job being processed.

    Args:
        rank_id: worker rank; only used in the log output here.
        partition_id: partition whose input files are selected.

    Returns:
        A ``dp_pb.MapTask`` carrying the selected file paths, the job's
        map output directory, the output partition count, the partition
        field, and the data portal type read from the synced manifest.

    Raises:
        AssertionError: if there is no job currently being processed.
    """
    assert self._processing_job is not None
    job = self._processing_job
    # A file belongs to this partition when its hash, modulo the output
    # partition count, equals partition_id.
    # NOTE(review): if fpath is a str, hash() is salted per interpreter
    # process unless PYTHONHASHSEED is fixed, so this assignment is
    # presumably not stable across restarts -- confirm whether that matters.
    map_fpaths = [
        fpath for fpath in job.fpaths
        if hash(fpath) % self._output_partition_num == partition_id
    ]
    task_name = '{}-dp_portal_job_{:08}-part-{:04}-map'.format(
        self._portal_manifest.name, job.job_id, partition_id
    )
    logging.info(
        "Data portal worker-%d is allocated map task %s for "
        "partition %d of job %d. the map task has %d files"
        "-----------------\n",
        rank_id, task_name, partition_id, job.job_id, len(map_fpaths)
    )
    for idx, selected in enumerate(map_fpaths):
        logging.info("%d. %s", idx, selected)
    logging.info("---------------------------------\n")
    manifest = self._sync_portal_manifest()
    return dp_pb.MapTask(
        task_name=task_name,
        fpaths=map_fpaths,
        output_base_dir=self._map_output_dir(job.job_id),
        output_partition_num=self._output_partition_num,
        partition_id=partition_id,
        part_field=self._get_part_field(),
        data_portal_type=manifest.data_portal_type,
    )
def test_portal_worker(self):
    """Run one map task and every reduce task through the portal worker.

    A single map task (partition 0) is given one input file per input
    partition. One reduce task is then run for each output partition,
    and the counts returned by ``_check_merge`` are summed. The sum must
    equal ``_partition_item_num * _input_partition_num``.
    """
    self._prepare_test()
    try:
        map_task = dp_pb.MapTask()
        map_task.output_base_dir = self._partition_output_dir
        map_task.output_partition_num = self._output_partition_num
        map_task.partition_id = 0
        for input_id in range(self._input_partition_num):
            map_task.fpaths.append(self._get_input_fpath(input_id))

        # partitioner
        task = dp_pb.NewTaskResponse()
        task.map_task.CopyFrom(map_task)
        self._portal_worker._run_map_task(task.map_task)
        self._check_partitioner(task.map_task)

        # merge
        total_cnt = 0
        for partition_id in range(self._output_partition_num):
            reduce_task = dp_pb.ReduceTask()
            reduce_task.map_base_dir = self._partition_output_dir
            reduce_task.reduce_base_dir = self._merge_output_dir
            reduce_task.partition_id = partition_id
            self._portal_worker._run_reduce_task(reduce_task)
            total_cnt += self._check_merge(reduce_task)

        self.assertEqual(
            total_cnt,
            self._partition_item_num * self._input_partition_num,
        )
    finally:
        # Keep _clean_up from being skipped when a task or an assertion
        # above raises; the original exception still propagates.
        self._clean_up()
def _create_map_task(self, rank_id, partition_id):
    """Build the map task for one partition of the job being processed.

    Args:
        rank_id: worker rank; not used in this method body.
        partition_id: partition whose input files are selected.

    Returns:
        A ``dp_pb.MapTask`` carrying the selected file paths, the job's
        map output directory, the output partition count, and the
        partition id.

    Raises:
        AssertionError: if there is no job currently being processed.
    """
    assert self._processing_job is not None
    job = self._processing_job
    # Files are bucketed by the hash of their base name, so the directory
    # part of the path does not affect which partition a file lands in.
    # NOTE(review): if the base name is a str, hash() is salted per
    # interpreter process unless PYTHONHASHSEED is fixed, so this
    # assignment is presumably not stable across restarts -- confirm.
    selected = [
        fpath for fpath in job.fpaths
        if hash(path.basename(fpath)) % self._output_partition_num
        == partition_id
    ]
    return dp_pb.MapTask(
        fpaths=selected,
        output_base_dir=self._map_output_dir(job.job_id),
        output_partition_num=self._output_partition_num,
        partition_id=partition_id,
    )
def _run_map_task(self):
    """Build a streaming map task over all input files and run it.

    The task targets partition 0, is partitioned on ``example_id``, and
    lists one input file per input partition. It is copied into a
    ``dp_pb.NewTaskResponse`` and that copy is handed to the portal worker.

    Returns:
        The ``dp_pb.NewTaskResponse`` whose ``map_task`` was executed.
    """
    map_partition_id = 0
    # Keyword arguments are evaluated left to right, so attributes are
    # read in the same order as field-by-field assignment would read them.
    map_task = dp_pb.MapTask(
        output_base_dir=self._partition_output_dir,
        output_partition_num=self._output_partition_num,
        partition_id=map_partition_id,
        task_name='map_part_{}'.format(map_partition_id),
        part_field='example_id',
        data_portal_type=dp_pb.DataPortalType.Streaming,
        fpaths=[
            self._get_input_fpath(input_id)
            for input_id in range(self._input_partition_num)
        ],
    )

    # partitioner
    task = dp_pb.NewTaskResponse()
    task.map_task.CopyFrom(map_task)
    self._portal_worker._run_map_task(task.map_task)
    return task