Example #1
    def test_portal_worker(self):
        self._prepare_test()
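        # Build a MapTask that reads every input partition and repartitions
        # its records by example_id into self._output_partition_num partitions.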
        map_task = dp_pb.MapTask()
        map_task.output_base_dir = self._partition_output_dir
        map_task.output_partition_num = self._output_partition_num
        map_task.partition_id = 0
        map_task.task_name = 'map_part_{}'.format(map_task.partition_id)
        map_task.part_field = 'example_id'
        map_task.data_portal_type = dp_pb.DataPortalType.Streaming
        for partition_id in range(self._input_partition_num):
            map_task.fpaths.append(self._get_input_fpath(partition_id))

        # partitioner
        task = dp_pb.NewTaskResponse()
        task.map_task.CopyFrom(map_task)
        self._portal_worker._run_map_task(task.map_task)

        self._check_partitioner(task.map_task)

        # merge
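        # Run one ReduceTask per output partition and accumulate the item
        # counts reported by _check_merge.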
        total_cnt = 0
        for partition_id in range(self._output_partition_num):
            reduce_task = dp_pb.ReduceTask()
            reduce_task.map_base_dir = self._partition_output_dir
            reduce_task.reduce_base_dir = self._merge_output_dir
            reduce_task.partition_id = partition_id
            reduce_task.task_name = 'reduce_part_{}'.format(partition_id)
            self._portal_worker._run_reduce_task(reduce_task)
            total_cnt += self._check_merge(reduce_task)

        self.assertEqual(total_cnt,
                         self._partition_item_num * self._input_partition_num)
        self._clean_up()
Example #2
    def _create_map_task(self, rank_id, partition_id):
        assert self._processing_job is not None
        job = self._processing_job
        map_fpaths = []
        for fpath in job.fpaths:
            if hash(fpath) % self._output_partition_num == partition_id:
                map_fpaths.append(fpath)
        task_name = '{}-dp_portal_job_{:08}-part-{:04}-map'.format(
                self._portal_manifest.name, job.job_id, partition_id
            )
        logging.info("Data portal worker-%d is allocated map task %s for "
                     "partition %d of job %d. The map task has %d files"
                     "-----------------\n", rank_id, task_name,
                     partition_id, job.job_id, len(map_fpaths))
        for seq, fpath in enumerate(map_fpaths):
            logging.info("%d. %s", seq, fpath)
        logging.info("---------------------------------\n")
        manifest = self._sync_portal_manifest()
        return dp_pb.MapTask(task_name=task_name,
                             fpaths=map_fpaths,
                             output_base_dir=self._map_output_dir(job.job_id),
                             output_partition_num=self._output_partition_num,
                             partition_id=partition_id,
                             part_field=self._get_part_field(),
                             data_portal_type=manifest.data_portal_type)
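
The file-to-partition assignment in _create_map_task above is a plain modulo over
Python's built-in hash(). A minimal standalone sketch of that rule (the paths and
the partition count below are invented for illustration, not taken from the example):

# Illustrative only: reproduces the modulo assignment used by _create_map_task.
# Note that Python salts str hashes per process, so the assignment is stable
# only within a single process run.
fpaths = ['raw/part-0000.rd', 'raw/part-0001.rd', 'raw/part-0002.rd']
output_partition_num = 2

assignment = {pid: [] for pid in range(output_partition_num)}
for fpath in fpaths:
    assignment[hash(fpath) % output_partition_num].append(fpath)

for pid, files in assignment.items():
    print(pid, files)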
Example #3
    def test_portal_worker(self):
        self._prepare_test()
        map_task = dp_pb.MapTask()
        map_task.output_base_dir = self._partition_output_dir
        map_task.output_partition_num = self._output_partition_num
        map_task.partition_id = 0
        for partition_id in range(self._input_partition_num):
            map_task.fpaths.append(self._get_input_fpath(partition_id))

        # partitioner
        task = dp_pb.NewTaskResponse()
        task.map_task.CopyFrom(map_task)
        self._portal_worker._run_map_task(task.map_task)

        self._check_partitioner(task.map_task)

        # merge
        total_cnt = 0
        for partition_id in range(self._output_partition_num):
            reduce_task = dp_pb.ReduceTask()
            reduce_task.map_base_dir = self._partition_output_dir
            reduce_task.reduce_base_dir = self._merge_output_dir
            reduce_task.partition_id = partition_id
            self._portal_worker._run_reduce_task(reduce_task)
            total_cnt += self._check_merge(reduce_task)

        self.assertEqual(total_cnt, self._partition_item_num * self._input_partition_num)
        self._clean_up()
Example #4
    def _create_map_task(self, rank_id, partition_id):
        assert self._processing_job is not None
        job = self._processing_job
        map_fpaths = []
        # `path` is assumed to be os.path; files are assigned to a map
        # partition by hashing the base file name.
        for fpath in job.fpaths:
            fname = path.basename(fpath)
            if hash(fname) % self._output_partition_num == partition_id:
                map_fpaths.append(fpath)
        return dp_pb.MapTask(fpaths=map_fpaths,
                             output_base_dir=self._map_output_dir(job.job_id),
                             output_partition_num=self._output_partition_num,
                             partition_id=partition_id)
Example #5
    def _run_map_task(self):
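        # Build a streaming MapTask covering every input partition, copy it into
        # a NewTaskResponse, and run it on the portal worker.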
        map_task = dp_pb.MapTask()
        map_task.output_base_dir = self._partition_output_dir
        map_task.output_partition_num = self._output_partition_num
        map_task.partition_id = 0
        map_task.task_name = 'map_part_{}'.format(map_task.partition_id)
        map_task.part_field = 'example_id'
        map_task.data_portal_type = dp_pb.DataPortalType.Streaming
        for partition_id in range(self._input_partition_num):
            map_task.fpaths.append(self._get_input_fpath(partition_id))

        # partitioner
        task = dp_pb.NewTaskResponse()
        task.map_task.CopyFrom(map_task)
        self._portal_worker._run_map_task(task.map_task)
        return task