Пример #1
0
 def _publish_raw_data(self, job_id):
     """Publish each output partition's raw data files and log the result.

     The output directory comes from ``common.portal_map_output_dir`` when
     the synced manifest's portal type is PSI, and from
     ``common.portal_reduce_output_dir`` otherwise. For every partition the
     files whose names end with ``common.RawDataFileSuffix`` are passed to
     the PSI or streaming publish helper; the paths that helper returns are
     then written to the log, one per line.

     Args:
         job_id: identifier of the job; used to resolve the output
             directory and reported in the log message.
     """
     manifest = self._sync_portal_manifest()
     is_psi = manifest.data_portal_type == dp_pb.DataPortalType.PSI
     resolve_output_dir = (common.portal_map_output_dir if is_psi
                           else common.portal_reduce_output_dir)
     output_dir = resolve_output_dir(manifest.output_base_dir, job_id)
     for partition_id in range(self._output_partition_num):
         partition_dir = path.join(output_dir,
                                   common.partition_repr(partition_id))
         # A path that is missing, or is not a directory, contributes no
         # files instead of making the listing call fail.
         dir_ready = (gfile.Exists(partition_dir)
                      and gfile.IsDirectory(partition_dir))
         raw_fnames = [
             fname for fname in gfile.ListDirectory(partition_dir)
             if fname.endswith(common.RawDataFileSuffix)
         ] if dir_ready else []
         publish = (self._publish_psi_raw_data if is_psi
                    else self._publish_streaming_raw_data)
         published = publish(partition_id, partition_dir, raw_fnames)
         logging.info("Data Portal Master publish %d file for partition "
                      "%d of streaming job %d\n----------\n",
                      len(published), partition_id, job_id)
         for seq, fpath in enumerate(published):
             logging.info("%d. %s", seq, fpath)
         logging.info("------------------------------------------\n")
 def _check_reduce_task(self, reduce_task, partition_id, portal_manifest):
     """Assert that ``reduce_task`` matches what the manifest implies.

     Checks the task's partition id, then its map and reduce base
     directories against the values ``common`` derives from the manifest's
     output base dir and name, with ``0`` as the last argument (presumably
     the job id -- confirm against ``common``).

     Args:
         reduce_task: the task object under inspection.
         partition_id: partition the task is expected to cover.
         portal_manifest: manifest supplying ``output_base_dir`` and
             ``name``.
     """
     self.assertEqual(reduce_task.partition_id, partition_id)
     base_dir = portal_manifest.output_base_dir
     portal_name = portal_manifest.name
     expected_map_dir = common.portal_map_output_dir(
         base_dir, portal_name, 0)
     self.assertEqual(reduce_task.map_base_dir, expected_map_dir)
     expected_reduce_dir = common.portal_reduce_output_dir(
         base_dir, portal_name, 0)
     self.assertEqual(reduce_task.reduce_base_dir, expected_reduce_dir)
Пример #3
0
 def _check_map_task(self, map_task, fnames, partition_id, portal_manifest):
     """Assert that ``map_task`` carries the expected inputs and output dir.

     The expected input paths are rebuilt from ``fnames``: names are sorted,
     kept when they match the manifest's ``input_file_wildcard`` and when
     ``hash(name) % output_partition_num`` equals ``partition_id``, and then
     joined onto the manifest's ``input_base_dir``.

     Args:
         map_task: the task object under inspection.
         fnames: candidate input file names; sorted IN PLACE by this call.
         partition_id: partition the task is expected to cover.
         portal_manifest: manifest supplying the partition count, input
             base dir, wildcard and output base dir.
     """
     partition_num = map_task.output_partition_num
     self.assertEqual(partition_num, portal_manifest.output_partition_num)
     # Sorting mutates the caller's list; callers can observe the new order.
     fnames.sort()
     # NOTE(review): the built-in hash() of a str is salted per interpreter
     # process unless PYTHONHASHSEED is fixed, so this expectation only
     # holds if the task was built in the same process -- confirm.
     expected_fpaths = []
     for fname in fnames:
         if not fnmatch(fname, portal_manifest.input_file_wildcard):
             continue
         if hash(fname) % partition_num != partition_id:
             continue
         expected_fpaths.append(
             os.path.join(portal_manifest.input_base_dir, fname))
     self.assertEqual(len(expected_fpaths), len(map_task.fpaths))
     # Lengths were just asserted equal, so pairwise iteration covers all.
     for expected, actual in zip(expected_fpaths, map_task.fpaths):
         self.assertEqual(expected, actual)
     expected_output_dir = common.portal_map_output_dir(
         portal_manifest.output_base_dir, 0)
     self.assertEqual(map_task.output_base_dir, expected_output_dir)
Пример #4
0
 def _publish_raw_data(self, job_id):
     """Publish each output partition's raw data files through the publisher.

     The output directory comes from ``common.portal_map_output_dir`` when
     the synced manifest's portal type is PSI, and from
     ``common.portal_reduce_output_dir`` otherwise. For every partition the
     files whose names end with ``common.RawDataFileSuffix`` are passed to
     ``self._publisher.publish_raw_data``; for PSI portals the partition is
     additionally closed with ``self._publisher.finish_raw_data``.

     A partition whose directory is missing (or is not a directory) is
     published with an empty file list instead of raising from the
     directory listing, so the remaining partitions are still processed.

     Args:
         job_id: identifier of the job whose output is published.
     """
     portal_manifest = self._sync_portal_manifest()
     is_psi = portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI
     if is_psi:
         output_dir = common.portal_map_output_dir(
             portal_manifest.output_base_dir, portal_manifest.name, job_id)
     else:
         output_dir = common.portal_reduce_output_dir(
             portal_manifest.output_base_dir, portal_manifest.name, job_id)
     for partition_id in range(self._output_partition_num):
         dpath = path.join(output_dir, common.partition_repr(partition_id))
         fpaths = []
         # Listing a non-existent directory raises; guard it so an absent
         # partition directory is handled the same way as an empty one.
         if gfile.Exists(dpath) and gfile.IsDirectory(dpath):
             fpaths = [
                 path.join(dpath, f) for f in gfile.ListDirectory(dpath)
                 if f.endswith(common.RawDataFileSuffix)
             ]
         self._publisher.publish_raw_data(partition_id, fpaths)
         if is_psi:
             self._publisher.finish_raw_data(partition_id)
Пример #5
0
 def _publish_raw_data(self, job_id):
     """Publish each output partition's raw data files for ``job_id``.

     The output directory comes from ``common.portal_map_output_dir`` when
     the synced manifest's portal type is PSI, and from
     ``common.portal_reduce_output_dir`` otherwise. For every partition the
     files whose names end with ``common.RawDataFileSuffix`` are passed to
     the PSI or the streaming publish helper, matching the portal type.

     Args:
         job_id: identifier of the job whose output is published.
     """
     manifest = self._sync_portal_manifest()
     psi_mode = manifest.data_portal_type == dp_pb.DataPortalType.PSI
     base_dir_for = (common.portal_map_output_dir if psi_mode
                     else common.portal_reduce_output_dir)
     output_dir = base_dir_for(manifest.output_base_dir, job_id)
     for partition_id in range(self._output_partition_num):
         partition_dir = path.join(output_dir,
                                   common.partition_repr(partition_id))
         raw_fnames = []
         # Only list the path when it exists and really is a directory;
         # otherwise the partition is published with no files.
         if gfile.Exists(partition_dir) and gfile.IsDirectory(partition_dir):
             for entry in gfile.ListDirectory(partition_dir):
                 if entry.endswith(common.RawDataFileSuffix):
                     raw_fnames.append(entry)
         if psi_mode:
             self._publish_psi_raw_data(
                 partition_id, partition_dir, raw_fnames)
         else:
             self._publish_streaming_raw_data(
                 partition_id, partition_dir, raw_fnames)
Пример #6
0
 def _map_output_dir(self, job_id):
     """Return the map output directory for ``job_id``.

     Delegates to ``common.portal_map_output_dir`` with the output base
     directory stored on ``self._portal_manifest``.
     """
     output_base_dir = self._portal_manifest.output_base_dir
     return common.portal_map_output_dir(output_base_dir, job_id)