示例#1
0
 def _check_processing_job_finished(self):
     if not self._all_job_part_finished():
         return False
     processing_job = self._sync_processing_job()
     if not processing_job.finished:
         finished_job = dp_pb.DataPortalJob()
         finished_job.MergeFrom(self._processing_job)
         finished_job.finished = True
         self._update_processing_job(finished_job)
     self._processing_job = None
     self._job_part_map = {}
     portal_mainifest = self._sync_portal_manifest()
     if portal_mainifest.processing_job_id >= 0:
         self._publish_raw_data(portal_mainifest.processing_job_id)
         new_portal_manifest = dp_pb.DataPortalManifest()
         new_portal_manifest.MergeFrom(self._sync_portal_manifest())
         new_portal_manifest.processing_job_id = -1
         self._update_portal_manifest(new_portal_manifest)
     if processing_job is not None:
         logging.info("Data Portal job %d has finished. Processed %d "\
                      "following fpaths\n------------\n",
                      processing_job.job_id, len(processing_job.fpaths))
         for seq, fpath in enumerate(processing_job.fpaths):
             logging.info("%d. %s", seq, fpath)
         logging.info("---------------------------------\n")
     return True
示例#2
0
 def _launch_new_portal_job(self):
     assert self._sync_processing_job() is None
     all_fpaths = self._list_input_dir()
     rest_fpaths = []
     for fpath in all_fpaths:
         if fpath not in self._processed_fpath:
             rest_fpaths.append(fpath)
     if len(rest_fpaths) == 0:
         logging.info("no file left for portal")
         return
     rest_fpaths.sort()
     portal_mainifest = self._sync_portal_manifest()
     new_job = dp_pb.DataPortalJob(job_id=portal_mainifest.next_job_id,
                                   finished=False,
                                   fpaths=rest_fpaths)
     self._update_processing_job(new_job)
     new_portal_manifest = dp_pb.DataPortalManifest()
     new_portal_manifest.MergeFrom(portal_mainifest)
     new_portal_manifest.next_job_id += 1
     new_portal_manifest.processing_job_id = new_job.job_id
     self._update_portal_manifest(new_portal_manifest)
     for partition_id in range(self._output_partition_num):
         self._sync_job_part(new_job.job_id, partition_id)
     logging.info("Data Portal job %d has lanuched. %d files will be"\
                  "processed\n------------\n",
                  new_job.job_id, len(new_job.fpaths))
     for seq, fpath in enumerate(new_job.fpaths):
         logging.info("%d. %s", seq, fpath)
     logging.info("---------------------------------\n")
 def _sync_portal_job(self, job_id):
     """Load the portal job stored in the kvstore for ``job_id``.

     Returns the ``DataPortalJob`` parsed from the stored text (unknown
     fields are tolerated), or None when the kvstore has no data under the
     job's key.
     """
     job_key = common.portal_job_kvstore_key(self._portal_name, job_id)
     raw = self._kvstore.get_data(job_key)
     if raw is None:
         return None
     return text_format.Parse(raw, dp_pb.DataPortalJob(),
                              allow_unknown_field=True)
 def _check_portal_job(self, etcd, fnames, portal_manifest, job_id):
     """Assert that the job stored in etcd matches the expected input files.

     Reads the job for ``job_id`` from ``etcd``, checks that it exists,
     carries the same id and is not finished, and that its ``fpaths`` equal
     the sorted ``fnames`` that match the manifest's input wildcard, each
     joined onto the manifest's input base directory.

     Note: ``fnames`` is sorted in place.
     """
     job_key = common.portal_job_etcd_key(portal_manifest.name, job_id)
     raw = etcd.get_data(job_key)
     self.assertIsNotNone(raw)
     stored_job = text_format.Parse(raw, dp_pb.DataPortalJob())
     self.assertEqual(job_id, stored_job.job_id)
     self.assertFalse(stored_job.finished)
     fnames.sort()
     expected_fpaths = []
     for fname in fnames:
         if fnmatch(fname, portal_manifest.input_file_wildcard):
             expected_fpaths.append(
                 os.path.join(portal_manifest.input_base_dir, fname))
     self.assertEqual(len(expected_fpaths), len(stored_job.fpaths))
     # Lengths are equal at this point, so zip pairs up every element.
     for expected, actual in zip(expected_fpaths, stored_job.fpaths):
         self.assertEqual(expected, actual)
示例#5
0
 def _check_processing_job_finished(self):
     if not self._all_job_part_finished():
         return False
     processing_job = self._sync_processing_job()
     if not processing_job.finished:
         finished_job = dp_pb.DataPortalJob()
         finished_job.MergeFrom(self._processing_job)
         finished_job.finished = True
         self._update_processing_job(finished_job)
     self._processing_job = None
     self._job_part_map = {}
     portal_mainifest = self._sync_portal_manifest()
     if portal_mainifest.processing_job_id > 0:
         new_portal_manifest = dp_pb.DataPortalManifest()
         new_portal_manifest.MergeFrom(self._sync_portal_manifest())
         new_portal_manifest.processing_job_id = -1
         self._update_portal_manifest(new_portal_manifest)
     return True
 def _sync_portal_job(self, job_id):
     """Load the portal job stored in etcd for ``job_id``.

     Returns the ``DataPortalJob`` parsed from the stored text, or None
     when etcd has no data under the job's key.
     """
     job_key = common.portal_job_etcd_key(self._portal_name, job_id)
     raw = self._etcd.get_data(job_key)
     if raw is None:
         return None
     return text_format.Parse(raw, dp_pb.DataPortalJob())