def _check_processing_job_finished(self):
    """Finalize the in-flight portal job once every partition has finished.

    Returns:
        False if some job part is still running; True once the job has been
        marked finished, its raw data published, and the manifest's
        processing_job_id reset to -1.
    """
    if not self._all_job_part_finished():
        return False
    processing_job = self._sync_processing_job()
    # BUGFIX: the logging block below already treats processing_job as
    # possibly None, so the .finished dereference must be guarded the
    # same way to avoid an AttributeError.
    if processing_job is not None and not processing_job.finished:
        finished_job = dp_pb.DataPortalJob()
        finished_job.MergeFrom(self._processing_job)
        finished_job.finished = True
        self._update_processing_job(finished_job)
    # Drop cached job state; it will be re-synced for the next job.
    self._processing_job = None
    self._job_part_map = {}
    portal_manifest = self._sync_portal_manifest()
    if portal_manifest.processing_job_id >= 0:
        # Publish the finished job's raw data, then clear the manifest's
        # processing_job_id so a new job can be launched.
        self._publish_raw_data(portal_manifest.processing_job_id)
        new_portal_manifest = dp_pb.DataPortalManifest()
        new_portal_manifest.MergeFrom(self._sync_portal_manifest())
        new_portal_manifest.processing_job_id = -1
        self._update_portal_manifest(new_portal_manifest)
    if processing_job is not None:
        logging.info("Data Portal job %d has finished. Processed %d "
                     "following fpaths\n------------\n",
                     processing_job.job_id, len(processing_job.fpaths))
        for seq, fpath in enumerate(processing_job.fpaths):
            logging.info("%d. %s", seq, fpath)
        logging.info("---------------------------------\n")
    return True
def _launch_new_portal_job(self):
    """Create and persist a new portal job covering all unprocessed input files.

    Scans the input directory, filters out already-processed paths, and if any
    remain, stores a new DataPortalJob, bumps the manifest's next_job_id,
    records it as the processing job, and initializes every job partition.
    Returns early (no-op) when there is nothing left to process.
    """
    # A new job may only be launched when no job is currently processing.
    assert self._sync_processing_job() is None
    all_fpaths = self._list_input_dir()
    rest_fpaths = [fpath for fpath in all_fpaths
                   if fpath not in self._processed_fpath]
    if len(rest_fpaths) == 0:
        logging.info("no file left for portal")
        return
    rest_fpaths.sort()
    portal_manifest = self._sync_portal_manifest()
    new_job = dp_pb.DataPortalJob(job_id=portal_manifest.next_job_id,
                                  finished=False,
                                  fpaths=rest_fpaths)
    self._update_processing_job(new_job)
    new_portal_manifest = dp_pb.DataPortalManifest()
    new_portal_manifest.MergeFrom(portal_manifest)
    new_portal_manifest.next_job_id += 1
    new_portal_manifest.processing_job_id = new_job.job_id
    self._update_portal_manifest(new_portal_manifest)
    # Seed the per-partition bookkeeping for the new job.
    for partition_id in range(self._output_partition_num):
        self._sync_job_part(new_job.job_id, partition_id)
    # BUGFIX: log message typo ("lanuched") and missing space between the
    # continued string parts ("will be" + "processed").
    logging.info("Data Portal job %d has launched. %d files will be "
                 "processed\n------------\n",
                 new_job.job_id, len(new_job.fpaths))
    for seq, fpath in enumerate(new_job.fpaths):
        logging.info("%d. %s", seq, fpath)
    logging.info("---------------------------------\n")
def _sync_portal_job(self, job_id):
    """Fetch the persisted DataPortalJob for job_id from the kvstore.

    Returns the parsed DataPortalJob proto, or None when no record exists
    under the job's kvstore key. Unknown fields in the stored text proto
    are tolerated.
    """
    kvstore_key = common.portal_job_kvstore_key(self._portal_name, job_id)
    raw = self._kvstore.get_data(kvstore_key)
    if raw is None:
        return None
    return text_format.Parse(raw, dp_pb.DataPortalJob(),
                             allow_unknown_field=True)
def _check_portal_job(self, etcd, fnames, portal_manifest, job_id):
    """Assert that the portal job stored in etcd matches the expected files.

    Verifies the stored DataPortalJob has the given job_id, is unfinished,
    and lists exactly the wildcard-matching files (sorted) under the
    manifest's input_base_dir. Note: sorts fnames in place.
    """
    etcd_key = common.portal_job_etcd_key(portal_manifest.name, job_id)
    raw = etcd.get_data(etcd_key)
    self.assertIsNotNone(raw)
    portal_job = text_format.Parse(raw, dp_pb.DataPortalJob())
    self.assertEqual(job_id, portal_job.job_id)
    self.assertFalse(portal_job.finished)
    fnames.sort()
    expected_fpaths = [
        os.path.join(portal_manifest.input_base_dir, name)
        for name in fnames
        if fnmatch(name, portal_manifest.input_file_wildcard)
    ]
    self.assertEqual(len(expected_fpaths), len(portal_job.fpaths))
    for expected, stored in zip(expected_fpaths, portal_job.fpaths):
        self.assertEqual(expected, stored)
def _check_processing_job_finished(self):
    """Mark the in-flight portal job finished once all its parts are done.

    Returns:
        False while any job part is still running; True after the job has
        been flagged finished and the manifest's processing_job_id cleared.
    """
    if not self._all_job_part_finished():
        return False
    job = self._sync_processing_job()
    if not job.finished:
        # Persist a finished copy rather than mutating the cached proto.
        done = dp_pb.DataPortalJob()
        done.MergeFrom(self._processing_job)
        done.finished = True
        self._update_processing_job(done)
    # Reset cached state; the next job will re-sync it.
    self._processing_job = None
    self._job_part_map = {}
    manifest = self._sync_portal_manifest()
    if manifest.processing_job_id > 0:
        updated = dp_pb.DataPortalManifest()
        updated.MergeFrom(self._sync_portal_manifest())
        updated.processing_job_id = -1
        self._update_portal_manifest(updated)
    return True
def _sync_portal_job(self, job_id):
    """Fetch the persisted DataPortalJob for job_id from etcd.

    Returns the parsed DataPortalJob proto, or None when nothing is stored
    under the job's etcd key.
    """
    etcd_key = common.portal_job_etcd_key(self._portal_name, job_id)
    raw = self._etcd.get_data(etcd_key)
    if raw is None:
        return None
    return text_format.Parse(raw, dp_pb.DataPortalJob())