def _preload_raw_data_meta(self):
    """Load every RawDataMeta stored for this partition from the kvstore.

    Returns a tuple ``(all_metas, index_metas)``: ``all_metas`` is a list of
    ``(process_index, RawDataMeta)`` pairs sorted by process index, and
    ``index_metas`` holds a ``visitor.IndexMeta`` for each meta whose
    start_index is already assigned (i.e. != -1). Exits the whole process
    if the stored process indices are not contiguous starting from 0.
    """
    manifest_kvstore_key = common.partition_manifest_kvstore_key(
            self._data_source.data_source_meta.name,
            self._partition_id)
    all_metas = []
    index_metas = []
    for key, val in self._kvstore.get_prefix_kvs(manifest_kvstore_key, True):
        bkey = os.path.basename(key)
        # Only RawDataMeta entries are of interest under the manifest prefix.
        # NOTE(review): keys appear to be bytes here (decode before
        # startswith) — confirm against the kvstore client.
        if not bkey.decode().startswith(common.RawDataMetaPrefix):
            continue
        index = int(bkey[len(common.RawDataMetaPrefix):])
        meta = text_format.Parse(val, dj_pb.RawDataMeta())
        all_metas.append((index, meta))
        if meta.start_index != -1:
            index_metas.append(
                    visitor.IndexMeta(index, meta.start_index,
                                      meta.file_path))
    all_metas.sort(key=lambda item: item[0])
    # Indices must be dense: position in the sorted list == stored index.
    for expected_index, item in enumerate(all_metas):
        if expected_index != item[0]:
            logging.fatal("process_index mismatch with index %d != %d "\
                          "for file path %s", expected_index, item[0],
                          item[1].file_path)
            traceback.print_stack()
            os._exit(-1) # pylint: disable=protected-access
    return all_metas, index_metas
def finish_example_id_dumper(self):
    """Close the record writer and finalize the dumped example-id file.

    If at least one example id was dumped, the temporary file is renamed
    to its final path and ``(IndexMeta, end_index)`` is returned. Otherwise
    the temporary file is removed and ``(None, None)`` is returned.
    """
    self._tf_record_writer.close()
    if self.dumped_example_id_count() <= 0:
        # Nothing was written: the index range must be empty.
        assert self._start_index == self._end_index, "no example id dumped"
        gfile.Remove(self._tmp_fpath)
        return None, None
    fpath = self._get_dumped_fpath()
    # Overwrite an existing file at the final path if present.
    gfile.Rename(self._tmp_fpath, fpath, True)
    index_meta = visitor.IndexMeta(self._process_index,
                                   self._start_index, fpath)
    return index_meta, self._end_index
def _new_index_meta(self, process_index, start_index):
    """Build an IndexMeta for an already-dumped example-id file.

    Only supported in visit-only mode. Returns None when ``start_index``
    has not been dumped yet; exits the process when the index is marked
    dumped but the corresponding file is missing from the file system.
    """
    if not self._visit_only:
        raise RuntimeError("_new_index_meta only support visit only")
    assert self._anchor is not None, "anchor is always in visit_only mode"
    if not self._check_index_dumped(start_index):
        return None
    fname = encode_example_id_dumped_fname(process_index, start_index)
    fpath = os.path.join(self._example_dumped_dir(), fname)
    if not gfile.Exists(fpath):
        logging.fatal("%d has been dumpped however %s not "\
                      "in file system", start_index, fpath)
        os._exit(-1) # pylint: disable=protected-access
    return visitor.IndexMeta(process_index, start_index, fpath)
def decode_index_meta(fpath):
    """Decode a done-file path into a ``visitor.IndexMeta``.

    The base name, minus ``DoneFileSuffix``, is expected to follow the
    ``<process_index>-<start_index>`` pattern with both parts integers.
    Exits the whole process when the name does not match that pattern.

    Fixes: dropped the never-used ``as e`` exception binding and the
    unreachable trailing ``return None`` (the except path calls
    ``os._exit`` and never falls through).
    """
    fname = path.basename(fpath)
    index_str = fname[:-len(DoneFileSuffix)]
    try:
        items = index_str.split('-')
        if len(items) != 2:
            raise RuntimeError("fname {} format error".format(fname))
        process_index, start_index = int(items[0]), int(items[1])
    except Exception:  # pylint: disable=broad-except
        logging.fatal("fname %s not satisfied with pattern process_index-"\
                      "start_index", fname)
        os._exit(-1) # pylint: disable=protected-access
    else:
        return visitor.IndexMeta(process_index, start_index, fpath)
def _new_index_meta(self, process_index, start_index):
    """Return the IndexMeta for process_index, claiming its start index.

    Looks the raw-data meta up locally, syncing it from etcd when this
    process_index is one past the cached list. When the meta has no start
    index yet (-1), races to claim ``start_index`` via an etcd
    compare-and-swap; on a lost race, re-reads the meta and requires the
    winner to have set the same start index. Returns None when
    process_index is beyond the manifest's next_process_index; exits the
    whole process on an unrecoverable inconsistency.
    """
    # Nothing allocated for this index yet.
    if self._manifest.next_process_index <= process_index:
        return None
    raw_data_meta = None
    if process_index < len(self._all_metas):
        # Cached locally: positions in _all_metas must match their index.
        assert process_index == self._all_metas[process_index][0], \
            "process index should equal {} != {}".format(
                process_index, self._all_metas[process_index][0]
            )
        raw_data_meta = self._all_metas[process_index][1]
    else:
        # Only the immediately-next index may be fetched and appended,
        # keeping _all_metas dense.
        assert process_index == len(self._all_metas), \
            "the process index should be the next all metas "\
            "{}(process_index) != {}(size of all_metas)".format(
                process_index, len(self._all_metas)
            )
        raw_data_meta = self._sync_raw_data_meta(process_index)
        if raw_data_meta is None:
            logging.fatal("the raw data of partition %d index with "\
                          "%d must in etcd", self._partition_id,
                          process_index)
            traceback.print_stack()
            os._exit(-1) # pylint: disable=protected-access
        self._all_metas.append((process_index, raw_data_meta))
    if raw_data_meta.start_index == -1:
        # Start index not yet claimed: serialize the OLD meta first so the
        # CAS compares against the exact stored value, then write the copy
        # with start_index filled in.
        new_meta = dj_pb.RawDataMeta()
        new_meta.MergeFrom(raw_data_meta)
        new_meta.start_index = start_index
        odata = text_format.MessageToString(raw_data_meta)
        ndata = text_format.MessageToString(new_meta)
        etcd_key = common.raw_data_meta_etcd_key(
                self._data_source.data_source_meta.name,
                self._partition_id, process_index
            )
        if not self._etcd.cas(etcd_key, odata, ndata):
            # Lost the race: another writer updated the meta. Re-sync and
            # insist the winner claimed the same start index, otherwise
            # the visitors would disagree on index assignment.
            raw_data_meta = self._sync_raw_data_meta(process_index)
            assert raw_data_meta is not None, \
                "the raw data meta of process index {} "\
                "must not None".format(process_index)
            if raw_data_meta.start_index != start_index:
                logging.fatal("raw data of partition %d index with "\
                              "%d must start with %d", self._partition_id,
                              process_index, start_index)
                traceback.print_stack()
                os._exit(-1) # pylint: disable=protected-access
    return visitor.IndexMeta(process_index, start_index,
                             raw_data_meta.file_path)
def _next_internal(self):
    """Return the next RecordItem from the backing tf-record file.

    The underlying iterator is created lazily on the first call. Raises
    StopIteration (and latches ``self._finished``) once the file is
    exhausted.

    Fixes: removed a dead ``while True:`` wrapper — its body
    unconditionally returned or raised on the first pass, so the loop
    never iterated; the loop-free form also matches the sibling
    ``_next_internal`` implementations in this file.
    """
    if not self._finished:
        try:
            if self._fiter is None:
                # First call: open the file and take its first item.
                self._fiter = TfRecordIter(self._raw_data_options)
                meta = visitor.IndexMeta(0, 0, self._fpath)
                self._fiter.reset_iter(meta, True)
                tf_item = self._fiter.get_item()
            else:
                _, tf_item = next(self._fiter)
            return Merge.RecordItem(self._fpath_id, tf_item)
        except StopIteration:
            self._finished = True
    raise StopIteration("%s has been iter finished" % self._fpath)
def _next_internal(self):
    """Return the next MergeItem from this sort-run file.

    The raw-data iterator is created lazily on first use. Raises
    StopIteration (and latches ``self._finished``) once the file is
    exhausted.
    """
    if not self._finished:
        try:
            if self._fiter is None:
                # Lazily open the reader and grab the first record.
                self._fiter = create_raw_data_iter(self._reader_options)
                self._fiter.reset_iter(visitor.IndexMeta(0, 0, self._fpath),
                                       True)
                item = self._fiter.get_item()
            else:
                _, item = next(self._fiter)
            assert item is not None
            return SortRunReader.MergeItem(item, self._reader_index,
                                           self._comparator)
        except StopIteration:
            self._finished = True
    raise StopIteration("%s has been iter finished" % self._fpath)
def _next_internal(self):
    """Return the next MergeItem from this CSV sort-run file.

    The CSV-dict iterator is created lazily on first use. Raises
    StopIteration (and latches ``self._finished``) once the file is
    exhausted.
    """
    if not self._finished:
        try:
            if self._fiter is None:
                # Lazily create a CSV-dict reader over this file.
                opts = dj_pb.RawDataOptions(raw_data_iter='CSV_DICT')
                self._fiter = CsvDictIter(opts)
                self._fiter.reset_iter(visitor.IndexMeta(0, 0, self._fpath),
                                       True)
                item = self._fiter.get_item()
            else:
                _, item = next(self._fiter)
            assert item is not None
            return SortRunReader.MergeItem(item, self._reader_index)
        except StopIteration:
            self._finished = True
    raise StopIteration("%s has been iter finished" % self._fpath)
def _next_internal(self):
    """Return the next valid RecordItem, skipping invalid records.

    Records rejected by the example validator are logged with the reason
    and skipped (hence the inner loop). Raises StopIteration (and latches
    ``self._finished``) once the file is exhausted.
    """
    if not self._finished:
        try:
            while True:
                if self._fiter is None:
                    # Lazily open the tf-record file on first use.
                    self._fiter = TfRecordIter(self._raw_data_options)
                    meta = visitor.IndexMeta(0, 0, self._fpath)
                    self._fiter.reset_iter(meta, True)
                    tf_item = self._fiter.get_item()
                else:
                    _, tf_item = next(self._fiter)
                valid, reason = \
                    self._example_validator.validate_example(tf_item)
                if not valid:
                    logging.warning("skip record in %s, reason %s",
                                    self._fpath, reason)
                    continue
                return PotralHourlyInputReducer.RecordItem(
                        self._partition_id, tf_item
                    )
        except StopIteration:
            self._finished = True
    raise StopIteration("%s has been iter finished" % self._fpath)
def _new_index_meta(self, process_index, start_index):
    """Map ``process_index`` to an IndexMeta over the input file list.

    Returns None once ``process_index`` runs past the end of the inputs.
    """
    if process_index < len(self._input_fpaths):
        return visitor.IndexMeta(process_index, start_index,
                                 self._input_fpaths[process_index])
    return None