Example #1
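Loads every raw-data meta stored under a partition's manifest key, checks that the stored indices form a contiguous sequence, and builds an IndexMeta for each meta whose start_index is already known.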
 def _preload_raw_data_meta(self):
     manifest_kvstore_key = common.partition_manifest_kvstore_key(
         self._data_source.data_source_meta.name, self._partition_id)
     all_metas = []
     index_metas = []
     for key, val in self._kvstore.get_prefix_kvs(manifest_kvstore_key,
                                                  True):
         bkey = os.path.basename(key)
         if not bkey.decode().startswith(common.RawDataMetaPrefix):
             continue
         index = int(bkey[len(common.RawDataMetaPrefix):])
         meta = text_format.Parse(val, dj_pb.RawDataMeta())
         all_metas.append((index, meta))
         if meta.start_index != -1:
             index_meta = visitor.IndexMeta(index, meta.start_index,
                                            meta.file_path)
             index_metas.append(index_meta)
     all_metas = sorted(all_metas, key=lambda meta: meta[0])
     for process_index, meta in enumerate(all_metas):
         if process_index != meta[0]:
             logging.fatal("process_index mismatch with index %d != %d "\
                           "for file path %s", process_index, meta[0],
                           meta[1].file_path)
             traceback.print_stack()
             os._exit(-1)  # pylint: disable=protected-access
     return all_metas, index_metas
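Every example here constructs visitor.IndexMeta with the same three positional arguments, so it is best read as a small value object tying a raw-data file to its position in the global record index. A minimal sketch of the shape those calls imply (field names are inferred from the argument names, not copied from fedlearner's visitor module):

class IndexMeta(object):
    # Sketch only: a lightweight record linking one raw-data file to the
    # global index of its first record.
    def __init__(self, process_index, start_index, fpath):
        self.process_index = process_index  # ordinal of the file within the partition
        self.start_index = start_index      # global index of the file's first record
        self.fpath = fpath                  # path of the backing data file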
Example #2
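Closes the example-id dumper: if anything was written, the temporary file is renamed to its final path and returned as an IndexMeta together with the end index; otherwise the temporary file is removed.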
 def finish_example_id_dumper(self):
     self._tf_record_writer.close()
     if self.dumped_example_id_count() > 0:
         fpath = self._get_dumped_fpath()
         gfile.Rename(self._tmp_fpath, fpath, True)
         index_meta = visitor.IndexMeta(self._process_index,
                                        self._start_index, fpath)
         return index_meta, self._end_index
     assert self._start_index == self._end_index, "no example id dumped"
     gfile.Remove(self._tmp_fpath)
     return None, None
Example #3
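In visit-only mode, rebuilds the IndexMeta of an already dumped example-id file from its encoded file name, aborting the process if the expected file is missing from the file system.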
 def _new_index_meta(self, process_index, start_index):
     if not self._visit_only:
         raise RuntimeError("_new_index_meta only support visit only")
     assert self._anchor is not None, "anchor is always in visit_only mode"
     if self._check_index_dumped(start_index):
         fname = encode_example_id_dumped_fname(process_index, start_index)
         fpath = os.path.join(self._example_dumped_dir(), fname)
         if not gfile.Exists(fpath):
             logging.fatal("%d has been dumpped however %s not "\
                           "in file system", start_index, fpath)
             os._exit(-1)  # pylint: disable=protected-access
         return visitor.IndexMeta(process_index, start_index, fpath)
     return None
Example #4
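Parses a done-file name of the form process_index-start_index into an IndexMeta, exiting the process when the name does not match that pattern.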
def decode_index_meta(fpath):
    fname = path.basename(fpath)
    index_str = fname[:-len(DoneFileSuffix)]
    try:
        items = index_str.split('-')
        if len(items) != 2:
            raise RuntimeError("fname {} format error".format(fname))
        process_index, start_index = int(items[0]), int(items[1])
    except Exception as e:  # pylint: disable=broad-except
        logging.fatal("fname %s not satisfied with pattern process_index-"\
                      "start_index", fname)
        os._exit(-1)  # pylint: disable=protected-access
    else:
        return visitor.IndexMeta(process_index, start_index, fpath)
    return None
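As a rough usage note, and assuming DoneFileSuffix is a plain extension such as ".done" (the real constant is defined elsewhere in the project), a hypothetical done-file path would decode like this:

# Hypothetical path; the actual suffix and zero padding are project-defined.
meta = decode_index_meta('/data/dump/0002-00001024.done')
# meta describes process_index 2 starting at global index 1024
# (field names as sketched under Example #1).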
Example #5
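Looks up (or lazily syncs) the raw-data meta for a process index and, when its start_index is still unset, records it through an etcd compare-and-swap before returning the IndexMeta.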
 def _new_index_meta(self, process_index, start_index):
     if self._manifest.next_process_index <= process_index:
         return None
     raw_data_meta = None
     if process_index < len(self._all_metas):
         assert process_index == self._all_metas[process_index][0], \
             "process index should equal {} != {}".format(
                 process_index, self._all_metas[process_index][0]
             )
         raw_data_meta = self._all_metas[process_index][1]
     else:
         assert process_index == len(self._all_metas), \
             "the process index should be the next all metas "\
             "{}(process_index) != {}(size of all_metas)".format(
                     process_index, len(self._all_metas)
                 )
         raw_data_meta = self._sync_raw_data_meta(process_index)
         if raw_data_meta is None:
             logging.fatal("the raw data of partition %d index with "\
                           "%d must in etcd",
                           self._partition_id, process_index)
             traceback.print_stack()
             os._exit(-1) # pylint: disable=protected-access
         self._all_metas.append((process_index, raw_data_meta))
     if raw_data_meta.start_index == -1:
         new_meta = dj_pb.RawDataMeta()
         new_meta.MergeFrom(raw_data_meta)
         new_meta.start_index = start_index
         odata = text_format.MessageToString(raw_data_meta)
         ndata = text_format.MessageToString(new_meta)
         etcd_key = common.raw_data_meta_etcd_key(
                 self._data_source.data_source_meta.name,
                 self._partition_id, process_index
             )
         if not self._etcd.cas(etcd_key, odata, ndata):
             raw_data_meta = self._sync_raw_data_meta(process_index)
             assert raw_data_meta is not None, \
                 "the raw data meta of process index {} "\
                 "must not None".format(process_index)
             if raw_data_meta.start_index != start_index:
                 logging.fatal("raw data of partition %d index with "\
                               "%d must start with %d",
                               self._partition_id, process_index,
                               start_index)
                 traceback.print_stack()
                 os._exit(-1) # pylint: disable=protected-access
     return visitor.IndexMeta(process_index, start_index,
                              raw_data_meta.file_path)
Example #6
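A merge-stage reader: lazily creates a TfRecordIter, resets it with an IndexMeta anchored at (0, 0) on the file, and yields RecordItems until the file is exhausted.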
 def _next_internal(self):
     if not self._finished:
         try:
             while True:
                 tf_item = None
                 if self._fiter is None:
                     self._fiter = TfRecordIter(self._raw_data_options)
                     meta = visitor.IndexMeta(0, 0, self._fpath)
                     self._fiter.reset_iter(meta, True)
                     tf_item = self._fiter.get_item()
                 else:
                     _, tf_item = next(self._fiter)
                 return Merge.RecordItem(self._fpath_id, tf_item)
         except StopIteration:
             self._finished = True
     raise StopIteration("%s has been iter finished" % self._fpath)
Example #7
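The SortRunReader variant of the same pattern: the iterator type comes from the reader options and each item is wrapped in a comparable MergeItem.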
 def _next_internal(self):
     if not self._finished:
         try:
             item = None
             if self._fiter is None:
                 self._fiter = create_raw_data_iter(self._reader_options)
                 meta = visitor.IndexMeta(0, 0, self._fpath)
                 self._fiter.reset_iter(meta, True)
                 item = self._fiter.get_item()
             else:
                 _, item = next(self._fiter)
             assert item is not None
             return SortRunReader.MergeItem(item, self._reader_index,
                                            self._comparator)
         except StopIteration:
             self._finished = True
     raise StopIteration("%s has been iter finished" % self._fpath)
Example #8
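A CSV flavour of the previous reader: a CsvDictIter is reset with an IndexMeta at (0, 0) and each row is returned as a MergeItem.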
 def _next_internal(self):
     if not self._finished:
         try:
             item = None
             if self._fiter is None:
                 raw_data_options = \
                     dj_pb.RawDataOptions(raw_data_iter='CSV_DICT')
                 self._fiter = CsvDictIter(raw_data_options)
                 meta = visitor.IndexMeta(0, 0, self._fpath)
                 self._fiter.reset_iter(meta, True)
                 item = self._fiter.get_item()
             else:
                 _, item = next(self._fiter)
             assert item is not None
             return SortRunReader.MergeItem(item, self._reader_index)
         except StopIteration:
             self._finished = True
     raise StopIteration("%s has been iter finished" % self._fpath)
Example #9
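Like Example #6, but every record is passed through an example validator; invalid records are logged and skipped instead of being returned.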
 def _next_internal(self):
     if not self._finished:
         try:
             while True:
                 tf_item = None
                 if self._fiter is None:
                     self._fiter = TfRecordIter(self._raw_data_options)
                     meta = visitor.IndexMeta(0, 0, self._fpath)
                     self._fiter.reset_iter(meta, True)
                     tf_item = self._fiter.get_item()
                 else:
                     _, tf_item = next(self._fiter)
                 valid, reason = \
                     self._example_validator.validate_example(tf_item)
                 if valid:
                     return PotralHourlyInputReducer.RecordItem(
                             self._partition_id, tf_item
                         )
                 logging.warning("skip record in %s, reason %s",
                                 self._fpath, reason)
         except StopIteration:
             self._finished = True
     raise StopIteration("%s has been iter finished" % self._fpath)
Example #10
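The simplest case: the file path is looked up directly by process index, so the IndexMeta is built without consulting any external store.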
 def _new_index_meta(self, process_index, start_index):
     if process_index >= len(self._input_fpaths):
         return None
     return visitor.IndexMeta(process_index, start_index,
                              self._input_fpaths[process_index])