class ExampleIdBatchFetcher(ItemBatchSeqProcessor):
    """Group the raw data of one partition into consecutive item batches.

    Items are read from a ``RawDataVisitor`` and packed into
    ``ExampleIdBatch`` objects by ``_make_inner_generator``.
    """

    def __init__(self, etcd, data_source, partition_id,
                 raw_data_options, batch_processor_options):
        """Create the raw data visitor and remember batch size and tags."""
        super(ExampleIdBatchFetcher, self).__init__(
            batch_processor_options.max_flying_item
        )
        self._raw_data_visitor = RawDataVisitor(
            etcd, data_source, partition_id, raw_data_options
        )
        self._batch_size = batch_processor_options.batch_size
        self._partition_id = partition_id
        self._metric_tags = {
            'data_source_name': data_source.data_source_meta.name,
            'partition': self._partition_id,
        }

    @classmethod
    def name(cls):
        """Return the fixed name of this processor."""
        return 'ExampleIdBatchFetcher'

    def _make_item_batch(self, begin_index):
        """Return a new ``ExampleIdBatch`` starting at ``begin_index``."""
        return ExampleIdBatch(self._partition_id, begin_index)

    def _make_inner_generator(self, next_index):
        """Yield ``(batch, visitor_finished)`` pairs from ``next_index`` on.

        The visitor is reset when ``next_index`` is 0, otherwise it is moved
        to ``next_index - 1`` so that iteration resumes at ``next_index``.
        A batch is closed once it holds more than ``self._batch_size`` items.
        After the loop, one more freshly created batch is always yielded.
        """
        visitor = self._raw_data_visitor
        visitor.active_visitor()
        if next_index != 0:
            visitor.seek(next_index - 1)
        else:
            visitor.reset()
        while not (visitor.finished() or self._fly_item_full()):
            batch = self._make_item_batch(next_index)
            for index, item in visitor:
                if index != next_index:
                    # A gap in the indices is unrecoverable here: log it,
                    # print the stack and terminate the process.
                    logging.fatal(
                        "index of raw data visitor for partition "
                        "%d is not consecutive, %d != %d",
                        self._partition_id, index, next_index
                    )
                    traceback.print_stack()
                    os._exit(-1)  # pylint: disable=protected-access
                batch.append(item)
                next_index += 1
                if len(batch) > self._batch_size:
                    break
            yield batch, visitor.finished()
        yield self._make_item_batch(next_index), visitor.finished()

    def _get_metrics_tags(self):
        """Return the metric tags built in ``__init__``."""
        return self._metric_tags
class DataBlockDumperManager(object):
    """Queue synced data block metas of one partition and dump them in order.

    Metas are appended with ``add_synced_data_block_meta`` and written out by
    the callable obtained from ``make_data_block_dumper``.
    """

    def __init__(self, etcd, data_source, partition_id,
                 raw_data_options, data_block_builder_options):
        """Create the block manager and raw data visitor for the partition."""
        self._lock = threading.Lock()
        self._data_source = data_source
        self._partition_id = partition_id
        self._data_block_manager = DataBlockManager(data_source, partition_id)
        self._raw_data_visitor = RawDataVisitor(
            etcd, data_source, partition_id, raw_data_options
        )
        self._data_block_builder_options = data_block_builder_options
        # The next meta to accept follows the blocks that are already dumped.
        self._next_data_block_index = (
            self._data_block_manager.get_dumped_data_block_count()
        )
        self._fly_data_block_meta = []
        self._state_stale = False
        self._synced_data_block_meta_finished = False

    def get_next_data_block_index(self):
        """Return the index the next synced meta must carry."""
        with self._lock:
            return self._next_data_block_index

    def get_dumped_data_block_index(self):
        """Return the dumped block count minus one."""
        dumped_count = self._data_block_manager.get_dumped_data_block_count()
        return dumped_count - 1

    def add_synced_data_block_meta(self, meta):
        """Queue ``meta`` if it carries the expected data block index.

        Returns:
            ``(accepted, next_index)``; ``next_index`` is the index expected
            after this call.

        Raises:
            RuntimeError: if ``finish_sync_data_block_meta`` was called.
        """
        with self._lock:
            if self._synced_data_block_meta_finished:
                raise RuntimeError(
                    "data block dmuper manager has been mark as "
                    "no more data block meta"
                )
            if self._next_data_block_index == meta.data_block_index:
                self._fly_data_block_meta.append(meta)
                self._next_data_block_index += 1
                return True, self._next_data_block_index
            return False, self._next_data_block_index

    def finish_sync_data_block_meta(self):
        """Mark that no more metas will be synced."""
        with self._lock:
            self._synced_data_block_meta_finished = True

    def need_dump(self):
        """Return True while at least one queued meta is waiting."""
        with self._lock:
            return bool(self._fly_data_block_meta)

    def is_synced_data_block_meta_finished(self):
        """Return whether meta syncing was marked as finished."""
        with self._lock:
            return self._synced_data_block_meta_finished

    @contextmanager
    def make_data_block_dumper(self):
        """Yield the dump callable while the state is flagged as stale.

        The flag is only cleared when the ``with`` body returns normally.
        If the body raises, the flag stays set, so the next call first evicts
        the metas that were already dumped.
        """
        self._sync_with_data_block_manager()
        self._acquire_state_stale()
        yield self._dump_data_blocks
        self._release_state_stale()

    def _dump_data_blocks(self):
        """Dump queued metas one by one until the queue is empty."""
        while self.need_dump():
            meta = self._get_next_data_block_meta()
            if meta is None:
                continue
            self._raw_data_visitor.active_visitor()
            self._dump_data_block_by_meta(meta)

    def data_block_meta_sync_finished(self):
        """Return whether meta syncing was marked as finished."""
        with self._lock:
            return self._synced_data_block_meta_finished

    def _acquire_state_stale(self):
        """Set the stale flag."""
        with self._lock:
            self._state_stale = True

    def _release_state_stale(self):
        """Clear the stale flag."""
        with self._lock:
            self._state_stale = False

    def _get_next_data_block_meta(self):
        """Return the oldest queued meta without removing it, or None."""
        with self._lock:
            if not self._fly_data_block_meta:
                return None
            return self._fly_data_block_meta[0]

    @contextmanager
    def _make_data_block_builder(self, meta):
        """Yield a builder initialised from ``meta``.

        Any exception from building or from the ``with`` body is logged and
        then raised again.
        """
        assert self._partition_id == meta.partition_id, \
            "partition id of building data block meta mismatch "\
            "{} != {}".format(self._partition_id, meta.partition_id)
        builder = None
        failure = None
        try:
            builder = create_data_block_builder(
                self._data_block_builder_options,
                self._data_source.data_block_dir,
                self._data_source.data_source_meta.name,
                self._partition_id,
                meta.data_block_index
            )
            builder.init_by_meta(meta)
            builder.set_data_block_manager(self._data_block_manager)
            yield builder
        except Exception as e:  # pylint: disable=broad-except
            logging.warning("Failed make data block builder, "
                            "reason %s", e)
            failure = e
        # Drop the local reference before a possible re-raise.
        if builder is not None:
            del builder
        if failure is not None:
            raise failure

    def _dump_data_block_by_meta(self, meta):
        """Rebuild the data block described by ``meta`` from raw data.

        Raw items are scanned from ``meta.leader_start_index``. Items whose
        example id equals the next expected id are written. The process is
        terminated if the raw data ends early or examples are missing.
        """
        assert meta is not None, "input data block must not be None"
        visitor = self._raw_data_visitor
        with self._make_data_block_builder(meta) as builder:
            start_index = meta.leader_start_index
            try:
                if start_index == 0:
                    visitor.reset()
                else:
                    assert start_index > 0, \
                        "leader start index must be positive"
                    visitor.seek(start_index - 1)
            except StopIteration:
                logging.fatal("raw data finished before when seek to %d",
                              start_index - 1)
                os._exit(-1)  # pylint: disable=protected-access
            wanted_ids = meta.example_ids
            wanted_num = len(wanted_ids)
            matched = 0
            for index, item in visitor:
                if item.example_id == wanted_ids[matched]:
                    builder.write_item(item)
                    matched += 1
                if matched >= wanted_num or index >= meta.leader_end_index:
                    break
            if matched < wanted_num:
                logging.fatal(
                    "Data lose corrupt! only match %d/%d example "
                    "for data block %s",
                    matched, wanted_num, meta.block_id
                )
                os._exit(-1)  # pylint: disable=protected-access
            dumped_meta = builder.finish_data_block()
            assert dumped_meta == meta, "the generated dumped meta shoud "\
                                        "be the same with input mata"
            with self._lock:
                assert self._fly_data_block_meta[0] == meta
                self._fly_data_block_meta.pop(0)

    def _is_state_stale(self):
        """Return the stale flag."""
        with self._lock:
            return self._state_stale

    def _sync_with_data_block_manager(self):
        """Evict already dumped metas when the state is flagged stale."""
        if self._is_state_stale():
            self._evict_dumped_data_block_meta()

    def _evict_dumped_data_block_meta(self):
        """Drop leading queued metas whose index is below the dumped count."""
        dumped_count = self._data_block_manager.get_dumped_data_block_count()
        with self._lock:
            pending = self._fly_data_block_meta
            first_kept = next(
                (pos for pos, meta in enumerate(pending)
                 if meta.data_block_index >= dumped_count),
                len(pending)
            )
            self._fly_data_block_meta = pending[first_kept:]
class ExampleJoiner(object):
    """Base class holding the shared state of an example joiner.

    It owns a leader ``ExampleIdVisitor``, a follower ``RawDataVisitor`` and
    a ``DataBlockManager`` for one partition. ``_inner_joiner`` raises
    ``NotImplementedError`` here.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 etcd, data_source, partition_id):
        """Create visitors and manager, then position the visitors."""
        self._lock = threading.Lock()
        self._example_joiner_options = example_joiner_options
        self._raw_data_options = raw_data_options
        self._data_source = data_source
        self._partition_id = partition_id
        self._leader_visitor = ExampleIdVisitor(
            etcd, data_source, partition_id
        )
        self._follower_visitor = RawDataVisitor(
            etcd, data_source, partition_id, raw_data_options
        )
        self._data_block_manager = DataBlockManager(data_source, partition_id)
        self._data_block_builder = None
        self._state_stale = False
        self._follower_restart_index = 0
        self._sync_example_id_finished = False
        self._raw_data_finished = False
        self._join_finished = False
        self._latest_dump_timestamp = time.time()
        self._sync_state()

    @contextmanager
    def make_example_joiner(self):
        """Yield ``_inner_joiner(previous_stale_flag)`` with the flag set.

        The stale flag is only cleared when the ``with`` body returns
        normally, so a failed join is seen as stale by the next call.
        """
        was_stale = self._is_state_stale()
        self._acuqire_state_stale()
        yield self._inner_joiner(was_stale)
        self._release_state_stale()

    @classmethod
    def name(cls):
        """Return the joiner name of the base class."""
        return 'BASE_EXAMPLE_JOINER'

    def get_data_block_meta_by_index(self, index):
        """Return ``(join_finished, meta)`` for data block ``index``."""
        with self._lock:
            finished = self._join_finished
            meta = self._data_block_manager.get_data_block_meta_by_index(
                index
            )
            return finished, meta

    def get_dumped_data_block_count(self):
        """Return the dumped data block count reported by the manager."""
        return self._data_block_manager.get_dumped_data_block_count()

    def is_join_finished(self):
        """Return whether the join was marked as finished."""
        with self._lock:
            return self._join_finished

    def set_sync_example_id_finished(self):
        """Mark example id syncing as finished."""
        with self._lock:
            self._sync_example_id_finished = True

    def set_raw_data_finished(self):
        """Mark raw data as finished."""
        with self._lock:
            self._raw_data_finished = True

    def is_sync_example_id_finished(self):
        """Return whether example id syncing was marked as finished."""
        with self._lock:
            return self._sync_example_id_finished

    def is_raw_data_finished(self):
        """Return whether raw data was marked as finished."""
        with self._lock:
            return self._raw_data_finished

    def need_join(self):
        """Return whether a join round should run now.

        The answer is False once the join is finished. It is True when the
        state is stale, example id syncing is finished, a visitor is stale,
        or neither visitor is finished. Otherwise the dump interval decides.
        """
        with self._lock:
            if self._join_finished:
                return False
            if (self._state_stale
                    or self._sync_example_id_finished
                    or self._follower_visitor.is_visitor_stale()
                    or self._leader_visitor.is_visitor_stale()):
                return True
            if not (self._follower_visitor.finished()
                    or self._leader_visitor.finished()):
                return True
            return self._need_finish_data_block_since_interval()

    def _inner_joiner(self, reset_state):
        """Raise ``NotImplementedError`` in the base class."""
        raise NotImplementedError(
            "_inner_joiner not implement for base class: %s" %
            ExampleJoiner.name()
        )

    def _is_state_stale(self):
        """Return the stale flag."""
        with self._lock:
            return self._state_stale

    def _active_visitors(self):
        """Activate the leader visitor, then the follower visitor."""
        for visitor in (self._leader_visitor, self._follower_visitor):
            visitor.active_visitor()

    def _sync_state(self):
        """Position both visitors from the latest dumped data block meta.

        Without a meta both visitors are reset. A ``StopIteration`` raised
        by a seek is only logged.
        """
        meta = self._data_block_manager.get_lastest_data_block_meta()
        if meta is None:
            self._leader_visitor.reset()
            self._follower_visitor.reset()
            return
        try:
            self._leader_visitor.seek(meta.leader_end_index)
        except StopIteration:
            logging.warning("leader visitor finished")
        try:
            self._follower_visitor.seek(meta.follower_restart_index)
        except StopIteration:
            logging.warning("follower visitor finished")

    def _get_data_block_builder(self, create_if_no_existed):
        """Return the current builder, creating one when asked and absent."""
        if self._data_block_builder is not None or not create_if_no_existed:
            return self._data_block_builder
        data_block_index = \
            self._data_block_manager.get_dumped_data_block_count()
        builder = self._data_block_builder = DataBlockBuilder(
            self._data_source.data_block_dir,
            self._data_source.data_source_meta.name,
            self._partition_id,
            data_block_index,
            self._example_joiner_options.data_block_dump_threshold
        )
        builder.set_data_block_manager(self._data_block_manager)
        builder.set_follower_restart_index(self._follower_restart_index)
        return self._data_block_builder

    def _finish_data_block(self):
        """Finish the current builder and return its meta, or None."""
        if self._data_block_builder is None:
            return None
        meta = self._data_block_builder.finish_data_block()
        self._reset_data_block_builder()
        self._update_latest_dump_timestamp()
        return meta

    def _reset_data_block_builder(self):
        """Detach the current builder from the joiner."""
        with self._lock:
            builder, self._data_block_builder = self._data_block_builder, None
        if builder is not None:
            del builder

    def _update_latest_dump_timestamp(self):
        """Record the current time as the latest dump time."""
        with self._lock:
            self._latest_dump_timestamp = time.time()

    def _acuqire_state_stale(self):
        """Set the stale flag."""
        with self._lock:
            self._state_stale = True

    def _release_state_stale(self):
        """Clear the stale flag."""
        with self._lock:
            self._state_stale = False

    def _set_join_finished(self):
        """Mark the join as finished."""
        with self._lock:
            self._join_finished = True

    def _need_finish_data_block_since_interval(self):
        """Return True when a positive dump interval has elapsed."""
        interval = self._example_joiner_options.data_block_dump_interval
        elapsed = time.time() - self._latest_dump_timestamp
        return interval > 0 and interval <= elapsed
class ExampleJoiner(object):
    """Base class holding the shared state of an example joiner.

    It owns a leader ``ExampleIdVisitor``, a follower ``RawDataVisitor``, a
    ``DataBlockManager`` and join statistics for one partition.
    ``_inner_joiner`` raises ``NotImplementedError`` here.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore,
                 data_source, partition_id):
        """Create visitors, manager and stats, then position the visitors."""
        self._lock = threading.Lock()
        self._example_joiner_options = example_joiner_options
        self._raw_data_options = raw_data_options
        self._data_source = data_source
        self._partition_id = partition_id
        self._leader_visitor = ExampleIdVisitor(
            kvstore, data_source, partition_id
        )
        self._follower_visitor = RawDataVisitor(
            kvstore, data_source, partition_id, raw_data_options
        )
        self._data_block_manager = DataBlockManager(data_source, partition_id)
        # Resume the join statistics from the latest dumped block, if any.
        latest_meta = self._data_block_manager.get_lastest_data_block_meta()
        if latest_meta is not None:
            info = latest_meta.joiner_stats_info
            self._joiner_stats = JoinerStats(info.stats_cum_join_num,
                                             info.leader_stats_index,
                                             info.follower_stats_index)
        else:
            self._joiner_stats = JoinerStats(0, -1, -1)
        self._data_block_builder_options = data_block_builder_options
        self._data_block_builder = None
        self._state_stale = False
        self._follower_restart_index = 0
        self._sync_example_id_finished = False
        self._raw_data_finished = False
        self._join_finished = False
        self._metrics_tags = {
            'data_source_name': self._data_source.data_source_meta.name,
            'partition': partition_id,
            'joiner_name': self.name(),
        }
        self._optional_stats = OptionalStats(raw_data_options,
                                             self._metrics_tags)
        self._latest_dump_timestamp = time.time()
        self._sync_state()

    @contextmanager
    def make_example_joiner(self):
        """Yield ``_inner_joiner(previous_stale_flag)`` with the flag set.

        The stale flag is only cleared when the ``with`` body returns
        normally, so a failed join is seen as stale by the next call.
        """
        was_stale = self._is_state_stale()
        self._acuqire_state_stale()
        yield self._inner_joiner(was_stale)
        self._release_state_stale()

    @classmethod
    def name(cls):
        """Return the joiner name of the base class."""
        return 'BASE_EXAMPLE_JOINER'

    def get_data_block_meta_by_index(self, index):
        """Return ``(join_finished, meta)`` for data block ``index``."""
        with self._lock:
            finished = self._join_finished
            meta = self._data_block_manager.get_data_block_meta_by_index(
                index
            )
            return finished, meta

    def get_dumped_data_block_count(self):
        """Return the dumped data block count reported by the manager."""
        return self._data_block_manager.get_dumped_data_block_count()

    def is_join_finished(self):
        """Return whether the join was marked as finished."""
        with self._lock:
            return self._join_finished

    def set_sync_example_id_finished(self):
        """Mark example id syncing as finished."""
        with self._lock:
            self._sync_example_id_finished = True

    def set_raw_data_finished(self):
        """Mark raw data as finished."""
        with self._lock:
            self._raw_data_finished = True

    def is_sync_example_id_finished(self):
        """Return whether example id syncing was marked as finished."""
        with self._lock:
            return self._sync_example_id_finished

    def is_raw_data_finished(self):
        """Return whether raw data was marked as finished."""
        with self._lock:
            return self._raw_data_finished

    def need_join(self):
        """Return whether a join round should run now.

        The answer is False once the join is finished. It is True when the
        state is stale, example id syncing is finished, a visitor is stale,
        or neither visitor is finished. Otherwise the dump interval decides.
        """
        with self._lock:
            if self._join_finished:
                return False
            if (self._state_stale
                    or self._sync_example_id_finished
                    or self._follower_visitor.is_visitor_stale()
                    or self._leader_visitor.is_visitor_stale()):
                return True
            if not (self._follower_visitor.finished()
                    or self._leader_visitor.finished()):
                return True
            return self._need_finish_data_block_since_interval()

    def _prepare_join(self, state_stale):
        """Re-sync when stale, activate visitors and snapshot the flags.

        Returns:
            ``(sync_example_id_finished, raw_data_finished)`` as read before
            the visitors were activated.
        """
        if state_stale:
            self._sync_state()
            self._reset_data_block_builder()
        flags = (self.is_sync_example_id_finished(),
                 self.is_raw_data_finished())
        self._active_visitors()
        return flags

    def _inner_joiner(self, reset_state):
        """Raise ``NotImplementedError`` in the base class."""
        raise NotImplementedError(
            "_inner_joiner not implement for base class: %s" %
            ExampleJoiner.name()
        )

    def _is_state_stale(self):
        """Return the stale flag."""
        with self._lock:
            return self._state_stale

    def _active_visitors(self):
        """Activate the leader visitor, then the follower visitor."""
        for visitor in (self._leader_visitor, self._follower_visitor):
            visitor.active_visitor()

    def _sync_state(self):
        """Position both visitors from the latest dumped data block meta.

        Without a meta both visitors are reset. A ``StopIteration`` raised
        by a seek is only logged.
        """
        meta = self._data_block_manager.get_lastest_data_block_meta()
        if meta is None:
            self._leader_visitor.reset()
            self._follower_visitor.reset()
            return
        try:
            self._leader_visitor.seek(meta.leader_end_index)
        except StopIteration:
            logging.warning("leader visitor finished")
        try:
            self._follower_visitor.seek(meta.follower_restart_index)
        except StopIteration:
            logging.warning("follower visitor finished")

    def _get_data_block_builder(self, create_if_no_existed):
        """Return the current builder, creating one when asked and absent."""
        if self._data_block_builder is not None or not create_if_no_existed:
            return self._data_block_builder
        data_block_index = \
            self._data_block_manager.get_dumped_data_block_count()
        builder = self._data_block_builder = DataBlockBuilder(
            common.data_source_data_block_dir(self._data_source),
            self._data_source.data_source_meta.name,
            self._partition_id,
            data_block_index,
            self._data_block_builder_options,
            self._example_joiner_options.data_block_dump_threshold
        )
        builder.set_data_block_manager(self._data_block_manager)
        builder.set_follower_restart_index(self._follower_restart_index)
        return self._data_block_builder

    def _finish_data_block(self):
        """Finish the current builder and return its meta, or None.

        Join statistics are attached to the builder before it is finished.
        Optional stats are emitted afterwards.
        """
        if self._data_block_builder is None:
            return None
        self._data_block_builder.set_join_stats_info(
            self._create_join_stats_info()
        )
        meta = self._data_block_builder.finish_data_block(
            True, self._metrics_tags
        )
        self._optional_stats.emit_optional_stats()
        self._reset_data_block_builder()
        self._update_latest_dump_timestamp()
        return meta

    def _create_join_stats_info(self):
        """Build a ``JoinerStatsInfo`` from the current statistics.

        The actual cumulative join count is the current builder's example
        count plus the count stored in the latest dumped meta, if any.
        """
        builder = self._get_data_block_builder(False)
        stats_joined = self._joiner_stats.calc_stats_joined_num()
        if builder is None:
            actual_joined = 0
        else:
            actual_joined = builder.example_count()
        latest_meta = self._data_block_manager.get_lastest_data_block_meta()
        if latest_meta is not None:
            actual_joined += latest_meta.joiner_stats_info.actual_cum_join_num
        return dj_pb.JoinerStatsInfo(
            stats_cum_join_num=stats_joined,
            actual_cum_join_num=actual_joined,
            leader_stats_index=self._joiner_stats.get_leader_stats_index(),
            follower_stats_index=self._joiner_stats.get_follower_stats_index()
        )

    def _reset_data_block_builder(self):
        """Detach the current builder from the joiner."""
        with self._lock:
            builder, self._data_block_builder = self._data_block_builder, None
        if builder is not None:
            del builder

    def _update_latest_dump_timestamp(self):
        """Emit the time since the last dump and restart the dump clock."""
        elapsed = time.time() - self._latest_dump_timestamp
        metrics.emit_timer(name='data_block_dump_duration',
                           value=int(elapsed),
                           tags=self._metrics_tags)
        self._latest_dump_timestamp = time.time()

    def _acuqire_state_stale(self):
        """Set the stale flag."""
        with self._lock:
            self._state_stale = True

    def _release_state_stale(self):
        """Clear the stale flag."""
        with self._lock:
            self._state_stale = False

    def _set_join_finished(self):
        """Mark the join as finished."""
        with self._lock:
            self._join_finished = True

    def _need_finish_data_block_since_interval(self):
        """Return True when a positive dump interval has elapsed."""
        interval = self._example_joiner_options.data_block_dump_interval
        elapsed = time.time() - self._latest_dump_timestamp
        return interval > 0 and interval <= elapsed
class ExampleJoiner(object):
    """Base class for example joiners of one partition.

    Holds a leader ``ExampleIdVisitor``, a follower ``RawDataVisitor`` and a
    ``DataBlockManager``. ``join_example`` raises ``NotImplementedError``
    here.
    """

    def __init__(self, etcd, data_source, partition_id, options):
        """Create visitors and manager, then position the visitors."""
        self._data_source = data_source
        self._partition_id = partition_id
        self._leader_visitor = ExampleIdVisitor(
                ExampleIdManager(data_source, partition_id))
        self._follower_visitor = RawDataVisitor(etcd, data_source,
                                                partition_id, options)
        self._data_block_manager = DataBlockManager(data_source, partition_id)
        # Builder of the data block currently being filled, created lazily.
        self._data_block_builder = None
        # Passed to the manager in _sync_state and cleared there afterwards.
        self._stale_with_dfs = False
        self._follower_restart_index = 0
        self._sync_state()

    def join_example(self):
        """Raise ``NotImplementedError`` in the base class."""
        raise NotImplementedError(
                "join exampel not implement for base class: %s" %
                ExampleJoiner.name())

    @classmethod
    def name(cls):
        """Return the joiner name of the base class."""
        return 'EXAMPLE_JOINER'

    def get_data_block_number(self):
        """Return the number of dumped data blocks from the manager."""
        # NOTE(review): this calls num_dumped_data_block() while
        # _get_data_block_builder calls get_dumped_data_block_num();
        # confirm that DataBlockManager provides both names.
        return self._data_block_manager.num_dumped_data_block()

    def get_data_block_meta(self, index):
        """Return the meta of data block ``index`` from the manager."""
        return self._data_block_manager.get_data_block_meta_by_index(index)

    def join_finished(self):
        """Return the manager's join-finished state."""
        return self._data_block_manager.join_finished()

    def _sync_state(self):
        """Position both visitors from the last dumped data block meta.

        A ``StopIteration`` raised by a seek is only logged. If either
        visitor is finished afterwards, the manager is told the join is
        finished. The stale flag is cleared at the end.
        """
        meta = self._data_block_manager.get_last_data_block_meta(
                self._stale_with_dfs)
        if meta is not None:
            try:
                self._leader_visitor.seek(meta.leader_end_index)
            except StopIteration:
                logging.warning("leader visitor finished")
            try:
                self._follower_visitor.seek(meta.follower_restart_index)
            except StopIteration:
                logging.warning("follower visitor finished")
            if (self._leader_visitor.finished() or
                    self._follower_visitor.finished()):
                self._data_block_manager.finish_join()
        self._stale_with_dfs = False

    def _get_data_block_builder(self):
        """Return the current builder, creating it on first use."""
        if self._data_block_builder is not None:
            return self._data_block_builder
        # A new block takes the index right after the dumped ones.
        data_block_index = self._data_block_manager.get_dumped_data_block_num()
        self._data_block_builder = DataBlockBuilder(
                self._data_source.data_block_dir,
                self._partition_id,
                data_block_index,
                self._data_source.data_source_meta.max_example_in_data_block)
        return self._data_block_builder

    def _finish_data_block(self):
        """Finish the current builder and register its meta, if any.

        The builder must exist. It is dropped afterwards in every case.
        """
        assert self._data_block_builder is not None
        self._data_block_builder.set_follower_restart_index(
                self._follower_restart_index)
        self._data_block_builder.finish_data_block()
        meta = self._data_block_builder.get_data_block_meta()
        if meta is not None:
            self._data_block_manager.add_dumped_data_block_meta(meta)
        self._data_block_builder = None
class ExampleIdBatchFetcher(object):
    """Fetch example ids of one partition into a queue of batches.

    Batches are read from a ``RawDataVisitor`` by ``_inner_fetcher``, kept in
    ``_batch_queue`` in ``begin_index`` order, looked up with
    ``fetch_example_id_batch_by_index`` and dropped with
    ``evict_staless_example_id_batch``.
    """

    class ExampleIdBatch(object):
        """Thin wrapper around a ``dj_pb.LiteExampleIds`` message."""

        def __init__(self, partition_id, begin_index):
            self._lite_example_ids = dj_pb.LiteExampleIds(
                    partition_id=partition_id,
                    begin_index=begin_index)

        def append(self, example_id, event_time):
            """Append one example id and its event time to the message."""
            self._lite_example_ids.example_id.append(example_id)
            self._lite_example_ids.event_time.append(event_time)

        @property
        def begin_index(self):
            return self._lite_example_ids.begin_index

        @property
        def lite_example_ids(self):
            return self._lite_example_ids

        @property
        def partition_id(self):
            return self._lite_example_ids.partition_id

        def __len__(self):
            """Return the number of example ids appended so far."""
            return len(self._lite_example_ids.example_id)

        def __lt__(self, other):
            """Order batches by ``begin_index``; used by the bisect lookups."""
            assert isinstance(other, ExampleIdBatchFetcher.ExampleIdBatch)
            assert self.partition_id == other.partition_id
            return self.begin_index < other.begin_index

    def __init__(self, etcd, data_source, partition_id,
                 raw_data_options, example_id_batch_options):
        """Create the raw data visitor and an empty batch queue."""
        self._lock = threading.Lock()
        self._partition_id = partition_id
        self._raw_data_visitor = RawDataVisitor(etcd, data_source,
                                                partition_id, raw_data_options)
        self._example_id_batch_options = example_id_batch_options
        # Sum of len(batch) over the batches currently in _batch_queue.
        self._flying_example_id_count = 0
        self._batch_queue = []
        self._raw_data_finished = False
        self._fetch_finished = False
        # Set by _set_fetch_finished; None until fetching has finished.
        self._last_index = None

    def need_fetch(self, next_index):
        """Return whether fetching from ``next_index`` should run.

        False for a ``None`` index or an index past ``_last_index``. True
        when the index does not line up with the queue. Otherwise True only
        while the flying count is below ``max_flying_example_id``.
        """
        with self._lock:
            if next_index is None:
                return False
            if self._last_index is not None and next_index > self._last_index:
                assert self._fetch_finished
                return False
            if self._check_index_rollback(next_index):
                return True
            return self._flying_example_id_count < \
                    self._example_id_batch_options.max_flying_example_id

    def set_raw_data_finished(self):
        """Mark raw data as finished."""
        with self._lock:
            self._raw_data_finished = True

    def is_raw_data_finished(self):
        """Return whether raw data was marked as finished."""
        with self._lock:
            return self._raw_data_finished

    @contextmanager
    def make_fetcher(self, next_index):
        """Yield the batch generator created by ``_inner_fetcher``."""
        yield self._inner_fetcher(next_index)

    def _inner_fetcher(self, next_index):
        """Generate new batches starting at ``next_index``.

        If the index does not line up with the queue, the queue is cleared.
        If batches remain queued, fetching resumes right after the last one.
        Each generated batch is appended to the queue before it is yielded.
        Fetching is marked finished when raw data was flagged finished at
        entry and the visitor is finished after the loop.
        """
        raw_data_finished = False
        with self._lock:
            if next_index is None:
                return
            if self._check_index_rollback(next_index):
                self._batch_queue = []
                self._flying_example_id_count = 0
            if len(self._batch_queue) > 0:
                end_batch = self._batch_queue[-1]
                next_index = end_batch.begin_index + len(end_batch)
            # Snapshot taken under the lock; later changes are not seen here.
            raw_data_finished = self._raw_data_finished
        assert next_index >= 0, "the next index should >= 0"
        self._raw_data_visitor.active_visitor()
        if next_index == 0:
            self._raw_data_visitor.reset()
        else:
            # Seek to the previous item so iteration resumes at next_index.
            self._raw_data_visitor.seek(next_index - 1)
        while not self._raw_data_visitor.finished() and \
                not self._fly_example_id_full():
            next_batch = ExampleIdBatchFetcher.ExampleIdBatch(
                    self._partition_id, next_index)
            for (index, item) in self._raw_data_visitor:
                if index != next_index:
                    # A gap in the indices terminates the process.
                    logging.fatal("index is for partition %d not consecutive, "\
                                  "%d != %d",
                                  self._partition_id, index, next_index)
                    os._exit(-1) # pylint: disable=protected-access
                next_batch.append(item.example_id, item.event_time)
                next_index += 1
                # The batch is closed once it holds more than the configured
                # example_id_batch_size.
                if len(next_batch) > \
                        self._example_id_batch_options.example_id_batch_size:
                    break
            self._append_new_example_id_batch(next_batch)
            yield next_batch
        if raw_data_finished and self._raw_data_visitor.finished():
            self._set_fetch_finished(self._raw_data_visitor.get_index())

    def fetch_example_id_batch_by_index(self, next_index, hit_idx=None):
        """Look up the queued batch that begins at ``next_index``.

        ``hit_idx`` is an optional queue position tried before the binary
        search.

        Returns:
            ``(finished, batch, hit_idx)``. ``batch`` is None when no queued
            batch begins at ``next_index``. ``(False, None, None)`` is
            returned after logging a rollback warning.
        """
        with self._lock:
            if next_index is None:
                return False, None, hit_idx
            if self._last_index is not None and self._last_index < next_index:
                assert self._fetch_finished
                return True, None, None
            if len(self._batch_queue) == 0:
                return False, None, 0
            end_batch = self._batch_queue[-1]
            # fast path, use the hit
            if hit_idx is not None:
                if hit_idx < len(self._batch_queue):
                    if self._batch_queue[hit_idx].begin_index == next_index:
                        return False, self._batch_queue[hit_idx], hit_idx
                elif next_index >= end_batch.begin_index + len(end_batch):
                    return self._fetch_finished, None, hit_idx
            # Slow path: probe batch carrying only the wanted begin index.
            fake_batch = ExampleIdBatchFetcher.ExampleIdBatch(
                    self._partition_id, next_index)
            idx = bisect.bisect_left(self._batch_queue, fake_batch)
            if idx == len(self._batch_queue):
                if end_batch.begin_index + len(end_batch) >= next_index:
                    return self._fetch_finished, None, len(self._batch_queue)
            elif self._batch_queue[idx].begin_index == next_index:
                return False, self._batch_queue[idx], idx
            logging.warning("next_index %d rollback! check it", next_index)
            return False, None, None

    def evict_staless_example_id_batch(self, dumped_index):
        """Drop leading batches whose last index is <= ``dumped_index``.

        Nothing is dropped when ``dumped_index`` is None.

        Returns:
            The number of batches removed from the queue.
        """
        with self._lock:
            skip_batch = 0
            while dumped_index is not None and \
                    len(self._batch_queue) > skip_batch:
                batch = self._batch_queue[skip_batch]
                if batch.begin_index + len(batch) - 1 <= dumped_index:
                    skip_batch += 1
                    self._flying_example_id_count -= len(batch)
                else:
                    break
            self._batch_queue = self._batch_queue[skip_batch:]
            return skip_batch

    def _append_new_example_id_batch(self, next_batch):
        """Append ``next_batch`` to the queue and grow the flying count.

        The process is terminated if the batch does not start right after
        the last queued batch.
        """
        with self._lock:
            if len(self._batch_queue) > 0:
                end_batch = self._batch_queue[-1]
                expected_index = end_batch.begin_index + len(end_batch)
                if expected_index != next_batch.begin_index:
                    logging.fatal("next batch index is not consecutive!"\
                                  "%d(expected_index) != %d(supply_index)",
                                  expected_index, next_batch.begin_index)
                    os._exit(-1) # pylint: disable=protected-access
            self._batch_queue.append(next_batch)
            self._flying_example_id_count += len(next_batch)

    def _check_index_rollback(self, next_index):
        """Return True when ``next_index`` does not line up with the queue.

        An empty queue counts as a rollback. The index lines up when it
        equals the end of the last batch or the ``begin_index`` of a queued
        batch. Callers in this class hold ``self._lock`` when calling this.
        """
        assert next_index is not None
        if len(self._batch_queue) == 0:
            return True
        end_batch = self._batch_queue[-1]
        # fast path check index consecutively
        if next_index == end_batch.begin_index + len(end_batch):
            return False
        # slow path since need binary search
        fake_batch = ExampleIdBatchFetcher.ExampleIdBatch(
                self._partition_id, next_index)
        idx = bisect.bisect_left(self._batch_queue, fake_batch)
        if idx == len(self._batch_queue):
            return next_index != end_batch.begin_index + len(end_batch)
        return self._batch_queue[idx].begin_index != next_index

    def _fly_example_id_full(self):
        """Return True once the flying count exceeds the configured maximum."""
        with self._lock:
            return self._flying_example_id_count > \
                    self._example_id_batch_options.max_flying_example_id

    def _set_fetch_finished(self, last_index):
        """Mark fetching as finished and record ``last_index``."""
        with self._lock:
            self._fetch_finished = True
            self._last_index = last_index
class DataBlockDumperManager(object):
    """Queue synced data block metas of one partition and dump them in order.

    Metas are appended with ``append_synced_data_block_meta`` and written out
    by ``dump_data_blocks``. When a dump fails, the manager is flagged as
    stale with DFS. The next dump then re-syncs with the data block manager
    before continuing.
    """

    def __init__(self, etcd, data_source, partition_id):
        """Create the block manager and raw data visitor for the partition."""
        self._lock = threading.Lock()
        self._data_source = data_source
        self._partition_id = partition_id
        self._data_block_manager = DataBlockManager(data_source, partition_id)
        self._raw_data_visitor = RawDataVisitor(
            etcd, data_source, partition_id
        )
        # The next meta to accept follows the blocks that are already dumped.
        self._next_data_block_index = (
            self._data_block_manager.get_dumped_data_block_num()
        )
        self._fly_data_block_meta = []
        self._stale_with_dfs = False
        self._synced_data_block_meta_finished = False

    def get_partition_id(self):
        """Return the partition id this manager works on."""
        return self._partition_id

    def get_next_data_block_index(self):
        """Return the index the next synced meta must carry."""
        with self._lock:
            return self._next_data_block_index

    def append_synced_data_block_meta(self, meta):
        """Queue ``meta`` if it carries the expected data block index.

        Returns:
            ``(accepted, next_index)``; ``next_index`` is the index expected
            after this call.
        """
        with self._lock:
            if self._next_data_block_index != meta.data_block_index:
                return False, self._next_data_block_index
            self._fly_data_block_meta.append(meta)
            self._next_data_block_index += 1
            return True, self._next_data_block_index

    def finish_sync_data_block_meta(self):
        """Mark that no more metas will be synced."""
        with self._lock:
            self._synced_data_block_meta_finished = True

    def need_dump(self):
        """Return True when metas are queued or a re-sync is pending."""
        with self._lock:
            return (len(self._fly_data_block_meta) > 0 or
                    self._stale_with_dfs)

    def dump_data_blocks(self):
        """Dump every queued meta; never raises.

        Any failure is logged and flags the manager as stale, so the next
        call re-syncs before retrying.
        """
        try:
            self._sync_with_dfs()
            while True:
                with self._lock:
                    _, meta = self._get_next_data_block_meta()
                if meta is None:
                    return
                self._create_data_block_by_meta(meta)
        except Exception as e:  # pylint: disable=broad-except
            logging.error("Failed to dump data block for partition "
                          "%d with expect %s", self._partition_id, e)
            with self._lock:
                self._stale_with_dfs = True

    def data_block_meta_sync_finished(self):
        """Return whether meta syncing was marked as finished."""
        with self._lock:
            return self._synced_data_block_meta_finished

    def _get_next_data_block_meta(self):
        """Return ``(finished, meta)`` for the head of the queue.

        ``meta`` is None when the queue is empty. ``finished`` is True only
        when the queue is empty and syncing was marked finished. The caller
        must hold ``self._lock``.
        """
        if len(self._fly_data_block_meta) == 0:
            if self._synced_data_block_meta_finished:
                return True, None
            return False, None
        return False, self._fly_data_block_meta[0]

    @contextmanager
    def _make_data_block_builder(self, meta):
        """Yield a ``DataBlockBuilder`` initialised from ``meta``.

        Failures are logged and raised again. Swallowing them here would
        make the context manager suppress errors from the ``with`` body, and
        the caller would retry the same meta forever without re-syncing.
        """
        manager = self._data_block_manager
        assert manager is not None
        assert self._partition_id == meta.partition_id
        builder = None
        try:
            builder = DataBlockBuilder(
                self._data_source.data_block_dir,
                self._partition_id,
                meta.data_block_index,
            )
            builder.init_by_meta(meta)
            yield builder
        except Exception as e:  # pylint: disable=broad-except
            logging.warning(
                "Failed make data block builder, reason %s", e
            )
            raise
        finally:
            # Drop the local reference whether or not the build succeeded.
            del builder

    def _create_data_block_by_meta(self, meta):
        """Rebuild the data block described by ``meta`` from raw data.

        Raw items are scanned from ``meta.leader_start_index``. Items whose
        example id equals the next expected id are appended. Expected ids
        that were not found are padded with examples carrying only the
        ``example_id`` feature. Does nothing when ``meta`` is None.
        """
        if meta is None:
            return
        with self._make_data_block_builder(meta) as data_block_builder:
            try:
                if meta.leader_start_index == 0:
                    self._raw_data_visitor.reset()
                else:
                    assert meta.leader_start_index > 0
                    # Seek to the previous item so iteration resumes at
                    # leader_start_index.
                    self._raw_data_visitor.seek(meta.leader_start_index - 1)
            except StopIteration:
                logging.fatal("raw data finished before when seek to %d",
                              meta.leader_start_index - 1)
                os._exit(-1)  # pylint: disable=protected-access
            match_index = 0
            example_num = len(meta.example_ids)
            for (index, item) in self._raw_data_visitor:
                example_id = item.example_id
                if example_id == meta.example_ids[match_index]:
                    data_block_builder.append_raw_example(item.record)
                    match_index += 1
                if match_index >= example_num:
                    break
                if index >= meta.leader_end_index:
                    break
            if match_index < example_num:
                for idx in range(match_index, example_num):
                    feat = {}
                    example_id = meta.example_ids[idx]
                    feat['example_id'] = tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[example_id]))
                    empty_example = tf.train.Example(
                        features=tf.train.Features(feature=feat))
                    data_block_builder.append_raw_example(
                        empty_example.SerializeToString()
                    )
            data_block_builder.finish_data_block()
            assert meta == data_block_builder.get_data_block_meta()
            self._data_block_manager.add_dumped_data_block_meta(meta)
            with self._lock:
                assert self._fly_data_block_meta[0] == meta
                self._fly_data_block_meta.pop(0)

    def _sync_with_dfs(self):
        """Evict metas already dumped, then clear the stale flag.

        The stale flag is passed to the data block manager; presumably it
        tells the manager whether to refresh its state from storage
        (confirm against ``DataBlockManager``). The flag is cleared only
        after the sync succeeded.
        """
        with self._lock:
            stale = self._stale_with_dfs
        dumped_num = self._data_block_manager.get_dumped_data_block_num(stale)
        with self._lock:
            skip_count = 0
            for meta in self._fly_data_block_meta:
                if meta.data_block_index >= dumped_num:
                    break
                skip_count += 1
            self._fly_data_block_meta = self._fly_data_block_meta[skip_count:]
            self._stale_with_dfs = False
class DataBlockDumperManager(object):
    """Queue synced data block metas of one partition and dump them in order.

    Metas are appended with ``add_synced_data_block_meta`` and written out by
    the callable obtained from ``make_data_block_dumper``.
    """

    def __init__(self, kvstore, data_source, partition_id,
                 raw_data_options, data_block_builder_options):
        """Create the block manager, raw data visitor and stats helpers."""
        self._lock = threading.Lock()
        self._data_source = data_source
        self._partition_id = partition_id
        self._data_block_manager = DataBlockManager(data_source, partition_id)
        self._raw_data_visitor = RawDataVisitor(
            kvstore, data_source, partition_id, raw_data_options
        )
        self._data_block_builder_options = data_block_builder_options
        # The next meta to accept follows the blocks that are already dumped.
        self._next_data_block_index = (
            self._data_block_manager.get_dumped_data_block_count()
        )
        self._fly_data_block_meta = []
        self._state_stale = False
        self._synced_data_block_meta_finished = False
        self._metrics_tags = {
            'data_source_name': self._data_source.data_source_meta.name,
            'partition': self._partition_id,
        }
        self._optional_stats = OptionalStats(raw_data_options,
                                             self._metrics_tags)

    def get_next_data_block_index(self):
        """Return the index the next synced meta must carry."""
        with self._lock:
            return self._next_data_block_index

    def get_dumped_data_block_index(self):
        """Return the dumped block count minus one."""
        dumped_count = self._data_block_manager.get_dumped_data_block_count()
        return dumped_count - 1

    def add_synced_data_block_meta(self, meta):
        """Queue ``meta`` if it carries the expected data block index.

        Returns:
            ``(accepted, next_index)``; ``next_index`` is the index expected
            after this call.

        Raises:
            RuntimeError: if ``finish_sync_data_block_meta`` was called.
        """
        with self._lock:
            if self._synced_data_block_meta_finished:
                raise RuntimeError(
                    "data block dumper manager has been mark as "
                    "no more data block meta"
                )
            if self._next_data_block_index == meta.data_block_index:
                self._fly_data_block_meta.append(meta)
                self._next_data_block_index += 1
                return True, self._next_data_block_index
            return False, self._next_data_block_index

    def finish_sync_data_block_meta(self):
        """Mark that no more metas will be synced."""
        with self._lock:
            self._synced_data_block_meta_finished = True

    def need_dump(self):
        """Return True while at least one queued meta is waiting."""
        with self._lock:
            return bool(self._fly_data_block_meta)

    def is_synced_data_block_meta_finished(self):
        """Return whether meta syncing was marked as finished."""
        with self._lock:
            return self._synced_data_block_meta_finished

    @contextmanager
    def make_data_block_dumper(self):
        """Yield the dump callable while the state is flagged as stale.

        The flag is only cleared when the ``with`` body returns normally.
        If the body raises, the flag stays set, so the next call first evicts
        the metas that were already dumped.
        """
        self._sync_with_data_block_manager()
        self._acquire_state_stale()
        yield self._dump_data_blocks
        self._release_state_stale()

    def _dump_data_blocks(self):
        """Dump queued metas one by one and emit a timer for each dump."""
        while self.need_dump():
            meta = self._get_next_data_block_meta()
            if meta is None:
                continue
            started_at = time.time()
            self._raw_data_visitor.active_visitor()
            self._dump_data_block_by_meta(meta)
            elapsed = time.time() - started_at
            metrics.emit_timer(name='data_block_dump_duration',
                               value=int(elapsed),
                               tags=self._metrics_tags)

    def data_block_meta_sync_finished(self):
        """Return whether meta syncing was marked as finished."""
        with self._lock:
            return self._synced_data_block_meta_finished

    def _acquire_state_stale(self):
        """Set the stale flag."""
        with self._lock:
            self._state_stale = True

    def _release_state_stale(self):
        """Clear the stale flag."""
        with self._lock:
            self._state_stale = False

    def _get_next_data_block_meta(self):
        """Return the oldest queued meta without removing it, or None."""
        with self._lock:
            if not self._fly_data_block_meta:
                return None
            return self._fly_data_block_meta[0]

    @contextmanager
    def _make_data_block_builder(self, meta):
        """Yield a ``DataBlockBuilder`` initialised from ``meta``.

        Any exception from building or from the ``with`` body is logged and
        then raised again.
        """
        assert self._partition_id == meta.partition_id, \
            "partition id of building data block meta mismatch "\
            "{} != {}".format(self._partition_id, meta.partition_id)
        builder = None
        failure = None
        try:
            builder = DataBlockBuilder(
                common.data_source_data_block_dir(self._data_source),
                self._data_source.data_source_meta.name,
                self._partition_id,
                meta.data_block_index,
                self._data_block_builder_options
            )
            builder.init_by_meta(meta)
            builder.set_data_block_manager(self._data_block_manager)
            yield builder
        except Exception as e:  # pylint: disable=broad-except
            logging.warning("Failed make data block builder, "
                            "reason %s", e)
            failure = e
        # Drop the local reference before a possible re-raise.
        if builder is not None:
            del builder
        if failure is not None:
            raise failure

    def _dump_data_block_by_meta(self, meta):
        """Rebuild the data block described by ``meta`` from raw data.

        Raw items are scanned from ``meta.leader_start_index``. When
        ``meta.indices`` is non-empty, items are matched by raw data index,
        otherwise by example id. One raw item may satisfy several
        consecutive entries of the meta. The process is terminated if the
        raw data ends early or examples are missing.
        """
        assert meta is not None, "input data block must not be None"
        visitor = self._raw_data_visitor
        with self._make_data_block_builder(meta) as builder:
            start_index = meta.leader_start_index
            try:
                if start_index == 0:
                    visitor.reset()
                else:
                    assert start_index > 0, \
                        "leader start index must be positive"
                    visitor.seek(start_index - 1)
            except StopIteration:
                logging.fatal("raw data finished before when seek to %d",
                              start_index - 1)
                traceback.print_stack()
                os._exit(-1)  # pylint: disable=protected-access
            matched = 0
            wanted_num = len(meta.example_ids)
            by_index = len(meta.indices) > 0
            for index, item in visitor:
                example_id = item.example_id
                matched_before = matched
                # Elements in meta.example_ids maybe duplicated
                while matched < wanted_num and (
                        meta.indices[matched] == index if by_index
                        else example_id == meta.example_ids[matched]):
                    if len(meta.joined) > 0:
                        item.add_extra_fields(
                            {'joined': meta.joined[matched]}, True
                        )
                    builder.write_item(item)
                    self._optional_stats.update_stats(item, kind='joined')
                    matched += 1
                if matched == matched_before:
                    # The item satisfied no entry of the meta.
                    self._optional_stats.update_stats(item, kind='unjoined')
                if matched >= wanted_num or index >= meta.leader_end_index:
                    break
            if matched < wanted_num:
                logging.fatal(
                    "Data lose corrupt! only match %d/%d example "
                    "for data block %s",
                    matched, wanted_num, meta.block_id
                )
                traceback.print_stack()
                os._exit(-1)  # pylint: disable=protected-access
            dumped_meta = builder.finish_data_block(True)
            self._optional_stats.emit_optional_stats()
            assert dumped_meta == meta, "the generated dumped meta should "\
                                        "be the same with input mata"
            with self._lock:
                assert self._fly_data_block_meta[0] == meta
                self._fly_data_block_meta.pop(0)

    def _is_state_stale(self):
        """Return the stale flag."""
        with self._lock:
            return self._state_stale

    def _sync_with_data_block_manager(self):
        """Evict already dumped metas when the state is flagged stale."""
        if self._is_state_stale():
            self._evict_dumped_data_block_meta()

    def _evict_dumped_data_block_meta(self):
        """Drop leading queued metas whose index is below the dumped count."""
        dumped_count = self._data_block_manager.get_dumped_data_block_count()
        with self._lock:
            pending = self._fly_data_block_meta
            first_kept = next(
                (pos for pos, meta in enumerate(pending)
                 if meta.data_block_index >= dumped_count),
                len(pending)
            )
            self._fly_data_block_meta = pending[first_kept:]