Exemplo n.º 1
0
 def __init__(self, example_joiner_options, raw_data_options,
              data_block_builder_options, kvstore, data_source,
              partition_id):
     """Create the visitors, restore joiner stats and sync the visitors.

     Args:
         example_joiner_options: stored as-is on the instance.
         raw_data_options: stored on the instance and forwarded to
             ``RawDataVisitor`` and ``OptionalStats``.
         data_block_builder_options: stored as-is on the instance.
         kvstore: forwarded to both visitors; not stored here.
         data_source: stored on the instance; its
             ``data_source_meta.name`` is used as a metrics tag.
         partition_id: stored on the instance and used as a metrics tag.
     """
     self._lock = threading.Lock()
     self._example_joiner_options = example_joiner_options
     self._raw_data_options = raw_data_options
     self._data_source = data_source
     self._partition_id = partition_id

     # Leader side walks example ids, follower side walks raw data.
     self._leader_visitor = ExampleIdVisitor(
         kvstore, self._data_source, self._partition_id
     )
     self._follower_visitor = RawDataVisitor(
         kvstore, self._data_source, self._partition_id, raw_data_options
     )
     self._data_block_manager = DataBlockManager(
         self._data_source, self._partition_id
     )

     # Seed the stats from the latest data block meta when there is one,
     # otherwise start from the (0, -1, -1) defaults.
     latest_meta = self._data_block_manager.get_lastest_data_block_meta()
     if latest_meta is not None:
         saved_stats = latest_meta.joiner_stats_info
         stats_args = (saved_stats.stats_cum_join_num,
                       saved_stats.leader_stats_index,
                       saved_stats.follower_stats_index)
     else:
         stats_args = (0, -1, -1)
     self._joiner_stats = JoinerStats(*stats_args)

     self._data_block_builder_options = data_block_builder_options
     self._data_block_builder = None
     self._follower_restart_index = 0

     # Progress flags, all initially cleared.
     self._state_stale = False
     self._sync_example_id_finished = False
     self._raw_data_finished = False
     self._join_finished = False

     self._metrics_tags = {
         'data_source_name': self._data_source.data_source_meta.name,
         'partition': partition_id,
         'joiner_name': self.name()
     }
     self._optional_stats = OptionalStats(raw_data_options,
                                          self._metrics_tags)
     self._latest_dump_timestamp = time.time()

     # Must run last: it drives the visitors and manager built above.
     self._sync_state()
Exemplo n.º 2
0
class ExampleJoiner(object):
    """Base class holding the shared state of an example joiner.

    For one partition of a data source it owns a leader-side
    ``ExampleIdVisitor``, a follower-side ``RawDataVisitor`` and a
    ``DataBlockManager``, plus a set of progress flags that are read and
    written under an internal lock. ``_inner_joiner`` raises
    ``NotImplementedError`` here and has to be provided by subclasses.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        """Create the visitors, restore joiner stats and sync the visitors.

        Args:
            example_joiner_options: stored as-is; its
                ``data_block_dump_threshold`` and
                ``data_block_dump_interval`` are read by other methods.
            raw_data_options: stored and forwarded to ``RawDataVisitor``
                and ``OptionalStats``.
            data_block_builder_options: stored and forwarded to every
                ``DataBlockBuilder`` created later.
            kvstore: forwarded to both visitors; not stored here.
            data_source: stored; its ``data_source_meta.name`` is used as
                a metrics tag and as a builder argument.
            partition_id: stored and used as a metrics tag.
        """
        self._lock = threading.Lock()
        self._example_joiner_options = example_joiner_options
        self._raw_data_options = raw_data_options
        self._data_source = data_source
        self._partition_id = partition_id

        self._leader_visitor = ExampleIdVisitor(
            kvstore, self._data_source, self._partition_id
        )
        self._follower_visitor = RawDataVisitor(
            kvstore, self._data_source, self._partition_id, raw_data_options
        )
        self._data_block_manager = DataBlockManager(
            self._data_source, self._partition_id
        )

        # Seed the stats from the latest data block meta when there is
        # one, otherwise start from the (0, -1, -1) defaults.
        latest_meta = self._data_block_manager.get_lastest_data_block_meta()
        if latest_meta is not None:
            saved_stats = latest_meta.joiner_stats_info
            stats_args = (saved_stats.stats_cum_join_num,
                          saved_stats.leader_stats_index,
                          saved_stats.follower_stats_index)
        else:
            stats_args = (0, -1, -1)
        self._joiner_stats = JoinerStats(*stats_args)

        self._data_block_builder_options = data_block_builder_options
        self._data_block_builder = None
        self._follower_restart_index = 0

        # Progress flags, all initially cleared.
        self._state_stale = False
        self._sync_example_id_finished = False
        self._raw_data_finished = False
        self._join_finished = False

        self._metrics_tags = {
            'data_source_name': self._data_source.data_source_meta.name,
            'partition': partition_id,
            'joiner_name': self.name()
        }
        self._optional_stats = OptionalStats(raw_data_options,
                                             self._metrics_tags)
        self._latest_dump_timestamp = time.time()

        # Must run last: it drives the visitors and manager built above.
        self._sync_state()

    @contextmanager
    def make_example_joiner(self):
        """Yield ``_inner_joiner(previous_stale_flag)`` as a context manager.

        The stale flag is read, then set, before the joiner is created.
        It is cleared only when the ``with`` body finishes without
        raising: there is no ``try``/``finally`` here, so after an
        exception the flag stays set and the next call passes ``True``
        to ``_inner_joiner``.
        NOTE(review): this looks intentional (it lets ``_prepare_join``
        re-sync after a failed run) -- confirm before adding ``finally``.
        """
        was_stale = self._is_state_stale()
        self._acuqire_state_stale()
        joiner = self._inner_joiner(was_stale)
        yield joiner
        self._release_state_stale()

    @classmethod
    def name(cls):
        """Return the joiner name used in metrics tags and error text."""
        return 'BASE_EXAMPLE_JOINER'

    def get_data_block_meta_by_index(self, index):
        """Return ``(join_finished, meta)`` for ``index`` under the lock."""
        with self._lock:
            finished = self._join_finished
            meta = self._data_block_manager.get_data_block_meta_by_index(
                index)
            return finished, meta

    def get_dumped_data_block_count(self):
        """Return the data block manager's dumped data block count."""
        manager = self._data_block_manager
        return manager.get_dumped_data_block_count()

    def is_join_finished(self):
        """Return the join-finished flag."""
        with self._lock:
            finished = self._join_finished
        return finished

    def set_sync_example_id_finished(self):
        """Set the sync-example-id-finished flag."""
        with self._lock:
            self._sync_example_id_finished = True

    def set_raw_data_finished(self):
        """Set the raw-data-finished flag."""
        with self._lock:
            self._raw_data_finished = True

    def is_sync_example_id_finished(self):
        """Return the sync-example-id-finished flag."""
        with self._lock:
            finished = self._sync_example_id_finished
        return finished

    def is_raw_data_finished(self):
        """Return the raw-data-finished flag."""
        with self._lock:
            finished = self._raw_data_finished
        return finished

    def need_join(self):
        """Tell whether a join round should be run now.

        Checked in order, under the lock: a finished join never needs
        another round; a stale state or a finished example-id sync always
        does; so does a stale visitor on either side, or both visitors
        being unfinished. Otherwise the answer is whether the dump
        interval has elapsed.
        """
        with self._lock:
            if self._join_finished:
                return False
            if self._state_stale or self._sync_example_id_finished:
                return True
            # Follower is consulted before leader in both checks below.
            follower = self._follower_visitor
            leader = self._leader_visitor
            if follower.is_visitor_stale() or leader.is_visitor_stale():
                return True
            if not (follower.finished() or leader.finished()):
                return True
            return self._need_finish_data_block_since_interval()

    def _prepare_join(self, state_stale):
        """Get ready for a join round and snapshot the finished flags.

        When ``state_stale`` is true the visitors are re-synced and the
        current data block builder is dropped first. Returns
        ``(sync_example_id_finished, raw_data_finished)`` as read before
        the visitors are activated.
        """
        if state_stale:
            self._sync_state()
            self._reset_data_block_builder()
        finished_flags = (self.is_sync_example_id_finished(),
                          self.is_raw_data_finished())
        self._active_visitors()
        return finished_flags

    def _inner_joiner(self, reset_state):
        """Subclass hook; the base class always raises NotImplementedError."""
        message = "_inner_joiner not implement for base class: %s" % \
            ExampleJoiner.name()
        raise NotImplementedError(message)

    def _is_state_stale(self):
        """Return the stale-state flag."""
        with self._lock:
            stale = self._state_stale
        return stale

    def _active_visitors(self):
        """Call ``active_visitor()`` on the leader, then the follower."""
        self._leader_visitor.active_visitor()
        self._follower_visitor.active_visitor()

    def _sync_state(self):
        """Position both visitors according to the latest data block meta.

        Without a meta both visitors are reset. With one, the leader
        seeks to ``leader_end_index`` and the follower to
        ``follower_restart_index``; a ``StopIteration`` from either seek
        is logged as a warning and otherwise ignored.
        """
        latest_meta = self._data_block_manager.get_lastest_data_block_meta()
        if latest_meta is None:
            self._leader_visitor.reset()
            self._follower_visitor.reset()
            return
        try:
            self._leader_visitor.seek(latest_meta.leader_end_index)
        except StopIteration:
            logging.warning("leader visitor finished")
        try:
            self._follower_visitor.seek(latest_meta.follower_restart_index)
        except StopIteration:
            logging.warning("follower visitor finished")

    def _get_data_block_builder(self, create_if_no_existed):
        """Return the current data block builder, optionally creating it.

        A builder is created only when there is none and
        ``create_if_no_existed`` is true; otherwise the current value
        (possibly ``None``) is returned unchanged.
        """
        if self._data_block_builder is not None or not create_if_no_existed:
            return self._data_block_builder
        # The new block's index is the number of blocks dumped so far.
        next_index = self._data_block_manager.get_dumped_data_block_count()
        self._data_block_builder = DataBlockBuilder(
            common.data_source_data_block_dir(self._data_source),
            self._data_source.data_source_meta.name,
            self._partition_id,
            next_index,
            self._data_block_builder_options,
            self._example_joiner_options.data_block_dump_threshold
        )
        self._data_block_builder.set_data_block_manager(
            self._data_block_manager)
        self._data_block_builder.set_follower_restart_index(
            self._follower_restart_index)
        return self._data_block_builder

    def _finish_data_block(self):
        """Finish the current data block and return its meta.

        Returns ``None`` when there is no builder. Otherwise the join
        stats are attached, the block is finished, optional stats are
        emitted, the builder is dropped and the dump timestamp updated.
        """
        if self._data_block_builder is None:
            return None
        self._data_block_builder.set_join_stats_info(
            self._create_join_stats_info())
        finished_meta = self._data_block_builder.finish_data_block(
            True, self._metrics_tags)
        self._optional_stats.emit_optional_stats()
        self._reset_data_block_builder()
        self._update_latest_dump_timestamp()
        return finished_meta

    def _create_join_stats_info(self):
        """Build a ``dj_pb.JoinerStatsInfo`` for the block being finished.

        The actual cumulative join count is the current builder's example
        count (0 without a builder) plus the count recorded in the latest
        data block meta, when one exists.
        """
        current_builder = self._get_data_block_builder(False)
        stats_joined = self._joiner_stats.calc_stats_joined_num()
        if current_builder is None:
            actual_joined = 0
        else:
            actual_joined = current_builder.example_count()
        latest_meta = self._data_block_manager.get_lastest_data_block_meta()
        if latest_meta is not None:
            actual_joined += latest_meta.joiner_stats_info.actual_cum_join_num
        return dj_pb.JoinerStatsInfo(
            stats_cum_join_num=stats_joined,
            actual_cum_join_num=actual_joined,
            leader_stats_index=self._joiner_stats.get_leader_stats_index(),
            follower_stats_index=self._joiner_stats.get_follower_stats_index())

    def _reset_data_block_builder(self):
        """Detach the current data block builder from the joiner."""
        with self._lock:
            detached, self._data_block_builder = \
                self._data_block_builder, None
        # The old builder is kept in a local so that its last reference
        # from here is dropped after the lock has been released.
        del detached

    def _update_latest_dump_timestamp(self):
        """Emit the time since the last dump and restart the clock."""
        elapsed = time.time() - self._latest_dump_timestamp
        metrics.emit_timer(name='data_block_dump_duration',
                           value=int(elapsed),
                           tags=self._metrics_tags)
        self._latest_dump_timestamp = time.time()

    def _acuqire_state_stale(self):
        """Set the stale-state flag."""
        with self._lock:
            self._state_stale = True

    def _release_state_stale(self):
        """Clear the stale-state flag."""
        with self._lock:
            self._state_stale = False

    def _set_join_finished(self):
        """Set the join-finished flag."""
        with self._lock:
            self._join_finished = True

    def _need_finish_data_block_since_interval(self):
        """Tell whether a positive dump interval has fully elapsed.

        A non-positive ``data_block_dump_interval`` never triggers.
        """
        interval = self._example_joiner_options.data_block_dump_interval
        elapsed = time.time() - self._latest_dump_timestamp
        return 0 < interval and interval <= elapsed