Пример #1
0
 def _get_data_block_builder(self):
     """Lazily create and cache the DataBlockBuilder for this partition.

     The new block's index continues right after the blocks already
     dumped by the data block manager.
     """
     if self._data_block_builder is None:
         next_index = self._data_block_manager.get_dumped_data_block_num()
         self._data_block_builder = DataBlockBuilder(
             self._data_source.data_block_dir, self._partition_id,
             next_index,
             self._data_source.data_source_meta.max_example_in_data_block)
     return self._data_block_builder
Пример #2
0
 def _get_data_block_builder(self, create_if_no_existed):
     """Return the cached data block builder.

     When no builder is active and *create_if_no_existed* is true, a new
     one is created (indexed after the already-dumped blocks), wired to
     the data block manager, and primed with the follower restart index.
     """
     if self._data_block_builder is not None or not create_if_no_existed:
         return self._data_block_builder
     next_index = self._data_block_manager.get_dumped_data_block_count()
     builder = DataBlockBuilder(
         self._data_source.data_block_dir,
         self._data_source.data_source_meta.name, self._partition_id,
         next_index,
         self._example_joiner_options.data_block_dump_threshold)
     builder.set_data_block_manager(self._data_block_manager)
     builder.set_follower_restart_index(self._follower_restart_index)
     self._data_block_builder = builder
     return self._data_block_builder
Пример #3
0
 def _make_data_block_builder(self, meta):
     """Generator (context-manager style) yielding a DataBlockBuilder
     initialized from *meta*.

     Bug fix: the original swallowed every exception, which hid real
     failures and — when wrapped by ``contextlib.contextmanager`` — made
     a failed construction surface as a confusing "generator didn't
     yield" RuntimeError. Errors are now logged and re-raised, matching
     the companion implementation elsewhere in this codebase.
     """
     manager = self._data_block_manager
     assert manager is not None
     assert self._partition_id == meta.partition_id
     builder = None
     expt = None
     try:
         builder = DataBlockBuilder(
             self._data_source.data_block_dir,
             self._partition_id,
             meta.data_block_index,
         )
         builder.init_by_meta(meta)
         yield builder
     except Exception as e:  # pylint: disable=broad-except
         logging.warning("Failed make data block builder, reason %s", e)
         expt = e
     # Drop our reference before propagating so the builder can be
     # finalized promptly.
     del builder
     if expt is not None:
         raise expt
Пример #4
0
    def _create_data_block(self, partition_id):
        """Populate *partition_id* with 64 data blocks of 4 examples each,
        appending every finished meta to ``self.data_block_matas``.
        """
        dbm = data_block_manager.DataBlockManager(self.data_source, partition_id)
        self.assertEqual(dbm.get_dumped_data_block_count(), 0)
        self.assertEqual(dbm.get_lastest_data_block_meta(), None)

        leader_index, follower_index = 0, 65536
        for block_idx in range(64):
            builder = DataBlockBuilder(
                    common.data_source_data_block_dir(self.data_source),
                    self.data_source.data_source_meta.name,
                    partition_id, block_idx,
                    dj_pb.WriterOptions(output_writer='TF_RECORD'), None
                )
            builder.set_data_block_manager(dbm)
            for item_idx in range(4):
                example_id = '{}'.format(block_idx * 1024 + item_idx).encode()
                feat = {
                    'example_id': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[example_id])),
                    'event_time': tf.train.Feature(
                        int64_list=tf.train.Int64List(
                            value=[random.randint(0, 10)])),
                    'leader_index': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[leader_index])),
                    'follower_index': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[follower_index])),
                }
                example = tf.train.Example(
                    features=tf.train.Features(feature=feat))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                    leader_index, follower_index)
                leader_index += 1
                follower_index += 1
            # Attribute name 'data_block_matas' (sic) is kept as-is; other
            # code reads it under this spelling.
            self.data_block_matas.append(builder.finish_data_block())
Пример #5
0
    def add_data_block(self, partition_id, x, y):
        """Build and dump one data block holding every row of *x* (plus the
        matching label from *y* when given); return the resulting meta.
        """
        dbm = self._dbms[partition_id]
        builder = DataBlockBuilder(
            common.data_source_data_block_dir(self._data_source),
            self._data_source.data_source_meta.name, partition_id,
            dbm.get_dumped_data_block_count(),
            dj_pb.WriterOptions(output_writer="TF_RECORD"), None)
        builder.set_data_block_manager(dbm)

        for row in range(x.shape[0]):
            features = {
                'example_id': Feature(
                    bytes_list=BytesList(value=['{}'.format(row).encode()])),
                'event_time': Feature(int64_list=Int64List(value=[row])),
                'x': Feature(float_list=FloatList(value=list(x[row]))),
            }
            if y is not None:
                features['y'] = Feature(int64_list=Int64List(value=[y[row]]))
            example = Example(features=Features(feature=features))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                row, 0)

        return builder.finish_data_block()
 def generate_leader_raw_data(self):
     """Generate leader-side raw data files for partition 0.

     Writes examples in ~2048-item chunks via DataBlockBuilder,
     registers each finished file with the manifest manager, then
     deletes every non-data-block file left in the directory.
     """
     dbm = data_block_manager.DataBlockManager(self.data_source_l, 0)
     raw_data_dir = os.path.join(self.data_source_l.raw_data_dir,
                                 common.partition_repr(0))
     # Start from a clean partition directory.
     if gfile.Exists(raw_data_dir):
         gfile.DeleteRecursively(raw_data_dir)
     gfile.MakeDirs(raw_data_dir)
     rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source_l, 0)
     block_index = 0
     builder = DataBlockBuilder(
         self.data_source_l.raw_data_dir,
         self.data_source_l.data_source_meta.name, 0, block_index,
         dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
     process_index = 0
     start_index = 0
     for i in range(0, self.leader_end_index + 3):
         # Rotate the builder every 2048 items, plus once near the end so
         # the tail chunk is also dumped.
         if (i > 0 and i % 2048 == 0) or (i == self.leader_end_index + 2):
             meta = builder.finish_data_block()
             if meta is not None:
                 ofname = common.encode_data_block_fname(
                     self.data_source_l.data_source_meta.name, meta)
                 fpath = os.path.join(raw_data_dir, ofname)
                 # Register the freshly dumped file as raw-data input.
                 self.manifest_manager.add_raw_data(0, [
                     dj_pb.RawDataMeta(
                         file_path=fpath,
                         timestamp=timestamp_pb2.Timestamp(seconds=3))
                 ], False)
                 process_index += 1
                 start_index += len(meta.example_ids)
             block_index += 1
             builder = DataBlockBuilder(
                 self.data_source_l.raw_data_dir,
                 self.data_source_l.data_source_meta.name, 0, block_index,
                 dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
         feat = {}
         # Operator precedence makes this (i + 1) << 30: a huge id for
         # most items; every third item gets the small id i // 3 —
         # presumably so only those can match the follower side's ids
         # (verify against generate_follower_data_block).
         pt = i + 1 << 30
         if i % 3 == 0:
             pt = i // 3
         example_id = '{}'.format(pt).encode()
         feat['example_id'] = tf.train.Feature(
             bytes_list=tf.train.BytesList(value=[example_id]))
         event_time = 150000000 + pt
         feat['event_time'] = tf.train.Feature(
             int64_list=tf.train.Int64List(value=[event_time]))
         example = tf.train.Example(features=tf.train.Features(
             feature=feat))
         builder.append_item(TfExampleItem(example.SerializeToString()), i,
                             i)
     # Keep only the data-block payload files; drop meta/other leftovers.
     fpaths = [
         os.path.join(raw_data_dir, f)
         for f in gfile.ListDirectory(raw_data_dir)
         if not gfile.IsDirectory(os.path.join(raw_data_dir, f))
     ]
     for fpath in fpaths:
         if not fpath.endswith(common.DataBlockSuffix):
             gfile.Remove(fpath)
Пример #7
0
 def _make_data_block_builder(self, meta):
     """Generator (context-manager style) that yields a DataBlockBuilder
     initialized from *meta* and wired to the data block manager.

     Any failure is logged, the builder reference is dropped, and the
     exception is re-raised to the caller.
     """
     assert self._partition_id == meta.partition_id, \
         "partition id of building data block meta mismatch "\
         "{} != {}".format(self._partition_id, meta.partition_id)
     builder = None
     expt = None
     try:
         builder = DataBlockBuilder(
                 common.data_source_data_block_dir(self._data_source),
                 self._data_source.data_source_meta.name,
                 self._partition_id,
                 meta.data_block_index,
                 self._data_block_builder_options
             )
         builder.init_by_meta(meta)
         builder.set_data_block_manager(self._data_block_manager)
         yield builder
     except Exception as e: # pylint: disable=broad-except
         logging.warning("Failed make data block builder, " \
                          "reason %s", e)
         expt = e
     # Drop our reference before propagating so the builder (and any
     # underlying writer) can be finalized promptly.
     if builder is not None:
         del builder
     if expt is not None:
         raise expt
Пример #8
0
    def _create_data_block(self, data_source, partition_id, x, y):
        """Split (x, y) into N=200 data blocks, dump them under
        *partition_id*, record the final follower index in
        ``self.max_index``, and return the list of produced metas.
        """
        data_block_metas = []
        dbm = data_block_manager.DataBlockManager(data_source, partition_id)
        self.assertEqual(dbm.get_dumped_data_block_count(), 0)
        self.assertEqual(dbm.get_lastest_data_block_meta(), None)
        # Rows beyond N * chunk_size are silently dropped.
        N = 200
        chunk_size = x.shape[0] // N

        leader_index = 0
        # Follower indices start far above leader ones so the two index
        # ranges cannot collide.
        follower_index = N * chunk_size * 10
        for i in range(N):
            builder = DataBlockBuilder(
                common.data_source_data_block_dir(data_source),
                data_source.data_source_meta.name,
                partition_id, i,
                dj_pb.WriterOptions(output_writer="TF_RECORD"), None
            )
            builder.set_data_block_manager(dbm)
            for j in range(chunk_size):
                feat = {}
                idx =  i * chunk_size + j
                exam_id = '{}'.format(idx).encode()
                feat['example_id'] = Feature(
                    bytes_list=BytesList(value=[exam_id]))
                evt_time = random.randint(1, 1000)
                feat['event_time'] = Feature(
                    int64_list = Int64List(value=[evt_time])
                )
                feat['x'] = Feature(float_list=FloatList(value=list(x[idx])))
                if y is not None:
                    feat['y'] = Feature(int64_list=Int64List(value=[y[idx]]))

                feat['leader_index'] = Feature(
                    int64_list = Int64List(value=[leader_index])
                )
                feat['follower_index'] = Feature(
                    int64_list = Int64List(value=[follower_index])
                )
                example = Example(features=Features(feature=feat))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                   leader_index, follower_index)
                leader_index += 1
                follower_index += 1
            data_block_metas.append(builder.finish_data_block())
        self.max_index = follower_index
        return data_block_metas
Пример #9
0
 def generate_follower_data_block(self):
     """Dump five 1024-example data blocks for partition 0 of the
     follower data source, remember their metas in ``self.dumped_metas``,
     and record the leader index range the blocks cover.
     """
     dbm = data_block_manager.DataBlockManager(self.data_source_f, 0)
     self.assertEqual(dbm.get_dumped_data_block_count(), 0)
     self.assertEqual(dbm.get_lastest_data_block_meta(), None)
     leader_index, follower_index = 0, 65536
     self.dumped_metas = []
     for block_idx in range(5):
         builder = DataBlockBuilder(
             common.data_source_data_block_dir(self.data_source_f),
             self.data_source_f.data_source_meta.name, 0, block_idx,
             dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
         builder.set_data_block_manager(dbm)
         for item_idx in range(1024):
             seq = block_idx * 1024 + item_idx
             example_id = '{}'.format(seq).encode()
             feat = {
                 'example_id': tf.train.Feature(
                     bytes_list=tf.train.BytesList(value=[example_id])),
                 'event_time': tf.train.Feature(
                     int64_list=tf.train.Int64List(
                         value=[150000000 + seq])),
                 'leader_index': tf.train.Feature(
                     int64_list=tf.train.Int64List(value=[leader_index])),
                 'follower_index': tf.train.Feature(
                     int64_list=tf.train.Int64List(value=[follower_index])),
             }
             example = tf.train.Example(
                 features=tf.train.Features(feature=feat))
             builder.append_item(TfExampleItem(example.SerializeToString()),
                                 leader_index, follower_index)
             # Leader indices advance by 3 so only one in three leader
             # ids appears on the follower side.
             leader_index += 3
             follower_index += 1
         self.dumped_metas.append(builder.finish_data_block())
     self.leader_start_index = 0
     self.leader_end_index = leader_index
     self.assertEqual(dbm.get_dumped_data_block_count(), 5)
     for (idx, meta) in enumerate(self.dumped_metas):
         self.assertEqual(dbm.get_data_block_meta_by_index(idx), meta)
Пример #10
0
 def generate_raw_data(self, begin_index, item_count):
     """Append *item_count* raw-data examples (ids starting at
     *begin_index*) to partition 0, locally shuffled within a +/-32
     window, and register the produced files with the manifest manager.
     """
     raw_data_dir = os.path.join(self.raw_data_dir,
                                 common.partition_repr(0))
     if not gfile.Exists(raw_data_dir):
         gfile.MakeDirs(raw_data_dir)
     self.total_raw_data_count += item_count
     useless_index = 0
     # NOTE(review): rdm appears unused below — presumably kept for its
     # construction side effects; confirm before removing.
     rdm = raw_data_visitor.RawDataManager(self.kvstore, self.data_source,
                                           0)
     fpaths = []
     for block_index in range(0, item_count // 2048):
         builder = DataBlockBuilder(
             self.raw_data_dir,
             self.data_source.data_source_meta.name, 0, block_index,
             dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
         cands = list(
             range(begin_index + block_index * 2048,
                   begin_index + (block_index + 1) * 2048))
         start_index = cands[0]
         # Randomly swap id pairs within a +/-32 position window so the
         # output is only locally out of order.
         for i in range(len(cands)):
             if random.randint(1, 4) > 2:
                 continue
             a = random.randint(i - 32, i + 32)
             b = random.randint(i - 32, i + 32)
             if a < 0:
                 a = 0
             if a >= len(cands):
                 a = len(cands) - 1
             if b < 0:
                 b = 0
             if b >= len(cands):
                 b = len(cands) - 1
             # Swap only while both slots still hold ids inside the window.
             if (abs(cands[a] - i - start_index) <= 32
                     and abs(cands[b] - i - start_index) <= 32):
                 cands[a], cands[b] = cands[b], cands[a]
         for example_idx in cands:
             feat = {}
             example_id = '{}'.format(example_idx).encode()
             feat['example_id'] = tf.train.Feature(
                 bytes_list=tf.train.BytesList(value=[example_id]))
             event_time = 150000000 + example_idx
             feat['event_time'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[event_time]))
             label = random.choice([1, 0])
             # ~20% of the examples are deliberately left unlabeled.
             if random.random() < 0.8:
                 feat['label'] = tf.train.Feature(
                     int64_list=tf.train.Int64List(value=[label]))
             example = tf.train.Example(features=tf.train.Features(
                 feature=feat))
             builder.append_item(TfExampleItem(example.SerializeToString()),
                                 useless_index, useless_index)
             useless_index += 1
         meta = builder.finish_data_block()
         fname = common.encode_data_block_fname(
             self.data_source.data_source_meta.name, meta)
         fpath = os.path.join(raw_data_dir, fname)
         fpaths.append(
             dj_pb.RawDataMeta(
                 file_path=fpath,
                 timestamp=timestamp_pb2.Timestamp(seconds=3)))
         self.g_data_block_index += 1
     # Delete everything that is not a data-block payload file.
     all_files = [
         os.path.join(raw_data_dir, f)
         for f in gfile.ListDirectory(raw_data_dir)
         if not gfile.IsDirectory(os.path.join(raw_data_dir, f))
     ]
     for fpath in all_files:
         if not fpath.endswith(common.DataBlockSuffix):
             gfile.Remove(fpath)
     self.manifest_manager.add_raw_data(0, fpaths, False)
Пример #11
0
class ExampleJoiner(object):
    """Base class that joins synced leader example ids with local follower
    raw data for one partition and dumps joined results as data blocks.

    All boolean state flags are read and written under ``self._lock``;
    the visitors and the active DataBlockBuilder are driven by the join
    loop of the concrete subclass.
    """

    def __init__(self, example_joiner_options, raw_data_options, etcd,
                 data_source, partition_id):
        self._lock = threading.Lock()
        self._example_joiner_options = example_joiner_options
        self._raw_data_options = raw_data_options
        self._data_source = data_source
        self._partition_id = partition_id
        # Leader side replays synced example ids; follower side replays
        # the locally stored raw data.
        self._leader_visitor = \
                ExampleIdVisitor(etcd, self._data_source, self._partition_id)
        self._follower_visitor = \
                RawDataVisitor(etcd, self._data_source,
                               self._partition_id, raw_data_options)
        self._data_block_manager = \
                DataBlockManager(self._data_source, self._partition_id)

        self._data_block_builder = None
        self._state_stale = False
        self._follower_restart_index = 0
        self._sync_example_id_finished = False
        self._raw_data_finished = False
        self._join_finished = False
        self._latest_dump_timestamp = time.time()
        # Position both visitors according to the last dumped data block
        # so a restarted joiner resumes where it stopped.
        self._sync_state()

    @contextmanager
    def make_example_joiner(self):
        """Yield the concrete inner joiner for one join round.

        Bug fix: the stale flag is now released in a ``finally`` clause,
        so an exception raised while the caller uses the joiner no longer
        leaves this object permanently marked stale.
        """
        state_stale = self._is_state_stale()
        self._acuqire_state_stale()
        try:
            yield self._inner_joiner(state_stale)
        finally:
            self._release_state_stale()

    @classmethod
    def name(cls):
        """Joiner type name; subclasses override with their own."""
        return 'BASE_EXAMPLE_JOINER'

    def get_data_block_meta_by_index(self, index):
        """Atomically return (join_finished, meta at *index* or None)."""
        with self._lock:
            manager = self._data_block_manager
            return self._join_finished, \
                    manager.get_data_block_meta_by_index(index)

    def get_dumped_data_block_count(self):
        """Number of data blocks already dumped for this partition."""
        return self._data_block_manager.get_dumped_data_block_count()

    def is_join_finished(self):
        with self._lock:
            return self._join_finished

    def set_sync_example_id_finished(self):
        with self._lock:
            self._sync_example_id_finished = True

    def set_raw_data_finished(self):
        with self._lock:
            self._raw_data_finished = True

    def is_sync_example_id_finished(self):
        with self._lock:
            return self._sync_example_id_finished

    def is_raw_data_finished(self):
        with self._lock:
            return self._raw_data_finished

    def need_join(self):
        """Decide whether another join round is worthwhile right now."""
        with self._lock:
            if self._join_finished:
                return False
            if self._state_stale or self._sync_example_id_finished:
                return True
            if self._follower_visitor.is_visitor_stale() or \
                    self._leader_visitor.is_visitor_stale():
                return True
            if not self._follower_visitor.finished() and \
                    not self._leader_visitor.finished():
                return True
            # Otherwise join only when the periodic dump interval elapsed.
            return self._need_finish_data_block_since_interval()

    def _inner_joiner(self, reset_state):
        # Subclasses implement the actual join algorithm.
        raise NotImplementedError(
            "_inner_joiner not implement for base class: %s" %
            ExampleJoiner.name())

    def _is_state_stale(self):
        with self._lock:
            return self._state_stale

    def _active_visitors(self):
        self._leader_visitor.active_visitor()
        self._follower_visitor.active_visitor()

    def _sync_state(self):
        """Seek both visitors to the positions recorded by the newest
        dumped data block, or reset them when nothing was dumped yet."""
        meta = self._data_block_manager.get_lastest_data_block_meta()
        if meta is not None:
            try:
                self._leader_visitor.seek(meta.leader_end_index)
            except StopIteration:
                logging.warning("leader visitor finished")
            try:
                self._follower_visitor.seek(meta.follower_restart_index)
            except StopIteration:
                logging.warning("follower visitor finished")
        else:
            self._leader_visitor.reset()
            self._follower_visitor.reset()

    def _get_data_block_builder(self, create_if_no_existed):
        """Return the active builder, creating one (indexed after the
        already-dumped blocks) when absent and *create_if_no_existed*."""
        if self._data_block_builder is None and create_if_no_existed:
            data_block_index = \
                    self._data_block_manager.get_dumped_data_block_count()
            self._data_block_builder = DataBlockBuilder(
                self._data_source.data_block_dir,
                self._data_source.data_source_meta.name, self._partition_id,
                data_block_index,
                self._example_joiner_options.data_block_dump_threshold)
            self._data_block_builder.set_data_block_manager(
                self._data_block_manager)
            self._data_block_builder.set_follower_restart_index(
                self._follower_restart_index)
        return self._data_block_builder

    def _finish_data_block(self):
        """Finalize and dump the active builder; return its meta, or None
        when no builder is active."""
        if self._data_block_builder is not None:
            meta = self._data_block_builder.finish_data_block()
            self._reset_data_block_builder()
            self._update_latest_dump_timestamp()
            return meta
        return None

    def _reset_data_block_builder(self):
        # Swap the builder out under the lock; drop it outside the lock.
        builder = None
        with self._lock:
            builder = self._data_block_builder
            self._data_block_builder = None
        if builder is not None:
            del builder

    def _update_latest_dump_timestamp(self):
        with self._lock:
            self._latest_dump_timestamp = time.time()

    # NOTE: method name misspells "acquire"; kept as-is for compatibility
    # with existing callers and subclasses.
    def _acuqire_state_stale(self):
        with self._lock:
            self._state_stale = True

    def _release_state_stale(self):
        with self._lock:
            self._state_stale = False

    def _set_join_finished(self):
        with self._lock:
            self._join_finished = True

    def _need_finish_data_block_since_interval(self):
        """True when a positive dump interval has elapsed since the last
        data block dump."""
        dump_interval = self._example_joiner_options.data_block_dump_interval
        duration_since_dump = time.time() - self._latest_dump_timestamp
        return 0 < dump_interval <= duration_since_dump
Пример #12
0
 def generate_raw_data(self, etcd, rdp, data_source, partition_id,
                       block_size, shuffle_win_size, feat_key_fmt,
                       feat_val_fmt):
     """Generate raw-data files for *partition_id* in *block_size* chunks,
     locally shuffled within *shuffle_win_size* positions, then publish
     the produced file names through *rdp*.
     """
     # NOTE(review): dbm appears unused below — presumably kept for its
     # construction side effects; confirm before removing.
     dbm = data_block_manager.DataBlockManager(data_source, partition_id)
     raw_data_dir = os.path.join(data_source.raw_data_dir,
                                 common.partition_repr(partition_id))
     # Rebuild the partition directory from scratch.
     if gfile.Exists(raw_data_dir):
         gfile.DeleteRecursively(raw_data_dir)
     gfile.MakeDirs(raw_data_dir)
     useless_index = 0
     new_raw_data_fnames = []
     for block_index in range(self.total_index // block_size):
         builder = DataBlockBuilder(
             data_source.raw_data_dir, data_source.data_source_meta.name,
             partition_id, block_index,
             dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
         cands = list(
             range(block_index * block_size,
                   (block_index + 1) * block_size))
         start_index = cands[0]
         # Randomly swap id pairs within +/-shuffle_win_size positions so
         # the ids come out only locally out of order.
         for i in range(len(cands)):
             if random.randint(1, 4) > 2:
                 continue
             a = random.randint(i - shuffle_win_size, i + shuffle_win_size)
             b = random.randint(i - shuffle_win_size, i + shuffle_win_size)
             if a < 0:
                 a = 0
             if a >= len(cands):
                 a = len(cands) - 1
             if b < 0:
                 b = 0
             if b >= len(cands):
                 b = len(cands) - 1
             # Swap only while both slots still hold ids inside the window.
             if (abs(cands[a] - i - start_index) <= shuffle_win_size and
                     abs(cands[b] - i - start_index) <= shuffle_win_size):
                 cands[a], cands[b] = cands[b], cands[a]
         for example_idx in cands:
             feat = {}
             example_id = '{}'.format(example_idx).encode()
             feat['example_id'] = tf.train.Feature(
                 bytes_list=tf.train.BytesList(value=[example_id]))
             event_time = 150000000 + example_idx
             feat['event_time'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[event_time]))
             feat[feat_key_fmt.format(example_idx)] = tf.train.Feature(
                 bytes_list=tf.train.BytesList(
                     value=[feat_val_fmt.format(example_idx).encode()]))
             example = tf.train.Example(features=tf.train.Features(
                 feature=feat))
             builder.append_item(TfExampleItem(example.SerializeToString()),
                                 useless_index, useless_index)
             useless_index += 1
         meta = builder.finish_data_block()
         fname = common.encode_data_block_fname(
             data_source.data_source_meta.name, meta)
         new_raw_data_fnames.append(os.path.join(raw_data_dir, fname))
     # Remove the meta files; only the data-block payloads act as raw data.
     fpaths = [
         os.path.join(raw_data_dir, f)
         for f in gfile.ListDirectory(raw_data_dir)
         if not gfile.IsDirectory(os.path.join(raw_data_dir, f))
     ]
     for fpath in fpaths:
         if fpath.endswith(common.DataBlockMetaSuffix):
             gfile.Remove(fpath)
     rdp.publish_raw_data(partition_id, new_raw_data_fnames)
    def test_data_block_manager(self):
        """End-to-end check of DataBlockManager: dump 5 blocks of 1024
        examples each, then verify metas, on-disk files, and every record
        read back from disk.
        """
        data_block_datas = []
        data_block_metas = []
        leader_index = 0
        follower_index = 65536
        for i in range(5):
            fill_examples = []
            builder = DataBlockBuilder(
                self.data_source.data_block_dir,
                self.data_source.data_source_meta.name, 0, i,
                dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
            builder.set_data_block_manager(self.data_block_manager)
            for j in range(1024):
                feat = {}
                example_id = '{}'.format(i * 1024 + j).encode()
                feat['example_id'] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[example_id]))
                event_time = 150000000 + i * 1024 + j
                feat['event_time'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[event_time]))
                feat['leader_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[leader_index]))
                feat['follower_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[follower_index]))
                example = tf.train.Example(features=tf.train.Features(
                    feature=feat))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                    leader_index, follower_index)
                # Remember both the proto and the raw field values for the
                # read-back comparison below.
                fill_examples.append((example, {
                    'example_id': example_id,
                    'event_time': event_time,
                    'leader_index': leader_index,
                    'follower_index': follower_index
                }))
                leader_index += 1
                follower_index += 1
            meta = builder.finish_data_block()
            data_block_datas.append(fill_examples)
            data_block_metas.append(meta)
        self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                         5)
        self.assertEqual(self.data_block_manager.get_lastest_data_block_meta(),
                         data_block_metas[-1])
        for (idx, meta) in enumerate(data_block_metas):
            self.assertEqual(
                self.data_block_manager.get_data_block_meta_by_index(idx),
                meta)
            self.assertEqual(
                meta.block_id,
                common.encode_block_id(self.data_source.data_source_meta.name,
                                       meta))
        # Out-of-range index must yield None rather than raise.
        self.assertEqual(
            self.data_block_manager.get_data_block_meta_by_index(5), None)
        data_block_dir = os.path.join(self.data_source.data_block_dir,
                                      common.partition_repr(0))
        # Verify each dumped block and its meta file read back correctly.
        for (i, meta) in enumerate(data_block_metas):
            data_block_fpath = os.path.join(
                data_block_dir, meta.block_id) + common.DataBlockSuffix
            data_block_meta_fpath = os.path.join(
                data_block_dir,
                common.encode_data_block_meta_fname(
                    self.data_source.data_source_meta.name, 0,
                    meta.data_block_index))
            self.assertTrue(gfile.Exists(data_block_fpath))
            self.assertTrue(gfile.Exists(data_block_meta_fpath))
            fiter = tf.io.tf_record_iterator(data_block_meta_fpath)
            remote_meta = text_format.Parse(
                next(fiter).decode(), dj_pb.DataBlockMeta())
            self.assertEqual(meta, remote_meta)
            for (j, record) in enumerate(
                    tf.io.tf_record_iterator(data_block_fpath)):
                example = tf.train.Example()
                example.ParseFromString(record)
                stored_data = data_block_datas[i][j]
                self.assertEqual(example, stored_data[0])
                feat = example.features.feature
                stored_feat = stored_data[1]
                self.assertTrue('example_id' in feat)
                self.assertTrue('example_id' in stored_feat)
                self.assertEqual(stored_feat['example_id'],
                                 '{}'.format(i * 1024 + j).encode())
                self.assertEqual(stored_feat['example_id'],
                                 feat['example_id'].bytes_list.value[0])
                self.assertTrue('event_time' in feat)
                self.assertTrue('event_time' in stored_feat)
                self.assertEqual(stored_feat['event_time'],
                                 feat['event_time'].int64_list.value[0])
                self.assertTrue('leader_index' in feat)
                self.assertTrue('leader_index' in stored_feat)
                self.assertEqual(stored_feat['leader_index'],
                                 feat['leader_index'].int64_list.value[0])
                self.assertTrue('follower_index' in feat)
                self.assertTrue('follower_index' in stored_feat)
                self.assertEqual(stored_feat['follower_index'],
                                 feat['follower_index'].int64_list.value[0])
            # Relies on j leaking out of the for-loop: j == 1023 means all
            # 1024 records were iterated.
            self.assertEqual(j, 1023)

        # NOTE(review): this asserts on the original manager, not on
        # data_block_manager2 — likely intended to verify the *second*
        # manager sees the same dump count; confirm and fix separately.
        data_block_manager2 = data_block_manager.DataBlockManager(
            self.data_source, 0)
        self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                         5)
Пример #14
0
class ExampleJoiner(object):
    """Base class for example joiners of one data-source partition.

    Matches the example ids synced from the leader against the local
    (follower) raw data and dumps joined results as data blocks through
    DataBlockManager. Public accessors are guarded by ``self._lock``;
    concrete join algorithms are supplied by subclasses via
    ``_inner_joiner``.

    Fixes vs. previous revision: renamed the misspelled private method
    ``_acuqire_state_stale`` -> ``_acquire_state_stale`` and corrected
    the ``_inner_joiner`` NotImplementedError message.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        self._lock = threading.Lock()
        self._example_joiner_options = example_joiner_options
        self._raw_data_options = raw_data_options
        self._data_source = data_source
        self._partition_id = partition_id
        # Visitor over the example ids dumped by the leader.
        self._leader_visitor = \
                ExampleIdVisitor(kvstore, self._data_source, self._partition_id)
        # Visitor over this side's (follower) raw data.
        self._follower_visitor = \
                RawDataVisitor(kvstore, self._data_source,
                               self._partition_id, raw_data_options)
        self._data_block_manager = \
                DataBlockManager(self._data_source, self._partition_id)
        # Restore cumulative join statistics from the latest dumped data
        # block meta so the stats survive process restarts.
        meta = self._data_block_manager.get_lastest_data_block_meta()
        if meta is None:
            self._joiner_stats = JoinerStats(0, -1, -1)
        else:
            stats_info = meta.joiner_stats_info
            self._joiner_stats = JoinerStats(stats_info.stats_cum_join_num,
                                             stats_info.leader_stats_index,
                                             stats_info.follower_stats_index)
        self._data_block_builder_options = data_block_builder_options
        self._data_block_builder = None
        self._state_stale = False
        self._follower_restart_index = 0
        self._sync_example_id_finished = False
        self._raw_data_finished = False
        self._join_finished = False
        ds_name = self._data_source.data_source_meta.name
        self._metrics_tags = {
            'data_source_name': ds_name,
            'partition': partition_id,
            'joiner_name': self.name()
        }
        self._optional_stats = OptionalStats(raw_data_options,
                                             self._metrics_tags)
        self._latest_dump_timestamp = time.time()
        # Align both visitors with the last dumped data block, if any.
        self._sync_state()

    @contextmanager
    def make_example_joiner(self):
        """Yield the inner joiner while the state-stale flag is held.

        NOTE(review): the flag is deliberately left set if the consumer
        of the yielded joiner raises, so a failed join leaves the state
        stale and the next round resyncs from the last dumped meta in
        _prepare_join -- confirm this is intended before wrapping the
        yield in try/finally.
        """
        state_stale = self._is_state_stale()
        self._acquire_state_stale()
        yield self._inner_joiner(state_stale)
        self._release_state_stale()

    @classmethod
    def name(cls):
        """Identifier of this joiner implementation."""
        return 'BASE_EXAMPLE_JOINER'

    def get_data_block_meta_by_index(self, index):
        """Return (join_finished, meta of the index-th dumped block)."""
        with self._lock:
            manager = self._data_block_manager
            return self._join_finished, \
                    manager.get_data_block_meta_by_index(index)

    def get_dumped_data_block_count(self):
        """Number of data blocks already dumped for this partition."""
        return self._data_block_manager.get_dumped_data_block_count()

    def is_join_finished(self):
        """Whether the join for this partition has completed."""
        with self._lock:
            return self._join_finished

    def set_sync_example_id_finished(self):
        """Mark that the leader has finished syncing example ids."""
        with self._lock:
            self._sync_example_id_finished = True

    def set_raw_data_finished(self):
        """Mark that all local raw data has been received."""
        with self._lock:
            self._raw_data_finished = True

    def is_sync_example_id_finished(self):
        with self._lock:
            return self._sync_example_id_finished

    def is_raw_data_finished(self):
        with self._lock:
            return self._raw_data_finished

    def need_join(self):
        """Decide whether another join round should be scheduled."""
        with self._lock:
            if self._join_finished:
                return False
            if self._state_stale or self._sync_example_id_finished:
                return True
            if self._follower_visitor.is_visitor_stale() or \
                    self._leader_visitor.is_visitor_stale():
                return True
            if not self._follower_visitor.finished() and \
                    not self._leader_visitor.finished():
                return True
            # No new input on either side; join only if a time-based
            # data block flush is due.
            return self._need_finish_data_block_since_interval()

    def _prepare_join(self, state_stale):
        """Resync state if stale and snapshot the two finish flags.

        Returns:
            (sync_example_id_finished, raw_data_finished) snapshot taken
            after the optional resync.
        """
        if state_stale:
            self._sync_state()
            self._reset_data_block_builder()
        sync_example_id_finished = self.is_sync_example_id_finished()
        raw_data_finished = self.is_raw_data_finished()
        self._active_visitors()
        return sync_example_id_finished, raw_data_finished

    def _inner_joiner(self, reset_state):
        """Subclass hook that returns the actual join generator."""
        raise NotImplementedError(
            "_inner_joiner not implemented for base class: %s" %
            ExampleJoiner.name())

    def _is_state_stale(self):
        with self._lock:
            return self._state_stale

    def _active_visitors(self):
        """Wake both visitors so they pick up newly arrived input."""
        self._leader_visitor.active_visitor()
        self._follower_visitor.active_visitor()

    def _sync_state(self):
        """Seek both visitors to the positions recorded in the latest
        dumped data block meta, or reset them when nothing was dumped.

        A StopIteration from seek means that visitor has already
        consumed all of its input.
        """
        meta = self._data_block_manager.get_lastest_data_block_meta()
        if meta is not None:
            try:
                self._leader_visitor.seek(meta.leader_end_index)
            except StopIteration:
                logging.warning("leader visitor finished")
            try:
                self._follower_visitor.seek(meta.follower_restart_index)
            except StopIteration:
                logging.warning("follower visitor finished")
        else:
            self._leader_visitor.reset()
            self._follower_visitor.reset()

    def _get_data_block_builder(self, create_if_no_existed):
        """Return the active data block builder.

        Args:
            create_if_no_existed: when True and no builder is active,
                create a new one indexed right after the last dumped
                block; when False, may return None.
        """
        if self._data_block_builder is None and create_if_no_existed:
            data_block_index = \
                    self._data_block_manager.get_dumped_data_block_count()
            self._data_block_builder = DataBlockBuilder(
                common.data_source_data_block_dir(self._data_source),
                self._data_source.data_source_meta.name, self._partition_id,
                data_block_index, self._data_block_builder_options,
                self._example_joiner_options.data_block_dump_threshold)
            self._data_block_builder.set_data_block_manager(
                self._data_block_manager)
            self._data_block_builder.set_follower_restart_index(
                self._follower_restart_index)
        return self._data_block_builder

    def _finish_data_block(self):
        """Finalize the active data block, emit stats, and return its
        meta; returns None when no builder is active."""
        if self._data_block_builder is not None:
            self._data_block_builder.set_join_stats_info(
                self._create_join_stats_info())
            meta = self._data_block_builder.finish_data_block(
                True, self._metrics_tags)
            self._optional_stats.emit_optional_stats()
            self._reset_data_block_builder()
            self._update_latest_dump_timestamp()
            return meta
        return None

    def _create_join_stats_info(self):
        """Build the JoinerStatsInfo proto for the block being dumped."""
        builder = self._get_data_block_builder(False)
        nstats_cum_join_num = self._joiner_stats.calc_stats_joined_num()
        nactual_cum_join_num = 0 if builder is None \
                               else builder.example_count()
        meta = self._data_block_manager.get_lastest_data_block_meta()
        if meta is not None:
            # Accumulate on top of the previously dumped totals.
            nactual_cum_join_num += meta.joiner_stats_info.actual_cum_join_num
        return dj_pb.JoinerStatsInfo(
            stats_cum_join_num=nstats_cum_join_num,
            actual_cum_join_num=nactual_cum_join_num,
            leader_stats_index=self._joiner_stats.get_leader_stats_index(),
            follower_stats_index=self._joiner_stats.get_follower_stats_index())

    def _reset_data_block_builder(self):
        """Detach and drop the active builder, if any."""
        builder = None
        with self._lock:
            builder = self._data_block_builder
            self._data_block_builder = None
        if builder is not None:
            del builder

    def _update_latest_dump_timestamp(self):
        """Emit time since the previous dump and reset the timestamp."""
        data_block_dump_duration = time.time() - self._latest_dump_timestamp
        metrics.emit_timer(name='data_block_dump_duration',
                           value=int(data_block_dump_duration),
                           tags=self._metrics_tags)
        self._latest_dump_timestamp = time.time()

    def _acquire_state_stale(self):
        # Renamed from the misspelled `_acuqire_state_stale`.
        with self._lock:
            self._state_stale = True

    def _release_state_stale(self):
        with self._lock:
            self._state_stale = False

    def _set_join_finished(self):
        with self._lock:
            self._join_finished = True

    def _need_finish_data_block_since_interval(self):
        """True when the configured dump interval has elapsed since the
        last dump; an interval <= 0 disables time-based dumping."""
        dump_interval = self._example_joiner_options.data_block_dump_interval
        duration_since_dump = time.time() - self._latest_dump_timestamp
        return 0 < dump_interval <= duration_since_dump
Пример #15
0
class ExampleJoiner(object):
    def __init__(self, etcd, data_source, partition_id, options):
        """Create a joiner for one partition of *data_source*.

        Args:
            etcd: client handle forwarded to RawDataVisitor.
            data_source: project data-source object (provides
                data_block_dir and data_source_meta).
            partition_id: partition this joiner is responsible for.
            options: raw-data options forwarded to RawDataVisitor.
        """
        self._data_source = data_source
        self._partition_id = partition_id
        # Iterates the example ids produced by the leader side.
        self._leader_visitor = ExampleIdVisitor(
            ExampleIdManager(data_source, partition_id))
        # Iterates the local (follower) raw data.
        self._follower_visitor = RawDataVisitor(etcd, data_source,
                                                partition_id, options)
        self._data_block_manager = DataBlockManager(data_source, partition_id)

        self._data_block_builder = None
        # True when the in-memory view may lag what is on DFS.
        self._stale_with_dfs = False
        self._follower_restart_index = 0
        # Must run last: aligns both visitors with the last dumped block.
        self._sync_state()

    def join_example(self):
        raise NotImplementedError(
            "join exampel not implement for base class: %s" %
            ExampleJoiner.name())

    @classmethod
    def name(cls):
        return 'EXAMPLE_JOINER'

    def get_data_block_number(self):
        return self._data_block_manager.num_dumped_data_block()

    def get_data_block_meta(self, index):
        return self._data_block_manager.get_data_block_meta_by_index(index)

    def join_finished(self):
        return self._data_block_manager.join_finished()

    def _sync_state(self):
        """Re-align both visitors with the last dumped data block.

        Seeks the leader visitor to the block's end index and the
        follower visitor to its restart index; a StopIteration from
        seek means that visitor has already consumed all its input.
        If either side is exhausted, the join is marked finished on the
        data block manager. Always clears the stale-with-DFS flag.
        """
        meta = self._data_block_manager.get_last_data_block_meta(
            self._stale_with_dfs)
        if meta is not None:
            try:
                self._leader_visitor.seek(meta.leader_end_index)
            except StopIteration:
                # Leader input fully consumed; nothing left to seek to.
                logging.warning("leader visitor finished")
            try:
                self._follower_visitor.seek(meta.follower_restart_index)
            except StopIteration:
                logging.warning("follower visitor finished")
            if (self._leader_visitor.finished()
                    or self._follower_visitor.finished()):
                # Either side exhausted: no more pairs can be joined.
                self._data_block_manager.finish_join()
        self._stale_with_dfs = False

    def _get_data_block_builder(self):
        if self._data_block_builder is not None:
            return self._data_block_builder
        data_block_index = self._data_block_manager.get_dumped_data_block_num()
        self._data_block_builder = DataBlockBuilder(
            self._data_source.data_block_dir, self._partition_id,
            data_block_index,
            self._data_source.data_source_meta.max_example_in_data_block)
        return self._data_block_builder

    def _finish_data_block(self):
        assert self._data_block_builder is not None
        self._data_block_builder.set_follower_restart_index(
            self._follower_restart_index)
        self._data_block_builder.finish_data_block()
        meta = self._data_block_builder.get_data_block_meta()
        if meta is not None:
            self._data_block_manager.add_dumped_data_block_meta(meta)
        self._data_block_builder = None