def _get_data_block_builder(self):
    """Return the active DataBlockBuilder, creating one lazily.

    A new builder is constructed only when none is cached; its block
    index is the count of data blocks the manager has already dumped.
    """
    if self._data_block_builder is None:
        next_index = self._data_block_manager.get_dumped_data_block_num()
        self._data_block_builder = DataBlockBuilder(
            self._data_source.data_block_dir,
            self._partition_id,
            next_index,
            self._data_source.data_source_meta.max_example_in_data_block)
    return self._data_block_builder
def _get_data_block_builder(self, create_if_no_existed):
    """Return the cached DataBlockBuilder, optionally creating it.

    When nothing is cached and ``create_if_no_existed`` is False, None is
    returned. Otherwise a fresh builder is constructed, wired to the
    data block manager, and seeded with the follower restart index.
    """
    if self._data_block_builder is not None or not create_if_no_existed:
        return self._data_block_builder
    next_index = self._data_block_manager.get_dumped_data_block_count()
    builder = DataBlockBuilder(
        self._data_source.data_block_dir,
        self._data_source.data_source_meta.name,
        self._partition_id,
        next_index,
        self._example_joiner_options.data_block_dump_threshold)
    builder.set_data_block_manager(self._data_block_manager)
    builder.set_follower_restart_index(self._follower_restart_index)
    self._data_block_builder = builder
    return self._data_block_builder
def _make_data_block_builder(self, meta):
    """Context-manager generator yielding a DataBlockBuilder for *meta*.

    The builder is initialized from the given data block meta, whose
    partition must match this joiner's partition. Any exception raised
    while building or inside the ``with`` body is logged and re-raised
    so callers observe the failure; the original version swallowed it
    after logging, hiding errors from the caller (the sibling
    implementation elsewhere in this file re-raises).
    """
    manager = self._data_block_manager
    assert manager is not None
    assert self._partition_id == meta.partition_id
    builder = None
    expt = None
    try:
        builder = DataBlockBuilder(
            self._data_source.data_block_dir,
            self._partition_id,
            meta.data_block_index,
        )
        builder.init_by_meta(meta)
        yield builder
    except Exception as e: # pylint: disable=broad-except
        logging.warning("Failed make data block builder, reason %s", e)
        expt = e
    # Release the builder in every path before propagating any failure.
    if builder is not None:
        del builder
    if expt is not None:
        raise expt
def _create_data_block(self, partition_id):
    # Test fixture: dump 64 data blocks of 4 examples each for the given
    # partition and record their metas in self.data_block_matas
    # (attribute name kept as-is; it is referenced elsewhere).
    dbm = data_block_manager.DataBlockManager(self.data_source, partition_id)
    self.assertEqual(dbm.get_dumped_data_block_count(), 0)
    self.assertEqual(dbm.get_lastest_data_block_meta(), None)
    leader_index = 0
    follower_index = 65536  # offset keeps follower ids disjoint from leader ids
    for i in range(64):
        builder = DataBlockBuilder(
            common.data_source_data_block_dir(self.data_source),
            self.data_source.data_source_meta.name,
            partition_id, i,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), None
        )
        builder.set_data_block_manager(dbm)
        for j in range(4):
            feat = {}
            example_id = '{}'.format(i * 1024 + j).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = random.randint(0, 10)
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            feat['leader_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[leader_index]))
            feat['follower_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[follower_index]))
            example = tf.train.Example(features=tf.train.Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            leader_index += 1
            follower_index += 1
        self.data_block_matas.append(builder.finish_data_block())
def add_data_block(self, partition_id, x, y):
    """Dump one data block for *partition_id* from feature rows of x.

    Each row of x becomes one example (label from y when y is given);
    returns the finished data block meta.
    """
    manager = self._dbms[partition_id]
    builder = DataBlockBuilder(
        common.data_source_data_block_dir(self._data_source),
        self._data_source.data_source_meta.name,
        partition_id,
        manager.get_dumped_data_block_count(),
        dj_pb.WriterOptions(output_writer="TF_RECORD"),
        None)
    builder.set_data_block_manager(manager)
    for row in range(x.shape[0]):
        features = {
            'example_id': Feature(
                bytes_list=BytesList(value=['{}'.format(row).encode()])),
            'event_time': Feature(int64_list=Int64List(value=[row])),
            'x': Feature(float_list=FloatList(value=list(x[row]))),
        }
        if y is not None:
            features['y'] = Feature(int64_list=Int64List(value=[y[row]]))
        record = Example(features=Features(feature=features))
        builder.append_item(TfExampleItem(record.SerializeToString()),
                            row, 0)
    return builder.finish_data_block()
def generate_leader_raw_data(self):
    # Test fixture: regenerate leader raw data — write examples into
    # data-block formatted files under raw_data_dir, register each
    # finished file with the manifest manager, then delete every file
    # that is not a data block payload.
    dbm = data_block_manager.DataBlockManager(self.data_source_l, 0)
    raw_data_dir = os.path.join(self.data_source_l.raw_data_dir,
                                common.partition_repr(0))
    if gfile.Exists(raw_data_dir):
        gfile.DeleteRecursively(raw_data_dir)
    gfile.MakeDirs(raw_data_dir)
    rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source_l, 0)
    block_index = 0
    builder = DataBlockBuilder(
        self.data_source_l.raw_data_dir,
        self.data_source_l.data_source_meta.name,
        0, block_index,
        dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
    process_index = 0
    start_index = 0
    for i in range(0, self.leader_end_index + 3):
        # Roll to a new block every 2048 items; flush the final
        # (possibly partial) block on the last-but-one index.
        if (i > 0 and i % 2048 == 0) or (i == self.leader_end_index + 2):
            meta = builder.finish_data_block()
            if meta is not None:
                ofname = common.encode_data_block_fname(
                    self.data_source_l.data_source_meta.name, meta)
                fpath = os.path.join(raw_data_dir, ofname)
                self.manifest_manager.add_raw_data(0, [
                    dj_pb.RawDataMeta(
                        file_path=fpath,
                        timestamp=timestamp_pb2.Timestamp(seconds=3))
                ], False)
                process_index += 1
                start_index += len(meta.example_ids)
            block_index += 1
            builder = DataBlockBuilder(
                self.data_source_l.raw_data_dir,
                self.data_source_l.data_source_meta.name,
                0, block_index,
                dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        feat = {}
        # NOTE(review): '+' binds tighter than '<<', so this evaluates as
        # (i + 1) << 30; confirm the intent was not i + (1 << 30).
        pt = i + 1 << 30
        if i % 3 == 0:
            pt = i // 3
        example_id = '{}'.format(pt).encode()
        feat['example_id'] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[example_id]))
        event_time = 150000000 + pt
        feat['event_time'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[event_time]))
        example = tf.train.Example(features=tf.train.Features(
            feature=feat))
        builder.append_item(TfExampleItem(example.SerializeToString()), i, i)
    fpaths = [os.path.join(raw_data_dir, f)
              for f in gfile.ListDirectory(raw_data_dir)
              if not gfile.IsDirectory(os.path.join(raw_data_dir, f))]
    # Keep only data block payload files as the raw data inputs.
    for fpath in fpaths:
        if not fpath.endswith(common.DataBlockSuffix):
            gfile.Remove(fpath)
def _make_data_block_builder(self, meta):
    # Context-manager generator: yield a DataBlockBuilder initialized
    # from an existing data block meta; any exception raised while
    # building or inside the with-body is logged and re-raised so the
    # caller still observes the failure.
    assert self._partition_id == meta.partition_id, \
        "partition id of building data block meta mismatch "\
        "{} != {}".format(self._partition_id, meta.partition_id)
    builder = None
    expt = None
    try:
        builder = DataBlockBuilder(
            common.data_source_data_block_dir(self._data_source),
            self._data_source.data_source_meta.name,
            self._partition_id,
            meta.data_block_index,
            self._data_block_builder_options
        )
        builder.init_by_meta(meta)
        builder.set_data_block_manager(self._data_block_manager)
        yield builder
    except Exception as e: # pylint: disable=broad-except
        logging.warning("Failed make data block builder, " \
                        "reason %s", e)
        expt = e
    # Release the builder in every path before propagating the error.
    if builder is not None:
        del builder
    if expt is not None:
        raise expt
def _create_data_block(self, data_source, partition_id, x, y):
    # Test fixture: split x/y into N data blocks of chunk_size examples
    # each and dump them through a DataBlockManager; returns the metas.
    data_block_metas = []
    dbm = data_block_manager.DataBlockManager(data_source, partition_id)
    self.assertEqual(dbm.get_dumped_data_block_count(), 0)
    self.assertEqual(dbm.get_lastest_data_block_meta(), None)
    N = 200
    chunk_size = x.shape[0] // N
    leader_index = 0
    follower_index = N * chunk_size * 10  # disjoint from leader index range
    for i in range(N):
        builder = DataBlockBuilder(
            common.data_source_data_block_dir(data_source),
            data_source.data_source_meta.name,
            partition_id, i,
            dj_pb.WriterOptions(output_writer="TF_RECORD"), None
        )
        builder.set_data_block_manager(dbm)
        for j in range(chunk_size):
            feat = {}
            idx = i * chunk_size + j
            exam_id = '{}'.format(idx).encode()
            feat['example_id'] = Feature(
                bytes_list=BytesList(value=[exam_id]))
            evt_time = random.randint(1, 1000)
            feat['event_time'] = Feature(
                int64_list = Int64List(value=[evt_time])
            )
            feat['x'] = Feature(float_list=FloatList(value=list(x[idx])))
            if y is not None:
                feat['y'] = Feature(int64_list=Int64List(value=[y[idx]]))
            feat['leader_index'] = Feature(
                int64_list = Int64List(value=[leader_index])
            )
            feat['follower_index'] = Feature(
                int64_list = Int64List(value=[follower_index])
            )
            example = Example(features=Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            leader_index += 1
            follower_index += 1
        data_block_metas.append(builder.finish_data_block())
    # Remember the highest index handed out, for later range checks.
    self.max_index = follower_index
    return data_block_metas
def generate_follower_data_block(self):
    # Test fixture: dump 5 follower data blocks of 1024 examples each;
    # records metas in self.dumped_metas and the leader index bounds.
    dbm = data_block_manager.DataBlockManager(self.data_source_f, 0)
    self.assertEqual(dbm.get_dumped_data_block_count(), 0)
    self.assertEqual(dbm.get_lastest_data_block_meta(), None)
    leader_index = 0
    follower_index = 65536  # offset keeps follower ids disjoint from leader ids
    self.dumped_metas = []
    for i in range(5):
        builder = DataBlockBuilder(
            common.data_source_data_block_dir(self.data_source_f),
            self.data_source_f.data_source_meta.name,
            0, i,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        builder.set_data_block_manager(dbm)
        for j in range(1024):
            feat = {}
            example_id = '{}'.format(i * 1024 + j).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = 150000000 + i * 1024 + j
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            feat['leader_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[leader_index]))
            feat['follower_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[follower_index]))
            example = tf.train.Example(features=tf.train.Features(
                feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            # leader advances by 3, so only every third leader id appears
            leader_index += 3
            follower_index += 1
        meta = builder.finish_data_block()
        self.dumped_metas.append(meta)
    self.leader_start_index = 0
    self.leader_end_index = leader_index
    self.assertEqual(dbm.get_dumped_data_block_count(), 5)
    for (idx, meta) in enumerate(self.dumped_metas):
        self.assertEqual(dbm.get_data_block_meta_by_index(idx), meta)
def generate_raw_data(self, begin_index, item_count):
    # Test fixture: generate item_count examples (ids starting at
    # begin_index) packed into blocks of 2048, with bounded local
    # shuffling (window 32) and ~80% of examples labeled, then register
    # the produced data block files as raw data inputs.
    raw_data_dir = os.path.join(self.raw_data_dir, common.partition_repr(0))
    if not gfile.Exists(raw_data_dir):
        gfile.MakeDirs(raw_data_dir)
    self.total_raw_data_count += item_count
    useless_index = 0
    # NOTE(review): rdm is never read below — presumably kept for its
    # constructor's side effects; confirm before removing.
    rdm = raw_data_visitor.RawDataManager(self.kvstore, self.data_source, 0)
    fpaths = []
    for block_index in range(0, item_count // 2048):
        builder = DataBlockBuilder(
            self.raw_data_dir,
            self.data_source.data_source_meta.name,
            0, block_index,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        cands = list(range(begin_index + block_index * 2048,
                           begin_index + (block_index + 1) * 2048))
        start_index = cands[0]
        # Randomly swap pairs of nearby candidates so the ids end up
        # only locally out of order (each stays within 32 positions).
        for i in range(len(cands)):
            if random.randint(1, 4) > 2:
                continue
            a = random.randint(i - 32, i + 32)
            b = random.randint(i - 32, i + 32)
            if a < 0:
                a = 0
            if a >= len(cands):
                a = len(cands) - 1
            if b < 0:
                b = 0
            if b >= len(cands):
                b = len(cands) - 1
            if (abs(cands[a] - i - start_index) <= 32 and
                    abs(cands[b] - i - start_index) <= 32):
                cands[a], cands[b] = cands[b], cands[a]
        for example_idx in cands:
            feat = {}
            example_id = '{}'.format(example_idx).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = 150000000 + example_idx
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            label = random.choice([1, 0])
            # ~80% of examples carry a label feature.
            if random.random() < 0.8:
                feat['label'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label]))
            example = tf.train.Example(features=tf.train.Features(
                feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                useless_index, useless_index)
            useless_index += 1
        meta = builder.finish_data_block()
        fname = common.encode_data_block_fname(
            self.data_source.data_source_meta.name, meta)
        fpath = os.path.join(raw_data_dir, fname)
        fpaths.append(
            dj_pb.RawDataMeta(
                file_path=fpath,
                timestamp=timestamp_pb2.Timestamp(seconds=3)))
        self.g_data_block_index += 1
    all_files = [os.path.join(raw_data_dir, f) for f in
                 gfile.ListDirectory(raw_data_dir)
                 if not gfile.IsDirectory(os.path.join(raw_data_dir, f))]
    # Remove non-payload files (e.g. meta files) from the raw data dir.
    for fpath in all_files:
        if not fpath.endswith(common.DataBlockSuffix):
            gfile.Remove(fpath)
    self.manifest_manager.add_raw_data(0, fpaths, False)
class ExampleJoiner(object):
    """Base class joining leader example ids with follower raw data.

    Joined results are dumped as data blocks through a DataBlockManager.
    Mutable flags are guarded by ``self._lock``.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 etcd, data_source, partition_id):
        self._lock = threading.Lock()
        self._example_joiner_options = example_joiner_options
        self._raw_data_options = raw_data_options
        self._data_source = data_source
        self._partition_id = partition_id
        self._leader_visitor = \
                ExampleIdVisitor(etcd, self._data_source, self._partition_id)
        self._follower_visitor = \
                RawDataVisitor(etcd, self._data_source,
                               self._partition_id, raw_data_options)
        self._data_block_manager = \
                DataBlockManager(self._data_source, self._partition_id)
        self._data_block_builder = None
        self._state_stale = False
        self._follower_restart_index = 0
        self._sync_example_id_finished = False
        self._raw_data_finished = False
        self._join_finished = False
        self._latest_dump_timestamp = time.time()
        self._sync_state()

    @contextmanager
    def make_example_joiner(self):
        """Yield the inner joiner with the state-stale flag held.

        The flag is now released in a ``finally`` clause so it cannot be
        leaked when the ``with`` body raises (the original released it
        only on the success path).
        """
        state_stale = self._is_state_stale()
        self._acuqire_state_stale()
        try:
            yield self._inner_joiner(state_stale)
        finally:
            self._release_state_stale()

    @classmethod
    def name(cls):
        return 'BASE_EXAMPLE_JOINER'

    def get_data_block_meta_by_index(self, index):
        # Returns (join_finished, meta) under the lock for a consistent view.
        with self._lock:
            manager = self._data_block_manager
            return self._join_finished, \
                    manager.get_data_block_meta_by_index(index)

    def get_dumped_data_block_count(self):
        return self._data_block_manager.get_dumped_data_block_count()

    def is_join_finished(self):
        with self._lock:
            return self._join_finished

    def set_sync_example_id_finished(self):
        with self._lock:
            self._sync_example_id_finished = True

    def set_raw_data_finished(self):
        with self._lock:
            self._raw_data_finished = True

    def is_sync_example_id_finished(self):
        with self._lock:
            return self._sync_example_id_finished

    def is_raw_data_finished(self):
        with self._lock:
            return self._raw_data_finished

    def need_join(self):
        # True when there is (or may be) work for the joiner to do.
        with self._lock:
            if self._join_finished:
                return False
            if self._state_stale or self._sync_example_id_finished:
                return True
            if self._follower_visitor.is_visitor_stale() or \
                    self._leader_visitor.is_visitor_stale():
                return True
            if not self._follower_visitor.finished() and \
                    not self._leader_visitor.finished():
                return True
            return self._need_finish_data_block_since_interval()

    def _inner_joiner(self, reset_state):
        raise NotImplementedError(
            "_inner_joiner not implement for base class: %s" %
            ExampleJoiner.name())

    def _is_state_stale(self):
        with self._lock:
            return self._state_stale

    def _active_visitors(self):
        self._leader_visitor.active_visitor()
        self._follower_visitor.active_visitor()

    def _sync_state(self):
        # Re-seek both visitors to the positions recorded in the last
        # dumped meta; with no meta, restart both from scratch.
        meta = self._data_block_manager.get_lastest_data_block_meta()
        if meta is not None:
            try:
                self._leader_visitor.seek(meta.leader_end_index)
            except StopIteration:
                logging.warning("leader visitor finished")
            try:
                self._follower_visitor.seek(meta.follower_restart_index)
            except StopIteration:
                logging.warning("follower visitor finished")
        else:
            self._leader_visitor.reset()
            self._follower_visitor.reset()

    def _get_data_block_builder(self, create_if_no_existed):
        # Lazily create the builder; when create_if_no_existed is False
        # this may return None.
        if self._data_block_builder is None and create_if_no_existed:
            data_block_index = \
                    self._data_block_manager.get_dumped_data_block_count()
            self._data_block_builder = DataBlockBuilder(
                    self._data_source.data_block_dir,
                    self._data_source.data_source_meta.name,
                    self._partition_id,
                    data_block_index,
                    self._example_joiner_options.data_block_dump_threshold)
            self._data_block_builder.set_data_block_manager(
                    self._data_block_manager)
            self._data_block_builder.set_follower_restart_index(
                    self._follower_restart_index)
        return self._data_block_builder

    def _finish_data_block(self):
        # Seal the current builder (if any) and return its meta.
        if self._data_block_builder is not None:
            meta = self._data_block_builder.finish_data_block()
            self._reset_data_block_builder()
            self._update_latest_dump_timestamp()
            return meta
        return None

    def _reset_data_block_builder(self):
        builder = None
        with self._lock:
            builder = self._data_block_builder
            self._data_block_builder = None
        if builder is not None:
            del builder

    def _update_latest_dump_timestamp(self):
        with self._lock:
            self._latest_dump_timestamp = time.time()

    def _acuqire_state_stale(self):
        # (sic) name kept for compatibility with existing callers.
        with self._lock:
            self._state_stale = True

    def _release_state_stale(self):
        with self._lock:
            self._state_stale = False

    def _set_join_finished(self):
        with self._lock:
            self._join_finished = True

    def _need_finish_data_block_since_interval(self):
        # A non-positive dump_interval disables interval-based flushing.
        dump_interval = self._example_joiner_options.data_block_dump_interval
        duration_since_dump = time.time() - self._latest_dump_timestamp
        return 0 < dump_interval <= duration_since_dump
def generate_raw_data(self, etcd, rdp, data_source, partition_id,
                      block_size, shuffle_win_size, feat_key_fmt,
                      feat_val_fmt):
    # Test fixture: rebuild raw data for one partition — blocks of
    # block_size examples, locally shuffled within shuffle_win_size,
    # each carrying an extra formatted feature — and publish the new
    # files through the raw data publisher rdp.
    dbm = data_block_manager.DataBlockManager(data_source, partition_id)
    raw_data_dir = os.path.join(data_source.raw_data_dir,
                                common.partition_repr(partition_id))
    if gfile.Exists(raw_data_dir):
        gfile.DeleteRecursively(raw_data_dir)
    gfile.MakeDirs(raw_data_dir)
    useless_index = 0
    new_raw_data_fnames = []
    for block_index in range(self.total_index // block_size):
        builder = DataBlockBuilder(
            data_source.raw_data_dir,
            data_source.data_source_meta.name,
            partition_id, block_index,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        cands = list(range(block_index * block_size,
                           (block_index + 1) * block_size))
        start_index = cands[0]
        # Random local swaps bounded by shuffle_win_size positions.
        for i in range(len(cands)):
            if random.randint(1, 4) > 2:
                continue
            a = random.randint(i - shuffle_win_size, i + shuffle_win_size)
            b = random.randint(i - shuffle_win_size, i + shuffle_win_size)
            if a < 0:
                a = 0
            if a >= len(cands):
                a = len(cands) - 1
            if b < 0:
                b = 0
            if b >= len(cands):
                b = len(cands) - 1
            if (abs(cands[a] - i - start_index) <= shuffle_win_size and
                    abs(cands[b] - i - start_index) <= shuffle_win_size):
                cands[a], cands[b] = cands[b], cands[a]
        for example_idx in cands:
            feat = {}
            example_id = '{}'.format(example_idx).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = 150000000 + example_idx
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            feat[feat_key_fmt.format(example_idx)] = tf.train.Feature(
                bytes_list=tf.train.BytesList(
                    value=[feat_val_fmt.format(example_idx).encode()]))
            example = tf.train.Example(features=tf.train.Features(
                feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                useless_index, useless_index)
            useless_index += 1
        meta = builder.finish_data_block()
        fname = common.encode_data_block_fname(
            data_source.data_source_meta.name, meta)
        new_raw_data_fnames.append(os.path.join(raw_data_dir, fname))
    fpaths = [os.path.join(raw_data_dir, f)
              for f in gfile.ListDirectory(raw_data_dir)
              if not gfile.IsDirectory(os.path.join(raw_data_dir, f))]
    # Drop meta files; only payload files are published as raw data.
    for fpath in fpaths:
        if fpath.endswith(common.DataBlockMetaSuffix):
            gfile.Remove(fpath)
    rdp.publish_raw_data(partition_id, new_raw_data_fnames)
def test_data_block_manager(self):
    # End-to-end check of DataBlockManager: dump 5 blocks of 1024
    # examples, then verify metas, on-disk files, and stored records.
    data_block_datas = []
    data_block_metas = []
    leader_index = 0
    follower_index = 65536  # offset keeps follower ids disjoint from leader ids
    for i in range(5):
        fill_examples = []
        builder = DataBlockBuilder(
            self.data_source.data_block_dir,
            self.data_source.data_source_meta.name,
            0, i,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        builder.set_data_block_manager(self.data_block_manager)
        for j in range(1024):
            feat = {}
            example_id = '{}'.format(i * 1024 + j).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = 150000000 + i * 1024 + j
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            feat['leader_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[leader_index]))
            feat['follower_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[follower_index]))
            example = tf.train.Example(features=tf.train.Features(
                feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            # Keep a copy of each example plus expected feature values
            # for later comparison against what is read back from disk.
            fill_examples.append((example, {
                'example_id': example_id,
                'event_time': event_time,
                'leader_index': leader_index,
                'follower_index': follower_index
            }))
            leader_index += 1
            follower_index += 1
        meta = builder.finish_data_block()
        data_block_datas.append(fill_examples)
        data_block_metas.append(meta)
    # Manager state after dumping all blocks.
    self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                     5)
    self.assertEqual(self.data_block_manager.get_lastest_data_block_meta(),
                     data_block_metas[-1])
    for (idx, meta) in enumerate(data_block_metas):
        self.assertEqual(
            self.data_block_manager.get_data_block_meta_by_index(idx),
            meta)
        self.assertEqual(
            meta.block_id,
            common.encode_block_id(self.data_source.data_source_meta.name,
                                   meta))
    self.assertEqual(
        self.data_block_manager.get_data_block_meta_by_index(5), None)
    data_block_dir = os.path.join(self.data_source.data_block_dir,
                                  common.partition_repr(0))
    for (i, meta) in enumerate(data_block_metas):
        data_block_fpath = os.path.join(
            data_block_dir, meta.block_id) + common.DataBlockSuffix
        data_block_meta_fpath = os.path.join(
            data_block_dir,
            common.encode_data_block_meta_fname(
                self.data_source.data_source_meta.name,
                0, meta.data_block_index))
        self.assertTrue(gfile.Exists(data_block_fpath))
        self.assertTrue(gfile.Exists(data_block_meta_fpath))
        # The meta file must round-trip through text_format.
        fiter = tf.io.tf_record_iterator(data_block_meta_fpath)
        remote_meta = text_format.Parse(next(fiter).decode(),
                                        dj_pb.DataBlockMeta())
        self.assertEqual(meta, remote_meta)
        for (j, record) in enumerate(
                tf.io.tf_record_iterator(data_block_fpath)):
            example = tf.train.Example()
            example.ParseFromString(record)
            stored_data = data_block_datas[i][j]
            self.assertEqual(example, stored_data[0])
            feat = example.features.feature
            stored_feat = stored_data[1]
            self.assertTrue('example_id' in feat)
            self.assertTrue('example_id' in stored_feat)
            self.assertEqual(stored_feat['example_id'],
                             '{}'.format(i * 1024 + j).encode())
            self.assertEqual(stored_feat['example_id'],
                             feat['example_id'].bytes_list.value[0])
            self.assertTrue('event_time' in feat)
            self.assertTrue('event_time' in stored_feat)
            self.assertEqual(stored_feat['event_time'],
                             feat['event_time'].int64_list.value[0])
            self.assertTrue('leader_index' in feat)
            self.assertTrue('leader_index' in stored_feat)
            self.assertEqual(stored_feat['leader_index'],
                             feat['leader_index'].int64_list.value[0])
            self.assertTrue('follower_index' in feat)
            self.assertTrue('follower_index' in stored_feat)
            self.assertEqual(stored_feat['follower_index'],
                             feat['follower_index'].int64_list.value[0])
        # Each block must contain exactly 1024 records.
        self.assertEqual(j, 1023)
    data_block_manager2 = data_block_manager.DataBlockManager(
        self.data_source, 0)
    self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                     5)
class ExampleJoiner(object):
    """Base joiner merging leader example ids with follower raw data.

    Joined examples are dumped as data blocks through DataBlockManager;
    join statistics (JoinerStats/OptionalStats) and metrics are
    accumulated along the way. Mutable flags are guarded by
    ``self._lock``.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        self._lock = threading.Lock()
        self._example_joiner_options = example_joiner_options
        self._raw_data_options = raw_data_options
        self._data_source = data_source
        self._partition_id = partition_id
        self._leader_visitor = \
                ExampleIdVisitor(kvstore, self._data_source,
                                 self._partition_id)
        self._follower_visitor = \
                RawDataVisitor(kvstore, self._data_source,
                               self._partition_id, raw_data_options)
        self._data_block_manager = \
                DataBlockManager(self._data_source, self._partition_id)
        # Restore joiner statistics from the last dumped meta, if any.
        meta = self._data_block_manager.get_lastest_data_block_meta()
        if meta is None:
            self._joiner_stats = JoinerStats(0, -1, -1)
        else:
            stats_info = meta.joiner_stats_info
            self._joiner_stats = JoinerStats(stats_info.stats_cum_join_num,
                                             stats_info.leader_stats_index,
                                             stats_info.follower_stats_index)
        self._data_block_builder_options = data_block_builder_options
        self._data_block_builder = None
        self._state_stale = False
        self._follower_restart_index = 0
        self._sync_example_id_finished = False
        self._raw_data_finished = False
        self._join_finished = False
        ds_name = self._data_source.data_source_meta.name
        self._metrics_tags = {
            'data_source_name': ds_name,
            'partition': partition_id,
            'joiner_name': self.name()
        }
        self._optional_stats = OptionalStats(raw_data_options,
                                             self._metrics_tags)
        self._latest_dump_timestamp = time.time()
        self._sync_state()

    @contextmanager
    def make_example_joiner(self):
        """Yield the inner joiner with the state-stale flag held.

        The flag is now released in a ``finally`` clause so it cannot be
        leaked when the ``with`` body raises (the original released it
        only on the success path).
        """
        state_stale = self._is_state_stale()
        self._acuqire_state_stale()
        try:
            yield self._inner_joiner(state_stale)
        finally:
            self._release_state_stale()

    @classmethod
    def name(cls):
        return 'BASE_EXAMPLE_JOINER'

    def get_data_block_meta_by_index(self, index):
        # Returns (join_finished, meta) under the lock for a consistent view.
        with self._lock:
            manager = self._data_block_manager
            return self._join_finished, \
                    manager.get_data_block_meta_by_index(index)

    def get_dumped_data_block_count(self):
        return self._data_block_manager.get_dumped_data_block_count()

    def is_join_finished(self):
        with self._lock:
            return self._join_finished

    def set_sync_example_id_finished(self):
        with self._lock:
            self._sync_example_id_finished = True

    def set_raw_data_finished(self):
        with self._lock:
            self._raw_data_finished = True

    def is_sync_example_id_finished(self):
        with self._lock:
            return self._sync_example_id_finished

    def is_raw_data_finished(self):
        with self._lock:
            return self._raw_data_finished

    def need_join(self):
        # True when there is (or may be) work for the joiner to do.
        with self._lock:
            if self._join_finished:
                return False
            if self._state_stale or self._sync_example_id_finished:
                return True
            if self._follower_visitor.is_visitor_stale() or \
                    self._leader_visitor.is_visitor_stale():
                return True
            if not self._follower_visitor.finished() and \
                    not self._leader_visitor.finished():
                return True
            return self._need_finish_data_block_since_interval()

    def _prepare_join(self, state_stale):
        # Resync persisted state (and drop any half-built block) when the
        # caller observed a stale state; then refresh visitor activity.
        if state_stale:
            self._sync_state()
            self._reset_data_block_builder()
        sync_example_id_finished = self.is_sync_example_id_finished()
        raw_data_finished = self.is_raw_data_finished()
        self._active_visitors()
        return sync_example_id_finished, raw_data_finished

    def _inner_joiner(self, reset_state):
        raise NotImplementedError(
            "_inner_joiner not implement for base class: %s" %
            ExampleJoiner.name())

    def _is_state_stale(self):
        with self._lock:
            return self._state_stale

    def _active_visitors(self):
        self._leader_visitor.active_visitor()
        self._follower_visitor.active_visitor()

    def _sync_state(self):
        # Re-seek both visitors to the positions recorded in the last
        # dumped meta; with no meta, restart both from scratch.
        meta = self._data_block_manager.get_lastest_data_block_meta()
        if meta is not None:
            try:
                self._leader_visitor.seek(meta.leader_end_index)
            except StopIteration:
                logging.warning("leader visitor finished")
            try:
                self._follower_visitor.seek(meta.follower_restart_index)
            except StopIteration:
                logging.warning("follower visitor finished")
        else:
            self._leader_visitor.reset()
            self._follower_visitor.reset()

    def _get_data_block_builder(self, create_if_no_existed):
        # Lazily create the builder; when create_if_no_existed is False
        # this may return None.
        if self._data_block_builder is None and create_if_no_existed:
            data_block_index = \
                    self._data_block_manager.get_dumped_data_block_count()
            self._data_block_builder = DataBlockBuilder(
                    common.data_source_data_block_dir(self._data_source),
                    self._data_source.data_source_meta.name,
                    self._partition_id,
                    data_block_index,
                    self._data_block_builder_options,
                    self._example_joiner_options.data_block_dump_threshold)
            self._data_block_builder.set_data_block_manager(
                    self._data_block_manager)
            self._data_block_builder.set_follower_restart_index(
                    self._follower_restart_index)
        return self._data_block_builder

    def _finish_data_block(self):
        # Seal the current builder (if any), attach join stats, emit
        # metrics, and return the finished meta.
        if self._data_block_builder is not None:
            self._data_block_builder.set_join_stats_info(
                self._create_join_stats_info())
            meta = self._data_block_builder.finish_data_block(
                True, self._metrics_tags)
            self._optional_stats.emit_optional_stats()
            self._reset_data_block_builder()
            self._update_latest_dump_timestamp()
            return meta
        return None

    def _create_join_stats_info(self):
        # Cumulative join stats: estimated count from JoinerStats plus
        # actual counts carried forward from the last dumped meta.
        builder = self._get_data_block_builder(False)
        nstats_cum_join_num = self._joiner_stats.calc_stats_joined_num()
        nactual_cum_join_num = 0 if builder is None \
                               else builder.example_count()
        meta = self._data_block_manager.get_lastest_data_block_meta()
        if meta is not None:
            nactual_cum_join_num += meta.joiner_stats_info.actual_cum_join_num
        return dj_pb.JoinerStatsInfo(
            stats_cum_join_num=nstats_cum_join_num,
            actual_cum_join_num=nactual_cum_join_num,
            leader_stats_index=self._joiner_stats.get_leader_stats_index(),
            follower_stats_index=self._joiner_stats.get_follower_stats_index())

    def _reset_data_block_builder(self):
        builder = None
        with self._lock:
            builder = self._data_block_builder
            self._data_block_builder = None
        if builder is not None:
            del builder

    def _update_latest_dump_timestamp(self):
        # Emit how long this data block took to fill, then restart the clock.
        data_block_dump_duration = time.time() - self._latest_dump_timestamp
        metrics.emit_timer(name='data_block_dump_duration',
                           value=int(data_block_dump_duration),
                           tags=self._metrics_tags)
        self._latest_dump_timestamp = time.time()

    def _acuqire_state_stale(self):
        # (sic) name kept for compatibility with existing callers.
        with self._lock:
            self._state_stale = True

    def _release_state_stale(self):
        with self._lock:
            self._state_stale = False

    def _set_join_finished(self):
        with self._lock:
            self._join_finished = True

    def _need_finish_data_block_since_interval(self):
        # A non-positive dump_interval disables interval-based flushing.
        dump_interval = self._example_joiner_options.data_block_dump_interval
        duration_since_dump = time.time() - self._latest_dump_timestamp
        return 0 < dump_interval <= duration_since_dump
class ExampleJoiner(object):
    """Legacy example joiner base: drives leader/follower visitors and
    dumps joined examples through a DataBlockBuilder."""

    def __init__(self, etcd, data_source, partition_id, options):
        self._data_source = data_source
        self._partition_id = partition_id
        self._leader_visitor = ExampleIdVisitor(
            ExampleIdManager(data_source, partition_id))
        self._follower_visitor = RawDataVisitor(etcd, data_source,
                                                partition_id, options)
        self._data_block_manager = DataBlockManager(data_source, partition_id)
        self._data_block_builder = None
        self._stale_with_dfs = False
        self._follower_restart_index = 0
        self._sync_state()

    def join_example(self):
        # Must be implemented by subclasses.
        raise NotImplementedError(
            "join exampel not implement for base class: %s" %
            ExampleJoiner.name())

    @classmethod
    def name(cls):
        return 'EXAMPLE_JOINER'

    def get_data_block_number(self):
        return self._data_block_manager.num_dumped_data_block()

    def get_data_block_meta(self, index):
        return self._data_block_manager.get_data_block_meta_by_index(index)

    def join_finished(self):
        return self._data_block_manager.join_finished()

    def _sync_state(self):
        # Re-seek both visitors to the positions recorded in the last
        # dumped data block meta; finish the join when either stream is
        # exhausted, then clear the DFS-stale flag.
        meta = self._data_block_manager.get_last_data_block_meta(
            self._stale_with_dfs)
        if meta is not None:
            try:
                self._leader_visitor.seek(meta.leader_end_index)
            except StopIteration:
                logging.warning("leader visitor finished")
            try:
                self._follower_visitor.seek(meta.follower_restart_index)
            except StopIteration:
                logging.warning("follower visitor finished")
        if (self._leader_visitor.finished() or
                self._follower_visitor.finished()):
            self._data_block_manager.finish_join()
        self._stale_with_dfs = False

    def _get_data_block_builder(self):
        # Lazily create the builder; its block index continues from the
        # number of blocks already dumped.
        if self._data_block_builder is not None:
            return self._data_block_builder
        data_block_index = self._data_block_manager.get_dumped_data_block_num()
        self._data_block_builder = DataBlockBuilder(
            self._data_source.data_block_dir,
            self._partition_id,
            data_block_index,
            self._data_source.data_source_meta.max_example_in_data_block)
        return self._data_block_builder

    def _finish_data_block(self):
        # Seal the current builder: record the follower restart position,
        # flush the block, and hand the resulting meta to the manager.
        assert self._data_block_builder is not None
        self._data_block_builder.set_follower_restart_index(
            self._follower_restart_index)
        self._data_block_builder.finish_data_block()
        meta = self._data_block_builder.get_data_block_meta()
        if meta is not None:
            self._data_block_manager.add_dumped_data_block_meta(meta)
        self._data_block_builder = None