def _build_data_block_meta(self):
    """Persist the finished data block meta as a single-record TFRecord file.

    The meta is first serialized (protobuf text format) into a temporary
    file; the final file only appears via commit/rename, so readers never
    observe a partially written meta.
    """
    meta = self._data_block_meta
    tmp_path = self._get_tmp_fpath()
    with tf.io.TFRecordWriter(tmp_path) as writer:
        writer.write(text_format.MessageToString(meta).encode())
    manager = self._data_block_manager
    if manager is None:
        # No manager: publish directly by renaming into the data block dir.
        fname = encode_data_block_meta_fname(self._data_source_name,
                                             self._partition_id,
                                             meta.data_block_index)
        gfile.Rename(tmp_path,
                     os.path.join(self._get_data_block_dir(), fname))
    else:
        # Let the manager own the commit (it tracks dumped block state).
        manager.commit_data_block_meta(tmp_path, meta)
def LoadDataBlockReqByIndex(self, partition_id, data_block_index):
    """Load the data block at *data_block_index* in *partition_id*.

    Returns a DataBlockRep for the block, or None when the meta does not
    exist or the block is filtered out as not yet visible. Raises
    IndexError for a partition id outside the data source's range.
    """
    total = self._data_source.data_source_meta.partition_num
    if not 0 <= partition_id < total:
        raise IndexError("partition {} out range".format(partition_id))
    block_dir = self._partition_data_block_dir(partition_id)
    meta_path = os.path.join(
        block_dir,
        encode_data_block_meta_fname(self._data_source_name(),
                                     partition_id, data_block_index))
    meta = load_data_block_meta(meta_path)
    # Sync the manifest unconditionally, as the visibility filter needs it.
    manifest = self._sync_raw_data_manifest(partition_id)
    if meta is None:
        return None
    if self._filter_by_visible(meta.data_block_index, manifest):
        return None
    fname = encode_data_block_fname(self._data_source_name(), meta)
    return DataBlockRep(self._data_source_name(), fname,
                        partition_id, block_dir)
def __init__(self, data_source_name, data_block_fname, partition_id,
             dirpath, check_existed=True):
    """Build a representation of an on-disk data block from its file name.

    The fname must end with DataBlockSuffix; the block id encoded in it is
    decoded and cross-checked against *data_source_name* and *partition_id*.
    When *check_existed* is True, the matching meta file must exist (and
    not be a directory) under *dirpath*.

    Raises ValueError on any mismatch or missing meta file.
    """
    assert data_block_fname.endswith(DataBlockSuffix), \
        "data block fname {} should has suffix {}".format(
            data_block_fname, DataBlockSuffix
        )
    block_id = data_block_fname[:-len(DataBlockSuffix)]
    fields = decode_block_id(block_id)
    # Validate each decoded field against the caller-provided identity.
    if fields["data_source_name"] != data_source_name:
        raise ValueError(
            "{} invalid. Data source name mismatch "
            "{} != {}".format(data_block_fname,
                              fields["data_source_name"], data_source_name))
    self._data_source_name = data_source_name
    if fields["partition_id"] != partition_id:
        raise ValueError(
            "{} invalid. partition mismatch "
            "{} != {}".format(data_block_fname,
                              fields["partition_id"], partition_id))
    self._partition_id = partition_id
    frame_start = fields["time_frame"][0]
    frame_end = fields["time_frame"][1]
    if frame_start > frame_end:
        raise ValueError(
            "{} invalid. time frame error start_time {} > "
            "end_time {}".format(data_block_fname, frame_start, frame_end))
    self._start_time, self._end_time = frame_start, frame_end
    self._data_block_index = fields["data_block_index"]
    self._block_id = block_id
    meta_fname = encode_data_block_meta_fname(self._data_source_name,
                                              self._partition_id,
                                              self._data_block_index)
    meta_fpath = os.path.join(dirpath, meta_fname)
    if check_existed:
        if not gfile.Exists(meta_fpath) or gfile.IsDirectory(meta_fpath):
            raise ValueError(
                "{} invalid. the corresponding meta file "
                "is not existed".format(data_block_fname))
    self._data_block_meta_fpath = meta_fpath
    # Meta is loaded lazily; only the path is recorded here.
    self._data_block_meta = None
    self._data_block_fpath = os.path.join(dirpath, data_block_fname)
def _get_data_block_meta_path(self, data_block_index):
    """Return the full path of the meta file for *data_block_index*."""
    ds_name = self._data_source.data_source_meta.name
    meta_fname = encode_data_block_meta_fname(ds_name,
                                              self._partition_id,
                                              data_block_index)
    return os.path.join(self._data_block_dir(), meta_fname)
def test_data_block_manager(self):
    """Dump 5 data blocks of 1024 examples each via DataBlockBuilder and
    verify the manager's bookkeeping plus the on-disk TFRecord contents.
    """
    data_block_datas = []
    data_block_metas = []
    leader_index = 0
    follower_index = 65536  # offset so leader/follower indices never collide
    for i in range(5):
        fill_examples = []
        builder = DataBlockBuilder(
            self.data_source.data_block_dir,
            self.data_source.data_source_meta.name,
            0, i,  # partition 0, data block index i
            dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        builder.set_data_block_manager(self.data_block_manager)
        for j in range(1024):
            feat = {}
            # Example ids are globally unique across the 5 blocks.
            example_id = '{}'.format(i * 1024 + j).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = 150000000 + i * 1024 + j
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            feat['leader_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[leader_index]))
            feat['follower_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[follower_index]))
            example = tf.train.Example(features=tf.train.Features(
                feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            # Keep a copy of what was written for later verification.
            fill_examples.append((example, {
                'example_id': example_id,
                'event_time': event_time,
                'leader_index': leader_index,
                'follower_index': follower_index
            }))
            leader_index += 1
            follower_index += 1
        meta = builder.finish_data_block()
        data_block_datas.append(fill_examples)
        data_block_metas.append(meta)
    # Manager bookkeeping: 5 blocks dumped, latest meta is the last one.
    self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                     5)
    self.assertEqual(self.data_block_manager.get_lastest_data_block_meta(),
                     data_block_metas[-1])
    for (idx, meta) in enumerate(data_block_metas):
        self.assertEqual(
            self.data_block_manager.get_data_block_meta_by_index(idx),
            meta)
        self.assertEqual(
            meta.block_id,
            common.encode_block_id(self.data_source.data_source_meta.name,
                                   meta))
    # Index past the end yields None rather than raising.
    self.assertEqual(
        self.data_block_manager.get_data_block_meta_by_index(5), None)
    data_block_dir = os.path.join(self.data_source.data_block_dir,
                                  common.partition_repr(0))
    for (i, meta) in enumerate(data_block_metas):
        # Data block file is <block_id> + suffix inside the partition dir.
        data_block_fpath = os.path.join(
            data_block_dir, meta.block_id) + common.DataBlockSuffix
        data_block_meta_fpath = os.path.join(
            data_block_dir,
            common.encode_data_block_meta_fname(
                self.data_source.data_source_meta.name,
                0, meta.data_block_index))
        self.assertTrue(gfile.Exists(data_block_fpath))
        self.assertTrue(gfile.Exists(data_block_meta_fpath))
        # Meta file holds one text-format DataBlockMeta record.
        fiter = tf.io.tf_record_iterator(data_block_meta_fpath)
        remote_meta = text_format.Parse(
            next(fiter).decode(), dj_pb.DataBlockMeta())
        self.assertEqual(meta, remote_meta)
        # Every stored example must round-trip byte-for-byte.
        for (j, record) in enumerate(
                tf.io.tf_record_iterator(data_block_fpath)):
            example = tf.train.Example()
            example.ParseFromString(record)
            stored_data = data_block_datas[i][j]
            self.assertEqual(example, stored_data[0])
            feat = example.features.feature
            stored_feat = stored_data[1]
            self.assertTrue('example_id' in feat)
            self.assertTrue('example_id' in stored_feat)
            self.assertEqual(stored_feat['example_id'],
                             '{}'.format(i * 1024 + j).encode())
            self.assertEqual(stored_feat['example_id'],
                             feat['example_id'].bytes_list.value[0])
            self.assertTrue('event_time' in feat)
            self.assertTrue('event_time' in stored_feat)
            self.assertEqual(stored_feat['event_time'],
                             feat['event_time'].int64_list.value[0])
            self.assertTrue('leader_index' in feat)
            self.assertTrue('leader_index' in stored_feat)
            self.assertEqual(stored_feat['leader_index'],
                             feat['leader_index'].int64_list.value[0])
            self.assertTrue('follower_index' in feat)
            self.assertTrue('follower_index' in stored_feat)
            self.assertEqual(stored_feat['follower_index'],
                             feat['follower_index'].int64_list.value[0])
        self.assertEqual(j, 1023)
    # NOTE(review): constructing a second manager presumably verifies that
    # the dumped state is recoverable from disk — confirm; the instance is
    # otherwise unused.
    data_block_manager2 = data_block_manager.DataBlockManager(
        self.data_source, 0)
    self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                     5)
def test_data_block_dumper(self):
    """End-to-end check of DataBlockDumperManager: sync the follower's
    data block metas into the leader side, dump, then verify both sides'
    meta files and data block payloads agree record-for-record.
    """
    self.generate_follower_data_block()
    self.generate_leader_raw_data()
    dbd = data_block_dumper.DataBlockDumperManager(
        self.etcd, self.data_source_l, 0,
        dj_pb.RawDataOptions(raw_data_iter='TF_RECORD'),
        dj_pb.DataBlockBuilderOptions(
            data_block_builder='TF_RECORD_DATABLOCK_BUILDER'),
    )
    # A fresh dumper starts at index 0; adding synced metas in order must
    # advance the next index one step per meta.
    self.assertEqual(dbd.get_next_data_block_index(), 0)
    for (idx, meta) in enumerate(self.dumped_metas):
        success, next_index = dbd.add_synced_data_block_meta(meta)
        self.assertTrue(success)
        self.assertEqual(next_index, idx + 1)
    self.assertTrue(dbd.need_dump())
    self.assertEqual(dbd.get_next_data_block_index(),
                     len(self.dumped_metas))
    with dbd.make_data_block_dumper() as dumper:
        dumper()
    # Both sides must now report the same number of dumped blocks.
    dbm_f = data_block_manager.DataBlockManager(self.data_source_f, 0)
    dbm_l = data_block_manager.DataBlockManager(self.data_source_l, 0)
    self.assertEqual(dbm_f.get_dumped_data_block_count(),
                     len(self.dumped_metas))
    self.assertEqual(dbm_f.get_dumped_data_block_count(),
                     dbm_l.get_dumped_data_block_count())
    for (idx, meta) in enumerate(self.dumped_metas):
        self.assertEqual(meta.data_block_index, idx)
        self.assertEqual(dbm_l.get_data_block_meta_by_index(idx), meta)
        self.assertEqual(dbm_f.get_data_block_meta_by_index(idx), meta)
        # Leader meta file on disk must round-trip to the synced meta.
        meta_fpth_l = os.path.join(
            self.data_source_l.data_block_dir, common.partition_repr(0),
            common.encode_data_block_meta_fname(
                self.data_source_l.data_source_meta.name,
                0, meta.data_block_index))
        mitr = tf.io.tf_record_iterator(meta_fpth_l)
        meta_l = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
        self.assertEqual(meta_l, meta)
        # Follower meta file likewise.
        meta_fpth_f = os.path.join(
            self.data_source_f.data_block_dir, common.partition_repr(0),
            common.encode_data_block_meta_fname(
                self.data_source_f.data_source_meta.name,
                0, meta.data_block_index))
        mitr = tf.io.tf_record_iterator(meta_fpth_f)
        meta_f = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
        self.assertEqual(meta_f, meta)
        # Leader payload: example ids must match the meta, in order.
        data_fpth_l = os.path.join(
            self.data_source_l.data_block_dir, common.partition_repr(0),
            common.encode_data_block_fname(
                self.data_source_l.data_source_meta.name, meta_l))
        for (iidx, record) in enumerate(
                tf.io.tf_record_iterator(data_fpth_l)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feat = example.features.feature
            self.assertEqual(feat['example_id'].bytes_list.value[0],
                             meta.example_ids[iidx])
        self.assertEqual(len(meta.example_ids), iidx + 1)
        # Follower payload. FIX: build the fname from the follower's data
        # source name (was the leader's) to parallel meta_fpth_f above;
        # behavior is unchanged when both sides share one meta name, and
        # correct when they differ.
        data_fpth_f = os.path.join(
            self.data_source_f.data_block_dir, common.partition_repr(0),
            common.encode_data_block_fname(
                self.data_source_f.data_source_meta.name, meta_f))
        for (iidx, record) in enumerate(
                tf.io.tf_record_iterator(data_fpth_f)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feat = example.features.feature
            self.assertEqual(feat['example_id'].bytes_list.value[0],
                             meta.example_ids[iidx])
        self.assertEqual(len(meta.example_ids), iidx + 1)