def test_write_read_file(self):
    """Round-trip a small text file through gfile, then delete it.

    Writes ten identical rows to a file under the test prefix, reads them
    back, checks count and content, removes the file and verifies that it
    no longer exists.
    """
    row_text = "This is row\n"
    row_count = 10

    # Create the parent directory and make sure the target file is absent.
    gfile.MkDir(self.prefix() + ":///test_write_read_file")
    path = self.prefix() + ":///test_write_read_file/1"
    self.assertFalse(gfile.Exists(path))

    # Write the same row `row_count` times, one write call per row.
    with gfile.Open(path, mode="w") as writer:
        for _ in range(row_count):
            writer.write(row_text)

    # Read everything back as a list of lines.
    with gfile.Open(path, mode="r") as reader:
        read_back = reader.readlines()

    # The count is checked first, so iterating the lines directly covers
    # exactly the rows that were written.
    self.assertEqual(row_count, len(read_back))
    for line in read_back:
        self.assertEqual(row_text, line)

    # Delete the file and confirm it is gone.
    gfile.Remove(path)
    self.assertFalse(gfile.Exists(path))
def generate_raw_data(self, begin_index, item_count):
    """Write locally shuffled TF_RECORD raw-data blocks for partition 0.

    Builds ``item_count // 2048`` blocks of 2048 examples each, with example
    ids starting at ``begin_index``.  Inside every block the ids are
    perturbed by random swaps limited to a window of 32 positions.  Each
    example carries ``example_id`` and ``event_time`` features, plus a
    random ``label`` feature when ``random.random() < 0.8``.

    Afterwards every non-directory entry of the partition directory whose
    path does not end with ``common.DataBlockSuffix`` is removed, and the
    generated blocks are registered through
    ``self.manifest_manager.add_raw_data``.

    Args:
        begin_index: example id of the first generated item.
        item_count: number of items requested; only whole blocks of 2048
            are generated, while ``self.total_raw_data_count`` is increased
            by the full ``item_count``.
    """
    partition_dir = os.path.join(self.raw_data_dir,
                                 common.partition_repr(0))
    if not gfile.Exists(partition_dir):
        gfile.MakeDirs(partition_dir)
    self.total_raw_data_count += item_count
    append_seq = 0
    # The manager is not used below; the construction is kept in case it
    # has side effects -- TODO confirm.
    rdm = raw_data_visitor.RawDataManager(self.kvstore, self.data_source, 0)
    raw_data_metas = []
    for block_index in range(0, item_count // 2048):
        source_name = self.data_source.data_source_meta.name
        # The builder receives the base raw-data dir, not the partition
        # sub-directory; presumably it derives the latter itself -- verify.
        builder = DataBlockBuilder(
            self.raw_data_dir, source_name, 0, block_index,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), None
        )
        first_id = begin_index + block_index * 2048
        ids = list(range(first_id, first_id + 2048))
        base_id = ids[0]
        last_pos = len(ids) - 1
        for pos in range(len(ids)):
            # A draw of 1 or 2 attempts a swap; 3 or 4 skips this position.
            if random.randint(1, 4) <= 2:
                # Draw both positions first, then clamp them into the block.
                left = random.randint(pos - 32, pos + 32)
                right = random.randint(pos - 32, pos + 32)
                left = min(max(left, 0), last_pos)
                right = min(max(right, 0), last_pos)
                # Swap only if both ids currently sit within 32 of the id
                # that originally belonged to this position.
                anchor = base_id + pos
                if all(abs(ids[k] - anchor) <= 32 for k in (left, right)):
                    ids[left], ids[right] = ids[right], ids[left]
        for example_idx in ids:
            id_bytes = '{}'.format(example_idx).encode()
            features = {
                'example_id': tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[id_bytes])),
                'event_time': tf.train.Feature(
                    int64_list=tf.train.Int64List(
                        value=[150000000 + example_idx])),
            }
            # The label is always drawn but attached only for draws < 0.8.
            label = random.choice([1, 0])
            if random.random() < 0.8:
                features['label'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label]))
            record = tf.train.Example(
                features=tf.train.Features(feature=features))
            builder.append_item(TfExampleItem(record.SerializeToString()),
                                append_seq, append_seq)
            append_seq += 1
        meta = builder.finish_data_block()
        block_fname = common.encode_data_block_fname(source_name, meta)
        raw_data_metas.append(
            dj_pb.RawDataMeta(
                file_path=os.path.join(partition_dir, block_fname),
                timestamp=timestamp_pb2.Timestamp(seconds=3)))
        self.g_data_block_index += 1
    # Collect the plain files first, then drop everything that is not a
    # data block file.
    entry_paths = (os.path.join(partition_dir, entry)
                   for entry in gfile.ListDirectory(partition_dir))
    plain_files = [path for path in entry_paths
                   if not gfile.IsDirectory(path)]
    for path in plain_files:
        if path.endswith(common.DataBlockSuffix):
            continue
        gfile.Remove(path)
    self.manifest_manager.add_raw_data(0, raw_data_metas, False)
def generate_raw_data(self, etcd, rdp, data_source, partition_id,
                      block_size, shuffle_win_size, feat_key_fmt,
                      feat_val_fmt):
    """Regenerate and publish shuffled raw-data blocks for one partition.

    The partition directory under ``data_source.raw_data_dir`` is deleted
    if it exists and then recreated.  ``self.total_index // block_size``
    blocks of ``block_size`` examples are written in TF_RECORD format;
    inside each block the example ids are perturbed by random swaps limited
    to ``shuffle_win_size`` positions.  Each example carries ``example_id``
    and ``event_time`` features plus one extra bytes feature whose key and
    value are produced by ``feat_key_fmt`` / ``feat_val_fmt``.

    Afterwards the non-directory entries of the partition directory whose
    path ends with ``common.DataBlockMetaSuffix`` are removed, and the
    generated block paths are published through ``rdp.publish_raw_data``.

    Args:
        etcd: not used by this method.
        rdp: object whose ``publish_raw_data(partition_id, fnames)`` is
            called with the generated block paths.
        data_source: supplies ``raw_data_dir`` and
            ``data_source_meta.name``.
        partition_id: partition to generate data for.
        block_size: number of examples per block.
        shuffle_win_size: maximum displacement used by the local shuffle.
        feat_key_fmt: format string for the extra feature key; formatted
            with the example id.
        feat_val_fmt: format string for the extra feature value; formatted
            with the example id.
    """
    # The manager is not used below; the construction is kept in case it
    # has side effects -- TODO confirm.
    dbm = data_block_manager.DataBlockManager(data_source, partition_id)
    partition_dir = os.path.join(data_source.raw_data_dir,
                                 common.partition_repr(partition_id))
    # Always start from an empty partition directory.
    if gfile.Exists(partition_dir):
        gfile.DeleteRecursively(partition_dir)
    gfile.MakeDirs(partition_dir)
    append_seq = 0
    published_paths = []
    for block_index in range(self.total_index // block_size):
        # The builder receives the base raw-data dir, not the partition
        # sub-directory; presumably it derives the latter itself -- verify.
        builder = DataBlockBuilder(
            data_source.raw_data_dir, data_source.data_source_meta.name,
            partition_id, block_index,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), None
        )
        first_id = block_index * block_size
        ids = list(range(first_id, first_id + block_size))
        base_id = ids[0]
        last_pos = len(ids) - 1
        for pos in range(len(ids)):
            # A draw of 1 or 2 attempts a swap; 3 or 4 skips this position.
            if random.randint(1, 4) <= 2:
                # Draw both positions first, then clamp them into the block.
                left = random.randint(pos - shuffle_win_size,
                                      pos + shuffle_win_size)
                right = random.randint(pos - shuffle_win_size,
                                       pos + shuffle_win_size)
                left = min(max(left, 0), last_pos)
                right = min(max(right, 0), last_pos)
                # Swap only if both ids currently sit within the window
                # around the id that originally belonged to this position.
                anchor = base_id + pos
                if all(abs(ids[k] - anchor) <= shuffle_win_size
                       for k in (left, right)):
                    ids[left], ids[right] = ids[right], ids[left]
        for example_idx in ids:
            id_bytes = '{}'.format(example_idx).encode()
            features = {
                'example_id': tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[id_bytes])),
                'event_time': tf.train.Feature(
                    int64_list=tf.train.Int64List(
                        value=[150000000 + example_idx])),
            }
            # Caller-defined extra feature, derived from the example id.
            features[feat_key_fmt.format(example_idx)] = tf.train.Feature(
                bytes_list=tf.train.BytesList(
                    value=[feat_val_fmt.format(example_idx).encode()]))
            record = tf.train.Example(
                features=tf.train.Features(feature=features))
            builder.append_item(TfExampleItem(record.SerializeToString()),
                                append_seq, append_seq)
            append_seq += 1
        meta = builder.finish_data_block()
        block_fname = common.encode_data_block_fname(
            data_source.data_source_meta.name, meta)
        published_paths.append(os.path.join(partition_dir, block_fname))
    # Collect the plain files first, then drop the block meta files.
    entry_paths = (os.path.join(partition_dir, entry)
                   for entry in gfile.ListDirectory(partition_dir))
    plain_files = [path for path in entry_paths
                   if not gfile.IsDirectory(path)]
    for path in plain_files:
        if not path.endswith(common.DataBlockMetaSuffix):
            continue
        gfile.Remove(path)
    rdp.publish_raw_data(partition_id, published_paths)
def destroy(self):
    """Close the open writer, if any, and delete the temporary file.

    Cleanup runs even when closing the writer raises: ``self._writer`` is
    reset to ``None`` and the temporary file at ``self._tmp_fpath`` is
    removed if it exists, so a failed close does not leave a stale file
    behind.  An exception raised by ``close()`` still propagates to the
    caller once cleanup has finished.
    """
    try:
        if self._writer is not None:
            self._writer.close()
    finally:
        # Drop the writer reference and the temp file no matter how the
        # close attempt ended.
        self._writer = None
        if gfile.Exists(self._tmp_fpath):
            gfile.Remove(self._tmp_fpath)
def destroy(self):
    """Delete the temporary file at ``self._tmp_fpath`` if it exists."""
    tmp_path = self._tmp_fpath
    if not gfile.Exists(tmp_path):
        # Nothing was left behind; there is nothing to clean up.
        return
    gfile.Remove(tmp_path)