def _check_partitioner(self, map_task):
    output_partitions = gfile.ListDirectory(map_task.output_base_dir)
    output_partitions = [x for x in output_partitions if "SUCCESS" not in x]
    self.assertEqual(len(output_partitions), map_task.output_partition_num)
    partition_dirs = ["{}/{}".format(map_task.output_base_dir, x)
                      for x in output_partitions]
    total_cnt = 0
    for partition in output_partitions:
        dpath = "{}/{}".format(map_task.output_base_dir, partition)
        partition_id = partition.split("_")[-1]
        partition_id = int(partition_id)
        segments = gfile.ListDirectory(dpath)
        for segment in segments:
            fpath = "{}/{}".format(dpath, segment)
            event_time = 0
            for record in tf.python_io.tf_record_iterator(fpath):
                tf_item = TfExampleItem(record)
                ## assert order
                self.assertTrue(
                    tf_item.event_time >= event_time,
                    "{}, {}".format(tf_item.event_time, event_time))
                event_time = tf_item.event_time
                self.assertEqual(partition_id,
                                 CityHash32(tf_item.raw_id)
                                 % map_task.output_partition_num)
                total_cnt += 1
    self.assertEqual(total_cnt,
                     self._partition_item_num * self._input_partition_num)

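# A minimal sketch (hypothetical ids) of the partitioning contract the test
# above verifies: each record must land in the partition chosen by hashing its
# raw_id with CityHash32 and taking the result modulo the partition count.
# Assumes the same CityHash32 binding that the test imports.
from cityhash import CityHash32

def expected_partition(raw_id, num_partitions):
    # raw_id is the record key (bytes or str, matching what the writer hashed).
    return CityHash32(raw_id) % num_partitions

# e.g. expected_partition(b"example_0001", 4) gives the partition index the
# record should have been written under.
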
def _sync_merged_state(self):
    self._create_merged_dir_if_need()
    found_tmp = False
    fnames = gfile.ListDirectory(self._merged_dir)
    metas = []
    for fname in fnames:
        if fname.endswith(common.TmpFileSuffix):
            found_tmp = True
        if fname.endswith(common.RawDataFileSuffix):
            meta = MergedSortRunMeta.decode_sort_run_meta_from_fname(fname)
            metas.append(meta)
    metas.sort()
    if not found_tmp:
        metas = metas[:-1]
    if len(metas) == 0:
        return None, 0
    last_meta = metas[-1]
    fpath = os.path.join(self._merged_dir,
                         last_meta.encode_merged_sort_run_fname())
    last_item = None
    for item in SortRunReader(0, fpath, self._options.reader_options,
                              self._comparator):
        last_item = item
    assert last_item is not None
    return last_item, last_meta.process_index + 1

def _publish_raw_data(self, job_id):
    portal_manifest = self._sync_portal_manifest()
    output_dir = None
    if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
        output_dir = common.portal_map_output_dir(
            portal_manifest.output_base_dir, job_id)
    else:
        output_dir = common.portal_reduce_output_dir(
            portal_manifest.output_base_dir, job_id)
    for partition_id in range(self._output_partition_num):
        dpath = path.join(output_dir, common.partition_repr(partition_id))
        fnames = []
        if gfile.Exists(dpath) and gfile.IsDirectory(dpath):
            fnames = [f for f in gfile.ListDirectory(dpath)
                      if f.endswith(common.RawDataFileSuffix)]
        publish_fpaths = []
        if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
            publish_fpaths = self._publish_psi_raw_data(
                partition_id, dpath, fnames)
        else:
            publish_fpaths = self._publish_streaming_raw_data(
                partition_id, dpath, fnames)
        logging.info("Data Portal Master publishes %d files for partition "
                     "%d of streaming job %d\n----------\n",
                     len(publish_fpaths), partition_id, job_id)
        for seq, fpath in enumerate(publish_fpaths):
            logging.info("%d. %s", seq, fpath)
        logging.info("------------------------------------------\n")

def test_list_directory(self):
    """Test list directory."""
    # Setup and check preconditions.
    gfile.MkDir(self.prefix() + ":///test_list_directory")
    gfile.MkDir(self.prefix() + ":///test_list_directory/2")
    gfile.MkDir(self.prefix() + ":///test_list_directory/4")
    dir_name = self.prefix() + ":///test_list_directory"
    file_names = [
        self.prefix() + ":///test_list_directory/1",
        self.prefix() + ":///test_list_directory/2/3"
    ]
    ch_dir_names = [
        self.prefix() + ":///test_list_directory/4",
    ]
    for file_name in file_names:
        with gfile.Open(file_name, mode="w") as w:
            w.write("")
    for ch_dir_name in ch_dir_names:
        gfile.MkDir(ch_dir_name)
    ls_expected_result = file_names + ch_dir_names
    # Get list of files in directory.
    ls_result = gfile.ListDirectory(dir_name)
    # Check that list of files is correct.
    self.assertEqual(len(ls_expected_result), len(ls_result))
    for e in ["1", "2", "4"]:
        self.assertTrue(e in ls_result,
                        msg="Result doesn't contain '%s'" % e)

def _list_input_dir(self):
    input_dir = self._portal_manifest.input_base_dir
    fnames = gfile.ListDirectory(input_dir)
    if len(self._portal_manifest.input_file_wildcard) > 0:
        wildcard = self._portal_manifest.input_file_wildcard
        fnames = [f for f in fnames if fnmatch(f, wildcard)]
    return [path.join(input_dir, f) for f in fnames]

def _check_merged(self):
    merge_dir = os.path.join(self._options.output_file_dir,
                             common.partition_repr(self._partition_id))
    merged_fname = common.encode_merged_sort_run_fname(self._partition_id)
    return len([f for f in gfile.ListDirectory(merge_dir)
                if (os.path.basename(f) == merged_fname or
                    os.path.basename(f) == '_SUCCESS')]) > 0

def get_prefix_kvs(self, prefix, ignore_prefix=False):
    kvs = []
    target_path = self._generate_path(prefix, with_meta=False)
    cur_paths = [target_path]
    children_paths = []
    while cur_paths:
        for path in cur_paths:
            filenames = []
            try:
                if gfile.IsDirectory(path):
                    filenames = gfile.ListDirectory(path)
            except Exception as e:  # pylint: disable=broad-except
                logging.warning("get prefix kvs %s failed, reason: %s",
                                path, str(e))
                break
            for filename in sorted(filenames):
                file_path = "/".join([path, filename])
                if gfile.IsDirectory(file_path):
                    children_paths.append(file_path)
                else:
                    if ignore_prefix and path == target_path:
                        continue
                    nkey = self.normalize_output_key(
                        path, self._base_dir).encode()
                    with gfile.Open(file_path, 'rb') as file:
                        kvs.append((nkey, file.read()))
        cur_paths = children_paths
        children_paths = []
    return kvs

def validate_holdout_selfplay():
    """Validate on held-out selfplay data."""
    holdout_dirs = (
        os.path.join(fsdb.holdout_dir(), d)
        for d in reversed(gfile.ListDirectory(fsdb.holdout_dir()))
        if gfile.IsDirectory(os.path.join(fsdb.holdout_dir(), d))
        for f in gfile.ListDirectory(os.path.join(fsdb.holdout_dir(), d)))

    # This is a roundabout way of computing how many hourly directories we
    # need to read in order to encompass 20,000 holdout games.
    holdout_dirs = set(itertools.islice(holdout_dirs, 20000))

    cmd = ['python3', 'validate.py'] + list(holdout_dirs) + [
        '--use_tpu',
        '--tpu_name={}'.format(TPU_NAME),
        '--flagfile=rl_loop/distributed_flags',
        '--expand_validation_dirs']
    mask_flags.run(cmd)

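# A simplified, self-contained sketch (plain os instead of gfile, hypothetical
# paths) of why the islice call above must wrap the generator: islice caps the
# lazy listing at n entries before set() materializes it, so the whole holdout
# tree is never enumerated eagerly.
import itertools
import os

def first_n_dirs(root, n):
    # Lazily yield sub-directory paths of `root`, newest name first.
    gen = (os.path.join(root, d)
           for d in sorted(os.listdir(root), reverse=True)
           if os.path.isdir(os.path.join(root, d)))
    return set(itertools.islice(gen, n))
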
def generate_leader_raw_data(self):
    dbm = data_block_manager.DataBlockManager(self.data_source_l, 0)
    raw_data_dir = os.path.join(self.data_source_l.raw_data_dir,
                                common.partition_repr(0))
    if gfile.Exists(raw_data_dir):
        gfile.DeleteRecursively(raw_data_dir)
    gfile.MakeDirs(raw_data_dir)
    rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source_l, 0)
    block_index = 0
    builder = create_data_block_builder(
        dj_pb.DataBlockBuilderOptions(
            data_block_builder='TF_RECORD_DATABLOCK_BUILDER'),
        self.data_source_l.raw_data_dir,
        self.data_source_l.data_source_meta.name,
        0, block_index, None)
    process_index = 0
    start_index = 0
    for i in range(0, self.leader_end_index + 3):
        if (i > 0 and i % 2048 == 0) or (i == self.leader_end_index + 2):
            meta = builder.finish_data_block()
            if meta is not None:
                ofname = common.encode_data_block_fname(
                    self.data_source_l.data_source_meta.name, meta)
                fpath = os.path.join(raw_data_dir, ofname)
                self.manifest_manager.add_raw_data(
                    0,
                    [dj_pb.RawDataMeta(
                        file_path=fpath,
                        timestamp=timestamp_pb2.Timestamp(seconds=3))],
                    False)
                process_index += 1
                start_index += len(meta.example_ids)
            block_index += 1
            builder = create_data_block_builder(
                dj_pb.DataBlockBuilderOptions(
                    data_block_builder='TF_RECORD_DATABLOCK_BUILDER'),
                self.data_source_l.raw_data_dir,
                self.data_source_l.data_source_meta.name,
                0, block_index, None)
        feat = {}
        pt = i + 1 << 30
        if i % 3 == 0:
            pt = i // 3
        example_id = '{}'.format(pt).encode()
        feat['example_id'] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[example_id]))
        event_time = 150000000 + pt
        feat['event_time'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[event_time]))
        example = tf.train.Example(features=tf.train.Features(feature=feat))
        builder.append_record(example.SerializeToString(), example_id,
                              event_time, i, i)
    fpaths = [os.path.join(raw_data_dir, f)
              for f in gfile.ListDirectory(raw_data_dir)
              if not gfile.IsDirectory(os.path.join(raw_data_dir, f))]
    for fpath in fpaths:
        if not fpath.endswith(common.DataBlockSuffix):
            gfile.Remove(fpath)

def _list_data_block(self, partition_id):
    dirpath = self._partition_data_block_dir(partition_id)
    if gfile.Exists(dirpath) and gfile.IsDirectory(dirpath):
        return [f for f in gfile.ListDirectory(dirpath)
                if f.endswith(DataBlockSuffix)]
    return []

def _make_merge_options(self, task):
    merge_options = self._options.merge_options
    merge_options.output_builder = "TF_RECORD"
    merge_options.input_dir = os.path.join(
        task.map_base_dir, common.partition_repr(task.partition_id))
    merge_options.output_dir = task.reduce_base_dir
    merge_options.partition_id = task.partition_id
    merge_options.fpath.extend(
        gfile.ListDirectory(merge_options.input_dir))
    return merge_options

def _list_merged_sort_run_fpath(self):
    metas = [MergedSortRunMeta.decode_sort_run_meta_from_fname(f)
             for f in gfile.ListDirectory(self._merged_dir)
             if f.endswith(common.RawDataFileSuffix)]
    metas.sort()
    return [os.path.join(self._merged_dir,
                         meta.encode_merged_sort_run_fname())
            for meta in metas]

def __init__(self, options, partition_id):
    self._readers = []
    self._options = options
    self._partition_id = partition_id
    self._queue = queue.PriorityQueue(options.merge_buffer_size)
    self._active_fpath = set()
    self._fpaths = gfile.ListDirectory(self._options.input_dir)
    self._fpath_num = len(self._fpaths)
    self._writer = Merge.OutputFileWriter(self._options, self._partition_id)
    self._prepare()

def _list_input_dir(self):
    all_inputs = []
    wildcard = self._portal_manifest.input_file_wildcard
    dirs = [self._portal_manifest.input_base_dir]
    num_dirs = 0
    num_files = 0
    num_target_files = 0
    while len(dirs) > 0:
        fdir = dirs[0]
        dirs = dirs[1:]
        # filter directories starting with '_' (e.g. _tmp)
        # TODO: format the inputs' directory name
        if fdir.startswith('_'):
            continue
        fnames = gfile.ListDirectory(fdir)
        for fname in fnames:
            fpath = path.join(fdir, fname)
            # OSS does not retain folder structure.
            # For example, if we have file oss://test/1001/a.txt,
            # list(oss://test) returns 1001/a.txt instead of 1001.
            basename = path.basename(fpath)
            # filter entries starting with '_' (e.g. _tmp/_SUCCESS)
            # TODO: format the inputs' directory name
            if basename.startswith('_'):
                continue
            if gfile.IsDirectory(fpath):
                dirs.append(fpath)
                num_dirs += 1
                continue
            num_files += 1
            if len(wildcard) == 0 or fnmatch(basename, wildcard):
                num_target_files += 1
                if self._check_success_tag:
                    has_succ = gfile.Exists(
                        path.join(path.dirname(fpath), '_SUCCESS'))
                    if not has_succ:
                        logging.warning(
                            'File %s skipped because _SUCCESS file is '
                            'missing under %s', fpath, fdir)
                        continue
                all_inputs.append(fpath)
    rest_fpaths = []
    for fpath in all_inputs:
        if fpath not in self._processed_fpath:
            rest_fpaths.append(fpath)
    logging.info(
        'Listing %s: found %d dirs, %d files, %d files matching wildcard, '
        '%d files with success tag, %d new files to process',
        self._portal_manifest.input_base_dir, num_dirs, num_files,
        num_target_files, len(all_inputs), len(rest_fpaths))
    return rest_fpaths

def _list_dumper_output_dir(self):
    output_dir = self._get_output_dir()
    if gfile.Exists(output_dir):
        assert gfile.IsDirectory(output_dir)
        all_files = gfile.ListDirectory(output_dir)
        for f in all_files:
            if f.endswith(TmpFileSuffix):
                gfile.Remove(path.join(output_dir, f))
        return [f for f in all_files if f.endswith(DoneFileSuffix)]
    gfile.MakeDirs(output_dir)
    return []

def _list_file_metas(self, partition_id):
    dumped_dir = os.path.join(self._options.output_dir,
                              common.partition_repr(partition_id))
    if not gfile.Exists(dumped_dir):
        gfile.MakeDirs(dumped_dir)
    assert gfile.IsDirectory(dumped_dir)
    fnames = [os.path.basename(f)
              for f in gfile.ListDirectory(dumped_dir)
              if f.endswith(common.RawDataFileSuffix)]
    metas = [RawDataPartitioner.FileMeta.decode_meta_from_fname(f)
             for f in fnames]
    return [meta for meta in metas
            if meta.rank_id == self._options.partitioner_rank_id]

def _list_dir_helper_oss(self, root):
    # OSS returns a file multiple times, e.g. listdir('root') returns
    # ['folder', 'file1.txt', 'folder/file2.txt'],
    # and then listdir('root/folder') returns ['file2.txt'].
    filenames = set(path.join(root, i) for i in gfile.ListDirectory(root))
    res = []
    for fname in filenames:
        succ = path.join(path.dirname(fname), '_SUCCESS')
        if succ in filenames or not gfile.IsDirectory(fname):
            res.append(fname)
    return res

def _preload_raw_data_file_path(self):
    if self.mode == "distribute":
        if not gfile.Exists(self._local_raw_dat_dir):
            gfile.MakeDirs(self._local_raw_dat_dir)
        os.system("hadoop fs -get {0}/* {1} ".format(
            self._raw_data_dir, self._local_raw_dat_dir))
    self._all_fpath = [
        path.join(self._local_raw_dat_dir, f)
        for f in gfile.ListDirectory(self._local_raw_dat_dir)
        if not gfile.IsDirectory(path.join(self._local_raw_dat_dir, f))
    ]
    logging.info("all raw data paths: {}".format(self._all_fpath))
    self._all_fpath.sort()

def _run_reduce_task(self, task):
    merger_options = self._make_merger_options(task)
    sort_run_merger = SortRunMerger(merger_options, 'event_time')
    input_dir = os.path.join(task.map_base_dir,
                             common.partition_repr(task.partition_id))
    input_fpaths = [os.path.join(input_dir, f)
                    for f in gfile.ListDirectory(input_dir)
                    if f.endswith(common.RawDataFileSuffix)]
    logging.info("Merger input_dir:%s(with %d files) rank_id:%s "
                 "partition_id:%d start",
                 task.map_base_dir, len(input_fpaths),
                 self._rank_id, task.partition_id)
    sort_run_merger.merge_sort_runs(input_fpaths)

def _list_input_dir(self):
    all_inputs = []
    wildcard = self._portal_manifest.input_file_wildcard
    dirs = [self._portal_manifest.input_base_dir]
    while len(dirs) > 0:
        fdir = dirs[0]
        dirs = dirs[1:]
        fnames = gfile.ListDirectory(fdir)
        for fname in fnames:
            fpath = path.join(fdir, fname)
            if gfile.IsDirectory(fpath):
                dirs.append(fpath)
            elif len(wildcard) == 0 or fnmatch(fname, wildcard):
                all_inputs.append(fpath)
    return all_inputs

def parse_data_block_dir(self, data_block_dir, role="leader"):
    dir_path_list = [
        path.join(data_block_dir, f)
        for f in gfile.ListDirectory(data_block_dir)
        if gfile.IsDirectory(path.join(data_block_dir, f))
    ]
    for dir_path in dir_path_list:
        if role == "leader":
            self.leader_file_path_list += [
                path.join(dir_path, f)
                for f in gfile.ListDirectory(dir_path)
                if f.split(".")[-1] == "data" and
                not gfile.IsDirectory(path.join(dir_path, f))
            ]
        else:
            self.follower_file_path_list += [
                path.join(dir_path, f)
                for f in gfile.ListDirectory(dir_path)
                if f.split(".")[-1] == "data" and
                not gfile.IsDirectory(path.join(dir_path, f))
            ]
    self.leader_file_path_list.sort()
    self.follower_file_path_list.sort()

def _check_merge(self, reduce_task):
    dpath = os.path.join(self._merge_output_dir,
                         common.partition_repr(reduce_task.partition_id))
    fpaths = gfile.ListDirectory(dpath)
    fpaths = sorted(fpaths, key=lambda fpath: fpath, reverse=False)
    event_time = 0
    total_cnt = 0
    for fpath in fpaths:
        fpath = os.path.join(dpath, fpath)
        logging.info("check merge path:{}".format(fpath))
        for record in tf.python_io.tf_record_iterator(fpath):
            tf_item = TfExampleItem(record)
            self.assertTrue(tf_item.event_time >= event_time)
            event_time = tf_item.event_time
            total_cnt += 1
    return total_cnt

def _list_dir_helper(self, root):
    filenames = list(gfile.ListDirectory(root))
    # If _SUCCESS is present, we assume there are no subdirs
    if '_SUCCESS' in filenames:
        return [path.join(root, i) for i in filenames]
    res = []
    for basename in filenames:
        fname = path.join(root, basename)
        if gfile.IsDirectory(fname):
            # ignore tmp dirs starting with '_'
            if basename.startswith('_'):
                continue
            res += self._list_dir_helper(fname)
        else:
            res.append(fname)
    return res

def _publish_raw_data(self, job_id):
    portal_manifest = self._sync_portal_manifest()
    output_dir = None
    if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
        output_dir = common.portal_map_output_dir(
            portal_manifest.output_base_dir, portal_manifest.name, job_id)
    else:
        output_dir = common.portal_reduce_output_dir(
            portal_manifest.output_base_dir, portal_manifest.name, job_id)
    for partition_id in range(self._output_partition_num):
        dpath = path.join(output_dir, common.partition_repr(partition_id))
        fpaths = [path.join(dpath, f)
                  for f in gfile.ListDirectory(dpath)
                  if f.endswith(common.RawDataFileSuffix)]
        self._publisher.publish_raw_data(partition_id, fpaths)
        if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
            self._publisher.finish_raw_data(partition_id)

def _run_reduce_task(self, task):
    merger_options = self._make_merger_options(task)
    sort_run_merger = SortRunMerger(merger_options, self._merger_comparator)
    input_dir = os.path.join(task.map_base_dir,
                             common.partition_repr(task.partition_id))
    input_fpaths = [os.path.join(input_dir, f)
                    for f in gfile.ListDirectory(input_dir)
                    if f.endswith(common.RawDataFileSuffix)]
    logging.info("Merger rank_id-[%d] start run task %s for partition "
                 "%d. input_dir %s, with %d files",
                 self._rank_id, merger_options.merger_name,
                 task.partition_id, task.map_base_dir, len(input_fpaths))
    sort_run_merger.merge_sort_runs(input_fpaths)
    logging.info("Merger rank_id-[%d] finish task %s for partition %d",
                 self._rank_id, merger_options.merger_name,
                 task.partition_id)
    del sort_run_merger
    gc.collect()

def _preload_example_id_meta(self):
    fdir = self._example_dumped_dir()
    fpaths = [os.path.join(fdir, f)
              for f in gfile.ListDirectory(fdir)
              if f.endswith(DoneFileSuffix)]
    index_metas = []
    for fpath in fpaths:
        index_meta = decode_index_meta(fpath)
        assert index_meta is not None, \
            "the index meta should not be None if decoding succeeded"
        index_metas.append(index_meta)
    index_metas = sorted(index_metas, key=lambda meta: meta.start_index)
    for index, index_meta in enumerate(index_metas):
        if index != index_meta.process_index:
            logging.fatal("%s has wrong process index, expected %d",
                          index_meta.fpath, index)
            traceback.print_stack()
            os._exit(-1)  # pylint: disable=protected-access
    return index_metas

def _load_data(self):
    dataset = np.zeros((24 * 4 * 183, 64, 64, 3))
    all_files = [x for x in gfile.ListDirectory(CARS3D_PATH) if ".mat" in x]
    for i, filename in enumerate(all_files):
        data_mesh = _load_mesh(filename)
        factor1 = np.array(list(range(4)))
        factor2 = np.array(list(range(24)))
        all_factors = np.transpose([
            np.tile(factor1, len(factor2)),
            np.repeat(factor2, len(factor1)),
            np.tile(i, len(factor1) * len(factor2))
        ])
        indexes = self.index.features_to_index(all_factors)
        dataset[indexes] = data_mesh
    return dataset

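# Sketch of the factor grid built above, shown with tiny factor sizes so the
# tile/repeat/transpose layout can be checked by hand (numpy only, hypothetical
# values; the real code uses 4 and 24 factor values per .mat file).
import numpy as np

factor1 = np.arange(2)          # e.g. 2 values of the first factor
factor2 = np.arange(3)          # e.g. 3 values of the second factor
file_index = 7                  # stands in for the enumerate() index above
grid = np.transpose([
    np.tile(factor1, len(factor2)),                    # 0 1 0 1 0 1
    np.repeat(factor2, len(factor1)),                  # 0 0 1 1 2 2
    np.tile(file_index, len(factor1) * len(factor2)),  # 7 7 7 7 7 7
])
# grid has one row per (factor1, factor2) combination for this file:
# [[0 0 7] [1 0 7] [0 1 7] [1 1 7] [0 2 7] [1 2 7]]
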
def aggregate_json_results(base_path):
    """Aggregates all the result files in a directory into a namespaced dict.

    Args:
        base_path: String with the directory containing JSON files that only
            contain dictionaries.

    Returns:
        Namespaced dictionary with the results.
    """
    result = {}
    compiled_pattern = re.compile(r"(.*)\.json")
    for filename in gfile.ListDirectory(base_path):
        match = compiled_pattern.match(filename)
        if match:
            path = os.path.join(base_path, filename)
            with tf.gfile.GFile(path, "r") as f:
                result[match.group(1)] = json.load(f)
    return namespaced_dict(**result)

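# Hedged usage sketch for aggregate_json_results: write two small JSON files
# (hypothetical names) into a temporary directory and aggregate them. The keys
# of the result come from the file stems, e.g. "lr" and "beta"; the exact
# return type depends on namespaced_dict, which is defined elsewhere in the
# same library.
import json
import os
import tempfile

tmp_dir = tempfile.mkdtemp()
with open(os.path.join(tmp_dir, "lr.json"), "w") as f:
    json.dump({"value": 0.001}, f)
with open(os.path.join(tmp_dir, "beta.json"), "w") as f:
    json.dump({"value": 4}, f)
aggregated = aggregate_json_results(tmp_dir)
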
def _publish_raw_data(self, job_id):
    portal_manifest = self._sync_portal_manifest()
    output_dir = None
    if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
        output_dir = common.portal_map_output_dir(
            portal_manifest.output_base_dir, job_id)
    else:
        output_dir = common.portal_reduce_output_dir(
            portal_manifest.output_base_dir, job_id)
    for partition_id in range(self._output_partition_num):
        dpath = path.join(output_dir, common.partition_repr(partition_id))
        fnames = []
        if gfile.Exists(dpath) and gfile.IsDirectory(dpath):
            fnames = [f for f in gfile.ListDirectory(dpath)
                      if f.endswith(common.RawDataFileSuffix)]
        if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
            self._publish_psi_raw_data(partition_id, dpath, fnames)
        else:
            self._publish_streaming_raw_data(partition_id, dpath, fnames)

def test_list_directory(self):
    """Test list directory."""
    # Setup and check preconditions.
    dir_name = "igfs:///test_list_directory/"
    file_names = [
        "igfs:///test_list_directory/1",
        "igfs:///test_list_directory/2/3"
    ]
    ch_dir_names = [
        "igfs:///test_list_directory/4",
    ]
    for file_name in file_names:
        with gfile.Open(file_name, mode="w") as w:
            w.write("")
    for ch_dir_name in ch_dir_names:
        gfile.MkDir(ch_dir_name)
    ls_expected_result = file_names + ch_dir_names
    # Get list of files in directory.
    ls_result = gfile.ListDirectory(dir_name)
    # Check that list of files is correct.
    self.assertEqual(len(ls_expected_result), len(ls_result))
    for e in ["1", "2", "4"]:
        self.assertTrue(e in ls_result)