def _input_paths_for_step(self, step_num):
    """Return the input file paths for the given step.

    The first step (``step_num == 0``) reads the job's own input path
    globs; every later step reads the ``part-*`` files produced by the
    previous step's output directory.
    """
    if step_num != 0:
        # read the previous step's output files
        prev_output_glob = join(
            self._output_dir_for_step(step_num - 1), 'part-*')
        return self.fs.ls(prev_output_glob)

    paths = []
    for glob_pattern in self._get_input_paths():
        for matched in self.fs.ls(glob_pattern):
            # *matched* could be a file:// URI; strip the scheme
            paths.append(_from_file_uri(matched))
    return paths
def _create_dist_cache_dir(self, step_num):
    """Copy working directory files into a shared directory, simulating
    the way Hadoop's Distributed Cache works on nodes.
    """
    cache_dir = self._dist_cache_dir(step_num)
    log.debug('creating simulated Distributed Cache dir: %s' % cache_dir)
    self.fs.mkdir(cache_dir)

    # plain files: copy into the cache dir, then make them
    # readable/executable by the owner
    for name, src in self._working_dir_mgr.name_to_path('file').items():
        src = _from_file_uri(src)  # might start with file://
        dest = self._path_in_dist_cache_dir(name, step_num)
        log.debug('copying %s -> %s' % (src, dest))
        shutil.copy(src, dest)
        _chmod_u_rx(dest)

    # archives: unpack into the cache dir, then chmod the whole tree
    for name, src in self._working_dir_mgr.name_to_path(
            'archive').items():
        src = _from_file_uri(src)  # might start with file://
        dest = self._path_in_dist_cache_dir(name, step_num)
        log.debug('unarchiving %s -> %s' % (src, dest))
        unarchive(src, dest)
        _chmod_u_rx(dest, recursive=True)