Пример #1
0
    def setup_input_file_mirrors(self, hdfs_mirror_dir):
        """Build a FileMirror for every entry in ``self.input_files``.

        Each mirror records the original path, its location on HDFS, and a
        worker-node location (the bare filename), in case the user wishes to
        copy the input file from HDFS to the worker node before processing.

        An input whose path already starts with ``/hdfs`` is used as its own
        HDFS location. Otherwise the HDFS location is the file's basename
        joined onto a destination directory: ``self.manager.hdfs_store`` when
        the file is the manager's exe or setup script and
        ``self.manager.share_exe_setup`` is truthy, else `hdfs_mirror_dir`.

        The exe/setup script are included here because input_file_mirrors is
        used for generate_job_arg_str(), even though they don't get
        transferred by the Job itself.

        The resulting list replaces ``self.input_file_mirrors``.

        Parameters
        ----------
        hdfs_mirror_dir : str
            Location of directory to store mirrored copies.
        """
        def build_mirror(path):
            filename = os.path.basename(path)
            # Shared exe/setup script live in the manager's common store
            # rather than in this job's own mirror directory.
            manager_files = [self.manager.exe, self.manager.setup_script]
            if path in manager_files and self.manager.share_exe_setup:
                target_dir = self.manager.hdfs_store
            else:
                target_dir = hdfs_mirror_dir
            # A file that is already on HDFS acts as its own mirror.
            if path.startswith('/hdfs'):
                hdfs_path = path
            else:
                hdfs_path = os.path.join(target_dir, filename)
            return ht.FileMirror(original=path, hdfs=hdfs_path, worker=filename)

        self.input_file_mirrors = [build_mirror(path) for path in self.input_files]
Пример #2
0
    def setup_common_input_file_mirrors(self, hdfs_mirror_dir):
        """Build a FileMirror for every entry in ``self.common_input_files``.

        Each path is first made absolute with ``os.path.abspath``. The mirror
        then records that absolute path as the original, its location on
        HDFS, and a worker-node location (the bare filename), in case the
        user wishes to copy the input file from HDFS to the worker node
        before processing.

        A path that already starts with ``/hdfs`` is used as its own HDFS
        location; any other file is placed at its basename inside
        `hdfs_mirror_dir`.

        The resulting list replaces ``self.common_input_file_mirrors``.

        Parameters
        ----------
        hdfs_mirror_dir : str
            Location of directory to store mirrored copies.
        """
        collected = []
        for entry in self.common_input_files:
            full_path = os.path.abspath(entry)
            filename = os.path.basename(full_path)
            # A file that is already on HDFS acts as its own mirror.
            if full_path.startswith('/hdfs'):
                hdfs_path = full_path
            else:
                hdfs_path = os.path.join(hdfs_mirror_dir, filename)
            collected.append(
                ht.FileMirror(original=full_path, hdfs=hdfs_path, worker=filename)
            )
        self.common_input_file_mirrors = collected
Пример #3
0
    def setup_output_file_mirrors(self, hdfs_mirror_dir):
        """Build a FileMirror for every entry in ``self.output_files``.

        For an output whose path starts with ``/hdfs``, the HDFS location is
        the path itself and the worker location is just the basename. For
        any other output, the HDFS location is the basename inside
        `hdfs_mirror_dir` and the worker location is the original path.

        The resulting list replaces ``self.output_file_mirrors``.

        Parameters
        ----------
        hdfs_mirror_dir : str
            Location of directory to store mirrored copies.
        """
        collected = []
        for path in self.output_files:
            filename = os.path.basename(path)
            if path.startswith('/hdfs'):
                # Output can't be streamed to HDFS, so the worker writes a
                # local copy named after the basename.
                hdfs_path = path
                worker_path = filename
            else:
                # Open question from the original author: is this sensible?
                # Should this be join(hdfs_mirror_dir, path) rather than
                # only the basename?
                hdfs_path = os.path.join(hdfs_mirror_dir, filename)
                worker_path = path
            collected.append(
                ht.FileMirror(original=path, hdfs=hdfs_path, worker=worker_path)
            )
        self.output_file_mirrors = collected