Exemplo n.º 1
0
    def __init__(self):
        """
        Record the current process id and resolve the dataset directories.

        The remote and local dataset directories are obtained by passing
        the ``${FUEL_DATA_PATH}`` and ``${FUEL_LOCAL_DATA_PATH}``
        placeholders through ``string_utils.preprocess``. If that raises
        one of the handled errors, both directories are set to the empty
        string. A debug message is logged whenever either directory ends
        up empty.
        """
        self.pid = os.getpid()

        remote_template = '${FUEL_DATA_PATH}'
        local_template = '${FUEL_LOCAL_DATA_PATH}'
        try:
            self.dataset_remote_dir = string_utils.preprocess(remote_template)
            self.dataset_local_dir = string_utils.preprocess(local_template)
        except (ValueError, string_utils.NoDataPathError,
                string_utils.EnvironmentVariableError):
            # Resolution failed: treat the local cache as switched off
            self.dataset_remote_dir = self.dataset_local_dir = ""

        resolved_dirs = (self.dataset_remote_dir, self.dataset_local_dir)
        if any(directory == "" for directory in resolved_dirs):
            log.debug("Local dataset cache is deactivated")
Exemplo n.º 2
0
    def cache_file(self, filename):
        """
        Cache a file locally if possible.

        If caching was successful, or if the file was previously
        successfully cached, this method returns the path to the local copy
        of the file. If not, it returns the path to the original file.

        Parameters
        ----------
        filename : string
            Remote file to cache locally

        Returns
        -------
        output : string
            Updated (if needed) filename to use to access the remote
            file.

        Notes
        -----
        When the local copy is returned, a readlock has been obtained on it
        and is not released by this method. The writelock taken while the
        local copy is inspected or created is always released before this
        method exits, including when an exception propagates.
        """

        remote_name = string_utils.preprocess(filename)

        # Check if a local directory for data has been defined. Otherwise,
        # do not locally copy the data
        if self.dataset_local_dir == "":
            return filename

        common_msg = ("Message from Pylearn2 local cache of dataset"
                      "(specified by the environment variable "
                      "FUEL_LOCAL_DATA_PATH): ")
        # Make sure the file to cache exists and really is a file
        if not os.path.exists(remote_name):
            log.error("Error : Specified file %s does not exist" %
                      remote_name)
            return filename

        if not os.path.isfile(remote_name):
            log.error("Error : Specified name %s is not a file" %
                      remote_name)
            return filename

        # Only files under the remote dataset directory are cached. The
        # string prefix test alone is not sufficient: a sibling directory
        # sharing the prefix, or a name containing ".." components, passes
        # it while its relative path climbs out of the remote directory,
        # which would place the copy outside the local cache directory.
        relative_name = None
        if remote_name.startswith(self.dataset_remote_dir):
            relative_name = os.path.relpath(remote_name,
                                            self.dataset_remote_dir)
            if (relative_name == os.pardir or
                    relative_name.startswith(os.pardir + os.sep)):
                relative_name = None
        if relative_name is None:
            log.warning(
                common_msg +
                "We cache in the local directory only what is"
                " under $FUEL_DATA_PATH: %s" %
                remote_name)
            return filename

        # Create the $FUEL_LOCAL_DATA_PATH folder if needed
        self.safe_mkdir(self.dataset_local_dir,
                        (stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR |
                         stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP |
                         stat.S_IROTH | stat.S_IWOTH | stat.S_IXOTH))

        # Determine local path to which the file is to be cached
        local_name = os.path.join(self.dataset_local_dir, relative_name)

        # Create the folder structure to receive the remote file
        local_folder = os.path.split(local_name)[0]
        try:
            self.safe_mkdir(local_folder)
        except Exception as e:
            # Best effort: failing to prepare the folder only disables
            # caching for this file, but the cause is reported.
            log.warning(
                (common_msg +
                 "While creating the directory %s, we got an error (%s)."
                 " We won't cache to the local disk.") % (local_folder, e))
            return filename

        # Acquire writelock on the local file to prevent the possibility
        # of any other process modifying it while we cache it if needed.
        # Also, if another process is currently caching the same file,
        # it forces the current process to wait for it to be done before
        # using the file.
        if not os.access(local_folder, os.W_OK):
            log.warning(common_msg +
                        "Local folder %s isn't writable."
                        " This is needed for synchronization."
                        " We will use the remote version."
                        " Manually fix the permission."
                        % local_folder)
            return filename
        self.get_writelock(local_name)

        # From here on the writelock is held; the finally clause releases
        # it on every exit path, including unexpected exceptions.
        try:
            if not os.path.exists(local_name):
                # The file does not exist locally, consider creating it.
                # Check that there is enough space to cache the file
                if not self.check_enough_space(remote_name, local_name):
                    log.warning(common_msg +
                                "File %s not cached: Not enough free space" %
                                remote_name)
                    return filename

                # There is enough space; make a local copy of the file
                self.copy_from_server_to_local(remote_name, local_name)
                log.info(common_msg +
                         "File %s has been locally cached to %s" %
                         (remote_name, local_name))
            else:
                # A local copy exists: use it only if it is not older than
                # the remote file, has the same size and is readable.
                remote_mtime = os.path.getmtime(remote_name)
                local_mtime = os.path.getmtime(local_name)
                if remote_mtime > local_mtime:
                    time_format = '%Y-%m-%d %H:%M:%S'
                    log.warning(common_msg +
                                "File %s in cache will not be used: The "
                                "remote file "
                                "(modified %s) is newer than the locally "
                                "cached file "
                                "%s (modified %s)."
                                % (remote_name,
                                   time.strftime(
                                       time_format,
                                       time.localtime(remote_mtime)),
                                   local_name,
                                   time.strftime(
                                       time_format,
                                       time.localtime(local_mtime))))
                    return filename

                remote_size = os.path.getsize(remote_name)
                local_size = os.path.getsize(local_name)
                if local_size != remote_size:
                    log.warning(common_msg +
                                "File %s not cached: The remote file "
                                "(%d bytes) is of "
                                "a different size than the locally cached "
                                "file %s "
                                "(%d bytes). The local cache might be "
                                "corrupt."
                                % (remote_name, remote_size,
                                   local_name, local_size))
                    return filename

                if not os.access(local_name, os.R_OK):
                    log.warning(common_msg +
                                "File %s in cache isn't readable. We will "
                                "use the"
                                " remote version. Manually fix the "
                                "permission."
                                % (local_name))
                    return filename

                log.debug("File %s has previously been locally cached to %s"
                          % (remote_name, local_name))

            # Obtain a readlock on the downloaded file before releasing the
            # writelock. This is to prevent having a moment where there is
            # no lock on this file which could give the impression that it
            # is unused and therefore safe to delete.
            self.get_readlock(local_name)
        finally:
            self.release_writelock()

        return local_name