def __init__(self):
    """
    Set up the local dataset cache.

    Records the current process id in ``self.pid`` and resolves the
    remote and local dataset directories by passing the
    ``${FUEL_DATA_PATH}`` and ``${FUEL_LOCAL_DATA_PATH}`` templates
    through ``string_utils.preprocess``. If either lookup raises
    ``ValueError``, ``NoDataPathError`` or ``EnvironmentVariableError``,
    both directories are set to the empty string, which the rest of the
    class treats as "cache deactivated".
    """
    self.pid = os.getpid()

    try:
        self.dataset_remote_dir = string_utils.preprocess(
            '${FUEL_DATA_PATH}')
        self.dataset_local_dir = string_utils.preprocess(
            '${FUEL_LOCAL_DATA_PATH}')
    except (ValueError,
            string_utils.NoDataPathError,
            string_utils.EnvironmentVariableError):
        # At least one of the two paths could not be resolved: fall back
        # to the "no cache" configuration for both of them.
        self.dataset_remote_dir = ""
        self.dataset_local_dir = ""

    # The cache is usable only when both directories are non-empty.
    cache_is_active = (self.dataset_remote_dir != "" and
                       self.dataset_local_dir != "")
    if not cache_is_active:
        log.debug("Local dataset cache is deactivated")
def cache_file(self, filename):
    """
    Caches a file locally if possible.

    If caching was successful, or if the file was previously successfully
    cached, this method returns the path to the local copy of the file.
    If not, it returns the path to the original file.

    When the local copy is returned, ``self.get_readlock`` has been
    called on it; this method does not release that read lock. The
    write lock taken while inspecting/creating the local copy is always
    released before returning, including when an exception propagates.

    Parameters
    ----------
    filename : string
        Remote file to cache locally

    Returns
    -------
    output : string
        Updated (if needed) filename to use to access the remote file.
    """
    remote_name = string_utils.preprocess(filename)

    # Check if a local directory for data has been defined. Otherwise,
    # do not locally copy the data
    if self.dataset_local_dir == "":
        return filename

    common_msg = ("Message from Pylearn2 local cache of dataset"
                  "(specified by the environment variable "
                  "FUEL_LOCAL_DATA_PATH): ")

    # Make sure the file to cache exists and really is a file
    if not os.path.exists(remote_name):
        log.error("Error : Specified file %s does not exist" %
                  remote_name)
        return filename

    if not os.path.isfile(remote_name):
        log.error("Error : Specified name %s is not a file" %
                  remote_name)
        return filename

    # NOTE(review): this is a plain string-prefix test, so a sibling
    # directory that merely shares the prefix (e.g. "/data" vs
    # "/data2") also passes -- confirm whether that can occur here.
    if not remote_name.startswith(self.dataset_remote_dir):
        log.warning(
            common_msg +
            "We cache in the local directory only what is"
            " under $FUEL_DATA_PATH: %s" %
            remote_name)
        return filename

    # Create the $FUEL_LOCAL_DATA_PATH folder if needed
    # (read/write/execute for user, group and others).
    self.safe_mkdir(self.dataset_local_dir,
                    (stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR |
                     stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP |
                     stat.S_IROTH | stat.S_IWOTH | stat.S_IXOTH))

    # Determine local path to which the file is to be cached
    local_name = os.path.join(self.dataset_local_dir,
                              os.path.relpath(remote_name,
                                              self.dataset_remote_dir))

    # Create the folder structure to receive the remote file
    local_folder = os.path.split(local_name)[0]
    try:
        self.safe_mkdir(local_folder)
    except Exception as e:
        # Best effort: caching is optional, so fall back to the remote
        # file, but report what actually went wrong.
        log.warning(
            (common_msg +
             "While creating the directory %s, we got an error (%s)."
             " We won't cache to the local disk.") % (local_folder, e))
        return filename

    # Acquire writelock on the local file to prevent the possibility
    # of any other process modifying it while we cache it if needed.
    # Also, if another process is currently caching the same file,
    # it forces the current process to wait for it to be done before
    # using the file.
    if not os.access(local_folder, os.W_OK):
        log.warning(common_msg +
                    "Local folder %s isn't writable."
                    " This is needed for synchronization."
                    " We will use the remote version."
                    " Manually fix the permission."
                    % local_folder)
        return filename
    self.get_writelock(local_name)

    # Everything below runs with the writelock held. The ``finally``
    # clause guarantees it is released on every exit path, including
    # exceptions raised by the space check, the copy or the stat calls.
    try:
        # If the file does not exist locally, consider creating it
        if not os.path.exists(local_name):

            # Check that there is enough space to cache the file
            if not self.check_enough_space(remote_name, local_name):
                log.warning(common_msg +
                            "File %s not cached: Not enough free space" %
                            remote_name)
                return filename

            # There is enough space; make a local copy of the file
            self.copy_from_server_to_local(remote_name, local_name)
            log.info(common_msg + "File %s has been locally cached to %s" %
                     (remote_name, local_name))
        elif os.path.getmtime(remote_name) > os.path.getmtime(local_name):
            log.warning(common_msg +
                        "File %s in cache will not be used: The remote file "
                        "(modified %s) is newer than the locally cached file "
                        "%s (modified %s)."
                        % (remote_name,
                           time.strftime(
                               '%Y-%m-%d %H:%M:%S',
                               time.localtime(
                                   os.path.getmtime(remote_name))
                           ),
                           local_name,
                           time.strftime(
                               '%Y-%m-%d %H:%M:%S',
                               time.localtime(
                                   os.path.getmtime(local_name))
                           )))
            return filename
        elif os.path.getsize(local_name) != os.path.getsize(remote_name):
            log.warning(common_msg +
                        "File %s not cached: The remote file (%d bytes) is of "
                        "a different size than the locally cached file %s "
                        "(%d bytes). The local cache might be corrupt."
                        % (remote_name, os.path.getsize(remote_name),
                           local_name, os.path.getsize(local_name)))
            return filename
        elif not os.access(local_name, os.R_OK):
            log.warning(common_msg +
                        "File %s in cache isn't readable. We will use the"
                        " remote version. Manually fix the permission."
                        % (local_name))
            return filename
        else:
            log.debug("File %s has previously been locally cached to %s" %
                      (remote_name, local_name))

        # Obtain a readlock on the downloaded file before releasing the
        # writelock. This is to prevent having a moment where there is no
        # lock on this file which could give the impression that it is
        # unused and therefore safe to delete.
        self.get_readlock(local_name)
        return local_name
    finally:
        self.release_writelock()