    pmode = RESET_MODE
    # get prefix for environment name
    envprefix = sys.argv[1]
    # get number of environments to create
    nenvs = sys.argv[2]
elif nargs > 1:
    # returns env to the list of environments
    pmode = WRITE_MODE
    # get name of environment to return
    env = sys.argv[1]
else:
    # gets name of an environment to use
    pmode = READ_MODE

# creates a lock for the file so it can only be accessed by one process at a time
lock = FileLock(lock_path, timeout=time_out_secs)
with lock:
    if pmode == RESET_MODE:
        # create a list (named clist) of nenvs environments with the prefix envprefix
        clist = []
        nenvsnum = int(nenvs)
        for i in range(nenvsnum):
            clist.append(envprefix + str(i))
        # add code here
    else:
        # load hickle file
        clist = hickle.load(file_path)
        if pmode == WRITE_MODE:
def _instance_iterator(self, file_path: str) -> Iterable[Instance]:
    cache_file: Optional[str] = None
    if self._cache_directory:
        cache_file = self._get_cache_location_for_file_path(file_path)

    if cache_file is not None and os.path.exists(cache_file):
        cache_file_lock = FileLock(cache_file + ".lock", timeout=self.CACHE_FILE_LOCK_TIMEOUT)
        try:
            cache_file_lock.acquire()
            # We make an assumption here that if we can obtain the lock, no one will
            # be trying to write to the file anymore, so it should be safe to release the lock
            # before reading so that other processes can also read from it.
            cache_file_lock.release()
            logger.info("Reading instances from cache %s", cache_file)
            with open(cache_file) as data_file:
                yield from self._multi_worker_islice(data_file, transform=self.deserialize_instance)
        except Timeout:
            logger.warning(
                "Failed to acquire lock on dataset cache file within %d seconds. "
                "Cannot use cache to read instances.",
                self.CACHE_FILE_LOCK_TIMEOUT,
            )
            yield from self._multi_worker_islice(self._read(file_path), ensure_lazy=True)
    elif cache_file is not None and not os.path.exists(cache_file):
        instances = self._multi_worker_islice(self._read(file_path), ensure_lazy=True)
        # The cache file doesn't exist so we'll try writing to it.
        if self.max_instances is not None:
            # But we don't write to the cache when max_instances is specified.
            logger.warning("Skipping writing to data cache since max_instances was specified.")
            yield from instances
        elif util.is_distributed() or (get_worker_info() and get_worker_info().num_workers):
            # We also shouldn't write to the cache if there's more than one process loading
            # instances since each worker only receives a partial share of the instances.
            logger.warning("Can't cache data instances when there are multiple processes loading data")
            yield from instances
        else:
            try:
                with FileLock(cache_file + ".lock", timeout=self.CACHE_FILE_LOCK_TIMEOUT):
                    with CacheFile(cache_file, mode="w+") as cache_handle:
                        logger.info("Caching instances to temp file %s", cache_handle.name)
                        for instance in instances:
                            cache_handle.write(self.serialize_instance(instance) + "\n")
                            yield instance
            except Timeout:
                logger.warning(
                    "Failed to acquire lock on dataset cache file within %d seconds. "
                    "Cannot write to cache.",
                    self.CACHE_FILE_LOCK_TIMEOUT,
                )
                yield from instances
    else:
        # No cache.
        yield from self._multi_worker_islice(self._read(file_path), ensure_lazy=True)
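# The acquire-then-release step in _instance_iterator is a deliberate idiom: the lock is only
# taken to make sure no writer is mid-way through the cache file, and is released again before
# reading so that concurrent readers are not serialized. A minimal sketch of that idiom in
# isolation, assuming only the `filelock` package (the function name here is illustrative):
from filelock import FileLock

def wait_for_writer_then_read(path: str, lock_timeout: float = 10.0) -> str:
    lock = FileLock(path + ".lock", timeout=lock_timeout)
    # Acquiring proves no writer currently holds the lock; releasing right away lets other
    # readers proceed in parallel, on the assumption that writers are finished once released.
    lock.acquire()
    lock.release()
    with open(path) as f:
        return f.read()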
def clear_csv(csv_path):
    with FileLock(csv_path + ".lock"):
        # Opening in "w+" truncates the file, which clears its contents.
        f = open(csv_path, "w+")
        csv_writer = csv.writer(f)
        f.close()
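# A small usage sketch for clear_csv, assuming the `filelock` package; append_row is a
# hypothetical helper that shows why every writer should take the same "<csv_path>.lock":
# a concurrent clear_csv call then cannot interleave with a partially written row.
import csv
from filelock import FileLock

def append_row(csv_path, row):
    with FileLock(csv_path + ".lock"):
        with open(csv_path, "a", newline="") as f:
            csv.writer(f).writerow(row)

append_row("results.csv", ["run_1", 0.92])
clear_csv("results.csv")  # truncates results.csv under the same lock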
def initialize_control_file(self) -> None:
    with FileLock(LTIGradesSenderControlFile.lock_file):
        with Path(self.config_fullname).open('w+') as new_file:
            json.dump(LTIGradesSenderControlFile.cache_sender_data, new_file)
            logger.debug('Control file initialized.')
def read(
    self, file_path: Union[Path, str]
) -> Union[AllennlpDataset, AllennlpLazyDataset]:
    """
    Returns a dataset containing all the instances that can be read from the file path.

    If `self.lazy` is `False`, this eagerly reads all instances from `self._read()`
    and returns an `AllennlpDataset`.

    If `self.lazy` is `True`, this returns an `AllennlpLazyDataset`, which internally
    relies on the generator created from `self._read()` to lazily produce `Instance`s.
    In this case your implementation of `_read()` must also be lazy
    (that is, not load all instances into memory at once), otherwise
    you will get a `ConfigurationError`.

    In either case, the returned `Iterable` can be iterated over multiple times.
    It's unlikely you want to override this function, but if you do your result
    should likewise be repeatedly iterable.
    """
    if not isinstance(file_path, str):
        file_path = str(file_path)

    lazy = getattr(self, "lazy", None)

    if lazy is None:
        warnings.warn(
            "DatasetReader.lazy is not set, "
            "did you forget to call the superclass constructor?",
            UserWarning,
        )

    if lazy:
        return AllennlpLazyDataset(self._instance_iterator, file_path)
    else:
        cache_file: Optional[str] = None
        if self._cache_directory:
            cache_file = self._get_cache_location_for_file_path(file_path)

        if cache_file is not None and os.path.exists(cache_file):
            try:
                # Try to acquire a lock just to make sure another process isn't in the middle
                # of writing to the cache.
                cache_file_lock = FileLock(cache_file + ".lock", timeout=self.CACHE_FILE_LOCK_TIMEOUT)
                cache_file_lock.acquire()
                # We make an assumption here that if we can obtain the lock, no one will
                # be trying to write to the file anymore, so it should be safe to release the lock
                # before reading so that other processes can also read from it.
                cache_file_lock.release()
                logger.info("Reading instances from cache %s", cache_file)
                instances = self._instances_from_cache_file(cache_file)
            except Timeout:
                logger.warning(
                    "Failed to acquire lock on dataset cache file within %d seconds. "
                    "Cannot use cache to read instances.",
                    self.CACHE_FILE_LOCK_TIMEOUT,
                )
                instances = self._multi_worker_islice(self._read(file_path))
        else:
            instances = self._multi_worker_islice(self._read(file_path))

        # Then some validation.
        if not isinstance(instances, list):
            instances = list(instances)

        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path)
            )

        # And finally we try writing to the cache.
        if cache_file is not None and not os.path.exists(cache_file):
            if self.max_instances is not None:
                # But we don't write to the cache when max_instances is specified.
                logger.warning("Skipping writing to data cache since max_instances was specified.")
            elif util.is_distributed() or (get_worker_info() and get_worker_info().num_workers):
                # We also shouldn't write to the cache if there's more than one process loading
                # instances since each worker only receives a partial share of the instances.
                logger.warning("Can't cache data instances when there are multiple processes loading data")
            else:
                try:
                    with FileLock(cache_file + ".lock", timeout=self.CACHE_FILE_LOCK_TIMEOUT):
                        self._instances_to_cache_file(cache_file, instances)
                except Timeout:
                    logger.warning(
                        "Failed to acquire lock on dataset cache file within %d seconds. "
                        "Cannot write to cache.",
                        self.CACHE_FILE_LOCK_TIMEOUT,
                    )

        return AllennlpDataset(instances)
def get_from_cache(
    url,
    cache_dir=None,
    force_download=False,
    proxies=None,
    etag_timeout=10,
    resume_download=False,
    user_agent: Union[Dict, str, None] = None,
    local_files_only=False,
) -> Optional[str]:
    """
    Given a URL, look for the corresponding file in the local cache.
    If it's not there, download it. Then return the path to the cached file.

    Return:
        None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
        Local path (string) otherwise
    """
    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    os.makedirs(cache_dir, exist_ok=True)

    etag = None
    if not local_files_only:
        try:
            response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
            if response.status_code == 200:
                etag = response.headers.get("ETag")
        except (EnvironmentError, requests.exceptions.Timeout):
            # etag is already None
            pass

    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    # etag is None == we don't have a connection, or url doesn't exist, or is otherwise inaccessible.
    # try to get the last downloaded one
    if etag is None:
        if os.path.exists(cache_path):
            return cache_path
        else:
            matching_files = [
                file
                for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*")
                if not file.endswith(".json") and not file.endswith(".lock")
            ]
            if len(matching_files) > 0:
                return os.path.join(cache_dir, matching_files[-1])
            else:
                # If files cannot be found and local_files_only=True,
                # the models might've been found if local_files_only=False.
                # Notify the user about that.
                if local_files_only:
                    raise ValueError(
                        "Cannot find the requested files in the cached path and outgoing traffic has been"
                        " disabled. To enable model look-ups and downloads online, set 'local_files_only'"
                        " to False."
                    )
                return None

    # From now on, etag is not None.
    if os.path.exists(cache_path) and not force_download:
        return cache_path

    # Prevent parallel downloads of the same file with a lock.
    lock_path = cache_path + ".lock"
    with FileLock(lock_path):

        # If the download just completed while the lock was activated.
        if os.path.exists(cache_path) and not force_download:
            # Even if returning early like here, the lock will be released.
            return cache_path

        if resume_download:
            incomplete_path = cache_path + ".incomplete"

            @contextmanager
            def _resumable_file_manager():
                with open(incomplete_path, "a+b") as f:
                    yield f

            temp_file_manager = _resumable_file_manager
            if os.path.exists(incomplete_path):
                resume_size = os.stat(incomplete_path).st_size
            else:
                resume_size = 0
        else:
            temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False)
            resume_size = 0

        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with temp_file_manager() as temp_file:
            logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)

            http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)

        logger.info("storing %s in cache at %s", url, cache_path)
        os.replace(temp_file.name, cache_path)

        logger.info("creating metadata file for %s", cache_path)
        meta = {"url": url, "etag": etag}
        meta_path = cache_path + ".json"
        with open(meta_path, "w") as meta_file:
            json.dump(meta, meta_file)

    return cache_path
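# The core pattern in get_from_cache is check, lock, re-check: test for the cached file, take a
# per-file lock, then test again before downloading, so only one process downloads while the
# others wait and reuse the result. A minimal sketch of that pattern, assuming the `filelock`
# package and a hypothetical fetch_to() callable that writes the payload into a file handle:
import os
import tempfile
from filelock import FileLock

def ensure_cached(cache_path: str, fetch_to) -> str:
    if os.path.exists(cache_path):  # fast path: someone already produced the file
        return cache_path
    with FileLock(cache_path + ".lock"):
        if os.path.exists(cache_path):  # re-check: another process may have just finished
            return cache_path
        # Download to a temp file, then atomically move it into place, so an interrupted
        # download never leaves a corrupt cache entry behind.
        fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(cache_path) or ".")
        with os.fdopen(fd, "wb") as tmp_file:
            fetch_to(tmp_file)
        os.replace(tmp_path, cache_path)
    return cache_path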
check_min_version("4.22.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

logger = logging.getLogger(__name__)

try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    if is_offline_mode():
        raise LookupError(
            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
        )
    with FileLock(".lock") as lock:
        nltk.download("punkt", quiet=True)

# A list of all multilingual tokenizers which require the lang attribute.
MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast]


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
def __init__(
    self,
    name: str = None,
    process_id: int = 0,
    num_process: int = 1,
    data_dir: Optional[str] = None,
    experiment_id: Optional[str] = None,
    in_memory: bool = False,
    **kwargs,
):
    """A Metric is the base class and common API for all metrics.

    Args:
        process_id (int): specify the id of the node in a distributed setting, between 0 and num_nodes-1.
            This can be used to compute metrics on distributed setups (in particular non-additive metrics like F1).
        data_dir (str): path to a directory in which temporary data will be stored.
            This should be a shared file-system for distributed setups.
        experiment_id (str): Should be used if you perform several concurrent experiments using
            the same caching directory (will be indicated in the raised error).
        in_memory (bool): keep all predictions and references in memory. Not possible in distributed settings.
    """
    # Safety checks
    assert isinstance(process_id, int) and process_id >= 0, "'process_id' should be a number greater than or equal to 0"
    assert (
        isinstance(num_process, int) and num_process > process_id
    ), "'num_process' should be a number greater than process_id"
    assert (
        process_id == 0 or not in_memory
    ), "Using 'in_memory' is not possible in distributed setting (process_id > 0)."

    # Metric name
    self.name = camelcase_to_snakecase(self.__class__.__name__)
    # Configuration name
    self.config_name = name

    self.process_id = process_id
    self.num_process = num_process
    self.in_memory = in_memory
    self.experiment_id = experiment_id if experiment_id is not None else "cache"
    self._version = "1.0.0"
    self._data_dir_root = os.path.expanduser(data_dir or HF_METRICS_CACHE)
    self.data_dir = self._build_data_dir()

    # prepare info
    info = self._info()
    info.metric_name = self.name
    info.config_name = self.config_name
    info.version = self._version
    self.info = info

    # Update 'compute' and 'add' docstring
    self.compute.__func__.__doc__ += self.info.inputs_description
    self.add.__func__.__doc__ += self.info.inputs_description

    self.arrow_schema = pa.schema(field for field in self.info.features.type)
    self.buf_writer = None
    self.writer = None
    self.writer_batch_size = None
    self.data = None

    # Check we can write on the cache file without competitors
    self.cache_file_name = os.path.join(self.data_dir, self._get_file_name(self.process_id))
    self.filelock = FileLock(self.cache_file_name + ".lock")
    try:
        self.filelock.acquire(timeout=1)
    except Timeout:
        raise ValueError(
            "Cannot acquire lock, caching file might be used by another process, "
            "you should setup a unique 'experiment_id' for this run."
        )
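# The constructor above uses acquire(timeout=1) to turn lock contention into an immediate,
# explanatory error instead of an indefinite wait. A stripped-down sketch of that idiom,
# assuming only the `filelock` package (claim_cache_file is an illustrative name):
from filelock import FileLock, Timeout

def claim_cache_file(cache_file_name: str) -> FileLock:
    lock = FileLock(cache_file_name + ".lock")
    try:
        # Hold the lock for the lifetime of the writer; a second process trying to claim
        # the same file fails fast rather than blocking forever.
        lock.acquire(timeout=1)
    except Timeout:
        raise ValueError(
            f"{cache_file_name} appears to be in use by another process; "
            "use a distinct experiment/cache id for this run."
        )
    return lock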
def cached_path(
    url_or_filename,
    cache_dir=None,
    force_download=False,
    proxies=None,
    resume_download=False,
    user_agent: Union[Dict, str, None] = None,
    extract_compressed_file=False,
    force_extract=False,
    local_files_only=False,
) -> Optional[str]:
    """
    Given something that might be a URL (or might be a local path), determine which.
    If it's a URL, download the file and cache it, and return the path to the cached file.
    If it's already a local path, make sure the file exists and then return the path.

    Args:
        cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
        force_download: if True, re-download the file even if it's already cached in the cache dir.
        resume_download: if True, resume the download if an incompletely received file is found.
        user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
        extract_compressed_file: if True and the path points to a zip or tar file, extract the compressed
            file in a folder along the archive.
        force_extract: if True when extract_compressed_file is True and the archive was already extracted,
            re-extract the archive and override the folder where it was extracted.

    Return:
        None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
        Local path (string) otherwise
    """
    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    if isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if is_remote_url(url_or_filename):
        # URL, so get it from the cache (downloading if necessary)
        output_path = get_from_cache(
            url_or_filename,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            user_agent=user_agent,
            local_files_only=local_files_only,
        )
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        output_path = url_or_filename
    elif urlparse(url_or_filename).scheme == "":
        # File, but it doesn't exist.
        raise EnvironmentError("file {} not found".format(url_or_filename))
    else:
        # Something unknown
        raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))

    if extract_compressed_file:
        if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path):
            return output_path

        # Path where we extract compressed archives
        # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/"
        output_dir, output_file = os.path.split(output_path)
        output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
        output_path_extracted = os.path.join(output_dir, output_extract_dir_name)

        if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract:
            return output_path_extracted

        # Prevent parallel extractions
        lock_path = output_path + ".lock"
        with FileLock(lock_path):
            shutil.rmtree(output_path_extracted, ignore_errors=True)
            os.makedirs(output_path_extracted)
            if is_zipfile(output_path):
                with ZipFile(output_path, "r") as zip_file:
                    zip_file.extractall(output_path_extracted)
                    zip_file.close()
            elif tarfile.is_tarfile(output_path):
                tar_file = tarfile.open(output_path)
                tar_file.extractall(output_path_extracted)
                tar_file.close()
            else:
                raise EnvironmentError("Archive format of {} could not be identified".format(output_path))

        return output_path_extracted

    return output_path
def from_files(
    cls,
    directory: Union[str, os.PathLike],
    padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
    oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
) -> "Vocabulary":
    """
    Loads a `Vocabulary` that was serialized either using `save_to_files` or inside
    a model archive file.

    # Parameters

    directory : `str`
        The directory or archive file containing the serialized vocabulary.
    """
    logger.info("Loading token dictionary from %s.", directory)
    padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
    oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN

    if not os.path.isdir(directory):
        base_directory = cached_path(directory, extract_archive=True)
        # For convenience we'll check for a 'vocabulary' subdirectory of the archive.
        # That way you can use model archives directly.
        vocab_subdir = os.path.join(base_directory, "vocabulary")
        if os.path.isdir(vocab_subdir):
            directory = vocab_subdir
        elif os.path.isdir(base_directory):
            directory = base_directory
        else:
            raise ConfigurationError(f"{directory} is neither a directory nor an archive")

    # We use a lock file to avoid race conditions where multiple processes
    # might be reading/writing from/to the same vocab files at once.
    with FileLock(os.path.join(directory, ".lock")):
        with codecs.open(os.path.join(directory, NAMESPACE_PADDING_FILE), "r", "utf-8") as namespace_file:
            non_padded_namespaces = [namespace_str.strip() for namespace_str in namespace_file]

        vocab = cls(
            non_padded_namespaces=non_padded_namespaces,
            padding_token=padding_token,
            oov_token=oov_token,
        )

        # Check every file in the directory.
        for namespace_filename in os.listdir(directory):
            if namespace_filename == NAMESPACE_PADDING_FILE:
                continue
            if namespace_filename.startswith("."):
                continue
            namespace = namespace_filename.replace(".txt", "")
            if any(namespace_match(pattern, namespace) for pattern in non_padded_namespaces):
                is_padded = False
            else:
                is_padded = True
            filename = os.path.join(directory, namespace_filename)
            vocab.set_from_file(filename, is_padded, namespace=namespace, oov_token=oov_token)

    return vocab
def main():
    if request.method != 'POST':
        abort(METHOD_NOT_ALLOWED)

    try:
        payload = request.get_json()
        if payload is None:
            raise RuntimeError
    except Exception:
        log.error('Failed to obtain payload data.')
        abort(BAD_REQUEST)

    event = request.headers.get('X-GitHub-Event')
    if event is None:
        log.error('DECLINED: no event provided.')
        abort(BAD_REQUEST)

    # Ping event
    if event == 'ping':
        return json.dumps({'msg': 'pong'})

    repo = payload['repository']['full_name']
    clone_url = payload['repository']['clone_url']
    idx = md5(clone_url)
    record_file_path = f'{DATABASE_DIRECTORY}/{idx}.json'

    if not os.path.isfile(record_file_path):
        log.error(f'DECLINED: repository "{repo}" has no record on the server.')
        abort(UNAUTHORIZED)

    with open(record_file_path, 'r') as fp:
        record = json.load(fp)
    log.info(f'Record file "{record_file_path}" loaded.')

    if not DEBUG_MODE:
        secret = record['secret']
        signature = request.headers.get('X-Hub-Signature')
        if signature is None:
            log.error('DECLINED: no signature provided.')
            abort(BAD_REQUEST)
        if not authenticate(secret, signature, request.data):
            log.error('DECLINED: failed to pass authentication.')
            abort(UNAUTHORIZED)

    if event != 'push':
        return json.dumps({'status': 'fail', 'reason': 'not a push event'})

    commits = payload['commits']
    if not commits:
        log.info('No new commits.')
        return json.dumps({'status': 'success'})

    branch = payload['ref'].rsplit('/', 1)[-1]
    commit = payload['head_commit']
    head = commit['id']

    folder_name = '%s/%s' % (repo, branch)
    folder = os.path.join(WEBPAGE_DIRECTORY, folder_name)
    if not os.path.exists(folder):
        os.makedirs(folder)

    status = os.path.join(folder, STATUS_FILE)
    index = os.path.join(folder, INDEX_FILE)
    output = os.path.join(folder, OUTPUT_FILE)
    status_url = WEBURL + os.path.join(folder_name, STATUS_FILE)
    index_url = WEBURL + os.path.join(folder_name, INDEX_FILE)

    index_lock = index + '.lock'
    with FileLock(index_lock):
        log.info(f'Copying {TEMPORARY_INDEX_FILE} to {index}')
        shutil.copyfile(TEMPORARY_INDEX_FILE, index)

    tmpfd, tmppath = tempfile.mkstemp()
    log.debug('tmppath = %s', tmppath)
    with os.fdopen(tmpfd, 'w') as fp:
        json.dump(record['checksums'], fp)

    log.info('Launching docmeld...')
    with open(status, 'w') as fp:
        fp.write(f'Current server time: {str(datetime.now())} (UTC{get_utc_offset()})\n')
        fp.write(f'Build for commit #{head}: {commit["message"]}\n')

    with open(status, 'a') as fp:
        proc = subprocess.Popen([
            DOCMELD_EXECUTABLE, GIT_URL_START + clone_url, '-b', branch, '-s', head,
            '-c', tmppath, '-o', output, '-v' if DEBUG_MODE else '-q'
        ], stdout=fp, stderr=subprocess.STDOUT)
        try:
            proc.wait(timeout=COMPILE_TIME_LIMIT)
        except subprocess.TimeoutExpired as e:
            log.error('Time limit exceeded.')
            return json.dumps({
                'status': 'fail',
                'reason': f'time limit exceeded ({e.timeout}s)',
                'detail': status_url
            })

    if proc.returncode != 0:
        log.error(f'docmeld execution failed with status code {proc.returncode}')
        return json.dumps({
            'status': 'fail',
            'reason': f'docmeld failed with status code {proc.returncode}',
            'returncode': proc.returncode,
            'detail': status_url
        })

    log.info(f'Copying {output} to {index}...')
    shutil.copyfile(output, index)
    os.remove(tmppath)

    record['last_build'] = head
    with open(record_file_path, 'w') as fp:
        json.dump(record, fp, sort_keys=True, indent=4)

    return json.dumps({
        'status': 'success',
        'returncode': proc.returncode,
        'output_url': index_url,
        'detail': status_url
    })
def cached_download(
    url: str,
    *,
    library_name: Optional[str] = None,
    library_version: Optional[str] = None,
    cache_dir: Union[str, Path, None] = None,
    user_agent: Union[Dict, str, None] = None,
    force_download: Optional[bool] = False,
    force_filename: Optional[str] = None,
    proxies: Optional[Dict] = None,
    etag_timeout: Optional[float] = 10,
    resume_download: Optional[bool] = False,
    use_auth_token: Union[bool, str, None] = None,
    local_files_only: Optional[bool] = False,
) -> Optional[str]:  # pragma: no cover
    """
    Download from a given URL and cache it if it's not already present in the local cache.

    Given a URL, this function looks for the corresponding file in the local cache. If it's not
    there, download it. Then return the path to the cached file.

    Args:
        url (`str`):
            The path to the file to be downloaded.
        library_name (`str`, *optional*):
            The name of the library to which the object corresponds.
        library_version (`str`, *optional*):
            The version of the library.
        cache_dir (`str`, `Path`, *optional*):
            Path to the folder where cached files are stored.
        user_agent (`dict`, `str`, *optional*):
            The user-agent info in the form of a dictionary or a string.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether the file should be downloaded even if it already exists in the local cache.
        force_filename (`str`, *optional*):
            Use this name instead of a generated file name.
        proxies (`dict`, *optional*):
            Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
        etag_timeout (`float`, *optional*, defaults to `10`):
            When fetching ETag, how many seconds to wait for the server to send data before
            giving up which is passed to `requests.request`.
        resume_download (`bool`, *optional*, defaults to `False`):
            If `True`, resume a previously interrupted download.
        use_auth_token (`bool`, `str`, *optional*):
            A token to be used for the download.

                - If `True`, the token is read from the HuggingFace config folder.
                - If a string, it's used as the authentication token.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, avoid downloading the file and return the path to the local cached file
            if it exists.

    Returns:
        Local path (string) of file or if networking is off, last version of file cached on disk.

    <Tip>

    Raises the following errors:

        - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
          if `use_auth_token=True` and the token cannot be found.
        - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError)
          if ETag cannot be determined.
        - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
          if some parameter value is invalid

    </Tip>
    """
    if cache_dir is None:
        cache_dir = HUGGINGFACE_HUB_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    os.makedirs(cache_dir, exist_ok=True)

    headers = {
        "user-agent": http_user_agent(
            library_name=library_name,
            library_version=library_version,
            user_agent=user_agent,
        )
    }
    if isinstance(use_auth_token, str):
        headers["authorization"] = f"Bearer {use_auth_token}"
    elif use_auth_token:
        token = HfFolder.get_token()
        if token is None:
            raise EnvironmentError(
                "You specified use_auth_token=True, but a huggingface token was not found."
            )
        headers["authorization"] = f"Bearer {token}"

    url_to_download = url
    etag = None
    if not local_files_only:
        try:
            r = _request_with_retry(
                method="HEAD",
                url=url,
                headers=headers,
                allow_redirects=False,
                proxies=proxies,
                timeout=etag_timeout,
            )
            r.raise_for_status()
            etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
            # We favor a custom header indicating the etag of the linked resource, and
            # we fallback to the regular etag header.
            # If we don't have any of those, raise an error.
            if etag is None:
                raise OSError(
                    "Distant resource does not have an ETag, we won't be able to"
                    " reliably ensure reproducibility."
                )
            # In case of a redirect,
            # save an extra redirect on the request.get call,
            # and ensure we download the exact atomic version even if it changed
            # between the HEAD and the GET (unlikely, but hey).
            if 300 <= r.status_code <= 399:
                url_to_download = r.headers["Location"]
        except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
            # Actually raise for those subclasses of ConnectionError
            raise
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            OfflineModeIsEnabled,
        ):
            # Otherwise, our Internet connection is down.
            # etag is None
            pass

    filename = force_filename if force_filename is not None else url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    # etag is None == we don't have a connection or we passed local_files_only.
    # try to get the last downloaded one
    if etag is None:
        if os.path.exists(cache_path) and not force_download:
            return cache_path
        else:
            matching_files = [
                file
                for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*")
                if not file.endswith(".json") and not file.endswith(".lock")
            ]
            if len(matching_files) > 0 and not force_download and force_filename is None:
                return os.path.join(cache_dir, matching_files[-1])
            else:
                # If files cannot be found and local_files_only=True,
                # the models might've been found if local_files_only=False.
                # Notify the user about that.
                if local_files_only:
                    raise ValueError(
                        "Cannot find the requested files in the cached path and"
                        " outgoing traffic has been disabled. To enable model look-ups"
                        " and downloads online, set 'local_files_only' to False."
                    )
                else:
                    raise ValueError(
                        "Connection error, and we cannot find the requested files in"
                        " the cached path. Please try again or make sure your Internet"
                        " connection is on."
                    )

    # From now on, etag is not None.
    if os.path.exists(cache_path) and not force_download:
        return cache_path

    # Prevent parallel downloads of the same file with a lock.
    lock_path = cache_path + ".lock"

    # Some Windows versions do not allow for paths longer than 255 characters.
    # In this case, we must specify it is an extended path by using the "\\?\" prefix.
    if os.name == "nt" and len(os.path.abspath(lock_path)) > 255:
        lock_path = "\\\\?\\" + os.path.abspath(lock_path)

    if os.name == "nt" and len(os.path.abspath(cache_path)) > 255:
        cache_path = "\\\\?\\" + os.path.abspath(cache_path)

    with FileLock(lock_path):

        # If the download just completed while the lock was activated.
        if os.path.exists(cache_path) and not force_download:
            # Even if returning early like here, the lock will be released.
            return cache_path

        if resume_download:
            incomplete_path = cache_path + ".incomplete"

            @contextmanager
            def _resumable_file_manager() -> "io.BufferedWriter":
                with open(incomplete_path, "ab") as f:
                    yield f

            temp_file_manager = _resumable_file_manager
            if os.path.exists(incomplete_path):
                resume_size = os.stat(incomplete_path).st_size
            else:
                resume_size = 0
        else:
            temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False)
            resume_size = 0

        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with temp_file_manager() as temp_file:
            logger.info("downloading %s to %s", url, temp_file.name)

            http_get(
                url_to_download,
                temp_file,
                proxies=proxies,
                resume_size=resume_size,
                headers=headers,
            )

        logger.info("storing %s in cache at %s", url, cache_path)
        os.replace(temp_file.name, cache_path)

        if force_filename is None:
            logger.info("creating metadata file for %s", cache_path)
            meta = {"url": url, "etag": etag}
            meta_path = cache_path + ".json"
            with open(meta_path, "w") as meta_file:
                json.dump(meta, meta_file)

    return cache_path
def download_and_unpack_package(
    pkg_uri: str,
    base_directory: str,
    logger: Optional[logging.Logger] = default_logger,
) -> str:
    """Download the package corresponding to this URI and unpack it.

    Will be written to a directory named {base_directory}/{uri}.
    """
    pkg_file = Path(_get_local_path(base_directory, pkg_uri))
    with FileLock(str(pkg_file) + ".lock"):
        if logger is None:
            logger = default_logger

        logger.debug(f"Fetching package for URI: {pkg_uri}")

        local_dir = pkg_file.with_suffix("")
        assert local_dir != pkg_file, "Invalid pkg_file!"
        if local_dir.exists():
            assert local_dir.is_dir(), f"{local_dir} is not a directory"
        else:
            protocol, pkg_name = parse_uri(pkg_uri)
            if protocol == Protocol.GCS:
                # Download package from the GCS.
                code = _internal_kv_get(pkg_uri)
                if code is None:
                    raise IOError(f"Failed to fetch URI {pkg_uri} from GCS.")
                code = code or b""
                pkg_file.write_bytes(code)
                unzip_package(
                    package_path=pkg_file,
                    target_dir=local_dir,
                    remove_top_level_directory=False,
                    unlink_zip=True,
                    logger=logger,
                )
            elif protocol in Protocol.remote_protocols():
                # Download package from remote URI
                tp = None

                if protocol == Protocol.S3:
                    try:
                        from smart_open import open
                        import boto3
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` and "
                            "`pip install boto3` to fetch URIs in s3 "
                            "bucket."
                        )
                    tp = {"client": boto3.client("s3")}
                elif protocol == Protocol.GS:
                    try:
                        from smart_open import open
                        from google.cloud import storage  # noqa: F401
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` and "
                            "`pip install google-cloud-storage` "
                            "to fetch URIs in Google Cloud Storage bucket."
                        )
                else:
                    try:
                        from smart_open import open
                    except ImportError:
                        raise ImportError(
                            "You must `pip install smart_open` "
                            f"to fetch {protocol.value.upper()} URIs."
                        )

                with open(pkg_uri, "rb", transport_params=tp) as package_zip:
                    with open(pkg_file, "wb") as fin:
                        fin.write(package_zip.read())

                unzip_package(
                    package_path=pkg_file,
                    target_dir=local_dir,
                    remove_top_level_directory=True,
                    unlink_zip=True,
                    logger=logger,
                )
            else:
                raise NotImplementedError(f"Protocol {protocol} is not supported")

        return str(local_dir)
def _writeStatsFile(self):
    toUpdate = False

    for st in self._stats:
        if st[3] == SStatus.ACQUIRED:
            toUpdate = True
            break

    if not toUpdate:
        return

    exstats = self._readStatsFile()
    lock = FileLock(Teprolin.statsLockFile)
    uprec = 0
    adrec = 0

    with lock:
        print(
            "PID {0}-{1}.{2}[{3}]: updating the statistics in the file...".format(
                os.getpid(), Path(inspect.stack()[0].filename).stem,
                inspect.stack()[0].function, inspect.stack()[0].lineno),
            file=sys.stderr, flush=True)

        with open(Teprolin.statsFile, mode="w") as f:
            for x in exstats:
                for i in range(len(self._stats)):
                    y = self._stats[i]

                    if y[0] == x[0]:
                        if y[3] == SStatus.ACQUIRED:
                            x[1] += y[1]
                            x[2] += y[2]
                            uprec += 1
                        # end if ACQUIRED

                        self._stats.pop(i)
                        break
                    # end if same day
                # end for i

                d = x[0]
                t = x[1]
                r = x[2]

                f.write(" ".join(str(e) for e in d))
                f.write(" ")
                f.write(str(t))
                f.write(" ")
                f.write(str(r))
                f.write("\n")
            # end for x

            for y in self._stats:
                if y[3] == SStatus.ACQUIRED:
                    d = y[0]
                    t = y[1]
                    r = y[2]

                    f.write(" ".join(str(e) for e in d))
                    f.write(" ")
                    f.write(str(t))
                    f.write(" ")
                    f.write(str(r))
                    f.write("\n")
                    adrec += 1
            # end for y

        print(
            "PID {0}-{1}.{2}[{3}]: updated {4} records and added {5} records.".format(
                os.getpid(), Path(inspect.stack()[0].filename).stem,
                inspect.stack()[0].function, inspect.stack()[0].lineno,
                uprec, adrec),
            file=sys.stderr, flush=True)
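# _writeStatsFile guards a read-modify-write of a shared stats file with a single FileLock;
# without it, two workers could both read the old contents and one update would be lost.
# A stripped-down sketch of the same pattern, assuming the `filelock` package and a plain
# counter file (bump_counter is an illustrative name):
import os
from filelock import FileLock

def bump_counter(path: str, amount: int = 1) -> int:
    # Both the read and the write stay inside the same lock so the update is atomic
    # with respect to other processes.
    with FileLock(path + ".lock"):
        current = 0
        if os.path.exists(path):
            with open(path) as f:
                current = int(f.read().strip() or 0)
        current += amount
        with open(path, "w") as f:
            f.write(str(current))
    return current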