def get_collection_by_id(self, collection_id):
    """Build STAC collection by its id

    :param collection_id: product type as collection ID
    :type collection_id: str
    :returns: collection dictionary
    :rtype: dict
    :raises NotAvailableError: if no collection matches ``collection_id``
    """
    collection_list = self.__get_collection_list()
    # next() with a default avoids building the whole filtered list just to
    # take its first element (and the try/IndexError that went with it)
    collection = next(
        (c for c in collection_list if c["id"] == collection_id), None
    )
    if collection is None:
        raise NotAvailableError("%s collection not found" % collection_id)
    self.update_data(collection)
    return self.as_dict()
def download(
    self,
    product,
    auth=None,
    progress_callback=None,
    wait=DEFAULT_DOWNLOAD_WAIT,
    timeout=DEFAULT_DOWNLOAD_TIMEOUT,
    **kwargs
):
    """Download a product using HTTP protocol.

    The downloaded product is assumed to be a Zip file. If it is not,
    the user is warned, it is renamed to remove the zip extension and
    no further treatment is done (no extraction)

    :param product: the EO product to download
    :param auth: (optional) authentication forwarded to every HTTP request
    :param progress_callback: (optional) progress callback; created if None
    :param wait: minutes to wait between two download tries when the
        product is not yet available
    :param timeout: maximum number of minutes to retry before giving up
    :returns: local path to the downloaded product (or the assets
        directory when assets were downloaded instead of the archive)
    :rtype: str
    :raises AuthenticationError: on an HTTP status listed in the
        provider's ``auth_error_code`` configuration
    :raises NotAvailableError: when the product stays OFFLINE until
        ``timeout`` is reached, or ordering is not supported
    """
    fs_path, record_filename = self._prepare_download(product, **kwargs)
    if not fs_path or not record_filename:
        # nothing left to do (e.g. product already downloaded):
        # fs_path is returned as-is, possibly falsy
        return fs_path
    # progress bar init
    if progress_callback is None:
        progress_callback = get_progress_callback()
    progress_callback.desc = product.properties.get("id", "")
    progress_callback.position = 1

    # download assets if exist instead of remote_location;
    # _download_assets raises NotAvailableError when there are no assets,
    # in which case we fall through to the classic archive download
    try:
        return self._download_assets(
            product,
            fs_path.replace(".zip", ""),
            record_filename,
            auth,
            progress_callback,
            **kwargs
        )
    except NotAvailableError:
        pass

    url = product.remote_location

    # order product if it is offline
    ordered_message = ""
    if (
        "orderLink" in product.properties
        and "storageStatus" in product.properties
        and product.properties["storageStatus"] == OFFLINE_STATUS
    ):
        order_method = getattr(self.config, "order_method", "GET")
        with requests.request(
            method=order_method,
            url=product.properties["orderLink"],
            auth=auth,
            headers=getattr(self.config, "order_headers", {}),
        ) as response:
            try:
                response.raise_for_status()
                # kept to enrich a later NotAvailableError message
                ordered_message = response.text
                logger.debug(ordered_message)
            except HTTPError as e:
                # best effort: a failed order is only a warning, the retry
                # loop below will keep polling the download url
                logger.warning(
                    "%s could not be ordered, request returned %s",
                    product.properties["title"],
                    e,
                )

    # initiate retry loop
    start_time = datetime.now()
    stop_time = datetime.now() + timedelta(minutes=timeout)
    # next_try is stored on the product so it survives across iterations
    product.next_try = start_time
    retry_count = 0
    not_available_info = "The product could not be downloaded"
    # another output for notebooks
    nb_info = NotebookWidgets()

    while "Loop until products download succeeds or timeout is reached":
        if datetime.now() >= product.next_try:
            product.next_try += timedelta(minutes=wait)
            try:
                # NOTE(review): pop() mutates kwargs, so a caller-supplied
                # dl_url_params only applies to the first try of this loop;
                # later retries fall back to the provider configuration
                params = kwargs.pop("dl_url_params", None) or getattr(
                    self.config, "dl_url_params", {}
                )
                with requests.get(
                    url,
                    stream=True,
                    auth=auth,
                    params=params,
                ) as stream:
                    try:
                        stream.raise_for_status()
                    except HTTPError as e:
                        # check if error is identified as auth_error in provider conf
                        auth_errors = getattr(
                            self.config, "auth_error_code", [None]
                        )
                        if not isinstance(auth_errors, list):
                            auth_errors = [auth_errors]
                        if e.response.status_code in auth_errors:
                            raise AuthenticationError(
                                "HTTP Error %s returned, %s\nPlease check your credentials for %s"
                                % (
                                    e.response.status_code,
                                    e.response.text.strip(),
                                    self.provider,
                                )
                            )
                        # product not available
                        elif (
                            product.properties.get("storageStatus", ONLINE_STATUS)
                            != ONLINE_STATUS
                        ):
                            # prefer the order response text when the error
                            # body is empty, it is usually more informative
                            msg = (
                                ordered_message
                                if ordered_message and not e.response.text
                                else e.response.text
                            )
                            raise NotAvailableError(
                                "%s(initially %s) requested, returned: %s"
                                % (
                                    product.properties["title"],
                                    product.properties["storageStatus"],
                                    msg,
                                )
                            )
                        else:
                            import traceback as tb

                            logger.error(
                                "Error while getting resource :\n%s",
                                tb.format_exc(),
                            )
                    else:
                        stream_size = int(stream.headers.get("content-length", 0))
                        # an empty body on a not-ONLINE product means the
                        # order is still being processed server side
                        if (
                            stream_size == 0
                            and "storageStatus" in product.properties
                            and product.properties["storageStatus"] != ONLINE_STATUS
                        ):
                            raise NotAvailableError(
                                "%s(initially %s) ordered, got: %s"
                                % (
                                    product.properties["title"],
                                    product.properties["storageStatus"],
                                    stream.reason,
                                )
                            )
                        progress_callback.max_size = stream_size
                        progress_callback.reset()
                        with open(fs_path, "wb") as fhandle:
                            for chunk in stream.iter_content(chunk_size=64 * 1024):
                                if chunk:
                                    fhandle.write(chunk)
                                    progress_callback(len(chunk), stream_size)
                        with open(record_filename, "w") as fh:
                            fh.write(url)
                        logger.debug("Download recorded in %s", record_filename)

                        # Check that the downloaded file is really a zip file
                        if not zipfile.is_zipfile(fs_path):
                            logger.warning(
                                "Downloaded product is not a Zip File. Please check its file type before using it"
                            )
                            new_fs_path = fs_path[: fs_path.index(".zip")]
                            shutil.move(fs_path, new_fs_path)
                            return new_fs_path
                        return self._finalize(fs_path, **kwargs)
            except NotAvailableError as e:
                if not getattr(self.config, "order_enabled", False):
                    raise NotAvailableError(
                        "Product is not available for download and order is not supported for %s, %s"
                        % (self.provider, e)
                    )
                # order is enabled: remember why it failed and keep retrying
                not_available_info = e
                pass

        if datetime.now() < product.next_try and datetime.now() < stop_time:
            wait_seconds = (product.next_try - datetime.now()).seconds
            retry_count += 1
            retry_info = (
                "[Retry #%s] Waiting %ss until next download try (retry every %s' for %s')"
                % (retry_count, wait_seconds, wait, timeout)
            )
            logger.debug(not_available_info)
            # Retry-After info from Response header
            # NOTE(review): stream is the (now closed) response of the try
            # that just failed; response headers remain readable after the
            # with-block exits
            retry_server_info = stream.headers.get("Retry-After", "")
            if retry_server_info:
                logger.debug(
                    "[%s response] Retry-After: %s"
                    % (self.provider, retry_server_info)
                )
            logger.info(retry_info)
            nb_info.display_html(retry_info)
            sleep(wait_seconds + 1)
        elif datetime.now() >= stop_time and timeout > 0:
            if "storageStatus" not in product.properties:
                product.properties["storageStatus"] = "N/A status"
            logger.info(not_available_info)
            raise NotAvailableError(
                "%s is not available (%s) and could not be downloaded, timeout reached"
                % (product.properties["title"], product.properties["storageStatus"])
            )
        elif datetime.now() >= stop_time:
            raise NotAvailableError(not_available_info)
def _download_assets(
    self,
    product,
    fs_dir_path,
    record_filename,
    auth=None,
    progress_callback=None,
    **kwargs
):
    """Download product assets if they exist

    :param product: EO product whose ``assets`` hrefs will be downloaded
    :param fs_dir_path: destination directory (created if needed)
    :type fs_dir_path: str
    :param record_filename: path of the download record file to write
    :type record_filename: str
    :param auth: (optional) authentication forwarded to every HTTP request
    :param progress_callback: (optional) progress callback; created if None
    :returns: path of the directory containing the downloaded assets
    :rtype: str
    :raises NotAvailableError: if the product exposes no asset href
    :raises AuthenticationError: on an HTTP status listed in the
        provider's ``auth_error_code`` configuration
    :raises HTTPError: if no asset at all could be downloaded
    """
    assets_urls = [
        a["href"] for a in getattr(product, "assets", {}).values() if "href" in a
    ]
    if not assets_urls:
        raise NotAvailableError("No assets available for %s" % product)

    # robustness: progress_callback defaults to None but is dereferenced below
    if progress_callback is None:
        progress_callback = get_progress_callback()

    # remove existing incomplete file
    if os.path.isfile(fs_dir_path):
        os.remove(fs_dir_path)
    # create product dest dir
    if not os.path.isdir(fs_dir_path):
        os.makedirs(fs_dir_path)

    # product conf overrides provider conf for "flatten_top_dirs"
    product_conf = getattr(self.config, "products", {}).get(
        product.product_type, {}
    )
    flatten_top_dirs = product_conf.get(
        "flatten_top_dirs", getattr(self.config, "flatten_top_dirs", False)
    )

    # total size of all assets (HEAD requests), for the progress bar
    total_size = sum(
        int(requests.head(asset_url, auth=auth).headers.get("Content-length", 0))
        for asset_url in assets_urls
    )
    progress_callback.max_size = total_size
    progress_callback.reset()

    # fix: resolve download params once, before the loop. Popping from
    # kwargs inside the loop applied caller-supplied dl_url_params to the
    # first asset only (the pop mutates kwargs); this also matches how the
    # sibling download() resolves its params
    params = kwargs.pop("dl_url_params", None) or getattr(
        self.config, "dl_url_params", {}
    )

    error_messages = set()
    for asset_url in assets_urls:
        with requests.get(
            asset_url,
            stream=True,
            auth=auth,
            params=params,
        ) as stream:
            try:
                stream.raise_for_status()
            except HTTPError as e:
                # check if error is identified as auth_error in provider conf
                auth_errors = getattr(self.config, "auth_error_code", [None])
                if not isinstance(auth_errors, list):
                    auth_errors = [auth_errors]
                if e.response.status_code in auth_errors:
                    raise AuthenticationError(
                        "HTTP Error %s returned, %s\nPlease check your credentials for %s"
                        % (
                            e.response.status_code,
                            e.response.text.strip(),
                            self.provider,
                        )
                    )
                else:
                    # best effort: log, remember the error, keep downloading
                    logger.warning("Unexpected error: %s" % e)
                    logger.warning("Skipping %s" % asset_url)
                    error_messages.add(str(e))
            else:
                # rebuild asset path relative to the product location
                asset_rel_path = (
                    asset_url.replace(product.location, "")
                    .replace("https://", "")
                    .replace("http://", "")
                )
                asset_abs_path = os.path.join(fs_dir_path, asset_rel_path)
                asset_abs_path_dir = os.path.dirname(asset_abs_path)
                if not os.path.isdir(asset_abs_path_dir):
                    os.makedirs(asset_abs_path_dir)

                if not os.path.isfile(asset_abs_path):
                    with open(asset_abs_path, "wb") as fhandle:
                        for chunk in stream.iter_content(chunk_size=64 * 1024):
                            if chunk:
                                fhandle.write(chunk)
                                progress_callback(len(chunk))

    # could not download any file
    if len(os.listdir(fs_dir_path)) == 0:
        raise HTTPError(", ".join(error_messages))

    # flatten directory structure: move the contents of the first non-empty
    # directory up to fs_dir_path
    if flatten_top_dirs:
        tmp_product_local_path = "%s-tmp" % fs_dir_path
        for d, dirs, files in os.walk(fs_dir_path):
            if len(files) != 0:
                shutil.copytree(d, tmp_product_local_path)
                shutil.rmtree(fs_dir_path)
                os.rename(tmp_product_local_path, fs_dir_path)
                break

    # save hash/record file
    with open(record_filename, "w") as fh:
        fh.write(product.remote_location)
    logger.debug("Download recorded in %s", record_filename)

    return fs_dir_path
def download(self, product, auth=None, progress_callback=None, **kwargs):
    """Download data from USGS catalogues

    :param product: the EO product to download
    :param auth: (optional) authentication forwarded to the HTTP request
    :param progress_callback: (optional) progress callback; created if None
    :returns: local path of the downloaded product, or None when no
        download url could be determined
    :rtype: str
    :raises NotAvailableError: if the remote resource returned a 404
    """
    url = product.remote_location
    if not url:
        logger.debug("Unable to get download url for %s, skipping download", product)
        return
    logger.info("Download url: %s", url)

    # robustness fix: progress_callback was dereferenced below without a
    # None-guard, unlike the other download methods of this file
    if progress_callback is None:
        progress_callback = get_progress_callback()

    filename = product.properties["title"] + ".tar.bz"
    local_file_path = os.path.join(self.config.outputs_prefix, filename)
    download_records = os.path.join(self.config.outputs_prefix, ".downloaded")
    if not os.path.exists(download_records):
        os.makedirs(download_records)
    url_hash = hashlib.md5(url.encode("utf-8")).hexdigest()
    record_filename = os.path.join(download_records, url_hash)
    if os.path.isfile(record_filename) and os.path.isfile(local_file_path):
        logger.info("Product already downloaded. Retrieve it at %s", local_file_path)
        return local_file_path
    # Remove the record file if local_file_path is absent (e.g. it was deleted
    # while record wasn't)
    elif os.path.isfile(record_filename):
        logger.debug("Record file found (%s) but not the actual file", record_filename)
        logger.debug("Removing record file : %s", record_filename)
        os.remove(record_filename)

    # NOTE(review): verify=False disables TLS certificate checking; kept
    # as-is to preserve behaviour, but this should be revisited
    with requests.get(
        url,
        stream=True,
        auth=auth,
        params=getattr(self.config, "dl_url_params", {}),
        verify=False,
        # echoes the final (possibly redirected) url
        hooks={"response": lambda r, *args, **kwargs: print("\n", r.url)},
    ) as stream:
        stream_size = int(stream.headers.get("content-length", 0))
        with open(local_file_path, "wb") as fhandle:
            for chunk in stream.iter_content(chunk_size=64 * 1024):
                if chunk:
                    fhandle.write(chunk)
                    progress_callback(len(chunk), stream_size)
        try:
            stream.raise_for_status()
        except HTTPError as e:
            if e.response.status_code == 404:
                raise NotAvailableError(
                    "%s not available, request returned: %s"
                    % (product.properties["title"], e)
                )
            else:
                import traceback

                logger.error(
                    "Error while getting resource : %s", traceback.format_exc()
                )
        else:
            with open(record_filename, "w") as fh:
                fh.write(url)
            logger.debug("Download recorded in %s", record_filename)

            if self.config.extract and zipfile.is_zipfile(local_file_path):
                logger.info("Extraction activated")
                with zipfile.ZipFile(local_file_path, "r") as zfile:
                    fileinfos = zfile.infolist()
                    with tqdm(
                        fileinfos,
                        unit="file",
                        desc="Extracting files from {}".format(local_file_path),
                    ) as progressbar:
                        for fileinfo in progressbar:
                            zfile.extract(
                                fileinfo,
                                # fix: config is attribute-accessed everywhere
                                # else in this method, not subscripted
                                path=self.config.outputs_prefix,
                            )
                return local_file_path[: local_file_path.index(".tar.bz")]
            else:
                return local_file_path
def download(self, product, auth=None, progress_callback=None, **kwargs):
    """Download data from USGS catalogues

    :param product: the EO product to download
    :param auth: (optional) unused here; the USGS api logs in with the
        credentials from the plugin configuration
    :param progress_callback: (optional) progress callback; created if None
    :returns: local path of the downloaded product
    :rtype: str
    :raises AuthenticationError: if USGS login fails
    :raises NotAvailableError: if no downloadable USGS product matches
    """
    fs_path, record_filename = self._prepare_download(
        product, outputs_extension=".tar.gz", **kwargs
    )
    if not fs_path or not record_filename:
        return fs_path
    # progress bar init
    if progress_callback is None:
        progress_callback = get_progress_callback()
    progress_callback.desc = product.properties.get("id", "")
    progress_callback.position = 1

    try:
        api.login(
            self.config.credentials["username"],
            self.config.credentials["password"],
            save=True,
        )
    except USGSError:
        raise AuthenticationError("Please check your USGS credentials.") from None

    # fix: logout was only reached on the success path, leaking the USGS
    # session whenever one of the NotAvailableError branches below raised
    try:
        download_options = api.download_options(
            product.properties["productType"], product.properties["id"]
        )
        try:
            product_ids = [
                p["id"]
                for p in download_options["data"]
                if p["downloadSystem"] == "dds"
            ]
        except KeyError as e:
            raise NotAvailableError(
                "%s not found in %s's products" % (e, product.properties["id"])
            )
        if not product_ids:
            raise NotAvailableError(
                "No USGS products found for %s" % product.properties["id"]
            )

        req_urls = []
        for product_id in product_ids:
            download_request = api.download_request(
                product.properties["productType"],
                product.properties["id"],
                product_id,
            )
            try:
                req_urls.extend(
                    [x["url"] for x in download_request["data"]["preparingDownloads"]]
                )
            except KeyError as e:
                raise NotAvailableError(
                    "%s not found in %s download_request"
                    % (e, product.properties["id"])
                )

        if len(req_urls) > 1:
            logger.warning(
                "%s usgs products found for %s. Only first will be downloaded"
                % (len(req_urls), product.properties["id"])
            )
        elif not req_urls:
            raise NotAvailableError(
                "No usgs request url was found for %s" % product.properties["id"]
            )

        req_url = req_urls[0]
        progress_callback.reset()
        with requests.get(req_url, stream=True) as stream:
            try:
                stream.raise_for_status()
            except HTTPError:
                import traceback as tb

                logger.error(
                    "Error while getting resource :\n%s", tb.format_exc()
                )
            else:
                stream_size = int(stream.headers.get("content-length", 0))
                progress_callback.max_size = stream_size
                progress_callback.reset()
                with open(fs_path, "wb") as fhandle:
                    for chunk in stream.iter_content(chunk_size=64 * 1024):
                        if chunk:
                            fhandle.write(chunk)
                            progress_callback(len(chunk), stream_size)
                with open(record_filename, "w") as fh:
                    fh.write(product.properties["downloadLink"])
                logger.debug("Download recorded in %s", record_filename)
    finally:
        # always close the USGS session, even when a branch above raised
        api.logout()

    # Check that the downloaded file is really a tar file
    if not tarfile.is_tarfile(fs_path):
        logger.warning(
            "Downloaded product is not a tar File. Please check its file type before using it"
        )
        new_fs_path = fs_path[: fs_path.index(".tar.gz")]
        shutil.move(fs_path, new_fs_path)
        return new_fs_path
    return self._finalize(fs_path, outputs_extension=".tar.gz", **kwargs)
def download(self, product, auth=None, progress_callback=None, **kwargs):
    """Download method for S3 REST API.

    :param product: The EO product to download
    :type product: :class:`~eodag.api.product.EOProduct`
    :param auth: (optional) The configuration of a plugin of type Authentication
    :type auth: :class:`~eodag.config.PluginConfig`
    :param progress_callback: (optional) A method or a callable object
                              which takes a current size and a maximum
                              size as inputs and handle progress bar
                              creation and update to give the user a
                              feedback on the download progress
    :type progress_callback: :class:`~eodag.utils.ProgressCallback` or None
    :return: The absolute path to the downloaded product in the local filesystem
    :rtype: str
    :raises NotAvailableError: if the product is OFFLINE
    :raises AuthenticationError: on an HTTP status listed in the
        provider's ``auth_error_code`` configuration
    :raises DownloadError: if the bucket listing could not be parsed
    """
    # get bucket urls
    bucket_name, prefix = self.get_bucket_name_and_prefix(product)

    if (
        bucket_name is None
        and "storageStatus" in product.properties
        and product.properties["storageStatus"] == OFFLINE_STATUS
    ):
        raise NotAvailableError(
            "%s is not available for download on %s (status = %s)"
            % (
                product.properties["title"],
                self.provider,
                product.properties["storageStatus"],
            )
        )

    bucket_url = urljoin(
        product.downloader.config.base_uri.strip("/") + "/", bucket_name
    )
    nodes_list_url = bucket_url + "?prefix=" + prefix.strip("/")

    # get nodes/files list contained in the bucket
    logger.debug("Retrieving product content from %s", nodes_list_url)
    bucket_contents = requests.get(nodes_list_url, auth=auth)
    try:
        bucket_contents.raise_for_status()
    except requests.HTTPError as err:
        # check if error is identified as auth_error in provider conf
        auth_errors = getattr(self.config, "auth_error_code", [None])
        if not isinstance(auth_errors, list):
            auth_errors = [auth_errors]
        if err.response.status_code in auth_errors:
            raise AuthenticationError(
                "HTTP Error %s returned, %s\nPlease check your credentials for %s"
                % (
                    err.response.status_code,
                    err.response.text.strip(),
                    self.provider,
                )
            )
        # other error
        else:
            logger.exception(
                "Could not get content from %s (provider:%s, plugin:%s)\n%s",
                nodes_list_url,
                self.provider,
                self.__class__.__name__,
                bucket_contents.text,
            )
            raise RequestError(str(err))
    try:
        xmldoc = minidom.parseString(bucket_contents.text)
    except ExpatError as err:
        logger.exception("Could not parse xml data from %s", bucket_contents)
        raise DownloadError(str(err))
    nodes_xml_list = xmldoc.getElementsByTagName("Contents")

    if len(nodes_xml_list) == 0:
        logger.warning("Could not load any content from %s", nodes_list_url)
    elif len(nodes_xml_list) == 1:
        # single file download: delegate to the plain HTTP plugin
        product.remote_location = urljoin(
            bucket_url.strip("/") + "/", prefix.strip("/")
        )
        return HTTPDownload(self.provider, self.config).download(
            product=product,
            auth=auth,
            progress_callback=progress_callback,
            **kwargs
        )

    # robustness: this multi-file path writes the files itself and
    # dereferences the callback below
    if progress_callback is None:
        progress_callback = get_progress_callback()

    # destination product path
    # fix: the kwarg was popped as "ouputs_prefix" (typo), so a
    # caller-supplied outputs_prefix was silently ignored
    outputs_prefix = kwargs.pop("outputs_prefix", None) or self.config.outputs_prefix
    abs_outputs_prefix = os.path.abspath(outputs_prefix)
    product_local_path = os.path.join(abs_outputs_prefix, prefix.split("/")[-1])

    # .downloaded cache record directory
    download_records_dir = os.path.join(abs_outputs_prefix, ".downloaded")
    try:
        os.makedirs(download_records_dir)
    except OSError as exc:
        import errno

        if exc.errno != errno.EEXIST:  # Skip error if dir exists
            import traceback as tb

            logger.warning(
                "Unable to create records directory. Got:\n%s", tb.format_exc()
            )

    # check if product has already been downloaded
    url_hash = hashlib.md5(product.remote_location.encode("utf-8")).hexdigest()
    record_filename = os.path.join(download_records_dir, url_hash)
    if os.path.isfile(record_filename) and os.path.exists(product_local_path):
        return product_local_path
    # Remove the record file if product_local_path is absent (e.g. it was
    # deleted while record wasn't)
    elif os.path.isfile(record_filename):
        logger.debug("Record file found (%s) but not the actual file", record_filename)
        logger.debug("Removing record file : %s", record_filename)
        os.remove(record_filename)

    # total size for progress_callback
    total_size = sum(
        int(node.firstChild.nodeValue)
        for node in xmldoc.getElementsByTagName("Size")
    )

    # download each node key
    for node_xml in nodes_xml_list:
        node_key = node_xml.getElementsByTagName("Key")[0].firstChild.nodeValue
        # As "Key", "Size" and "ETag" (md5 hash) can also be retrieved from node_xml
        node_url = urljoin(bucket_url.strip("/") + "/", node_key.strip("/"))
        # output file location
        # fix: honour the resolved outputs_prefix (previously hardcoded to
        # self.config.outputs_prefix, ignoring the kwarg override)
        local_filename = os.path.join(
            outputs_prefix, "/".join(node_key.split("/")[6:])
        )
        local_filename_dir = os.path.dirname(os.path.realpath(local_filename))
        if not os.path.isdir(local_filename_dir):
            os.makedirs(local_filename_dir)

        with requests.get(node_url, stream=True, auth=auth) as stream:
            try:
                stream.raise_for_status()
            except HTTPError:
                import traceback as tb

                logger.error(
                    "Error while getting resource :\n%s", tb.format_exc()
                )
            else:
                with open(local_filename, "wb") as fhandle:
                    for chunk in stream.iter_content(chunk_size=64 * 1024):
                        if chunk:
                            fhandle.write(chunk)
                            progress_callback(len(chunk), total_size)
        # TODO: check md5 hash ?

    with open(record_filename, "w") as fh:
        fh.write(product.remote_location)
    logger.debug("Download recorded in %s", record_filename)

    return product_local_path