Exemplo n.º 1
0
    def _download_files(self, products, base_dir, cache=True, cloud_only=False):
        """
        Download every product in ``products`` below ``base_dir`` and report the outcome per file.

        Each file is written to ``<base_dir>/<obs_collection>/<obs_id>/<productFilename>``.
        When a cloud connection is configured and reports the product as supported, the
        cloud copy is tried first; otherwise the file is fetched from the MAST download URL.

        Parameters
        ----------
        products : `~astropy.table.Table`
            Table containing products to be downloaded. Each row must provide the
            ``obs_collection``, ``obs_id``, ``productFilename`` and ``dataURI`` fields.
        base_dir : str
            Directory in which files will be downloaded.
        cache : bool
            Default is True. If file is found on disk it will not be downloaded again.
        cloud_only : bool, optional
            Default False. If set to True and cloud data access is enabled (see `enable_cloud_dataset`),
            a product whose cloud download fails is skipped rather than downloaded from MAST
            as is the default behavior. Products the cloud connection does not report as supported
            are still downloaded from MAST. If cloud access is not enabled this argument has no effect.

        Returns
        -------
        response : `~astropy.table.Table`
            Manifest with one row per product and the columns ``Local Path``, ``Status``
            (``COMPLETE``, ``SKIPPED`` or ``ERROR``), ``Message`` and ``URL``. ``Message`` and
            ``URL`` are only filled in for ``ERROR`` rows; ``Local Path`` is an empty string for
            ``SKIPPED`` rows.
        """

        manifest_array = []
        for data_product in products:

            product_dir = os.path.join(base_dir, data_product['obs_collection'], data_product['obs_id'])
            data_url = self._portal_api_connection.MAST_DOWNLOAD_URL + "?uri=" + data_product["dataURI"]

            # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
            os.makedirs(product_dir, exist_ok=True)

            local_path = os.path.join(product_dir, data_product['productFilename'])

            status = "COMPLETE"
            msg = None
            url = None

            try:
                if self._cloud_connection is not None and self._cloud_connection.is_supported(data_product):
                    try:
                        self._cloud_connection.download_file(data_product, local_path, cache)
                    except Exception as ex:
                        # Any cloud failure is logged, then either skipped or retried against MAST.
                        log.exception("Error pulling from S3 bucket: {}".format(ex))
                        if cloud_only:
                            log.warning("Skipping file...")
                            local_path = ""
                            status = "SKIPPED"
                        else:
                            log.warning("Falling back to mast download...")
                            self._download_file(data_url, local_path,
                                                cache=cache, head_safe=True, continuation=False)
                else:
                    self._download_file(data_url, local_path,
                                        cache=cache, head_safe=True, continuation=False)

                # check if file exists also this is where would perform md5,
                # and also check the filesize if the database reliably reported file sizes
                if (not os.path.isfile(local_path)) and (status != "SKIPPED"):
                    status = "ERROR"
                    msg = "File was not downloaded"
                    url = data_url

            except HTTPError as err:
                # An HTTP failure for one product must not abort the remaining downloads.
                status = "ERROR"
                msg = "HTTPError: {0}".format(err)
                url = data_url

            manifest_array.append([local_path, status, msg, url])

        manifest = Table(rows=manifest_array, names=('Local Path', 'Status', 'Message', "URL"))

        return manifest
Exemplo n.º 2
0
    def _download_file(self,
                       url,
                       local_filepath,
                       timeout=None,
                       auth=None,
                       continuation=True,
                       cache=False,
                       method="GET",
                       head_safe=False,
                       **kwargs):
        """
        Download a file.  Resembles `astropy.utils.data.download_file` but uses
        the local ``_session``

        Parameters
        ----------
        url : string
            Address to download from.
        local_filepath : string
            Path of the file to write.
        timeout : int
            Passed through to the session request.
        auth : dict or None
            Passed through to the session request.
        continuation : bool
            If the file has already been partially downloaded *and* the server
            supports HTTP "range" requests, the download will be continued
            where it left off.
        cache : bool
            If True and ``local_filepath`` already exists, the existing file is
            reused unless the server reports a different ``content-length``.
        method : "GET" or "POST"
            HTTP method used to retrieve the file body.
        head_safe : bool
            If True, a ``HEAD`` request is issued first to inspect the headers and
            the body is only requested with ``method`` once it is known that data
            has to be written.
        **kwargs
            Additional keyword arguments passed to every session request.

        Returns
        -------
        response or None
            The (closed) response whose body was written to ``local_filepath``, or
            None when an existing local file was reused and nothing was written.

        Raises
        ------
        Whatever ``response.raise_for_status()`` raises for an unsuccessful HTTP
        status.
        """

        def _request(http_method, extra_headers=None):
            # Send one streamed request through the shared session. Headers given here
            # apply to this request only, so the session itself is never modified.
            request_kwargs = dict(kwargs)
            if extra_headers:
                merged_headers = dict(request_kwargs.get('headers') or {})
                merged_headers.update(extra_headers)
                request_kwargs['headers'] = merged_headers
            resp = self._session.request(http_method,
                                         url,
                                         timeout=timeout,
                                         stream=True,
                                         auth=auth,
                                         **request_kwargs)
            try:
                resp.raise_for_status()
            except Exception:
                # Do not leave the streamed connection open when the status is an error.
                resp.close()
                raise
            return resp

        response = _request("HEAD" if head_safe else method)

        # A HEAD response carries no body, so the real request is still pending until
        # it is known that something has to be written to disk.
        body_pending = head_safe
        range_headers = None

        if 'content-length' in response.headers:
            length = int(response.headers['content-length'])
            if length == 0:
                log.warning('URL {0} has length=0'.format(url))
        else:
            length = None

        file_exists = os.path.exists(local_filepath)

        if file_exists and ('Accept-Ranges' in response.headers) and continuation:
            existing_file_length = os.stat(local_filepath).st_size
            if length is not None and existing_file_length >= length:
                # all done!
                log.info(
                    "Found cached file {0} with expected size {1}.".format(
                        local_filepath, existing_file_length))
                response.close()
                return
            elif existing_file_length == 0:
                # Nothing usable on disk: start over with a full download.
                open_mode = 'wb'
            else:
                open_mode = 'ab'
                if length is not None:
                    log.info("Continuing download of file {0}, with {1} bytes to "
                             "go ({2}%)".format(
                                 local_filepath, length - existing_file_length,
                                 (length - existing_file_length) / length * 100))
                else:
                    # Without a content-length the remaining size cannot be computed.
                    log.info("Continuing download of file {0} from byte {1}.".format(
                        local_filepath, existing_file_length))

                # bytes are indexed from 0:
                # https://en.wikipedia.org/wiki/List_of_HTTP_header_fields#range-request-header
                end = "{0}".format(length - 1) if length is not None else ""
                range_headers = {'Range': "bytes={0}-{1}".format(existing_file_length, end)}
                # The remainder always needs its own request, whatever was fetched first.
                body_pending = True

        elif cache and file_exists:
            if length is not None:
                statinfo = os.stat(local_filepath)
                if statinfo.st_size != length:
                    log.warning("Found cached file {0} with size {1} that is "
                                "different from expected size {2}".format(
                                    local_filepath, statinfo.st_size, length))
                    open_mode = 'wb'
                else:
                    log.info(
                        "Found cached file {0} with expected size {1}.".format(
                            local_filepath, statinfo.st_size))
                    response.close()
                    return
            else:
                log.info("Found cached file {0}.".format(local_filepath))
                response.close()
                return
        else:
            open_mode = 'wb'

        if body_pending:
            # Release the header-only (or superseded) response before asking for the data.
            response.close()
            response = _request(method, range_headers)

        blocksize = astropy.utils.data.conf.download_block_size

        bytes_read = 0

        # Only show progress bar if logging level is INFO (numeric level 20) or lower.
        if log.getEffectiveLevel() <= 20:
            progress_stream = None  # Astropy default
        else:
            progress_stream = io.StringIO()

        try:
            with ProgressBarOrSpinner(
                    length,
                    ('Downloading URL {0} to {1} ...'.format(url, local_filepath)),
                    file=progress_stream) as pb:
                with open(local_filepath, open_mode) as f:
                    for block in response.iter_content(blocksize):
                        f.write(block)
                        # Count what was actually received; the last block is usually short.
                        bytes_read += len(block)
                        if length is not None:
                            pb.update(min(bytes_read, length))
                        else:
                            pb.update(bytes_read)
        finally:
            # Close the connection even if writing the file fails part-way.
            response.close()

        return response
Exemplo n.º 3
0
    def download_file(self, uri, local_path=None, base_url=None, cache=True, cloud_only=False):
        """
        Downloads a single file based on the data URI

        Parameters
        ----------
        uri : str
            The product dataURI, e.g. mast:JWST/product/jw00736-o039_t001_miri_ch1-long_x1d.fits
        local_path : str
            Path of the file to write. Defaults to a file named after the last component
            of ``uri`` in the current working directory.
        base_url: str
            A base url to use when downloading.  Default is the MAST Portal API
        cache : bool
            Default is True. If file is found on disk it will not be downloaded again.
        cloud_only : bool, optional
            Default False. If set to True and cloud data access is enabled (see `enable_cloud_dataset`),
            a file whose cloud download fails is skipped rather than downloaded from MAST
            as is the default behavior. A file the cloud connection does not report as supported
            is still downloaded from MAST. If cloud access is not enabled this argument has no effect.

        Returns
        -------
        status: str
            download status message.  Either COMPLETE, SKIPPED, or ERROR.
        msg : str
            An error status message, if any. None unless status is ERROR.
        url : str
            The full url download path. None unless status is ERROR.
        """

        # create the full data URL
        base_url = base_url if base_url else self._portal_api_connection.MAST_DOWNLOAD_URL
        data_url = base_url + "?uri=" + uri

        # create a local file path if none is input.  Use current directory as default.
        if not local_path:
            filename = uri.rsplit('/', 1)[-1]
            local_path = os.path.join(os.path.abspath('.'), filename)

        # recreate the data_product key for cloud connection check; 'dataURI' carries the
        # bare product URI (the same value that is appended after "?uri=" above), not the
        # assembled download URL.
        data_product = {'dataURI': uri}

        status = "COMPLETE"
        msg = None
        url = None

        try:
            if self._cloud_connection is not None and self._cloud_connection.is_supported(data_product):
                try:
                    self._cloud_connection.download_file(data_product, local_path, cache)
                except Exception as ex:
                    # Any cloud failure is logged, then either skipped or retried against MAST.
                    log.exception("Error pulling from S3 bucket: {}".format(ex))
                    if cloud_only:
                        log.warning("Skipping file...")
                        local_path = ""
                        status = "SKIPPED"
                    else:
                        log.warning("Falling back to mast download...")
                        self._download_file(data_url, local_path,
                                            cache=cache, head_safe=True, continuation=False)
            else:
                self._download_file(data_url, local_path,
                                    cache=cache, head_safe=True, continuation=False)

            # check if file exists also this is where would perform md5,
            # and also check the filesize if the database reliably reported file sizes
            if (not os.path.isfile(local_path)) and (status != "SKIPPED"):
                status = "ERROR"
                msg = "File was not downloaded"
                url = data_url

        except HTTPError as err:
            status = "ERROR"
            msg = "HTTPError: {0}".format(err)
            url = data_url

        return status, msg, url