Пример #1
0
 def downloadFiles(self, input_manifest):
     """Download every file listed in a line-delimited JSON manifest.

     Each manifest line is parsed as one JSON object. The object must
     carry a ``url`` and may carry a ``filename``; all of its keys are also
     made available (on top of ``self.envars``) when formatting
     ``self.sub_path`` into the destination sub-directory.

     :param input_manifest: path of the manifest file to read. The file is
         removed before this method returns or raises.
     :return: list with one output path per downloaded file, in manifest
         order (``subdir/filename``, or just ``filename`` when the
         sub-directory is empty).
     :raises RuntimeError: if an entry has no ``url``, or if the size of the
         downloaded file differs from the ``Content-Length`` response header.
     """
     logging.info("Retrieving file(s)...")
     try:
         with open(input_manifest, "r") as in_file:
             file_list = list()
             for line in in_file:
                 entry = json.loads(line)
                 url = entry.get('url')
                 if not url:
                     raise RuntimeError(
                         "Missing required attribute \"url\" in download manifest entry %s" % json.dumps(entry))
                 # A truthy store selects the store-based transfer below
                 # (presumably Hatrac-hosted objects -- confirm against getHatracStore).
                 store = self.getHatracStore(url)
                 filename = entry.get('filename')
                 envvars = self.envars.copy()
                 envvars.update(entry)
                 subdir = self.sub_path.format(**envvars)
                 if not filename:
                     if store:
                         head = store.head(url, headers=self.HEADERS)
                         content_disposition = head.headers.get("Content-Disposition") if head.ok else None
                         if content_disposition:
                             filename = parse_content_disposition(content_disposition)
                         else:
                             # No filename in the manifest or the response headers:
                             # fall back to the last URL segment, dropping anything
                             # after the first ":" (presumably an object version
                             # suffix). This must be derived from the URL -- the
                             # filename itself is empty on this path.
                             filename = os.path.basename(url).split(":")[0]
                     else:
                         filename = os.path.basename(url)
                 file_path = os.path.abspath(os.path.join(
                     self.base_path, 'data' if self.is_bag else '', subdir, filename))
                 output_dir = os.path.dirname(file_path)
                 self.makeDirs(output_dir)
                 if store:
                     # The store is addressed with the original URL; only the
                     # recorded provenance URL is externalized afterwards.
                     resp = store.get_obj(url, self.HEADERS, file_path)
                     url = self.getExternalUrl(url)
                 else:
                     url = self.getExternalUrl(url)
                     file_path, resp = self.getExternalFile(url, file_path, self.HEADERS)
                 length = int(resp.headers.get('Content-Length'))
                 content_type = resp.headers.get("Content-Type")
                 file_bytes = os.path.getsize(file_path)
                 if length != file_bytes:
                     # Report the on-disk size first and the advertised size second,
                     # matching the wording of the message.
                     raise RuntimeError(
                         "File size of %s does not match expected size of %s for file %s" %
                         (file_bytes, length, file_path))
                 output_path = ''.join([subdir, "/", filename]) if subdir else filename
                 if self.ro_manifest:
                     ro.add_file_metadata(self.ro_manifest,
                                          source_url=url,
                                          local_path=output_path,
                                          media_type=content_type,
                                          retrieved_on=ro.make_retrieved_on(),
                                          retrieved_by=ro.make_retrieved_by(
                                              self.ro_author_name, orcid=self.ro_author_orcid),
                                          bundled_as=ro.make_bundled_as())
                 file_list.append(output_path)
             return file_list
     finally:
         # The manifest is single-use: remove it whether or not the downloads succeeded.
         os.remove(input_manifest)
Пример #2
0
    def process(self):
        """Run the catalog query and register its output file.

        Issues ``self.catalogQuery`` with an ``accept`` header set to
        ``self.content_type``, optionally records file provenance in the
        research-object manifest (when both ``self.ro_manifest`` and
        ``self.ro_file_provenance`` are truthy), and adds the output file to
        ``self.outputs``.

        :return: ``self.outputs``, updated with an entry keyed by
            ``self.output_relpath`` that holds the absolute local path and
            the source URL.
        """
        # Work on a copy: HEADERS is shared state, and updating it in place
        # would leak this request's 'accept' header into every later request
        # that reads self.HEADERS.
        headers = self.HEADERS.copy()
        headers.update({'accept': self.content_type})
        self.catalogQuery(headers)

        if self.ro_manifest and self.ro_file_provenance:
            ro.add_file_metadata(self.ro_manifest,
                                 source_url=self.url,
                                 local_path=self.output_relpath,
                                 media_type=self.content_type,
                                 retrieved_on=ro.make_retrieved_on(),
                                 retrieved_by=ro.make_retrieved_by(self.ro_author_name, orcid=self.ro_author_orcid),
                                 bundled_as=ro.make_bundled_as())

        self.outputs.update({self.output_relpath: {LOCAL_PATH_KEY: self.output_abspath, SOURCE_URL_KEY: self.url}})
        return self.outputs
Пример #3
0
    def process(self):
        """Record provenance for the output file and register it as an output.

        When both ``self.ro_manifest`` and ``self.ro_file_provenance`` are
        truthy, a file-metadata record is added to the research-object
        manifest, with the media type guessed from ``self.output_abspath``.
        When ``self.delete_input`` is truthy, ``self._delete_input()`` is
        called afterwards.

        :return: ``self.outputs``, updated with an entry keyed by
            ``self.output_relpath`` that holds the absolute local path and
            the source URL.
        """
        wants_provenance = self.ro_manifest and self.ro_file_provenance
        if wants_provenance:
            # Helper calls are made in the same order as the keyword
            # arguments they feed.
            media_type = guess_content_type(self.output_abspath)
            retrieved_on = ro.make_retrieved_on()
            retrieved_by = ro.make_retrieved_by(
                self.ro_author_name, orcid=self.ro_author_orcid)
            bundled_as = ro.make_bundled_as()
            ro.add_file_metadata(self.ro_manifest,
                                 source_url=self.url,
                                 local_path=self.output_relpath,
                                 media_type=media_type,
                                 retrieved_on=retrieved_on,
                                 retrieved_by=retrieved_by,
                                 bundled_as=bundled_as)

        if self.delete_input:
            self._delete_input()

        output_entry = {LOCAL_PATH_KEY: self.output_abspath, SOURCE_URL_KEY: self.url}
        self.outputs.update({self.output_relpath: output_entry})
        return self.outputs
 def downloadFiles(self, input_manifest):
     """Download every file listed in a line-delimited JSON manifest.

     Each manifest line is parsed as one JSON object. Entries without a
     ``url`` are logged and skipped. An entry may carry a ``filename``; all
     of its keys are also made available (on top of ``self.envars``) when
     formatting ``self.sub_path`` into the destination sub-directory.

     :param input_manifest: path of the manifest file to read. The file is
         removed before this method returns or raises.
     :return: dict keyed by output path (``subdir/filename``, or just
         ``filename`` when the sub-directory is empty); each value is a dict
         holding the absolute local file path under ``LOCAL_PATH_KEY``.
     :raises DerivaDownloadError: if a store HEAD request or transfer fails
         with ``requests.HTTPError``, or if the size of the downloaded file
         differs from the ``Content-Length`` response header.
     """
     logging.info(
         "Attempting to download file(s) based on the results of query: %s"
         % self.query)
     try:
         with open(input_manifest, "r") as in_file:
             file_list = dict()
             for line in in_file:
                 entry = json.loads(line)
                 url = entry.get('url')
                 if not url:
                     logging.warning(
                         "Skipping download due to missing required attribute \"url\" in download manifest entry %s"
                         % json.dumps(entry))
                     continue
                 # A truthy store selects the store-based transfer below
                 # (presumably Hatrac-hosted objects -- confirm against getHatracStore).
                 store = self.getHatracStore(url)
                 filename = entry.get('filename')
                 envvars = self.envars.copy()
                 envvars.update(entry)
                 subdir = self.sub_path.format(**envvars)
                 if not filename:
                     if store:
                         try:
                             head = store.head(url, headers=self.HEADERS)
                         except requests.HTTPError as e:
                             raise DerivaDownloadError(
                                 "HEAD request for [%s] failed: %s" %
                                 (url, e))
                         content_disposition = head.headers.get(
                             "Content-Disposition") if head.ok else None
                         if content_disposition:
                             filename = parse_content_disposition(
                                 content_disposition)
                         else:
                             # No filename in the manifest or the response headers:
                             # fall back to the last URL segment, dropping anything
                             # after the first ":" (presumably an object version
                             # suffix). This must be derived from the URL -- the
                             # filename itself is empty on this path.
                             filename = os.path.basename(url).split(":")[0]
                     else:
                         filename = os.path.basename(url)
                 file_path = os.path.abspath(
                     os.path.join(self.base_path,
                                  'data' if self.is_bag else '', subdir,
                                  filename))
                 output_dir = os.path.dirname(file_path)
                 make_dirs(output_dir)
                 if store:
                     try:
                         # The store is addressed with the original URL; only the
                         # recorded provenance URL is externalized afterwards.
                         resp = store.get_obj(url, self.HEADERS, file_path)
                     except requests.HTTPError as e:
                         raise DerivaDownloadError(
                             "File [%s] transfer failed: %s" %
                             (file_path, e))
                     url = self.getExternalUrl(url)
                 else:
                     url = self.getExternalUrl(url)
                     file_path, resp = self.getExternalFile(
                         url, file_path, self.HEADERS)
                 length = int(resp.headers.get('Content-Length'))
                 content_type = resp.headers.get("Content-Type")
                 file_bytes = os.path.getsize(file_path)
                 if length != file_bytes:
                     # Report the on-disk size first and the advertised size second,
                     # matching the wording of the message.
                     raise DerivaDownloadError(
                         "File size of %s does not match expected size of %s for file %s"
                         % (file_bytes, length, file_path))
                 output_path = ''.join([subdir, "/", filename
                                        ]) if subdir else filename
                 if self.ro_manifest:
                     ro.add_file_metadata(
                         self.ro_manifest,
                         source_url=url,
                         local_path=output_path,
                         media_type=content_type,
                         retrieved_on=ro.make_retrieved_on(),
                         retrieved_by=ro.make_retrieved_by(
                             self.ro_author_name,
                             orcid=self.ro_author_orcid),
                         bundled_as=ro.make_bundled_as())
                 file_list.update(
                     {output_path: {
                         LOCAL_PATH_KEY: file_path
                     }})
             return file_list
     finally:
         # The manifest is single-use: remove it whether or not the downloads succeeded.
         os.remove(input_manifest)