Exemplo n.º 1
0
 def _download_and_store_zip_file(self, zip_url, zip_filepath):
     """
     Download the ZIP archive from `zip_url` and save it as `zip_filepath`.

     The response body is written to the file chunk by chunk, as
     yielded by iterating over the request performer.  If the
     performer provides a truthy `Last-Modified` value, its `str()`
     form is remembered in `self._archive_http_last_modified`;
     otherwise that attribute is left untouched.

     Args:
         zip_url: the URL the archive is requested from (with `GET`).
         zip_filepath: path of the target file (opened in the `'wb'`
             mode, so any existing content is replaced).
     """
     with RequestPerformer(method='GET', url=zip_url, retries=self.HTTP_RETRIES) as perf:
         # Look the header up only once, instead of once for the
         # check and once more for the stored value.
         last_modified = perf.get_dt_header('Last-Modified')
         if last_modified:
             self._archive_http_last_modified = str(last_modified)
         with open(zip_filepath, 'wb') as f:
             for chunk in perf:
                 f.write(chunk)
     LOGGER.debug('ZIP archive downloaded from %r and stored '
                  'in the %r file', zip_url, zip_filepath)
Exemplo n.º 2
0
 def _scrap_all_zip_filenames(self):
     """
     Collect the ZIP file names linked from the configured listing page.

     The page is fetched (with `GET`) from `self.config['zip_files_url']`
     and parsed with the `html.parser` parser; then the `href` values
     of its `<a>` elements are examined.

     Returns:
         A list of those `href` values that end with
         `self.ARCHIVE_FILE_EXTENSION`.
     """
     fetched_page = RequestPerformer.fetch(method='GET',
                                           url=self.config['zip_files_url'],
                                           retries=self.HTTP_RETRIES)
     zips_page = BeautifulSoup(fetched_page, 'html.parser')
     all_zip_names = []
     for anchor in zips_page.find_all('a'):
         href = anchor.get('href')
         # An `<a>` element without the `href` attribute gives `None`
         # here; skip it instead of failing on `None.endswith(...)`.
         if href is not None and href.endswith(self.ARCHIVE_FILE_EXTENSION):
             all_zip_names.append(href)
     LOGGER.debug('All scrapped ZIP filenames: %r', all_zip_names)
     return all_zip_names
Exemplo n.º 3
0
 def _get_payload_info(self, payload_filename):
     """
     Ask the configured API about the payload named `payload_filename`.

     The name is sent (with `POST`) as the `md5_hash` form field to
     `self.config['api_url']`, and whatever the fetch gives back is
     logged and returned as-is.

     Note: an invalid MD5 hash, or a missing file for a given hash,
     does *not* cause an exception or an HTTP-level error; instead, a
     normal (HTTP-200) response is obtained, though its body contains
     info about the error.  So such cases are handled later (at the
     publishing stage).
     """
     endpoint = self.config['api_url']
     fetched_info = RequestPerformer.fetch(
         method='POST',
         url=endpoint,
         data={'md5_hash': payload_filename},
         retries=self.HTTP_RETRIES)
     LOGGER.debug('Payload info for payload filename %r downloaded from '
                  '%r:\n%r', payload_filename, endpoint, fetched_info)
     return fetched_info
Exemplo n.º 4
0
 def download(self,
              url,
              method='GET',
              retries=None,
              custom_request_headers=None,
              **rest_performer_constructor_kwargs):
     """
     Perform an HTTP request and return the content of the response.

     As side effects, the response object is kept in
     `self._http_response`, and the value obtained for the
     `Last-Modified` header in `self._http_last_modified`.

     Args:
         url: the URL to be requested.
         method: the HTTP method (default: `'GET'`).
         retries: passed through `self._get_request_retries()` to
             obtain the effective value.
         custom_request_headers: passed through
             `self._get_request_headers()` to obtain the effective
             request headers.
         **rest_performer_constructor_kwargs: any other keyword
             arguments, forwarded to the `RequestPerformer` constructor.

     Returns:
         The `content` attribute of the performer's response.
     """
     effective_retries = self._get_request_retries(retries)
     effective_headers = self._get_request_headers(custom_request_headers)
     performer_cm = RequestPerformer(
         method=method,
         url=url,
         retries=effective_retries,
         headers=effective_headers,
         **rest_performer_constructor_kwargs)
     with performer_cm as performer:
         self._http_response = performer.response
         self._http_last_modified = performer.get_dt_header('Last-Modified')
         # Returned from within the `with` block, so the content is
         # read before the performer's context is exited.
         return performer.response.content
Exemplo n.º 5
0
 def request_performer(self,
                       url: str,
                       method: str = 'GET',
                       retries: Optional[int] = None,
                       custom_request_headers: Optional[dict] = None,
                       **rest_performer_constructor_kwargs):
     """
     Yield (once) an entered `RequestPerformer` for the given request.

     Before yielding, the performer's response is kept in
     `self._http_response`, and the value obtained for the
     `Last-Modified` header in `self._http_last_modified`.  The
     performer's context is exited when the generator is resumed (or
     closed) after the yield.

     NOTE(review): this generator function is presumably wrapped by a
     context-manager decorator that is not visible here -- confirm.

     Args:
         url: the URL to be requested.
         method: the HTTP method (default: `'GET'`).
         retries: passed through `self._get_request_retries()` to
             obtain the effective value.
         custom_request_headers: passed through
             `self._get_request_headers()` to obtain the effective
             request headers.
         **rest_performer_constructor_kwargs: any other keyword
             arguments, forwarded to the `RequestPerformer` constructor.
     """
     effective_retries = self._get_request_retries(retries)
     effective_headers = self._get_request_headers(custom_request_headers)
     performer_cm = RequestPerformer(
         method=method,
         url=url,
         retries=effective_retries,
         headers=effective_headers,
         **rest_performer_constructor_kwargs)
     with performer_cm as performer:
         self._http_response = performer.response
         self._http_last_modified = performer.get_dt_header('Last-Modified')
         yield performer
Exemplo n.º 6
0
 def _fetch_url_info_from_api(self, url_id):
     """
     Fetch, from the configured API, the info for the given `url_id`.

     The identifier is sent (with `POST`) as the `urlid` form field to
     `self.config['api_url']`, with `self.config['api_retries']` as
     the number of retries; the fetch result is returned unchanged.
     """
     api_url = self.config['api_url']
     form_data = {'urlid': url_id}
     api_retries = self.config['api_retries']
     return RequestPerformer.fetch(
         method='POST',
         url=api_url,
         data=form_data,
         retries=api_retries)