def _get_link(self):
    """Return the download link produced by ``UrlProvider``.

    Copies ``server_url``, ``base_url`` and ``file_name`` from
    ``self.config_web`` into keyword arguments for ``UrlProvider`` and
    returns the value of its ``get_download_link()`` call unchanged.
    """
    web_cfg = self.config_web
    logger.info("STEP: DOWNLOAD - START")
    provider_kwargs = {
        'server_url': web_cfg.server_url,
        'base_url': web_cfg.base_url,
        'file_name': web_cfg.file_name,
    }
    return UrlProvider(**provider_kwargs).get_download_link()
def get_converter(self):
    """Run the conversion step and report whether it succeeded.

    Stores the temp, work and output directories from ``self.config`` in
    ``self.down_dict`` and passes that dict as keyword arguments to
    ``DataConverter``, then calls its ``converter()`` method.

    Returns:
        True when the converter ran without raising, False when any
        exception was raised (the error is logged as a warning).
    """
    logger.info("Starting convert part")
    cfg = self.config
    # (key in down_dict, attribute on config) -- filled in this order.
    directory_fields = (
        ('dest_dir', 'temp_dir'),
        ('work_dir', 'work_dir'),
        ('output_dir', 'output_dir'),
    )
    for dict_key, cfg_attr in directory_fields:
        self.down_dict[dict_key] = getattr(cfg, cfg_attr)

    try:
        DataConverter(**self.down_dict).converter()
    except Exception as err:
        logger.warning('FAILED TO PROCESS STEP; {}'.format(err))
        return False
    else:
        logger.info("STEP: CONVERT - DONE")
        return True
def download_file(self):
    """Run the download step and return the downloader's result.

    Fills ``self.down_dict`` with the destination directory and target
    file name from ``self.config`` plus the URL from ``self._get_link()``,
    then passes the dict as keyword arguments to ``HttpDownloader``.

    Returns:
        Whatever ``HttpDownloader.download()`` returns, or False when
        building or running the downloader raised (the error is logged as
        a warning).

    Note:
        ``self._get_link()`` is called outside the ``try`` block, so any
        exception it raises propagates to the caller.
    """
    logger.info("Starting Download")
    cfg = self.config
    for dict_key, cfg_attr in (('dest_dir', 'temp_dir'),
                               ('own_name', 'own_name')):
        self.down_dict[dict_key] = getattr(cfg, cfg_attr)
    self.down_dict['url'] = self._get_link()

    try:
        outcome = HttpDownloader(**self.down_dict).download()
    except Exception as err:
        logger.warning('FAILED TO PROCESS STEP; {}'.format(err))
        return False
    else:
        logger.info("STEP: DOWNLOAD ARCHIVE - DONE")
        return outcome
def get_download_link(self):
    """Scrape the page at ``self._base_url`` for the file's download link.

    Opens the page in headless Chrome, reads the rendered body HTML, and
    searches the ``s-downloadable-resources__results`` container for the
    first ``<a>`` whose text matches ``self.latest_pattern``.

    Returns:
        ``self._server_url`` concatenated with the matched ``href``, or
        False when the browser raised ``SystemError`` or the results
        container could not be searched.

    Raises:
        ConnectionError: The container was found but no anchor with a
            usable ``href`` matched ``self.latest_pattern``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    driver = None
    try:
        driver = webdriver.Chrome(
            options=chrome_options,
            executable_path=ChromeDriverManager().install())
        driver.get(self._base_url)
        # Fixed pause before the DOM is read (presumably to let the page's
        # scripts render the results -- confirm).
        time.sleep(3)
        res = driver.execute_script("return document.body.innerHTML")
    except SystemError:
        # Previously this only logged and fell through, which crashed with
        # UnboundLocalError on `driver` when Chrome failed to start.
        logger.warning("Need to updated Chrome")
        return False
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # headless Chrome / chromedriver process running.
        if driver is not None:
            driver.quit()

    soup = BeautifulSoup(res, 'lxml')
    page_content = soup.find(
        'div', {'class': 's-downloadable-resources__results'})

    # Name used in log and error messages: file name minus its last two
    # characters.
    display_name = self._file_name[:-2]
    logger.info("Searching link for {}".format(display_name))

    partial_url = None
    try:
        for tag in page_content.find_all(["a"]):
            if self.latest_pattern.search(tag.text):
                partial_url = tag.get('href')
                break
    except AttributeError:
        # Raised e.g. when the results container is missing and
        # `page_content` is None.
        logger.warning(
            "We weren't able to get link for {}".format(display_name))
        return False

    if partial_url is None:
        raise ConnectionError(
            "We weren't able to get link for {}".format(display_name))

    return self._server_url + str(partial_url)
def download(self, timeout=60):
    """Stream ``self._url`` into a file and return the file's path.

    The destination is ``self._own_name`` inside ``self._dest_dir``. The
    body is written in 64 KiB chunks, with a progress line logged after
    each chunk.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the call forever.

    Returns:
        The path of the written file.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status. Raised before the destination file is opened, so an
            error page is never saved as the download.
        requests.exceptions.RequestException: Connection or timeout
            failures.
        OSError: The destination file could not be written.
    """
    filename = os.path.join(self._dest_dir, self._own_name)
    logger.info("Starting download from url {} to {}".format(
        self._url, filename))

    cnk_size = 1024 * 8 * 8
    written = 0
    # The context manager releases the connection even if writing fails.
    with requests.get(self._url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=cnk_size):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
                    # Count real bytes: the last chunk is usually short.
                    written += len(chunk)
                    logger.info(
                        'downloaded {} KB'.format(written // 1024))

    logger.info('Finished file download: {}'.format(filename))
    return filename