import logging
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# `config`, `FileType` and `readfile` are project-internal helpers (settings
# accessors, a file-type enum and a cache reader); they are assumed to be
# imported elsewhere in this module.


def download_xml(link, verify_link=config.get_download_verify_link()):
    try:
        page = requests.get(link, verify=verify_link)
    except requests.exceptions.ConnectionError as e:
        logging.warning('Unable to open connection. (%s)', e)
        return None
    # Only hand back the payload when the server answered with HTTP 200
    return page.content if page.status_code == 200 else None
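
# Usage sketch for download_xml() (the URL is a placeholder, for illustration
# only); on success the raw response body is returned as bytes:
#
#     xml_bytes = download_xml('https://example.com/pois.xml')
#     if xml_bytes is not None:
#         logging.debug('Downloaded %d bytes of XML.', len(xml_bytes))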

def download_content(link, verify_link=config.get_download_verify_link(), post_parm=None, headers=None,
                     encoding='utf-8'):
    try:
        if post_parm is None:
            logging.debug('Downloading without post parameters.')
            page = requests.get(link, verify=verify_link, headers=headers)
        else:
            logging.debug('Downloading with post parameters.')
            # Send the POST body form encoded, merged with any caller-supplied headers
            headers_static = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
            if headers is not None:
                headers.update(headers_static)
            else:
                headers = headers_static
            page = requests.post(link, verify=verify_link, data=post_parm, headers=headers)
        # Decode the response with the requested character set
        page.encoding = encoding
    except requests.exceptions.ConnectionError as e:
        logging.warning('Unable to open connection. (%s)', e)
        return None
    return page.text if page.status_code == 200 else None
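
# Usage sketch for download_content() (hypothetical URL and POST payload, for
# illustration only); the return value is the decoded response text or None:
#
#     body = download_content('https://example.com/api/stores',
#                             post_parm={'county': 'all'})
#     if body is not None:
#         logging.info('Got %d characters of content.', len(body))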

def save_downloaded_soup(link, file, filetype, post_data=None, verify=config.get_download_verify_link(),
                         headers=None):
    soup = None
    if config.get_download_use_cached_data() is True and os.path.isfile(file):
        soup = readfile(file, filetype)
    else:
        if link is not None:
            soup = download_content(link, verify, post_data, headers)
            if soup is not None:
                logging.info('We got content, write to file.')
                if not os.path.exists(config.get_directory_cache_url()):
                    os.makedirs(config.get_directory_cache_url())
                with open(file, mode='w', encoding='utf-8') as code:
                    if filetype == FileType.html:
                        soup = BeautifulSoup(soup, 'html.parser')
                        code.write(soup.prettify())
                    elif filetype == FileType.xml:
                        soup = BeautifulSoup(soup, 'lxml', from_encoding='utf-8')
                        logging.debug('Original encoding: %s', soup.original_encoding)
                        code.write(soup.prettify())
                    elif filetype in (FileType.csv, FileType.json):
                        code.write(str(soup))
                    else:
                        logging.error('Unexpected type to write: %s', filetype)
            else:
                if os.path.exists(file):
                    logging.info('The %s link returned a status code other than 200, but an already downloaded file exists. Trying to open it.', link)
                    soup = readfile(file, filetype)
                else:
                    logging.warning('Skipping dataset: %s. There is no downloadable URL and no already downloaded file.', link)
        else:
            if os.path.exists(file):
                soup = readfile(file, filetype)
                if filetype == FileType.html:
                    soup = BeautifulSoup(soup, 'html.parser')
                elif filetype == FileType.xml:
                    soup = BeautifulSoup(soup, 'lxml')
                logging.info('Using file only: %s. There is no downloadable URL, only the file. Do not forget to update the file manually!', file)
            else:
                logging.warning('Cannot use download or file: %s. There is no downloadable URL and no already downloaded file.', file)
    return soup
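
# Usage sketch for save_downloaded_soup() (hypothetical URL and cache file,
# for illustration only). Freshly downloaded FileType.html or FileType.xml
# content comes back as a parsed BeautifulSoup tree; FileType.csv and
# FileType.json come back as raw text:
#
#     soup = save_downloaded_soup('https://example.com/shops',
#                                 os.path.join(config.get_directory_cache_url(), 'shops.html'),
#                                 FileType.html)
#     if soup is not None:
#         rows = soup.find_all('tr')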

def save_downloaded_xml(link, file, verify=config.get_download_verify_link()):
    if config.get_download_use_cached_data() is True and os.path.isfile(file):
        # Serve the cached copy instead of hitting the network
        with open(file, 'rb') as content_file:
            page = content_file.read()
    else:
        page = download_xml(link, verify)
        if page is not None:
            if not os.path.exists(config.get_directory_cache_url()):
                os.makedirs(config.get_directory_cache_url())
            with open(file, mode='wb') as code:
                code.write(page)
        else:
            logging.warning('Skipping dataset: %s.', link)
    return page
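
# Usage sketch for save_downloaded_xml() (hypothetical URL and cache file, for
# illustration only); the XML is cached as bytes and returned unparsed:
#
#     xml_bytes = save_downloaded_xml('https://example.com/feed.xml',
#                                     os.path.join(config.get_directory_cache_url(), 'feed.xml'))
#     if xml_bytes is not None:
#         feed = BeautifulSoup(xml_bytes, 'lxml')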

def save_downloaded_pd(link, file, verify=config.get_download_verify_link(), headers=None):
    df = None
    if config.get_download_use_cached_data() is True and os.path.isfile(file):
        df = pd.read_csv(file)
    else:
        if link is not None:
            # The upstream export is UTF-16 encoded; re-save it as UTF-8 before parsing
            csv_content = download_content(link, verify, None, headers, 'utf-16')
            if csv_content is not None:
                logging.info('We got content, write to file.')
                if not os.path.exists(config.get_directory_cache_url()):
                    os.makedirs(config.get_directory_cache_url())
                with open(file, mode='w', encoding='utf-8') as code:
                    code.write(csv_content)
                df = pd.read_csv(file, encoding='UTF-8', sep='\t', skiprows=0)
            else:
                if os.path.exists(file):
                    logging.info('The %s link returned a status code other than 200, but an already downloaded file exists. Trying to open it.', link)
                    df = pd.read_csv(file, encoding='UTF-8', sep='\t', skiprows=0)
                else:
                    logging.warning('Skipping dataset: %s. There is no downloadable URL and no already downloaded file.', link)
        else:
            if os.path.exists(file):
                df = pd.read_csv(file, encoding='UTF-8', sep='\t', skiprows=0)
                logging.info('Using file only: %s. There is no downloadable URL, only the file. Do not forget to update the file manually!', file)
            else:
                logging.warning('Cannot use download or file: %s. There is no downloadable URL and no already downloaded file.', file)
    return df
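
# Usage sketch for save_downloaded_pd() (hypothetical URL and cache file, for
# illustration only); the helper fetches UTF-16, tab-separated data, caches it
# as UTF-8 and loads it into a pandas DataFrame:
#
#     df = save_downloaded_pd('https://example.com/export.tsv',
#                             os.path.join(config.get_directory_cache_url(), 'export.tsv'))
#     if df is not None:
#         logging.info('Loaded %d rows.', len(df))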

def download_soup(link, verify_link=config.get_download_verify_link(), post_parm=None):
    try:
        if post_parm is None:
            logging.debug('Downloading without post parameters.')
            page = requests.get(link, verify=verify_link)
        else:
            logging.debug('Downloading with post parameters.')
            headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
            page = requests.post(link, verify=verify_link, data=post_parm, headers=headers)
    except requests.exceptions.ConnectionError as e:
        logging.warning('Unable to open connection. (%s)', e)
        return None
    return BeautifulSoup(page.content, 'html.parser') if page.status_code == 200 else None
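
# Usage sketch for download_soup() (hypothetical URL, for illustration only);
# the result is a parsed BeautifulSoup tree or None:
#
#     soup = download_soup('https://example.com/branches')
#     if soup is not None:
#         links = [a.get('href') for a in soup.find_all('a')]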