def setup_logging(write_log_to_file=False,
                  log_file_base="log",
                  log_level_file=logging.INFO,
                  log_level=None,
                  progress_bar=False,
                  ):
    """
    Initialise the logging system.

    Parameters
    ----------
    write_log_to_file: bool, optional
        If True, also write the log messages to a file based on *log_file_base*
    log_file_base: str, optional
        Base name of the log file. Only passed on to *create_logger* when
        *write_log_to_file* or *progress_bar* is set; otherwise file logging
        is disabled.
    log_level_file: int, optional
        Log level of the file handler
    log_level: int or None, optional
        Log level of the console handler, forwarded to *create_logger* and to
        the merged ``cbs_utils`` logger
    progress_bar: bool, optional
        If True, mute the console handlers so the progress bar (written with
        plain ``print``) is not disturbed

    Returns
    -------
    logging.Logger
        The main application logger (named ``LOGGER_BASE_NAME``)
    """
    if not (write_log_to_file or progress_bar):
        # no file output requested: a None log file tells create_logger to
        # skip the file handler altogether
        log_file_base = None

    formatter_long = logging.Formatter(
        '[%(asctime)s] %(name)-5s %(levelname)-8s --- %(message)s '
        '(%(filename)s:%(lineno)s)', datefmt='%Y-%m-%d %H:%M:%S')

    _logger = create_logger(name=LOGGER_BASE_NAME,
                            file_log_level=log_level_file,
                            console_log_level=log_level,
                            log_file=log_file_base,
                            formatter_file=formatter_long,
                            console_log_format_long=True,
                            )

    if progress_bar:
        # The progress bar is written via print statements, so silence every
        # console handler. File handlers are recognised by their
        # 'baseFilename' attribute and keep on logging.
        for handle in _logger.handlers:
            if not hasattr(handle, "baseFilename"):
                # this is a stream (console) handler: raise its threshold
                handle.setLevel(logging.CRITICAL)

    # Merge the settings of our logger with the logger of the cbs_utils
    # package so we can control its output as well.
    cbs_utils_logger = logging.getLogger("cbs_utils")
    if log_level is not None:
        # Logger.setLevel raises a TypeError when given None (the default of
        # this function), so only apply an explicitly requested level.
        cbs_utils_logger.setLevel(log_level)
    merge_loggers(_logger, "cbs_utils", logger_level_to_merge=log_level)

    return _logger
import logging import matplotlib.pyplot as plt import seaborn as sns from cbs_utils.misc import (create_logger, merge_loggers) from cbs_utils.plotting import (CBSPlotSettings, add_axis_label_background) logger = create_logger(console_log_level=logging.DEBUG) logger = merge_loggers(logger, "cbs_utils.plotting", logger_level_to_merge=logging.DEBUG) figure_properties = CBSPlotSettings() def make_bar_plot(data_df, orientation="horizontal"): """ Make the bar plot Parameters ---------- data_df: Dataframe pandas dataframe with the data orientation: {"horizontal", "vertical"} Direction of the bars """ if orientation not in ("horizontal", "vertical"): raise ValueError( f"oriental must be 'horizontal' or 'vertical'. Found {orientation}"
def __init__(self,
             database_name=None,
             database_type=None,
             store_html_to_cache=False,
             internet_scraping=True,
             search_urls=False,
             max_cache_dir_size=None,
             user=None,
             password=None,
             hostname=None,
             address_keys=None,
             kvk_url_keys=None,
             maximum_entries=None,
             start_url_index=None,
             stop_url_index=None,
             start_url=None,
             stop_url=None,
             progressbar=False,
             singlebar=False,
             force_process=False,
             url_range_process=None,
             save=True,
             number_of_processes=1,
             exclude_extensions=None,
             i_proc=None,
             log_file_base="log",
             log_level_file=logging.DEBUG,
             older_time: datetime.timedelta = None,
             timezone: pytz.timezone = 'Europe/Amsterdam',
             filter_urls: list = None):
    """
    Set up one (possibly parallel) scraping/processing worker.

    Initialises the per-process logger, the progress-bar bookkeeping, the
    table of country url extensions to exclude, and the database connection
    with its table models.

    NOTE(review): the *exclude_extensions* argument is accepted but never
    read here — self.exclude_extensions is always rebuilt from
    COUNTRY_EXTENSIONS. Confirm whether the parameter is still needed.
    """
    # inherit the console level from the already-configured base logger
    console_log_level = logging.getLogger(
        LOGGER_BASE_NAME).getEffectiveLevel()

    if i_proc is not None and number_of_processes > 1:
        # parallel run: this instance is a multiprocessing worker with its
        # own log file, tagged with the process number
        mp.Process.__init__(self)
        formatter = logging.Formatter("{:2d} ".format(i_proc) +
                                      "%(levelname)-5s : "
                                      "%(message)s "
                                      "(%(filename)s:%(lineno)s)",
                                      datefmt="%Y-%m-%d %H:%M:%S")
        log_file = "{}_{:02d}".format(log_file_base, i_proc)
        logger_name = f"{LOGGER_BASE_NAME}_{i_proc}"
        self.logger = create_logger(name=logger_name,
                                    console_log_level=console_log_level,
                                    file_log_level=log_level_file,
                                    log_file=log_file,
                                    formatter=formatter)
        self.logger.info("Set up class logger for proc {}".format(i_proc))
    else:
        # single-process run: reuse the application's base logger
        self.logger = logging.getLogger(LOGGER_BASE_NAME)
        self.logger.setLevel(console_log_level)
        self.logger.info(
            "Set up class logger for main {}".format(__name__))
        self.logger.debug("With debug on?")

    self.progressbar = progressbar
    self.showbar = progressbar
    # BUG FIX: the original condition `singlebar and i_proc > 0 or
    # i_proc is None` raised a TypeError (None > 0) when singlebar was set
    # in the main process. Testing `i_proc is None` first keeps every
    # defined outcome identical while removing the crash.
    # NOTE(review): per the comment below, intent is to show only the bar
    # of the first process — confirm the `i_proc is None` clause is wanted.
    if i_proc is None or (singlebar and i_proc > 0):
        # in case the single bar option is given, we only show the bar of
        # the first process
        self.showbar = False

    # a list of all country url extensions which we want to exclude,
    # indexed on the url suffix
    self.exclude_extensions = pd.DataFrame(
        COUNTRY_EXTENSIONS, columns=["include", "country", "suffix"])
    self.exclude_extensions = self.exclude_extensions[
        ~self.exclude_extensions["include"]]
    self.exclude_extensions = self.exclude_extensions.set_index(
        "suffix", drop=True).drop(["include"], axis=1)

    # plain storage of the constructor arguments
    self.i_proc = i_proc
    self.store_html_to_cache = store_html_to_cache
    self.max_cache_dir_size = max_cache_dir_size
    self.internet_scraping = internet_scraping
    self.search_urls = search_urls
    self.maximum_entries = maximum_entries
    self.start_url = start_url
    self.stop_url = stop_url
    self.force_process = force_process
    self.start_url_index = start_url_index
    self.stop_url_index = stop_url_index
    self.address_keys = address_keys
    self.kvk_url_keys = kvk_url_keys
    self.save = save
    self.older_time = older_time
    self.timezone = timezone
    self.filter_urls = filter_urls

    if progressbar:
        # switch off console logging because we are showing the progress
        # bar via the print statement; file handlers (which have a
        # 'baseFilename' attribute) keep logging
        for handle in self.logger.handlers:
            if not hasattr(handle, "baseFilename"):
                handle.setLevel(logging.CRITICAL)

    # data frames filled later during processing
    self.url_df: pd.DataFrame = None
    self.addresses_df: pd.DataFrame = None
    self.kvk_df: pd.DataFrame = None
    self.company_vs_kvk = None
    self.n_company = None

    self.number_of_processes = number_of_processes
    self.url_range_process = Range(url_range_process)
    self.url_ranges = None

    # connect to the database and make the session timezone-aware
    self.database = init_database(database_name,
                                  database_type=database_type,
                                  user=user,
                                  password=password,
                                  host=hostname)
    self.database.execute_sql("SET TIME ZONE '{}'".format(self.timezone))
    tables = init_models(self.database)
    self.UrlNL = tables[0]
    self.company = tables[1]
    self.address = tables[2]
    self.website = tables[3]
"""Example: scrape a page with get_page_from_url and time the first, uncached read."""
import logging
from pathlib import Path

from bs4 import BeautifulSoup
from cbs_utils.misc import (create_logger, merge_loggers, Timer)
from cbs_utils.regular_expressions import (KVK_REGEXP, ZIP_REGEXP, BTW_REGEXP)
from cbs_utils.web_scraping import (get_page_from_url, UrlSearchStrings)

# set up logging
log_level = logging.DEBUG  # change to DEBUG for more info
log_format = logging.Formatter(
    '%(levelname)8s --- %(message)s (%(filename)s:%(lineno)s)')
logger = create_logger(console_log_level=log_level, formatter=log_format)
merge_loggers(logger, "cbs_utils.web_scraping",
              logger_level_to_merge=logging.INFO)

# create url name and clean previous cache file
cache_directory = Path("tmp")
clean_cache = True
# BUG FIX: guard against a missing cache directory — on a first run "tmp"
# does not exist yet and iterdir() raised FileNotFoundError.
if clean_cache and cache_directory.is_dir():
    for item in cache_directory.iterdir():
        item.unlink()
    cache_directory.rmdir()

url = "https://www.example.com"

# first read: read from the url and report time
with Timer(units="s") as timer:
    page = get_page_from_url(url, cache_directory=cache_directory)
logger.info(f"Scraping from url took: {timer.duration} {timer.units}")
"""Example: plot a CBS StatLine table with StatLineTable."""
import matplotlib.pyplot as plt
import sys

from cbs_utils.misc import (create_logger, merge_loggers)
from cbs_utils.plotting import CBSPlotSettings
from cbs_utils.readers import StatLineTable

fig_properties = CBSPlotSettings()

logger = create_logger()
merge_loggers(logger, logger_name_to_merge="cbs_utils.readers")

# The table id can be found by going to the data set on StatLine and looking
# it up in the url. In this case the url is:
# https://opendata.cbs.nl/#/CBS/nl/dataset/84410NED/table?ts=1568706226304
# so we are going to make a plot from table 84410NED
table_id = "84410NED"

statline = StatLineTable(table_id=table_id, plot_all_questions=True,
                         make_the_plots=True, save_plot=True)

# NOTE(review): sys.exit(0) makes everything below unreachable — this looks
# like a debugging leftover. Remove it to run the module-13 example below.
sys.exit(0)

statline.show_module_table(max_width=30)
statline.show_question_table(max_width=30)

# this plots all questions of module 13, including the individual options
# that belong to question 16
statline.modules_to_plot = 46
statline.plot()

# only save the first figure for inspection
def test_create_logger():
    """Smoke test: create_logger() must run without raising."""
    _ = create_logger()