def apply_checks(
        self, tables, path=None, script_name=None, object_name="dict_checks",
        dictionary=None, **kwargs):
    module_logger.info("Starting `apply_checks`")
    if (script_name is not None) & (object_name is not None):
        dict_checks = import_attr(path, script_name, object_name)
    elif dictionary is not None:
        if type(dictionary).__name__ != "dict":
            var_msg = "The `dictionary` argument is not a dictionary"
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        dict_checks = dictionary
    else:
        var_msg = ("Either `dictionary` or both of `script_name` and "
                   "`path` need to be non-null")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    if type(tables).__name__ == "dict":
        for table_key in tables.keys():
            for check_key in dict_checks.keys():
                self.__apply_the_check(
                    tables[table_key], dict_checks[check_key], check_key,
                    table_key, **kwargs)
    elif type(tables).__name__ == "DataFrame":
        for check_key in dict_checks.keys():
            self.__apply_the_check(tables, dict_checks[check_key], check_key,
                                   np.nan, **kwargs)
    module_logger.info("Completed `apply_checks`")
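
# Hedged usage sketch for `apply_checks`: the instance name `curation` and the
# exact schema of each check entry are assumptions for illustration; the real
# schema is interpreted by the private `__apply_the_check`, which is not shown
# here, though `columns` and `check_condition` are among the keys listed in
# `summary` below.
#
#     dict_checks = {
#         "id_not_null": {
#             "columns": ["id"],
#             "check_condition": lambda df, col, **kwargs: df[col].isnull(),
#         },
#     }
#     curation.apply_checks(curation.tables, dictionary=dict_checks)
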
def alter_tables(self, path=None, script_name=None, object_name="dict_alter",
                 dictionary=None, **kwargs):
    """
    Use this functionality to make alterations to the table(s)
    """
    module_logger.info("Starting `alter_tables`")
    # TODO move this check to its own function (applies to convert_columns too)
    if (script_name is not None) & (object_name is not None):
        dict_alter = import_attr(path, script_name, object_name)
    elif dictionary is not None:
        if type(dictionary).__name__ != "dict":
            var_msg = "The `dictionary` argument is not a dictionary"
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        dict_alter = dictionary
    else:
        var_msg = ("Either `dictionary` or both of `script_name` and "
                   "`path` need to be non-null")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    if type(self.tables).__name__ == "DataFrame":
        df = self.tables.copy()
        df_new = self.__alter_cols(
            df, dict_alter, [self.__key_1, self.__key_2, self.__key_3],
            np.nan, **kwargs)
        self.set_table(df_new)
    elif type(self.tables).__name__ == "dict":
        dfs = self.tables
        for key in self.tables.keys():
            df = dfs[key].copy()
            df_new = self.__alter_cols(
                df, dict_alter, [self.__key_1, self.__key_2, self.__key_3],
                key, **kwargs)
            self.set_table(df_new, key)
    else:
        var_msg = ("The tables are neither in a DataFrame nor a dictionary "
                   "format, which means something is seriously wrong...")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    module_logger.info("Completed `alter_tables`")
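
# Hedged usage sketch for `alter_tables`: `dict_alter` is consumed by the
# private `__alter_cols`, which is not shown, so the entry structure below
# (a `function` key taking the DataFrame and the key columns) is an assumed
# illustration only; the instance name `curation` is also assumed.
#
#     dict_alter = {
#         "add_total": {
#             "function": lambda df, keys, **kwargs: df.assign(
#                 total=df["quantity"] * df["price"]),
#         },
#     }
#     curation.alter_tables(dictionary=dict_alter)
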
def link_headers(self, path=None, script_name=None,
                 func_name="link_headers", function=None, **kwargs):
    # TODO Need to see if we can isolate just a set of new tables? Maybe
    #  have a list of dictionary keys that have had their headers
    #  done already?
    module_logger.info("Starting `link_headers`")
    if function is not None:
        if type(function).__name__ != "function":
            var_msg = ("The function passed to `self.link_headers` is "
                       "not a function.")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
    elif script_name is not None:
        function = import_attr(path, script_name, func_name)
    else:
        function = self._link_headers
    try:
        dict_link = function(self.tables, self.headers, **kwargs)
    except AttributeError:
        if len([x for x in kwargs.keys()]) > 0:
            var_msg = (
                f"Function link_headers, kwargs may have been passed when "
                f"the function {func_name} in the script {script_name} does"
                f" not take kwargs")
        else:
            var_msg = (f"Function link_headers: The {func_name} function "
                       f"does not exist in the {script_name} script.")
        module_logger.error(var_msg)
        raise AttributeError(var_msg)
    list_unallocated_keys = set(self.tables.keys()) - set(dict_link.keys())
    if len(list_unallocated_keys) != 0:
        var_msg = (f"Not all the headers are linked, the unlinked tables "
                   f"are: {list_unallocated_keys}")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    self.__link_headers = dict(dict_link)
    module_logger.info("Completed `link_headers`")
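
# Hedged usage sketch for `link_headers`: the linking function receives the
# tables and the headers dictionary and must return a mapping that covers
# every table key (presumably mapping each to a header key). The body below
# is an assumed example; `curation` is an assumed instance name.
#
#     def pick_headers(tables, headers, **kwargs):
#         # Naive example: link every table to the same header set; real
#         # logic would inspect each table's columns to choose a match.
#         return {key: "supplier_a" for key in tables.keys()}
#
#     curation.link_headers(function=pick_headers)
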
def form_summary_tables(self, path=None, script_name=None,
                        func_name="form_tables", function=None, **kwargs):
    """
    Use a function to create summaries from the main table set.

    The function is passed the arguments: self.tables, self.formed_tables,
    self.__grouping, self.__key_1, self.__key_2, self.__key_3,
    self.__key_separator, **kwargs
    """
    module_logger.info("Starting `form_summary_tables`")
    if function is not None:
        if type(function).__name__ != "function":
            var_msg = ("The function passed to `self.form_summary_tables` "
                       "is not a function.")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
    elif script_name is not None:
        function = import_attr(path, script_name, func_name)
    else:
        var_msg = (
            "One of the `function` or `script_name` arguments needs "
            "to be completed. And if `script_name` is, then `path` "
            "needs to be too.")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    dict_formed_tables = function(
        self.tables, self.formed_tables, self.__grouping, self.__key_1,
        self.__key_2, self.__key_3, self.__key_separator, **kwargs)
    if type(dict_formed_tables).__name__ != 'dict':
        var_msg = ('The output of the function for `form_summary_tables` '
                   'is not a dictionary and it needs to be')
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    self.formed_tables = dict_formed_tables
    module_logger.info("Completed `form_summary_tables`")
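
# Hedged usage sketch for `form_summary_tables`: the function signature below
# mirrors the positional call made above; what the grouping and key values
# contain is internal to the class, so they are simply accepted and ignored
# in this assumed example. `curation` is an assumed instance name.
#
#     def form_tables(tables, formed_tables, grouping, key_1, key_2, key_3,
#                     key_separator, **kwargs):
#         formed = dict(formed_tables)
#         formed["row_counts"] = pd.DataFrame(
#             {key: [df.shape[0]] for key, df in tables.items()})
#         return formed
#
#     curation.form_summary_tables(function=form_tables)
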
def convert_columns(self, path=None, script_name=None,
                    object_name="dict_convert", dictionary=None, **kwargs):
    module_logger.info("Starting `convert_columns`")
    if (script_name is not None) & (object_name is not None):
        dict_convert = import_attr(path, script_name, object_name)
    elif dictionary is not None:
        if type(dictionary).__name__ != "dict":
            var_msg = "The `dictionary` argument is not a dictionary"
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        dict_convert = dictionary
    else:
        var_msg = ("Either `dictionary` or both of `script_name` and "
                   "`path` need to be non-null")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    if type(self.tables).__name__ == "DataFrame":
        df = self.tables.copy()
        df_new = self.__convert_col(df, dict_convert, "", **kwargs)
        self.set_table(df_new, overwrite=True)
    elif type(self.tables).__name__ == "dict":
        dfs = self.tables
        for key in self.tables.keys():
            df = dfs[key].copy()
            df_new = self.__convert_col(df, dict_convert, key, **kwargs)
            dfs[key] = df_new.copy()
        self.set_table(dfs, overwrite=True)
    else:
        var_msg = ("The tables are neither in a DataFrame nor a dictionary "
                   "format, which means something is seriously wrong...")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    module_logger.info("Completed `convert_columns`")
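
# Hedged usage sketch for `convert_columns`: the entry structure is
# interpreted by the private `__convert_col`, which is not shown, so the
# keys used below are assumptions for illustration only.
#
#     dict_convert = {
#         "to_numeric": {
#             "columns": ["quantity", "price"],
#             "function": lambda s: pd.to_numeric(s, errors="coerce"),
#         },
#     }
#     curation.convert_columns(dictionary=dict_convert)
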
def find_files(self, path=None, script_name=None,
               func_name="list_the_files", function=None, files_path='.',
               append=False, **kwargs):
    """
    Using an externally defined function, specified via the `path` and
    `script_name` arguments or passed in directly, acquire the list of
    files to be read in. The `append` option allows file lists from
    different base paths to be accumulated.
    """
    module_logger.info("Starting `find_files`")
    # TODO move this to an internal function as it's used so often!
    if script_name is not None:
        function = import_attr(path, script_name, func_name)
    elif function is not None:
        if type(function).__name__ != "function":
            var_msg = "The `function` argument needs to be a function"
            module_logger.error(var_msg)
            raise ValueError(var_msg)
    else:
        var_msg = ("One of `script_name` or `function` needs to be not "
                   "None in the function `find_files`")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    list_files = function(files_path, **kwargs)
    # TODO move these to be calls on the self.set_file_list function instead
    #  of setting the value here
    if append:
        self.list_files += list_files
    else:
        self.list_files = list_files
    module_logger.info(
        f"Completed `find_files`, the list of files is: {self.list_files}")
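
# Hedged usage sketch for `find_files`: the listing function only needs to
# accept the base path plus any keyword arguments and return a list of file
# paths. The helper below is an assumed example, not part of this module.
#
#     import os
#
#     def list_the_files(files_path, extension=".csv", **kwargs):
#         return [os.path.join(files_path, name)
#                 for name in os.listdir(files_path)
#                 if name.endswith(extension)]
#
#     curation.find_files(function=list_the_files, files_path="data/raw")
#     curation.find_files(function=list_the_files, files_path="data/extra",
#                         append=True)
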
def summary(self, path=None, script_name=None, object_name="dict_checks",
            dictionary=None):
    if (script_name is not None) & (object_name is not None):
        dict_checks = import_attr(path, script_name, object_name)
    elif dictionary is not None:
        if type(dictionary).__name__ != "dict":
            var_msg = "The `dictionary` argument is not a dictionary"
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        dict_checks = dictionary
    else:
        var_msg = ("Either `dictionary` or both of `script_name` and "
                   "`path` need to be non-null")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    list_keys = [
        'calc_condition', 'long_description', 'check_condition', 'columns',
        'count_condition', 'index_position', 'relevant_columns', 'idx_flag',
        'category'
    ]
    dict_checks_values = deepcopy(dict_checks)
    for check in [key for key in dict_checks_values.keys()]:
        for key in [key for key in list_keys
                    if key not in dict_checks_values[check].keys()]:
            dict_checks_values[check][key] = self.__checks_defaults[key]
    for check in [key for key in dict_checks_values.keys()]:
        for key in [key for key in dict_checks_values[check].keys()]:
            dict_checks_values[check][key] = self.__func_summary_(
                dict_checks_values[check][key])
    df_summary = pd.DataFrame(
        dict_checks_values
    ).T.reset_index().rename(columns={'index': 'check'})
    return {'df': df_summary, 'dict': dict_checks}
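
# Hedged usage sketch for `summary`: the same checks dictionary used by
# `apply_checks` is summarised into a DataFrame, with any missing keys
# filled from the internal defaults first; `curation` is an assumed name.
#
#     result = curation.summary(dictionary=dict_checks)
#     result['df']    # one row per check, one column per check attribute
#     result['dict']  # the original checks dictionary, unmodified
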
def set_headers(self, path=None, script_name=None, func_name=None,
                list_cols=None, function=None, ideal_headers=None,
                required_headers=None):
    module_logger.info("Starting `set_headers`")
    if list_cols is not None:
        if type(list_cols).__name__ != "list":
            var_msg = (
                "The argument `list_cols` of function `set_headers` "
                "needs to be a list")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
    elif function is not None:
        if type(function).__name__ != "function":
            var_msg = ("The argument `function` of function `set_headers` "
                       "needs to be a function")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
    elif script_name is not None:
        function = import_attr(path, script_name, func_name)
    elif ideal_headers is not None:
        if type(ideal_headers).__name__ != 'list':
            var_msg = ("The argument `ideal_headers` of function "
                       "`set_headers` needs to be a list")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
    elif required_headers is not None:
        if type(required_headers).__name__ != 'list':
            var_msg = ("The argument `required_headers` of function "
                       "`set_headers` needs to be a list")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
    var_type = type(self.tables).__name__
    if var_type == "dict":
        dict_dfs = self.tables.copy()
        var_cond = len(
            set([dict_dfs[key].shape[1] for key in dict_dfs.keys()]))
        var_cond = var_cond != 1
        if var_cond:
            var_msg = ("There is an inconsistent number of columns "
                       "present in the dictionary of tables")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        if list_cols is not None:
            if (len(list_cols) !=
                    dict_dfs[[x for x in dict_dfs.keys()][0]].shape[1]):
                var_msg = ("The length of `list_cols` is different to the "
                           "number of columns present in the table")
                module_logger.error(var_msg)
                raise ValueError(var_msg)
        elif function is not None:
            list_cols_org = dict_dfs[
                [x for x in dict_dfs.keys()][0]].columns.tolist()
            list_cols = [function(x) for x in list_cols_org]
        for key in dict_dfs.keys():
            if list_cols is not None:
                dict_dfs[key].columns = list_cols
            elif function is not None:
                dict_dfs[key].columns = list_cols
            elif ideal_headers is not None:
                for col in [
                    col for col in ideal_headers
                    if col not in dict_dfs[key].columns.tolist()
                ]:
                    dict_dfs[key][col] = np.nan
                dict_dfs[key] = dict_dfs[key][ideal_headers].copy()
            elif required_headers is not None:
                for col in [
                    col for col in required_headers
                    if col not in dict_dfs[key].columns.tolist()
                ]:
                    dict_dfs[key][col] = np.nan
        self.set_table(dict_dfs, overwrite=True)
    elif var_type == "DataFrame":
        if (list_cols is not None
                and len(list_cols) != self.tables.shape[1]):
            var_msg = ("The length of `list_cols` is different to the "
                       "number of columns present in the table")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
        df = self.tables.copy()
        if list_cols is not None:
            df.columns = list_cols
        elif function is not None:
            df.columns = [function(x) for x in df.columns.tolist()]
        elif ideal_headers is not None:
            for col in [
                col for col in ideal_headers
                if col not in df.columns.tolist()
            ]:
                df[col] = np.nan
            df = df[ideal_headers].copy()
        elif required_headers is not None:
            for col in [
                col for col in required_headers
                if col not in df.columns.tolist()
            ]:
                df[col] = np.nan
        self.set_table(df, overwrite=True)
    else:
        var_msg = (
            "Somehow the tables are not a dictionary or a DataFrame "
            "for function `set_headers`")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    module_logger.info("Completed `set_headers`")
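
# Hedged usage sketch for `set_headers`: the first non-None of `list_cols`,
# `function`, `script_name`, `ideal_headers` and `required_headers` takes
# effect; the column names below are assumptions for illustration.
#
#     # Rename every column by position:
#     curation.set_headers(list_cols=["id", "date", "quantity", "price"])
#
#     # Normalise the existing names with a function:
#     curation.set_headers(function=lambda col: col.strip().lower())
#
#     # Force a fixed column set, adding any missing columns as NaN:
#     curation.set_headers(ideal_headers=["id", "date", "quantity", "price"])
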
def set_comparison_headers(self, path=None, script_name=None,
                           func_name="read_headers", function=None,
                           dictionary=None, **kwargs):
    # TODO Need to see if we can isolate just a set of new tables? Maybe
    #  have a list of dictionary keys that have had their headers done
    #  already?
    module_logger.info("Starting `set_comparison_headers`")
    if function is not None:
        if type(function).__name__ != "function":
            var_msg = ("The function passed to "
                       "`self.set_comparison_headers` is not a function.")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
    elif script_name is not None:
        function = import_attr(path, script_name, func_name)
    elif dictionary is not None:
        def function(**kwargs):
            return dictionary
    else:
        var_msg = (
            "One of the `function`, `script_name` or `dictionary` arguments "
            "needs to be completed. And if `script_name` is, then `path` "
            "needs to be too.")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    try:
        dict_headers = function(**kwargs)
    except AttributeError:
        if len([x for x in kwargs.keys()]) > 0:
            var_msg = (
                f"Function set_comparison_headers, kwargs may have been "
                f"passed when the function {func_name} in the script "
                f"{script_name} does not take kwargs")
        else:
            var_msg = (
                f"Function set_comparison_headers: The {func_name} function"
                f" does not exist in the {script_name} script.")
        module_logger.error(var_msg)
        raise AttributeError(var_msg)
    if type(dict_headers).__name__ != 'dict':
        var_msg = 'The headers output should be a dictionary'
        module_logger.error(var_msg)
        raise Exception(var_msg)
    list_keys = [
        key for key in dict_headers.keys() if key != 'ideal_headers'
    ]
    list_keys = [
        key for key in list_keys if
        (dict_headers[key].get('expected_headers') is None) |
        (dict_headers[key].get('new_headers') is None) |
        (dict_headers[key].get('remove') is None)
    ]
    if len(list_keys) > 0:
        var_msg = (
            f'There are dictionary keys that do not have all the required '
            f'values: {", ".join([str(key) for key in list_keys])}')
        module_logger.error(var_msg)
        raise Exception(var_msg)
    if dict_headers.get('ideal_headers') is None:
        var_msg = ('There needs to be a key to the headers dictionary that'
                   ' is "ideal_headers"')
        module_logger.error(var_msg)
        raise Exception(var_msg)
    if type(dict_headers.get('ideal_headers')).__name__ != 'list':
        var_msg = 'The value of key "ideal_headers" needs to be a list'
        module_logger.error(var_msg)
        raise Exception(var_msg)
    self.headers = dict(dict_headers)
    module_logger.info(
        f"There are {len(dict_headers)} header keys and they are: "
        f"{', '.join([key for key in dict_headers.keys()])}")
    module_logger.info("Completed `set_comparison_headers`")
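
# Hedged usage sketch for `set_comparison_headers`: each key other than
# "ideal_headers" must carry `expected_headers`, `new_headers` and `remove`,
# and an "ideal_headers" list must be present; the concrete values below and
# the instance name `curation` are assumptions.
#
#     dict_headers = {
#         "ideal_headers": ["id", "date", "quantity"],
#         "supplier_a": {
#             "expected_headers": ["ID", "Date", "Qty"],
#             "new_headers": ["id", "date", "quantity"],
#             "remove": [],
#         },
#     }
#     curation.set_comparison_headers(dictionary=dict_headers)
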
def reading_in(self, path=None, script_name=None, func_name="read_files",
               function=None, overwrite=True, **kwargs):
    """
    Using an externally defined reading function and the internally stored
    list of files, read in each of the required tables. `path` is the
    relative path to the script that defines the function.
    """
    module_logger.info("Starting `reading_in`")
    if type(self.tables).__name__ != "dict":
        var_msg = ("The tables need to be in dictionary format for this "
                   "`self.reading_in` step")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    if function is not None:
        if type(function).__name__ != "function":
            var_msg = ("The function passed to `self.reading_in` is not a "
                       "function.")
            module_logger.error(var_msg)
            raise ValueError(var_msg)
    elif script_name is not None:
        function = import_attr(path, script_name, func_name)
    else:
        var_msg = (
            "One of the `function` or `script_name` arguments needs "
            "to be completed. And if `script_name` is, then `path` "
            "needs to be too.")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    try:
        dfs = function(self.list_files, **kwargs)
    except AttributeError:
        if len([x for x in kwargs.keys()]) > 0:
            var_msg = (f"Function reading_in, kwargs may have been passed "
                       f"when the function {func_name} in the script "
                       f"{script_name} does not take kwargs")
        else:
            var_msg = (f"Function reading_in: The {func_name} function "
                       f"does not exist in the {script_name} script.")
        module_logger.error(var_msg)
        raise AttributeError(var_msg)
    if overwrite is False:
        df_org = self.tables.copy()
        df_org.update(dfs)
    elif overwrite is True:
        pass
    else:
        var_msg = ("The attribute `overwrite` in the function "
                   "`reading_in` needs to be `True` or `False`")
        module_logger.error(var_msg)
        raise ValueError(var_msg)
    self.set_table(dfs, overwrite=overwrite)
    if type(dfs).__name__ == "DataFrame":
        module_logger.info(f"The table has shape '{dfs.shape}'")
    else:
        for key in dfs:
            module_logger.info(
                f"The table with key '{key}' has shape '{dfs[key].shape}'")
    module_logger.info("Completed `reading_in`")
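
# Hedged usage sketch for `reading_in`: the reader receives the internal file
# list and should return a dictionary of DataFrames (a single DataFrame is
# also handled when logging above); the helper below is an assumed example,
# not part of this module.
#
#     def read_files(list_files, **kwargs):
#         return {path: pd.read_csv(path, **kwargs) for path in list_files}
#
#     curation.reading_in(function=read_files, overwrite=True)
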