class DatasetSizeModel(Model):
    """Check that all datasets, after collapsing over all years in the cache,
    have attributes of the same size.

    For each table, every column (the first occurrence of each column name in
    the order returned by the cache) must have the same number of elements.
    Inconsistencies are logged and counted per table.
    """

    def __init__(self, directory=None):
        """
        directory: cache directory to check; defaults to the current
        simulation cache directory.
        """
        if directory is None:
            directory = SimulationState().get_cache_directory()
        self.cache = AttributeCache(directory)

    def run(self):
        """Run the size check over all tables in the cache.

        Returns a pandas Series indexed by table name with the number of
        size-inconsistent columns found in each table. The simulation's
        current time is restored on exit.
        """
        year_orig = SimulationState().get_current_time()
        years = self.years_in_cache()
        SimulationState().set_current_time(years[0])
        storages = {}
        for year in years:
            storages[year] = flt_storage(os.path.join(self.cache.get_storage_location(), '%s' % year))
        tables = self.cache.get_table_names()
        counts = pd.Series(np.zeros(len(tables), dtype="int32"), index=tables)
        for table in tables:
            columns = self.cache._get_column_names_and_years(table)
            values = []
            names = []
            colyears = []
            for col, year in columns:
                if col in names:
                    # column appears in multiple years; keep only the first occurrence
                    continue
                data = storages[year].load_table(table, column_names=col)
                values.append(data[col].size)
                names.append(col)
                colyears.append(year)
            values = np.array(values)
            if values.size == 0 or (values == values[0]).all():
                continue  # all attributes have the same size (or table has no columns)
            # there is an inconsistency in attribute lengths;
            # take the most common size as the reference and report the outliers
            names = np.array(names)
            colyears = np.array(colyears)
            uc = np.unique(values, return_counts=True)
            imax = np.argmax(uc[1])
            idx = np.where(values != uc[0][imax])[0]
            df = pd.DataFrame({"column": names[idx], "year": colyears[idx], "size": values[idx]})
            # pd.concat replaces the deprecated DataFrame.append (removed in pandas 2.0)
            df = pd.concat([df, pd.DataFrame({"column": np.array(["all other columns"]),
                                              "year": np.array([years[0]]),
                                              "size": np.array([uc[0][imax]])})])
            logger.log_status("Inconsistency in table ", table, ":\n", df)
            counts[table] = df.shape[0] - 1  # exclude the "all other columns" summary row
        SimulationState().set_current_time(year_orig)
        logger.log_status("Model total:", counts.sum(), ' size inconsistencies found.')
        return counts

    def years_in_cache(self):
        """Return the sorted list of years present in the cache."""
        return self.cache._get_sorted_list_of_years(start_with_current_year=False)
def run(self, table_names, out_storage=None, table_name_pattern=None, cache_directory=None, year=None, **kwargs):
    """
    Export specified tables to a database.

    table_names: iterable of cache table names to export.
    out_storage: output storage; required unless prepare_for_run was called
        first to create self.out_storage.
    table_name_pattern: naming pattern for the output tables,
        for example '{table_name}_{scenario_name}_{year}'.
    cache_directory: cache to read from; defaults to the current simulation
        cache directory.
    year: single year to export; if None, all years in the cache are exported.
    kwargs: additional values substituted into table_name_pattern
        (must include 'scenario_name', which is also passed to post_run).
    """
    if not hasattr(self, "out_storage"):
        if out_storage is None:
            # Python 3 raise syntax (the original used the Python-2-only
            # "raise ValueError, msg" statement form)
            raise ValueError("Either out_storage argument needs to be specified or "
                             "prepare_for_run called before run method to create a valid out_storage.")
        else:
            self.out_storage = out_storage
    sim_state = SimulationState()
    if sim_state.get_current_time() == 0:
        # a zero current time means it was never set; use a sentinel year
        sim_state.set_current_time(9999)
    if cache_directory is None:
        cache_directory = sim_state.get_cache_directory()
    attr_cache = AttributeCache(cache_directory=cache_directory)
    if year is None:
        years = attr_cache._get_sorted_list_of_years()
    else:
        assert isinstance(year, int)
        years = [year]
    for table_name in table_names:
        kwargs["table_name"] = table_name
        for year in years:
            kwargs["year"] = year
            out_table_name = table_name_pattern.format(**kwargs)
            in_storage = attr_cache.get_flt_storage_for_year(year)
            # TODO drop_table(table_name) if table_name exists
            ExportStorage().export_dataset(
                table_name,
                in_storage=in_storage,
                out_storage=self.out_storage,
                out_dataset_name=out_table_name)
    self.post_run(kwargs["scenario_name"], years)
class DataStructureModel(Model):
    """
    Checks the structure of datasets in a given cache (or run cache) when
    compared to a reference cache. It writes out all columns that are missing
    as well as those that are not present in the reference cache. It can also
    compare the sizes of the datasets.
    """

    def __init__(self, reference_location=None):
        """
        "reference_location" is the directory of the reference cache and should
        include the year. If it is None, the simulation directory in its start
        year is taken.
        """
        if reference_location is None:
            reference_location = os.path.join(SimulationState().get_cache_directory(),
                                              "%s" % SimulationState().get_start_time())
        self.reference_storage = flt_storage(reference_location)

    def run(self, directory=None, check_size=True):
        """
        "directory" is the cache to be compared to the reference. It should not
        include the year as the model checks all years. Set "check_size" to
        False if no size check of the datasets is required.

        Returns a DataFrame with one row per table that differs from the
        reference. The simulation's current time is restored on exit.
        """
        if directory is None:
            directory = SimulationState().get_cache_directory()
        self.cache = AttributeCache(directory)
        year_orig = SimulationState().get_current_time()
        years = self.years_in_cache()
        SimulationState().set_current_time(years[0])
        storages = {}
        for year in years:
            storages[year] = flt_storage(os.path.join(self.cache.get_storage_location(), '%s' % year))
        df = pd.DataFrame(columns=["Table", "Less-than-ref", "More-than-ref", "Year", "Size", "Size-ref"])
        tables = self.cache.get_table_names()
        for table in tables:
            columns_list = self.cache.get_column_names(table)
            # builtin set replaces the removed Python-2 sets.Set
            columns = set(columns_list)
            ref_columns_list = self.reference_storage.get_column_names(table, lowercase=True)
            ref_columns = set(ref_columns_list)
            more = columns.difference(ref_columns)  # columns not in the reference
            less = ref_columns.difference(columns)  # columns missing from this cache
            samesize = True
            if check_size:
                # compare sizes via the first column of each table
                table_size = self.cache.load_table(table, columns_list[0])[columns_list[0]].size
                reftable_size = self.reference_storage.load_table(table, ref_columns_list[0])[ref_columns_list[0]].size
                if table_size != reftable_size:
                    samesize = False
            if len(more) == 0 and len(less) == 0 and samesize:
                continue  # table matches the reference
            df.loc[df.shape[0]] = [table, ', '.join(less), ', '.join(more), '', 0, 0]
            if len(more) == 0 and samesize:
                continue
            # if there are columns in the "more" column, write out the corresponding years
            columns_and_years = self.cache._get_column_names_and_years(table)
            more_years = []
            for col, year in columns_and_years:
                if col in more:
                    more_years.append(year)
            df.loc[df.shape[0]-1, "Year"] = ', '.join(np.unique(np.array(more_years).astype("str")))
            if not samesize:
                # there is a difference in table sizes
                df.loc[df.shape[0]-1, "Size"] = table_size
                df.loc[df.shape[0]-1, "Size-ref"] = reftable_size
        if not check_size or (df['Size'].sum() == 0 and df['Size-ref'].sum() == 0):
            # remove the size columns if not used
            del df['Size']
            del df['Size-ref']
        if df.shape[0] > 0:
            logger.log_status("Differences in data structure relative to %s:" % self.reference_storage.get_storage_location())
            logger.log_status(df)
        else:
            logger.log_status("Data structure corresponds to the one in %s" % self.reference_storage.get_storage_location())
        # restore the original simulation time (the original code saved
        # year_orig but never restored it; DatasetSizeModel.run does restore)
        SimulationState().set_current_time(year_orig)
        return df

    def years_in_cache(self):
        """Return the sorted list of years present in the cache."""
        return self.cache._get_sorted_list_of_years(start_with_current_year=False)