Exemplo n.º 1
0
class DatasetSizeModel(Model):
    """Checks if all datasets after collapsing over all years have attributes of the same size.

    For every table in the cache, each column is loaded once and the sizes of
    all columns are compared. Columns whose size differs from the most common
    size in that table are reported through the logger.
    """

    def __init__(self, directory=None):
        """
        "directory" is the cache directory to be checked. If it is None,
        the cache directory of the current SimulationState is taken.
        """
        if directory is None:
            directory = SimulationState().get_cache_directory()
        self.cache = AttributeCache(directory)

    def run(self):
        """
        Compare the attribute sizes of every table in the cache.

        The simulation time is set to the first year in the cache for the
        duration of the check and is restored afterwards, also when the check
        fails with an exception.

        Returns a pandas Series indexed by table name that holds the number of
        columns whose size differs from the most common size in that table
        (0 for consistent tables).
        """
        year_orig = SimulationState().get_current_time()
        years = self.years_in_cache()
        SimulationState().set_current_time(years[0])
        try:
            counts = self._count_inconsistencies(years)
        finally:
            # never leave the simulation state pointing at the first cached year
            SimulationState().set_current_time(year_orig)
        logger.log_status("Model total:", counts.sum(), ' size inconsistencies found.')
        return counts

    def _count_inconsistencies(self, years):
        """Return the per-table count of columns with a deviating size and log the details."""
        location = self.cache.get_storage_location()
        storages = {}
        for year in years:
            storages[year] = flt_storage(os.path.join(location, '%s' % year))
        tables = self.cache.get_table_names()
        counts = pd.Series(np.zeros(len(tables), dtype="int32"), index=tables)
        for table in tables:
            names = []
            colyears = []
            sizes = []
            seen = set()
            for col, year in self.cache._get_column_names_and_years(table):
                # only the first (column, year) pair delivered for a column is used
                if col in seen:
                    continue
                seen.add(col)
                data = storages[year].load_table(table, column_names=col)
                sizes.append(data[col].size)
                names.append(col)
                colyears.append(year)
            sizes = np.array(sizes)
            if sizes.size == 0 or np.all(sizes == sizes[0]):
                continue  # no columns at all, or all attributes have the same size
            # there is an inconsistency in attributes length
            names = np.array(names)
            colyears = np.array(colyears)
            unique_sizes, frequencies = np.unique(sizes, return_counts=True)
            # the most frequent size is taken as the expected one
            common_size = unique_sizes[np.argmax(frequencies)]
            idx = np.where(sizes != common_size)[0]
            df = pd.DataFrame({"column": names[idx], "year": colyears[idx], "size": sizes[idx]})
            # last row summarizes the columns that have the expected size
            summary = pd.DataFrame({"column": np.array(["all other columns"]), "year": np.array([years[0]]), "size": np.array([common_size])})
            df = pd.concat([df, summary])
            logger.log_status("Inconsistency in table ", table, ":\n", df)
            counts[table] = df.shape[0] - 1  # do not count the summary row
        return counts

    def years_in_cache(self):
        """Return the sorted list of years found in the cache, as delivered by the cache object."""
        return self.cache._get_sorted_list_of_years(start_with_current_year=False)
Exemplo n.º 2
0
class DataStructureModel(Model):
    """
    Checks the structure of datasets in a given cache (or run cache) when compared to a reference cache.
    It writes out all columns that are missing as well as those that are not present in the reference cache.
    It can also compare the sizes of the datasets.
    """
    def __init__(self, reference_location=None):
        """
        "reference_location" is the directory of the reference cache and should include the year.
        If it is None, the simulation directory in its start year is taken.
        """
        if reference_location is None:
            reference_location = os.path.join(SimulationState().get_cache_directory(), "%s" % SimulationState().get_start_time())
        self.reference_storage = flt_storage(reference_location)

    def run(self, directory=None, check_size=True):
        """
        "directory" is the cache to be compared to the reference. It should not include the year
        as the model checks all years. If it is None, the cache directory of the current
        SimulationState is taken.
        Set "check_size" to False if no size check of the datasets is required.

        The simulation time is set to the first year in the cache for the duration of the
        comparison and is restored afterwards, also when the comparison fails with an exception.

        Returns a pandas DataFrame with one row per table that differs from the reference.
        The columns "Size" and "Size-ref" are only present if a size check was requested
        and at least one non-zero size was recorded.
        """
        if directory is None:
            directory = SimulationState().get_cache_directory()
        self.cache = AttributeCache(directory)
        year_orig = SimulationState().get_current_time()
        years = self.years_in_cache()
        SimulationState().set_current_time(years[0])
        try:
            df = self._compare_with_reference(check_size)
        finally:
            # never leave the simulation state pointing at the first cached year
            SimulationState().set_current_time(year_orig)

        if not check_size or (df['Size'].sum() == 0 and df['Size-ref'].sum() == 0):
            # remove the size columns if not used
            del df['Size']
            del df['Size-ref']
        if df.shape[0] > 0:
            logger.log_status("Differences in data structure relative to %s:" % self.reference_storage.get_storage_location())
            logger.log_status(df)
        else:
            logger.log_status("Data structure corresponds to the one in %s" % self.reference_storage.get_storage_location())
        return df

    def _compare_with_reference(self, check_size):
        """Return a DataFrame with one row per table whose columns or size differ from the reference."""
        df = pd.DataFrame(columns=["Table", "Less-than-ref", "More-than-ref", "Year", "Size", "Size-ref"])
        for table in self.cache.get_table_names():
            columns_list = self.cache.get_column_names(table)
            ref_columns_list = self.reference_storage.get_column_names(table, lowercase=True)
            columns = set(columns_list)
            ref_columns = set(ref_columns_list)
            more = columns.difference(ref_columns)  # columns only in the checked cache
            less = ref_columns.difference(columns)  # columns only in the reference
            samesize = True
            table_size = 0
            reftable_size = 0
            # a size can only be measured if both sides have at least one column
            if check_size and len(columns_list) > 0 and len(ref_columns_list) > 0:
                # the size of the first column stands for the size of the whole table
                table_size = self.cache.load_table(table, columns_list[0])[columns_list[0]].size
                reftable_size = self.reference_storage.load_table(table, ref_columns_list[0])[ref_columns_list[0]].size
                samesize = table_size == reftable_size
            if len(more) == 0 and len(less) == 0 and samesize:
                continue
            row = df.shape[0]
            # names are sorted to make the report deterministic
            df.loc[row] = [table, ', '.join(sorted(less)), ', '.join(sorted(more)), '', 0, 0]
            if len(more) == 0 and samesize:
                continue
            # if there are columns in the "more" column, write out the corresponding years
            columns_and_years = self.cache._get_column_names_and_years(table)
            more_years = [year for col, year in columns_and_years if col in more]
            df.loc[row, "Year"] = ', '.join(np.unique(np.array(more_years).astype("str")))
            if not samesize:  # there is difference in table sizes
                df.loc[row, "Size"] = table_size
                df.loc[row, "Size-ref"] = reftable_size
        return df

    def years_in_cache(self):
        """Return the sorted list of years found in the cache, as delivered by the cache object."""
        return self.cache._get_sorted_list_of_years(start_with_current_year=False)