def read_timeseries_data(self, data_column_names='value', hide_exceptions=False, **filters):  # This function needs to be sped up
    """Read timeseries data from the database into ``self.raw_values``.

    Rows of ``self.sql_data_table`` whose ``self.data_id_key`` column equals
    ``self.id`` (plus any extra ``filters``) are requested through
    ``util.sql_read_table``. The index columns (``self.index_levels`` mapped
    through ``self.column_names``) and the requested data columns are pulled
    out of every row, the data columns are multiplied by ``self.unit_prefix``
    when that attribute exists, and the result is stored as a DataFrame
    indexed by ``self.df_index_names`` and sorted by that index.

    If the query returns nothing truthy, ``self.raw_values`` is set to None.

    Args:
        data_column_names (string or list): name(s) of the data column(s) to read
        hide_exceptions (bool): if falsy, rows that cannot be converted are
            printed before being skipped; if truthy they are skipped silently
        **filters: additional column=value constraints passed to
            ``util.sql_read_table``
    """
    # rowmap is used in ordering the data when read from the sql table
    headers = util.sql_read_headers(self.sql_data_table)
    rowmap = [headers.index(self.column_names[level]) for level in self.index_levels]
    data_columns = util.put_in_list(data_column_names)
    data_col_ind = [headers.index(data_col) for data_col in data_columns]

    # Always select on this object's id; with no extra filters this reduces to
    # the single-key lookup, so one code path covers both cases.
    selection = dict({self.data_id_key: self.id}, **filters)
    read_data = util.sql_read_table(self.sql_data_table, return_iterable=True, **selection)
    if not read_data:
        self.raw_values = None
        return

    # Loop invariant: resolve the unit prefix once instead of once per cell.
    unit_prefix = getattr(self, 'unit_prefix', 1)

    # read each line of the data_table matching an id and assign the value to self.raw_values
    data = []
    for row in read_data:
        try:
            data.append([row[i] for i in rowmap] + [row[i] * unit_prefix for i in data_col_ind])
        except Exception:
            # Best-effort read: a row that cannot be converted is skipped.
            # Only ordinary errors are caught so Ctrl-C / SystemExit still
            # propagate. The comprehension index is not reported because it is
            # not visible here on Python 3.
            if not hide_exceptions:
                print((self.id, row))

    column_names = self.df_index_names + data_columns
    self.raw_values = pd.DataFrame(data, columns=column_names).set_index(keys=self.df_index_names).sort_index()
def read_timeseries_data(self, data_column_names='value', hide_exceptions=False, **filters):  # This function needs to be sped up
    """Read timeseries data from the database into ``self.raw_values``.

    The rows of ``self.sql_data_table`` matching ``self.data_id_key == self.id``
    and any additional ``filters`` are fetched with ``util.sql_read_table``.
    Each row is reduced to its index columns followed by its data columns
    (scaled by ``self.unit_prefix`` when the attribute exists). The rows are
    assembled into a DataFrame indexed by ``self.df_index_names`` and sorted
    by index. When the query result is falsy, ``self.raw_values`` becomes None.

    Args:
        data_column_names (string or list): name(s) of the data column(s) to read
        hide_exceptions (bool): when falsy, a row that fails conversion is
            printed before it is skipped; when truthy it is skipped silently
        **filters: extra column=value constraints forwarded to
            ``util.sql_read_table``
    """
    # rowmap is used in ordering the data when read from the sql table
    headers = util.sql_read_headers(self.sql_data_table)
    rowmap = [headers.index(self.column_names[level]) for level in self.index_levels]
    value_columns = util.put_in_list(data_column_names)
    data_col_ind = [headers.index(data_col) for data_col in value_columns]

    # The id constraint is always applied; merging with an empty ``filters``
    # yields the id-only query, so no separate branch is needed.
    query = dict({self.data_id_key: self.id}, **filters)
    read_data = util.sql_read_table(self.sql_data_table, return_iterable=True, **query)

    if read_data:
        # Hoisted out of the row loop: the prefix does not change per cell.
        unit_prefix = getattr(self, 'unit_prefix', 1)

        # read each line of the data_table matching an id and assign the value to self.raw_values
        data = []
        for row in read_data:
            try:
                index_part = [row[i] for i in rowmap]
                value_part = [row[i] * unit_prefix for i in data_col_ind]
            except Exception:
                # Skip rows that cannot be converted, but let
                # KeyboardInterrupt / SystemExit through. The comprehension
                # index is out of scope here on Python 3, so only the id and
                # the row are reported.
                if not hide_exceptions:
                    print((self.id, row))
            else:
                data.append(index_part + value_part)

        column_names = self.df_index_names + value_columns
        frame = pd.DataFrame(data, columns=column_names)
        self.raw_values = frame.set_index(keys=self.df_index_names).sort_index()
    else:
        self.raw_values = None
def remap(self, map_from='raw_values', map_to='values', drivers=None, time_index_name='year',
          time_index=None, fill_timeseries=True, interpolation_method='missing', extrapolation_method='missing',
          converted_geography=None, current_geography=None, current_data_type=None, fill_value=0., lower=0, upper=None):
    """
    Map data to drivers and geography

    Args:
        map_from (string): starting variable name (defaults to 'raw_values')
        map_to (string): ending variable name (defaults to 'values')
        drivers (list of or single dataframe): drivers for the remap
        time_index_name (string): name of the time level; the attribute
            ``time_index_name + "s"`` supplies the default time index
        time_index: time index handed to clean_timeseries (defaults to the
            attribute above, else the configured case years)
        fill_timeseries (bool): whether clean_timeseries is run
        interpolation_method, extrapolation_method: forwarded to clean_timeseries
        converted_geography: target geography (defaults to the configured primary geography)
        current_geography: geography of the data (defaults to self.geography)
        current_data_type (string): compared against 'total' (defaults to self.input_type)
        fill_value: forwarded to geo_map
        lower, upper: forwarded to clean_timeseries when there are no drivers

    Raises:
        ValueError: if current_geography is not an index level of the map_from dataframe
    """
    if converted_geography is None:
        converted_geography = cfg.cfgfile.get('case', 'primary_geography')
    if current_data_type is None:
        current_data_type = self.input_type
    if current_geography is None:
        current_geography = self.geography
    # TODO fix pluralization
    if time_index is None:
        plural_attr = time_index_name + "s"
        if hasattr(self, plural_attr):
            time_index = getattr(self, plural_attr)
        else:
            time_index = cfg.cfgfile.get('case', 'years')

    # Seed the destination attribute with a copy of the source dataframe.
    setattr(self, map_to, getattr(self, map_from).copy())

    source = getattr(self, map_from)
    multi_level = source.index.nlevels > 1
    level_names = source.index.names if multi_level else [source.index.name]
    if current_geography not in level_names:
        raise ValueError('current geography does not match the geography of the dataframe in remap')
    # Elements present on the geography level of the source index.
    if multi_level:
        geography_elements = source.index.levels[util.position_in_index(source, current_geography)]
    else:
        geography_elements = source.index.tolist()

    if (drivers is None) or (not len(drivers)):
        # No drivers: fill the timeseries, then move to the target geography.
        if fill_timeseries:
            self.clean_timeseries(attr=map_to, inplace=True, time_index=time_index,
                                  time_index_name=time_index_name,
                                  interpolation_method=interpolation_method,
                                  extrapolation_method=extrapolation_method,
                                  lower=lower, upper=upper)
        if current_geography != converted_geography:
            self.geo_map(converted_geography, attr=map_to, inplace=True,
                         current_geography=current_geography,
                         current_data_type=current_data_type, fill_value=fill_value)
            current_geography = converted_geography
    else:
        combined_driver = DfOper.mult(util.put_in_list(drivers))

        if len(geography_elements) > 1 and current_geography != converted_geography:
            # While not on primary geography, geography does have some information we would like to preserve
            self.geo_map(converted_geography, attr=map_to, inplace=True,
                         current_geography=current_geography,
                         current_data_type=current_data_type, fill_value=fill_value)
            current_geography = converted_geography

        data_is_total = current_data_type == 'total'
        if data_is_total:
            # Divide by drivers to turn a total to intensity. multindex_operation will aggregate to common levels.
            intensity = DfOper.divi((getattr(self, map_to), combined_driver),
                                    expandable=(False, True), collapsible=(False, True))
            setattr(self, map_to, intensity)

        # Clean the timeseries as an intensity
        if fill_timeseries:
            self.clean_timeseries(attr=map_to, inplace=True, time_index=time_index,
                                  interpolation_method=interpolation_method,
                                  extrapolation_method=extrapolation_method)

        # Multiply the (cleaned) intensity by the drivers again.
        if data_is_total:
            scaled = DfOper.mult((getattr(self, map_to), combined_driver))
        else:
            scaled = DfOper.mult((getattr(self, map_to), combined_driver),
                                 expandable=(True, False), collapsible=(False, True))
        setattr(self, map_to, scaled)

        self.ensure_correct_geography(map_to, converted_geography, current_geography, current_data_type)
def remap(self, map_from='raw_values', map_to='values', drivers=None, time_index_name='year',
          time_index=None, fill_timeseries=True, interpolation_method='missing', extrapolation_method='missing',
          converted_geography=None, current_geography=None, current_data_type=None, fill_value=0., lower=0, upper=None):
    """
    Map data to drivers and geography

    Args:
        map_from (string): starting variable name (defaults to 'raw_values')
        map_to (string): ending variable name (defaults to 'values')
        drivers (list of or single dataframe): drivers for the remap
        time_index_name (string): name of the time level; the attribute
            ``time_index_name + "s"`` supplies the default time index
        time_index: time index handed to clean_timeseries (defaults to the
            attribute above, else the configured case years)
        fill_timeseries (bool): whether clean_timeseries is run
        interpolation_method, extrapolation_method: forwarded to clean_timeseries
        converted_geography: target geography (defaults to the configured primary geography)
        current_geography: geography of the data (defaults to self.geography)
        current_data_type (string): compared against 'total' (defaults to self.input_type)
        fill_value: forwarded to geo_map and to the DfOper operations
        lower, upper: forwarded to clean_timeseries when there are no drivers

    Raises:
        ValueError: if current_geography is not an index level of the map_from dataframe
    """
    if converted_geography is None:
        converted_geography = cfg.cfgfile.get('case', 'primary_geography')
    if current_data_type is None:
        current_data_type = self.input_type
    if current_geography is None:
        current_geography = self.geography
    # TODO fix pluralization
    if time_index is None:
        plural_attr = time_index_name + "s"
        if hasattr(self, plural_attr):
            time_index = getattr(self, plural_attr)
        else:
            time_index = cfg.cfgfile.get('case', 'years')

    # Seed the destination attribute with a copy of the source dataframe.
    setattr(self, map_to, getattr(self, map_from).copy())

    source_index = getattr(self, map_from).index
    if source_index.nlevels > 1:
        level_names = source_index.names
    else:
        level_names = [source_index.name]
    if current_geography not in level_names:
        raise ValueError('current geography does not match the geography of the dataframe in remap')

    have_drivers = not ((drivers is None) or (not len(drivers)))
    if not have_drivers:
        # No drivers: fill the timeseries, then move to the target geography.
        if fill_timeseries:
            self.clean_timeseries(attr=map_to, inplace=True, time_index=time_index,
                                  time_index_name=time_index_name,
                                  interpolation_method=interpolation_method,
                                  extrapolation_method=extrapolation_method,
                                  lower=lower, upper=upper)
        if current_geography != converted_geography:
            self.geo_map(converted_geography, attr=map_to, inplace=True,
                         current_geography=current_geography,
                         current_data_type=current_data_type, fill_value=fill_value)
            current_geography = converted_geography
    else:
        combined_driver = DfOper.mult(util.put_in_list(drivers))

        if current_geography != converted_geography:
            # While not on primary geography, geography does have some information we would like to preserve
            self.geo_map(converted_geography, attr=map_to, inplace=True,
                         current_geography=current_geography,
                         current_data_type=current_data_type, fill_value=fill_value)
            current_geography = converted_geography

        data_is_total = current_data_type == 'total'
        if data_is_total:
            # Divide by drivers to turn a total to intensity. multindex_operation will aggregate to common levels.
            quotient = DfOper.divi((getattr(self, map_to), combined_driver),
                                   expandable=(False, True), collapsible=(False, True),
                                   fill_value=fill_value)
            # Positive infinity and NaN results are replaced by 0.
            setattr(self, map_to, quotient.replace([np.inf, np.nan, -np.nan], 0))

        # Clean the timeseries as an intensity
        if fill_timeseries:
            self.clean_timeseries(attr=map_to, inplace=True, time_index=time_index,
                                  interpolation_method=interpolation_method,
                                  extrapolation_method=extrapolation_method)

        # Multiply the (cleaned) intensity by the drivers again.
        if data_is_total:
            scaled = DfOper.mult((getattr(self, map_to), combined_driver), fill_value=fill_value)
        else:
            scaled = DfOper.mult((getattr(self, map_to), combined_driver),
                                 expandable=(True, False), collapsible=(False, True),
                                 fill_value=fill_value)
        setattr(self, map_to, scaled)

        self.ensure_correct_geography(map_to, converted_geography, current_geography, current_data_type)
def project(self, map_from='raw_values', map_to='values', additional_drivers=None, time_index_name='year',
            fill_timeseries=True, converted_geography=None, current_geography=None, current_data_type=None):
    """Project ``map_from`` onto its drivers and store the result in ``map_to``.

    If the object has denominator drivers (attributes named in
    ``cfg.dnmtr_col_names`` that are not None), the data must be of type
    'intensity'; it is multiplied by the product of those drivers and is then
    treated as 'total'. The remaining drivers (``cfg.drivr_col_names`` plus
    ``additional_drivers``) are handed to ``self.remap``.

    After the first call ``self.projected_input_type`` is set to 'total'; on
    later calls that attribute overrides ``current_data_type`` and the
    denominator drivers are skipped.

    Args:
        map_from (string): starting variable name (defaults to 'raw_values')
        map_to (string): ending variable name (defaults to 'values')
        additional_drivers (list of or single dataframe): extra drivers
            appended to the object's own drivers
        time_index_name (string): forwarded to remap
        fill_timeseries (bool): forwarded to remap
        converted_geography: target geography (defaults to the configured primary geography)
        current_geography: geography of the data (defaults to self.geography)
        current_data_type (string): defaults to self.input_type

    Raises:
        ValueError: if denominator drivers exist but the data type is not 'intensity'
    """
    converted_geography = cfg.cfgfile.get('case', 'primary_geography') if converted_geography is None else converted_geography
    current_data_type = self.input_type if current_data_type is None else current_data_type
    if hasattr(self, 'projected_input_type'):
        current_data_type = self.projected_input_type
        denominator_driver_ids = []
    else:
        denominator_driver_ids = [getattr(self, col) for col in cfg.dnmtr_col_names if getattr(self, col) is not None]

    current_geography = self.geography if current_geography is None else current_geography
    setattr(self, map_to, getattr(self, map_from).copy())

    if denominator_driver_ids:
        if current_data_type != 'intensity':
            raise ValueError(str(self.__class__) + ' id ' + str(self.id) + ': type must be intensity if variable has denominator drivers')
        if len(self.index_levels['geography_id']) > 1 and (current_geography != converted_geography):
            # While not on primary geography, geography does have some information we would like to preserve
            self.geo_map(converted_geography, attr=map_to, inplace=True)
            current_geography = converted_geography
        total_driver = DfOper.mult([self.drivers[driver_id].values for driver_id in denominator_driver_ids])
        try:
            setattr(self, map_to, DfOper.mult((getattr(self, map_to), total_driver)))
        except Exception:
            # Dump both operands for debugging, then re-raise. Continuing
            # would mark un-multiplied intensity data as 'total' and silently
            # produce wrong results downstream.
            print(getattr(self, map_to))
            print(total_driver)
            raise
        # the datatype is now total
        current_data_type = 'total'

    driver_ids = [getattr(self, col) for col in cfg.drivr_col_names if getattr(self, col) is not None]
    drivers = [self.drivers[driver_id].values for driver_id in driver_ids]
    if additional_drivers is not None:
        drivers += util.put_in_list(additional_drivers)

    # both map_from and map_to are the same
    self.remap(map_from=map_to, map_to=map_to, drivers=drivers,
               time_index_name=time_index_name, fill_timeseries=fill_timeseries,
               converted_geography=converted_geography,
               current_geography=current_geography,
               current_data_type=current_data_type)
    self.projected_input_type = 'total'
def project(self, map_from='raw_values', map_to='values', additional_drivers=None, interpolation_method='missing', extrapolation_method='missing',
            time_index_name='year', fill_timeseries=True, converted_geography=None, current_geography=None, current_data_type=None,
            fill_value=0., projected=False, filter_geo=True):
    """Project ``map_from`` onto its drivers and store the result in ``map_to``.

    Denominator drivers (attributes named in ``cfg.dnmtr_col_names`` that are
    not None) are applied first, unless ``map_from`` is not 'raw_values' and
    the data type is already 'total'. The data is multiplied by their product
    on ``cfg.disagg_geography`` and from then on treated as 'total'. The
    remaining drivers (``cfg.drivr_col_names`` plus ``additional_drivers``)
    are passed to ``self.remap``.

    Args:
        map_from (string): starting variable name (defaults to 'raw_values')
        map_to (string): ending variable name (defaults to 'values')
        additional_drivers (list of or single dataframe): extra drivers
            appended to the object's own drivers
        interpolation_method, extrapolation_method: forwarded to remap
        time_index_name (string): forwarded to remap
        fill_timeseries (bool): forwarded to remap
        converted_geography: target geography (defaults to cfg.primary_geography)
        current_geography: geography of the data (defaults to self.geography)
        current_data_type (string): defaults to self.input_type
        fill_value: forwarded to remap
        projected: not used inside this method
        filter_geo: forwarded to remap

    Raises:
        ValueError: if denominator drivers exist but the data type is not 'intensity'
    """
    if converted_geography is None:
        converted_geography = cfg.primary_geography
    if current_data_type is None:
        current_data_type = self.input_type

    if map_from != 'raw_values' and current_data_type == 'total':
        denominator_driver_ids = []
    else:
        candidate_ids = (getattr(self, col) for col in cfg.dnmtr_col_names)
        denominator_driver_ids = [driver_id for driver_id in candidate_ids if driver_id is not None]

    if current_geography is None:
        current_geography = self.geography
    setattr(self, map_to, getattr(self, map_from).copy())

    if denominator_driver_ids:
        if current_data_type != 'intensity':
            raise ValueError(str(self.__class__) + ' id ' + str(self.id) + ': type must be intensity if variable has denominator drivers')

        if current_geography != converted_geography:
            # While not on primary geography, geography does have some information we would like to preserve
            df, current_geography = self.account_for_foreign_gaus(map_from, current_data_type, current_geography)
            setattr(self, map_to, df)
            self.geo_map(converted_geography, current_geography=current_geography, attr=map_to, inplace=True)
            current_geography = converted_geography

        denominator_frames = [self.drivers[driver_id].values for driver_id in denominator_driver_ids]
        total_driver = util.DfOper.mult(denominator_frames)

        # Map to the disaggregation geography as an intensity, multiply by the
        # drivers there, then map the product back as a total.
        self.geo_map(current_geography=current_geography, attr=map_to,
                     converted_geography=cfg.disagg_geography, current_data_type='intensity')
        product = util.DfOper.mult((getattr(self, map_to), total_driver))
        setattr(self, map_to, product)
        self.geo_map(current_geography=cfg.disagg_geography, attr=map_to,
                     converted_geography=current_geography, current_data_type='total')

        # the datatype is now total
        current_data_type = 'total'

    drivers = []
    for col in cfg.drivr_col_names:
        driver_id = getattr(self, col)
        if driver_id is not None:
            drivers.append(self.drivers[driver_id].values)
    if additional_drivers is not None:
        drivers.extend(util.put_in_list(additional_drivers))

    # both map_from and map_to are the same
    self.remap(map_from=map_to, map_to=map_to, drivers=drivers,
               time_index_name=time_index_name, fill_timeseries=fill_timeseries,
               interpolation_method=interpolation_method,
               extrapolation_method=extrapolation_method,
               converted_geography=converted_geography,
               current_geography=current_geography,
               current_data_type=current_data_type,
               fill_value=fill_value, filter_geo=filter_geo)
def read_timeseries_data(self, data_column_names='value', **filters):  # This function needs to be sped up
    """Read timeseries data from the database into ``self.raw_values``.

    Rows of ``self.sql_data_table`` with ``self.data_id_key == self.id`` (plus
    any extra ``filters``) are fetched. If the table has a 'sensitivity'
    column, the rows are additionally restricted to the sensitivity that
    ``self.scenario`` specifies for this table and id, or to rows where
    sensitivity is NULL when none applies.

    The index columns and the requested data columns are taken from each row,
    the data columns are multiplied by ``self.unit_prefix`` when that
    attribute exists, and the result is stored as a DataFrame indexed by
    ``self.df_index_names`` and sorted by index. Duplicate index entries are
    logged and collapsed to the first occurrence.

    When no rows are found ``self.raw_values`` is set to None and, if
    ``self.data`` is truthy, the absence is logged at a level that depends on
    whether a reference technology exists and whether the table is a cost
    table.

    Args:
        data_column_names (string or list): name(s) of the data column(s) to read
        **filters: additional column=value constraints passed to
            ``util.sql_read_table``

    Raises:
        Exception: any error raised while converting a row is logged with the
            table name and row, then re-raised unchanged.
    """
    headers = util.sql_read_headers(self.sql_data_table)
    filters[self.data_id_key] = self.id

    # Check for a sensitivity specification for this table and id. If there is no relevant sensitivity specified
    # but the data table has a sensitivity column, we set the sensitivity filter to "None", which will filter
    # the data table rows down to those where sensitivity is NULL, which is the default, no-sensitivity condition.
    if 'sensitivity' in headers:
        filters['sensitivity'] = None
        if hasattr(self, 'scenario'):
            # Note that this will return None if the scenario doesn't specify a sensitivity for this table and id
            filters['sensitivity'] = self.scenario.get_sensitivity(self.sql_data_table, self.id)

    # read each line of the data_table matching an id and assign the value to self.raw_values
    read_data = util.sql_read_table(self.sql_data_table, return_iterable=True, **filters)
    self.inspect_index_levels(headers, read_data)
    self._validate_other_indexes(headers, read_data)

    # rowmap is used in ordering the data when read from the sql table
    rowmap = [headers.index(self.column_names[level]) for level in self.index_levels]
    data_col_ind = [headers.index(data_col) for data_col in util.put_in_list(data_column_names)]

    unit_prefix = self.unit_prefix if hasattr(self, 'unit_prefix') else 1
    if read_data:
        data = []
        for row in read_data:
            try:
                data.append([row[i] for i in rowmap] + [row[i] * unit_prefix for i in data_col_ind])
            except Exception:
                logging.warning('error reading table: {}, row: {}'.format(self.sql_data_table, row))
                raise

        column_names = self.df_index_names + util.put_in_list(data_column_names)
        # DataFrame.sort() is deprecated and was later removed from pandas.
        # With no arguments it sorted by index labels, which sort_index() does
        # explicitly.
        self.raw_values = pd.DataFrame(data, columns=column_names).set_index(keys=self.df_index_names).sort_index()

        # print the duplicate values
        duplicate_index = self.raw_values.index.duplicated(keep=False)  # keep = False keeps all of the duplicate indices
        if duplicate_index.any():
            logging.warning('Duplicate indices in table: {}, parent id: {}, by default the first index will be kept.'.format(self.sql_data_table, self.id))
            logging.warning(self.raw_values[duplicate_index])
            self.raw_values = self.raw_values.groupby(level=self.raw_values.index.names).first()
    else:
        self.raw_values = None
        # We didn't find any timeseries data for this object, so now we want to let the user know if that
        # might be a problem. We only expect to find timeseries data if self actually existed in the database
        # (as opposed to being a placeholder). The existence of self in the database is flagged by self.data.
        if self.data:
            if getattr(self, 'reference_tech_id', None):
                logging.debug('No {} found for {} with id {}; using reference technology values instead.'.format(
                    self.sql_data_table, self.sql_id_table, self.id
                ))
            else:
                msg = 'No {} or reference technology found for {} with id {}.'.format(
                    self.sql_data_table, self.sql_id_table, self.id
                )
                if re.search("Cost(New|Replacement)?Data$", self.sql_data_table):
                    # The model can run fine without cost data and this is sometimes useful during model
                    # development so we just gently note if cost data is missing.
                    logging.debug(msg)
                else:
                    # Any other missing data is likely to be a real problem so we complain
                    logging.critical(msg)
def remap(self, map_from='raw_values', map_to='values', drivers=None, time_index_name='year',
          time_index=None, fill_timeseries=True, interpolation_method='missing', extrapolation_method='missing',
          converted_geography=None, current_geography=None, current_data_type=None, fill_value=0., lower=0, upper=None,
          filter_geo=True, driver_geography=None):
    """
    Map data to drivers and geography

    Args:
        map_from (string): starting variable name (defaults to 'raw_values')
        map_to (string): ending variable name (defaults to 'values')
        drivers (list of or single dataframe): drivers for the remap
        time_index_name (string): name of the time level (defaults to 'year')
        time_index: time index; resolved through self._get_active_time_index
        fill_timeseries (bool): whether clean_timeseries is run on the data
        interpolation_method, extrapolation_method: forwarded to clean_timeseries
        converted_geography: target geography (defaults to cfg.primary_geography)
        current_geography: geography of the data (defaults to self.geography)
        current_data_type (string): compared against 'total' (defaults to self.input_type)
        fill_value: forwarded to geo_map and to the DfOper operations
        lower, upper: forwarded to clean_timeseries when there are no drivers
        filter_geo: forwarded to the geo_map calls on the data
        driver_geography: geography the drivers are on (defaults to cfg.disagg_geography)

    Raises:
        ValueError: if current_geography is not an index level of the map_from dataframe
    """
    driver_geography = cfg.disagg_geography if driver_geography is None else driver_geography
    converted_geography = cfg.primary_geography if converted_geography is None else converted_geography
    current_data_type = self.input_type if current_data_type is None else current_data_type
    current_geography = self.geography if current_geography is None else current_geography
    time_index = self._get_active_time_index(time_index, time_index_name)
    if current_geography not in self._get_df_index_names_in_a_list(getattr(self, map_from)):
        raise ValueError('Current geography does not match the geography of the dataframe in remap')

    # deals with foreign gaus and updates the geography
    df, current_geography = self.account_for_foreign_gaus(map_from, current_data_type, current_geography)
    setattr(self, map_to, df)

    # This happens when we are on a geography level and some of the elements are missing. Such as no PR when we have all the other U.S. States.
    setattr(self, map_to, self._add_missing_geographies(df, current_geography, current_data_type))

    if (drivers is None) or (not len(drivers)):
        # we have no drivers, just need to do a clean timeseries and a geomap
        if fill_timeseries:
            self.clean_timeseries(attr=map_to, inplace=True, time_index=time_index,
                                  time_index_name=time_index_name,
                                  interpolation_method=interpolation_method,
                                  extrapolation_method=extrapolation_method,
                                  lower=lower, upper=upper)
        if current_geography != converted_geography:
            self.geo_map(converted_geography, attr=map_to, inplace=True,
                         current_geography=current_geography,
                         current_data_type=current_data_type,
                         fill_value=fill_value, filter_geo=filter_geo)
            current_geography = converted_geography
    else:
        # becomes an attribute of self just because we may do a geomap on it
        self.total_driver = DfOper.mult(util.put_in_list(drivers))
        # turns out we don't always have a year or vintage column for drivers. For instance when linked_demand_technology gets remapped
        if time_index_name in self.total_driver.index.names:
            # sometimes when we have a linked service demand driver in a demand subsector it will come in on a fewer number of years than self.years, making this clean timeseries necesary
            self.clean_timeseries(attr='total_driver', inplace=True, time_index_name=time_index_name,
                                  time_index=time_index, lower=None, upper=None,
                                  interpolation_method='missing', extrapolation_method='missing')

        # While not on primary geography, geography does have some information we would like to preserve.
        # The driver is geo-mapped as an intensity only when the drivers passed in match self.drivers
        # in number and every one of those is an 'intensity' input without a base driver.
        if hasattr(self, 'drivers') and len(drivers) == len(self.drivers) and set([x.input_type for x in self.drivers.values()]) == set(['intensity']) and set([x.base_driver_id for x in self.drivers.values()]) == set([None]):
            driver_mapping_data_type = 'intensity'
        else:
            driver_mapping_data_type = 'total'
        # Bring the driver onto the geography the data is currently on.
        total_driver_current_geo = self.geo_map(current_geography, attr='total_driver', inplace=False,
                                                current_geography=driver_geography,
                                                current_data_type=driver_mapping_data_type,
                                                fill_value=fill_value, filter_geo=False)

        if current_data_type == 'total':
            # Divide by drivers to turn a total into an intensity. Infinite
            # results always become 0; NaN results become 0 as well unless the
            # caller passed np.nan itself as fill_value.
            values_to_zero = [np.inf] if fill_value is np.nan else [np.inf, np.nan, -np.nan]
            df_intensity = DfOper.divi((getattr(self, map_to), total_driver_current_geo),
                                       expandable=(False, True), collapsible=(False, True),
                                       fill_value=fill_value).replace(values_to_zero, 0)
            setattr(self, map_to, df_intensity)

        # Clean the timeseries as an intensity
        if fill_timeseries:
            self.clean_timeseries(attr=map_to, inplace=True, time_index=time_index,
                                  interpolation_method=interpolation_method,
                                  extrapolation_method=extrapolation_method)

        # Multiply back by the driver. A failure here propagates to the caller:
        # the previous debugger trap blocked non-interactive runs and then
        # continued with un-multiplied data.
        if current_data_type == 'total':
            setattr(self, map_to, DfOper.mult((getattr(self, map_to), total_driver_current_geo), fill_value=fill_value))
        else:
            setattr(self, map_to, DfOper.mult((getattr(self, map_to), total_driver_current_geo),
                                              expandable=(True, False), collapsible=(False, True),
                                              fill_value=fill_value))

        # The product is mapped to the target geography as a total.
        self.geo_map(converted_geography, attr=map_to, inplace=True,
                     current_geography=current_geography, current_data_type='total',
                     fill_value=fill_value, filter_geo=filter_geo)
        # we don't want to keep this around
        del self.total_driver
def remap(self, map_from='raw_values', map_to='values', drivers=None, time_index_name='year', time_index=None, fill_timeseries=True, interpolation_method='missing', extrapolation_method='missing', converted_geography=None, current_geography=None, current_data_type=None, fill_value=0., lower=0, upper=None, filter_geo=True):
    """
    Map data to drivers and geography

    Args:
        map_from (string): starting variable name (defaults to 'raw_values')
        map_to (string): ending variable name (defaults to 'values')
        drivers (list of or single dataframe): drivers for the remap
        time_index_name (string): name of the time level (defaults to 'year')
        time_index: time index; resolved through self._get_active_time_index
        fill_timeseries (bool): whether clean_timeseries is run on the data
        interpolation_method, extrapolation_method: forwarded to clean_timeseries
        converted_geography: target geography (defaults to cfg.primary_geography)
        current_geography: geography of the data (defaults to self.geography)
        current_data_type (string): compared against 'total' (defaults to self.input_type)
        fill_value: forwarded to geo_map and to the DfOper operations
        lower, upper: forwarded to clean_timeseries when there are no drivers
        filter_geo: forwarded to geo_map and ensure_correct_geography

    Raises:
        ValueError: if current_geography is not an index level of the map_from dataframe
    """
    converted_geography = cfg.primary_geography if converted_geography is None else converted_geography
    current_data_type = self.input_type if current_data_type is None else current_data_type
    current_geography = self.geography if current_geography is None else current_geography
    time_index = self._get_active_time_index(time_index, time_index_name)
    if current_geography not in self._get_df_index_names_in_a_list(
            getattr(self, map_from)):
        raise ValueError(
            'Current geography does not match the geography of the dataframe in remap'
        )

    # deals with foreign gaus and updates the geography
    df, current_geography = self.account_for_foreign_gaus(
        map_from, current_data_type, current_geography)
    setattr(self, map_to, df)

    # This happens when we are on a geography level and some of the elements are missing. Such as no PR when we have all the other U.S. States.
    setattr(
        self, map_to,
        self._add_missing_geographies(df, current_geography,
                                      current_data_type))

    if (drivers is None) or (not len(drivers)):
        # we have no drivers, just need to do a clean timeseries and a geomap
        if fill_timeseries:
            self.clean_timeseries(
                attr=map_to,
                inplace=True,
                time_index=time_index,
                time_index_name=time_index_name,
                interpolation_method=interpolation_method,
                extrapolation_method=extrapolation_method,
                lower=lower,
                upper=upper)
        if current_geography != converted_geography:
            self.geo_map(converted_geography,
                         attr=map_to,
                         inplace=True,
                         current_geography=current_geography,
                         current_data_type=current_data_type,
                         fill_value=fill_value,
                         filter_geo=filter_geo)
            current_geography = converted_geography
    else:
        # Product of all drivers; stored on self because geo_map addresses
        # its input by attribute name (attr='total_driver' below).
        self.total_driver = DfOper.mult(util.put_in_list(drivers))
        if current_geography != converted_geography and len(
                util.put_in_list(drivers)) <= 1:
            # While not on primary geography, geography does have some information we would like to preserve
            # we put the driver on the same geography as our data
            self.geomapped_total_driver = self.geo_map(
                current_geography,
                attr='total_driver',
                inplace=False,
                current_geography=converted_geography,
                current_data_type='total',
                fill_value=fill_value,
                filter_geo=False)
        elif current_geography != converted_geography:
            # More than one driver: move the data to the target geography
            # instead of moving the driver.
            self.geo_map(converted_geography,
                         attr=map_to,
                         inplace=True,
                         current_geography=current_geography,
                         current_data_type=current_data_type,
                         fill_value=fill_value)
            current_geography = converted_geography

        # Divide by drivers to turn a total to intensity. multindex_operation will aggregate to common levels.
        if current_data_type == 'total':
            # The geo-mapped driver is used when the branch above created it;
            # positive infinity and NaN results are replaced by 0.
            df_intensity = DfOper.divi(
                (getattr(self, map_to), self.geomapped_total_driver
                 if hasattr(self, 'geomapped_total_driver') else
                 self.total_driver),
                expandable=(False, True),
                collapsible=(False, True),
                fill_value=fill_value).replace([np.inf, np.nan, -np.nan], 0)
            setattr(self, map_to, df_intensity)

        # Clean the timeseries as an intensity
        if fill_timeseries:
            self.clean_timeseries(
                attr=map_to,
                inplace=True,
                time_index=time_index,
                interpolation_method=interpolation_method,
                extrapolation_method=extrapolation_method)

        # NOTE(review): this geo_map is placed at driver-branch level (not
        # under `if fill_timeseries`) because the multiplication below uses
        # the un-mapped self.total_driver — confirm nesting against the
        # original file.
        self.geo_map(converted_geography,
                     attr=map_to,
                     inplace=True,
                     current_geography=current_geography,
                     current_data_type='intensity',
                     fill_value=fill_value,
                     filter_geo=filter_geo)
        current_geography = converted_geography

        # Drop the temporary attribute so a later call does not pick up a
        # stale geo-mapped driver through the hasattr() check above.
        if hasattr(self, 'geomapped_total_driver'):
            delattr(self, 'geomapped_total_driver')

        # Multiply the intensity by the drivers again.
        if current_data_type == 'total':
            setattr(
                self, map_to,
                DfOper.mult((getattr(self, map_to), self.total_driver),
                            fill_value=fill_value))
        else:
            setattr(
                self, map_to,
                DfOper.mult((getattr(self, map_to), self.total_driver),
                            expandable=(True, False),
                            collapsible=(False, True),
                            fill_value=fill_value))
        self.ensure_correct_geography(map_to,
                                      converted_geography,
                                      current_geography,
                                      current_data_type,
                                      filter_geo=filter_geo)