def test_melt():
    pdf = pd.DataFrame({"A": list("abcd") * 5,
                        "B": list("XY") * 10,
                        "C": np.random.randn(20)})
    ddf = dd.from_pandas(pdf, 4)

    list_eq(dd.melt(ddf), pd.melt(pdf))
    list_eq(dd.melt(ddf, id_vars="C"), pd.melt(pdf, id_vars="C"))
    list_eq(dd.melt(ddf, value_vars="C"), pd.melt(pdf, value_vars="C"))
    list_eq(
        dd.melt(ddf, value_vars=["A", "C"], var_name="myvar"),
        pd.melt(pdf, value_vars=["A", "C"], var_name="myvar"),
    )
    list_eq(
        dd.melt(ddf, id_vars="B", value_vars=["A", "C"], value_name="myval"),
        pd.melt(pdf, id_vars="B", value_vars=["A", "C"], value_name="myval"),
    )
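# For reference, a minimal, self-contained sketch (not part of the test
# suite) of the wide-to-long reshape the assertions above exercise;
# `list_eq` is assumed to compare the two results while ignoring row
# order and partitioning.
import numpy as np
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"A": list("ab") * 2, "C": np.arange(4.0)})
ddf = dd.from_pandas(pdf, npartitions=2)
# Both calls produce the same long-format columns: A, variable, value.
print(pd.melt(pdf, id_vars="A"))
print(dd.melt(ddf, id_vars="A").compute())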
def _stats_plot(self, element, y, data=None):
    """
    Helper method to generate element from indexed dataframe.
    """
    data, x, y = self._process_args(data, None, y)

    opts = {'plot': dict(self._plot_opts),
            'norm': self._norm_opts,
            'style': self._style_opts}

    if not isinstance(y, (list, tuple)):
        ranges = {y: self._dim_ranges['y']}
        return (element(data, self.by, y)
                .redim.range(**ranges)
                .relabel(**self._relabel).opts(**opts))

    labelled = ['y' if self.invert else 'x'] if self.group_label != 'Group' else []
    if self.value_label != 'value':
        labelled.append('x' if self.invert else 'y')
    opts['plot']['labelled'] = labelled

    kdims = [self.group_label]
    ranges = {self.value_label: self._dim_ranges['y']}
    data = data[list(y)]
    # Use dask's melt for dask-backed data, otherwise pandas'.
    if check_library(data, 'dask'):
        from dask.dataframe import melt
    else:
        melt = pd.melt
    df = melt(data, var_name=self.group_label, value_name=self.value_label)
    return (element(df, kdims, self.value_label)
            .redim.range(**ranges)
            .redim(**self._redim)
            .relabel(**self._relabel).opts(**opts))
def _category_plot(self, element, x, y, data):
    """
    Helper method to generate element from indexed dataframe.
    """
    labelled = ['y' if self.invert else 'x'] if x != 'index' else []
    if self.value_label != 'value':
        labelled.append('x' if self.invert else 'y')

    opts = {'plot': dict(self._plot_opts, labelled=labelled),
            'style': dict(self._style_opts),
            'norm': self._norm_opts}
    ranges = {self.value_label: self._dim_ranges['y']}

    id_vars = [x]
    # Reset the index if any id_var refers to an index level or the
    # index name rather than a column.
    if any(v in list(data.index.names) + ['index'] or v == data.index.name
           for v in id_vars):
        data = data.reset_index()
    data = data[y + [x]]

    # Use dask's melt for dask-backed data, otherwise pandas'.
    if check_library(data, 'dask'):
        from dask.dataframe import melt
    else:
        melt = pd.melt
    df = melt(data, id_vars=[x], var_name=self.group_label,
              value_name=self.value_label)
    kdims = [x, self.group_label]
    return (element(df, kdims, self.value_label)
            .redim.range(**ranges)
            .redim(**self._redim)
            .relabel(**self._relabel).opts(**opts))
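# Hypothetical stand-in for the `check_library` helper both methods above
# rely on (the real one is an hvplot/holoviews utility; this sketch only
# assumes it inspects which library the data object comes from, so melt
# can be dispatched to dask or pandas accordingly).
def check_library(obj, library):
    """Return True if `obj` is backed by the given top-level library."""
    return type(obj).__module__.split('.')[0] == library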
def create_diffdiff():
    print("\ngenerating spreads vs spreads...")
    diff_diff = triu(dateless_df, True)

    ## merge abbreviations together, e.g. "XXXX (PBF) - XXXX (SSF)" to
    ## "XXXX - XXXX (PBF-SSF)"
    diff_diff.columns = [
        f'{h.split(" (")[0]}{h.split(" (")[1].split(")")[1]} '
        f'({h.split(" (")[1].split(")")[0]}-{h.split(" (")[2]}'
        for h in diff_diff.columns
    ]
    ## verbose version of the list comprehension above
    # new_headers = []
    # for header in diff_diff.columns:
    #     split_header = header.split(" (")
    #     product_A = split_header[0]
    #     split_section = split_header[1].split(")")
    #     product_B = split_section[1]
    #     new_headers.append(
    #         f"{product_A}{product_B} ({split_section[0]}-{split_header[2]}")
    # diff_diff.columns = new_headers

    diff_diff = triu(diff_diff, False)
    diff_diff = diff_diff.repartition(npartitions=200)
    diff_diff = diff_diff.reset_index(drop=True)

    # Align the date column's partitioning with the frame's before assigning.
    dd_date_col = dd.from_array(date_col)
    dd_date_col = dd_date_col.repartition(npartitions=200)
    dd_date_col = dd_date_col.reset_index(drop=True)
    diff_diff = diff_diff.assign(date=dd_date_col)

    diff_diff = dd.melt(
        diff_diff, id_vars="date", var_name="product_diff",
        value_name="price_diff").dropna().reset_index(drop=True)
    diff_diff["product_diff"] = diff_diff["product_diff"].astype("category")
    diff_diff["differential_A"] = diff_diff["product_diff"].str.partition(
        " - ")[0]
    diff_diff["differential_B"] = diff_diff["product_diff"].str.partition(
        " - ")[2]

    print(f"\nsaving file... ({round((time.time() - starttime), 2)}s)")
    dd.to_csv(df=diff_diff,
              filename=os.path.join(os.getcwd(), "cleaned_data",
                                    "diff_diff.csv"),
              index=False, single_file=True, encoding="utf-8-sig",
              chunksize=10000)
    print(f"[diff_diff.csv] saved successfully... "
          f"({round((time.time() - starttime), 2)}s)")
def calc_enduse(self, eu_fraction_dict, county_energy_dd, temps=False):
    """
    Returns Dask DataFrame
    """
    unitname_eu_dict = {
        'Process Heating': [
            'furnace', 'kiln', 'dryer', 'heater', 'oven', 'calciner',
            'stove', 'htr', 'furn', 'cupola'
            ],
        'Conventional Boiler Use': ['boiler'],
        'CHP and/or Cogeneration Process': ['turbine'],
        'Facility HVAC': ['building heat', 'space heater'],
        'Machine Drive': ['engine', 'compressor', 'pump', 'rice'],
        'Conventional Electricity Generation': ['generator'],
        'Other Nonprocess Use': [
            'hot water', 'crane', 'water heater', 'comfort heater', 'RTO',
            'TODF', 'oxidizer', 'RCO'
            ]
        }

    unittype_eu_dict = {
        'Process Heating': [
            'F', 'PD', 'K', 'PRH', 'O', 'NGLH', 'CF', 'HMH', 'C', 'HPPU',
            'CatH', 'COB', 'FeFL', 'Chemical Recovery Furnace', 'IFCE',
            'Pulp Mill Lime Kiln', 'Lime Kiln',
            'Chemical Recovery Combustion Unit', 'Direct Reduction Furnace',
            'Sulfur Recovery Plant'
            ],
        'Conventional Boiler Use': [
            'OB', 'S', 'PCWW', 'BFB', 'PCWD', 'PCT', 'CFB', 'PCO', 'OFB',
            'PFB'
            ],
        'CHP and/or Cogeneration Process': ['CCCT', 'SCCT'],
        'Facility HVAC': ['CH'],
        'Other Nonprocess Use': [
            'HWH', 'TODF', 'ICI', 'FLR', 'RTO', 'II', 'MWC', 'Flare', 'RCO'
            ],
        'Conventional Electricity Generation': [
            'RICE', 'Electricity Generator'
            ]
        }

    def eu_dict_to_df(eu_dict):
        """
        Convert unit type/unit name dictionaries to dataframes.
        """
        eu_df = pd.DataFrame.from_dict(eu_dict, orient='index').reset_index()
        eu_df = pd.melt(
            eu_df, id_vars='index', value_name='unit'
            ).rename(columns={'index': 'end_use'}).drop('variable', axis=1)
        eu_df = eu_df.dropna().set_index('unit')
        return eu_df

    def eu_unit_type(unit_type, unittype_eu_df):
        """
        Match GHGRP unit type to end use specified in unittype_eu_dict.
        """
        enduse = re.match(r'(\w+) \(', unit_type)
        if enduse is not None:
            enduse = re.match(r'(\w+)', enduse.group())[0]
            if enduse in unittype_eu_df.index:
                enduse = unittype_eu_df.loc[enduse, 'end_use']
            else:
                enduse = np.nan
        else:
            if unit_type in unittype_eu_df.index:
                enduse = unittype_eu_df.loc[unit_type, 'end_use']
        return enduse

    def eu_unit_name(unit_name, unitname_eu_df):
        """
        Find keywords in GHGRP unit name descriptions and match them to
        appropriate end uses based on unitname_eu_dict.
        """
        for i in unitname_eu_df.index:
            enduse = re.search(i, unit_name.lower())
            if enduse is None:
                continue
            else:
                enduse = unitname_eu_df.loc[i, 'end_use']
                return enduse
        enduse = np.nan
        return enduse

    unittype_eu_df = eu_dict_to_df(unittype_eu_dict)
    unitname_eu_df = eu_dict_to_df(unitname_eu_dict)

    # Base ghgrp energy end use disaggregation on reported unit type and
    # unit name.
    eu_ghgrp = self.energy_eia923.copy(deep=True)

    # First match end uses to provided unit types. Most unit types are
    # specified as OCS (other combustion source).
    unit_types = eu_ghgrp.UNIT_TYPE.dropna().unique()
    type_match = list()
    for utype in unit_types:
        enduse = eu_unit_type(utype, unittype_eu_df)
        type_match.append([utype, enduse])
    type_match = pd.DataFrame(type_match, columns=['UNIT_TYPE', 'end_use'])
    eu_ghgrp = pd.merge(eu_ghgrp, type_match, on='UNIT_TYPE', how='left')

    # Next, match end use by unit name for facilities that report OCS for
    # unit type.
    eu_ocs = eu_ghgrp[
        (eu_ghgrp.UNIT_TYPE == 'OCS (Other combustion source)') |
        (eu_ghgrp.UNIT_TYPE.isnull())][['UNIT_TYPE', 'UNIT_NAME']]
    eu_ocs['end_use'] = eu_ocs.UNIT_NAME.apply(
        lambda x: eu_unit_name(x, unitname_eu_df))
    eu_ghgrp.end_use.update(eu_ocs.end_use)

    eu_ghgrp.drop(eu_ghgrp.columns.difference(
        set(['COUNTY_FIPS', 'MECS_Region', 'MMBtu_TOTAL', 'MECS_FT',
             'PRIMARY_NAICS_CODE', 'MECS_NAICS', 'end_use', 'FACILITY_ID'])
        ), axis=1, inplace=True)

    # Sum energy of unit types and unit names matched to an end use.
    eu_ghgrp_matched = eu_ghgrp[eu_ghgrp.end_use.notnull()].pivot_table(
        values='MMBtu_TOTAL', columns='end_use',
        index=['MECS_Region', 'COUNTY_FIPS', 'PRIMARY_NAICS_CODE',
               'MECS_NAICS', 'MECS_FT'],
        aggfunc='sum', fill_value=0)

    eu_ghgrp_matched = eu_ghgrp_matched.join(
        eu_ghgrp.pivot_table(
            values='FACILITY_ID',
            index=['MECS_Region', 'COUNTY_FIPS', 'PRIMARY_NAICS_CODE',
                   'MECS_NAICS', 'MECS_FT'],
            aggfunc='count'))

    # Calculate the remaining GHGRP facilities' energy use with MECS data.
    eu_ghgrp_notmatched = eu_ghgrp[
        (eu_ghgrp.end_use.isnull()) &
        (eu_ghgrp.MECS_FT.notnull())].copy(deep=True)

    enduses = eu_fraction_dict['GHGRP'].columns.values

    eu_ghgrp_notmatched = pd.merge(
        eu_ghgrp_notmatched.set_index(['MECS_NAICS', 'MECS_FT']),
        eu_fraction_dict['GHGRP'], left_index=True, right_index=True,
        how='left')

    for eu in enduses:
        eu_ghgrp_notmatched[eu] = eu_ghgrp_notmatched.MMBtu_TOTAL.multiply(
            eu_ghgrp_notmatched[eu], fill_value=0)

    agg_cols = [x for x in itools.product(enduses, ['sum'])]
    agg_cols.append(('FACILITY_ID', 'count'))

    eu_ghgrp_notmatched = eu_ghgrp_notmatched.reset_index().groupby(
        ['MECS_Region', 'COUNTY_FIPS', 'PRIMARY_NAICS_CODE', 'MECS_NAICS',
         'MECS_FT'], as_index=False).agg(dict(agg_cols))

    eu_ghgrp_notmatched.set_index('MECS_NAICS', inplace=True)

    eu_ghgrp_matched.reset_index(
        ['MECS_Region', 'COUNTY_FIPS', 'PRIMARY_NAICS_CODE', 'MECS_FT'],
        inplace=True)

    for df in [eu_ghgrp_matched, eu_ghgrp_notmatched]:
        df.rename(columns={'PRIMARY_NAICS_CODE': 'NAICS',
                           'FACILITY_ID': 'est_count'}, inplace=True)
        df['data_source'] = 'ghgrp'

    # Calculate end use of energy estimated from MECS data with MECS end
    # use.
    enduses = eu_fraction_dict['nonGHGRP'].columns.values

    eu_energy_dd = dd.merge(
        county_energy_dd[county_energy_dd.data_source == 'mecs_ipf'],
        eu_fraction_dict['nonGHGRP'].reset_index('MECS_FT'),
        on=['MECS_NAICS', 'MECS_FT'], how='left')

    for eu in enduses:
        eu_energy_dd[eu] = eu_energy_dd.MMBtu_TOTAL.mul(
            eu_energy_dd[eu], fill_value=0)

    # This throws a FutureWarning related to sorting for pandas concat,
    # but currently there's no option to address this in dd.concat.
    eu_energy_dd = dd.concat(
        [df for df in [eu_energy_dd, eu_ghgrp_matched, eu_ghgrp_notmatched]],
        axis='index', join='outer', interleave_partitions=True)

    eu_energy_dd_final = dd.melt(
        eu_energy_dd.reset_index(),
        value_vars=enduses.tolist(),
        id_vars=['MECS_NAICS', 'COUNTY_FIPS', 'Emp_Size', 'MECS_FT',
                 'MECS_Region', 'data_source', 'est_count', 'fipscty',
                 'fipstate', 'NAICS'],
        var_name='End_use', value_name='MMBtu')

    # Clean up by removing MMBtu values == 0.
    eu_energy_dd_final = eu_energy_dd_final[eu_energy_dd_final.MMBtu != 0]
    eu_energy_dd_final = eu_energy_dd_final.set_index('MECS_NAICS')

    return eu_energy_dd_final
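# Toy illustration (made-up numbers, not project data) of the
# fraction-based disaggregation performed in the loops above: each row's
# MMBtu_TOTAL is split across end-use columns by the MECS end-use
# fractions for its NAICS code and fuel type, so the end-use columns of a
# row sum back to its total.
import pandas as pd

totals = pd.Series([100.0, 50.0], name='MMBtu_TOTAL')
fractions = pd.DataFrame({'Process Heating': [0.6, 0.3],
                          'Machine Drive': [0.4, 0.7]})
disagg = fractions.mul(totals, axis=0)
assert (disagg.sum(axis=1) == totals).all()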
ssrt_metrics['NARGUID'] = ssrt_metrics['filename'].apply(
    lambda x: x.split('_')[-1].replace('.csv', ''), meta=str)
ssrt_metrics['Underlying Distribution'] = ssrt_metrics['filename'].apply(
    lambda x: '_'.join(x.split('/')[-1].split('_')[:-1]), meta=str)
ssrt_metrics = ssrt_metrics.drop('filename', axis=1)
ssrt_metrics['graded_both'] = ssrt_metrics['SSRT_w_graded']
ssrt_metrics = ssrt_metrics.rename(columns={
    'SSRT': 'standard',
    'SSRT_w_guesses': 'guesses',
    'SSRT_w_graded': 'graded_go'})

print('melting...')
melt_df = dd.melt(
    ssrt_metrics,
    id_vars=['SSD', 'Underlying Distribution'],
    value_vars=['standard', 'guesses', 'graded_go', 'graded_both'],
    var_name='Assumed Distribution',
    value_name='SSRT')

# Formal names
renaming_map = {
    'standard': 'Independent Race',
    'guesses': 'Guessing',
    'graded_go': 'Slowed Go Processing',
    'graded_both': 'Confusion',
    'ABCD data': 'ABCD Data',
}
melt_df['Assumed Distribution'] = melt_df['Assumed Distribution'].replace(
    renaming_map)
melt_df['Underlying Distribution'] = melt_df['Underlying Distribution'].replace(
    renaming_map)
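# Minimal demo (made-up values) of the dict-based replace used above:
# values without a matching key pass through unchanged, which is why the
# single renaming_map can safely serve both the 'Assumed Distribution'
# and 'Underlying Distribution' columns.
import pandas as pd

s = pd.Series(['standard', 'ABCD data', 'some_other_sim'])
print(s.replace({'standard': 'Independent Race', 'ABCD data': 'ABCD Data'}))
# 0    Independent Race
# 1           ABCD Data
# 2      some_other_sim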