Example #1
def test_melt():
    pdf = pd.DataFrame({"A": list("abcd") * 5, "B": list("XY") * 10, "C": np.random.randn(20)})
    ddf = dd.from_pandas(pdf, 4)

    list_eq(dd.melt(ddf), pd.melt(pdf))

    list_eq(dd.melt(ddf, id_vars="C"), pd.melt(pdf, id_vars="C"))
    list_eq(dd.melt(ddf, value_vars="C"), pd.melt(pdf, value_vars="C"))
    list_eq(
        dd.melt(ddf, value_vars=["A", "C"], var_name="myvar"), pd.melt(pdf, value_vars=["A", "C"], var_name="myvar")
    )
    list_eq(
        dd.melt(ddf, id_vars="B", value_vars=["A", "C"], value_name="myval"),
        pd.melt(pdf, id_vars="B", value_vars=["A", "C"], value_name="myval"),
    )
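
Here list_eq is the test suite's comparison helper, checking the dask result against the pandas reference. For a standalone impression of what these calls return, a minimal runnable sketch (assuming only numpy, pandas, and dask are installed):

import numpy as np
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"A": list("abcd") * 5,
                    "B": list("XY") * 10,
                    "C": np.random.randn(20)})
ddf = dd.from_pandas(pdf, npartitions=4)

# Both produce the long format: one row per (id, variable) pair.
print(pd.melt(pdf, id_vars="C").head())
# dd.melt is lazy; .compute() materializes the same long-format frame.
print(dd.melt(ddf, id_vars="C").compute().head())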
Example #2
    def _stats_plot(self, element, y, data=None):
        """
        Helper method to generate element from indexed dataframe.
        """
        data, x, y = self._process_args(data, None, y)

        opts = {
            'plot': dict(self._plot_opts),
            'norm': self._norm_opts,
            'style': self._style_opts
        }
        if not isinstance(y, (list, tuple)):
            ranges = {y: self._dim_ranges['y']}
            return (element(data, self.by, y)
                    .redim.range(**ranges)
                    .relabel(**self._relabel)
                    .opts(**opts))

        labelled = (['y' if self.invert else 'x']
                    if self.group_label != 'Group' else [])
        if self.value_label != 'value':
            labelled.append('x' if self.invert else 'y')
        opts['plot']['labelled'] = labelled

        kdims = [self.group_label]
        ranges = {self.value_label: self._dim_ranges['y']}
        data = data[list(y)]
        if check_library(data, 'dask'):
            from dask.dataframe import melt
        else:
            melt = pd.melt
        df = melt(data, var_name=self.group_label, value_name=self.value_label)
        return (element(df, kdims, self.value_label)
                .redim.range(**ranges)
                .redim(**self._redim)
                .relabel(**self._relabel)
                .opts(**opts))
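
Since pd.melt is written for pandas objects, the method dispatches on the backing library via the project's check_library helper before melting. A minimal stand-in for that dispatch (melt_any and the module-name test are illustrative assumptions, not the project's actual implementation):

def melt_any(data, **kwargs):
    # Assumed equivalent of check_library(data, 'dask'):
    # inspect the module the data's type comes from.
    if type(data).__module__.startswith('dask'):
        from dask.dataframe import melt  # lazy, partition-wise melt
    else:
        import pandas as pd
        melt = pd.melt  # eager pandas melt
    return melt(data, **kwargs)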
Example #3
    def _category_plot(self, element, x, y, data):
        """
        Helper method to generate element from indexed dataframe.
        """
        labelled = ['y' if self.invert else 'x'] if x != 'index' else []
        if self.value_label != 'value':
            labelled.append('x' if self.invert else 'y')

        opts = {
            'plot': dict(self._plot_opts, labelled=labelled),
            'style': dict(self._style_opts),
            'norm': self._norm_opts
        }
        ranges = {self.value_label: self._dim_ranges['y']}

        id_vars = [x]
        # Promote the index to a column if any id variable lives in it,
        # since melt drops the index.
        if any((v in list(data.index.names) + ['index']) or v == data.index.name
               for v in id_vars):
            data = data.reset_index()
        data = data[y + [x]]

        if check_library(data, 'dask'):
            from dask.dataframe import melt
        else:
            melt = pd.melt

        df = melt(data,
                  id_vars=[x],
                  var_name=self.group_label,
                  value_name=self.value_label)
        kdims = [x, self.group_label]
        return (element(df, kdims, self.value_label)
                .redim.range(**ranges)
                .redim(**self._redim)
                .relabel(**self._relabel)
                .opts(**opts))
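
The reset_index guard above exists because melt discards the index, so an id column living in the index must first be promoted to a regular column. A quick illustration on a hypothetical frame:

import pandas as pd

df = pd.DataFrame({'y1': [1, 2], 'y2': [3, 4]},
                  index=pd.Index(['r1', 'r2'], name='x'))
# Promoting the index keeps the 'x' labels available as id_vars;
# melting df directly would silently drop them.
print(pd.melt(df.reset_index(), id_vars=['x'],
              var_name='group', value_name='value'))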
Example #4
    def create_diffdiff():
        print("\ngenerating spreads vs spreads...")

        diff_diff = triu(dateless_df, True)

        ## merge the abbreviations, e.g. "XXXX (PBF) - XXXX (SSF)" to "XXXX - XXXX (PBF-SSF)"
        diff_diff.columns = [
            f'{h.split(" (")[0]}{h.split(" (")[1].split(")")[1]} ({h.split(" (")[1].split(")")[0]}-{h.split(" (")[2]}'
            for h in diff_diff.columns
        ]

        ## verbose version of the list comprehension above
        # new_headers = []
        # for header in diff_diff.columns:
        #     split_header = header.split(" (")
        #     product_A = split_header[0]
        #     split_section = split_header[1].split(")")
        #     product_B = split_section[1]
        #     new_headers.append(f"{product_A}{product_B} ({split_section[0]}-{split_header[2]}")
        # diff_diff.columns = new_headers

        diff_diff = triu(diff_diff, False)

        diff_diff = diff_diff.repartition(npartitions=200)
        diff_diff = diff_diff.reset_index(drop=True)

        dd_date_col = dd.from_array(date_col)
        dd_date_col = dd_date_col.repartition(npartitions=200)
        dd_date_col = dd_date_col.reset_index(drop=True)

        diff_diff = diff_diff.assign(date=dd_date_col)

        diff_diff = dd.melt(
            diff_diff,
            id_vars="date",
            var_name="product_diff",
            value_name="price_diff").dropna().reset_index(drop=True)

        diff_diff["product_diff"] = diff_diff["product_diff"].astype(
            "category")

        diff_diff["differential_A"] = diff_diff["product_diff"].str.partition(
            " - ")[0]
        diff_diff["differential_B"] = diff_diff["product_diff"].str.partition(
            " - ")[2]

        print(f"\nsaving file... ({round((time.time() - starttime), 2)}s)")
        dd.to_csv(df=diff_diff,
                  filename=os.path.join(os.getcwd(), "cleaned_data",
                                        "diff_diff.csv"),
                  index=False,
                  single_file=True,
                  encoding="utf-8-sig",
                  chunksize=10000)
        print(
            f"[diff_diff.csv] saved successfully... ({round((time.time() - starttime), 2)}s)"
        )
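
The header-merging comprehension is dense; traced on a single hypothetical header (product names and abbreviations are made up), it performs this transformation:

h = "Alpha (PBF) - Beta (SSF)"
parts = h.split(" (")                  # ["Alpha", "PBF) - Beta", "SSF)"]
prod_a = parts[0]                      # "Alpha"
abbr_a, prod_b = parts[1].split(")")   # "PBF", " - Beta"
merged = f"{prod_a}{prod_b} ({abbr_a}-{parts[2]}"
print(merged)                          # Alpha - Beta (PBF-SSF)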
Example #5
    def calc_enduse(self, eu_fraction_dict, county_energy_dd, temps=False):
        """

        Returns Dask DataFrame
        """
        unitname_eu_dict = {
            'Process Heating': [
                'furnace', 'kiln', 'dryer', 'heater', 'oven', 'calciner',
                'stove', 'htr', 'furn', 'cupola'
            ],
            'Conventional Boiler Use': ['boiler'],
            'CHP and/or Cogeneration Process': ['turbine'],
            'Facility HVAC': ['building heat', 'space heater'],
            'Machine Drive': ['engine', 'compressor', 'pump', 'rice'],
            'Conventional Electricity Generation': ['generator'],
            'Other Nonprocess Use': [
                'hot water', 'crane', 'water heater', 'comfort heater', 'RTO',
                'TODF', 'oxidizer', 'RCO'
            ]
        }

        unittype_eu_dict = {
            'Process Heating': [
                'F', 'PD', 'K', 'PRH', 'O', 'NGLH', 'CF', 'HMH', 'C', 'HPPU',
                'CatH', 'COB', 'FeFL', 'Chemical Recovery Furnace', 'IFCE',
                'Pulp Mill Lime Kiln', 'Lime Kiln',
                'Chemical Recovery Combustion Unit',
                'Direct Reduction Furnace', 'Sulfur Recovery Plant'
            ],
            'Conventional Boiler Use': [
                'OB', 'S', 'PCWW', 'BFB', 'PCWD', 'PCT', 'CFB', 'PCO', 'OFB',
                'PFB'
            ],
            'CHP and/or Cogeneration Process': ['CCCT', 'SCCT'],
            'Facility HVAC': ['CH'],
            'Other Nonprocess Use':
            ['HWH', 'TODF', 'ICI', 'FLR', 'RTO', 'II', 'MWC', 'Flare', 'RCO'],
            'Conventional Electricity Generation':
            ['RICE', 'Electricity Generator']
        }

        def eu_dict_to_df(eu_dict):
            """
            Convert unit type/unit name dictionaries to dataframes.
            """
            eu_df = pd.DataFrame.from_dict(eu_dict,
                                           orient='index').reset_index()

            eu_df = pd.melt(eu_df, id_vars='index',
                            value_name='unit').rename(columns={
                                'index': 'end_use'
                            }).drop('variable', axis=1)

            eu_df = eu_df.dropna().set_index('unit')

            return eu_df

        def eu_unit_type(unit_type, unittype_eu_df):
            """
            Match GHGRP unit type to end use specified in unittype_eu_dict.
            """

            enduse = re.match(r'(\w+) \(', unit_type)

            if enduse is not None:

                enduse = re.match(r'(\w+)', enduse.group())[0]

                if enduse in unittype_eu_df.index:

                    enduse = unittype_eu_df.loc[enduse, 'end_use']

                else:

                    enduse = np.nan

            else:

                if unit_type in unittype_eu_df.index:

                    enduse = unittype_eu_df.loc[unit_type, 'end_use']

            return enduse

        def eu_unit_name(unit_name, unitname_eu_df):
            """
            Find keywords in GHGRP unit name descriptions and match them
            to appropriate end uses based on unitname_eu_dict.
            """

            for i in unitname_eu_df.index:

                enduse = re.search(i, unit_name.lower())

                if enduse is None:

                    continue

                else:

                    enduse = unitname_eu_df.loc[i, 'end_use']

                    return enduse

            enduse = np.nan

            return enduse

        unittype_eu_df = eu_dict_to_df(unittype_eu_dict)

        unitname_eu_df = eu_dict_to_df(unitname_eu_dict)

        # Base ghgrp energy end use disaggregation on reported unit type and
        # unit name.
        eu_ghgrp = self.energy_eia923.copy(deep=True)

        # First match end uses to provided unit types. Most unit types are
        # specified as OCS (other combustion source).
        unit_types = eu_ghgrp.UNIT_TYPE.dropna().unique()

        type_match = list()

        for utype in unit_types:

            enduse = eu_unit_type(utype, unittype_eu_df)

            type_match.append([utype, enduse])

        type_match = pd.DataFrame(type_match, columns=['UNIT_TYPE', 'end_use'])

        eu_ghgrp = pd.merge(eu_ghgrp, type_match, on='UNIT_TYPE', how='left')

        # Next, match end use by unit name for facilities that report OCS
        # as the unit type.
        eu_ocs = eu_ghgrp[
            (eu_ghgrp.UNIT_TYPE == 'OCS (Other combustion source)') |
            (eu_ghgrp.UNIT_TYPE.isnull())][['UNIT_TYPE', 'UNIT_NAME']]

        eu_ocs['end_use'] = eu_ocs.UNIT_NAME.apply(
            lambda x: eu_unit_name(x, unitname_eu_df))

        eu_ghgrp.end_use.update(eu_ocs.end_use)

        eu_ghgrp.drop(eu_ghgrp.columns.difference(
            set([
                'COUNTY_FIPS', 'MECS_Region', 'MMBtu_TOTAL', 'MECS_FT',
                'PRIMARY_NAICS_CODE', 'MECS_NAICS', 'end_use', 'FACILITY_ID'
            ])),
                      axis=1,
                      inplace=True)

        # sum energy of unit types and unit names matched to an end use
        eu_ghgrp_matched = eu_ghgrp[eu_ghgrp.end_use.notnull()].pivot_table(
            values='MMBtu_TOTAL',
            columns='end_use',
            index=[
                'MECS_Region', 'COUNTY_FIPS', 'PRIMARY_NAICS_CODE',
                'MECS_NAICS', 'MECS_FT'
            ],
            aggfunc='sum',
            fill_value=0)

        eu_ghgrp_matched = eu_ghgrp_matched.join(
            eu_ghgrp.pivot_table(values='FACILITY_ID',
                                 index=[
                                     'MECS_Region', 'COUNTY_FIPS',
                                     'PRIMARY_NAICS_CODE', 'MECS_NAICS',
                                     'MECS_FT'
                                 ],
                                 aggfunc='count'))

        # Calculate the remaining GHGRP facilities energy use
        # with MECS data.
        eu_ghgrp_notmatched = \
            eu_ghgrp[(eu_ghgrp.end_use.isnull()) &
                     (eu_ghgrp.MECS_FT.notnull())].copy(deep=True)

        enduses = eu_fraction_dict['GHGRP'].columns.values

        eu_ghgrp_notmatched = pd.merge(eu_ghgrp_notmatched.set_index(
            ['MECS_NAICS', 'MECS_FT']),
                                       eu_fraction_dict['GHGRP'],
                                       left_index=True,
                                       right_index=True,
                                       how='left')

        for eu in enduses:

            eu_ghgrp_notmatched[eu] = \
                eu_ghgrp_notmatched.MMBtu_TOTAL.multiply(
                        eu_ghgrp_notmatched[eu], fill_value=0
                        )

        agg_cols = [x for x in itools.product(enduses, ['sum'])]

        agg_cols.append(('FACILITY_ID', 'count'))

        eu_ghgrp_notmatched = eu_ghgrp_notmatched.reset_index().groupby(
            [
                'MECS_Region',
                'COUNTY_FIPS',
                'PRIMARY_NAICS_CODE',
                'MECS_NAICS',
                'MECS_FT',
            ],
            as_index=False).agg(dict(agg_cols))

        eu_ghgrp_notmatched.set_index('MECS_NAICS', inplace=True)

        eu_ghgrp_matched.reset_index(
            ['MECS_Region', 'COUNTY_FIPS', 'PRIMARY_NAICS_CODE', 'MECS_FT'],
            inplace=True)

        for df in [eu_ghgrp_matched, eu_ghgrp_notmatched]:

            df.rename(columns={
                'PRIMARY_NAICS_CODE': 'NAICS',
                'FACILITY_ID': 'est_count'
            },
                      inplace=True)

            df['data_source'] = 'ghgrp'

        # Calculate end use of energy estimated from MECS data with MECS end
        # use.
        enduses = eu_fraction_dict['nonGHGRP'].columns.values

        eu_energy_dd = dd.merge(
            county_energy_dd[county_energy_dd.data_source == 'mecs_ipf'],
            eu_fraction_dict['nonGHGRP'].reset_index('MECS_FT'),
            on=['MECS_NAICS', 'MECS_FT'],
            how='left')

        for eu in enduses:

            eu_energy_dd[eu] = \
                eu_energy_dd.MMBtu_TOTAL.mul(eu_energy_dd[eu],
                                             fill_value=0)

        # This raises a FutureWarning about sorting from the underlying
        # pandas concat; dd.concat currently has no option to suppress it.
        eu_energy_dd = dd.concat([
            df for df in [eu_energy_dd, eu_ghgrp_matched, eu_ghgrp_notmatched]
        ],
                                 axis='index',
                                 join='outer',
                                 interleave_partitions=True)

        eu_energy_dd_final = dd.melt(eu_energy_dd.reset_index(),
                                     value_vars=enduses.tolist(),
                                     id_vars=[
                                         'MECS_NAICS', 'COUNTY_FIPS',
                                         'Emp_Size', 'MECS_FT', 'MECS_Region',
                                         'data_source', 'est_count', 'fipscty',
                                         'fipstate', 'NAICS'
                                     ],
                                     var_name='End_use',
                                     value_name='MMBtu')

        # Clean up by removing rows where MMBtu == 0.
        eu_energy_dd_final = \
            eu_energy_dd_final[eu_energy_dd_final.MMBtu != 0]

        eu_energy_dd_final = eu_energy_dd_final.set_index('MECS_NAICS')

        return eu_energy_dd_final
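
The eu_dict_to_df helper does the heavy lifting for the keyword matching: melt turns the wide dictionary layout into a long unit-to-end-use lookup. Traced on a trimmed-down dict (entries taken from the dictionaries above):

import pandas as pd

eu_dict = {'Process Heating': ['furnace', 'kiln'],
           'Conventional Boiler Use': ['boiler']}
eu_df = pd.DataFrame.from_dict(eu_dict, orient='index').reset_index()
# melt yields one row per (end_use, keyword) pair; NaN pads are dropped.
eu_df = (pd.melt(eu_df, id_vars='index', value_name='unit')
           .rename(columns={'index': 'end_use'})
           .drop('variable', axis=1)
           .dropna()
           .set_index('unit'))
print(eu_df.loc['boiler', 'end_use'])  # Conventional Boiler Use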
Example #6
    ssrt_metrics['NARGUID'] = ssrt_metrics['filename'].apply(
        lambda x: x.split('_')[-1].replace('.csv', ''), meta=str)
    ssrt_metrics['Underlying Distribution'] = ssrt_metrics['filename'].apply(
        lambda x: '_'.join(x.split('/')[-1].split('_')[:-1]), meta=str)
    ssrt_metrics = ssrt_metrics.drop('filename', axis=1)
    ssrt_metrics['graded_both'] = ssrt_metrics['SSRT_w_graded']
    ssrt_metrics = ssrt_metrics.rename(
        columns={
            'SSRT': 'standard',
            'SSRT_w_guesses': 'guesses',
            'SSRT_w_graded': 'graded_go'
        })
    print('melting...')
    melt_df = dd.melt(
        ssrt_metrics,
        id_vars=['SSD', 'Underlying Distribution'],
        value_vars=['standard', 'guesses', 'graded_go', 'graded_both'],
        var_name='Assumed Distribution',
        value_name='SSRT')
    # Map short labels to their formal names.
    renaming_map = {
        'standard': 'Independent Race',
        'guesses': 'Guessing',
        'graded_go': 'Slowed Go Processing',
        'graded_both': 'Confusion',
        'ABCD data': 'ABCD Data'
    }
    melt_df["Assumed Distribution"] = melt_df["Assumed Distribution"].replace(
        renaming_map)
    melt_df["Underlying Distribution"] = melt_df[
        "Underlying Distribution"].replace(renaming_map)