Exemplo n.º 1
0
    def transform(self, X) -> pd.DataFrame:
        """

        """
        for var in self.variables:
            X[var] = X[var].astype(
                pandas_types.CategoricalDtype(categories=self.categories[var]))

        X = pd.get_dummies(X)

        return X
Exemplo n.º 2
0
def add_margins(df, vars, margins=True):
    """
    Add margins to a data frame.

    All margining variables will be converted to factors.

    Parameters
    ----------
    df : dataframe
        input data frame

    vars : list
        a list of 2 lists | tuples vectors giving the
        variables in each dimension

    margins : bool | list
        variable names to compute margins for.
        True will compute all possible margins.
    """
    margin_vars = _margins(vars, margins)
    if not margin_vars:
        return df

    # create margin dataframes
    margin_dfs = [df]
    for vlst in margin_vars[1:]:
        dfx = df.copy()
        for v in vlst:
            dfx.loc[0:, v] = '(all)'
        margin_dfs.append(dfx)

    merged = pd.concat(margin_dfs, axis=0)
    merged.reset_index(drop=True, inplace=True)

    # All margin columns become categoricals. The margin indicator
    # (all) needs to be added as the last level of the categories.
    categories = {}
    for v in itertools.chain(*vars):
        col = df[v]
        if not pdtypes.is_categorical_dtype(df[v].dtype):
            col = pd.Categorical(df[v])
        categories[v] = col.categories
        if '(all)' not in categories[v]:
            categories[v] = categories[v].insert(
                len(categories[v]), '(all)')

    for v in merged.columns.intersection(set(categories)):
        merged[v] = merged[v].astype(
           pdtypes.CategoricalDtype(categories[v]))

    return merged
Exemplo n.º 3
0
    def ordinal_to_numeric(self,
                           df: pd.DataFrame,
                           column: str,
                           categories: pd.Series,
                           create_column: bool = False):
        column_categories = pdt.CategoricalDtype(categories=categories,
                                                 ordered=True)

        df[column] = df[column].astype(column_categories)

        if create_column:
            df[column + "_numeric"] = df[column].cat.codes + 1
        else:
            df[column] = df[column].cat.codes + 1

        return df
Exemplo n.º 4
0
def load_train_dataset(variables_family: str = None,
                       return_dataframe: bool = False,
                       excluded_variables: list = list()):
    path_train = "{}/datasets/data_train.csv".format(
        os.path.dirname(os.path.abspath(__file__)))
    df = pd.read_csv(path_train)

    quantitatives = get_quantitatives(variables_family=variables_family,
                                      excluded_variables=excluded_variables)
    categorical = "class"
    df[categorical] = df[categorical].astype(
        types.CategoricalDtype(ordered=False))

    if return_dataframe:
        quantitatives.add(categorical)
        return df[quantitatives]

    return df[quantitatives].values, df[categorical].values, quantitatives
Exemplo n.º 5
0
    def _calculate_agg_features(self, features, frame, df_trie):
        test_feature = features[0]
        child_entity = test_feature.base_features[0].entity

        base_frame = df_trie.get_node(test_feature.relationship_path).value
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        features = [f for f in features if f.get_name() not in frame.columns]
        if not len(features):
            return frame

        # handle where
        where = test_feature.where
        if where is not None and not base_frame.empty:
            base_frame = base_frame.loc[base_frame[where.get_name()]]

        # when no child data, just add all the features to frame with nan
        if base_frame.empty:
            for f in features:
                frame[f.get_name()] = np.nan
        else:
            relationship_path = test_feature.relationship_path

            groupby_var = get_relationship_variable_id(relationship_path)

            # if the use_previous property exists on this feature, include only the
            # instances from the child entity included in that Timedelta
            use_previous = test_feature.use_previous
            if use_previous and not base_frame.empty:
                # Filter by use_previous values
                time_last = self.time_last
                if use_previous.is_absolute():
                    time_first = time_last - use_previous
                    ti = child_entity.time_index
                    if ti is not None:
                        base_frame = base_frame[base_frame[ti] >= time_first]
                else:
                    n = use_previous.value

                    def last_n(df):
                        return df.iloc[-n:]

                    base_frame = base_frame.groupby(groupby_var,
                                                    observed=True,
                                                    sort=False).apply(last_n)

            to_agg = {}
            agg_rename = {}
            to_apply = set()
            # apply multivariable and time-dependent features as we find them, and
            # save aggregable features for later
            for f in features:
                if _can_agg(f):
                    variable_id = f.base_features[0].get_name()

                    if variable_id not in to_agg:
                        to_agg[variable_id] = []

                    func = f.get_function()

                    # for some reason, using the string count is significantly
                    # faster than any method a primitive can return
                    # https://stackoverflow.com/questions/55731149/use-a-function-instead-of-string-in-pandas-groupby-agg
                    if is_python_2() and func == pd.Series.count.__func__:
                        func = "count"
                    elif func == pd.Series.count:
                        func = "count"

                    funcname = func
                    if callable(func):
                        # if the same function is being applied to the same
                        # variable twice, wrap it in a partial to avoid
                        # duplicate functions
                        funcname = str(id(func))
                        if u"{}-{}".format(variable_id,
                                           funcname) in agg_rename:
                            func = partial(func)
                            funcname = str(id(func))

                        func.__name__ = funcname

                    to_agg[variable_id].append(func)
                    # this is used below to rename columns that pandas names for us
                    agg_rename[u"{}-{}".format(variable_id,
                                               funcname)] = f.get_name()
                    continue

                to_apply.add(f)

            # Apply the non-aggregable functions generate a new dataframe, and merge
            # it with the existing one
            if len(to_apply):
                wrap = agg_wrapper(to_apply, self.time_last)
                # groupby_var can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                to_merge = base_frame.groupby(base_frame[groupby_var],
                                              observed=True,
                                              sort=False).apply(wrap)
                frame = pd.merge(left=frame,
                                 right=to_merge,
                                 left_index=True,
                                 right_index=True,
                                 how='left')

            # Apply the aggregate functions to generate a new dataframe, and merge
            # it with the existing one
            if len(to_agg):
                # groupby_var can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                to_merge = base_frame.groupby(base_frame[groupby_var],
                                              observed=True,
                                              sort=False).agg(to_agg)
                # rename columns to the correct feature names
                to_merge.columns = [
                    agg_rename["-".join(x)] for x in to_merge.columns.ravel()
                ]
                to_merge = to_merge[list(agg_rename.values())]

                # workaround for pandas bug where categories are in the wrong order
                # see: https://github.com/pandas-dev/pandas/issues/22501
                if pdtypes.is_categorical_dtype(frame.index):
                    categories = pdtypes.CategoricalDtype(
                        categories=frame.index.categories)
                    to_merge.index = to_merge.index.astype(object).astype(
                        categories)

                frame = pd.merge(left=frame,
                                 right=to_merge,
                                 left_index=True,
                                 right_index=True,
                                 how='left')

        # Handle default values
        fillna_dict = {}
        for f in features:
            feature_defaults = {
                name: f.default_value
                for name in f.get_feature_names()
            }
            fillna_dict.update(feature_defaults)

        frame.fillna(fillna_dict, inplace=True)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (f.number_output_features == 1
                    and f.variable_type == variable_types.Numeric
                    and frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        return frame
Exemplo n.º 6
0
    def _calculate_agg_features(self, features, entity_frames):
        test_feature = features[0]
        entity = test_feature.entity
        child_entity = test_feature.base_features[0].entity

        assert entity.id in entity_frames and child_entity.id in entity_frames

        frame = entity_frames[entity.id]
        base_frame = entity_frames[child_entity.id]
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        features = [f for f in features if f.get_name() not in frame.columns]
        if not len(features):
            return frame

        # handle where clause for all functions below
        where = test_feature.where
        if where is not None:
            base_frame = base_frame[base_frame[where.get_name()]]

        relationship_path = self.entityset.find_backward_path(
            entity.id, child_entity.id)

        groupby_var = Relationship._get_link_variable_name(relationship_path)

        # if the use_previous property exists on this feature, include only the
        # instances from the child entity included in that Timedelta
        use_previous = test_feature.use_previous
        if use_previous and not base_frame.empty:
            # Filter by use_previous values
            time_last = self.time_last
            if use_previous.is_absolute():
                time_first = time_last - use_previous
                ti = child_entity.time_index
                if ti is not None:
                    base_frame = base_frame[base_frame[ti] >= time_first]
            else:
                n = use_previous.value

                def last_n(df):
                    return df.iloc[-n:]

                base_frame = base_frame.groupby(groupby_var,
                                                observed=True,
                                                sort=False).apply(last_n)

        to_agg = {}
        agg_rename = {}
        to_apply = set()
        # apply multivariable and time-dependent features as we find them, and
        # save aggregable features for later
        for f in features:
            if _can_agg(f):
                variable_id = f.base_features[0].get_name()

                if variable_id not in to_agg:
                    to_agg[variable_id] = []

                func = f.get_function()
                funcname = func
                if callable(func):
                    funcname = func.__name__

                to_agg[variable_id].append(func)
                # this is used below to rename columns that pandas names for us
                agg_rename[u"{}-{}".format(variable_id,
                                           funcname)] = f.get_name()
                continue

            to_apply.add(f)

        # Apply the non-aggregable functions generate a new dataframe, and merge
        # it with the existing one
        if len(to_apply):
            wrap = agg_wrapper(to_apply, self.time_last)
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True,
                                          sort=False).apply(wrap)

            to_merge.reset_index(1, drop=True, inplace=True)
            frame = pd.merge(left=frame,
                             right=to_merge,
                             left_index=True,
                             right_index=True,
                             how='left')

        # Apply the aggregate functions to generate a new dataframe, and merge
        # it with the existing one
        if len(to_agg):
            # groupby_var can be both the name of the index and a column,
            # to silence pandas warning about ambiguity we explicitly pass
            # the column (in actuality grouping by both index and group would
            # work)
            to_merge = base_frame.groupby(base_frame[groupby_var],
                                          observed=True,
                                          sort=False).agg(to_agg)
            # rename columns to the correct feature names
            to_merge.columns = [
                agg_rename["-".join(x)] for x in to_merge.columns.ravel()
            ]
            to_merge = to_merge[list(agg_rename.values())]

            # workaround for pandas bug where categories are in the wrong order
            # see: https://github.com/pandas-dev/pandas/issues/22501
            if pdtypes.is_categorical_dtype(frame.index):
                categories = pdtypes.CategoricalDtype(
                    categories=frame.index.categories)
                to_merge.index = to_merge.index.astype(object).astype(
                    categories)

            frame = pd.merge(left=frame,
                             right=to_merge,
                             left_index=True,
                             right_index=True,
                             how='left')

        # Handle default values
        # 1. handle non scalar default values
        iterfeats = [
            f for f in features if hasattr(f.default_value, '__iter__')
        ]
        for f in iterfeats:
            nulls = pd.isnull(frame[f.get_name()])
            for ni in nulls[nulls].index:
                frame.at[ni, f.get_name()] = f.default_value

        # 2. handle scalars default values
        fillna_dict = {
            f.get_name(): f.default_value
            for f in features if f not in iterfeats
        }
        frame.fillna(fillna_dict, inplace=True)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (not f.expanding and f.variable_type == variable_types.Numeric
                    and frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        return frame
    def _calculate_agg_features(self, features, frame, df_trie,
                                progress_callback):
        test_feature = features[0]
        child_dataframe = test_feature.base_features[0].dataframe
        base_frame = df_trie.get_node(test_feature.relationship_path).value
        parent_merge_col = test_feature.relationship_path[0][
            1]._parent_column_name
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        fl = []
        for f in features:
            for ind in f.get_feature_names():
                if ind not in frame.columns:
                    fl.append(f)
                    break
        features = fl
        if not len(features):
            progress_callback(len(features) / float(self.num_features))
            return frame

        # handle where
        base_frame_empty = (base_frame.empty if isinstance(
            base_frame, pd.DataFrame) else False)
        where = test_feature.where
        if where is not None and not base_frame_empty:
            base_frame = base_frame.loc[base_frame[where.get_name()]]

        # when no child data, just add all the features to frame with nan
        base_frame_empty = (base_frame.empty if isinstance(
            base_frame, pd.DataFrame) else False)
        if base_frame_empty:
            feature_values = []
            for f in features:
                feature_values.append(
                    (f, np.full(f.number_output_features, np.nan)))
                progress_callback(1 / float(self.num_features))
            frame = update_feature_columns(feature_values, frame)
        else:
            relationship_path = test_feature.relationship_path

            groupby_col = get_relationship_column_id(relationship_path)

            # if the use_previous property exists on this feature, include only the
            # instances from the child dataframe included in that Timedelta
            use_previous = test_feature.use_previous
            if use_previous:
                # Filter by use_previous values
                time_last = self.time_last
                if use_previous.has_no_observations():
                    time_first = time_last - use_previous
                    ti = child_dataframe.ww.time_index
                    if ti is not None:
                        base_frame = base_frame[base_frame[ti] >= time_first]
                else:
                    n = use_previous.get_value("o")

                    def last_n(df):
                        return df.iloc[-n:]

                    base_frame = base_frame.groupby(groupby_col,
                                                    observed=True,
                                                    sort=False).apply(last_n)

            to_agg = {}
            agg_rename = {}
            to_apply = set()
            # apply multi-column and time-dependent features as we find them, and
            # save aggregable features for later
            for f in features:
                if _can_agg(f):

                    column_id = f.base_features[0].get_name()
                    if column_id not in to_agg:
                        to_agg[column_id] = []
                    if isinstance(base_frame, dd.DataFrame):
                        func = f.get_function(agg_type=Library.DASK)
                    elif is_instance(base_frame, ps, "DataFrame"):
                        func = f.get_function(agg_type=Library.SPARK)
                    else:
                        func = f.get_function()

                    # for some reason, using the string count is significantly
                    # faster than any method a primitive can return
                    # https://stackoverflow.com/questions/55731149/use-a-function-instead-of-string-in-pandas-groupby-agg
                    if func == pd.Series.count:
                        func = "count"

                    funcname = func
                    if callable(func):
                        # if the same function is being applied to the same
                        # column twice, wrap it in a partial to avoid
                        # duplicate functions
                        funcname = str(id(func))
                        if "{}-{}".format(column_id, funcname) in agg_rename:
                            func = partial(func)
                            funcname = str(id(func))

                        func.__name__ = funcname

                    if isinstance(func, dd.Aggregation):
                        # TODO: handle aggregation being applied to same column twice
                        # (see above partial wrapping of functions)
                        funcname = func.__name__

                    to_agg[column_id].append(func)
                    # this is used below to rename columns that pandas names for us
                    agg_rename["{}-{}".format(column_id,
                                              funcname)] = f.get_name()
                    continue

                to_apply.add(f)

            # Apply the non-aggregable functions generate a new dataframe, and merge
            # it with the existing one
            if len(to_apply):
                wrap = agg_wrapper(to_apply, self.time_last)
                # groupby_col can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                to_merge = base_frame.groupby(base_frame[groupby_col],
                                              observed=True,
                                              sort=False).apply(wrap)
                frame = pd.merge(
                    left=frame,
                    right=to_merge,
                    left_index=True,
                    right_index=True,
                    how="left",
                )

                progress_callback(len(to_apply) / float(self.num_features))

            # Apply the aggregate functions to generate a new dataframe, and merge
            # it with the existing one
            if len(to_agg):
                # groupby_col can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                if is_instance(base_frame, (dd, ps), "DataFrame"):
                    to_merge = base_frame.groupby(groupby_col).agg(to_agg)

                else:
                    to_merge = base_frame.groupby(base_frame[groupby_col],
                                                  observed=True,
                                                  sort=False).agg(to_agg)
                # rename columns to the correct feature names
                to_merge.columns = [
                    agg_rename["-".join(x)] for x in to_merge.columns
                ]
                to_merge = to_merge[list(agg_rename.values())]

                # Workaround for pandas bug where categories are in the wrong order
                # see: https://github.com/pandas-dev/pandas/issues/22501
                #
                # Pandas claims that bug is fixed but it still shows up in some
                # cases.  More investigation needed.
                if pdtypes.is_categorical_dtype(frame.index):
                    categories = pdtypes.CategoricalDtype(
                        categories=frame.index.categories)
                    to_merge.index = to_merge.index.astype(object).astype(
                        categories)

                if is_instance(frame, (dd, ps), "DataFrame"):
                    frame = frame.merge(to_merge,
                                        left_on=parent_merge_col,
                                        right_index=True,
                                        how="left")
                else:
                    frame = pd.merge(
                        left=frame,
                        right=to_merge,
                        left_index=True,
                        right_index=True,
                        how="left",
                    )

                # determine number of features that were just merged
                progress_callback(
                    len(to_merge.columns) / float(self.num_features))

        # Handle default values
        fillna_dict = {}
        for f in features:
            feature_defaults = {
                name: f.default_value
                for name in f.get_feature_names()
            }
            fillna_dict.update(feature_defaults)

        frame = frame.fillna(fillna_dict)

        return frame
    for element in combination:
        current_key = dict_translate[element]
        element_to_add = SUPERCLUSTERS[current_key]
        if element_to_add != "NA":
            new_combination += [element_to_add]
    return new_combination


df['combination_classes'] = df['combination'].apply(
    combination_to_combi_of_classes)

## Compute entropy
df['entropy'] = df['combination_classes'].apply(entropy)
# Round to 2 decimals and make a category
df['entropy'] = df['entropy'].apply(lambda x: round(x, 2)).astype(
    pdtypes.CategoricalDtype())

# Using only combis of each individual length

all_lengths = sorted(set(df['combi_length']))

# No point in going beyond, roughly, 12
all_lengths = [l for l in all_lengths if l <= 12]

for length in all_lengths:

    df_filtered = df.loc[df['combi_length'] == length, :]

    try:
        p = (ggplot(data=df_filtered, mapping=aes(x='entropy', y='fc')) +
             geom_violin(position=position_dodge(1), width=1) +
Exemplo n.º 9
0
def read_survey_2013_all_participants_table(con: sqlalchemy.engine.Connectable,
                                            schema: str) -> pd.DataFrame:
    """
    Read 2013 survey data from a database into a `pandas.DataFrame`.

    Parameters
    ----------
    con : `sqlalchemy.engine.Connectable`
        An object which supports execution of SQL constructs. Currently there
        are two implementations: `sqlalchemy.engine.Connection` and
        `sqlalchemy.engine.Engine`.
    schema : `str`
        Name of schema containing the `survey_2013_all_participants` table/view.

    Returns
    -------
    df: `pandas.DataFrame`

        2013 survey data for all participants.

    """
    df = pd.read_sql_table("survey_2013_all_participants", con, schema)

    # these variables are encoded as True/False but could also be 1/0.
    # merge the two foundation columns into single categorical columns
    dtype = types.CategoricalDtype(
        categories=["Pier and beam", "Slab", "Both"], ordered=False)
    df["foundation"] = (df.apply(_merge_foundation_columns,
                                 axis=1).astype(dtype))
    df.drop(["foundation_pier_beam", "foundation_slab"], axis=1, inplace=True)

    for column in df:
        if column == "primary_residence":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column == "number_floors":
            dtype = types.CategoricalDtype(categories=[1, 2, 3, 4],
                                           ordered=True)
            strs_to_ints = {"One": 1, "Two": 2, "Three": 3, "Four": 4}
            df[column] = (df[column].replace(strs_to_ints).astype(dtype))
        elif column == "year_moved_into_house":
            categories = range(1958, 2018)
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = (df[column].apply(
                lambda v: int(v) if v is not None else v).astype(dtype))
        elif column == "month_moved_into_house":
            categories = [
                "January", "February", "March", "April", "May", "June", "July",
                "August", "September", "October", "November", "December"
            ]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = df[column].astype(dtype)
        elif column == "year_house_constructed":
            categories = range(1930, 2018)
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = (df[column].replace({
                "1930 or earlier": "1930"
            }).apply(lambda v: int(v) if v is not None else v).astype(dtype))
        elif column == "house_num_rooms":
            categories = range(1, 17)
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = (df[column].apply(
                lambda v: int(v) if v is not None else v).astype(dtype))
        elif column == "house_ceiling_height":
            df.drop(column, axis=1,
                    inplace=True)  # needs substantial cleaning!
        elif column == "house_square_feet":
            df[column] = df[column].astype("float64")
        elif column.startswith("spend_time_at_home_"):
            df[column] = df[column].notnull()
        elif column.startswith("ethnicity_"):
            df[column] = df[column].notnull()
        elif column.startswith("sex_"):
            dtype = types.CategoricalDtype(categories=[0, 1, 2, 3, 4, 5],
                                           ordered=True)
            strs_with_ints = {
                None: 0,
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                "5 or more": 5
            }
            df[column] = (df[column].replace(strs_with_ints).astype(dtype))
        elif column.startswith("residents_"):
            dtype = types.CategoricalDtype(categories=[0, 1, 2, 3, 4, 5],
                                           ordered=True)
            strs_with_ints = {
                None: 0,
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                "5 or more": 5
            }
            df[column] = (df[column].replace(strs_with_ints).astype(dtype))
        elif column == "education_level":
            categories = [
                "High School graduate", "Some college/trade/vocational school",
                "College graduate", "Postgraduate degree"
            ]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = df[column].astype(dtype)
        elif column == "total_annual_income":
            categories = [
                "Less than $10,000", "$10,000 - $19,999", "$20,000 - $34,999",
                "$35,000 - $49,999", "$50,000 - $74,999", "$75,000 - $99,999",
                "$100,000 - $149,999", "$150,000 - $299,000",
                "$300,000 - $1,000,000", "more than $1,000,000"
            ]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = (df[column].str.replace('-', ',').str.replace(
                " , ", " - ").astype(dtype))
        elif column == "smartphone_own":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column == "tablet_own":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column == "pv_system_own":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column == "pv_system_size":
            specific_replacements = {
                "5kw I think": 5.0,
                "8060 Watts": 8.060,
                "20W (solar powered attic fan)": 0.020
            }
            str_to_numeric = pd.to_numeric(df.pv_system_size.str.replace(
                ' ',
                '').str.replace("kw", '',
                                case=False).replace(specific_replacements),
                                           errors="coerce")
            df[column] = str_to_numeric.apply(lambda v: v / 1e3
                                              if v > 1e3 else v)
        elif column == "electricity_used_monthly":
            pass  # TODO column needs significant cleaning!
        elif column == "gas_used_monthly":
            pass  # TODO column needs significant cleaning!
        elif column == "retrofits":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column == "retrofits_detail":
            df.drop(column, axis=1, inplace=True)
        elif column == "retrofits_reason":
            df[column] = df[column].replace({
                "Yes": True,
                "No": False,
                "N/A": None
            })
        elif column.startswith("appliance_"):
            categories = [
                'rarely', 'once or twice a month', 'once or twice a week',
                'several times a week', 'daily basis'
            ]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = (df[column].replace({"N/A": None}).astype(dtype))
        elif column == "irrigation_system":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column == "cooking_weekdays_times":
            categories = ["None", "Less than 1 hour", "An hour or more"]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = df[column].astype(dtype)
        elif column == "cooking_weekends_times":
            categories = ["None", "Less than 1 hour", "An hour or more"]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = df[column].astype(dtype)
        elif column.startswith("cooking_"):
            df[column] = df[column].notnull()
        elif column.startswith("blinds_"):
            categories = ['Rarely or never', 'Some days', 'Most days']
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = df[column].astype(dtype)
        elif column == "thermostat_settings":
            categories = [
                "We are generally in agreement",
                "Our preferences vary 1-2 degrees Fahrenheit",
                "Our preferences vary 3-5 degrees Fahrenheit",
                "Our preferences vary more than 5 degrees Fahrenheit"
            ]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = df[column].astype(dtype)
        elif column == "tv_hours":
            categories = range(0, 12)
            dtype = types.CategoricalDtype(categories, ordered=True)
            strs_with_ints = {
                None: 0,
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                '5': 5,
                '6': 6,
                '7': 7,
                '8': 8,
                '9': 9,
                "10": 10,
                "10 or more": 11
            }
            df[column] = (df[column].replace(strs_with_ints).astype(dtype))
        elif column == "care_energy_cost":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column == "reduce_energy_cost":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column == "reduce_energy_yes":
            df.drop(column, axis=1, inplace=True
                    )  # TODO substantial work required to make this useful!
        elif column == "modify_routines":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column.endswith("_brand") or column.endswith("_models"):
            df.drop(column, axis=1, inplace=True)
        elif column.startswith("hvac_"):
            df[column] = df[column].notnull()
        elif column.startswith("compressor1_"):
            df.drop(column, axis=1, inplace=True)
        elif column.startswith("compressor2_"):
            df.drop(column, axis=1, inplace=True)
        elif column.startswith("compressor3_"):
            df.drop(column, axis=1, inplace=True)
        elif column.startswith("air_handler1_"):
            df.drop(column, axis=1, inplace=True)
        elif column.startswith("air_handler2_"):
            df.drop(column, axis=1, inplace=True)
        elif column.startswith("heating_"):
            df[column] = df[column].notnull()
        elif column.startswith("temp_"):
            df[column] = pd.to_numeric(df[column], errors="coerce")
        elif column == "pets":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column == "programmable_thermostat_currently_programmed":
            df[column] = df[column].replace({
                "Yes": True,
                "No": False,
                "I dont know": None
            })
        elif column == "programmable_thermostat_difficultly":  # typo in name!
            categories = [
                "Havent tried", "Easy", "Moderately difficult",
                "Very difficult"
            ]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df["programmable_thermostat_difficulty"] = df[column].astype(dtype)
            df.drop(column, axis=1, inplace=True)
        elif column == "ac_comfortability":
            dtype = types.CategoricalDtype([1, 2, 3, 4, 5], ordered=True)
            strs_with_ints = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5}
            df[column] = (df[column].replace(strs_with_ints).astype(dtype))
        elif column == "house_drafty":
            categories = ["No", "Somewhat", "Yes"]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = (df[column].replace({
                "I dont know/Havent noticed": None
            }).astype(dtype))
        elif column.startswith("ac_cooling_"):
            df[column] = df[column].notnull()
        elif column == "change_ac_filters":
            categories = [
                "Every year or greater", "Every 6-12 months",
                "Every 4-6 months", "Every 2-3 months",
                "At least once every month"
            ]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = (df[column].replace({
                "There is an HVAC filter!?": None
            }).astype(dtype))
        elif column == "ac_service_package":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column.startswith("ac_service_package_cost"):
            df.drop(column, axis=1, inplace=True)
        elif column == "ac_service_date":
            categories = [
                "Less than a year ago", "1-2 years ago", "2-3 years ago",
                "3-5 years ago", "Never"
            ]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = (df[column].replace({
                "I dont know": None
            }).astype(dtype))
        elif column == "water_heater_tankless":
            df[column] = df[column].replace({"Yes": True, "No": False})
        elif column.startswith("light_bulbs_"):
            df[column] = df[column].notnull()
        elif column.startswith("electronic_devices_"):
            dtype = types.CategoricalDtype(categories=[0, 1, 2, 3, 4, 5],
                                           ordered=True)
            strs_with_ints = {
                None: 0,
                '1': 1,
                '2': 2,
                '3': 3,
                '4': 4,
                "5 or more": 5
            }
            df[column] = (df[column].replace(strs_with_ints).astype(dtype))
        elif column.endswith("_number"):
            df.drop(column, axis=1, inplace=True)
        elif column == "recessed_lights_location":
            df.drop(column, axis=1, inplace=True)
        elif column == "track_lights_location":
            df.drop(column, axis=1, inplace=True)
        else:
            pass

    return df
Exemplo n.º 10
0
def read_survey_2014_all_participants_table(con: sqlalchemy.engine.Connectable,
                                            schema: str) -> pd.DataFrame:
    """
    Read 2014 survey data from a database into a `pandas.DataFrame`.

    Parameters
    ----------
    con : `sqlalchemy.engine.Connectable`
        An object which supports execution of SQL constructs. Currently there
        are two implementations: `sqlalchemy.engine.Connection` and
        `sqlalchemy.engine.Engine`.
    schema : `str`
        Name of schema containing the `survey_2014_all_participants` table/view.

    Returns
    -------
    df: `pandas.DataFrame`

        2014 survey data for all participants.

    """
    df = pd.read_sql_table("survey_2014_all_participants", con, schema)

    # merge the two foundation columns into single categorical columns
    dtype = types.CategoricalDtype(
        categories=["Pier and beam", "Slab", "Both"], ordered=False)
    df["foundation"] = (df.apply(_merge_foundation_columns,
                                 axis=1).astype(dtype))
    df.drop(["foundation_pier_beam", "foundation_slab"], axis=1, inplace=True)

    for column in df:
        if column == "status":
            dtype = types.CategoricalDtype(categories=["Complete", "Partial"],
                                           ordered=False)
            df[column] = df[column].astype(dtype)
        elif column.startswith("spend_time_at_home_"):
            df[column] = (df[column].replace({'': None}).notnull())
        elif column.startswith("ethnicity_"):
            df[column] = (df[column].replace({'': None}).notnull())
        elif column.startswith("hvac_"):
            df[column] = (df[column].replace({'': None}).notnull())
        elif column.startswith("residents_"):
            df[column] = (df[column].replace({
                'None': 0,
                '': 0,
                '5 or more': 5
            }).astype('int64'))
        elif column == "education_level":
            categories = [
                "High School graduate", "Some college/trade/vocational school",
                "College graduate", "Postgraduate degree"
            ]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = df[column].astype(dtype)
        elif column == "total_annual_income":
            categories = [
                "Less than $10,000", "$10,000 - $19,999", "$20,000 - $34,999",
                "$35,000 - $49,999", "$50,000 - $74,999", "$75,000 - $99,999",
                "$100,000 - $149,999", "$150,000 - $299,000",
                "$300,000 - $1,000,000", "more than $1,000,000"
            ]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = (df[column].str.replace('"""', '').replace({
                '': None
            }).astype(dtype))
        elif column == "pv_system_own":
            df[column] = df[column] == "Yes"
        elif column == "pv_system_size":
            str_to_numeric = pd.to_numeric(df.pv_system_size.str.replace(
                "kw", '', case=False).str.replace('"', '').str.replace(
                    ',', '').str.replace(' ', '').str.replace('DC',
                                                              '').replace({
                                                                  '':
                                                                  None,
                                                                  'NA':
                                                                  None,
                                                                  'n/a':
                                                                  None
                                                              }),
                                           errors="coerce")
            df[column] = str_to_numeric.apply(lambda v: v / 1e3
                                              if v > 1e3 else v)
        elif column == "pv_system_reason":
            df.drop(column, axis=1,
                    inplace=True)  # significant cleaning required!
        elif column == "pv_system_satisfied":
            categories = [
                "Very dissatisfied", "Somewhat dissatisfied", "Neutral",
                "Somewhat satisfied", "Very"
            ]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = (df[column].replace({'': None}).astype(dtype))
        elif column.startswith("pv_system_features_"):
            df.drop(column, axis=1,
                    inplace=True)  # significant cleaning required!
        elif column.startswith("pv_system_common_"):
            df.drop(column, axis=1,
                    inplace=True)  # significant cleaning required!
        elif column.startswith("pv_neg_factors_"):
            df[column] = (df[column].replace({'': None}).notnull())
        elif column.startswith("pv_pos_"):
            df[column] = (df[column].replace({'': None}).notnull())
        elif column == "pv_owner_response":
            df.drop(column, axis=1,
                    inplace=True)  # significant cleaning required!
        elif column == "retrofits":
            df[column] = df[column].replace({
                '': None,
                "Yes": True,
                "No": False
            })
        elif column == "retrofits_detail":
            df.drop(column, axis=1,
                    inplace=True)  # significant cleaning required!
        elif column == "irrigation_system":
            df[column] = df[column].replace({
                '': None,
                "Yes": True,
                "No": False
            })
        elif column == "ceiling_fans_count":
            df[column] = (df.ceiling_fans_count.replace({
                '': '0'
            }).astype("int64"))
        elif column == "compressors_count":
            df[column] = (df.compressors_count.replace({
                '': '0'
            }).astype("int64"))
        elif column.startswith("compressor1_"):
            df.drop(column, axis=1,
                    inplace=True)  # significant cleaning required!
        elif column.startswith("compressor2_"):
            df.drop(column, axis=1,
                    inplace=True)  # significant cleaning required!
        elif column.startswith("compressor3_"):
            df.drop(column, axis=1,
                    inplace=True)  # significant cleaning required!
        elif column.startswith("temp_summer_"):
            df[column] = (df[column].replace({'': None}).astype("float64"))
        elif column.startswith("temp_winter_"):
            df[column] = pd.to_numeric(df[column], errors="coerce")
        elif column == "thermostats_brand":
            df.drop(column, axis=1,
                    inplace=True)  # significant cleaning required!
        elif column == "programmable_thermostat_currently_programmed":
            df[column] = df[column].replace({
                '': None,
                "Yes": True,
                "No": False,
                "\"\"\"I don\'t know\"\"\"": None
            })
        elif column == "programmable_thermostat_difficulty":
            categories = ["Easy", "Moderately difficult", "Very difficult"]
            dtype = types.CategoricalDtype(categories, ordered=True)
            df[column] = (df.programmable_thermostat_difficulty.replace({
                '':
                None,
                "\"\"\"Haven't tried\"\"\"":
                None
            }).astype(dtype))
        elif column == "ac_service_package":
            df[column] = df[column].replace({
                '': None,
                "Yes": True,
                "No": False
            })
        elif column == "electronic_devices_on_other":
            df.drop(column, axis=1,
                    inplace=True)  # significant cleaning required!
        elif column.startswith("electronic_devices_"):
            df[column] = (df[column].replace({
                'None': 0,
                '': 0,
                '5 or more': 5
            }).astype('int64'))
        else:
            pass

    return df