def transform(self, X) -> pd.DataFrame: """ """ for var in self.variables: X[var] = X[var].astype( pandas_types.CategoricalDtype(categories=self.categories[var])) X = pd.get_dummies(X) return X
def add_margins(df, vars, margins=True): """ Add margins to a data frame. All margining variables will be converted to factors. Parameters ---------- df : dataframe input data frame vars : list a list of 2 lists | tuples vectors giving the variables in each dimension margins : bool | list variable names to compute margins for. True will compute all possible margins. """ margin_vars = _margins(vars, margins) if not margin_vars: return df # create margin dataframes margin_dfs = [df] for vlst in margin_vars[1:]: dfx = df.copy() for v in vlst: dfx.loc[0:, v] = '(all)' margin_dfs.append(dfx) merged = pd.concat(margin_dfs, axis=0) merged.reset_index(drop=True, inplace=True) # All margin columns become categoricals. The margin indicator # (all) needs to be added as the last level of the categories. categories = {} for v in itertools.chain(*vars): col = df[v] if not pdtypes.is_categorical_dtype(df[v].dtype): col = pd.Categorical(df[v]) categories[v] = col.categories if '(all)' not in categories[v]: categories[v] = categories[v].insert( len(categories[v]), '(all)') for v in merged.columns.intersection(set(categories)): merged[v] = merged[v].astype( pdtypes.CategoricalDtype(categories[v])) return merged
def ordinal_to_numeric(self, df: pd.DataFrame, column: str, categories: pd.Series, create_column: bool = False): column_categories = pdt.CategoricalDtype(categories=categories, ordered=True) df[column] = df[column].astype(column_categories) if create_column: df[column + "_numeric"] = df[column].cat.codes + 1 else: df[column] = df[column].cat.codes + 1 return df
def load_train_dataset(variables_family: str = None, return_dataframe: bool = False, excluded_variables: list = list()): path_train = "{}/datasets/data_train.csv".format( os.path.dirname(os.path.abspath(__file__))) df = pd.read_csv(path_train) quantitatives = get_quantitatives(variables_family=variables_family, excluded_variables=excluded_variables) categorical = "class" df[categorical] = df[categorical].astype( types.CategoricalDtype(ordered=False)) if return_dataframe: quantitatives.add(categorical) return df[quantitatives] return df[quantitatives].values, df[categorical].values, quantitatives
def _calculate_agg_features(self, features, frame, df_trie): test_feature = features[0] child_entity = test_feature.base_features[0].entity base_frame = df_trie.get_node(test_feature.relationship_path).value # Sometimes approximate features get computed in a previous filter frame # and put in the current one dynamically, # so there may be existing features here features = [f for f in features if f.get_name() not in frame.columns] if not len(features): return frame # handle where where = test_feature.where if where is not None and not base_frame.empty: base_frame = base_frame.loc[base_frame[where.get_name()]] # when no child data, just add all the features to frame with nan if base_frame.empty: for f in features: frame[f.get_name()] = np.nan else: relationship_path = test_feature.relationship_path groupby_var = get_relationship_variable_id(relationship_path) # if the use_previous property exists on this feature, include only the # instances from the child entity included in that Timedelta use_previous = test_feature.use_previous if use_previous and not base_frame.empty: # Filter by use_previous values time_last = self.time_last if use_previous.is_absolute(): time_first = time_last - use_previous ti = child_entity.time_index if ti is not None: base_frame = base_frame[base_frame[ti] >= time_first] else: n = use_previous.value def last_n(df): return df.iloc[-n:] base_frame = base_frame.groupby(groupby_var, observed=True, sort=False).apply(last_n) to_agg = {} agg_rename = {} to_apply = set() # apply multivariable and time-dependent features as we find them, and # save aggregable features for later for f in features: if _can_agg(f): variable_id = f.base_features[0].get_name() if variable_id not in to_agg: to_agg[variable_id] = [] func = f.get_function() # for some reason, using the string count is significantly # faster than any method a primitive can return # https://stackoverflow.com/questions/55731149/use-a-function-instead-of-string-in-pandas-groupby-agg if is_python_2() and func == pd.Series.count.__func__: func = "count" elif func == pd.Series.count: func = "count" funcname = func if callable(func): # if the same function is being applied to the same # variable twice, wrap it in a partial to avoid # duplicate functions funcname = str(id(func)) if u"{}-{}".format(variable_id, funcname) in agg_rename: func = partial(func) funcname = str(id(func)) func.__name__ = funcname to_agg[variable_id].append(func) # this is used below to rename columns that pandas names for us agg_rename[u"{}-{}".format(variable_id, funcname)] = f.get_name() continue to_apply.add(f) # Apply the non-aggregable functions generate a new dataframe, and merge # it with the existing one if len(to_apply): wrap = agg_wrapper(to_apply, self.time_last) # groupby_var can be both the name of the index and a column, # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) to_merge = base_frame.groupby(base_frame[groupby_var], observed=True, sort=False).apply(wrap) frame = pd.merge(left=frame, right=to_merge, left_index=True, right_index=True, how='left') # Apply the aggregate functions to generate a new dataframe, and merge # it with the existing one if len(to_agg): # groupby_var can be both the name of the index and a column, # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) to_merge = base_frame.groupby(base_frame[groupby_var], observed=True, sort=False).agg(to_agg) # rename columns to the correct feature names to_merge.columns = [ agg_rename["-".join(x)] for x in to_merge.columns.ravel() ] to_merge = to_merge[list(agg_rename.values())] # workaround for pandas bug where categories are in the wrong order # see: https://github.com/pandas-dev/pandas/issues/22501 if pdtypes.is_categorical_dtype(frame.index): categories = pdtypes.CategoricalDtype( categories=frame.index.categories) to_merge.index = to_merge.index.astype(object).astype( categories) frame = pd.merge(left=frame, right=to_merge, left_index=True, right_index=True, how='left') # Handle default values fillna_dict = {} for f in features: feature_defaults = { name: f.default_value for name in f.get_feature_names() } fillna_dict.update(feature_defaults) frame.fillna(fillna_dict, inplace=True) # convert boolean dtypes to floats as appropriate # pandas behavior: https://github.com/pydata/pandas/issues/3752 for f in features: if (f.number_output_features == 1 and f.variable_type == variable_types.Numeric and frame[f.get_name()].dtype.name in ['object', 'bool']): frame[f.get_name()] = frame[f.get_name()].astype(float) return frame
def _calculate_agg_features(self, features, entity_frames): test_feature = features[0] entity = test_feature.entity child_entity = test_feature.base_features[0].entity assert entity.id in entity_frames and child_entity.id in entity_frames frame = entity_frames[entity.id] base_frame = entity_frames[child_entity.id] # Sometimes approximate features get computed in a previous filter frame # and put in the current one dynamically, # so there may be existing features here features = [f for f in features if f.get_name() not in frame.columns] if not len(features): return frame # handle where clause for all functions below where = test_feature.where if where is not None: base_frame = base_frame[base_frame[where.get_name()]] relationship_path = self.entityset.find_backward_path( entity.id, child_entity.id) groupby_var = Relationship._get_link_variable_name(relationship_path) # if the use_previous property exists on this feature, include only the # instances from the child entity included in that Timedelta use_previous = test_feature.use_previous if use_previous and not base_frame.empty: # Filter by use_previous values time_last = self.time_last if use_previous.is_absolute(): time_first = time_last - use_previous ti = child_entity.time_index if ti is not None: base_frame = base_frame[base_frame[ti] >= time_first] else: n = use_previous.value def last_n(df): return df.iloc[-n:] base_frame = base_frame.groupby(groupby_var, observed=True, sort=False).apply(last_n) to_agg = {} agg_rename = {} to_apply = set() # apply multivariable and time-dependent features as we find them, and # save aggregable features for later for f in features: if _can_agg(f): variable_id = f.base_features[0].get_name() if variable_id not in to_agg: to_agg[variable_id] = [] func = f.get_function() funcname = func if callable(func): funcname = func.__name__ to_agg[variable_id].append(func) # this is used below to rename columns that pandas names for us agg_rename[u"{}-{}".format(variable_id, funcname)] = f.get_name() continue to_apply.add(f) # Apply the non-aggregable functions generate a new dataframe, and merge # it with the existing one if len(to_apply): wrap = agg_wrapper(to_apply, self.time_last) # groupby_var can be both the name of the index and a column, # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) to_merge = base_frame.groupby(base_frame[groupby_var], observed=True, sort=False).apply(wrap) to_merge.reset_index(1, drop=True, inplace=True) frame = pd.merge(left=frame, right=to_merge, left_index=True, right_index=True, how='left') # Apply the aggregate functions to generate a new dataframe, and merge # it with the existing one if len(to_agg): # groupby_var can be both the name of the index and a column, # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) to_merge = base_frame.groupby(base_frame[groupby_var], observed=True, sort=False).agg(to_agg) # rename columns to the correct feature names to_merge.columns = [ agg_rename["-".join(x)] for x in to_merge.columns.ravel() ] to_merge = to_merge[list(agg_rename.values())] # workaround for pandas bug where categories are in the wrong order # see: https://github.com/pandas-dev/pandas/issues/22501 if pdtypes.is_categorical_dtype(frame.index): categories = pdtypes.CategoricalDtype( categories=frame.index.categories) to_merge.index = to_merge.index.astype(object).astype( categories) frame = pd.merge(left=frame, right=to_merge, left_index=True, right_index=True, how='left') # Handle default values # 1. handle non scalar default values iterfeats = [ f for f in features if hasattr(f.default_value, '__iter__') ] for f in iterfeats: nulls = pd.isnull(frame[f.get_name()]) for ni in nulls[nulls].index: frame.at[ni, f.get_name()] = f.default_value # 2. handle scalars default values fillna_dict = { f.get_name(): f.default_value for f in features if f not in iterfeats } frame.fillna(fillna_dict, inplace=True) # convert boolean dtypes to floats as appropriate # pandas behavior: https://github.com/pydata/pandas/issues/3752 for f in features: if (not f.expanding and f.variable_type == variable_types.Numeric and frame[f.get_name()].dtype.name in ['object', 'bool']): frame[f.get_name()] = frame[f.get_name()].astype(float) return frame
def _calculate_agg_features(self, features, frame, df_trie, progress_callback): test_feature = features[0] child_dataframe = test_feature.base_features[0].dataframe base_frame = df_trie.get_node(test_feature.relationship_path).value parent_merge_col = test_feature.relationship_path[0][ 1]._parent_column_name # Sometimes approximate features get computed in a previous filter frame # and put in the current one dynamically, # so there may be existing features here fl = [] for f in features: for ind in f.get_feature_names(): if ind not in frame.columns: fl.append(f) break features = fl if not len(features): progress_callback(len(features) / float(self.num_features)) return frame # handle where base_frame_empty = (base_frame.empty if isinstance( base_frame, pd.DataFrame) else False) where = test_feature.where if where is not None and not base_frame_empty: base_frame = base_frame.loc[base_frame[where.get_name()]] # when no child data, just add all the features to frame with nan base_frame_empty = (base_frame.empty if isinstance( base_frame, pd.DataFrame) else False) if base_frame_empty: feature_values = [] for f in features: feature_values.append( (f, np.full(f.number_output_features, np.nan))) progress_callback(1 / float(self.num_features)) frame = update_feature_columns(feature_values, frame) else: relationship_path = test_feature.relationship_path groupby_col = get_relationship_column_id(relationship_path) # if the use_previous property exists on this feature, include only the # instances from the child dataframe included in that Timedelta use_previous = test_feature.use_previous if use_previous: # Filter by use_previous values time_last = self.time_last if use_previous.has_no_observations(): time_first = time_last - use_previous ti = child_dataframe.ww.time_index if ti is not None: base_frame = base_frame[base_frame[ti] >= time_first] else: n = use_previous.get_value("o") def last_n(df): return df.iloc[-n:] base_frame = base_frame.groupby(groupby_col, observed=True, sort=False).apply(last_n) to_agg = {} agg_rename = {} to_apply = set() # apply multi-column and time-dependent features as we find them, and # save aggregable features for later for f in features: if _can_agg(f): column_id = f.base_features[0].get_name() if column_id not in to_agg: to_agg[column_id] = [] if isinstance(base_frame, dd.DataFrame): func = f.get_function(agg_type=Library.DASK) elif is_instance(base_frame, ps, "DataFrame"): func = f.get_function(agg_type=Library.SPARK) else: func = f.get_function() # for some reason, using the string count is significantly # faster than any method a primitive can return # https://stackoverflow.com/questions/55731149/use-a-function-instead-of-string-in-pandas-groupby-agg if func == pd.Series.count: func = "count" funcname = func if callable(func): # if the same function is being applied to the same # column twice, wrap it in a partial to avoid # duplicate functions funcname = str(id(func)) if "{}-{}".format(column_id, funcname) in agg_rename: func = partial(func) funcname = str(id(func)) func.__name__ = funcname if isinstance(func, dd.Aggregation): # TODO: handle aggregation being applied to same column twice # (see above partial wrapping of functions) funcname = func.__name__ to_agg[column_id].append(func) # this is used below to rename columns that pandas names for us agg_rename["{}-{}".format(column_id, funcname)] = f.get_name() continue to_apply.add(f) # Apply the non-aggregable functions generate a new dataframe, and merge # it with the existing one if len(to_apply): wrap = agg_wrapper(to_apply, self.time_last) # groupby_col can be both the name of the index and a column, # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) to_merge = base_frame.groupby(base_frame[groupby_col], observed=True, sort=False).apply(wrap) frame = pd.merge( left=frame, right=to_merge, left_index=True, right_index=True, how="left", ) progress_callback(len(to_apply) / float(self.num_features)) # Apply the aggregate functions to generate a new dataframe, and merge # it with the existing one if len(to_agg): # groupby_col can be both the name of the index and a column, # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) if is_instance(base_frame, (dd, ps), "DataFrame"): to_merge = base_frame.groupby(groupby_col).agg(to_agg) else: to_merge = base_frame.groupby(base_frame[groupby_col], observed=True, sort=False).agg(to_agg) # rename columns to the correct feature names to_merge.columns = [ agg_rename["-".join(x)] for x in to_merge.columns ] to_merge = to_merge[list(agg_rename.values())] # Workaround for pandas bug where categories are in the wrong order # see: https://github.com/pandas-dev/pandas/issues/22501 # # Pandas claims that bug is fixed but it still shows up in some # cases. More investigation needed. if pdtypes.is_categorical_dtype(frame.index): categories = pdtypes.CategoricalDtype( categories=frame.index.categories) to_merge.index = to_merge.index.astype(object).astype( categories) if is_instance(frame, (dd, ps), "DataFrame"): frame = frame.merge(to_merge, left_on=parent_merge_col, right_index=True, how="left") else: frame = pd.merge( left=frame, right=to_merge, left_index=True, right_index=True, how="left", ) # determine number of features that were just merged progress_callback( len(to_merge.columns) / float(self.num_features)) # Handle default values fillna_dict = {} for f in features: feature_defaults = { name: f.default_value for name in f.get_feature_names() } fillna_dict.update(feature_defaults) frame = frame.fillna(fillna_dict) return frame
for element in combination: current_key = dict_translate[element] element_to_add = SUPERCLUSTERS[current_key] if element_to_add != "NA": new_combination += [element_to_add] return new_combination df['combination_classes'] = df['combination'].apply( combination_to_combi_of_classes) ## Compute entropy df['entropy'] = df['combination_classes'].apply(entropy) # Round to 2 decimals and make a category df['entropy'] = df['entropy'].apply(lambda x: round(x, 2)).astype( pdtypes.CategoricalDtype()) # Using only combis of each individual length all_lengths = sorted(set(df['combi_length'])) # No point in going beyond, roughly, 12 all_lengths = [l for l in all_lengths if l <= 12] for length in all_lengths: df_filtered = df.loc[df['combi_length'] == length, :] try: p = (ggplot(data=df_filtered, mapping=aes(x='entropy', y='fc')) + geom_violin(position=position_dodge(1), width=1) +
def read_survey_2013_all_participants_table(con: sqlalchemy.engine.Connectable, schema: str) -> pd.DataFrame: """ Read 2013 survey data from a database into a `pandas.DataFrame`. Parameters ---------- con : `sqlalchemy.engine.Connectable` An object which supports execution of SQL constructs. Currently there are two implementations: `sqlalchemy.engine.Connection` and `sqlalchemy.engine.Engine`. schema : `str` Name of schema containing the `survey_2013_all_participants` table/view. Returns ------- df: `pandas.DataFrame` 2013 survey data for all participants. """ df = pd.read_sql_table("survey_2013_all_participants", con, schema) # these variables are encoded as True/False but could also be 1/0. # merge the two foundation columns into single categorical columns dtype = types.CategoricalDtype( categories=["Pier and beam", "Slab", "Both"], ordered=False) df["foundation"] = (df.apply(_merge_foundation_columns, axis=1).astype(dtype)) df.drop(["foundation_pier_beam", "foundation_slab"], axis=1, inplace=True) for column in df: if column == "primary_residence": df[column] = df[column].replace({"Yes": True, "No": False}) elif column == "number_floors": dtype = types.CategoricalDtype(categories=[1, 2, 3, 4], ordered=True) strs_to_ints = {"One": 1, "Two": 2, "Three": 3, "Four": 4} df[column] = (df[column].replace(strs_to_ints).astype(dtype)) elif column == "year_moved_into_house": categories = range(1958, 2018) dtype = types.CategoricalDtype(categories, ordered=True) df[column] = (df[column].apply( lambda v: int(v) if v is not None else v).astype(dtype)) elif column == "month_moved_into_house": categories = [ "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December" ] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = df[column].astype(dtype) elif column == "year_house_constructed": categories = range(1930, 2018) dtype = types.CategoricalDtype(categories, ordered=True) df[column] = (df[column].replace({ "1930 or earlier": "1930" }).apply(lambda v: int(v) if v is not None else v).astype(dtype)) elif column == "house_num_rooms": categories = range(1, 17) dtype = types.CategoricalDtype(categories, ordered=True) df[column] = (df[column].apply( lambda v: int(v) if v is not None else v).astype(dtype)) elif column == "house_ceiling_height": df.drop(column, axis=1, inplace=True) # needs substantial cleaning! elif column == "house_square_feet": df[column] = df[column].astype("float64") elif column.startswith("spend_time_at_home_"): df[column] = df[column].notnull() elif column.startswith("ethnicity_"): df[column] = df[column].notnull() elif column.startswith("sex_"): dtype = types.CategoricalDtype(categories=[0, 1, 2, 3, 4, 5], ordered=True) strs_with_ints = { None: 0, '1': 1, '2': 2, '3': 3, '4': 4, "5 or more": 5 } df[column] = (df[column].replace(strs_with_ints).astype(dtype)) elif column.startswith("residents_"): dtype = types.CategoricalDtype(categories=[0, 1, 2, 3, 4, 5], ordered=True) strs_with_ints = { None: 0, '1': 1, '2': 2, '3': 3, '4': 4, "5 or more": 5 } df[column] = (df[column].replace(strs_with_ints).astype(dtype)) elif column == "education_level": categories = [ "High School graduate", "Some college/trade/vocational school", "College graduate", "Postgraduate degree" ] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = df[column].astype(dtype) elif column == "total_annual_income": categories = [ "Less than $10,000", "$10,000 - $19,999", "$20,000 - $34,999", "$35,000 - $49,999", "$50,000 - $74,999", "$75,000 - $99,999", "$100,000 - $149,999", "$150,000 - $299,000", "$300,000 - $1,000,000", "more than $1,000,000" ] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = (df[column].str.replace('-', ',').str.replace( " , ", " - ").astype(dtype)) elif column == "smartphone_own": df[column] = df[column].replace({"Yes": True, "No": False}) elif column == "tablet_own": df[column] = df[column].replace({"Yes": True, "No": False}) elif column == "pv_system_own": df[column] = df[column].replace({"Yes": True, "No": False}) elif column == "pv_system_size": specific_replacements = { "5kw I think": 5.0, "8060 Watts": 8.060, "20W (solar powered attic fan)": 0.020 } str_to_numeric = pd.to_numeric(df.pv_system_size.str.replace( ' ', '').str.replace("kw", '', case=False).replace(specific_replacements), errors="coerce") df[column] = str_to_numeric.apply(lambda v: v / 1e3 if v > 1e3 else v) elif column == "electricity_used_monthly": pass # TODO column needs significant cleaning! elif column == "gas_used_monthly": pass # TODO column needs significant cleaning! elif column == "retrofits": df[column] = df[column].replace({"Yes": True, "No": False}) elif column == "retrofits_detail": df.drop(column, axis=1, inplace=True) elif column == "retrofits_reason": df[column] = df[column].replace({ "Yes": True, "No": False, "N/A": None }) elif column.startswith("appliance_"): categories = [ 'rarely', 'once or twice a month', 'once or twice a week', 'several times a week', 'daily basis' ] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = (df[column].replace({"N/A": None}).astype(dtype)) elif column == "irrigation_system": df[column] = df[column].replace({"Yes": True, "No": False}) elif column == "cooking_weekdays_times": categories = ["None", "Less than 1 hour", "An hour or more"] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = df[column].astype(dtype) elif column == "cooking_weekends_times": categories = ["None", "Less than 1 hour", "An hour or more"] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = df[column].astype(dtype) elif column.startswith("cooking_"): df[column] = df[column].notnull() elif column.startswith("blinds_"): categories = ['Rarely or never', 'Some days', 'Most days'] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = df[column].astype(dtype) elif column == "thermostat_settings": categories = [ "We are generally in agreement", "Our preferences vary 1-2 degrees Fahrenheit", "Our preferences vary 3-5 degrees Fahrenheit", "Our preferences vary more than 5 degrees Fahrenheit" ] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = df[column].astype(dtype) elif column == "tv_hours": categories = range(0, 12) dtype = types.CategoricalDtype(categories, ordered=True) strs_with_ints = { None: 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, "10": 10, "10 or more": 11 } df[column] = (df[column].replace(strs_with_ints).astype(dtype)) elif column == "care_energy_cost": df[column] = df[column].replace({"Yes": True, "No": False}) elif column == "reduce_energy_cost": df[column] = df[column].replace({"Yes": True, "No": False}) elif column == "reduce_energy_yes": df.drop(column, axis=1, inplace=True ) # TODO substantial work required to make this useful! elif column == "modify_routines": df[column] = df[column].replace({"Yes": True, "No": False}) elif column.endswith("_brand") or column.endswith("_models"): df.drop(column, axis=1, inplace=True) elif column.startswith("hvac_"): df[column] = df[column].notnull() elif column.startswith("compressor1_"): df.drop(column, axis=1, inplace=True) elif column.startswith("compressor2_"): df.drop(column, axis=1, inplace=True) elif column.startswith("compressor3_"): df.drop(column, axis=1, inplace=True) elif column.startswith("air_handler1_"): df.drop(column, axis=1, inplace=True) elif column.startswith("air_handler2_"): df.drop(column, axis=1, inplace=True) elif column.startswith("heating_"): df[column] = df[column].notnull() elif column.startswith("temp_"): df[column] = pd.to_numeric(df[column], errors="coerce") elif column == "pets": df[column] = df[column].replace({"Yes": True, "No": False}) elif column == "programmable_thermostat_currently_programmed": df[column] = df[column].replace({ "Yes": True, "No": False, "I dont know": None }) elif column == "programmable_thermostat_difficultly": # typo in name! categories = [ "Havent tried", "Easy", "Moderately difficult", "Very difficult" ] dtype = types.CategoricalDtype(categories, ordered=True) df["programmable_thermostat_difficulty"] = df[column].astype(dtype) df.drop(column, axis=1, inplace=True) elif column == "ac_comfortability": dtype = types.CategoricalDtype([1, 2, 3, 4, 5], ordered=True) strs_with_ints = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5} df[column] = (df[column].replace(strs_with_ints).astype(dtype)) elif column == "house_drafty": categories = ["No", "Somewhat", "Yes"] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = (df[column].replace({ "I dont know/Havent noticed": None }).astype(dtype)) elif column.startswith("ac_cooling_"): df[column] = df[column].notnull() elif column == "change_ac_filters": categories = [ "Every year or greater", "Every 6-12 months", "Every 4-6 months", "Every 2-3 months", "At least once every month" ] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = (df[column].replace({ "There is an HVAC filter!?": None }).astype(dtype)) elif column == "ac_service_package": df[column] = df[column].replace({"Yes": True, "No": False}) elif column.startswith("ac_service_package_cost"): df.drop(column, axis=1, inplace=True) elif column == "ac_service_date": categories = [ "Less than a year ago", "1-2 years ago", "2-3 years ago", "3-5 years ago", "Never" ] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = (df[column].replace({ "I dont know": None }).astype(dtype)) elif column == "water_heater_tankless": df[column] = df[column].replace({"Yes": True, "No": False}) elif column.startswith("light_bulbs_"): df[column] = df[column].notnull() elif column.startswith("electronic_devices_"): dtype = types.CategoricalDtype(categories=[0, 1, 2, 3, 4, 5], ordered=True) strs_with_ints = { None: 0, '1': 1, '2': 2, '3': 3, '4': 4, "5 or more": 5 } df[column] = (df[column].replace(strs_with_ints).astype(dtype)) elif column.endswith("_number"): df.drop(column, axis=1, inplace=True) elif column == "recessed_lights_location": df.drop(column, axis=1, inplace=True) elif column == "track_lights_location": df.drop(column, axis=1, inplace=True) else: pass return df
def read_survey_2014_all_participants_table(con: sqlalchemy.engine.Connectable, schema: str) -> pd.DataFrame: """ Read 2014 survey data from a database into a `pandas.DataFrame`. Parameters ---------- con : `sqlalchemy.engine.Connectable` An object which supports execution of SQL constructs. Currently there are two implementations: `sqlalchemy.engine.Connection` and `sqlalchemy.engine.Engine`. schema : `str` Name of schema containing the `survey_2014_all_participants` table/view. Returns ------- df: `pandas.DataFrame` 2014 survey data for all participants. """ df = pd.read_sql_table("survey_2014_all_participants", con, schema) # merge the two foundation columns into single categorical columns dtype = types.CategoricalDtype( categories=["Pier and beam", "Slab", "Both"], ordered=False) df["foundation"] = (df.apply(_merge_foundation_columns, axis=1).astype(dtype)) df.drop(["foundation_pier_beam", "foundation_slab"], axis=1, inplace=True) for column in df: if column == "status": dtype = types.CategoricalDtype(categories=["Complete", "Partial"], ordered=False) df[column] = df[column].astype(dtype) elif column.startswith("spend_time_at_home_"): df[column] = (df[column].replace({'': None}).notnull()) elif column.startswith("ethnicity_"): df[column] = (df[column].replace({'': None}).notnull()) elif column.startswith("hvac_"): df[column] = (df[column].replace({'': None}).notnull()) elif column.startswith("residents_"): df[column] = (df[column].replace({ 'None': 0, '': 0, '5 or more': 5 }).astype('int64')) elif column == "education_level": categories = [ "High School graduate", "Some college/trade/vocational school", "College graduate", "Postgraduate degree" ] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = df[column].astype(dtype) elif column == "total_annual_income": categories = [ "Less than $10,000", "$10,000 - $19,999", "$20,000 - $34,999", "$35,000 - $49,999", "$50,000 - $74,999", "$75,000 - $99,999", "$100,000 - $149,999", "$150,000 - $299,000", "$300,000 - $1,000,000", "more than $1,000,000" ] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = (df[column].str.replace('"""', '').replace({ '': None }).astype(dtype)) elif column == "pv_system_own": df[column] = df[column] == "Yes" elif column == "pv_system_size": str_to_numeric = pd.to_numeric(df.pv_system_size.str.replace( "kw", '', case=False).str.replace('"', '').str.replace( ',', '').str.replace(' ', '').str.replace('DC', '').replace({ '': None, 'NA': None, 'n/a': None }), errors="coerce") df[column] = str_to_numeric.apply(lambda v: v / 1e3 if v > 1e3 else v) elif column == "pv_system_reason": df.drop(column, axis=1, inplace=True) # significant cleaning required! elif column == "pv_system_satisfied": categories = [ "Very dissatisfied", "Somewhat dissatisfied", "Neutral", "Somewhat satisfied", "Very" ] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = (df[column].replace({'': None}).astype(dtype)) elif column.startswith("pv_system_features_"): df.drop(column, axis=1, inplace=True) # significant cleaning required! elif column.startswith("pv_system_common_"): df.drop(column, axis=1, inplace=True) # significant cleaning required! elif column.startswith("pv_neg_factors_"): df[column] = (df[column].replace({'': None}).notnull()) elif column.startswith("pv_pos_"): df[column] = (df[column].replace({'': None}).notnull()) elif column == "pv_owner_response": df.drop(column, axis=1, inplace=True) # significant cleaning required! elif column == "retrofits": df[column] = df[column].replace({ '': None, "Yes": True, "No": False }) elif column == "retrofits_detail": df.drop(column, axis=1, inplace=True) # significant cleaning required! elif column == "irrigation_system": df[column] = df[column].replace({ '': None, "Yes": True, "No": False }) elif column == "ceiling_fans_count": df[column] = (df.ceiling_fans_count.replace({ '': '0' }).astype("int64")) elif column == "compressors_count": df[column] = (df.compressors_count.replace({ '': '0' }).astype("int64")) elif column.startswith("compressor1_"): df.drop(column, axis=1, inplace=True) # significant cleaning required! elif column.startswith("compressor2_"): df.drop(column, axis=1, inplace=True) # significant cleaning required! elif column.startswith("compressor3_"): df.drop(column, axis=1, inplace=True) # significant cleaning required! elif column.startswith("temp_summer_"): df[column] = (df[column].replace({'': None}).astype("float64")) elif column.startswith("temp_winter_"): df[column] = pd.to_numeric(df[column], errors="coerce") elif column == "thermostats_brand": df.drop(column, axis=1, inplace=True) # significant cleaning required! elif column == "programmable_thermostat_currently_programmed": df[column] = df[column].replace({ '': None, "Yes": True, "No": False, "\"\"\"I don\'t know\"\"\"": None }) elif column == "programmable_thermostat_difficulty": categories = ["Easy", "Moderately difficult", "Very difficult"] dtype = types.CategoricalDtype(categories, ordered=True) df[column] = (df.programmable_thermostat_difficulty.replace({ '': None, "\"\"\"Haven't tried\"\"\"": None }).astype(dtype)) elif column == "ac_service_package": df[column] = df[column].replace({ '': None, "Yes": True, "No": False }) elif column == "electronic_devices_on_other": df.drop(column, axis=1, inplace=True) # significant cleaning required! elif column.startswith("electronic_devices_"): df[column] = (df[column].replace({ 'None': 0, '': 0, '5 or more': 5 }).astype('int64')) else: pass return df