def cols_to_multilabel(self):

    '''Utility function for correlation and other reducers that require
    transforming hyperparameter values into multilabel values before
    applying the reduction strategy.'''

    import wrangle
    import pandas as pd

    # read in the experiment log
    data = pd.read_csv(self.experiment_name + '.csv')

    # apply reduction window
    data = data.tail(self.reduction_window)

    # drop all other metric columns except reduction_metric
    data = data[[self.reduction_metric] + self._param_dict_keys]

    # convert all hyperparameter columns to multi-label columns
    for col in data.iloc[:, 1:].columns:

        # get the dtype of the column data
        col_dtype = data[col].dtype

        # parse column name to contain label, value and dtype
        data = wrangle.col_to_multilabel(data,
                                         col,
                                         extended_colname=True,
                                         extended_separator='~' + str(col_dtype) + '~')

    return data
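# For intuition, the transformation above is essentially a one-hot expansion
# where every new column name carries the original hyperparameter name, its
# dtype and the value, joined by '~'. A minimal pandas-only sketch of that
# layout (an approximation for illustration; the real output comes from
# wrangle.col_to_multilabel and the column names here are assumptions):

import pandas as pd

# hypothetical experiment-log fragment: one metric column plus two hyperparameters
log = pd.DataFrame({'val_f1score': [0.71, 0.74, 0.69],
                    'batch_size': [16, 32, 16],
                    'activation': ['relu', 'elu', 'relu']})

# one-hot expand each hyperparameter column, embedding its dtype in the new names
for col in ['batch_size', 'activation']:
    sep = '~' + str(log[col].dtype) + '~'
    dummies = pd.get_dummies(log[col].astype(str), prefix=col, prefix_sep=sep)
    log = pd.concat([log.drop(col, axis=1), dummies], axis=1)

print(log.columns.tolist())
# e.g. ['val_f1score', 'batch_size~int64~16', 'batch_size~int64~32',
#       'activation~object~elu', 'activation~object~relu']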
def telco_churn(quantile=.5):

    '''Returns dataset in the format x, [y1, y2]. This dataset is useful for
    demonstrating multi-output models or for experimenting with reduction
    strategy creation.

    The data comes from a hyperparameter optimization experiment with the
    Kaggle telco churn dataset.

    x: features
    y1: val_loss
    y2: val_f1score

    quantile transforms the otherwise continuous y variables into labels so
    that a higher value is stronger. If set to 0, the original continuous
    values are returned.'''

    import wrangle
    import pandas as pd

    df = pd.read_csv('https://raw.githubusercontent.com/autonomio/examples/master/telco_churn/telco_churn_for_sensitivity.csv')

    df = df.drop(['val_acc', 'loss', 'f1score', 'acc', 'round_epochs'], axis=1)

    for col in df.iloc[:, 2:].columns:
        df = wrangle.col_to_multilabel(df, col)

    df = wrangle.df_rename_cols(df)

    if quantile > 0:
        y1 = (df.C0 < df.C0.quantile(quantile)).astype(int).values
        y2 = (df.C1 > df.C1.quantile(quantile)).astype(int).values
    else:
        y1 = df.C0.values
        y2 = df.C1.values

    x = df.drop(['C0', 'C1'], axis=1).values

    return x, [y1, y2]
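# A minimal usage sketch, assuming the wrangle package is installed and the
# CSV URL above is reachable:

# with the default quantile both targets are binarised
# (1 = stronger result, 0 = weaker result)
x, (y1, y2) = telco_churn(quantile=.5)
print(x.shape, y1.shape, y2.shape)
print(set(y1), set(y2))  # expected to be {0, 1} for both targets

# quantile=0 keeps the original continuous val_loss / val_f1score values
x, (y1, y2) = telco_churn(quantile=0)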
# use estimated age for missing contact age values
estimated_age = (df.cnt_age_est_min + df.cnt_age_est_max) / 2
estimated_age = estimated_age.fillna(0).astype(int)
df['contact_age'] = (df.cnt_age_exact.fillna(0) + estimated_age).astype(int)

# keep these cols
cols = ['part_id', 'part_gender', 'contact_age', 'part_age', 'country',
        'hh_size', 'cnt_gender', 'cnt_home', 'cnt_work', 'cnt_school',
        'cnt_transport', 'cnt_leisure', 'cnt_otherplace']

df = df[cols]

# convert string label values to multi-label columns
df = wrangle.col_to_multilabel(df, 'part_gender')
df = wrangle.col_to_multilabel(df, 'country')

# drop redundant columns
df.drop(['cnt_gender'], axis=1, inplace=True)

# use these column names instead
cols = ['participant_id', 'contact_age', 'age_group', 'household_size',
        'contact_home', 'contact_work', 'contact_school', 'contact_transport',
        'contact_leisure', 'contact_other', 'gender_female', 'gender_male',
        'country_be', 'country_de', 'country_fi', 'country_gb', 'country_it',
        'country_lu', 'country_nl', 'country_pl']

# wrap up
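# Note on the column counts: the 13 selected columns map onto 20 final names
# because part_gender expands into two gender columns, country into eight
# country columns, and cnt_gender is dropped. A hedged sketch of how the
# renaming could then be applied (the assignment below is an assumption about
# how the snippet continues after '# wrap up'):

# sanity check, then swap the expanded column names for the readable ones
assert len(df.columns) == len(cols), (len(df.columns), len(cols))
df.columns = cols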
_null = wr.col_corr_ols(df.head(50), 'bouncerate1', 'bouncerate1')
_null = wr.col_drop_outliers(df, 'bouncerate1', threshold=1)
_null = wr.col_fill_nan(df, 'admin_city')
_null = wr.col_groupby_cdf(df, 'bouncerate1', 'adnetworks', ascending=True)
_null = wr.col_groupby_pdf(df, 'bouncerate1', 'adnetworks', ascending=False)
_null = wr.col_groupby_stats(df_cont_cat, 'bouncerate1', 'binary')
_null = wr.col_impute_nan(df.bouncerate1)
_null = wr.col_move_place(df, 'bouncerate1', 'first')
_null = wr.col_move_place(df, 'bouncerate1', 'last')
_null = wr.col_resample_equal(df.head(50), 'adnetworks', 1)
# _null = wr.col_resample_interval()  # No datetime column
_null = wr.col_rescale_max(df.bouncerate1.values)
_null = wr.col_to_biclass(df, 'category', 'NEWS_AND_MEDIA')
_null = wr.col_to_binary(df, 'bouncerate1')
_null = wr.col_to_buckets(df, 'bouncerate1', 4)
_null = wr.col_to_cols(df[['adnetworks', 'bouncerate1']].reset_index(), 'adnetworks', 'index')
_null = wr.col_to_multilabel(df, 'category')
_null = wr.col_to_split(df.head(10), 'top_downstream', sep='.')

# test all the attributes starting with array_
_null = wr.array_random_shuffle(df[['bouncerate1', 'bouncerate2']].values, df.bouncerate2)
_null = wr.array_random_weighted(df.bouncerate1.head(10), 'normal', 10)
_null = wr.array_reshape_conv1d(df.values)
_null = wr.array_reshape_lstm(df.bouncerate1, 10, 10)
_null = wr.array_split(df.values, df.bouncerate1.values, .1)
_null = wr.array_to_generator(df.values, df.bouncerate1, 20)
_null = wr.array_to_kfold(df.values, df.bouncerate1)
_null = wr.array_to_multilabel(df.head(5).adnetworks.values)
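# The test fixtures df and df_cont_cat are not shown in this snippet; a
# synthetic stand-in with the referenced columns might look like the sketch
# below (column names follow the calls above, but the contents are assumptions
# for illustration, not the project's actual test data):

import numpy as np
import pandas as pd

np.random.seed(0)
n = 100

# stand-in dataframe with the columns the smoke tests reference
df = pd.DataFrame({
    'bouncerate1': np.random.rand(n),                       # continuous metric
    'bouncerate2': np.random.rand(n),                       # second continuous metric
    'admin_city': np.random.choice(['helsinki', None], n),  # contains missing values to fill
    'adnetworks': np.random.randint(0, 5, n),               # low-cardinality categorical
    'category': np.random.choice(['NEWS_AND_MEDIA', 'SPORTS', 'TECH'], n),
    'top_downstream': np.random.choice(['google.com', 'bing.com'], n),
})

# df_cont_cat pairs a continuous column with a binary grouping column
df_cont_cat = pd.DataFrame({'bouncerate1': np.random.rand(n),
                            'binary': np.random.randint(0, 2, n)})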