def set_scalers(self, df):
    """Calibrates real-valued and categorical scalers from training data.

    Args:
      df: Training data frame used to fit the scalers.
    """
    print('Setting scalers with training data...')

    column_definitions = self.get_column_definition()
    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
                                                   column_definitions)
    target_column = utils.get_single_col_by_input_type(
        InputTypes.TARGET, column_definitions)

    # Record the entity identifiers seen during training.
    self.identifiers = list(df[id_column].unique())

    # Fit one StandardScaler over all real-valued inputs, plus a dedicated
    # scaler for the target column (used later to invert predictions).
    real_inputs = utils.extract_cols_from_data_type(
        DataTypes.REAL_VALUED, column_definitions,
        {InputTypes.ID, InputTypes.TIME})
    self._real_scalers = sklearn.preprocessing.StandardScaler().fit(
        df[real_inputs].values)
    self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
        df[[target_column]].values)

    # Fit one label encoder per categorical input column.
    categorical_inputs = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, column_definitions,
        {InputTypes.ID, InputTypes.TIME})

    encoders = {}
    cardinalities = []
    for column in categorical_inputs:
        # Cast to str so mixed integer/string columns encode consistently.
        values = df[column].apply(str)
        encoders[column] = sklearn.preprocessing.LabelEncoder().fit(
            values.values)
        cardinalities.append(values.nunique())

    self._cat_scalers = encoders
    self._num_classes_per_cat_input = cardinalities
def set_scalers(self, df, set_real=True):
    """Calibrates scalers using the data supplied.

    Args:
      df: Data to use to calibrate scalers.
      set_real: Whether to fit real-valued scaler statistics (True) or
        categorical label encoders (False). Real-valued scalers must be
        fitted first, since the categorical pass filters by the
        identifiers recorded here.
    """
    print('Setting scalers with training data...')

    column_definitions = self.get_column_definition()
    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
                                                   column_definitions)
    target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
                                                       column_definitions)

    if set_real:
        # Record the entity identifiers seen during training.
        self.identifiers = list(df[id_column].unique())

        # Per-column (mean, std) statistics for manual normalisation.
        self._real_scalers = {}
        for col in ['transactions', 'log_sales']:  # NOTE: 'oil' was removed here
            self._real_scalers[col] = (df[col].mean(), df[col].std())

        self._target_scaler = (df[target_column].mean(),
                               df[target_column].std())
    else:
        categorical_inputs = utils.extract_cols_from_data_type(
            DataTypes.CATEGORICAL, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        categorical_scalers = {}
        num_classes = []
        if self.identifiers is None:
            raise ValueError('Scale real-valued inputs first')
        id_set = set(self.identifiers)
        # Only fit encoders on rows whose trajectory id was seen when the
        # real-valued scalers were calibrated.
        valid_idx = df['traj_id'].apply(lambda x: x in id_set)
        for col in categorical_inputs:
            # Cast to str so mixed integer/string columns encode consistently.
            srs = df[col].apply(str).loc[valid_idx]
            categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
                srs.values)
            num_classes.append(srs.nunique())

        self._cat_scalers = categorical_scalers
        self._num_classes_per_cat_input = num_classes
def set_scalers(self, df):
    """Calibrates scalers using the data supplied.

    Fits a separate real-valued scaler and target scaler per entity,
    skipping entities whose trajectories are shorter than the model window.

    Args:
      df: Data to use to calibrate scalers.
    """
    print('Setting scalers with training data...')

    column_definitions = self.get_column_definition()
    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
                                                   column_definitions)
    target_column = utils.get_single_col_by_input_type(
        InputTypes.TARGET, column_definitions)

    real_inputs = utils.extract_cols_from_data_type(
        DataTypes.REAL_VALUED, column_definitions,
        {InputTypes.ID, InputTypes.TIME})

    # One scaler per entity, so each series is normalised independently.
    self._real_scalers = {}
    self._target_scaler = {}
    kept_ids = []
    for entity, group in df.groupby(id_column):
        # Trajectories shorter than the sampling window cannot be used.
        if len(group) < self._time_steps:
            continue
        self._real_scalers[entity] = (
            sklearn.preprocessing.StandardScaler().fit(
                group[real_inputs].values))
        self._target_scaler[entity] = (
            sklearn.preprocessing.StandardScaler().fit(
                group[[target_column]].values))
        kept_ids.append(entity)

    # Label-encode every categorical input column over the full frame.
    categorical_inputs = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, column_definitions,
        {InputTypes.ID, InputTypes.TIME})

    encoders = {}
    class_counts = []
    for col in categorical_inputs:
        # Cast to str so mixed integer/string columns encode consistently.
        as_str = df[col].apply(str)
        encoders[col] = sklearn.preprocessing.LabelEncoder().fit(as_str.values)
        class_counts.append(as_str.nunique())

    self._cat_scalers = encoders
    self._num_classes_per_cat_input = class_counts

    # Keep only the identifiers that survived the length filter.
    self.identifiers = kept_ids
def set_scalers(self, df, set_real=True):
    """Calibrates scalers using the data supplied.

    Label encoding is applied to the entire dataset (i.e. including test),
    so that unseen labels can be handled at run-time.

    Args:
      df: Data to use to calibrate scalers.
      set_real: Whether to fit set real-valued or categorical scalers
    """
    print('Setting scalers with training data...')

    column_definitions = self.get_column_definition()
    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
                                                   column_definitions)
    target_column = utils.get_single_col_by_input_type(
        InputTypes.TARGET, column_definitions)

    if not set_real:
        # Fit categorical encoders only on rows whose trajectory id was
        # seen when the real-valued scalers were calibrated.
        categorical_inputs = utils.extract_cols_from_data_type(
            DataTypes.CATEGORICAL, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        encoders = {}
        cardinalities = []
        if self.identifiers is None:
            raise ValueError('Scale real-valued inputs first!')
        known_ids = set(self.identifiers)
        keep_mask = df['traj_id'].apply(lambda x: x in known_ids)
        for col in categorical_inputs:
            # Cast to str so mixed integer/string columns encode consistently.
            as_str = df[col].apply(str).loc[keep_mask]
            encoders[col] = sklearn.preprocessing.LabelEncoder().fit(
                as_str.values)
            cardinalities.append(as_str.nunique())

        self._cat_scalers = encoders
        self._num_classes_per_cat_input = cardinalities
        return

    # Record identifiers in case they are needed later.
    self.identifiers = list(df[id_column].unique())

    # Store (mean, std) pairs for manual z-score normalisation.
    self._real_scalers = {
        col: (df[col].mean(), df[col].std())
        for col in ['oil', 'transactions', 'log_sales']
    }
    self._target_scaler = (df[target_column].mean(), df[target_column].std())
def set_scalers(self, df, set_real=True):
    """Calibrates scalers using the data supplied.

    Args:
      df: Data to use to calibrate scalers.
      set_real: Whether to fit real-valued scalers (True) or categorical
        label encoders (False). Real-valued scalers must be fitted first,
        since the categorical pass filters by the identifiers recorded here.
    """
    print('Setting scalers with training data...')

    column_definitions = self.get_column_definition()
    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
                                                   column_definitions)
    target_column = utils.get_single_col_by_input_type(
        InputTypes.TARGET, column_definitions)

    if set_real:
        # Extract identifiers in case required.
        self.identifiers = list(df[id_column].unique())

        real_inputs = utils.extract_cols_from_data_type(
            DataTypes.REAL_VALUED, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        # Fit a single StandardScaler across all real-valued inputs, plus a
        # dedicated one for the target (used to invert predictions later).
        data = df[real_inputs].values
        self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
        self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
            df[[target_column]].values)
    else:
        # Format categorical scalers.
        categorical_inputs = utils.extract_cols_from_data_type(
            DataTypes.CATEGORICAL, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        categorical_scalers = {}
        num_classes = []
        if self.identifiers is None:
            raise ValueError('Scale real-valued inputs first!')
        id_set = set(self.identifiers)
        # Only encode rows whose identifier was seen during training.
        valid_idx = df['ID'].apply(lambda x: x in id_set)
        for col in categorical_inputs:
            # Cast to str so mixed integer/string columns encode consistently.
            srs = df[col].apply(str).loc[valid_idx]
            categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
                srs.values)
            num_classes.append(srs.nunique())

        self._cat_scalers = categorical_scalers
        self._num_classes_per_cat_input = num_classes
def set_scalers(self, df, enable_scaling=False):
    """Calibrates scalers using the data supplied.

    Args:
      df: Data to use to calibrate scalers.
      enable_scaling: If False (the default), the StandardScalers are fit
        with with_mean=False and with_std=False, so transform becomes an
        identity mapping; if True, standard z-score scaling is applied.
    """
    print('Setting scalers with training data...')

    column_definitions = self.get_column_definition()
    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
                                                   column_definitions)
    target_column = utils.get_single_col_by_input_type(
        InputTypes.TARGET, column_definitions)

    # Extract identifiers in case required.
    self.identifiers = list(df[id_column].unique())

    # Format real scalers.
    real_inputs = utils.extract_cols_from_data_type(
        DataTypes.REAL_VALUED, column_definitions,
        {InputTypes.ID, InputTypes.TIME})

    data = df[real_inputs].values
    print(f"Scaling Enabled:{enable_scaling}")
    self._real_scalers = sklearn.preprocessing.StandardScaler(
        with_mean=enable_scaling, with_std=enable_scaling).fit(data)
    # Kept separately so predictions can be un-scaled later.
    self._target_scaler = sklearn.preprocessing.StandardScaler(
        with_mean=enable_scaling,
        with_std=enable_scaling).fit(df[[target_column]].values)

    # Format categorical scalers.
    categorical_inputs = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, column_definitions,
        {InputTypes.ID, InputTypes.TIME})

    categorical_scalers = {}
    num_classes = []
    for col in categorical_inputs:
        # Set all to str so that we don't have mixed integer/string columns.
        srs = df[col].apply(str)
        categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
            srs.values)
        num_classes.append(srs.nunique())

    # Set categorical scaler outputs.
    self._cat_scalers = categorical_scalers
    self._num_classes_per_cat_input = num_classes
def transform_inputs(self, df):
    """Performs feature transformations.

    This includes both feature engineering, preprocessing and normalisation.

    Args:
      df: Data frame to transform.

    Returns:
      Transformed data frame.

    Raises:
      ValueError: If scalers have not been calibrated via set_scalers.
    """
    if self._real_scalers is None and self._cat_scalers is None:
        raise ValueError('Scalers have not been set!')

    # Extract relevant columns.
    column_definitions = self.get_column_definition()
    id_col = utils.get_single_col_by_input_type(InputTypes.ID,
                                                column_definitions)
    real_inputs = utils.extract_cols_from_data_type(
        DataTypes.REAL_VALUED, column_definitions,
        {InputTypes.ID, InputTypes.TIME})
    categorical_inputs = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, column_definitions,
        {InputTypes.ID, InputTypes.TIME})

    # Transform real inputs per entity.
    df_list = []
    for identifier, sliced in df.groupby(id_col):
        # Filter out any trajectories that are too short.
        if len(sliced) >= self._time_steps:
            sliced_copy = sliced.copy()
            sliced_copy[real_inputs] = self._real_scalers[identifier].transform(
                sliced_copy[real_inputs].values)
            df_list.append(sliced_copy)

    output = pd.concat(df_list, axis=0)

    # Format categorical inputs.
    for col in categorical_inputs:
        # BUG FIX: encode from `output` (the filtered rows), not the full
        # `df`. Encoding df[col] yields an array of len(df), which is longer
        # than `output` whenever any trajectory is dropped by the length
        # filter above, and the assignment then raises a ValueError.
        string_df = output[col].apply(str)
        output[col] = self._cat_scalers[col].transform(string_df)

    return output
def _get_single_col_by_type(self, input_type):
    """Returns name of single column for input type."""
    column_defs = self.column_definition
    return utils.get_single_col_by_input_type(input_type, column_defs)
def _get_single_col_by_type(self, input_type):
    """Returns the name of the single column matching `input_type`.

    Args:
      input_type: InputTypes member to look up in the column definition.

    Returns:
      The matching column name; uniqueness is presumably enforced by
      `utils.get_single_col_by_input_type` — confirm against that helper.
    """
    return utils.get_single_col_by_input_type(input_type,
                                              self.column_definition)