def predict(self, when_data=None, when=None):
    """
    Predict given when conditions.

    :param when_data: a dataframe
    :param when: a dictionary
    :return: a complete dataframe
    """
    if when is not None:
        when_dict = {key: [when[key]] for key in when}
        when_data = pandas.DataFrame(when_dict)

    when_data_ds = DataSource(when_data, self.config)
    when_data_ds.encoders = self._mixer.encoders

    main_mixer_predictions = self._mixer.predict(when_data_ds)

    if CONFIG.HELPER_MIXERS and self.has_boosting_mixer:
        for output_column in main_mixer_predictions:
            if self._helper_mixers is not None and output_column in self._helper_mixers:
                if (self._helper_mixers[output_column]['accuracy'] > 1.00 * self.train_accuracy[output_column]['value']) or CONFIG.FORCE_HELPER_MIXERS:
                    helper_mixer_predictions = self._helper_mixers[output_column]['model'].predict(when_data_ds, [output_column])
                    main_mixer_predictions[output_column] = helper_mixer_predictions[output_column]

    return main_mixer_predictions
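# A minimal usage sketch, assuming an already-trained `predictor` and
# hypothetical column names ('sqft', 'location'); it shows the two
# equivalent entry points (a `when` dict is expanded into a single-row
# DataFrame internally before being fed through the mixer).
import pandas

predictions = predictor.predict(when={'sqft': 1200, 'location': 'downtown'})

batch = pandas.DataFrame({'sqft': [1200, 900], 'location': ['downtown', 'suburb']})
predictions = predictor.predict(when_data=batch)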
def test_prepare_encoders(self):
    df, config = self.df, self.config

    ds = DataSource(df, config)
    assert ds.enable_cache

    encoders = ds.encoders
    for col in ['x1', 'x2']:
        assert isinstance(encoders[col], NumericEncoder)
        assert encoders[col]._prepared is True
        assert encoders[col].is_target is False
        assert encoders[col]._type == 'int'

    assert isinstance(encoders['y'], CategoricalAutoEncoder)
    assert encoders['y']._prepared is True
    assert encoders['y'].is_target is True
    assert encoders['y'].onehot_encoder._prepared is True
    assert encoders['y'].onehot_encoder.is_target is True
    assert encoders['y'].use_autoencoder is False

    encoded_column_x1 = ds.get_encoded_column_data('x1')
    assert isinstance(encoded_column_x1, Tensor)
    assert encoded_column_x1.shape[0] == len(df)

    encoded_column_x2 = ds.get_encoded_column_data('x2')
    assert isinstance(encoded_column_x2, Tensor)
    assert encoded_column_x2.shape[0] == len(df)

    encoded_column_y = ds.get_encoded_column_data('y')
    assert isinstance(encoded_column_y, Tensor)
    assert encoded_column_y.shape[0] == len(df)
def test_encoded_cache(self):
    df, config = self.df, self.config

    ds = DataSource(df, config)
    assert ds.enable_cache

    for column in ['x1', 'x2', 'y']:
        assert column not in ds.encoded_cache
        encoded_column = ds.get_encoded_column_data(column)
        assert (ds.encoded_cache[column] == encoded_column).all()
def predict(self, when_data=None, when=None):
    """
    Predict given when conditions.

    :param when_data: a dataframe
    :param when: a dictionary
    :return: a complete dataframe
    """
    if when is not None:
        when_dict = {key: [when[key]] for key in when}
        when_data = pandas.DataFrame(when_dict)

    when_data_ds = DataSource(when_data, self.config)
    when_data_ds.encoders = self._mixer.encoders

    return self._mixer.predict(when_data_ds)
def learn(self, from_data, test_data=None):
    """
    Train and save a model (you can use this to retrain a model from data).

    :param from_data: DataFrame or DataSource
        The data to learn from
    :param test_data: DataFrame or DataSource
        The data to test accuracy and learn_error from
    """
    device, _available_devices = get_devices()
    log.info(f'Computing device used: {device}')

    # Generate the configuration and set the order for the input and output columns
    if self._generate_config is True:
        self._input_columns = [col for col in from_data if col not in self._output_columns]
        self.config = {
            'input_features': [{'name': col, 'type': self._type_map(from_data, col)} for col in self._input_columns],
            'output_features': [{'name': col, 'type': self._type_map(from_data, col)} for col in self._output_columns]
        }
        self.config = predictor_config_schema.validate(self.config)
        log.info('Automatically generated a configuration')
        log.info(self.config)
    else:
        self._output_columns = [col['name'] for col in self.config['output_features']]
        self._input_columns = [col['name'] for col in self.config['input_features']]

    if isinstance(from_data, pandas.DataFrame):
        train_ds = DataSource(from_data, self.config)
    elif isinstance(from_data, DataSource):
        train_ds = from_data
    else:
        raise TypeError(':from_data: must be either DataFrame or DataSource')

    nr_subsets = 3 if len(train_ds) > 100 else 1

    if test_data is None:
        test_ds = train_ds.subset(0.1)
    elif isinstance(test_data, pandas.DataFrame):
        test_ds = train_ds.make_child(test_data)
    elif isinstance(test_data, DataSource):
        test_ds = test_data
    else:
        raise TypeError(':test_data: must be either DataFrame or DataSource')

    train_ds.create_subsets(nr_subsets)
    test_ds.create_subsets(nr_subsets)

    train_ds.train()
    test_ds.train()

    mixer_class = self.config['mixer']['class']
    mixer_kwargs = self.config['mixer']['kwargs']
    self._mixer = mixer_class(**mixer_kwargs)
    self._mixer.fit(train_ds=train_ds, test_ds=test_ds)
    self.train_accuracy = self._mixer.calculate_accuracy(test_ds)

    return self
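# A hypothetical end-to-end sketch (the CSV path, the 'price' target, and
# the Predictor constructor arguments are assumptions, not taken from the
# code above): train on a DataFrame and let learn() hold out 10% for testing.
import pandas

df = pandas.read_csv('houses.csv')       # assumed dataset
predictor = Predictor(output=['price'])  # assumed constructor signature
predictor.learn(from_data=df)            # auto-generates a configuration
print(predictor.train_accuracy)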
def test_fit_and_predict(self):
    config = {
        'input_features': [
            {'name': 'x', 'type': 'numeric'},
            {'name': 'y', 'type': 'numeric'}
        ],
        'output_features': [
            {'name': 'z', 'type': 'numeric'},
            {'name': 'z`', 'type': 'categorical'}
        ]
    }
    config = predictor_config_schema.validate(config)

    N = 100
    data = {
        'x': [i for i in range(N)],
        'y': [random.randint(i, i + 20) for i in range(N)]
    }
    nums = [data['x'][i] * data['y'][i] for i in range(N)]
    data['z'] = [i + 0.5 for i in range(N)]
    data['z`'] = ['low' if i < 50 else 'high' for i in nums]

    data_frame = pandas.DataFrame(data)

    train_ds = DataSource(data_frame, config)
    train_ds.create_subsets(1)
    mixer = NnMixer(stop_training_after_seconds=50)
    mixer.fit(train_ds, train_ds)

    test_ds = train_ds.make_child(data_frame[['x', 'y']])
    predictions = mixer.predict(test_ds)
def calculate_accuracy(self, from_data):
    """
    Calculate the accuracy of the model.

    :param from_data: a dataframe
    :return accuracies: dictionary of accuracies
    """
    if self._mixer is None:
        logging.error("Please train the model before calculating accuracy")
        return

    ds = from_data if isinstance(from_data, DataSource) else DataSource(from_data, self.config)
    predictions = self._mixer.predict(ds, include_extra_data=True)
    accuracies = {}

    for output_column in self._output_columns:
        real = list(map(str, ds.get_column_original_data(output_column)))
        predicted = list(map(str, predictions[output_column]['predictions']))

        weight_map = None
        if 'weights' in ds.get_column_config(output_column):
            weight_map = ds.get_column_config(output_column)['weights']

        accuracy = self.apply_accuracy_function(
            ds.get_column_config(output_column)['type'],
            real,
            predicted,
            weight_map=weight_map
        )

        if ds.get_column_config(output_column)['type'] == COLUMN_DATA_TYPES.NUMERIC:
            # Also try decoding the predictions in log space and keep whichever scores better
            ds.encoders[output_column].decode_log = True
            predicted = ds.get_decoded_column_data(output_column, predictions[output_column]['encoded_predictions'])

            alternative_accuracy = self.apply_accuracy_function(
                ds.get_column_config(output_column)['type'],
                real,
                predicted,
                weight_map=weight_map
            )

            if alternative_accuracy['value'] > accuracy['value']:
                accuracy = alternative_accuracy
            else:
                ds.encoders[output_column].decode_log = False

        accuracies[output_column] = accuracy

    return accuracies
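# A minimal usage sketch (not from the source): `predictor` is a trained
# Predictor and `test_df` a held-out DataFrame; both names are assumptions.
# The result maps each output column to the scoring function used and its value.
acc = predictor.calculate_accuracy(test_df)
# e.g. {'price': {'function': 'r2_score', 'value': 0.87}}  (illustrative values)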
def predict(self, when_data=None, when=None):
    """
    Predict given when conditions.

    :param when_data: pandas.DataFrame
    :param when: dict
    :return: pandas.DataFrame
    """
    device, _available_devices = get_devices()
    log.info(f'Computing device used: {device}')

    if when is not None:
        when_dict = {key: [when[key]] for key in when}
        when_data = pandas.DataFrame(when_dict)

    when_data_ds = DataSource(when_data, self.config, prepare_encoders=False)
    when_data_ds.eval()

    kwargs = {'include_extra_data': self.config.get('include_extra_data', False)}

    return self._mixer.predict(when_data_ds, **kwargs)
def test_transformed_cache(self):
    df, config = self.df, self.config

    ds = DataSource(df, config)
    assert ds.enable_cache
    assert ds.transformed_cache is None

    encoded_row = ds[0]  # This creates ds.transformed_cache
    assert len(ds.transformed_cache) == len(df)
    assert ds.transformed_cache[0] == encoded_row

    for i in range(1, len(df)):
        assert ds.transformed_cache[i] is None
        encoded_row = ds[i]
        assert ds.transformed_cache[i] == encoded_row

    alternate_config = copy(config)
    alternate_config['data_source']['cache_transformed_data'] = False
    ds = DataSource(df, alternate_config)
    assert not ds.enable_cache

    for i in range(len(df)):
        encoded_row = ds[i]
        assert ds.transformed_cache is None
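# The test above exercises a lazy per-row cache: the cache list is allocated
# on first access and filled entry by entry. Below is a minimal, self-contained
# sketch of that pattern; the class and names are illustrative, not the library's.
class LazyRowCache:
    """Cache an expensive per-row transform, computing each row at most once."""

    def __init__(self, rows, transform):
        self._rows = rows
        self._transform = transform
        self.transformed_cache = None  # allocated lazily on first access

    def __getitem__(self, i):
        if self.transformed_cache is None:
            self.transformed_cache = [None] * len(self._rows)
        if self.transformed_cache[i] is None:
            self.transformed_cache[i] = self._transform(self._rows[i])
        return self.transformed_cache[i]

cache = LazyRowCache([1, 2, 3], transform=lambda x: x * 10)
assert cache.transformed_cache is None
assert cache[0] == 10                              # first access fills slot 0
assert cache.transformed_cache == [10, None, None]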
def calculate_accuracy(self, from_data):
    """
    Calculate the accuracy of the model.

    :param from_data: a dataframe
    :return accuracies: dictionary of accuracies
    """
    if self._mixer is None:
        logging.error("Please train the model before calculating accuracy")
        return

    ds = from_data if isinstance(from_data, DataSource) else DataSource(from_data, self.config)
    predictions = self._mixer.predict(ds, include_encoded_predictions=True)
    accuracies = {}

    for output_column in self._mixer.output_column_names:
        properties = ds.get_column_config(output_column)
        if properties['type'] == 'categorical':
            accuracies[output_column] = {
                'function': 'accuracy_score',
                'value': accuracy_score(
                    list(map(str, ds.get_column_original_data(output_column))),
                    list(map(str, predictions[output_column]['predictions']))
                )
            }
        else:
            # Note: we re-encode the predictions instead of using `encoded_predictions`
            # directly, since the values in `encoded_predictions` are never perfectly
            # 0 or 1, which leads to rather large unwarranted differences in the r2
            # score. Re-encoding the predictions means all "flag" values
            # (sign, isnull, iszero) become exactly 1 or 0.
            encoded_predictions = ds.encoders[output_column].encode(predictions[output_column]['predictions'])
            accuracies[output_column] = {
                'function': 'r2_score',
                'value': r2_score(ds.get_encoded_column_data(output_column), encoded_predictions)
            }

    return accuracies
def calculate_accuracy(self, from_data):
    """
    Calculate the accuracy of the model.

    :param from_data: a dataframe
    :return accuracies: dictionary of accuracies
    """
    if self._mixer is None:
        logging.error("Please train the model before calculating accuracy")
        return

    ds = from_data if isinstance(from_data, DataSource) else DataSource(from_data, self.config)
    predictions = self._mixer.predict(ds, include_encoded_predictions=True)
    accuracies = {}

    for output_column in self._mixer.output_column_names:
        properties = ds.get_column_config(output_column)
        if properties['type'] == 'categorical':
            accuracies[output_column] = accuracy_score(
                ds.get_column_original_data(output_column),
                predictions[output_column]['predictions']
            )
        else:
            accuracies[output_column] = r2_score(
                ds.get_encoded_column_data(output_column),
                predictions[output_column]['encoded_predictions']
            )

    return accuracies
def learn(self, from_data, test_data=None, callback_on_iter=None,
          eval_every_x_epochs=20, stop_training_after_seconds=None,
          stop_model_building_after_seconds=None):
    """
    Train and save a model (you can use this to retrain a model from data).

    :param from_data: (Pandas DataFrame) The data to learn from
    :param test_data: (Pandas DataFrame) The data to test accuracy and learn_error from
    :param callback_on_iter: A function that can be called on every X evaluation cycle
    :param eval_every_x_epochs: How many epochs to wait between calculating the test error and accuracy
    :return: None
    """

    # This is a helper function that will help us auto-determine roughly what data types are in each column
    # NOTE: this assumes the data is clean and will only return types for 'CATEGORICAL', 'NUMERIC' and 'TEXT'
    def type_map(col_name):
        col_pd_type = from_data[col_name].dtype
        col_pd_type = str(col_pd_type)

        if col_pd_type in ['int64', 'float64', 'timedelta']:
            return COLUMN_DATA_TYPES.NUMERIC
        elif col_pd_type in ['bool', 'category']:
            return COLUMN_DATA_TYPES.CATEGORICAL
        else:
            # If the number of uniques is less than 100, or less than 10% of
            # the total number of rows, keep the column as categorical
            unique = from_data[col_name].nunique()
            if unique < 100 or unique < len(from_data[col_name]) / 10:
                return COLUMN_DATA_TYPES.CATEGORICAL
            # Otherwise, assume it's text
            return COLUMN_DATA_TYPES.TEXT

    # Generate the configuration and set the order for the input and output columns
    if self._generate_config is True:
        self._input_columns = [col for col in from_data if col not in self._output_columns]
        self.config = {
            'input_features': [{'name': col, 'type': type_map(col)} for col in self._input_columns],
            'output_features': [{'name': col, 'type': type_map(col)} for col in self._output_columns]
        }
        logging.info('Automatically generated a configuration')
        logging.info(self.config)
    else:
        self._output_columns = [col['name'] for col in self.config['output_features']]
        self._input_columns = [col['name'] for col in self.config['input_features']]

    # @TODO Make Cross Entropy Loss work with multiple outputs
    if (len(self.config['output_features']) == 1
            and self.config['output_features'][0]['type'] == COLUMN_DATA_TYPES.CATEGORICAL):
        is_categorical_output = True
    else:
        is_categorical_output = False

    if stop_training_after_seconds is None:
        stop_training_after_seconds = round(from_data.shape[0] * from_data.shape[1] / 5)

    if stop_model_building_after_seconds is None:
        stop_model_building_after_seconds = stop_training_after_seconds * 3

    from_data_ds = DataSource(from_data, self.config)
    if test_data is not None:
        test_data_ds = DataSource(test_data, self.config)
    else:
        test_data_ds = from_data_ds.extractRandomSubset(0.1)

    from_data_ds.training = True

    mixer_class = NnMixer
    mixer_params = {}

    if 'mixer' in self.config:
        if 'class' in self.config['mixer']:
            mixer_class = self.config['mixer']['class']
        if 'attrs' in self.config['mixer']:
            mixer_params = self.config['mixer']['attrs']

    # Initialize data sources
    nr_subsets = 3
    from_data_ds.prepare_encoders()
    from_data_ds.create_subsets(nr_subsets)
    try:
        mixer_class({}).fit_data_source(from_data_ds)
    except Exception:
        # Not all mixers might require this
        pass

    input_size = len(from_data_ds[0][0])
    training_data_length = len(from_data_ds)

    test_data_ds.transformer = from_data_ds.transformer
    test_data_ds.encoders = from_data_ds.encoders
    test_data_ds.output_weights = from_data_ds.output_weights
    test_data_ds.create_subsets(nr_subsets)

    if 'optimizer' in self.config:
        optimizer = self.config['optimizer']()

        while True:
            training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

            # Some heuristics...
            if training_time_per_iteration > input_size:
                if training_time_per_iteration > min((training_data_length / (4 * input_size)), 16 * input_size):
                    break

            optimizer.total_trials = optimizer.total_trials - 1
            if optimizer.total_trials < 8:
                optimizer.total_trials = 8
                break

        training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

        best_parameters = optimizer.evaluate(
            lambda dynamic_parameters: Predictor.evaluate_mixer(
                mixer_class,
                mixer_params,
                from_data_ds,
                test_data_ds,
                dynamic_parameters,
                is_categorical_output,
                max_training_time=training_time_per_iteration,
                max_epochs=None
            )
        )

        logging.info(f'Using hyperparameter set: {best_parameters}')
    else:
        best_parameters = {}

    mixer = mixer_class(best_parameters, is_categorical_output=is_categorical_output)
    self._mixer = mixer

    for param in mixer_params:
        if hasattr(mixer, param):
            setattr(mixer, param, mixer_params[param])
        else:
            logging.warning(
                'trying to set mixer param {param} but mixer class {mixerclass} does not have such a parameter'
                .format(param=param, mixerclass=str(type(mixer)))
            )

    started = time.time()
    epoch = 0
    eval_next_on_epoch = eval_every_x_epochs
    stop_training = False

    for subset_iteration in [1, 2]:
        if stop_training:
            break
        for subset_id in [*from_data_ds.subsets.keys()]:
            if stop_training:
                break

            subset_train_ds = from_data_ds.subsets[subset_id]
            subset_test_ds = test_data_ds.subsets[subset_id]

            lowest_error = None
            last_test_error = None
            last_subset_test_error = None
            test_error_delta_buff = []
            subset_test_error_delta_buff = []
            best_model = None

            # Iterate over iter_fit and see what the epoch and mixer error is
            for epoch, training_error in enumerate(mixer.iter_fit(subset_train_ds)):
                logging.info('training iteration {iter_i}, error {error}'.format(iter_i=epoch, error=training_error))

                if epoch >= eval_next_on_epoch:
                    # Prime the model on each subset for a bit
                    if subset_iteration == 1:
                        break

                    eval_next_on_epoch += eval_every_x_epochs

                    test_error = mixer.error(test_data_ds)
                    subset_test_error = mixer.error(subset_test_ds)

                    if lowest_error is None or test_error < lowest_error:
                        lowest_error = test_error
                        best_model = mixer.get_model_copy()

                    if last_subset_test_error is None:
                        subset_test_error_delta_buff.append(0)
                    else:
                        subset_test_error_delta_buff.append(last_subset_test_error - subset_test_error)

                    if last_test_error is None:
                        test_error_delta_buff.append(0)
                    else:
                        test_error_delta_buff.append(last_test_error - test_error)

                    last_test_error = test_error

                    delta_mean = np.mean(test_error_delta_buff[-10:])
                    subset_delta_mean = np.mean(subset_test_error_delta_buff[-10:])

                    if callback_on_iter is not None:
                        callback_on_iter(epoch, training_error, test_error, delta_mean,
                                         self.calculate_accuracy(test_data_ds))

                    # Stop if the model is overfitting
                    if delta_mean < 0 and len(test_error_delta_buff) > 9:
                        stop_training = True

                    # Stop if we're past the time limit allocated for training
                    if (time.time() - started) > stop_training_after_seconds:
                        stop_training = True

                    # If the training subset is overfitting on its associated testing subset
                    if subset_delta_mean < 0 and len(subset_test_error_delta_buff) > 9:
                        break

                    if stop_training:
                        mixer.update_model(best_model)
                        self._mixer = mixer
                        self.train_accuracy = self.calculate_accuracy(test_data_ds)
                        self.overall_certainty = mixer.overall_certainty()
                        if subset_id == 'full':
                            logging.info('Finished training model !')
                        else:
                            logging.info('Finished fitting on {subset_id} of {no_subsets} subsets'
                                         .format(subset_id=subset_id,
                                                 no_subsets=len(from_data_ds.subsets.keys())))
                        break

    self._mixer.encoders = from_data_ds.encoders
    return self
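# The loop above stops once the rolling mean of test-error deltas turns
# negative (a positive delta means the error decreased between evaluations).
# A stripped-down sketch of that heuristic with illustrative numbers:
import numpy as np

def should_stop(test_errors, window=10):
    """True once the mean of the recent error deltas turns negative,
    i.e. the test error has on average stopped improving."""
    deltas = [prev - curr for prev, curr in zip(test_errors, test_errors[1:])]
    return len(deltas) > 9 and np.mean(deltas[-window:]) < 0

errors = [0.9, 0.7, 0.6, 0.55, 0.6, 0.66, 0.72, 0.8, 0.9, 1.0, 1.1]
print(should_stop(errors))  # True: the error has been rising for a while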
def learn(self, from_data, test_data=None, callback_on_iter=None,
          eval_every_x_epochs=20, stop_training_after_seconds=None,
          stop_model_building_after_seconds=None):
    """
    Train and save a model (you can use this to retrain a model from data).

    :param from_data: (Pandas DataFrame) The data to learn from
    :param test_data: (Pandas DataFrame) The data to test accuracy and learn_error from
    :param callback_on_iter: A function that can be called on every X evaluation cycle
    :param eval_every_x_epochs: How many epochs to wait between calculating the test error and accuracy
    :return: None
    """

    # This is a helper function that will help us auto-determine roughly what data types are in each column
    # NOTE: this assumes the data is clean and will only return types for 'CATEGORICAL', 'NUMERIC' and 'TEXT'
    def type_map(col_name):
        col_pd_type = from_data[col_name].dtype
        col_pd_type = str(col_pd_type)

        if col_pd_type in ['int64', 'float64', 'timedelta']:
            return COLUMN_DATA_TYPES.NUMERIC
        elif col_pd_type in ['bool', 'category']:
            return COLUMN_DATA_TYPES.CATEGORICAL
        else:
            # If the number of uniques is less than 100, or less than 10% of
            # the total number of rows, keep the column as categorical
            unique = from_data[col_name].nunique()
            if unique < 100 or unique < len(from_data[col_name]) / 10:
                return COLUMN_DATA_TYPES.CATEGORICAL
            # Otherwise, assume it's text
            return COLUMN_DATA_TYPES.TEXT

    # Generate the configuration and set the order for the input and output columns
    if self._generate_config is True:
        self._input_columns = [col for col in from_data if col not in self._output_columns]
        self.config = {
            'input_features': [{'name': col, 'type': type_map(col)} for col in self._input_columns],
            'output_features': [{'name': col, 'type': type_map(col)} for col in self._output_columns]
        }
        self.config = predictor_config_schema.validate(self.config)
        logging.info('Automatically generated a configuration')
        logging.info(self.config)
    else:
        self._output_columns = [col['name'] for col in self.config['output_features']]
        self._input_columns = [col['name'] for col in self.config['input_features']]

    if stop_training_after_seconds is None:
        stop_training_after_seconds = round(from_data.shape[0] * from_data.shape[1] / 5)

    if stop_model_building_after_seconds is None:
        stop_model_building_after_seconds = stop_training_after_seconds * 3

    from_data_ds = DataSource(from_data, self.config)
    if test_data is not None:
        test_data_ds = DataSource(test_data, self.config)
    else:
        test_data_ds = from_data_ds.extractRandomSubset(0.1)

    from_data_ds.training = True

    mixer_class = NnMixer
    mixer_params = {}

    if 'mixer' in self.config:
        if 'class' in self.config['mixer']:
            mixer_class = self.config['mixer']['class']
        if 'attrs' in self.config['mixer']:
            mixer_params = self.config['mixer']['attrs']

    # Initialize data sources
    if len(from_data_ds) > 100:
        nr_subsets = 3
    else:
        # Don't use k-fold cross validation for very small input sizes
        nr_subsets = 1

    from_data_ds.prepare_encoders()
    from_data_ds.create_subsets(nr_subsets)
    try:
        mixer_class({}).fit_data_source(from_data_ds)
    except Exception:
        # Not all mixers might require this
        pass

    input_size = len(from_data_ds[0][0])
    training_data_length = len(from_data_ds)

    test_data_ds.transformer = from_data_ds.transformer
    test_data_ds.encoders = from_data_ds.encoders
    test_data_ds.output_weights = from_data_ds.output_weights
    test_data_ds.create_subsets(nr_subsets)

    if 'optimizer' in self.config:
        optimizer = self.config['optimizer']()

        while True:
            training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

            # Some heuristics...
            if training_time_per_iteration > input_size:
                if training_time_per_iteration > min((training_data_length / (4 * input_size)), 16 * input_size):
                    break

            optimizer.total_trials = optimizer.total_trials - 1
            if optimizer.total_trials < 8:
                optimizer.total_trials = 8
                break

        training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

        best_parameters = optimizer.evaluate(
            lambda dynamic_parameters: Predictor.evaluate_mixer(
                self.config,
                mixer_class,
                mixer_params,
                from_data_ds,
                test_data_ds,
                dynamic_parameters,
                max_training_time=training_time_per_iteration,
                max_epochs=None
            )
        )

        logging.info(f'Using hyperparameter set: {best_parameters}')
    else:
        best_parameters = {}

    self._mixer = mixer_class(best_parameters, self.config)

    for param in mixer_params:
        if hasattr(self._mixer, param):
            setattr(self._mixer, param, mixer_params[param])
        else:
            logging.warning(
                'trying to set mixer param {param} but mixer class {mixerclass} does not have such a parameter'
                .format(param=param, mixerclass=str(type(self._mixer)))
            )

    def callback_on_iter_w_acc(epoch, training_error, test_error, delta_mean):
        callback_on_iter(epoch, training_error, test_error, delta_mean,
                         self.calculate_accuracy(test_data_ds))

    self._mixer.fit(
        train_ds=from_data_ds,
        test_ds=test_data_ds,
        callback=callback_on_iter_w_acc,
        stop_training_after_seconds=stop_training_after_seconds,
        eval_every_x_epochs=eval_every_x_epochs
    )

    self.train_accuracy = self.calculate_accuracy(test_data_ds)

    # Train some alternative mixers
    if (CONFIG.HELPER_MIXERS and self.has_boosting_mixer
            and (CONFIG.FORCE_HELPER_MIXERS or len(from_data_ds) < 12 * pow(10, 3))):
        try:
            self._helper_mixers = self.train_helper_mixers(
                from_data_ds,
                test_data_ds,
                self._mixer.quantiles[self._mixer.quantiles_pair[0] + 1:self._mixer.quantiles_pair[1] + 1]
            )
        except Exception as e:
            logging.warning(f'Failed to train helper mixers with error: {e}')

    return self
    'output_features': [{
        'name': 'z',
        'type': 'categorical',
        # 'encoder_path': 'lightwood.encoders.categorical.categorical'
    }]
}

data = {
    'x': [i for i in range(10)],
    'y': [random.randint(i, i + 20) for i in range(10)]
}
nums = [data['x'][i] * data['y'][i] for i in range(10)]
data['z'] = ['low' if i < 50 else 'high' for i in nums]

data_frame = pandas.DataFrame(data)
print(data_frame)

ds = DataSource(data_frame, config)
input_ds_for_prediction = DataSource(data_frame[['x', 'y']], config)

mixer = SkLearnMixer(input_column_names=['x', 'y'], output_column_names=['z'])

for i in mixer.iter_fit(ds):
    print('training')

data_encoded = mixer.fit(ds)
predictions = mixer.predict(input_ds_for_prediction, ['z'])
print(predictions)

#####################################
# For Regression                    #
# Test Case: 2                      #
#####################################
def learn(self, from_data, test_data=None, callback_on_iter=None,
          eval_every_x_epochs=20, stop_training_after_seconds=None,
          stop_model_building_after_seconds=None):
    """
    Train and save a model (you can use this to retrain a model from data).

    :param from_data: (Pandas DataFrame) The data to learn from
    :param test_data: (Pandas DataFrame) The data to test accuracy and learn_error from
    :param callback_on_iter: A function that can be called on every X evaluation cycle
    :param eval_every_x_epochs: How many epochs to wait between calculating the test error and accuracy
    :return: None
    """

    # This is a helper function that will help us auto-determine roughly what data types are in each column
    # NOTE: this assumes the data is clean and will only return types for 'CATEGORICAL', 'NUMERIC' and 'TEXT'
    def type_map(col_name):
        col_pd_type = from_data[col_name].dtype
        col_pd_type = str(col_pd_type)

        if col_pd_type in ['int64', 'float64', 'timedelta']:
            return COLUMN_DATA_TYPES.NUMERIC
        elif col_pd_type in ['bool', 'category']:
            return COLUMN_DATA_TYPES.CATEGORICAL
        else:
            # If the number of uniques is less than 100, or less than 10% of
            # the total number of rows, keep the column as categorical
            unique = from_data[col_name].nunique()
            if unique < 100 or unique < len(from_data[col_name]) / 10:
                return COLUMN_DATA_TYPES.CATEGORICAL
            # Otherwise, assume it's text
            return COLUMN_DATA_TYPES.TEXT

    # Generate the configuration and set the order for the input and output columns
    if self._generate_config is True:
        self._input_columns = [col for col in from_data if col not in self._output_columns]
        self.config = {
            'input_features': [{'name': col, 'type': type_map(col)} for col in self._input_columns],
            'output_features': [{'name': col, 'type': type_map(col)} for col in self._output_columns]
        }
        self.config = predictor_config_schema.validate(self.config)
        logging.info('Automatically generated a configuration')
        logging.info(self.config)
    else:
        self._output_columns = [col['name'] for col in self.config['output_features']]
        self._input_columns = [col['name'] for col in self.config['input_features']]

    if stop_training_after_seconds is None:
        stop_training_after_seconds = round(from_data.shape[0] * from_data.shape[1] / 5)

    if stop_model_building_after_seconds is None:
        stop_model_building_after_seconds = stop_training_after_seconds * 3

    from_data_ds = DataSource(from_data, self.config)
    if test_data is not None:
        test_data_ds = DataSource(test_data, self.config)
    else:
        test_data_ds = from_data_ds.extractRandomSubset(0.1)

    from_data_ds.training = True

    mixer_class = NnMixer
    mixer_params = {}

    if 'mixer' in self.config:
        if 'class' in self.config['mixer']:
            mixer_class = self.config['mixer']['class']
        if 'attrs' in self.config['mixer']:
            mixer_params = self.config['mixer']['attrs']

    # Initialize data sources
    nr_subsets = 3
    from_data_ds.prepare_encoders()
    from_data_ds.create_subsets(nr_subsets)
    try:
        mixer_class({}).fit_data_source(from_data_ds)
    except Exception:
        # Not all mixers might require this
        pass

    input_size = len(from_data_ds[0][0])
    training_data_length = len(from_data_ds)

    test_data_ds.transformer = from_data_ds.transformer
    test_data_ds.encoders = from_data_ds.encoders
    test_data_ds.output_weights = from_data_ds.output_weights
    test_data_ds.create_subsets(nr_subsets)

    if 'optimizer' in self.config:
        optimizer = self.config['optimizer']()

        while True:
            training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

            # Some heuristics...
            if training_time_per_iteration > input_size:
                if training_time_per_iteration > min((training_data_length / (4 * input_size)), 16 * input_size):
                    break

            optimizer.total_trials = optimizer.total_trials - 1
            if optimizer.total_trials < 8:
                optimizer.total_trials = 8
                break

        training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

        best_parameters = optimizer.evaluate(
            lambda dynamic_parameters: Predictor.evaluate_mixer(
                self.config,
                mixer_class,
                mixer_params,
                from_data_ds,
                test_data_ds,
                dynamic_parameters,
                max_training_time=training_time_per_iteration,
                max_epochs=None
            )
        )

        logging.info(f'Using hyperparameter set: {best_parameters}')
    else:
        best_parameters = {}

    if (CONFIG.HELPER_MIXERS and self.has_boosting_mixer
            and (CONFIG.FORCE_HELPER_MIXERS or len(from_data_ds) < 12 * pow(10, 3))):
        try:
            self._helper_mixers = self.train_helper_mixers(from_data_ds, test_data_ds)
        except Exception as e:
            logging.warning(f'Failed to train helper mixers with error: {e}')

    mixer = mixer_class(best_parameters, self.config)
    self._mixer = mixer

    for param in mixer_params:
        if hasattr(mixer, param):
            setattr(mixer, param, mixer_params[param])
        else:
            logging.warning(
                'trying to set mixer param {param} but mixer class {mixerclass} does not have such a parameter'
                .format(param=param, mixerclass=str(type(mixer)))
            )

    started = time.time()
    log_reasure = time.time()
    first_run = True
    stop_training = False

    for subset_iteration in [1, 2]:
        if stop_training:
            break
        subset_id_arr = [*from_data_ds.subsets.keys()]
        for subset_id in subset_id_arr:
            started_subset = time.time()
            if stop_training:
                break

            subset_train_ds = from_data_ds.subsets[subset_id]
            subset_test_ds = test_data_ds.subsets[subset_id]

            lowest_error = None
            last_test_error = None
            last_subset_test_error = None
            test_error_delta_buff = []
            subset_test_error_delta_buff = []
            best_model = None
            best_selfaware_model = None

            # Iterate over iter_fit and see what the epoch and mixer error is
            for epoch, training_error in enumerate(mixer.iter_fit(subset_train_ds, initialize=first_run, subset_id=subset_id)):
                first_run = False

                # Log this every now and then so that the user knows it's running
                if (int(time.time()) - log_reasure) > 30:
                    log_reasure = time.time()
                    logging.info(f'Lightwood training, iteration {epoch}, training error {training_error}')

                # Prime the model on each subset for a bit
                if subset_iteration == 1:
                    break

                # Once the training error is getting smaller, enable dropout to teach the network to predict without certain features
                if subset_iteration > 1 and training_error < 0.4 and not from_data_ds.enable_dropout:
                    eval_every_x_epochs = max(1, int(eval_every_x_epochs / 2))
                    logging.info('Enabled dropout !')
                    from_data_ds.enable_dropout = True
                    lowest_error = None
                    last_test_error = None
                    last_subset_test_error = None
                    test_error_delta_buff = []
                    subset_test_error_delta_buff = []
                    continue

                # If the selfaware network isn't able to train, go back to the original network
                if (subset_iteration > 1
                        and (np.isnan(training_error) or np.isinf(training_error) or training_error > pow(10, 5))
                        and not mixer.stop_selfaware_training):
                    mixer.start_selfaware_training = False
                    mixer.stop_selfaware_training = True
                    lowest_error = None
                    last_test_error = None
                    last_subset_test_error = None
                    test_error_delta_buff = []
                    subset_test_error_delta_buff = []
                    continue

                # Once we are past the priming/warmup period, start training the selfaware network
                if (subset_iteration > 1 and not mixer.is_selfaware
                        and self.config['mixer']['selfaware']
                        and not mixer.stop_selfaware_training
                        and training_error < 0.35):
                    logging.info('Started selfaware training !')
                    mixer.start_selfaware_training = True
                    lowest_error = None
                    last_test_error = None
                    last_subset_test_error = None
                    test_error_delta_buff = []
                    subset_test_error_delta_buff = []
                    continue

                if epoch % eval_every_x_epochs == 0:
                    test_error = mixer.error(test_data_ds)
                    subset_test_error = mixer.error(subset_test_ds, subset_id=subset_id)
                    logging.info(f'Subset test error: {subset_test_error} on subset {subset_id}, overall test error: {test_error}')

                    if lowest_error is None or test_error < lowest_error:
                        lowest_error = test_error
                        if mixer.is_selfaware:
                            best_selfaware_model = mixer.get_model_copy()
                        else:
                            best_model = mixer.get_model_copy()

                    if last_subset_test_error is not None:
                        subset_test_error_delta_buff.append(last_subset_test_error - subset_test_error)
                    last_subset_test_error = subset_test_error

                    if last_test_error is not None:
                        test_error_delta_buff.append(last_test_error - test_error)
                    last_test_error = test_error

                    delta_mean = np.mean(test_error_delta_buff[-5:])
                    subset_delta_mean = np.mean(subset_test_error_delta_buff[-5:])

                    if callback_on_iter is not None:
                        callback_on_iter(epoch, training_error, test_error, delta_mean,
                                         self.calculate_accuracy(test_data_ds))

                    # Stop if the model is overfitting
                    # if delta_mean <= 0 and len(test_error_delta_buff) > 4:
                    #     stop_training = True

                    # Stop if we're past the time limit allocated for training
                    if (time.time() - started) > stop_training_after_seconds:
                        stop_training = True

                    # If the training subset is overfitting on its associated testing subset
                    if ((subset_delta_mean <= 0 and len(subset_test_error_delta_buff) > 4)
                            or (time.time() - started_subset) > stop_training_after_seconds / len(from_data_ds.subsets.keys())):
                        logging.info('Finished fitting on {subset_id} of {no_subsets} subsets'
                                     .format(subset_id=subset_id,
                                             no_subsets=len(from_data_ds.subsets.keys())))

                        if mixer.is_selfaware:
                            if best_selfaware_model is not None:
                                mixer.update_model(best_selfaware_model)
                        else:
                            mixer.update_model(best_model)

                        if subset_id == subset_id_arr[-1]:
                            stop_training = True
                        elif not stop_training:
                            break

                    if stop_training:
                        if mixer.is_selfaware:
                            mixer.update_model(best_selfaware_model)
                        else:
                            mixer.update_model(best_model)

                        self._mixer = mixer
                        self.train_accuracy = self.calculate_accuracy(test_data_ds)
                        self.overall_certainty = mixer.overall_certainty()

                        logging.info('Finished training model !')
                        break

    self._mixer.build_confidence_normalization_data(test_data_ds)
    self._mixer.encoders = from_data_ds.encoders
    return self
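# The loop above snapshots the best-scoring network in memory
# (`get_model_copy`) and restores it when training stops (`update_model`).
# Below is a minimal sketch of that checkpoint-and-restore pattern in plain
# PyTorch; the model and the error values are illustrative, not the mixer's
# internals.
import copy
import torch

model = torch.nn.Linear(4, 1)                 # stand-in for the mixer's network
fake_test_errors = [0.9, 0.5, 0.7, 0.4, 0.6]  # illustrative evaluation results

best_state, lowest_error = None, None
for test_error in fake_test_errors:
    # ... one round of training would happen here ...
    if lowest_error is None or test_error < lowest_error:
        lowest_error = test_error
        # Deep-copy the weights so later updates can't mutate the snapshot
        best_state = copy.deepcopy(model.state_dict())

# Restore the best snapshot before reporting accuracy or predicting
model.load_state_dict(best_state)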
def learn(self, from_data, test_data=None, callback_on_iter=None,
          eval_every_x_epochs=20, stop_training_after_seconds=3600 * 24 * 5):
    """
    Train and save a model (you can use this to retrain a model from data).

    :param from_data: (Pandas DataFrame) The data to learn from
    :param test_data: (Pandas DataFrame) The data to test accuracy and learn_error from
    :param callback_on_iter: A function that can be called on every X evaluation cycle
    :param eval_every_x_epochs: How many epochs to wait between calculating the test error and accuracy
    :return: None
    """
    self._stop_training_flag = False

    # This is a helper function that will help us auto-determine roughly what data types are in each column
    # NOTE: this assumes the data is clean and will only return types for 'CATEGORICAL', 'NUMERIC' and 'TEXT'
    def type_map(col_name):
        col_pd_type = from_data[col_name].dtype
        col_pd_type = str(col_pd_type)

        if col_pd_type in ['int64', 'float64', 'timedelta']:
            return COLUMN_DATA_TYPES.NUMERIC
        elif col_pd_type in ['bool', 'category']:
            return COLUMN_DATA_TYPES.CATEGORICAL
        else:
            # If the number of uniques is less than 100, or less than 10% of
            # the total number of rows, keep the column as categorical
            unique = from_data[col_name].nunique()
            if unique < 100 or unique < len(from_data[col_name]) / 10:
                return COLUMN_DATA_TYPES.CATEGORICAL
            # Otherwise, assume it's text
            return COLUMN_DATA_TYPES.TEXT

    # Generate the configuration and set the order for the input and output columns
    if self._generate_config is True:
        self._input_columns = [col for col in from_data if col not in self._output_columns]
        self.config = {
            'input_features': [{'name': col, 'type': type_map(col)} for col in self._input_columns],
            'output_features': [{'name': col, 'type': type_map(col)} for col in self._output_columns]
        }
        logging.info('Automatically generated a configuration')
        logging.info(self.config)
    else:
        self._output_columns = [col['name'] for col in self.config['output_features']]
        self._input_columns = [col['name'] for col in self.config['input_features']]

    from_data_ds = DataSource(from_data, self.config)
    if test_data is not None:
        test_data_ds = DataSource(test_data, self.config)
    else:
        test_data_ds = from_data_ds.extractRandomSubset(0.1)

    from_data_ds.training = True

    mixer_params = {}
    if 'mixer' in self.config:
        mixer_class = self.config['mixer']['class']
        if 'attrs' in self.config['mixer']:
            mixer_params = self.config['mixer']['attrs']
    else:
        mixer_class = NnMixer

    mixer = mixer_class()

    for param in mixer_params:
        if hasattr(mixer, param):
            setattr(mixer, param, mixer_params[param])
        else:
            logging.warning(
                'trying to set mixer param {param} but mixer class {mixerclass} does not have such a parameter'
                .format(param=param, mixerclass=str(type(mixer)))
            )

    eval_next_on_epoch = eval_every_x_epochs
    error_delta_buffer = []  # A buffer of the deltas between consecutive test errors
    delta_mean = 0
    last_test_error = None
    lowest_error = None
    lowest_error_epoch = None
    last_good_model = None
    started_training_at = int(time.time())

    # Iterate over iter_fit and see what the epoch and mixer error is
    for epoch, mix_error in enumerate(mixer.iter_fit(from_data_ds)):
        if self._stop_training_flag is True:
            logging.info('Learn has been stopped')
            break

        logging.info('training iteration {iter_i}, error {error}'.format(iter_i=epoch, error=mix_error))

        # See if it needs to be evaluated
        if epoch >= eval_next_on_epoch and test_data_ds:
            eval_next_on_epoch = eval_next_on_epoch + eval_every_x_epochs

            test_error = mixer.error(test_data_ds)

            # Initialize the lowest_error variable if it is not initialized yet
            if lowest_error is None:
                lowest_error = test_error
                lowest_error_epoch = epoch
                is_lowest_error = True
            else:
                # Determine whether this is the lowest test error we have had thus far
                if test_error < lowest_error:
                    lowest_error = test_error
                    lowest_error_epoch = epoch
                    is_lowest_error = True
                else:
                    is_lowest_error = False

            if last_test_error is None:
                last_test_error = test_error

            # If it's the lowest error, make a FULL copy of the mixer so we can return only the best mixer at the end
            if is_lowest_error:
                last_good_model = mixer.get_model_copy()

            delta_error = last_test_error - test_error
            last_test_error = test_error

            # Keep a stream of test-error deltas, so that we can tell when the mixer
            # is starting to overfit: we assume it is overfitting once the mean delta
            # turns negative, i.e. the test error is, on average, increasing
            error_delta_buffer += [delta_error]
            error_delta_buffer = error_delta_buffer[-10:]
            delta_mean = np.mean(error_delta_buffer)

            # Update the mixer and calculate accuracy
            self._mixer = mixer
            accuracy = self.calculate_accuracy(test_data_ds)
            self.train_accuracy = {var: accuracy[var] if accuracy[var] > 0 else 0 for var in accuracy}

            logging.debug('Delta of test error {delta}'.format(delta=delta_mean))

            # If there is a callback function, now is the time to call it
            if callback_on_iter is not None:
                callback_on_iter(epoch, mix_error, test_error, delta_mean)

            # Stop if the model is overfitting (the test error has started to grow),
            # if the test error is already tiny, if we have gone too long without a
            # new lowest error, or if we are past the time limit allocated for training
            if ((delta_mean < 0 and len(error_delta_buffer) > 5 and test_error < 0.1)
                    or (test_error < 0.005)
                    or (lowest_error_epoch + round(max(eval_every_x_epochs * 2 + 2, epoch * 1.2)) < epoch)
                    or ((int(time.time()) - started_training_at) > stop_training_after_seconds)):
                mixer.update_model(last_good_model)
                self.train_accuracy = self.calculate_accuracy(test_data_ds)
                break

    # Make sure that we update the encoders, so that the predictor or parent object can pickle the mixers
    self._mixer.encoders = from_data_ds.encoders

    return self
config = predictor_config_schema.validate(config)

# For Classification
data = {
    'x': [i for i in range(10)],
    'y': [random.randint(i, i + 20) for i in range(10)]
}
nums = [data['x'][i] * data['y'][i] for i in range(10)]
data['z'] = ['low' if i < 50 else 'high' for i in nums]

data_frame = pandas.DataFrame(data)
# print(data_frame)

ds = DataSource(data_frame, config)
ds.prepare_encoders()

predict_input_ds = DataSource(data_frame[['x', 'y']], config)
predict_input_ds.prepare_encoders()
####################

mixer = NnMixer({}, config)

for i in mixer.iter_fit(ds):
    if i < 0.01:
        break

predictions = mixer.predict(predict_input_ds)
print(predictions)

# For Regression
}

# For Classification
data = {
    'x': [i for i in range(10)],
    'y': [random.randint(i, i + 20) for i in range(10)]
}
nums = [data['x'][i] * data['y'][i] for i in range(10)]
data['z'] = ['low' if i < 50 else 'high' for i in nums]

data_frame = pandas.DataFrame(data)
# print(data_frame)

ds = DataSource(data_frame, config)
predict_input_ds = DataSource(data_frame[['x', 'y']], config)
####################

mixer = NnMixer(input_column_names=['x', 'y'], output_column_names=['z'])
data_encoded = mixer.fit(ds)
predictions = mixer.predict(predict_input_ds)
print(predictions)

# For Regression

# GENERATE DATA
###############

config = {