def train(self):
    """Train a lightwood predictor (or reload a saved one) and persist it.

    If ``lmd['model_order_by']`` is a non-empty collection, the train and test
    dataframes are first passed through ``self._create_timeseries_df``.
    If ``lmd['skip_model_training']`` equals ``True``, a predictor is loaded
    from the storage path instead of being trained.  In both cases the save
    path is written to ``lmd['lightwood_data']['save_path']`` and the
    predictor is saved there.
    """
    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; the accuracy log is assumed to belong to the training branch
    # only -- confirm against the original file.
    lmd = self.transaction.lmd
    input_data = self.transaction.input_data

    order_by = lmd['model_order_by']
    if order_by is not None and len(order_by) > 0:
        train_df = self._create_timeseries_df(input_data.train_df)
        test_df = self._create_timeseries_df(input_data.test_df)
    else:
        train_df = input_data.train_df
        test_df = input_data.test_df

    lightwood_config = self._create_lightwood_config()

    # Equality (not truthiness) is kept on purpose: only values equal to
    # True select the "load from disk" branch.
    if lmd['skip_model_training'] == True:  # noqa: E712
        stored_model = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                    lmd['name'] + '_lightwood_data')
        self.predictor = lightwood.Predictor(load_from_path=stored_model)
    else:
        self.predictor = lightwood.Predictor(lightwood_config)
        self.predictor.learn(from_data=train_df, test_data=test_df)
        self.transaction.log.info(
            'Training accuracy of: {}'.format(self.predictor.train_accuracy))

    save_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                             lmd['name'] + '_lightwood_data')
    lmd['lightwood_data']['save_path'] = save_path
    self.predictor.save(path_to=save_path)
def train(self):
    """Train a lightwood predictor (or reload a saved one) and persist it.

    Sets lightwood's ``USE_CUDA`` flag from ``lmd['use_gpu']``, optionally
    reshapes the data with ``self._create_timeseries_df`` when
    ``lmd['model_order_by']`` is non-empty, then either loads a stored
    predictor (``lmd['skip_model_training'] == True``) or trains a new one.
    ``lmd['stop_training_in_x_seconds']`` is forwarded to ``learn`` only when
    it is not ``None``.  The predictor is finally saved under the path stored
    in ``lmd['lightwood_data']['save_path']``.
    """
    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; the accuracy log is assumed to belong to the training branch
    # only -- confirm against the original file.
    lmd = self.transaction.lmd
    input_data = self.transaction.input_data
    log = self.transaction.log

    lightwood.config.config.CONFIG.USE_CUDA = lmd['use_gpu']

    order_by = lmd['model_order_by']
    if order_by is not None and len(order_by) > 0:
        log.debug('Reshaping data into timeseries format, this may take a while !')
        train_df = self._create_timeseries_df(input_data.train_df)
        test_df = self._create_timeseries_df(input_data.test_df)
        log.debug('Done reshaping data into timeseries format !')
    else:
        train_df = input_data.train_df
        test_df = input_data.test_df

    lightwood_config = self._create_lightwood_config()

    if lmd['skip_model_training'] == True:  # noqa: E712 - equality kept on purpose
        stored_model = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                    lmd['name'] + '_lightwood_data')
        self.predictor = lightwood.Predictor(load_from_path=stored_model)
    else:
        self.predictor = lightwood.Predictor(lightwood_config)

        # The time limit is passed only when configured, so that learn()
        # otherwise falls back to its own default for that argument.
        learn_kwargs = {'from_data': train_df, 'test_data': test_df}
        time_limit = lmd['stop_training_in_x_seconds']
        if time_limit is not None:
            learn_kwargs['stop_training_after_seconds'] = time_limit
        learn_kwargs['callback_on_iter'] = self.callback_on_iter
        self.predictor.learn(**learn_kwargs)

        log.info('Training accuracy of: {}'.format(self.predictor.train_accuracy))

    save_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                             lmd['name'] + '_lightwood_data')
    lmd['lightwood_data']['save_path'] = save_path
    self.predictor.save(path_to=save_path)
def predict(self, mode='predict', ignore_columns=None):
    """Run the lightwood predictor on one of the transaction's dataframes.

    Args:
        mode: Which dataframe to use: ``'predict'`` (``input_data.data_frame``),
            ``'validate'`` (``input_data.validation_df``) or ``'test'``
            (``input_data.test_df``).
        ignore_columns: Column names whose values are replaced by ``None`` in
            a deep copy of the dataframe before predicting.  ``None`` (the
            default) means no column is blanked.

    Returns:
        A dict mapping every key of the predictor's output to that entry's
        ``'predictions'`` value.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    # A None default avoids sharing one mutable list between calls.
    if ignore_columns is None:
        ignore_columns = []

    lmd = self.transaction.lmd
    input_data = self.transaction.input_data

    lw_settings = lightwood.config.config.CONFIG
    lw_settings.USE_CUDA = lmd['use_gpu']
    lw_settings.CACHE_ENCODED_DATA = not lmd['force_disable_cache']
    lw_settings.SELFAWARE = lmd['use_selfaware_model']

    if mode == 'predict':
        # Called for its side effects only: data cleanup currently happens
        # while the config is built (to be separated out in the future).
        self._create_lightwood_config()
        df = input_data.data_frame
    elif mode == 'validate':
        df = input_data.validation_df
    elif mode == 'test':
        df = input_data.test_df
    else:
        # Previously an unknown mode surfaced later as an UnboundLocalError.
        raise ValueError('Unknown mode specified: "{}"'.format(mode))

    order_by = lmd['model_order_by']
    if order_by is not None and len(order_by) > 0:
        df = self._create_timeseries_df(df)

    if self.predictor is None:
        self.predictor = lightwood.Predictor(
            load_from_path=lmd['lightwood_data']['save_path'])

    # Not the most efficient, but least prone to bugs and fast enough:
    # work on a copy so the transaction's dataframe is left untouched.
    if len(ignore_columns) > 0:
        run_df = df.copy(deep=True)
        for col_name in ignore_columns:
            run_df[col_name] = [None] * len(run_df[col_name])
    else:
        run_df = df

    predictions = self.predictor.predict(when_data=run_df)

    return {k: predictions[k]['predictions'] for k in predictions}
def predict(self, mode='predict', ignore_columns=None):
    """Run the lightwood predictor on one of the transaction's dataframes.

    Args:
        mode: Which dataframe to use: ``'predict'`` (``input_data.data_frame``),
            ``'validate'`` (``input_data.validation_df``) or ``'test'``
            (``input_data.test_df``).
        ignore_columns: Column names whose values are replaced by ``None`` in
            a deep copy of the dataframe before predicting.  ``None`` (the
            default) means no column is blanked.

    Returns:
        A dict mapping every key of the predictor's output to that entry's
        ``'predictions'`` value.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    # A None default avoids sharing one mutable list between calls.
    if ignore_columns is None:
        ignore_columns = []

    input_data = self.transaction.input_data

    if mode == 'predict':
        # Called for its side effects only: data cleanup currently happens
        # while the config is built (to be separated out in the future).
        self._create_lightwood_config()
        df = input_data.data_frame
    elif mode == 'validate':
        df = input_data.validation_df
    elif mode == 'test':
        df = input_data.test_df
    else:
        # Previously an unknown mode surfaced later as an UnboundLocalError.
        raise ValueError('Unknown mode specified: "{}"'.format(mode))

    if self.predictor is None:
        self.predictor = lightwood.Predictor(
            load_from_path=self.transaction.lmd['lightwood_data']['save_path'])

    # Not the most efficient, but least prone to bugs and fast enough:
    # work on a copy so the transaction's dataframe is left untouched.
    if len(ignore_columns) > 0:
        run_df = df.copy(deep=True)
        for col_name in ignore_columns:
            run_df[col_name] = [None] * len(run_df[col_name])
    else:
        run_df = df

    predictions = self.predictor.predict(when_data=run_df)

    return {k: predictions[k]['predictions'] for k in predictions}
def predict(self, mode='predict', ignore_columns=None):
    """Predict with lightwood and attach averaged confidences to the output.

    Args:
        mode: ``'predict'`` uses ``input_data.data_frame``, ``'validate'``
            uses ``input_data.validation_df`` and ``'test'`` uses
            ``input_data.test_df``.
        ignore_columns: Column names whose values are replaced by ``None`` in
            a deep copy of the dataframe before predicting; ``None`` is
            treated as an empty list.

    Returns:
        A dict with, for every output key ``k`` of the predictor:
        ``k`` -> the ``'predictions'`` entry; ``f'{k}_model_confidence'`` ->
        per-row mean of the available confidence lists, each value clipped to
        ``[0, 1]`` (only when at least one such list is present); and
        ``f'{k}_confidence_range'`` when the predictor returned one.
    """
    if ignore_columns is None:
        ignore_columns = []

    lmd = self.transaction.lmd
    input_data = self.transaction.input_data

    if lmd['use_gpu'] is not None:
        lightwood.config.config.CONFIG.USE_CUDA = lmd['use_gpu']
    lightwood.config.config.CONFIG.CACHE_ENCODED_DATA = not lmd['force_disable_cache']
    lightwood.config.config.CONFIG.SELFAWARE = lmd['use_selfaware_model']

    # NOTE: an unrecognised mode leaves `df` unassigned, exactly as before.
    if mode == 'predict':
        # Run for its side effects: data cleanup currently happens while the
        # config is built (to be separated out in the future).
        self._create_lightwood_config()
        df = input_data.data_frame
    elif mode == 'validate':
        df = input_data.validation_df
    elif mode == 'test':
        df = input_data.test_df

    order_by = lmd['model_order_by']
    if order_by is not None and len(order_by) > 0:
        df = self._create_timeseries_df(df)

    if self.predictor is None:
        self.predictor = lightwood.Predictor(
            load_from_path=lmd['lightwood_data']['save_path'])

    # Not the most efficient, but least prone to bugs and fast enough.
    if len(ignore_columns) > 0:
        run_df = df.copy(deep=True)
        for col_name in ignore_columns:
            run_df[col_name] = [None] * len(run_df[col_name])
    else:
        run_df = df

    predictions = self.predictor.predict(when_data=run_df)

    confidence_names = ('selfaware_confidences', 'loss_confidences',
                        'quantile_confidences')
    formated_predictions = {}
    for k in predictions:
        output = predictions[k]
        formated_predictions[k] = output['predictions']

        # One clipped list per confidence source that the predictor returned.
        clipped_series = []
        for confidence_name in confidence_names:
            if confidence_name in output:
                floored = [x if x > 0 else 0 for x in output[confidence_name]]
                clipped_series.append([x if x < 1 else 1 for x in floored])

        if len(clipped_series) > 0:
            # Row-wise mean across sources; the first source fixes the row count.
            source_count = len(clipped_series)
            formated_predictions[f'{k}_model_confidence'] = [
                sum(series[row] for series in clipped_series) / source_count
                for row in range(len(clipped_series[0]))
            ]

        if 'confidence_range' in output:
            formated_predictions[f'{k}_confidence_range'] = output['confidence_range']

    return formated_predictions
def train(self):
    """Train a lightwood predictor (or reload a saved one) and persist it.

    Applies the GPU / cache / self-aware settings from ``lmd`` to lightwood's
    global config, optionally reshapes the data with
    ``self._create_timeseries_df`` when ``lmd['model_order_by']`` is
    non-empty, and then either loads a stored predictor
    (``lmd['skip_model_training'] == True``) or trains a new one.  The
    evaluation interval is ``round(10**6 / len(train_df))`` clamped to
    ``[3, 20]``.  The predictor is saved to the path stored in
    ``lmd['lightwood_data']['save_path']``.
    """
    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; everything from the evaluation interval to the accuracy log is
    # assumed to sit in the training branch -- confirm against the original.
    lmd = self.transaction.lmd
    input_data = self.transaction.input_data
    log = self.transaction.log

    if lmd['use_gpu'] is not None:
        lightwood.config.config.CONFIG.USE_CUDA = lmd['use_gpu']
    lightwood.config.config.CONFIG.CACHE_ENCODED_DATA = not lmd['force_disable_cache']
    lightwood.config.config.CONFIG.SELFAWARE = lmd['use_selfaware_model']

    order_by = lmd['model_order_by']
    if order_by is not None and len(order_by) > 0:
        log.debug('Reshaping data into timeseries format, this may take a while !')
        train_df = self._create_timeseries_df(input_data.train_df)
        test_df = self._create_timeseries_df(input_data.test_df)
        log.debug('Done reshaping data into timeseries format !')
    else:
        train_df = input_data.train_df
        test_df = input_data.test_df

    lightwood_config = self._create_lightwood_config()

    if lmd['skip_model_training'] == True:  # noqa: E712 - equality kept on purpose
        stored_model = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                    lmd['name'] + '_lightwood_data')
        self.predictor = lightwood.Predictor(load_from_path=stored_model)
    else:
        self.predictor = lightwood.Predictor(lightwood_config)

        # Evaluate less often for larger datasets and vice-versa,
        # within the limits [3, 20].
        eval_every_x_epochs = int(round(1 * pow(10, 6) * (1 / len(train_df))))
        eval_every_x_epochs = max(3, min(20, eval_every_x_epochs))

        # This changes the level of the root logger for the whole process.
        logging.getLogger().setLevel(logging.DEBUG)

        learn_kwargs = {'from_data': train_df, 'test_data': test_df}
        time_limit = lmd['stop_training_in_x_seconds']
        if time_limit is not None:
            learn_kwargs['stop_training_after_seconds'] = time_limit
        learn_kwargs['callback_on_iter'] = self.callback_on_iter
        learn_kwargs['eval_every_x_epochs'] = eval_every_x_epochs
        self.predictor.learn(**learn_kwargs)

        log.info('Training accuracy of: {}'.format(self.predictor.train_accuracy))

    save_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                             lmd['name'] + '_lightwood_data')
    lmd['lightwood_data']['save_path'] = save_path
    self.predictor.save(path_to=save_path)
def train(self):
    """Train a lightwood predictor on the transaction's data and save it.

    The predictor is built from ``self._create_lightwood_config()``, trained
    on ``input_data.train_df`` with ``input_data.test_df`` as test data, its
    training accuracy is logged, and it is saved to
    ``<MINDSDB_STORAGE_PATH>/<name>_lightwood_data``; that path is also
    recorded in ``lmd['lightwood_data']['save_path']``.
    """
    transaction = self.transaction

    self.predictor = lightwood.Predictor(self._create_lightwood_config())
    self.predictor.learn(from_data=transaction.input_data.train_df,
                         test_data=transaction.input_data.test_df)

    transaction.log.info(
        'Training accuracy of: {}'.format(self.predictor.train_accuracy))

    save_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                             transaction.lmd['name'] + '_lightwood_data')
    transaction.lmd['lightwood_data']['save_path'] = save_path
    self.predictor.save(path_to=save_path)
def test_model(df_test):
    """Score the saved lightwood model on ``df_test`` and print its F1 score.

    Loads the predictor from ``'lightwood_model.dill'``, predicts the ``tags``
    column for ``df_test`` and prints the weighted F1 score (rounded to four
    decimals) between the real and the predicted tags.
    """
    print('Testing model')

    predictor = lightwood.Predictor(load_from_path='lightwood_model.dill')
    predicted_tags = predictor.predict(when_data=df_test)['tags']['predictions']

    # The predictor's internal encoder turns tag lists into binary vectors,
    # which is what lets us compute an F1 score.  F1 measures how well the
    # model predicts correct tags while avoiding false positives, and stays
    # robust to class imbalance.
    tag_encoder = predictor._mixer.encoders['tags']
    test_tags_encoded = tag_encoder.encode(df_test.tags)
    pred_tags_encoded = tag_encoder.encode(predicted_tags)

    # An F1 score of around 0.2 is expected for this dataset; mind that such
    # a score is expected when applying manual text preprocessing, which this
    # example does not do.
    score = f1_score(test_tags_encoded, pred_tags_encoded, average='weighted')
    print('Test f1_score', round(score, 4))
def test_model():
    """Evaluate the saved lightwood model on the credit-default test set.

    Downloads the test CSV, separates the real values of the target column
    ``default.payment.next.month``, predicts that column with the predictor
    stored in ``'lightwood_model.dill'`` and prints the balanced accuracy as
    a percentage.
    """
    target = 'default.payment.next.month'

    # Load some testing data and extract the real values for the target column
    test = pd.read_csv(
        'https://raw.githubusercontent.com/mindsdb/mindsdb-examples/master/benchmarks/default_of_credit/dataset/test.csv'
    )
    real = [str(x) for x in test[target]]
    test = test.drop(columns=[target])

    # Load the lightwood model from where we previously saved it and predict
    predictor = lightwood.Predictor(load_from_path='lightwood_model.dill')
    predictions = predictor.predict(when_data=test)
    # Both sides are compared as strings so that label types cannot differ.
    predicted = [str(x) for x in predictions[target]['predictions']]

    # Balanced accuracy: in this case > 50% means better than random
    balanced_accuracy_pct = balanced_accuracy_score(real, predicted) * 100
    # The message previously misspelled "Balanced".
    print(f'Balanced accuracy score of {round(balanced_accuracy_pct,1)}%')
def train_model(df_train):
    """Train a lightwood model predicting ``tags`` from ``plot_synopsis``.

    Args:
        df_train: Training dataframe; the config below expects it to provide
            a text column ``plot_synopsis`` and a ``tags`` column.

    The trained predictor is saved to ``'lightwood_model.dill'``.
    """
    # A configuration describing the contents of the dataframe: the targets
    # we want to predict and the features we want to use.
    config = {
        'input_features': [
            {'name': 'plot_synopsis', 'type': 'text'},
        ],
        'output_features': [
            {'name': 'tags', 'type': 'multiple_categorical'},
        ],
    }

    # Callback to log various training stats (currently the only hook into
    # the training process).
    def train_callback(epoch, error, test_error, test_error_gradient, test_accuracy):
        print(f'We reached epoch {epoch} with error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}')

    # The actual training process
    predictor = lightwood.Predictor(config)
    print('Starting model training')
    # Train on the dataframe that was passed in; the previous code referenced
    # an undefined name `df` here instead of the `df_train` parameter.
    predictor.learn(from_data=df_train,
                    callback_on_iter=train_callback,
                    eval_every_x_epochs=5)

    # Save the lightwood model
    predictor.save('lightwood_model.dill')
test_accuracy): print( f'Epoch: {epoch}, Train Error: {training_error}, Test Error: {test_error}, Test Error Gradient: {test_error_gradient}, Test Accuracy: {test_accuracy}' ) test_cases = [gen_multiply(), gen_correlate(), gen_categorical()] log_map = {} for i, data in enumerate(test_cases): df_train, df_test, dropout_arr, out_col, name = data pmap = {} accmap = {} pmap['normal'] = lightwood.Predictor(output=[out_col]) pmap['normal'].learn(from_data=df_train, callback_on_iter=iter_function, eval_every_x_epochs=100) accmap['normal'] = pmap['normal'].calculate_accuracy( from_data=df_test)[out_col]['value'] for cols in dropout_arr: mk = 'missing_' + '_'.join(cols) pmap[mk] = lightwood.Predictor(output=[out_col]) pmap[mk].learn(from_data=df_train.drop(columns=cols), callback_on_iter=iter_function, eval_every_x_epochs=100) accmap[mk + '_unfit'] = pmap['normal'].calculate_accuracy( from_data=df_test.drop(columns=cols))[out_col]['value'] accmap[mk + '_fit'] = pmap[mk].calculate_accuracy(
for n in range(nr_ele) ] data_train = pd.DataFrame(data_train) data_test = pd.DataFrame(data_test) def iter_function(epoch, training_error, test_error, test_error_gradient, test_accuracy): print( f'Epoch: {epoch}, Train Error: {training_error}, Test Error: {test_error}, Test Error Gradient: {test_error_gradient}, Test Accuracy: {test_accuracy}' ) if train: predictor = lightwood.Predictor(output=['y']) predictor.learn(from_data=data_train, callback_on_iter=iter_function, eval_every_x_epochs=200) predictor.save('/tmp/ltcrl.pkl') predictor = lightwood.Predictor(load_from_path='/tmp/ltcrl.pkl') print('Train accuracy: ', predictor.train_accuracy['y']['value']) print('Test accuracy: ', predictor.calculate_accuracy(from_data=data_test)['y']['value']) print(f'Accuracy for all columns present: ', predictor.calculate_accuracy(from_data=data_test)['y']['value']) predictions = predictor.calculate_accuracy(from_data=data_test) print(f'Confidence mean for all columns present ',
def train_model():
    """Train and save a lightwood model on the credit-default dataset.

    Downloads the training CSV, trains a predictor for the column
    ``default.payment.next.month`` (for at most 100 seconds) and saves it to
    ``'lightwood_model.dill'``.
    """
    # Load some training data (default on credit, for predicting whether or
    # not someone will default on their credit)
    df = pd.read_csv(
        'https://raw.githubusercontent.com/mindsdb/mindsdb-examples/master/benchmarks/default_of_credit/dataset/train.csv'
    )

    # Feature columns in the order they are declared to lightwood; every
    # column is numeric except the ones listed in `categorical_columns`.
    feature_columns = [
        'ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
        *[f'PAY_{i}' for i in range(2, 7)],
        *[f'BILL_AMT{i}' for i in range(1, 7)],
        *[f'PAY_AMT{i}' for i in range(1, 7)],
    ]
    categorical_columns = {'SEX', 'EDUCATION', 'MARRIAGE'}

    # A configuration describing the contents of the dataframe: the targets
    # we want to predict and the features we want to use.
    # Note the `weights` on the output column: the number of samples is
    # uneven between the two categories, but we care about balanced accuracy
    # rather than overall accuracy.
    config = {
        'input_features': [
            {
                'name': column,
                'type': 'categorical' if column in categorical_columns else 'numeric',
            }
            for column in feature_columns
        ],
        'output_features': [{
            'name': 'default.payment.next.month',
            'type': 'categorical',
            'weights': {'0': 0.3, '1': 1},
        }],
        'mixer': {'class': NnMixer},
    }

    # Callback to log various training stats (currently the only hook into
    # the training process)
    def train_callback(epoch, error, test_error, test_error_gradient, test_accuracy):
        print(
            f'We reached epoch {epoch} with error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}'
        )

    # The actual training process
    predictor = lightwood.Predictor(config)
    # Note: if `stop_training_after_seconds` is not set, training stops
    # automatically once the model is determined to be overfitting (a testing
    # and a training dataset are separated internally from the dataframe
    # given; only the training one is trained on, and the testing one is used
    # to determine overfitting, pick the best model and evaluate accuracy).
    predictor.learn(
        from_data=df,
        callback_on_iter=train_callback,
        eval_every_x_epochs=5,
        stop_training_after_seconds=100,
    )

    # Save the lightwood model
    predictor.save('lightwood_model.dill')
def predict(self, mode='predict', ignore_columns=None):
    """Predict with lightwood and attach averaged confidences to the output.

    Args:
        mode: Which dataframe to use: ``'predict'`` (``input_data.data_frame``),
            ``'validate'`` (``validation_df``), ``'test'`` (``test_df``) or
            ``'predict_on_train_data'`` (``train_df``).
        ignore_columns: Column names whose values are replaced by ``None`` in
            a deep copy of the dataframe before predicting; ``None`` is
            treated as an empty list.

    Returns:
        A dict with, for every output key ``k`` of the predictor:
        ``k`` -> the ``'predictions'`` entry; ``f'{k}_model_confidence'`` ->
        per-row ``np.mean`` of the available confidence values, each clipped
        to ``[0, 1]`` (only when at least one confidence list is present);
        and ``f'{k}_confidence_range'`` when the predictor returned one.

    Raises:
        Exception: If ``mode`` is not one of the supported values.
    """
    if ignore_columns is None:
        ignore_columns = []

    lmd = self.transaction.lmd
    input_data = self.transaction.input_data

    if lmd['use_gpu'] is not None:
        lightwood.config.config.CONFIG.USE_CUDA = lmd['use_gpu']

    if mode == 'predict':
        df = input_data.data_frame
    elif mode == 'validate':
        df = input_data.validation_df
    elif mode == 'test':
        df = input_data.test_df
    elif mode == 'predict_on_train_data':
        df = input_data.train_df
    else:
        raise Exception(f'Unknown mode specified: "{mode}"')

    order_by = lmd['model_order_by']
    if order_by is not None and len(order_by) > 0:
        df = self._create_timeseries_df(df)

    if self.predictor is None:
        self.predictor = lightwood.Predictor(
            load_from_path=lmd['lightwood_data']['save_path'])

    # Not the most efficient, but least prone to bugs and fast enough:
    # work on a copy so the transaction's dataframe is left untouched.
    if len(ignore_columns) > 0:
        run_df = df.copy(deep=True)
        for col_name in ignore_columns:
            run_df[col_name] = [None] * len(run_df[col_name])
    else:
        run_df = df

    predictions = self.predictor.predict(when_data=run_df)

    confidence_names = ('selfaware_confidences', 'loss_confidences',
                        'quantile_confidences')
    formated_predictions = {}
    for k in predictions:
        output = predictions[k]
        formated_predictions[k] = output['predictions']

        # row_confidences[i] collects, for row i, one clipped value per
        # confidence source.  It stays None when the predictor returned no
        # confidence source at all, so no confidence key is emitted then.
        row_confidences = None
        for confidence_name in confidence_names:
            if confidence_name not in output:
                continue
            if row_confidences is None:
                row_confidences = []
            for i, conf in enumerate(output[confidence_name]):
                if len(row_confidences) <= i:
                    row_confidences.append([])
                # @TODO We should make sure lightwood never returns
                # confidences below 0 or above 1.
                if conf < 0:
                    conf = 0
                if conf > 1:
                    conf = 1
                row_confidences[i].append(conf)

        if row_confidences is not None:
            formated_predictions[f'{k}_model_confidence'] = [
                np.mean(values) for values in row_confidences
            ]

        if 'confidence_range' in output:
            formated_predictions[f'{k}_confidence_range'] = output['confidence_range']

    return formated_predictions
def train(self):
    """Train a lightwood predictor on the transaction's data and save it.

    Data preparation takes one of three paths:

    * ``lmd['model_order_by']`` non-empty: train and test frames are
      reshaped with ``self._create_timeseries_df``;
    * otherwise, if ``lmd['sample_settings']['sample_for_training']`` is
      truthy: both frames are passed through ``hmd['sample_function']`` and a
      warning reports the sampled percentage;
    * otherwise the frames are used unchanged.

    The evaluation interval is ``round(10**6 / len(train_df))`` clamped to
    ``[3, 200]``.  The predictor is saved to
    ``<MINDSDB_STORAGE_PATH>/<name>_lightwood_data``.
    """
    lmd = self.transaction.lmd
    input_data = self.transaction.input_data
    log = self.transaction.log

    if lmd['use_gpu'] is not None:
        lightwood.config.config.CONFIG.USE_CUDA = lmd['use_gpu']

    order_by = lmd['model_order_by']
    if order_by is not None and len(order_by) > 0:
        log.debug('Reshaping data into timeseries format, this may take a while !')
        train_df = self._create_timeseries_df(input_data.train_df)
        test_df = self._create_timeseries_df(input_data.test_df)
        log.debug('Done reshaping data into timeseries format !')
    elif lmd['sample_settings']['sample_for_training']:
        settings = lmd['sample_settings']
        sampling_args = (
            settings['sample_margin_of_error'],
            settings['sample_confidence_level'],
            settings['sample_percentage'],
        )
        sample_function = self.transaction.hmd['sample_function']

        train_df = sample_function(input_data.train_df, *sampling_args)
        test_df = sample_function(input_data.test_df, *sampling_args)

        sample_size = len(train_df)
        population_size = len(input_data.train_df)
        log.warning(
            f'Training on a sample of {round(sample_size * 100 / population_size, 1)}% your data, results can be unexpected.'
        )
    else:
        train_df = input_data.train_df
        test_df = input_data.test_df

    self.predictor = lightwood.Predictor(self._create_lightwood_config())

    # Evaluate less often for larger datasets and vice-versa,
    # within the limits [3, 200].
    eval_every_x_epochs = int(round(1 * pow(10, 6) * (1 / len(train_df))))
    eval_every_x_epochs = max(3, min(200, eval_every_x_epochs))

    # This changes the level of the root logger for the whole process.
    logging.getLogger().setLevel(logging.DEBUG)

    # The time limit is passed only when configured, so that learn()
    # otherwise falls back to its own default for that argument.
    learn_kwargs = {'from_data': train_df, 'test_data': test_df}
    time_limit = lmd['stop_training_in_x_seconds']
    if time_limit is not None:
        learn_kwargs['stop_training_after_seconds'] = time_limit
    learn_kwargs['callback_on_iter'] = self.callback_on_iter
    learn_kwargs['eval_every_x_epochs'] = eval_every_x_epochs
    self.predictor.learn(**learn_kwargs)

    log.info('Training accuracy of: {}'.format(self.predictor.train_accuracy))

    save_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                             lmd['name'] + '_lightwood_data')
    lmd['lightwood_data']['save_path'] = save_path
    self.predictor.save(path_to=save_path)
def train(self):
    """Train a lightwood predictor and print its training accuracy.

    The predictor is built from ``self._create_lightwood_config()`` and
    trained on ``input_data.train_df`` with ``input_data.test_df`` as test
    data.  Nothing is saved to disk by this method.
    """
    input_data = self.transaction.input_data

    self.predictor = lightwood.Predictor(self._create_lightwood_config())
    self.predictor.learn(
        from_data=input_data.train_df,
        test_data=input_data.test_df,
    )

    print(self.predictor.train_accuracy)
def train(self):
    """Train one predictor per mixer class, keep the best one and save it.

    Data preparation: when ``lmd['tss']['is_timeseries']`` is truthy the
    frames are reshaped with ``self._create_timeseries_df`` (which also
    yields the secondary type dict passed to the config); otherwise they are
    optionally sampled with ``hmd['sample_function']`` or used unchanged.

    Mixer classes come from ``lmd['use_mixers']`` (a single class or a list)
    or, when that is unset, from every direct subclass of
    ``lightwood.mixers.BaseMixer``.  Each trained predictor is scored with
    ``evaluate_accuracy`` on the validation data.  The highest-scoring
    predictor is kept, unless an ``NnMixer`` predictor is within 0.01 of the
    best accuracy, in which case the ``NnMixer`` one is preferred.  The
    chosen predictor is saved to ``<MINDSDB_STORAGE_PATH>/<name>/lightwood_data``.
    """
    lmd = self.transaction.lmd
    input_data = self.transaction.input_data
    log = self.transaction.log

    if lmd['use_gpu'] is not None:
        lightwood.config.config.CONFIG.USE_CUDA = lmd['use_gpu']

    secondary_type_dict = {}
    if lmd['tss']['is_timeseries']:
        log.debug('Reshaping data into timeseries format, this may take a while !')
        train_df, secondary_type_dict = self._create_timeseries_df(input_data.train_df)
        test_df, _ = self._create_timeseries_df(input_data.test_df)
        log.debug('Done reshaping data into timeseries format !')
    elif lmd['sample_settings']['sample_for_training']:
        settings = lmd['sample_settings']
        sampling_args = (
            settings['sample_margin_of_error'],
            settings['sample_confidence_level'],
            settings['sample_percentage'],
        )
        sample_function = self.transaction.hmd['sample_function']

        train_df = sample_function(input_data.train_df, *sampling_args)
        test_df = sample_function(input_data.test_df, *sampling_args)

        sample_size = len(train_df)
        population_size = len(input_data.train_df)
        log.warning(
            f'Training on a sample of {round(sample_size * 100 / population_size, 1)}% your data, results can be unexpected.'
        )
    else:
        train_df = input_data.train_df
        test_df = input_data.test_df

    lightwood_config = self._create_lightwood_config(secondary_type_dict)

    lightwood_train_ds = lightwood.api.data_source.DataSource(
        train_df, config=lightwood_config)
    lightwood_test_ds = lightwood_train_ds.make_child(test_df)

    lmd['lightwood_data']['save_path'] = os.path.join(
        CONFIG.MINDSDB_STORAGE_PATH, lmd['name'], 'lightwood_data')
    # Make sure the per-model directory exists before anything is saved.
    model_dir = Path(CONFIG.MINDSDB_STORAGE_PATH).joinpath(lmd['name'])
    model_dir.mkdir(mode=0o777, exist_ok=True, parents=True)

    # This changes the level of the root logger for the whole process.
    logging.getLogger().setLevel(logging.DEBUG)

    use_mixers = lmd.get('use_mixers', None)
    if use_mixers is None:
        mixer_classes = lightwood.mixers.BaseMixer.__subclasses__()
    elif isinstance(use_mixers, list):
        mixer_classes = use_mixers
    else:
        mixer_classes = [use_mixers]

    predictors_and_accuracies = []
    for mixer_class in mixer_classes:
        # NOTE(review): the nested 'mixer' dict is shared between iterations
        # (the config is only shallow-copied below) -- confirm that earlier
        # predictors do not rely on it after training.
        mixer_config = lightwood_config['mixer']
        mixer_config['kwargs'] = {}
        mixer_config['class'] = mixer_class

        if mixer_class == lightwood.mixers.NnMixer:
            # Evaluate less often for larger datasets and vice-versa,
            # within the limits [3, 200].
            eval_every_x_epochs = int(round(1 * pow(10, 6) * (1 / len(train_df))))
            eval_every_x_epochs = max(3, min(200, eval_every_x_epochs))

            kwargs = mixer_config['kwargs']
            kwargs['callback_on_iter'] = self.callback_on_iter
            # True division: this value can be a non-integer float.
            kwargs['eval_every_x_epochs'] = eval_every_x_epochs / len(mixer_classes)
            kwargs['stop_training_after_seconds'] = lmd['stop_training_in_x_seconds']

        self.predictor = lightwood.Predictor(lightwood_config.copy())
        self.predictor.learn(from_data=lightwood_train_ds,
                             test_data=lightwood_test_ds)

        log.info('[{}] Training accuracy of: {}'.format(
            mixer_class.__name__, self.predictor.train_accuracy))

        # self.predict uses self.predictor, i.e. the one just trained.
        validation_predictions = self.predict('validate')

        if lmd['tss']['is_timeseries']:
            # Keep only the rows flagged in the 'make_predictions' column.
            validation_df = input_data.validation_df
            validation_rows = validation_df[
                validation_df['make_predictions'].astype(bool) == True]  # noqa: E712
        else:
            validation_rows = input_data.validation_df

        validation_accuracy = evaluate_accuracy(
            validation_predictions,
            validation_rows,
            lmd['stats_v2'],
            lmd['predict_columns'],
            backend=self,
            use_conf_intervals=False,  # r2_score will be used for regression
        )

        predictors_and_accuracies.append((self.predictor, validation_accuracy))

    best_predictor, best_accuracy = max(predictors_and_accuracies,
                                        key=lambda x: x[1])

    # First predictor trained with an NnMixer, if any.
    nn_mixer_predictor, nn_mixer_predictor_accuracy = next(
        (
            (candidate, accuracy)
            for candidate, accuracy in predictors_and_accuracies
            if isinstance(candidate._mixer, lightwood.mixers.NnMixer)
        ),
        (None, None),
    )

    self.predictor = best_predictor

    # If the difference between the accuracies of the best predictor and the
    # NnMixer predictor is small, then use the NnMixer predictor.
    SMALL_ACCURACY_DIFFERENCE = 0.01
    if nn_mixer_predictor is not None:
        if (best_accuracy - nn_mixer_predictor_accuracy) < SMALL_ACCURACY_DIFFERENCE:
            self.predictor = nn_mixer_predictor

    self.predictor.save(path_to=lmd['lightwood_data']['save_path'])