Пример #1
0
    def train(self):
        """Fit a lightwood predictor (or reload a saved one) and persist it.

        When ``model_order_by`` is set, the train/test frames are first passed
        through ``_create_timeseries_df``. If ``skip_model_training`` equals
        ``True`` the predictor is loaded from the model's storage path instead
        of being trained. In both cases the storage path is recorded under
        ``lmd['lightwood_data']['save_path']`` and the predictor is saved there.
        """
        lmd = self.transaction.lmd
        input_data = self.transaction.input_data

        def storage_path():
            # <storage dir>/<model name>_lightwood_data
            return os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                lmd['name'] + '_lightwood_data')

        order_by = lmd['model_order_by']
        if order_by is None or len(order_by) == 0:
            train_df = input_data.train_df
            test_df = input_data.test_df
        else:
            # Ordered data is reshaped before it reaches lightwood.
            train_df = self._create_timeseries_df(input_data.train_df)
            test_df = self._create_timeseries_df(input_data.test_df)

        lightwood_config = self._create_lightwood_config()

        # Compared with ``== True`` on purpose: only a flag equal to True
        # (not just any truthy value) bypasses training.
        skip_training = lmd['skip_model_training'] == True  # noqa: E712
        if skip_training:
            self.predictor = lightwood.Predictor(load_from_path=storage_path())
        else:
            self.predictor = lightwood.Predictor(lightwood_config)
            self.predictor.learn(from_data=train_df, test_data=test_df)
            accuracy = self.predictor.train_accuracy
            self.transaction.log.info(
                'Training accuracy of: {}'.format(accuracy))

        save_path = storage_path()
        lmd['lightwood_data']['save_path'] = save_path
        self.predictor.save(path_to=save_path)
Пример #2
0
    def train(self):
        """Train (or reload) the lightwood predictor and save it to storage.

        Sets lightwood's ``USE_CUDA`` flag from ``lmd['use_gpu']``, reshapes
        the data when ``model_order_by`` is set, then either loads a saved
        predictor (``skip_model_training`` equal to ``True``) or fits a new
        one, optionally bounded by ``stop_training_in_x_seconds``.
        """
        lmd = self.transaction.lmd
        log = self.transaction.log
        input_data = self.transaction.input_data

        lightwood.config.config.CONFIG.USE_CUDA = lmd['use_gpu']

        order_by = lmd['model_order_by']
        if order_by is None or len(order_by) == 0:
            train_df = input_data.train_df
            test_df = input_data.test_df
        else:
            log.debug('Reshaping data into timeseries format, this may take a while !')
            train_df = self._create_timeseries_df(input_data.train_df)
            test_df = self._create_timeseries_df(input_data.test_df)
            log.debug('Done reshaping data into timeseries format !')

        lightwood_config = self._create_lightwood_config()

        def storage_path():
            # <storage dir>/<model name>_lightwood_data
            return os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                lmd['name'] + '_lightwood_data')

        # ``== True`` kept deliberately: only a True-equal flag skips training.
        skip_training = lmd['skip_model_training'] == True  # noqa: E712
        if skip_training:
            self.predictor = lightwood.Predictor(load_from_path=storage_path())
        else:
            self.predictor = lightwood.Predictor(lightwood_config)

            time_budget = lmd['stop_training_in_x_seconds']
            learn_kwargs = {
                'from_data': train_df,
                'test_data': test_df,
                'callback_on_iter': self.callback_on_iter,
            }
            # The time limit is only forwarded when one was configured.
            if time_budget is not None:
                learn_kwargs['stop_training_after_seconds'] = time_budget
            self.predictor.learn(**learn_kwargs)

            accuracy = self.predictor.train_accuracy
            log.info('Training accuracy of: {}'.format(accuracy))

        save_path = storage_path()
        lmd['lightwood_data']['save_path'] = save_path
        self.predictor.save(path_to=save_path)
Пример #3
0
    def predict(self, mode='predict', ignore_columns=None):
        """Run the lightwood predictor on the dataset selected by ``mode``.

        Args:
            mode: ``'predict'`` (uses ``input_data.data_frame``),
                ``'validate'`` (``input_data.validation_df``) or ``'test'``
                (``input_data.test_df``).
            ignore_columns: names of columns whose values are replaced with
                ``None`` on a deep copy of the data before predicting.
                Defaults to no columns.

        Returns:
            dict mapping each predicted column to the ``'predictions'`` entry
            of lightwood's output for that column.

        Raises:
            ValueError: if ``mode`` is not one of the supported values.
        """
        # A None default avoids sharing one mutable list object across calls.
        if ignore_columns is None:
            ignore_columns = []

        lmd = self.transaction.lmd
        lightwood.config.config.CONFIG.USE_CUDA = lmd['use_gpu']
        lightwood.config.config.CONFIG.CACHE_ENCODED_DATA = not lmd['force_disable_cache']
        lightwood.config.config.CONFIG.SELFAWARE = lmd['use_selfaware_model']

        input_data = self.transaction.input_data
        if mode == 'predict':
            # Doing it here since currently data cleanup is included in this, in the future separate data cleanup
            # (the returned config itself is not needed for prediction).
            self._create_lightwood_config()
            df = input_data.data_frame
        elif mode == 'validate':
            df = input_data.validation_df
        elif mode == 'test':
            df = input_data.test_df
        else:
            # Previously an unknown mode surfaced later as an UnboundLocalError.
            raise ValueError('Unknown mode specified: "{}"'.format(mode))

        order_by = lmd['model_order_by']
        if order_by is not None and len(order_by) > 0:
            df = self._create_timeseries_df(df)

        # Lazily load the trained predictor from where train() saved it.
        if self.predictor is None:
            self.predictor = lightwood.Predictor(load_from_path=lmd['lightwood_data']['save_path'])

        # not the most efficient but least prone to bug and should be fast enough
        if len(ignore_columns) > 0:
            run_df = df.copy(deep=True)
            for col_name in ignore_columns:
                run_df[col_name] = [None] * len(run_df[col_name])
        else:
            run_df = df

        predictions = self.predictor.predict(when_data=run_df)

        return {k: predictions[k]['predictions'] for k in predictions}
Пример #4
0
    def predict(self, mode='predict', ignore_columns=None):
        """Run the lightwood predictor on the dataset selected by ``mode``.

        Args:
            mode: ``'predict'`` (uses ``input_data.data_frame``),
                ``'validate'`` (``input_data.validation_df``) or ``'test'``
                (``input_data.test_df``).
            ignore_columns: names of columns whose values are replaced with
                ``None`` on a deep copy of the data before predicting.
                Defaults to no columns.

        Returns:
            dict mapping each predicted column to the ``'predictions'`` entry
            of lightwood's output for that column.

        Raises:
            ValueError: if ``mode`` is not one of the supported values.
        """
        # A None default avoids sharing one mutable list object across calls.
        if ignore_columns is None:
            ignore_columns = []

        input_data = self.transaction.input_data
        if mode == 'predict':
            # Doing it here since currently data cleanup is included in this, in the future separate data cleanup
            # (the returned config itself is not needed for prediction).
            self._create_lightwood_config()
            df = input_data.data_frame
        elif mode == 'validate':
            df = input_data.validation_df
        elif mode == 'test':
            df = input_data.test_df
        else:
            # Previously an unknown mode surfaced later as an UnboundLocalError.
            raise ValueError('Unknown mode specified: "{}"'.format(mode))

        # Lazily load the trained predictor from where train() saved it.
        if self.predictor is None:
            self.predictor = lightwood.Predictor(
                load_from_path=self.transaction.lmd['lightwood_data']
                ['save_path'])

        # not the most efficient but least prone to bug and should be fast enough
        if len(ignore_columns) > 0:
            run_df = df.copy(deep=True)
            for col_name in ignore_columns:
                run_df[col_name] = [None] * len(run_df[col_name])
        else:
            run_df = df

        predictions = self.predictor.predict(when_data=run_df)

        return {k: predictions[k]['predictions'] for k in predictions}
Пример #5
0
    def predict(self, mode='predict', ignore_columns=None):
        """Predict with lightwood and attach per-row confidence information.

        Args:
            mode: ``'predict'`` (uses ``input_data.data_frame``),
                ``'validate'`` (``input_data.validation_df``) or ``'test'``
                (``input_data.test_df``).
            ignore_columns: names of columns whose values are replaced with
                ``None`` on a deep copy of the data before predicting.

        Returns:
            dict with, for every predicted column ``k``:
            ``k`` (the predictions), ``f'{k}_model_confidence'`` (mean of the
            available confidence estimates, each clamped to [0, 1]) when at
            least one estimate is present, and ``f'{k}_confidence_range'``
            when lightwood returned one.

        Raises:
            ValueError: if ``mode`` is not one of the supported values.
        """
        if ignore_columns is None:
            ignore_columns = []

        lmd = self.transaction.lmd
        if lmd['use_gpu'] is not None:
            lightwood.config.config.CONFIG.USE_CUDA = lmd['use_gpu']
        lightwood.config.config.CONFIG.CACHE_ENCODED_DATA = not lmd['force_disable_cache']
        lightwood.config.config.CONFIG.SELFAWARE = lmd['use_selfaware_model']

        input_data = self.transaction.input_data
        if mode == 'predict':
            # Doing it here since currently data cleanup is included in this, in the future separate data cleanup
            # (the returned config itself is not needed for prediction).
            self._create_lightwood_config()
            df = input_data.data_frame
        elif mode == 'validate':
            df = input_data.validation_df
        elif mode == 'test':
            df = input_data.test_df
        else:
            # Previously an unknown mode surfaced later as an UnboundLocalError.
            raise ValueError(f'Unknown mode specified: "{mode}"')

        order_by = lmd['model_order_by']
        if order_by is not None and len(order_by) > 0:
            df = self._create_timeseries_df(df)

        # Lazily load the trained predictor from where train() saved it.
        if self.predictor is None:
            self.predictor = lightwood.Predictor(load_from_path=lmd['lightwood_data']['save_path'])

        # not the most efficient but least prone to bug and should be fast enough
        if len(ignore_columns) > 0:
            run_df = df.copy(deep=True)
            for col_name in ignore_columns:
                run_df[col_name] = [None] * len(run_df[col_name])
        else:
            run_df = df

        predictions = self.predictor.predict(when_data=run_df)

        formated_predictions = {}
        for k in predictions:
            column_output = predictions[k]
            formated_predictions[k] = column_output['predictions']

            # Collect every confidence estimate lightwood produced, clamped
            # into [0, 1] (floor first, then cap, as separate passes).
            clamped = []
            for confidence_name in ('selfaware_confidences', 'loss_confidences', 'quantile_confidences'):
                if confidence_name in column_output:
                    floored = [x if x > 0 else 0 for x in column_output[confidence_name]]
                    clamped.append([x if x < 1 else 1 for x in floored])

            if clamped:
                # Row-wise mean across the estimates; indexing (rather than
                # zip) keeps a length mismatch loud instead of truncating.
                formated_predictions[f'{k}_model_confidence'] = [
                    sum(estimate[n] for estimate in clamped) / len(clamped)
                    for n in range(len(clamped[0]))
                ]

            if 'confidence_range' in column_output:
                formated_predictions[f'{k}_confidence_range'] = column_output['confidence_range']

        return formated_predictions
Пример #6
0
    def train(self):
        """Train (or reload) the lightwood predictor and save it to storage.

        Applies the GPU / cache / self-aware settings from ``lmd`` to
        lightwood's global config, reshapes the data when ``model_order_by``
        is set, and fits a new predictor unless ``skip_model_training`` equals
        ``True``. The evaluation interval shrinks as the training set grows
        and is kept within 3..20 epochs.
        """
        lmd = self.transaction.lmd
        log = self.transaction.log
        input_data = self.transaction.input_data

        gpu_flag = lmd['use_gpu']
        if gpu_flag is not None:
            lightwood.config.config.CONFIG.USE_CUDA = gpu_flag
        lightwood.config.config.CONFIG.CACHE_ENCODED_DATA = not lmd['force_disable_cache']
        lightwood.config.config.CONFIG.SELFAWARE = lmd['use_selfaware_model']

        order_by = lmd['model_order_by']
        if order_by is None or len(order_by) == 0:
            train_df = input_data.train_df
            test_df = input_data.test_df
        else:
            log.debug('Reshaping data into timeseries format, this may take a while !')
            train_df = self._create_timeseries_df(input_data.train_df)
            test_df = self._create_timeseries_df(input_data.test_df)
            log.debug('Done reshaping data into timeseries format !')

        lightwood_config = self._create_lightwood_config()

        def storage_path():
            # <storage dir>/<model name>_lightwood_data
            return os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                lmd['name'] + '_lightwood_data')

        # ``== True`` kept deliberately: only a True-equal flag skips training.
        skip_training = lmd['skip_model_training'] == True  # noqa: E712
        if skip_training:
            self.predictor = lightwood.Predictor(load_from_path=storage_path())
        else:
            self.predictor = lightwood.Predictor(lightwood_config)

            # Evaluate less often for larger datasets and vice-versa,
            # clamped to the range [3, 20].
            raw_interval = int(round(1 * pow(10, 6) * (1 / len(train_df))))
            eval_every_x_epochs = max(3, min(raw_interval, 20))

            # Side effect: the root logger is switched to DEBUG.
            logging.getLogger().setLevel(logging.DEBUG)

            learn_kwargs = {
                'from_data': train_df,
                'test_data': test_df,
                'callback_on_iter': self.callback_on_iter,
                'eval_every_x_epochs': eval_every_x_epochs,
            }
            time_budget = lmd['stop_training_in_x_seconds']
            # The time limit is only forwarded when one was configured.
            if time_budget is not None:
                learn_kwargs['stop_training_after_seconds'] = time_budget
            self.predictor.learn(**learn_kwargs)

            accuracy = self.predictor.train_accuracy
            log.info('Training accuracy of: {}'.format(accuracy))

        save_path = storage_path()
        lmd['lightwood_data']['save_path'] = save_path
        self.predictor.save(path_to=save_path)
Пример #7
0
    def train(self):
        """Fit a lightwood predictor on the transaction's data and save it.

        The predictor is trained on ``input_data.train_df`` with
        ``input_data.test_df`` as test data, its training accuracy is logged,
        and it is saved under ``<storage dir>/<model name>_lightwood_data``
        (the path is also recorded in ``lmd['lightwood_data']['save_path']``).
        """
        transaction = self.transaction

        self.predictor = lightwood.Predictor(self._create_lightwood_config())
        self.predictor.learn(
            from_data=transaction.input_data.train_df,
            test_data=transaction.input_data.test_df,
        )

        accuracy = self.predictor.train_accuracy
        transaction.log.info('Training accuracy of: {}'.format(accuracy))

        save_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                 transaction.lmd['name'] + '_lightwood_data')
        transaction.lmd['lightwood_data']['save_path'] = save_path
        self.predictor.save(path_to=save_path)
def test_model(df_test):
    """Evaluate the saved tag predictor on ``df_test`` and print its F1 score.

    Loads the predictor from ``lightwood_model.dill``, predicts the ``tags``
    column and compares it with ``df_test.tags`` using a weighted F1 score.
    """
    print('Testing model')
    predictor = lightwood.Predictor(load_from_path='lightwood_model.dill')
    model_output = predictor.predict(when_data=df_test)

    actual_tags = df_test.tags
    predicted_tags = model_output['tags']['predictions']

    # The predictor's own 'tags' encoder turns tag lists into binary vectors,
    # which is the representation the F1 measure is computed on here.
    # F1 rewards correct tags, penalises false positives and stays robust to
    # class imbalances.
    encode_tags = predictor._mixer.encoders['tags'].encode
    score = f1_score(encode_tags(actual_tags),
                     encode_tags(predicted_tags),
                     average='weighted')

    # An f1 score of around 0.2 is expected for this dataset.
    # Such a score is typical when no manual text preprocessing is applied,
    # which is the case in this example.
    print('Test f1_score', round(score, 4))
Пример #9
0
def test_model():
    """Score the saved model on the default-of-credit test set.

    Downloads the test CSV, predicts ``default.payment.next.month`` with the
    predictor stored in ``lightwood_model.dill`` and prints the balanced
    accuracy as a percentage.
    """
    target = 'default.payment.next.month'

    # Load some testing data and extract the real values for the target column
    test = pd.read_csv(
        'https://raw.githubusercontent.com/mindsdb/mindsdb-examples/master/benchmarks/default_of_credit/dataset/test.csv'
    )
    # Both sides are compared as strings so the label types always match.
    real = [str(x) for x in test[target]]

    test = test.drop(columns=[target])

    # Load the lightwood model from where we previously saved it and predict using it
    predictor = lightwood.Predictor(load_from_path='lightwood_model.dill')
    predictions = predictor.predict(when_data=test)
    predicted = [str(x) for x in predictions[target]['predictions']]

    # Get the balanced accuracy score to see how well we did (in this case > 50% means better than random)
    balanced_accuracy_pct = balanced_accuracy_score(real, predicted) * 100
    print(f'Balanced accuracy score of {round(balanced_accuracy_pct,1)}%')
Пример #10
0
def train_model(df_train):
    """Train a lightwood tag predictor on ``df_train`` and save it to disk.

    Args:
        df_train: training data containing a ``plot_synopsis`` text column
            (input) and a ``tags`` column (multi-label target).

    The trained predictor is written to ``lightwood_model.dill``.
    """
    # A configuration describing the contents of the dataframe, what are the targets we want to predict and what are the features we want to use
    config = {
        'input_features': [
            {'name': 'plot_synopsis',
             'type': 'text'},
        ],
        'output_features': [
            {'name': 'tags', 'type': 'multiple_categorical'}
        ],
    }

    # Callback to log various training stats (currently the only hook into the training process)
    def train_callback(epoch, error, test_error, test_error_gradient, test_accuracy):
        print(f'We reached epoch {epoch} with error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}')

    # The actual training process
    predictor = lightwood.Predictor(config)
    print('Starting model training')
    # Train on the frame passed in; the previous code referenced an
    # unrelated name `df` and ignored the `df_train` argument.
    predictor.learn(from_data=df_train,
                    callback_on_iter=train_callback,
                    eval_every_x_epochs=5)
    # Save the lightwood model
    predictor.save('lightwood_model.dill')
Пример #11
0
                  test_accuracy):
    print(
        f'Epoch: {epoch}, Train Error: {training_error}, Test Error: {test_error}, Test Error Gradient: {test_error_gradient}, Test Accuracy: {test_accuracy}'
    )


test_cases = [gen_multiply(), gen_correlate(), gen_categorical()]

log_map = {}
for i, data in enumerate(test_cases):
    df_train, df_test, dropout_arr, out_col, name = data

    pmap = {}
    accmap = {}

    pmap['normal'] = lightwood.Predictor(output=[out_col])
    pmap['normal'].learn(from_data=df_train,
                         callback_on_iter=iter_function,
                         eval_every_x_epochs=100)
    accmap['normal'] = pmap['normal'].calculate_accuracy(
        from_data=df_test)[out_col]['value']

    for cols in dropout_arr:
        mk = 'missing_' + '_'.join(cols)
        pmap[mk] = lightwood.Predictor(output=[out_col])
        pmap[mk].learn(from_data=df_train.drop(columns=cols),
                       callback_on_iter=iter_function,
                       eval_every_x_epochs=100)
        accmap[mk + '_unfit'] = pmap['normal'].calculate_accuracy(
            from_data=df_test.drop(columns=cols))[out_col]['value']
        accmap[mk + '_fit'] = pmap[mk].calculate_accuracy(
Пример #12
0
        for n in range(nr_ele)
    ]

data_train = pd.DataFrame(data_train)
data_test = pd.DataFrame(data_test)


def iter_function(epoch, training_error, test_error, test_error_gradient,
                  test_accuracy):
    """Print a one-line progress report for a single training evaluation."""
    fields = (
        f'Epoch: {epoch}',
        f'Train Error: {training_error}',
        f'Test Error: {test_error}',
        f'Test Error Gradient: {test_error_gradient}',
        f'Test Accuracy: {test_accuracy}',
    )
    print(', '.join(fields))


# Optionally (re)train: when the `train` flag is set, fit a predictor for the
# 'y' column and write it to /tmp/ltcrl.pkl. `train`, `data_train` and
# `data_test` are defined earlier in the script (not visible here).
if train:
    predictor = lightwood.Predictor(output=['y'])
    predictor.learn(from_data=data_train,
                    callback_on_iter=iter_function,
                    eval_every_x_epochs=200)
    predictor.save('/tmp/ltcrl.pkl')

# Always evaluate the predictor as loaded from disk, so the reported numbers
# come from the saved file whether or not training ran in this process.
predictor = lightwood.Predictor(load_from_path='/tmp/ltcrl.pkl')
print('Train accuracy: ', predictor.train_accuracy['y']['value'])
print('Test accuracy: ',
      predictor.calculate_accuracy(from_data=data_test)['y']['value'])

# NOTE(review): same call and same data as the 'Test accuracy' line above;
# the accuracy is recomputed rather than reused.
print(f'Accuracy for all columns present: ',
      predictor.calculate_accuracy(from_data=data_test)['y']['value'])

# NOTE(review): despite its name, `predictions` holds the result of
# calculate_accuracy(), not of predict().
predictions = predictor.calculate_accuracy(from_data=data_test)
print(f'Confidence mean for all columns present ',
Пример #13
0
def train_model():
    """Train a lightwood model on the default-of-credit dataset and save it.

    Downloads the training CSV, declares every column's role and type,
    trains an ``NnMixer``-based predictor for at most 100 seconds while
    reporting progress through a callback, and writes the result to
    ``lightwood_model.dill``.
    """
    # Load some training data (default on credit, for predicting whether or not someone will default on their credit)
    training_csv_url = 'https://raw.githubusercontent.com/mindsdb/mindsdb-examples/master/benchmarks/default_of_credit/dataset/train.csv'
    df = pd.read_csv(training_csv_url)

    # Input columns in dataset order; every column not listed as categorical
    # is treated as numeric.
    categorical_columns = {'SEX', 'EDUCATION', 'MARRIAGE'}
    feature_columns = [
        'ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
        'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
        'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
        'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
        'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
        'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
    ]
    input_features = [
        {
            'name': column,
            'type': 'categorical' if column in categorical_columns else 'numeric',
        }
        for column in feature_columns
    ]

    # Note the `weights` on the output column: the two categories are uneven
    # in sample count, but balanced accuracy matters more than overall accuracy.
    target_feature = {
        'name': 'default.payment.next.month',
        'type': 'categorical',
        'weights': {'0': 0.3, '1': 1},
    }

    # A configuration describing the contents of the dataframe: the target to
    # predict, the features to use and the mixer class.
    config = {
        'input_features': input_features,
        'output_features': [target_feature],
        'mixer': {'class': NnMixer},
    }

    # Callback to log various training stats (currently the only hook into the training process)
    def train_callback(epoch, error, test_error, test_error_gradient,
                       test_accuracy):
        stats = (
            f'We reached epoch {epoch} with error: {error}, '
            f'test_error: {test_error}, '
            f'test_error_gradient: {test_error_gradient}, '
            f'test_accuracy: {test_accuracy}'
        )
        print(stats)

    # The actual training process
    predictor = lightwood.Predictor(config)
    # Note: If `stop_training_after_seconds` is not set, training will stop automatically once we determine the model is overfitting (we separate a testing and a training dataset internally from the dataframe given and only train on the training one, using the testing one to determine overfitting, pick the best model and evaluate model accuracy)
    learn_options = {
        'from_data': df,
        'callback_on_iter': train_callback,
        'eval_every_x_epochs': 5,
        'stop_training_after_seconds': 100,
    }
    predictor.learn(**learn_options)

    # Save the lightwood model
    predictor.save('lightwood_model.dill')
Пример #14
0
    def predict(self, mode='predict', ignore_columns=None):
        """Predict with lightwood and attach per-row confidence information.

        Args:
            mode: which dataset to run on: ``'predict'``
                (``input_data.data_frame``), ``'validate'``
                (``validation_df``), ``'test'`` (``test_df``) or
                ``'predict_on_train_data'`` (``train_df``).
            ignore_columns: names of columns whose values are replaced with
                ``None`` on a deep copy of the data before predicting.

        Returns:
            dict with, for every predicted column ``k``: ``k`` (the
            predictions), ``f'{k}_model_confidence'`` (per-row mean of the
            available confidence estimates, each clamped to [0, 1]) when at
            least one estimate is present, and ``f'{k}_confidence_range'``
            when lightwood returned one.

        Raises:
            Exception: if ``mode`` is not one of the supported values.
        """
        ignore_columns = [] if ignore_columns is None else ignore_columns

        lmd = self.transaction.lmd
        gpu_flag = lmd['use_gpu']
        if gpu_flag is not None:
            lightwood.config.config.CONFIG.USE_CUDA = gpu_flag

        # mode -> attribute of input_data holding the frame to predict on
        frame_attr_by_mode = {
            'predict': 'data_frame',
            'validate': 'validation_df',
            'test': 'test_df',
            'predict_on_train_data': 'train_df',
        }
        if mode not in frame_attr_by_mode:
            raise Exception(f'Unknown mode specified: "{mode}"')
        df = getattr(self.transaction.input_data, frame_attr_by_mode[mode])

        order_by = lmd['model_order_by']
        if order_by is not None and len(order_by) > 0:
            df = self._create_timeseries_df(df)

        # Lazily load the trained predictor from where train() saved it.
        if self.predictor is None:
            self.predictor = lightwood.Predictor(
                load_from_path=lmd['lightwood_data']['save_path'])

        # not the most efficient but least prone to bug and should be fast enough
        run_df = df
        if len(ignore_columns) > 0:
            run_df = df.copy(deep=True)
            for col_name in ignore_columns:
                run_df[col_name] = [None] * len(run_df[col_name])

        predictions = self.predictor.predict(when_data=run_df)

        confidence_names = ('selfaware_confidences', 'loss_confidences',
                            'quantile_confidences')
        formated_predictions = {}
        for k in predictions:
            column_output = predictions[k]
            formated_predictions[k] = column_output['predictions']

            # per_row[i] gathers every confidence estimate for row i; it stays
            # None when lightwood returned no confidence data for this column.
            per_row = None
            for confidence_name in confidence_names:
                if confidence_name not in column_output:
                    continue
                if per_row is None:
                    per_row = []
                for i, conf in enumerate(column_output[confidence_name]):
                    if len(per_row) <= i:
                        per_row.append([])
                    # @TODO We should make sure lightwood never returns confidences above or below 0 and 1
                    if conf < 0:
                        conf = 0
                    if conf > 1:
                        conf = 1
                    per_row[i].append(conf)

            if per_row is not None:
                formated_predictions[f'{k}_model_confidence'] = [
                    np.mean(row) for row in per_row
                ]

            if 'confidence_range' in column_output:
                formated_predictions[f'{k}_confidence_range'] = column_output[
                    'confidence_range']

        return formated_predictions
Пример #15
0
    def train(self):
        """Train a lightwood predictor and save it to the model's storage path.

        Data preparation depends on ``lmd``: ordered data is reshaped with
        ``_create_timeseries_df``; otherwise, when ``sample_for_training`` is
        enabled, both frames are reduced with the configured sample function.
        The evaluation interval shrinks as the training set grows and is kept
        within 3..200 epochs.
        """
        lmd = self.transaction.lmd
        log = self.transaction.log
        input_data = self.transaction.input_data

        gpu_flag = lmd['use_gpu']
        if gpu_flag is not None:
            lightwood.config.config.CONFIG.USE_CUDA = gpu_flag

        order_by = lmd['model_order_by']
        if order_by is not None and len(order_by) > 0:
            log.debug(
                'Reshaping data into timeseries format, this may take a while !'
            )
            train_df = self._create_timeseries_df(input_data.train_df)
            test_df = self._create_timeseries_df(input_data.test_df)
            log.debug('Done reshaping data into timeseries format !')
        elif lmd['sample_settings']['sample_for_training']:
            settings = lmd['sample_settings']
            # Positional arguments shared by both sampling calls.
            sampling_args = (
                settings['sample_margin_of_error'],
                settings['sample_confidence_level'],
                settings['sample_percentage'],
            )
            sample_function = self.transaction.hmd['sample_function']

            train_df = sample_function(input_data.train_df, *sampling_args)
            test_df = sample_function(input_data.test_df, *sampling_args)

            sample_size = len(train_df)
            population_size = len(input_data.train_df)
            sampled_pct = round(sample_size * 100 / population_size, 1)
            log.warning(
                f'Training on a sample of {sampled_pct}% your data, results can be unexpected.'
            )
        else:
            train_df = input_data.train_df
            test_df = input_data.test_df

        self.predictor = lightwood.Predictor(self._create_lightwood_config())

        # Evaluate less often for larger datasets and vice-versa,
        # clamped to the range [3, 200].
        raw_interval = int(round(1 * pow(10, 6) * (1 / len(train_df))))
        eval_every_x_epochs = max(3, min(raw_interval, 200))

        # Side effect: the root logger is switched to DEBUG.
        logging.getLogger().setLevel(logging.DEBUG)

        learn_kwargs = {
            'from_data': train_df,
            'test_data': test_df,
            'callback_on_iter': self.callback_on_iter,
            'eval_every_x_epochs': eval_every_x_epochs,
        }
        time_budget = lmd['stop_training_in_x_seconds']
        # The time limit is only forwarded when one was configured.
        if time_budget is not None:
            learn_kwargs['stop_training_after_seconds'] = time_budget
        self.predictor.learn(**learn_kwargs)

        accuracy = self.predictor.train_accuracy
        log.info('Training accuracy of: {}'.format(accuracy))

        save_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                 lmd['name'] + '_lightwood_data')
        lmd['lightwood_data']['save_path'] = save_path
        self.predictor.save(path_to=save_path)
Пример #16
0
 def train(self):
     """Fit a lightwood predictor on the train/test frames and print its training accuracy."""
     self.predictor = lightwood.Predictor(self._create_lightwood_config())
     input_data = self.transaction.input_data
     self.predictor.learn(
         from_data=input_data.train_df,
         test_data=input_data.test_df,
     )
     train_accuracy = self.predictor.train_accuracy
     print(train_accuracy)
Пример #17
0
    def train(self):
        """Train one predictor per candidate mixer and save the selected one.

        For every mixer class (from ``lmd['use_mixers']`` or, by default, the
        direct subclasses of ``lightwood.mixers.BaseMixer``) a predictor is
        trained and scored on the validation set with ``evaluate_accuracy``.
        The highest-scoring predictor is kept, except that an ``NnMixer``
        predictor is preferred when its accuracy is within 0.01 of the best.
        The chosen predictor ends up in ``self.predictor`` and is saved to
        ``<storage dir>/<model name>/lightwood_data``.
        """
        # Only override lightwood's CUDA setting when the user chose explicitly.
        if self.transaction.lmd['use_gpu'] is not None:
            lightwood.config.config.CONFIG.USE_CUDA = self.transaction.lmd[
                'use_gpu']

        secondary_type_dict = {}
        if self.transaction.lmd['tss']['is_timeseries']:
            self.transaction.log.debug(
                'Reshaping data into timeseries format, this may take a while !'
            )
            # Here _create_timeseries_df returns a pair; the second element
            # from the training frame is passed on to the lightwood config,
            # the one from the test frame is discarded.
            train_df, secondary_type_dict = self._create_timeseries_df(
                self.transaction.input_data.train_df)
            test_df, _ = self._create_timeseries_df(
                self.transaction.input_data.test_df)
            self.transaction.log.debug(
                'Done reshaping data into timeseries format !')
        else:
            if self.transaction.lmd['sample_settings']['sample_for_training']:
                # Reduce both frames with the user-supplied sample function.
                sample_margin_of_error = self.transaction.lmd[
                    'sample_settings']['sample_margin_of_error']
                sample_confidence_level = self.transaction.lmd[
                    'sample_settings']['sample_confidence_level']
                sample_percentage = self.transaction.lmd['sample_settings'][
                    'sample_percentage']
                sample_function = self.transaction.hmd['sample_function']

                train_df = sample_function(
                    self.transaction.input_data.train_df,
                    sample_margin_of_error, sample_confidence_level,
                    sample_percentage)

                test_df = sample_function(self.transaction.input_data.test_df,
                                          sample_margin_of_error,
                                          sample_confidence_level,
                                          sample_percentage)

                sample_size = len(train_df)
                population_size = len(self.transaction.input_data.train_df)

                self.transaction.log.warning(
                    f'Training on a sample of {round(sample_size * 100 / population_size, 1)}% your data, results can be unexpected.'
                )
            else:
                train_df = self.transaction.input_data.train_df
                test_df = self.transaction.input_data.test_df

        lightwood_config = self._create_lightwood_config(secondary_type_dict)

        # The data sources are built once and reused for every mixer below.
        # The test source is derived from the training source via make_child
        # (presumably so both share the same encoding — TODO confirm).
        lightwood_train_ds = lightwood.api.data_source.DataSource(
            train_df, config=lightwood_config)
        lightwood_test_ds = lightwood_train_ds.make_child(test_df)

        # Record the save location and make sure the per-model directory exists.
        self.transaction.lmd['lightwood_data']['save_path'] = os.path.join(
            CONFIG.MINDSDB_STORAGE_PATH, self.transaction.lmd['name'],
            'lightwood_data')
        Path(CONFIG.MINDSDB_STORAGE_PATH).joinpath(
            self.transaction.lmd['name']).mkdir(mode=0o777,
                                                exist_ok=True,
                                                parents=True)

        # Side effect: the root logger is switched to DEBUG.
        logging.getLogger().setLevel(logging.DEBUG)

        # (predictor, validation accuracy) for every mixer tried.
        predictors_and_accuracies = []

        # 'use_mixers' may be a single mixer class or a list of them; when it
        # is absent/None every direct subclass of BaseMixer is tried.
        use_mixers = self.transaction.lmd.get('use_mixers', None)
        if use_mixers is not None:
            if isinstance(use_mixers, list):
                mixer_classes = use_mixers
            else:
                mixer_classes = [use_mixers]
        else:
            mixer_classes = lightwood.mixers.BaseMixer.__subclasses__()

        for mixer_class in mixer_classes:
            # Start each mixer from a fresh kwargs dict.
            lightwood_config['mixer']['kwargs'] = {}
            lightwood_config['mixer']['class'] = mixer_class

            # Only NnMixer receives the callback / evaluation / time settings.
            if lightwood_config['mixer']['class'] == lightwood.mixers.NnMixer:
                # Evaluate less often for larger datasets and vice-versa
                eval_every_x_epochs = int(
                    round(1 * pow(10, 6) * (1 / len(train_df))))

                # Within some limits
                if eval_every_x_epochs > 200:
                    eval_every_x_epochs = 200
                if eval_every_x_epochs < 3:
                    eval_every_x_epochs = 3

                kwargs = lightwood_config['mixer']['kwargs']

                kwargs['callback_on_iter'] = self.callback_on_iter
                # NOTE(review): true division, so this value is a float once
                # divided by the number of mixers — confirm lightwood accepts
                # a non-integer epoch interval.
                kwargs['eval_every_x_epochs'] = eval_every_x_epochs / len(
                    mixer_classes)
                kwargs['stop_training_after_seconds'] = self.transaction.lmd[
                    'stop_training_in_x_seconds']

            # NOTE(review): if lightwood_config is a plain dict, .copy() is
            # shallow, so the nested 'mixer' dict is shared between the copies
            # handed to successive predictors — confirm that is acceptable.
            self.predictor = lightwood.Predictor(lightwood_config.copy())

            self.predictor.learn(from_data=lightwood_train_ds,
                                 test_data=lightwood_test_ds)

            self.transaction.log.info('[{}] Training accuracy of: {}'.format(
                mixer_class.__name__, self.predictor.train_accuracy))

            # self.predict reads self.predictor, i.e. the predictor trained
            # in this iteration.
            validation_predictions = self.predict('validate')
            # For timeseries data only the validation rows whose
            # 'make_predictions' flag is truthy are scored.
            validation_accuracy = evaluate_accuracy(
                validation_predictions,
                self.transaction.input_data.validation_df[
                    self.transaction.input_data.
                    validation_df['make_predictions'].astype(bool) == True]
                if self.transaction.lmd['tss']['is_timeseries'] else
                self.transaction.input_data.validation_df,
                self.transaction.lmd['stats_v2'],
                self.transaction.lmd['predict_columns'],
                backend=self,
                use_conf_intervals=False  # r2_score will be used for regression
            )

            predictors_and_accuracies.append(
                (self.predictor, validation_accuracy))

        # Highest validation accuracy wins; max() raises ValueError if no
        # mixer was tried at all.
        best_predictor, best_accuracy = max(predictors_and_accuracies,
                                            key=lambda x: x[1])

        # Find predictor with NnMixer
        # (for/else: the else branch runs only when no NnMixer was found).
        for predictor, accuracy in predictors_and_accuracies:
            if isinstance(predictor._mixer, lightwood.mixers.NnMixer):
                nn_mixer_predictor, nn_mixer_predictor_accuracy = predictor, accuracy
                break
        else:
            nn_mixer_predictor, nn_mixer_predictor_accuracy = None, None

        self.predictor = best_predictor

        # If difference between accuracies of best predictor and NnMixer predictor
        # is small, then use NnMixer predictor
        if nn_mixer_predictor is not None:
            SMALL_ACCURACY_DIFFERENCE = 0.01
            if (best_accuracy -
                    nn_mixer_predictor_accuracy) < SMALL_ACCURACY_DIFFERENCE:
                self.predictor = nn_mixer_predictor

        # Persist only the selected predictor.
        self.predictor.save(
            path_to=self.transaction.lmd['lightwood_data']['save_path'])