Example #1
    def predict(self, when_data=None, when=None):
        """
        Predict given when conditions
        :param when_data: a dataframe
        :param when: a dictionary
        :return: a complete dataframe
        """
        if when is not None:
            when_dict = {key: [when[key]] for key in when}
            when_data = pandas.DataFrame(when_dict)

        when_data_ds = DataSource(when_data, self.config)
        when_data_ds.encoders = self._mixer.encoders

        main_mixer_predictions = self._mixer.predict(when_data_ds)

        if CONFIG.HELPER_MIXERS and self.has_boosting_mixer:
            for output_column in main_mixer_predictions:
                if self._helper_mixers is not None and output_column in self._helper_mixers:
                    if (self._helper_mixers[output_column]['accuracy'] >
                            1.00 * self.train_accuracy[output_column]['value']
                        ) or CONFIG.FORCE_HELPER_MIXERS:
                        helper_mixer_predictions = self._helper_mixers[
                            output_column]['model'].predict(
                                when_data_ds, [output_column])

                        main_mixer_predictions[
                            output_column] = helper_mixer_predictions[
                                output_column]

        return main_mixer_predictions
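
A minimal usage sketch of the method above (hedged: `predictor`, the column names and the values are illustrative assumptions, not part of the original source):

    # Hypothetical usage: predict a single row from a `when` dict;
    # `when` maps column names to scalar values.
    predictions = predictor.predict(when={'x': 3, 'y': 8})
    print(predictions['z'])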
Example #2
    def test_prepare_encoders(self):
        df, config = self.df, self.config
        ds = DataSource(df, config)
        assert ds.enable_cache

        encoders = ds.encoders

        for col in ['x1', 'x2']:
            assert isinstance(encoders[col], NumericEncoder)
            assert encoders[col]._prepared is True
            assert encoders[col].is_target is False
            assert encoders[col]._type == 'int'

        assert isinstance(encoders['y'], CategoricalAutoEncoder)
        assert encoders['y']._prepared is True
        assert encoders['y'].is_target is True
        assert encoders['y'].onehot_encoder._prepared is True
        assert encoders['y'].onehot_encoder.is_target is True
        assert encoders['y'].use_autoencoder is False

        encoded_column_x1 = ds.get_encoded_column_data('x1')
        assert isinstance(encoded_column_x1, Tensor)
        assert encoded_column_x1.shape[0] == len(df)
        encoded_column_x2 = ds.get_encoded_column_data('x2')
        assert isinstance(encoded_column_x2, Tensor)
        assert encoded_column_x2.shape[0] == len(df)
        encoded_column_y = ds.get_encoded_column_data('y')
        assert isinstance(encoded_column_y, Tensor)
        assert encoded_column_y.shape[0] == len(df)
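
For context, a plausible setUp fixture for this test might look like the following sketch; the real fixture lives in the test class, and the row count and exact values here are assumptions inferred from the assertions above:

    def setUp(self):
        # Two numeric (int) input columns and one categorical target,
        # matching the encoder assertions in the tests.
        self.df = pandas.DataFrame({
            'x1': [i for i in range(100)],
            'x2': [i * 2 for i in range(100)],
            'y': ['low' if i < 50 else 'high' for i in range(100)]
        })
        self.config = predictor_config_schema.validate({
            'input_features': [
                {'name': 'x1', 'type': 'numeric'},
                {'name': 'x2', 'type': 'numeric'}
            ],
            'output_features': [
                {'name': 'y', 'type': 'categorical'}
            ]
        })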
Example #3
    def test_encoded_cache(self):
        df, config = self.df, self.config

        ds = DataSource(df, config)
        assert ds.enable_cache

        for column in ['x1', 'x2', 'y']:
            assert column not in ds.encoded_cache
            encoded_column = ds.get_encoded_column_data(column)
            assert (ds.encoded_cache[column] == encoded_column).all()
Example #4
    def predict(self, when_data=None, when=None):
        """
        Predict given when conditions
        :param when_data: a dataframe
        :param when: a dictionary
        :return: a complete dataframe
        """

        if when is not None:
            when_dict = {key: [when[key]] for key in when}
            when_data = pandas.DataFrame(when_dict)

        when_data_ds = DataSource(when_data, self.config)
        when_data_ds.encoders = self._mixer.encoders

        return self._mixer.predict(when_data_ds)
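
A sketch of batch prediction with this simpler variant (hedged: `predictor` and the column names are illustrative assumptions):

    # Hypothetical usage: predict several rows at once via `when_data`.
    when_df = pandas.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]})
    predictions = predictor.predict(when_data=when_df)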
Example #5
    def learn(self, from_data, test_data=None):
        """
        Train and save a model (you can use this to retrain model from data).

        :param from_data: DataFrame or DataSource
            The data to learn from

        :param test_data: DataFrame or DataSource
            The data to test accuracy and learn_error from
        """
        device, _available_devices = get_devices()
        log.info(f'Computing device used: {device}')
        # generate the configuration and set the order for the input and output columns
        if self._generate_config is True:
            self._input_columns = [col for col in from_data if col not in self._output_columns]
            self.config = {
                'input_features': [{'name': col, 'type': self._type_map(from_data, col)} for col in self._input_columns],
                'output_features': [{'name': col, 'type': self._type_map(from_data, col)} for col in self._output_columns]
            }
            self.config = predictor_config_schema.validate(self.config)
            log.info('Automatically generated a configuration')
            log.info(self.config)
        else:
            self._output_columns = [col['name'] for col in self.config['output_features']]
            self._input_columns = [col['name'] for col in self.config['input_features']]

        if isinstance(from_data, pandas.DataFrame):
            train_ds = DataSource(from_data, self.config)
        elif isinstance(from_data, DataSource):
            train_ds = from_data
        else:
            raise TypeError(':from_data: must be either DataFrame or DataSource')

        nr_subsets = 3 if len(train_ds) > 100 else 1

        if test_data is None:
            test_ds = train_ds.subset(0.1)
        elif isinstance(test_data, pandas.DataFrame):
            test_ds = train_ds.make_child(test_data)
        elif isinstance(test_data, DataSource):
            test_ds = test_data
        else:
            raise TypeError(':test_data: must be either DataFrame or DataSource')

        train_ds.create_subsets(nr_subsets)
        test_ds.create_subsets(nr_subsets)

        train_ds.train()
        test_ds.train()

        mixer_class = self.config['mixer']['class']
        mixer_kwargs = self.config['mixer']['kwargs']
        self._mixer = mixer_class(**mixer_kwargs)
        self._mixer.fit(train_ds=train_ds, test_ds=test_ds)
        self.train_accuracy = self._mixer.calculate_accuracy(test_ds)

        return self
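
An end-to-end sketch of this learn flow (hedged: the constructor call and the data are illustrative assumptions; `output` names the target columns so the configuration can be auto-generated):

    # Hypothetical usage: train on a DataFrame, then inspect accuracy.
    df = pandas.DataFrame({
        'x': [i for i in range(200)],
        'y': [i * 2 for i in range(200)],
        'z': [i * 3 + 0.5 for i in range(200)]
    })
    predictor = Predictor(output=['z'])
    predictor.learn(from_data=df)
    print(predictor.train_accuracy)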
Example #6
    def test_fit_and_predict(self):
        config = {
            'input_features': [
                {
                    'name': 'x',
                    'type': 'numeric'
                },
                {
                    'name': 'y',
                    'type': 'numeric'
                }
            ],

            'output_features': [
                {
                    'name': 'z',
                    'type': 'numeric'
                },
                {
                    'name': 'z`',
                    'type': 'categorical'
                }
            ]
        }
        config = predictor_config_schema.validate(config)

        N = 100

        data = {'x': [i for i in range(N)], 'y': [random.randint(i, i + 20) for i in range(N)]}
        nums = [data['x'][i] * data['y'][i] for i in range(N)]

        data['z'] = [i + 0.5 for i in range(N)]
        data['z`'] = ['low' if i < 50 else 'high' for i in nums]

        data_frame = pandas.DataFrame(data)
        train_ds = DataSource(data_frame, config)
        train_ds.create_subsets(1)

        mixer = NnMixer(stop_training_after_seconds=50)
        mixer.fit(train_ds, train_ds)

        test_ds = train_ds.make_child(data_frame[['x', 'y']])
        predictions = mixer.predict(test_ds)
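
The test stops at generating predictions; a few follow-up assertions one might add (a sketch, assuming the prediction structure used elsewhere in this file: a dict keyed by output column, each entry holding a 'predictions' list):

    # Hypothetical checks on the prediction structure.
    assert 'z' in predictions and 'z`' in predictions
    assert len(predictions['z']['predictions']) == N
    assert len(predictions['z`']['predictions']) == N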
Example #7
    def calculate_accuracy(self, from_data):
        """
        calculates the accuracy of the model
        :param from_data:a dataframe
        :return accuracies: dictionaries of accuracies
        """

        if self._mixer is None:
            logging.error("Please train the model before calculating accuracy")
            return
        ds = from_data if isinstance(from_data, DataSource) else DataSource(
            from_data, self.config)
        predictions = self._mixer.predict(ds, include_extra_data=True)
        accuracies = {}

        for output_column in self._output_columns:

            real = list(map(str, ds.get_column_original_data(output_column)))
            predicted = list(
                map(str, predictions[output_column]['predictions']))

            weight_map = None
            if 'weights' in ds.get_column_config(output_column):
                weight_map = ds.get_column_config(output_column)['weights']

            accuracy = self.apply_accuracy_function(
                ds.get_column_config(output_column)['type'],
                real,
                predicted,
                weight_map=weight_map)

            if ds.get_column_config(output_column)['type'] == COLUMN_DATA_TYPES.NUMERIC:
                ds.encoders[output_column].decode_log = True
                predicted = ds.get_decoded_column_data(
                    output_column,
                    predictions[output_column]['encoded_predictions'])

                alternative_accuracy = self.apply_accuracy_function(
                    ds.get_column_config(output_column)['type'],
                    real,
                    predicted,
                    weight_map=weight_map)

                if alternative_accuracy['value'] > accuracy['value']:
                    accuracy = alternative_accuracy
                else:
                    ds.encoders[output_column].decode_log = False

            accuracies[output_column] = accuracy

        return accuracies
Example #8
    def predict(self, when_data=None, when=None):
        """
        Predict given when conditions.

        :param when_data: pandas.DataFrame
        :param when: dict

        :return: pandas.DataFrame
        """
        device, _available_devices = get_devices()
        log.info(f'Computing device used: {device}')
        if when is not None:
            when_dict = {key: [when[key]] for key in when}
            when_data = pandas.DataFrame(when_dict)

        when_data_ds = DataSource(when_data, self.config, prepare_encoders=False)

        when_data_ds.eval()

        kwargs = {'include_extra_data': self.config.get('include_extra_data', False)}

        return self._mixer.predict(when_data_ds, **kwargs)
Example #9
    def test_transformed_cache(self):
        df, config = self.df, self.config

        ds = DataSource(df, config)
        assert ds.enable_cache

        assert ds.transformed_cache is None
        encoded_row = ds[0]  # This creates ds.transformed_cache
        assert len(ds.transformed_cache) == len(df)
        assert ds.transformed_cache[0] == encoded_row

        for i in range(1, len(df)):
            assert ds.transformed_cache[i] is None
            encoded_row = ds[i]
            assert ds.transformed_cache[i] == encoded_row

        alternate_config = copy(config)
        alternate_config['data_source']['cache_transformed_data'] = False
        ds = DataSource(df, alternate_config)
        assert not ds.enable_cache

        for i in range(len(df)):
            encoded_row = ds[i]
            assert ds.transformed_cache is None
Example #10
    def calculate_accuracy(self, from_data):
        """
        calculates the accuracy of the model
        :param from_data:a dataframe
        :return accuracies: dictionaries of accuracies
        """
        if self._mixer is None:
            logging.error("Please train the model before calculating accuracy")
            return
        ds = from_data if isinstance(from_data, DataSource) else DataSource(
            from_data, self.config)
        predictions = self._mixer.predict(ds, include_encoded_predictions=True)
        accuracies = {}
        for output_column in self._mixer.output_column_names:
            properties = ds.get_column_config(output_column)
            if properties['type'] == 'categorical':
                accuracies[output_column] = {
                    'function': 'accuracy_score',
                    'value': accuracy_score(
                        list(map(str, ds.get_column_original_data(output_column))),
                        list(map(str, predictions[output_column]['predictions'])))
                }
            else:
                # Note: we re-encode the predictions instead of using `encoded_predictions`
                # directly, since the values in `encoded_predictions` are never perfectly
                # 0 or 1, which leads to a rather large, unwarranted difference in the r2
                # score. Re-encoding means all "flag" values (sign, isnull, iszero) become
                # exactly 1 or 0.

                encoded_predictions = ds.encoders[output_column].encode(
                    predictions[output_column]["predictions"])
                accuracies[output_column] = {
                    'function': 'r2_score',
                    'value': r2_score(
                        ds.get_encoded_column_data(output_column),
                        encoded_predictions)
                }

        return accuracies
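
A sketch of consuming the returned structure (hedged: `predictor` and `test_df` are illustrative names; the per-column dict shape follows directly from the code above):

    # Hypothetical usage: each entry holds a metric name and its value.
    accuracies = predictor.calculate_accuracy(test_df)
    for column, result in accuracies.items():
        print(column, result['function'], result['value'])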
Example #11
    def calculate_accuracy(self, from_data):
        """
        calculates the accuracy of the model
        :param from_data:a dataframe
        :return accuracies: dictionaries of accuracies
        """
        if self._mixer is None:
            logging.log.error("Please train the model before calculating accuracy")
            return
        ds = from_data if isinstance(from_data, DataSource) else DataSource(from_data, self.config)
        predictions = self._mixer.predict(ds, include_encoded_predictions=True)
        accuracies = {}
        for output_column in self._mixer.output_column_names:
            properties = ds.get_column_config(output_column)
            if properties['type'] == 'categorical':
                accuracies[output_column] = accuracy_score(ds.get_column_original_data(output_column), predictions[output_column]["predictions"])

            else:
                accuracies[output_column] = r2_score(ds.get_encoded_column_data(output_column), predictions[output_column]["encoded_predictions"])

        return accuracies
Example #12
    def learn(self,
              from_data,
              test_data=None,
              callback_on_iter=None,
              eval_every_x_epochs=20,
              stop_training_after_seconds=None,
              stop_model_building_after_seconds=None):
        """
        Train and save a model (you can use this to retrain model from data)

        :param from_data: (Pandas DataFrame) The data to learn from
        :param test_data: (Pandas DataFrame) The data to test accuracy and learn_error from
        :param callback_on_iter: This is function that can be called on every X evaluation cycle
        :param eval_every_x_epochs: This is every how many epochs we want to calculate the test error and accuracy

        :return: None
        """

        # This is a helper function that auto-determines, roughly, the data type of each column.
        # NOTE: this assumes the data is clean and will only return 'CATEGORICAL', 'NUMERIC' or 'TEXT'.
        def type_map(col_name):
            col_pd_type = from_data[col_name].dtype
            col_pd_type = str(col_pd_type)

            if col_pd_type in ['int64', 'float64', 'timedelta']:
                return COLUMN_DATA_TYPES.NUMERIC
            elif col_pd_type in ['bool', 'category']:
                return COLUMN_DATA_TYPES.CATEGORICAL
            else:
                # if the number of unique values is less than 100, or less than 10% of the total number of rows, keep it as categorical
                unique = from_data[col_name].nunique()
                if unique < 100 or unique < len(from_data[col_name]) / 10:
                    return COLUMN_DATA_TYPES.CATEGORICAL
                # otherwise assume it's text
                return COLUMN_DATA_TYPES.TEXT

        # generate the configuration and set the order for the input and output columns
        if self._generate_config is True:
            self._input_columns = [
                col for col in from_data if col not in self._output_columns
            ]
            self.config = {
                'input_features': [{
                    'name': col,
                    'type': type_map(col)
                } for col in self._input_columns],
                'output_features': [{
                    'name': col,
                    'type': type_map(col)
                } for col in self._output_columns]
            }
            logging.info('Automatically generated a configuration')
            logging.info(self.config)
        else:
            self._output_columns = [
                col['name'] for col in self.config['output_features']
            ]
            self._input_columns = [
                col['name'] for col in self.config['input_features']
            ]

        # @TODO Make Cross Entropy Loss work with multiple outputs
        if (len(self.config['output_features']) == 1
                and self.config['output_features'][0]['type'] == COLUMN_DATA_TYPES.CATEGORICAL):
            is_categorical_output = True
        else:
            is_categorical_output = False

        if stop_training_after_seconds is None:
            stop_training_after_seconds = round(from_data.shape[0] *
                                                from_data.shape[1] / 5)

        if stop_model_building_after_seconds is None:
            stop_model_building_after_seconds = stop_training_after_seconds * 3

        from_data_ds = DataSource(from_data, self.config)

        if test_data is not None:
            test_data_ds = DataSource(test_data, self.config)
        else:
            test_data_ds = from_data_ds.extractRandomSubset(0.1)

        from_data_ds.training = True

        mixer_class = NnMixer
        mixer_params = {}

        if 'mixer' in self.config:
            if 'class' in self.config['mixer']:
                mixer_class = self.config['mixer']['class']
            if 'attrs' in self.config['mixer']:
                mixer_params = self.config['mixer']['attrs']

        # Initialize data sources
        nr_subsets = 3
        from_data_ds.prepare_encoders()
        from_data_ds.create_subsets(nr_subsets)

        try:
            mixer_class({}).fit_data_source(from_data_ds)
        except Exception:
            # Not all mixers require this step
            pass

        input_size = len(from_data_ds[0][0])
        training_data_length = len(from_data_ds)

        test_data_ds.transformer = from_data_ds.transformer
        test_data_ds.encoders = from_data_ds.encoders
        test_data_ds.output_weights = from_data_ds.output_weights
        test_data_ds.create_subsets(nr_subsets)

        if 'optimizer' in self.config:
            optimizer = self.config['optimizer']()

            while True:
                training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

                # Some heuristics...
                if training_time_per_iteration > input_size:
                    if training_time_per_iteration > min(
                        (training_data_length /
                         (4 * input_size)), 16 * input_size):
                        break

                optimizer.total_trials = optimizer.total_trials - 1
                if optimizer.total_trials < 8:
                    optimizer.total_trials = 8
                    break

            training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

            best_parameters = optimizer.evaluate(
                lambda dynamic_parameters: Predictor.evaluate_mixer(
                    mixer_class,
                    mixer_params,
                    from_data_ds,
                    test_data_ds,
                    dynamic_parameters,
                    is_categorical_output,
                    max_training_time=training_time_per_iteration,
                    max_epochs=None))
            logging.info('Using hyperparameter set: %s', best_parameters)
        else:
            best_parameters = {}

        mixer = mixer_class(best_parameters,
                            is_categorical_output=is_categorical_output)
        self._mixer = mixer

        for param in mixer_params:
            if hasattr(mixer, param):
                setattr(mixer, param, mixer_params[param])
            else:
                logging.warning(
                    'Trying to set mixer param {param} but mixer class {mixerclass} has no such parameter'
                    .format(param=param, mixerclass=str(type(mixer))))

        started = time.time()
        epoch = 0
        eval_next_on_epoch = eval_every_x_epochs

        stop_training = False

        for subset_iteration in [1, 2]:
            if stop_training:
                break
            for subset_id in [*from_data_ds.subsets.keys()]:
                if stop_training:
                    break

                subset_train_ds = from_data_ds.subsets[subset_id]
                subset_test_ds = test_data_ds.subsets[subset_id]

                lowest_error = None
                last_test_error = None
                last_subset_test_error = None
                test_error_delta_buff = []
                subset_test_error_delta_buff = []
                best_model = None

                # iterate over iter_fit and track the epoch and mixer error
                for epoch, training_error in enumerate(
                        mixer.iter_fit(subset_train_ds)):
                    logging.info(
                        'training iteration {iter_i}, error {error}'.format(
                            iter_i=epoch, error=training_error))

                    if epoch >= eval_next_on_epoch:
                        # Prime the model on each subset for a bit
                        if subset_iteration == 1:
                            break

                        eval_next_on_epoch += eval_every_x_epochs

                        test_error = mixer.error(test_data_ds)
                        subset_test_error = mixer.error(subset_test_ds)

                        if lowest_error is None or test_error < lowest_error:
                            lowest_error = test_error
                            best_model = mixer.get_model_copy()

                        if last_subset_test_error is None:
                            subset_test_error_delta_buff.append(0)
                        else:
                            subset_test_error_delta_buff.append(
                                last_subset_test_error - subset_test_error)

                        if last_test_error is None:
                            test_error_delta_buff.append(0)
                        else:
                            test_error_delta_buff.append(last_test_error -
                                                         test_error)

                        last_test_error = test_error

                        delta_mean = np.mean(test_error_delta_buff[-10:])
                        subset_delta_mean = np.mean(
                            subset_test_error_delta_buff[-10:])

                        if callback_on_iter is not None:
                            callback_on_iter(
                                epoch, training_error, test_error, delta_mean,
                                self.calculate_accuracy(test_data_ds))

                        ## Stop if the model is overfitting
                        if delta_mean < 0 and len(test_error_delta_buff) > 9:
                            stop_training = True

                        # Stop if we're past the time limit allocated for training
                        if (time.time() -
                                started) > stop_training_after_seconds:
                            stop_training = True

                        # If the training subset is overfitting on its associated testing subset
                        if subset_delta_mean < 0 and len(
                                subset_test_error_delta_buff) > 9:
                            break

                        if stop_training:
                            mixer.update_model(best_model)
                            self._mixer = mixer
                            self.train_accuracy = self.calculate_accuracy(
                                test_data_ds)
                            self.overall_certainty = mixer.overall_certainty()
                            if subset_id == 'full':
                                logging.info('Finished training model !')
                            else:
                                logging.info(
                                    'Finished fitting on {subset_id} of {no_subsets} subsets'
                                    .format(subset_id=subset_id,
                                            no_subsets=len(
                                                from_data_ds.subsets.keys())))
                            break

        self._mixer.encoders = from_data_ds.encoders
        return self
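
A sketch of wiring in a progress callback (hedged: `predictor` and `train_df` are illustrative names; the five-argument signature matches the call site inside learn above):

    # Hypothetical usage: log progress on every evaluation cycle.
    def on_eval(epoch, training_error, test_error, delta_mean, accuracy):
        print(f'epoch={epoch} train={training_error} test={test_error}')

    predictor.learn(from_data=train_df, callback_on_iter=on_eval,
                    eval_every_x_epochs=10)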
Example #13
    def learn(self,
              from_data,
              test_data=None,
              callback_on_iter=None,
              eval_every_x_epochs=20,
              stop_training_after_seconds=None,
              stop_model_building_after_seconds=None):
        """
        Train and save a model (you can use this to retrain model from data)

        :param from_data: (Pandas DataFrame) The data to learn from
        :param test_data: (Pandas DataFrame) The data to test accuracy and learn_error from
        :param callback_on_iter: This is function that can be called on every X evaluation cycle
        :param eval_every_x_epochs: This is every how many epochs we want to calculate the test error and accuracy

        :return: None
        """

        # This is a helper function that auto-determines, roughly, the data type of each column.
        # NOTE: this assumes the data is clean and will only return 'CATEGORICAL', 'NUMERIC' or 'TEXT'.
        def type_map(col_name):
            col_pd_type = from_data[col_name].dtype
            col_pd_type = str(col_pd_type)

            if col_pd_type in ['int64', 'float64', 'timedelta']:
                return COLUMN_DATA_TYPES.NUMERIC
            elif col_pd_type in ['bool', 'category']:
                return COLUMN_DATA_TYPES.CATEGORICAL
            else:
                # if the number of unique values is less than 100, or less than
                # 10% of the total number of rows, keep it as categorical
                unique = from_data[col_name].nunique()
                if unique < 100 or unique < len(from_data[col_name]) / 10:
                    return COLUMN_DATA_TYPES.CATEGORICAL
                # otherwise assume it's text
                return COLUMN_DATA_TYPES.TEXT

        # generate the configuration and set the order for the input and output columns
        if self._generate_config is True:
            self._input_columns = [
                col for col in from_data if col not in self._output_columns
            ]
            self.config = {
                'input_features': [{
                    'name': col,
                    'type': type_map(col)
                } for col in self._input_columns],
                'output_features': [{
                    'name': col,
                    'type': type_map(col)
                } for col in self._output_columns]
            }
            self.config = predictor_config_schema.validate(self.config)
            logging.info('Automatically generated a configuration')
            logging.info(self.config)
        else:
            self._output_columns = [
                col['name'] for col in self.config['output_features']
            ]
            self._input_columns = [
                col['name'] for col in self.config['input_features']
            ]

        if stop_training_after_seconds is None:
            stop_training_after_seconds = round(from_data.shape[0] *
                                                from_data.shape[1] / 5)

        if stop_model_building_after_seconds is None:
            stop_model_building_after_seconds = stop_training_after_seconds * 3

        from_data_ds = DataSource(from_data, self.config)

        if test_data is not None:
            test_data_ds = DataSource(test_data, self.config)
        else:
            test_data_ds = from_data_ds.extractRandomSubset(0.1)

        from_data_ds.training = True

        mixer_class = NnMixer
        mixer_params = {}

        if 'mixer' in self.config:
            if 'class' in self.config['mixer']:
                mixer_class = self.config['mixer']['class']
            if 'attrs' in self.config['mixer']:
                mixer_params = self.config['mixer']['attrs']

        # Initialize data sources
        if len(from_data_ds) > 100:
            nr_subsets = 3
        else:
            # Don't use k-fold cross validation for very small input sizes
            nr_subsets = 1

        from_data_ds.prepare_encoders()
        from_data_ds.create_subsets(nr_subsets)
        try:
            mixer_class({}).fit_data_source(from_data_ds)
        except Exception:
            # Not all mixers require this step
            pass

        input_size = len(from_data_ds[0][0])
        training_data_length = len(from_data_ds)

        test_data_ds.transformer = from_data_ds.transformer
        test_data_ds.encoders = from_data_ds.encoders
        test_data_ds.output_weights = from_data_ds.output_weights
        test_data_ds.create_subsets(nr_subsets)

        if 'optimizer' in self.config:
            optimizer = self.config['optimizer']()

            while True:
                training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

                # Some heuristics...
                if training_time_per_iteration > input_size:
                    if training_time_per_iteration > min(
                        (training_data_length /
                         (4 * input_size)), 16 * input_size):
                        break

                optimizer.total_trials = optimizer.total_trials - 1
                if optimizer.total_trials < 8:
                    optimizer.total_trials = 8
                    break

            training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

            best_parameters = optimizer.evaluate(
                lambda dynamic_parameters: Predictor.evaluate_mixer(
                    self.config,
                    mixer_class,
                    mixer_params,
                    from_data_ds,
                    test_data_ds,
                    dynamic_parameters,
                    max_training_time=training_time_per_iteration,
                    max_epochs=None))

            logging.info('Using hyperparameter set: %s', best_parameters)
        else:
            best_parameters = {}

        self._mixer = mixer_class(best_parameters, self.config)

        for param in mixer_params:
            if hasattr(self._mixer, param):
                setattr(self._mixer, param, mixer_params[param])
            else:
                logging.warning(
                    'Trying to set mixer param {param} but mixer class {mixerclass} has no such parameter'
                    .format(param=param, mixerclass=str(type(self._mixer))))

        def callback_on_iter_w_acc(epoch, training_error, test_error,
                                   delta_mean):
            callback_on_iter(epoch, training_error, test_error, delta_mean,
                             self.calculate_accuracy(test_data_ds))

        self._mixer.fit(
            train_ds=from_data_ds,
            test_ds=test_data_ds,
            callback=callback_on_iter_w_acc,
            stop_training_after_seconds=stop_training_after_seconds,
            eval_every_x_epochs=eval_every_x_epochs)
        self.train_accuracy = self.calculate_accuracy(test_data_ds)

        # Train some alternative mixers
        if CONFIG.HELPER_MIXERS and self.has_boosting_mixer and (
                CONFIG.FORCE_HELPER_MIXERS
                or len(from_data_ds) < 12 * pow(10, 3)):
            try:
                self._helper_mixers = self.train_helper_mixers(
                    from_data_ds, test_data_ds,
                    self._mixer.quantiles[self._mixer.quantiles_pair[0] +
                                          1:self._mixer.quantiles_pair[1] + 1])
            except Exception as e:
                logging.warning(
                    f'Failed to train helper mixers with error: {e}')

        return self
Example #14
        'output_features': [{
            'name': 'z',
            'type': 'categorical',
            # 'encoder_path': 'lightwood.encoders.categorical.categorical'
        }]
    }
    data = {
        'x': [i for i in range(10)],
        'y': [random.randint(i, i + 20) for i in range(10)]
    }
    nums = [data['x'][i] * data['y'][i] for i in range(10)]
    data['z'] = ['low' if i < 50 else 'high' for i in nums]
    data_frame = pandas.DataFrame(data)
    print(data_frame)

    ds = DataSource(data_frame, config)
    input_ds_for_prediction = DataSource(data_frame[['x', 'y']], config)

    mixer = SkLearnMixer(input_column_names=['x', 'y'],
                         output_column_names=['z'])
    for i in mixer.iter_fit(ds):
        print('training')

    data_encoded = mixer.fit(ds)
    predictions = mixer.predict(input_ds_for_prediction, ['z'])
    print(predictions)

    #####################################
    # For Regression                    #
    # Test Case: 2                      #
    #####################################
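
The regression test case itself is truncated here; a sketch of how it plausibly continues, mirroring the classification case above with a numeric target (everything below is an assumption, including the config change):

    # Hypothetical continuation: same pattern with a numeric output 'z',
    # assuming `config` is adjusted so 'z' has type 'numeric'.
    data = {
        'x': [i for i in range(10)],
        'y': [random.randint(i, i + 20) for i in range(10)]
    }
    data['z'] = [data['x'][i] * data['y'][i] for i in range(10)]
    data_frame = pandas.DataFrame(data)

    ds = DataSource(data_frame, config)
    mixer = SkLearnMixer(input_column_names=['x', 'y'],
                         output_column_names=['z'])
    mixer.fit(ds)
    predictions = mixer.predict(DataSource(data_frame[['x', 'y']], config), ['z'])
    print(predictions)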
Example #15
    def learn(self,
              from_data,
              test_data=None,
              callback_on_iter=None,
              eval_every_x_epochs=20,
              stop_training_after_seconds=None,
              stop_model_building_after_seconds=None):
        """
        Train and save a model (you can use this to retrain model from data)

        :param from_data: (Pandas DataFrame) The data to learn from
        :param test_data: (Pandas DataFrame) The data to test accuracy and learn_error from
        :param callback_on_iter: This is function that can be called on every X evaluation cycle
        :param eval_every_x_epochs: This is every how many epochs we want to calculate the test error and accuracy

        :return: None
        """

        # This is a helper function that auto-determines, roughly, the data type of each column.
        # NOTE: this assumes the data is clean and will only return 'CATEGORICAL', 'NUMERIC' or 'TEXT'.
        def type_map(col_name):
            col_pd_type = from_data[col_name].dtype
            col_pd_type = str(col_pd_type)

            if col_pd_type in ['int64', 'float64', 'timedelta']:
                return COLUMN_DATA_TYPES.NUMERIC
            elif col_pd_type in ['bool', 'category']:
                return COLUMN_DATA_TYPES.CATEGORICAL
            else:
                # if the number of unique values is less than 100, or less than
                # 10% of the total number of rows, keep it as categorical
                unique = from_data[col_name].nunique()
                if unique < 100 or unique < len(from_data[col_name]) / 10:
                    return COLUMN_DATA_TYPES.CATEGORICAL
                # otherwise assume it's text
                return COLUMN_DATA_TYPES.TEXT

        # generate the configuration and set the order for the input and output columns
        if self._generate_config is True:
            self._input_columns = [
                col for col in from_data if col not in self._output_columns
            ]
            self.config = {
                'input_features': [{
                    'name': col,
                    'type': type_map(col)
                } for col in self._input_columns],
                'output_features': [{
                    'name': col,
                    'type': type_map(col)
                } for col in self._output_columns]
            }
            self.config = predictor_config_schema.validate(self.config)
            logging.info('Automatically generated a configuration')
            logging.info(self.config)
        else:
            self._output_columns = [
                col['name'] for col in self.config['output_features']
            ]
            self._input_columns = [
                col['name'] for col in self.config['input_features']
            ]

        if stop_training_after_seconds is None:
            stop_training_after_seconds = round(from_data.shape[0] *
                                                from_data.shape[1] / 5)

        if stop_model_building_after_seconds is None:
            stop_model_building_after_seconds = stop_training_after_seconds * 3

        from_data_ds = DataSource(from_data, self.config)

        if test_data is not None:
            test_data_ds = DataSource(test_data, self.config)
        else:
            test_data_ds = from_data_ds.extractRandomSubset(0.1)

        from_data_ds.training = True

        mixer_class = NnMixer
        mixer_params = {}

        if 'mixer' in self.config:
            if 'class' in self.config['mixer']:
                mixer_class = self.config['mixer']['class']
            if 'attrs' in self.config['mixer']:
                mixer_params = self.config['mixer']['attrs']

        # Initialize data sources
        nr_subsets = 3
        from_data_ds.prepare_encoders()
        from_data_ds.create_subsets(nr_subsets)
        try:
            mixer_class({}).fit_data_source(from_data_ds)
        except Exception:
            # Not all mixers require this step
            pass

        input_size = len(from_data_ds[0][0])
        training_data_length = len(from_data_ds)

        test_data_ds.transformer = from_data_ds.transformer
        test_data_ds.encoders = from_data_ds.encoders
        test_data_ds.output_weights = from_data_ds.output_weights
        test_data_ds.create_subsets(nr_subsets)

        if 'optimizer' in self.config:
            optimizer = self.config['optimizer']()

            while True:
                training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

                # Some heuristics...
                if training_time_per_iteration > input_size:
                    if training_time_per_iteration > min(
                        (training_data_length /
                         (4 * input_size)), 16 * input_size):
                        break

                optimizer.total_trials = optimizer.total_trials - 1
                if optimizer.total_trials < 8:
                    optimizer.total_trials = 8
                    break

            training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

            best_parameters = optimizer.evaluate(
                lambda dynamic_parameters: Predictor.evaluate_mixer(
                    self.config,
                    mixer_class,
                    mixer_params,
                    from_data_ds,
                    test_data_ds,
                    dynamic_parameters,
                    max_training_time=training_time_per_iteration,
                    max_epochs=None))

            logging.info('Using hyperparameter set: %s', best_parameters)
        else:
            best_parameters = {}

        if CONFIG.HELPER_MIXERS and self.has_boosting_mixer and (
                CONFIG.FORCE_HELPER_MIXERS
                or len(from_data_ds) < 12 * pow(10, 3)):
            try:
                self._helper_mixers = self.train_helper_mixers(
                    from_data_ds, test_data_ds)
            except Exception as e:
                logging.warning(
                    f'Failed to train helper mixers with error: {e}')

        mixer = mixer_class(best_parameters, self.config)
        self._mixer = mixer

        for param in mixer_params:
            if hasattr(mixer, param):
                setattr(mixer, param, mixer_params[param])
            else:
                logging.warning(
                    'Trying to set mixer param {param} but mixer class {mixerclass} has no such parameter'
                    .format(param=param, mixerclass=str(type(mixer))))

        started = time.time()
        log_reasure = time.time()
        first_run = True
        stop_training = False

        for subset_iteration in [1, 2]:
            if stop_training:
                break
            subset_id_arr = [*from_data_ds.subsets.keys()]
            for subset_id in subset_id_arr:
                started_subset = time.time()
                if stop_training:
                    break

                subset_train_ds = from_data_ds.subsets[subset_id]
                subset_test_ds = test_data_ds.subsets[subset_id]

                lowest_error = None
                last_test_error = None
                last_subset_test_error = None
                test_error_delta_buff = []
                subset_test_error_delta_buff = []
                best_model = None
                best_selfaware_model = None

                # iterate over iter_fit and track the epoch and mixer error
                for epoch, training_error in enumerate(
                        mixer.iter_fit(subset_train_ds,
                                       initialize=first_run,
                                       subset_id=subset_id)):
                    first_run = False

                    # Log this every now and then so that the user knows it's running
                    if (int(time.time()) - log_reasure) > 30:
                        log_reasure = time.time()
                        logging.info(
                            f'Lightwood training, iteration {epoch}, training error {training_error}'
                        )

                    # Prime the model on each subset for a bit
                    if subset_iteration == 1:
                        break

                    # Once the training error is getting smaller, enable dropout to teach the network to predict without certain features
                    if subset_iteration > 1 and training_error < 0.4 and not from_data_ds.enable_dropout:
                        eval_every_x_epochs = max(1,
                                                  int(eval_every_x_epochs / 2))
                        logging.info('Enabled dropout !')
                        from_data_ds.enable_dropout = True
                        lowest_error = None
                        last_test_error = None
                        last_subset_test_error = None
                        test_error_delta_buff = []
                        subset_test_error_delta_buff = []
                        continue

                    # If the selfaware network isn't able to train, go back to the original network
                    if subset_iteration > 1 and (
                            np.isnan(training_error)
                            or np.isinf(training_error) or training_error >
                            pow(10, 5)) and not mixer.stop_selfaware_training:
                        mixer.start_selfaware_training = False
                        mixer.stop_selfaware_training = True
                        lowest_error = None
                        last_test_error = None
                        last_subset_test_error = None
                        test_error_delta_buff = []
                        subset_test_error_delta_buff = []
                        continue

                    # Once we are past the priming/warmup period, start training the selfaware network

                    if (subset_iteration > 1 and not mixer.is_selfaware
                            and self.config['mixer']['selfaware']
                            and not mixer.stop_selfaware_training
                            and training_error < 0.35):
                        logging.info('Started selfaware training !')
                        mixer.start_selfaware_training = True
                        lowest_error = None
                        last_test_error = None
                        last_subset_test_error = None
                        test_error_delta_buff = []
                        subset_test_error_delta_buff = []
                        continue

                    if epoch % eval_every_x_epochs == 0:
                        test_error = mixer.error(test_data_ds)
                        subset_test_error = mixer.error(subset_test_ds,
                                                        subset_id=subset_id)
                        logging.info(
                            f'Subset test error: {subset_test_error} on subset {subset_id}, overall test error: {test_error}'
                        )

                        if lowest_error is None or test_error < lowest_error:
                            lowest_error = test_error
                            if mixer.is_selfaware:
                                best_selfaware_model = mixer.get_model_copy()
                            else:
                                best_model = mixer.get_model_copy()

                        if last_subset_test_error is not None:
                            subset_test_error_delta_buff.append(
                                last_subset_test_error - subset_test_error)

                        last_subset_test_error = subset_test_error

                        if last_test_error is not None:
                            test_error_delta_buff.append(last_test_error - test_error)

                        last_test_error = test_error

                        delta_mean = np.mean(test_error_delta_buff[-5:])
                        subset_delta_mean = np.mean(
                            subset_test_error_delta_buff[-5:])

                        if callback_on_iter is not None:
                            callback_on_iter(
                                epoch, training_error, test_error, delta_mean,
                                self.calculate_accuracy(test_data_ds))

                        ## Stop if the model is overfitting
                        #if delta_mean <= 0 and len(test_error_delta_buff) > 4:
                        #    stop_training = True

                        # Stop if we're past the time limit allocated for training
                        if (time.time() -
                                started) > stop_training_after_seconds:
                            stop_training = True

                        # If the training subset is overfitting on its associated testing subset
                        if (subset_delta_mean <= 0
                                and len(subset_test_error_delta_buff) > 4
                            ) or (time.time() - started_subset
                                  ) > stop_training_after_seconds / len(
                                      from_data_ds.subsets.keys()):
                            logging.info(
                                'Finished fitting on {subset_id} of {no_subsets} subsets'
                                .format(subset_id=subset_id,
                                        no_subsets=len(
                                            from_data_ds.subsets.keys())))

                            if mixer.is_selfaware:
                                if best_selfaware_model is not None:
                                    mixer.update_model(best_selfaware_model)
                            else:
                                mixer.update_model(best_model)

                            if subset_id == subset_id_arr[-1]:
                                stop_training = True
                            elif not stop_training:
                                break

                        if stop_training:
                            if mixer.is_selfaware:
                                mixer.update_model(best_selfaware_model)
                            else:
                                mixer.update_model(best_model)
                            self._mixer = mixer
                            self.train_accuracy = self.calculate_accuracy(
                                test_data_ds)
                            self.overall_certainty = mixer.overall_certainty()
                            logging.info('Finished training model !')
                            break

        self._mixer.build_confidence_normalization_data(test_data_ds)
        self._mixer.encoders = from_data_ds.encoders
        return self
Example #16
    def learn(self, from_data, test_data=None, callback_on_iter=None, eval_every_x_epochs=20, stop_training_after_seconds=3600 * 24 * 5):
        """
        Train and save a model (you can use this to retrain model from data)

        :param from_data: (Pandas DataFrame) The data to learn from
        :param test_data: (Pandas DataFrame) The data to test accuracy and learn_error from
        :param callback_on_iter: This is function that can be called on every X evaluation cycle
        :param eval_every_x_epochs: This is every how many epochs we want to calculate the test error and accuracy

        :return: None
        """
        self._stop_training_flag = False

        # This is a helper function that auto-determines, roughly, the data type of each column.
        # NOTE: this assumes the data is clean and will only return 'CATEGORICAL', 'NUMERIC' or 'TEXT'.
        def type_map(col_name):
            col_pd_type = from_data[col_name].dtype
            col_pd_type = str(col_pd_type)

            if col_pd_type in ['int64', 'float64', 'timedelta']:
                return COLUMN_DATA_TYPES.NUMERIC
            elif col_pd_type in ['bool', 'category']:
                return COLUMN_DATA_TYPES.CATEGORICAL
            else:
                # if the number of unique values is less than 100, or less than 10% of the total number of rows, keep it as categorical
                unique = from_data[col_name].nunique()
                if unique < 100 or unique < len(from_data[col_name]) / 10:
                    return COLUMN_DATA_TYPES.CATEGORICAL
                # otherwise assume it's text
                return COLUMN_DATA_TYPES.TEXT

        # generate the configuration and set the order for the input and output columns
        if self._generate_config is True:
            self._input_columns = [col for col in from_data if col not in self._output_columns]
            self.config = {
                'input_features': [{'name': col, 'type': type_map(col)} for col in self._input_columns],
                'output_features': [{'name': col, 'type': type_map(col)} for col in self._output_columns]
            }
            logging.info('Automatically generated a configuration')
            logging.info(self.config)
        else:
            self._output_columns = [col['name'] for col in self.config['output_features']]
            self._input_columns = [col['name'] for col in self.config['input_features']]

        from_data_ds = DataSource(from_data, self.config)

        if test_data is not None:
            test_data_ds = DataSource(test_data, self.config)
        else:
            test_data_ds = from_data_ds.extractRandomSubset(0.1)

        from_data_ds.training = True

        mixer_params = {}

        if 'mixer' in self.config:
            mixer_class = self.config['mixer']['class']
            if 'attrs' in self.config['mixer']:
                mixer_params = self.config['mixer']['attrs']
        else:
            mixer_class = NnMixer

        mixer = mixer_class()

        for param in mixer_params:
            if hasattr(mixer, param):
                setattr(mixer, param, mixer_params[param])
            else:
                logging.warning('Trying to set mixer param {param} but mixer class {mixerclass} has no such parameter'.format(param=param, mixerclass=str(type(mixer))))

        eval_next_on_epoch = eval_every_x_epochs
        error_delta_buffer = []  # buffer of consecutive test-error deltas
        delta_mean = 0
        last_test_error = None
        lowest_error = None
        lowest_error_epoch = None
        last_good_model = None

        started_training_at = int(time.time())
        # iterate over iter_fit and track the epoch and mixer error
        for epoch, mix_error in enumerate(mixer.iter_fit(from_data_ds)):
            if self._stop_training_flag:
                logging.info('Learn has been stopped')
                break

            logging.info('training iteration {iter_i}, error {error}'.format(iter_i=epoch, error=mix_error))

            # see if it needs to be evaluated
            if epoch >= eval_next_on_epoch and test_data_ds:
                eval_next_on_epoch += eval_every_x_epochs

                test_error = mixer.error(test_data_ds)

                # initialize lowest_error if it hasn't been set yet
                if lowest_error is None:
                    lowest_error = test_error
                    lowest_error_epoch = epoch
                    is_lowest_error = True

                else:
                    # define if this is the lowest test error we have had thus far
                    if test_error < lowest_error:
                        lowest_error = test_error
                        lowest_error_epoch = epoch
                        is_lowest_error = True
                    else:
                        is_lowest_error = False

                if last_test_error is None:
                    last_test_error = test_error

                # if it's the lowest error, make a FULL copy of the mixer so we can return only the best mixer at the end
                if is_lowest_error:
                    last_good_model = mixer.get_model_copy()

                delta_error = last_test_error - test_error
                last_test_error = test_error

                # keep a running buffer of test-error deltas so we can tell if the mixer
                # is starting to overfit; each delta is the difference between consecutive
                # test errors, so a negative mean (rising test error) suggests overfitting
                error_delta_buffer += [delta_error]
                error_delta_buffer = error_delta_buffer[-10:]
                delta_mean = np.mean(error_delta_buffer)

                # update mixer and calculate accuracy
                self._mixer = mixer
                accuracy = self.calculate_accuracy(test_data_ds)
                self.train_accuracy = {var: max(accuracy[var], 0) for var in accuracy}
                logging.debug('Delta of test error {delta}'.format(delta=delta_mean))

                # if there is a callback function now its the time to call it
                if callback_on_iter is not None:
                    callback_on_iter(epoch, mix_error, test_error, delta_mean)

                # stop if the model is overfitting (the test error is rising relative
                # to the train error), the test error is already very low, there has
                # been no new best error for a while, or the time budget is exhausted
                if ((delta_mean < 0 and len(error_delta_buffer) > 5 and test_error < 0.1)
                        or test_error < 0.005
                        or (lowest_error_epoch + round(max(eval_every_x_epochs * 2 + 2, epoch * 1.2)) < epoch)
                        or (int(time.time()) - started_training_at) > stop_training_after_seconds):
                    mixer.update_model(last_good_model)
                    self.train_accuracy = self.calculate_accuracy(test_data_ds)
                    break

        # make sure we update the encoders so that the predictor (or parent object) can pickle the mixers
        self._mixer.encoders = from_data_ds.encoders

        return self
Example #17
    config = predictor_config_schema.validate(config)
    # For Classification
    data = {
        'x': [i for i in range(10)],
        'y': [random.randint(i, i + 20) for i in range(10)]
    }
    nums = [data['x'][i] * data['y'][i] for i in range(10)]

    data['z'] = ['low' if i < 50 else 'high' for i in nums]

    data_frame = pandas.DataFrame(data)

    ds = DataSource(data_frame, config)
    ds.prepare_encoders()
    predict_input_ds = DataSource(data_frame[['x', 'y']], config)
    predict_input_ds.prepare_encoders()
    ####################

    mixer = NnMixer({}, config)

    for i in mixer.iter_fit(ds):
        if i < 0.01:
            break

    predictions = mixer.predict(predict_input_ds)
    print(predictions)

    # For Regression
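
The regression block is truncated here; a sketch of how it plausibly continues, mirroring the classification block above with a numeric target (the data and config change are assumptions):

    # Hypothetical continuation: same pattern, numeric output 'z',
    # assuming `config` is adjusted so 'z' has type 'numeric'.
    data['z'] = [data['x'][i] * data['y'][i] for i in range(10)]
    data_frame = pandas.DataFrame(data)

    ds = DataSource(data_frame, config)
    ds.prepare_encoders()
    predict_input_ds = DataSource(data_frame[['x', 'y']], config)
    predict_input_ds.prepare_encoders()

    mixer = NnMixer({}, config)
    for i in mixer.iter_fit(ds):
        if i < 0.01:
            break

    predictions = mixer.predict(predict_input_ds)
    print(predictions)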
Example #18
File: nn.py  Project: abitrolly/lightwood
    }

    ## For Classification
    data = {
        'x': [i for i in range(10)],
        'y': [random.randint(i, i + 20) for i in range(10)]
    }
    nums = [data['x'][i] * data['y'][i] for i in range(10)]

    data['z'] = ['low' if i < 50 else 'high' for i in nums]

    data_frame = pandas.DataFrame(data)

    ds = DataSource(data_frame, config)
    predict_input_ds = DataSource(data_frame[['x', 'y']], config)
    ####################

    mixer = NnMixer(input_column_names=['x', 'y'], output_column_names=['z'])

    data_encoded = mixer.fit(ds)
    predictions = mixer.predict(predict_input_ds)
    print(predictions)

    ## For Regression

    # GENERATE DATA
    ###############

    config = {