def test_nnd_nothing_to_remove(self):
        """
        Dataset that contains no null values at all.
        :return: True if nnd hands back a dataset equal to the one it was given.
        """
        frame = pd.DataFrame({
            'Feature1': [12, 23, 34, 22],
            'Feature2': [20, 21, 19, 18],
            'Feature3': [34, 84, 10, 20]
        })

        result = nvc.nnd(frame)

        self.assertTrue(frame.equals(result))
    def test_nnd_inner_mixed_two_rows(self):
        """
        Nulls spread over mixed numeric and non-numeric columns, filled with strategy='mean'.
        :return: True if the numeric null became 23.0 (the mean of 1, 45 and 23) and the
        non-numeric nulls became 'a' and '1'.
        """
        mixed_two_null = {
            'Feature1': ['a', 'b', np.nan, 'c'],
            'Feature2': [1, np.nan, 45, 23],
            'Feature3': [np.nan, 'd', '1', 'b']
        }

        correct_mixed_no_null = {
            'Feature1': ['a', 'b', 'a', 'c'],
            'Feature2': [1.0, 23.0, 45.0, 23.0],
            'Feature3': ['1', 'd', '1', 'b']
        }

        initial_data = pd.DataFrame(mixed_two_null)
        correct_data = pd.DataFrame(correct_mixed_no_null)

        # nnd is called exactly once: the former debug print() invoked it a second time on the
        # same fixture and cluttered the test output.
        result = nvc.nnd(initial_data, strategy='mean')

        self.assertTrue(correct_data.equals(result))
    def test_nnd_mode_two_rows(self):
        """
        Two null rows in non-numeric data, handled by nnd with its default arguments
        (the mode approach, going by the test name).
        :return: True if the result equals the expected four-row dataset without nulls.
        """
        expected = pd.DataFrame({
            'Feature1': ['a', 'b', 'a', 'c'],
            'Feature2': ['a', 'a', 'r', 'f'],
            'Feature3': ['d', 'd', 'q', 't']
        })
        source = pd.DataFrame(self.non_numeric_only_two_null)

        result = nvc.nnd(source)

        self.assertTrue(expected.equals(result))
    def test_nnd_remove_two_null_rows(self):
        """
        Two null rows, processed with drop=True.
        :return: True if only the two expected rows remain in the result.
        """
        expected = pd.DataFrame({
            'Feature1': [12.0, 22.0],
            'Feature2': [20.0, 18.0],
            'Feature3': [34, 20]
        })
        source = pd.DataFrame(self.numeric_two_null)

        result = nvc.nnd(source, drop=True)

        self.assertTrue(expected.equals(result))
    def test_nnd_median_two_null_rows_median_approach(self):
        """
        Two null rows, processed with strategy='median'.
        :return: True if all four rows are kept and the result equals the expected dataset
        without nulls.
        """
        expected = pd.DataFrame({
            'Feature1': [12.0, 23.0, 22.0, 22.0],
            'Feature2': [20.0, 19.0, 19.0, 18.0],
            'Feature3': [34, 84, 10, 20]
        })
        source = pd.DataFrame(self.numeric_two_null)

        result = nvc.nnd(source, strategy='median')

        self.assertTrue(expected.equals(result))
    def test_nnd_remove_two_rows(self):
        """
        Three null rows, with rows 1 and 2 explicitly listed in remove_rows and
        strategy='median' for the rest.
        :return: True if the result equals the expected two-row dataset without nulls.
        """
        expected = pd.DataFrame({
            'Feature1': [12.0, 22.0],
            'Feature2': [20.0, 18.0],
            'Feature3': [34.0, 34.0]
        })
        source = pd.DataFrame(self.numeric_three_null)

        result = nvc.nnd(source, strategy='median', remove_rows=[1, 2])

        self.assertTrue(expected.equals(result))
    def test_nnd_remove_one_row(self):
        """
        Two null rows, with row 1 explicitly listed in remove_rows and strategy='median'
        for the rest.
        :return: True if the result equals the expected three-row dataset without nulls.
        """
        expected = pd.DataFrame({
            'Feature1': [12.0, 17.0, 22.0],
            'Feature2': [20.0, 19.0, 18.0],
            'Feature3': [34, 10, 20]
        })
        source = pd.DataFrame(self.numeric_two_null)

        result = nvc.nnd(source, strategy='median', remove_rows=[1])

        self.assertTrue(expected.equals(result))
    def test_nnd_keep_one_row(self):
        """
        Two null rows, processed with drop=True while row 1 is explicitly listed in keep_rows.
        :return: True if the result equals the expected three-row dataset, i.e. the kept row
        is present and holds no null.
        """
        expected = pd.DataFrame({
            'Feature1': [12.0, 23.0, 22.0],
            'Feature2': [20.0, 19.0, 18.0],
            'Feature3': [34, 84, 20]
        })
        source = pd.DataFrame(self.numeric_two_null)

        result = nvc.nnd(source, drop=True, keep_rows=[1])

        self.assertTrue(expected.equals(result))
    def test_nnd_keep_two_rows(self):
        """
        Three null rows, processed with drop=True while rows 1 and 2 are explicitly listed
        in keep_rows.
        :return: True if the result equals the expected three-row dataset, i.e. the kept rows
        are present and hold no nulls.
        """
        expected = pd.DataFrame({
            'Feature1': [12.0, 23.0, 17.5],
            'Feature2': [20.0, 19.5, 19.0],
            'Feature3': [34.0, 84.0, 10.0]
        })
        source = pd.DataFrame(self.numeric_three_null)

        result = nvc.nnd(source, drop=True, keep_rows=[1, 2])

        self.assertTrue(expected.equals(result))
    def test_nnd_remove_one_null_row(self):
        """
        A single null (Feature2, row 1), processed with drop=True.
        :return: True if the result equals the expected dataset made of the other three rows.
        """
        source = pd.DataFrame({
            'Feature1': [12, 23, 34, 22],
            'Feature2': [20, np.nan, 19, 18],
            'Feature3': [34, 84, 10, 20]
        })
        expected = pd.DataFrame({
            'Feature1': [12, 34, 22],
            'Feature2': [20.0, 19.0, 18.0],
            'Feature3': [34, 10, 20]
        })

        result = nvc.nnd(source, drop=True)

        self.assertTrue(expected.equals(result))
    def test_nnd_median_two_null_rows_no_reindex(self):
        """
        Two null rows, processed with drop=True and reindex=False.
        :return: True if the result equals the expected two-row dataset whose index labels
        are 0 and 3.
        """
        source = pd.DataFrame(self.numeric_two_null)

        # Expected frame carries the labels 0 and 3 instead of a fresh 0..1 range.
        preserved_labels = pd.Series([0, 3])
        expected = pd.DataFrame({
            'Feature1': [12.0, 22.0],
            'Feature2': [20.0, 18.0],
            'Feature3': [34, 20]
        }).set_index([preserved_labels], drop=True)

        result = nvc.nnd(source, drop=True, reindex=False)

        self.assertTrue(expected.equals(result))
    def test_nnd_keep_remove_two_row_keep_conflict(self):
        """
        Two null rows with drop=True, where rows 1 and 2 are listed in keep_rows and row 1 is
        also listed in remove_rows. Keeping is expected to take priority.
        :return: True if the result equals the expected four-row dataset without nulls.
        """
        expected = pd.DataFrame({
            'Feature1': [12.0, 23.0, 22.0, 22.0],
            'Feature2': [20.0, 19.0, 19.0, 18.0],
            'Feature3': [34, 84, 10, 20]
        })
        source = pd.DataFrame(self.numeric_two_null)

        result = nvc.nnd(source, drop=True, keep_rows=[1, 2], remove_rows=[1])

        self.assertTrue(expected.equals(result))
예제 #13
0
    def preprocess(
        self,
        target_columns: list,
        input_columns=None,
        missing_value_strategy='median',
        keep_rows=None,
        remove_rows=None,
        drop_missing_values=False,
        remove_outliers=True,
        outliers_strategy='Z',
        z_score_threshold=3,
        encode_categorical_data=True,
        encoding_drop_first=True,
        standardize=True,
        split=True,
        test_size=0.2,
        reindex=True,
    ):
        """
        Method to automatically preprocess data and split it into train - test datasets.

        The steps run in a fixed order: missing values, outliers, target/input separation,
        categorical encoding, standardization, train - test split. self.data is overwritten
        along the way; when split=True it ends up holding the processed inputs only.

        :param target_columns: target columns.
        :param input_columns: input columns. Columns that are also targets are ignored; if
        nothing is left (or nothing was given) every non-target column is used as input.
        :param missing_value_strategy: The way the method should handle rows with missing values
        (median, mean, mode).
        :param keep_rows: Specify the rows to keep in case 'remove' method was chosen, otherwise
        it has no effect. keep_rows has a priority over remove_rows.
        :param remove_rows: Specify the rows to remove. Can be used with all methods.
        :param drop_missing_values: Removes all the rows that contain null values, except for
        those in keep_rows
        :param remove_outliers: Whether to remove outliers.
        :param outliers_strategy: A strategy for removal (Z or IQR).
        :param z_score_threshold: A threshold for a value to be considered outliers in case
        Z-score was chosen.
        :param encode_categorical_data: Whether to encode categorical data.
        :param encoding_drop_first: Whether to get k-1 dummies out of k categorical levels by
        removing the first level.
        :param standardize: Whether to standardize dataset (only possible if categorical data is
        encoded).
        :param split: Whether to split dataset.
        :param test_size: fraction of rows that goes to the test datasets. Any value that is
        not a number within [0, 1] falls back to 0.2.
        :param reindex: A new dataset will create new indexes if True.
        :return: Four datasets: target_train, target_test, input_train, input_test or a
        preprocessed dataset if split=False
        """

        if input_columns is None:
            input_columns = []

        # Missing values' replacement/removal
        self.data = nvc.nnd(self.data,
                            strategy=missing_value_strategy,
                            keep_rows=keep_rows,
                            remove_rows=remove_rows,
                            reindex=reindex,
                            drop=drop_missing_values)

        # Outliers' removal
        if remove_outliers:
            self.data = oc.remove_outliers(self.data,
                                           strategy=outliers_strategy,
                                           reindex=reindex,
                                           threshold=z_score_threshold)

        # Split dataset into targets, inputs
        if split:
            targets = self.data[target_columns]

            # A column cannot be a target and an input at once; targets win.
            # (Assuming `targets` is a DataFrame, `in` tests its column labels.)
            input_columns = [x for x in input_columns if x not in targets]

            if input_columns:
                inputs = self.data[input_columns]
            else:
                inputs = self.data.drop(target_columns, axis=1)

            # Only the inputs are processed from here on, whether or not they get encoded.
            # Previously, with encoding disabled, the returned input splits were cut from
            # the full dataset and therefore still contained the target columns.
            self.data = inputs

        # Categorical data encoding
        if encode_categorical_data:
            self.data = pd.get_dummies(self.data,
                                       drop_first=encoding_drop_first)

        # Data standardization
        if standardize and encode_categorical_data:
            self.data = self.__standardize_data()

        if not split:
            return self.data

        # Data split into train - test
        # Range check instead of the former `test_size not in (0, 1)`, which was a tuple
        # membership test and replaced every fractional value (e.g. 0.3) with 0.2.
        try:
            size_is_valid = 0 <= test_size <= 1
        except TypeError:
            # Non-numeric values (e.g. None) keep falling back to the default.
            size_is_valid = False
        if not size_is_valid:
            test_size = 0.2

        # Sequential cut (no shuffling): the leading rows form the train part.
        split_at = int((1 - test_size) * len(targets))

        targets_train, targets_test = np.split(targets.to_numpy(), [split_at])
        inputs_train, inputs_test = np.split(self.data, [split_at])
        return targets_train, targets_test, inputs_train, inputs_test