def test_nnd_nothing_to_remove(self):
    """
    Dataset with no null values.
    :return: True if nnd returns the dataset unchanged.
    """
    clean = pd.DataFrame({
        'Feature1': [12, 23, 34, 22],
        'Feature2': [20, 21, 19, 18],
        'Feature3': [34, 84, 10, 20],
    })
    result = nvc.nnd(clean)
    self.assertTrue(clean.equals(result))
def test_nnd_inner_mixed_two_rows(self):
    """
    Two null rows in mixed numeric / non-numeric columns, mean strategy.

    Numeric nulls are expected to be replaced by the column mean and
    non-numeric nulls by the column mode.
    :return: True if both null values were replaced accordingly.
    """
    mixed_two_null = {
        'Feature1': ['a', 'b', np.nan, 'c'],
        'Feature2': [1, np.nan, 45, 23],
        'Feature3': [np.nan, 'd', '1', 'b'],
    }
    correct_mixed_no_null = {
        'Feature1': ['a', 'b', 'a', 'c'],
        'Feature2': [1.0, 23.0, 45.0, 23.0],
        'Feature3': ['1', 'd', '1', 'b'],
    }
    initial_data = pd.DataFrame(mixed_two_null)
    correct_data = pd.DataFrame(correct_mixed_no_null)
    # Fixed: removed a leftover debug print() that invoked nnd a second
    # time before the assertion; the function under test is now called once.
    self.assertTrue(
        correct_data.equals(nvc.nnd(initial_data, strategy='mean')))
def test_nnd_mode_two_rows(self):
    """
    Two null rows, mode approach (default for non-numeric columns).
    :return: True if both null values were replaced by the column mode.
    """
    expected = pd.DataFrame({
        'Feature1': ['a', 'b', 'a', 'c'],
        'Feature2': ['a', 'a', 'r', 'f'],
        'Feature3': ['d', 'd', 'q', 't'],
    })
    result = nvc.nnd(pd.DataFrame(self.non_numeric_only_two_null))
    self.assertTrue(expected.equals(result))
def test_nnd_remove_two_null_rows(self):
    """
    Two null rows, remove approach.
    :return: True if both rows containing nulls were dropped.
    """
    expected = pd.DataFrame({
        'Feature1': [12.0, 22.0],
        'Feature2': [20.0, 18.0],
        'Feature3': [34, 20],
    })
    cleaned = nvc.nnd(pd.DataFrame(self.numeric_two_null), drop=True)
    self.assertTrue(expected.equals(cleaned))
def test_nnd_median_two_null_rows_median_approach(self):
    """
    Two null rows, median approach.
    :return: True if both nulls were replaced by the column median.
    """
    expected = pd.DataFrame({
        'Feature1': [12.0, 23.0, 22.0, 22.0],
        'Feature2': [20.0, 19.0, 19.0, 18.0],
        'Feature3': [34, 84, 10, 20],
    })
    filled = nvc.nnd(pd.DataFrame(self.numeric_two_null),
                     strategy='median')
    self.assertTrue(expected.equals(filled))
def test_nnd_remove_two_rows(self):
    """
    Three null rows with explicitly removing two of them.
    :return: True if the rows listed in remove_rows were dropped.
    """
    expected = pd.DataFrame({
        'Feature1': [12.0, 22.0],
        'Feature2': [20.0, 18.0],
        'Feature3': [34.0, 34.0],
    })
    result = nvc.nnd(pd.DataFrame(self.numeric_three_null),
                     strategy='median',
                     remove_rows=[1, 2])
    self.assertTrue(expected.equals(result))
def test_nnd_remove_one_row(self):
    """
    Two null rows with explicitly removing one of them.
    :return: True if only the row listed in remove_rows was dropped
        and the other null was imputed with the median.
    """
    expected = pd.DataFrame({
        'Feature1': [12.0, 17.0, 22.0],
        'Feature2': [20.0, 19.0, 18.0],
        'Feature3': [34, 10, 20],
    })
    result = nvc.nnd(pd.DataFrame(self.numeric_two_null),
                     strategy='median',
                     remove_rows=[1])
    self.assertTrue(expected.equals(result))
def test_nnd_keep_one_row(self):
    """
    Two null rows with explicitly keeping one of them.
    :return: True if the kept row's null was replaced by the median
        and the other null row was dropped.
    """
    expected = pd.DataFrame({
        'Feature1': [12.0, 23.0, 22.0],
        'Feature2': [20.0, 19.0, 18.0],
        'Feature3': [34, 84, 20],
    })
    result = nvc.nnd(pd.DataFrame(self.numeric_two_null),
                     drop=True,
                     keep_rows=[1])
    self.assertTrue(expected.equals(result))
def test_nnd_keep_two_rows(self):
    """
    Three null rows with explicitly keeping two of them.
    :return: True if the kept rows' nulls were replaced by the median
        and the remaining null row was dropped.
    """
    expected = pd.DataFrame({
        'Feature1': [12.0, 23.0, 17.5],
        'Feature2': [20.0, 19.5, 19.0],
        'Feature3': [34.0, 84.0, 10.0],
    })
    result = nvc.nnd(pd.DataFrame(self.numeric_three_null),
                     drop=True,
                     keep_rows=[1, 2])
    self.assertTrue(expected.equals(result))
def test_nnd_remove_one_null_row(self):
    """
    One null row, remove approach.
    :return: True if the single row containing a null was dropped.
    """
    with_null = pd.DataFrame({
        'Feature1': [12, 23, 34, 22],
        'Feature2': [20, np.nan, 19, 18],
        'Feature3': [34, 84, 10, 20],
    })
    expected = pd.DataFrame({
        'Feature1': [12, 34, 22],
        'Feature2': [20.0, 19.0, 18.0],
        'Feature3': [34, 10, 20],
    })
    self.assertTrue(expected.equals(nvc.nnd(with_null, drop=True)))
def test_nnd_median_two_null_rows_no_reindex(self):
    """
    Two null rows, remove approach, without reindexing.
    :return: True if null rows were removed while the surviving rows
        kept their original index labels (0 and 3).
    """
    expected = pd.DataFrame({
        'Feature1': [12.0, 22.0],
        'Feature2': [20.0, 18.0],
        'Feature3': [34, 20],
    })
    # Surviving rows must retain their pre-drop index labels.
    expected = expected.set_index([pd.Series([0, 3])], drop=True)
    result = nvc.nnd(pd.DataFrame(self.numeric_two_null),
                     drop=True,
                     reindex=False)
    self.assertTrue(expected.equals(result))
def test_nnd_keep_remove_two_row_keep_conflict(self):
    """
    Two null rows, keeping both while also requesting removal of one.
    keep_rows has priority over remove_rows.
    :return: True if both rows were kept and their nulls replaced by
        the median.
    """
    expected = pd.DataFrame({
        'Feature1': [12.0, 23.0, 22.0, 22.0],
        'Feature2': [20.0, 19.0, 19.0, 18.0],
        'Feature3': [34, 84, 10, 20],
    })
    result = nvc.nnd(pd.DataFrame(self.numeric_two_null),
                     drop=True,
                     keep_rows=[1, 2],
                     remove_rows=[1])
    self.assertTrue(expected.equals(result))
def preprocess(
    self,
    target_columns: list,
    input_columns=None,
    missing_value_strategy='median',
    keep_rows=None,
    remove_rows=None,
    drop_missing_values=False,
    remove_outliers=True,
    outliers_strategy='Z',
    z_score_threshold=3,
    encode_categorical_data=True,
    encoding_drop_first=True,
    standardize=True,
    split=True,
    test_size=0.2,
    reindex=True,
):
    """
    Automatically preprocess data and optionally split it into
    train / test datasets.

    :param target_columns: target columns.
    :param input_columns: input columns; defaults to every non-target
        column when empty or None.
    :param missing_value_strategy: how to fill rows with missing values
        ('median', 'mean', 'mode').
    :param keep_rows: rows to keep when dropping null rows; has
        priority over remove_rows.
    :param remove_rows: rows to remove explicitly; usable with all
        strategies.
    :param drop_missing_values: remove all rows containing nulls,
        except those listed in keep_rows.
    :param remove_outliers: whether to remove outliers.
    :param outliers_strategy: outlier-removal strategy ('Z' or 'IQR').
    :param z_score_threshold: threshold for a value to be considered an
        outlier when the Z-score strategy is used.
    :param encode_categorical_data: whether to one-hot encode
        categorical data.
    :param encoding_drop_first: drop the first dummy level (k-1
        dummies out of k levels).
    :param standardize: whether to standardize the dataset (only
        possible when categorical data is encoded).
    :param split: whether to split the dataset.
    :param test_size: fraction of rows for the test set; values outside
        (0, 1) fall back to 0.2.
    :param reindex: create fresh indexes on the resulting dataset.
    :return: (targets_train, targets_test, inputs_train, inputs_test)
        when split=True, otherwise the preprocessed dataset.
    """
    if input_columns is None:
        input_columns = []

    # Missing values' replacement/removal.
    self.data = nvc.nnd(self.data,
                        strategy=missing_value_strategy,
                        keep_rows=keep_rows,
                        remove_rows=remove_rows,
                        reindex=reindex,
                        drop=drop_missing_values)

    # Outliers' removal.
    if remove_outliers:
        self.data = oc.remove_outliers(self.data,
                                       strategy=outliers_strategy,
                                       reindex=reindex,
                                       threshold=z_score_threshold)

    # Split dataset into targets, inputs.
    if split:
        targets = self.data[target_columns]
        # Keep only input columns that are not also targets.
        # (Clearer than the original membership test against the
        # targets DataFrame, which checked its columns implicitly.)
        input_columns = [x for x in input_columns
                         if x not in target_columns]
        if not input_columns:
            inputs = self.data.drop(target_columns, axis=1)
        else:
            inputs = self.data[input_columns]

    # Categorical data encoding.
    if encode_categorical_data and split:
        self.data = pd.get_dummies(inputs, drop_first=encoding_drop_first)
    elif encode_categorical_data:
        self.data = pd.get_dummies(self.data,
                                   drop_first=encoding_drop_first)

    # Data standardization (requires encoded categorical data).
    if standardize and encode_categorical_data:
        self.data = self.__standardize_data()

    # Data split into train - test.
    if split:
        # BUG FIX: the original `test_size not in (0, 1)` was a tuple
        # membership test, not a range check — it silently reset any
        # caller-supplied fraction (e.g. 0.3) to 0.2 while keeping the
        # degenerate values 0 and 1. Accept any fraction strictly
        # between 0 and 1; fall back to 0.2 otherwise.
        if not 0 < test_size < 1:
            test_size = 0.2
        # Compute the cut point once and reuse it for both splits so
        # targets and inputs always stay aligned.
        cut = int((1 - test_size) * len(targets))
        targets_train, targets_test = np.split(targets.to_numpy(), [cut])
        inputs_train, inputs_test = np.split(self.data, [cut])
        return targets_train, targets_test, inputs_train, inputs_test
    return self.data