def drop_duplicate_column(comparable: Compare):
    """Drop every column whose header entry is typed as a duplicate.

    Walks ``comparable.header`` (entries are indexed by the
    ``Field.column_name`` / ``Field.column_type`` values), collects the
    names of the entries whose type equals ``Field.duplicate.value`` and
    rebinds ``comparable.data_frame`` to a frame without those columns.

    Args:
        comparable: Object carrying ``header`` and ``data_frame``.

    Returns:
        None. The result is stored on ``comparable.data_frame``.
    """
    name_key = Field.column_name.value
    type_key = Field.column_type.value
    duplicate_marker = Field.duplicate.value

    columns_to_drop = []
    for entry in comparable.header:
        if entry[type_key] == duplicate_marker:
            columns_to_drop.append(entry[name_key])

    # ``drop`` returns a new frame, so the attribute has to be rebound.
    comparable.data_frame = comparable.data_frame.drop(columns=columns_to_drop)
def drop_not_checked_column(comparable: Compare):
    """Drop every column whose header entry is typed as not checked.

    Header entries whose ``Field.column_type`` value equals
    ``Field.not_checked.value`` contribute their column name to the list
    of columns removed from ``comparable.data_frame``.

    Args:
        comparable: Object carrying ``header`` and ``data_frame``.

    Returns:
        None. The result is stored on ``comparable.data_frame``.
    """
    unchecked_names = []
    for entry in comparable.header:
        if entry[Field.column_type.value] != Field.not_checked.value:
            continue
        unchecked_names.append(entry[Field.column_name.value])

    trimmed = comparable.data_frame.drop(columns=unchecked_names)
    comparable.data_frame = trimmed
def test_extract_not_checked_column(self):
    """Columns typed ``not_checked`` end up in ``not_checked_column``.

    The header marks ``last_name`` and ``middle_name`` as ``not_checked``;
    after ``data_importer.extract_not_checked_column`` runs, the
    ``not_checked_column`` attribute must equal a frame holding exactly
    those two columns.
    """
    # Arrange
    # NOTE(review): last_name and middle_name both carry column_location 3;
    # possibly a fixture typo -- confirm whether the location matters here.
    subject = Compare()
    subject.header = [
        {"column_name": "id", "column_location": 1, "column_type": ""},
        {"column_name": "first_name", "column_location": 2,
         "column_type": ""},
        {"column_name": "last_name", "column_location": 3,
         "column_type": "not_checked"},
        {"column_name": "middle_name", "column_location": 3,
         "column_type": "not_checked"},
        {"column_name": "requirement", "column_location": 5,
         "column_type": "mapped"},
        {"column_name": "alternate_name", "column_location": 6,
         "column_type": ""},
    ]
    subject.index_column_name = [{"column_name": "id", "column_location": 1}]
    source_rows = {
        'id': [1, 2, 3],
        'first_name': ['f1', 'f2', 'f3'],
        'last_name': ['l1', 'l2', 'l3'],
        'middle_name': ['m1', 'm2', 'm3'],
        'requirement': ['r1', 'r2', 'r3'],
        'alternate_name': ['a1', 'a2', 'a3'],
    }
    subject.data_frame = pd.DataFrame(source_rows)
    wanted = pd.DataFrame({
        'last_name': ['l1', 'l2', 'l3'],
        'middle_name': ['m1', 'm2', 'm3'],
    })

    # Act
    data_importer.extract_not_checked_column(subject)

    # Assert
    assert wanted.equals(subject.not_checked_column)
def drop_disjunctive_column(comparable: Compare):
    """Drop every column whose header entry is typed as disjunctive.

    Selects the header entries whose ``Field.column_type`` value equals
    ``Field.disjunctive.value`` and removes the matching columns from
    ``comparable.data_frame``.

    Args:
        comparable: Object carrying ``header`` and ``data_frame``.

    Returns:
        None. The result is stored on ``comparable.data_frame``.
    """
    disjunctive_entries = (
        entry for entry in comparable.header
        if entry[Field.column_type.value] == Field.disjunctive.value
    )
    disjunctive_names = list(
        entry[Field.column_name.value] for entry in disjunctive_entries
    )

    frame = comparable.data_frame
    comparable.data_frame = frame.drop(columns=disjunctive_names)
def test_stringify_index(self, mock_df, mock_index):
    """After ``stringify_index`` every value in the ``id`` column is a str."""
    # Arrange
    subject = Compare()
    subject.data_frame = mock_df
    subject.index_column_name = mock_index

    # Act
    index_validator.stringify_index(subject)

    # Assert
    id_values = subject.data_frame['id']
    assert all(isinstance(value, str) for value in id_values)
def test_validate_index_identity(self):
    """``validate_index_identity`` returns None for two identical inputs.

    Both comparables share the class-level mock header and index, and each
    gets its own DataFrame built from the same ``mock_data_a`` rows.
    """
    shared_rows = TestIndexValidator.mock_data_a
    left, right = Compare(), Compare()

    for side in (left, right):
        side.header = TestIndexValidator.mock_header
        side.index_column_name = TestIndexValidator.mock_index
        # A separate frame per side: same content, distinct objects.
        side.data_frame = pd.DataFrame(shared_rows)

    outcome = cell_comparator.validate_index_identity(left, right)

    assert outcome is None
def test_sort_index(self, mock_index):
    """``sort_index`` leaves the ``id`` column in ascending string order."""
    # Arrange
    subject = Compare()
    subject.index_column_name = mock_index
    unsorted_ids = {'id': ['1', '2', 'str', 'abc', 'abc']}
    subject.data_frame = pd.DataFrame(data=unsorted_ids, dtype="object")

    # Act
    index_validator.sort_index(subject)

    # Assert
    sorted_ids = list(subject.data_frame['id'])
    assert ['1', '2', 'abc', 'abc', 'str'] == sorted_ids
def test_strip_index(self, mock_df, mock_index):
    """``strip_index`` on the ``mock_df`` fixture yields the expected ids.

    The expected ``id`` column mixes ints and strings:
    ``[1, 2, 'str', 'abc', 'abc']``.
    """
    # Arrange
    subject = Compare()
    subject.data_frame = mock_df
    subject.index_column_name = mock_index

    # Act
    index_validator.strip_index(subject)

    # Assert
    stripped_ids = list(subject.data_frame["id"])
    assert [1, 2, 'str', 'abc', 'abc'] == stripped_ids
def test_check_for_duplicate_index(self, mock_index):
    """Both occurrences of a repeated id are reported in ``duplicate_index``.

    ``'dup1'`` appears twice in the input; the ``id`` column of
    ``duplicate_index`` must list it twice.
    """
    # Arrange
    subject = Compare()
    subject.index_column_name = mock_index
    subject.data_frame = pd.DataFrame(
        data={'id': ['dup1', '1', '2', 'dup1', '3', '4']},
        dtype="object",
    )

    # Act
    index_validator.check_for_duplicate_index(subject)

    # Assert
    flagged_ids = list(subject.duplicate_index['id'].values)
    assert ['dup1', 'dup1'] == flagged_ids
def test_check_for_empty_index(self, mock_index):
    """Row positions with '', None or NaN ids are recorded in ``empty_index``.

    Rows 0, 1 and 5 hold empty strings, row 6 holds None and row 7 holds
    ``np.nan``; all five positions are expected in ``empty_index``.
    """
    # Arrange
    subject = Compare()
    subject.index_column_name = mock_index
    ids_with_gaps = {'id': ['', '', 'str', 'abc', 'abc', '', None, np.nan]}
    subject.data_frame = pd.DataFrame(data=ids_with_gaps, dtype="object")

    # Act
    index_validator.check_for_empty_index(subject)

    # Assert
    assert [0, 1, 5, 6, 7] == subject.empty_index
def test_drop_empty_index(self, mock_index):
    """``drop_empty_index`` removes the rows listed in ``empty_index``.

    ``empty_index`` is preset to the positions of the empty / None ids, so
    only ``'str'``, ``'abc'``, ``'abc'`` should remain.
    """
    # Arrange
    subject = Compare()
    subject.index_column_name = mock_index
    ids_with_gaps = {'id': ['', '', 'str', 'abc', 'abc', '', None]}
    subject.data_frame = pd.DataFrame(data=ids_with_gaps, dtype="object")
    subject.empty_index = [0, 1, 5, 6]

    # Act
    index_validator.drop_empty_index(subject)

    # Assert
    remaining_ids = list(subject.data_frame["id"])
    assert ['str', 'abc', 'abc'] == remaining_ids
def test_check_for_disjunctive_index(self, mock_index):
    """Ids present only in the first frame land in its ``disjunctive_index``.

    The first frame holds ids ``'1'`` and ``'2'`` that the second frame
    lacks; those are expected in ``disjunctive_index`` of the first
    comparable.
    """
    # Arrange
    left, right = Compare(), Compare()
    left.index_column_name = mock_index
    right.index_column_name = mock_index
    left_rows = {
        'id': ['a', 'b', '1', '2', 'c'],
        "fname": ['p1', 'q1', 'r1', 's1', 'j1'],
    }
    right_rows = {
        'id': ['a', 'b', '3', '4', 'c'],
        "fname": ['p2', 'q2', 'r2', 's2', 'j2'],
    }
    left.data_frame = pd.DataFrame(data=left_rows, dtype="object")
    right.data_frame = pd.DataFrame(data=right_rows, dtype="object")

    # Act
    index_validator.check_for_disjunctive_index(left, right)

    # Assert
    only_in_left = list(left.disjunctive_index['id'].values)
    assert ['1', '2'] == only_in_left
def test_drop_duplicate_index(self, mock_index):
    """``drop_duplicate_index`` removes rows recorded in ``duplicate_index``.

    The three ``'dup1'`` rows are preset as duplicates, leaving ``'AAA'``
    and ``'BBB'`` in the frame.
    """
    # Arrange
    subject = Compare()
    subject.index_column_name = mock_index
    all_ids = {'id': ['dup1', 'dup1', 'dup1', 'AAA', 'BBB']}
    duplicated_ids = {'id': ['dup1', 'dup1', 'dup1']}
    subject.data_frame = pd.DataFrame(data=all_ids, dtype="object")
    subject.duplicate_index = pd.DataFrame(data=duplicated_ids, dtype="object")

    # Act
    index_validator.drop_duplicate_index(subject)

    # Assert
    surviving_ids = list(subject.data_frame['id'].values)
    assert ['AAA', 'BBB'] == surviving_ids
def test_drop_disjunctive_index(self, mock_index):
    """``drop_disjunctive_index`` removes rows recorded in ``disjunctive_index``.

    Ids ``'1'`` and ``'2'`` are preset as disjunctive, so ``'a'``, ``'b'``
    and ``'c'`` are expected to remain.
    """
    # Arrange
    subject = Compare()
    subject.index_column_name = mock_index
    all_rows = {
        'id': ['1', '2', 'a', 'b', 'c'],
        "fname": ['p1', 'q1', 'r1', 's1', 'j1'],
    }
    subject.data_frame = pd.DataFrame(data=all_rows, dtype="object")
    unmatched_ids = {'id': ['1', '2']}
    subject.disjunctive_index = pd.DataFrame(data=unmatched_ids,
                                             dtype="object")

    # Act
    index_validator.drop_disjunctive_index(subject)

    # Assert
    surviving_ids = list(subject.data_frame['id'].values)
    assert ['a', 'b', 'c'] == surviving_ids
def test_strip_index_2(self, mock_index):
    """``strip_index`` removes surrounding whitespace from the ``id`` column.

    The ids are padded with spaces, tabs and newlines; after stripping,
    the frame held by the comparable must equal the clean expected frame,
    with the ``fname`` column untouched.
    """
    # Arrange
    comparable = Compare()
    data1 = pd.DataFrame({
        'id': [
            ' 1 ', '2 ', ' a \n\n', '\t\t\t b \n', ' c '
        ],
        "fname": ['p1', 'q1', 'r1', 's1', 'j1']
    })
    expected = pd.DataFrame({
        'id': ['1', '2', 'a', 'b', 'c'],
        "fname": ['p1', 'q1', 'r1', 's1', 'j1']
    })
    comparable.index_column_name = mock_index
    comparable.data_frame = data1

    # Act
    index_validator.strip_index(comparable)

    # Assert on the frame the comparable holds rather than the local input:
    # this passes whether strip_index mutates the frame in place or rebinds
    # comparable.data_frame to a new frame, and matches the sibling tests.
    assert expected.equals(comparable.data_frame)
def test_remove_white_space_char(self):
    """Cleaning the white-space fixture produces the expected frame.

    Runs ``remove_non_printable_char`` followed by
    ``remove_white_space_char`` on a frame built from
    ``mock_data_white_space_char`` and compares the result with the
    expected values.
    """
    # Arrange
    subject = Compare()
    subject.header = TestIndexValidator.mock_header
    subject.index_column_name = TestIndexValidator.mock_index
    subject.data_frame = pd.DataFrame(
        TestIndexValidator.mock_data_white_space_char)
    cleaned = pd.DataFrame({
        'id': [1, 2, 3],
        'first_name': ['f1', 'f 2', 'f3'],
        'last_name': ['l1', 'l2', 'l3'],
        'middle_name': ['m1', 'm2', 'm3'],
        'requirement': ['r1', 'r 2', 'r 3'],
        'alternate_name': ['a 1', 'a 2', 'a 3'],
    })

    # Act
    cell_comparator.remove_non_printable_char(subject)
    cell_comparator.remove_white_space_char(subject)

    # Assert
    assert cleaned.equals(subject.data_frame)
def remove_nan(comparable: Compare):
    """Replace every missing value in ``comparable.data_frame`` with ''.

    Args:
        comparable: Object carrying the ``data_frame`` to clean.

    Returns:
        None. ``comparable.data_frame`` is rebound to the filled frame.
    """
    frame = comparable.data_frame
    # ``fillna`` returns a new frame here, so the attribute is rebound.
    comparable.data_frame = frame.fillna(value='')
def drop_empty_index(comparable: Compare):
    """Remove the rows listed in ``comparable.empty_index``.

    Args:
        comparable: Object carrying ``data_frame`` and ``empty_index``
            (the row labels to remove).

    Returns:
        None. ``comparable.data_frame`` is rebound to the reduced frame.
    """
    rows_to_remove = comparable.empty_index
    comparable.data_frame = comparable.data_frame.drop(index=rows_to_remove)
def drop_duplicate_index(comparable: Compare):
    """Remove the rows recorded in ``comparable.duplicate_index``.

    The row labels to drop are read from the index of the
    ``duplicate_index`` column named by ``idx.get_index_name``.
    Presumably those labels line up with the row labels of
    ``data_frame`` -- confirm against the code that fills
    ``duplicate_index``.

    Args:
        comparable: Object carrying ``data_frame`` and ``duplicate_index``.

    Returns:
        None. ``comparable.data_frame`` is rebound to the reduced frame.
    """
    key_column = idx.get_index_name(comparable)
    duplicate_rows = comparable.duplicate_index[key_column]
    row_labels = duplicate_rows.index.values
    comparable.data_frame = comparable.data_frame.drop(row_labels)
def set_data_frame(comparable: Compare):
    """Set ``data_frame`` to a deep copy of ``original_data_frame``.

    Args:
        comparable: Object carrying ``original_data_frame``.

    Returns:
        None. The copy is stored on ``comparable.data_frame``.
    """
    pristine = comparable.original_data_frame
    # Deep copy so later edits to data_frame do not touch the original.
    working_copy = pristine.copy(deep=True)
    comparable.data_frame = working_copy
def drop_disjunctive_index(comparable: Compare):
    """Remove the rows recorded in ``comparable.disjunctive_index``.

    The row labels to drop are read from the index of the
    ``disjunctive_index`` column named by ``idx.get_index_name``.
    Presumably those labels line up with the row labels of
    ``data_frame`` -- confirm against the code that fills
    ``disjunctive_index``.

    Args:
        comparable: Object carrying ``data_frame`` and
            ``disjunctive_index``.

    Returns:
        None. ``comparable.data_frame`` is rebound to the reduced frame.
    """
    key_column = idx.get_index_name(comparable)
    unmatched_rows = comparable.disjunctive_index[key_column]
    row_labels = unmatched_rows.index.values
    comparable.data_frame = comparable.data_frame.drop(row_labels)