def test_date_columns_equal_with_ignore_spaces_and_case():
    """Compare padded date columns, first as strings and then as datetimes.

    The fixture is a pipe-separated table whose ``expected`` column holds the
    per-row result. Several date cells carry a trailing blank, and the string
    comparison is run with ``ignore_spaces=True`` and ``ignore_case=True``.
    The columns are then converted with ``pd.to_datetime`` and compared again
    in both argument orders with ``ignore_spaces=True``.
    """
    # One CSV record per entry. Joining with "\n" keeps the record boundaries
    # explicit, so read_csv sees a header plus six rows rather than a single
    # line. The trailing blanks inside some cells are part of the fixture.
    rows = [
        "a|b|expected",
        "2017-01-01|2017-01-01 |True",
        "2017-01-02 |2017-01-02|True",
        "2017-10-01 |2017-10-10 |False",
        "2017-01-01||False",
        "|2017-01-01|False",
        "||True",
    ]
    df = pd.read_csv(six.StringIO("\n".join(rows)), sep="|")

    # First compare just the strings
    actual_out = datacompy.columns_equal(
        df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True
    )
    expect_out = df["expected"]
    assert_series_equal(expect_out, actual_out, check_names=False)

    # Then compare converted to datetime objects
    df["a"] = pd.to_datetime(df["a"])
    df["b"] = pd.to_datetime(df["b"])
    actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True)
    expect_out = df["expected"]
    assert_series_equal(expect_out, actual_out, check_names=False)

    # and reverse
    actual_out_rev = datacompy.columns_equal(df.b, df.a, rel_tol=0.2, ignore_spaces=True)
    assert_series_equal(expect_out, actual_out_rev, check_names=False)
def test_mixed_column():
    """Compare two columns that mix strings, ints, infinity and Decimals.

    Each case is an ``(a, b, expected)`` triple. The last two cases pair the
    integer 1 with a string and are expected to compare unequal.
    """
    cases = [
        ("hi", "hi", True),
        (1, 1, True),
        (np.inf, np.inf, True),
        (Decimal("1"), Decimal("1"), True),
        (1, "1", False),
        (1, "yo", False),
    ]
    # Expand the triples into the row dicts the frame is built from.
    df = pd.DataFrame(
        [{"a": left, "b": right, "expected": verdict} for left, right, verdict in cases]
    )

    result = datacompy.columns_equal(df.a, df.b)

    assert_series_equal(df["expected"], result, check_names=False)
def test_rounded_date_columns():
    """Compare a datetime column against timestamp strings of varying precision.

    Column ``a`` is parsed with ``pd.to_datetime`` into ``a_dt`` and compared
    with the raw strings in ``b``. Per the ``exp`` column, rows whose string
    carries a zero time component match, and rows with a non-zero fraction or
    second do not.
    """
    day = "2017-01-01"
    # (timestamp string in b, expected outcome); a is the same day everywhere.
    timestamp_cases = [
        ("2017-01-01 00:00:00.000000", True),
        ("2017-01-01 00:00:00.123456", False),
        ("2017-01-01 00:00:01.000000", False),
        ("2017-01-01 00:00:00", True),
    ]
    df = pd.DataFrame(
        [{"a": day, "b": stamp, "exp": verdict} for stamp, verdict in timestamp_cases]
    )
    df["a_dt"] = pd.to_datetime(df["a"])

    actual = datacompy.columns_equal(df.a_dt, df.b)

    assert_series_equal(actual, df["exp"], check_names=False)
def test_infinity_and_beyond():
    """Compare columns containing positive and negative infinity.

    Per the ``expected`` values, infinities of the same sign match and
    infinities of opposite sign do not. Two finite rows, one equal and one
    unequal, are included alongside them.
    """
    pos_inf = np.inf
    neg_inf = -np.inf
    cases = [
        (pos_inf, pos_inf, True),
        (neg_inf, neg_inf, True),
        (neg_inf, pos_inf, False),
        (pos_inf, neg_inf, False),
        (1, 1, True),
        (1, 0, False),
    ]
    df = pd.DataFrame(
        [{"a": left, "b": right, "expected": verdict} for left, right, verdict in cases]
    )

    result = datacompy.columns_equal(df.a, df.b)

    assert_series_equal(df["expected"], result, check_names=False)
def test_rounded_date_columns():
    """Check a parsed date column against strings with a time component.

    ``a`` holds the same calendar day in every row and is converted to
    datetimes as ``a_dt``. ``b`` holds timestamp strings. The ``exp`` column
    marks a row as matching only where the string's time part is all zeros.
    """
    fields = ("a", "b", "exp")
    records = [
        ("2017-01-01", "2017-01-01 00:00:00.000000", True),
        ("2017-01-01", "2017-01-01 00:00:00.123456", False),
        ("2017-01-01", "2017-01-01 00:00:01.000000", False),
        ("2017-01-01", "2017-01-01 00:00:00", True),
    ]
    df = pd.DataFrame([dict(zip(fields, record)) for record in records])
    df["a_dt"] = pd.to_datetime(df["a"])

    expected = df["exp"]
    actual = datacompy.columns_equal(df.a_dt, df.b)

    assert_series_equal(actual, expected, check_names=False)
def test_bad_date_columns():
    """If strings can't be coerced into dates, no row of the column may match.

    The second value of ``b`` is the malformed date ``217-01-01``. The test
    asserts that comparing the datetime column ``a_dt`` with ``b`` gives no
    true result in any row, including the first row where the strings agree.
    """
    good_day = "2017-01-01"
    df = pd.DataFrame(
        [
            {"a": good_day, "b": "2017-01-01"},
            {"a": good_day, "b": "217-01-01"},
        ]
    )
    df["a_dt"] = pd.to_datetime(df["a"])

    matches = datacompy.columns_equal(df.a_dt, df.b)

    assert not matches.any()
def test_mixed_column_with_ignore_spaces_and_case():
    """Compare mixed-type columns with space and case differences ignored.

    String cells in ``b`` carry a trailing blank and some differ from ``a``
    only in letter case. With ``ignore_spaces=True`` and ``ignore_case=True``
    those rows are expected to match, while an integer paired with a string
    is still expected to differ.
    """
    cases = [
        ("hi", "hi ", True),
        (1, 1, True),
        (np.inf, np.inf, True),
        (Decimal("1"), Decimal("1"), True),
        (1, "1 ", False),
        (1, "yo ", False),
        ("Hi", "hI ", True),
        ("HI", "HI ", True),
        ("hi", "hi ", True),
    ]
    df = pd.DataFrame(
        [{"a": left, "b": right, "expected": verdict} for left, right, verdict in cases]
    )

    result = datacompy.columns_equal(
        df.a, df.b, ignore_spaces=True, ignore_case=True
    )

    assert_series_equal(df["expected"], result, check_names=False)
def test_numeric_columns_equal_abs():
    """Compare numeric columns using an absolute tolerance of 0.2.

    The fixture is a pipe-separated table whose ``expected`` column holds the
    per-row result. Per that column, 2 and 2.1 match, 3 and 4 do not, two
    ``NULL`` cells match, and a ``NULL`` against a number does not.
    """
    # One CSV record per entry. Joining with "\n" keeps the record boundaries
    # explicit, so read_csv sees a header plus six rows rather than a single
    # line.
    rows = [
        "a|b|expected",
        "1|1|True",
        "2|2.1|True",
        "3|4|False",
        "4|NULL|False",
        "NULL|4|False",
        "NULL|NULL|True",
    ]
    df = pd.read_csv(six.StringIO("\n".join(rows)), sep="|")

    actual_out = datacompy.columns_equal(df.a, df.b, abs_tol=0.2)
    expect_out = df["expected"]

    assert_series_equal(expect_out, actual_out, check_names=False)
def test_numeric_columns_equal_rel():
    """Compare numeric columns using a relative tolerance of 0.2.

    The fixture is a pipe-separated table whose ``expected`` column holds the
    per-row result. Per that column, 2 and 2.1 match, 3 and 4 do not, two
    ``NULL`` cells match, and a ``NULL`` against a number does not.
    """
    # One CSV record per entry. Joining with "\n" keeps the record boundaries
    # explicit, so read_csv sees a header plus six rows rather than a single
    # line.
    rows = [
        "a|b|expected",
        "1|1|True",
        "2|2.1|True",
        "3|4|False",
        "4|NULL|False",
        "NULL|4|False",
        "NULL|NULL|True",
    ]
    df = pd.read_csv(six.StringIO("\n".join(rows)), sep="|")

    actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2)
    expect_out = df["expected"]

    assert_series_equal(expect_out, actual_out, check_names=False)
def test_date_columns_unequal():
    """Datetime columns should match the same dates stored as strings.

    ``a`` and ``b`` hold different dates, and ``b`` is missing in the second
    row. Each string column must fully match its own ``pd.to_datetime``
    conversion in either argument order, and must not match the other
    column's conversion in any row.
    """
    df = pd.DataFrame([{"a": "2017-01-01", "b": "2017-01-02"}, {"a": "2017-01-01"}])
    df["a_dt"] = pd.to_datetime(df["a"])
    df["b_dt"] = pd.to_datetime(df["b"])

    # A column against its own datetime form: every row equal.
    matching_pairs = [
        (df.a, df.a_dt),
        (df.b, df.b_dt),
        (df.a_dt, df.a),
        (df.b_dt, df.b),
    ]
    for left, right in matching_pairs:
        assert datacompy.columns_equal(left, right).all()

    # A column against the other column's datetime form: no row equal.
    differing_pairs = [
        (df.b_dt, df.a),
        (df.a_dt, df.b),
        (df.a, df.b_dt),
        (df.b, df.a_dt),
    ]
    for left, right in differing_pairs:
        assert not datacompy.columns_equal(left, right).any()
def test_bad_date_columns():
    """A string column with an unparseable date must not match a datetime column.

    ``b`` contains the malformed value ``217-01-01`` in its second row. The
    comparison of ``a_dt`` with ``b`` is required to be false for the whole
    column, not only for the malformed row.
    """
    fields = ("a", "b")
    records = [
        ("2017-01-01", "2017-01-01"),
        ("2017-01-01", "217-01-01"),
    ]
    df = pd.DataFrame([dict(zip(fields, record)) for record in records])
    df["a_dt"] = pd.to_datetime(df["a"])

    any_match = datacompy.columns_equal(df.a_dt, df.b).any()

    assert not any_match
def test_date_columns_equal():
    """Compare date columns, first as strings and then as datetimes.

    The fixture is a pipe-separated table whose ``expected`` column holds the
    per-row result. Per that column, two empty cells match and an empty cell
    against a date does not. After the string comparison the columns are
    converted with ``pd.to_datetime`` and compared again in both argument
    orders.
    """
    # One CSV record per entry. Joining with "\n" keeps the record boundaries
    # explicit, so read_csv sees a header plus six rows rather than a single
    # line.
    rows = [
        "a|b|expected",
        "2017-01-01|2017-01-01|True",
        "2017-01-02|2017-01-02|True",
        "2017-10-01|2017-10-10|False",
        "2017-01-01||False",
        "|2017-01-01|False",
        "||True",
    ]
    df = pd.read_csv(six.StringIO("\n".join(rows)), sep="|")

    # First compare just the strings
    actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2)
    expect_out = df["expected"]
    assert_series_equal(expect_out, actual_out, check_names=False)

    # Then compare converted to datetime objects
    df["a"] = pd.to_datetime(df["a"])
    df["b"] = pd.to_datetime(df["b"])
    actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2)
    expect_out = df["expected"]
    assert_series_equal(expect_out, actual_out, check_names=False)

    # and reverse
    actual_out_rev = datacompy.columns_equal(df.b, df.a, rel_tol=0.2)
    assert_series_equal(expect_out, actual_out_rev, check_names=False)
def test_mixed_column():
    """Check row-wise equality on columns holding values of several types.

    The rows cover a string, an int, infinity and a Decimal paired with an
    identical value, plus the int 1 paired with two different strings. The
    ``expected`` field records the outcome required for each row.
    """
    fields = ("a", "b", "expected")
    records = [
        ("hi", "hi", True),
        (1, 1, True),
        (np.inf, np.inf, True),
        (Decimal("1"), Decimal("1"), True),
        (1, "1", False),
        (1, "yo", False),
    ]
    df = pd.DataFrame([dict(zip(fields, record)) for record in records])

    expected = df["expected"]
    actual = datacompy.columns_equal(df.a, df.b)

    assert_series_equal(expected, actual, check_names=False)
def test_infinity_and_beyond():
    """Check that infinities compare equal only when their signs agree.

    The four sign combinations of ``np.inf`` are each paired with their
    required outcome, followed by one equal and one unequal finite pair.
    """
    fields = ("a", "b", "expected")
    records = [
        (np.inf, np.inf, True),
        (-np.inf, -np.inf, True),
        (-np.inf, np.inf, False),
        (np.inf, -np.inf, False),
        (1, 1, True),
        (1, 0, False),
    ]
    df = pd.DataFrame([dict(zip(fields, record)) for record in records])

    expected = df["expected"]
    actual = datacompy.columns_equal(df.a, df.b)

    assert_series_equal(expected, actual, check_names=False)
def test_decimal_columns_equal_rel():
    """Compare Decimal columns under a tolerance of 0.001.

    Per the ``expected`` values, Decimals that differ by far less than the
    tolerance match, 1.3 against 1.2 does not, two NaNs match, and a NaN
    against a Decimal does not.
    """
    cases = [
        (Decimal("1"), Decimal("1"), True),
        (Decimal("1.3"), Decimal("1.3"), True),
        (Decimal("1.000003"), Decimal("1.000003"), True),
        (Decimal("1.000000004"), Decimal("1.000000003"), True),
        (Decimal("1.3"), Decimal("1.2"), False),
        (np.nan, np.nan, True),
        (np.nan, Decimal("1"), False),
        (Decimal("1"), np.nan, False),
    ]
    df = pd.DataFrame(
        [{"a": left, "b": right, "expected": verdict} for left, right, verdict in cases]
    )

    # NOTE(review): the test name says "rel" but the call passes abs_tol;
    # confirm whether rel_tol was intended.
    result = datacompy.columns_equal(df.a, df.b, abs_tol=0.001)

    assert_series_equal(df["expected"], result, check_names=False)
def test_date_columns_unequal():
    """String dates must match their datetime form and nothing else.

    The frame has two rows. ``a`` is the same date in both, while ``b`` is a
    different date in the first row and absent from the second. ``a_dt`` and
    ``b_dt`` are the ``pd.to_datetime`` conversions of those columns.
    """
    records = [{"a": "2017-01-01", "b": "2017-01-02"}, {"a": "2017-01-01"}]
    df = pd.DataFrame(records)
    for name in ("a", "b"):
        df[name + "_dt"] = pd.to_datetime(df[name])

    compare = datacompy.columns_equal

    # Same underlying dates, string versus datetime, both argument orders.
    assert compare(df.a, df.a_dt).all()
    assert compare(df.b, df.b_dt).all()
    assert compare(df.a_dt, df.a).all()
    assert compare(df.b_dt, df.b).all()

    # Different underlying dates: not a single row may compare equal.
    assert not compare(df.b_dt, df.a).any()
    assert not compare(df.a_dt, df.b).any()
    assert not compare(df.a, df.b_dt).any()
    assert not compare(df.b, df.a_dt).any()
def test_string_columns_equal_with_ignore_spaces():
    """Compare string columns with ``ignore_spaces=True``.

    The fixture is a pipe-separated table whose ``expected`` column holds the
    per-row result. Per that column, a trailing blank and whitespace-only
    cells do not prevent a match, while accent, emoji and letter-case
    differences do.
    """
    # One CSV record per entry. Joining with "\n" keeps the record boundaries
    # explicit, so read_csv sees a header plus thirteen rows rather than a
    # single line, and makes whitespace inside cells visible.
    rows = [
        "a|b|expected",
        "Hi|Hi|True",
        "Yo|Yo|True",
        "Hey|Hey |True",
        "résumé|resume|False",
        "résumé|résumé|True",
        "💩|💩|True",
        "💩|🤔|False",
        " | |True",
        # NOTE(review): the width of this whitespace-only cell is assumed to
        # be two blanks so that it differs from the row above; confirm.
        "  | |True",
        "datacompy|DataComPy|False",
        "something||False",
        "|something|False",
        "||True",
    ]
    df = pd.read_csv(six.StringIO("\n".join(rows)), sep="|")

    actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True)
    expect_out = df["expected"]

    assert_series_equal(expect_out, actual_out, check_names=False)
def test_string_columns_equal():
    """Compare string columns exactly, with no space or case normalisation.

    The fixture is a pipe-separated table whose ``expected`` column holds the
    per-row result. Per that column, a trailing blank, an accent difference,
    a different emoji and a letter-case difference each make a row unequal,
    and two empty cells match.
    """
    # One CSV record per entry. Joining with "\n" keeps the record boundaries
    # explicit, so read_csv sees a header plus thirteen rows rather than a
    # single line, and makes whitespace inside cells visible.
    rows = [
        "a|b|expected",
        "Hi|Hi|True",
        "Yo|Yo|True",
        "Hey|Hey |False",
        "résumé|resume|False",
        "résumé|résumé|True",
        "💩|💩|True",
        "💩|🤔|False",
        " | |True",
        # NOTE(review): this row must have a differ from b (a single blank).
        # Two blanks in a is an assumption; confirm the intended width.
        "  | |False",
        "datacompy|DataComPy|False",
        "something||False",
        "|something|False",
        "||True",
    ]
    df = pd.read_csv(six.StringIO("\n".join(rows)), sep="|")

    actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2)
    expect_out = df["expected"]

    assert_series_equal(expect_out, actual_out, check_names=False)
def test_decimal_columns_equal_rel():
    """Check Decimal columns compare equal within a 0.001 tolerance.

    Rows pair identical Decimals, Decimals that differ in the ninth decimal
    place, Decimals that differ by 0.1, and NaN against NaN or a Decimal. The
    ``expected`` field records the outcome required for each row.
    """
    fields = ("a", "b", "expected")
    records = [
        (Decimal("1"), Decimal("1"), True),
        (Decimal("1.3"), Decimal("1.3"), True),
        (Decimal("1.000003"), Decimal("1.000003"), True),
        (Decimal("1.000000004"), Decimal("1.000000003"), True),
        (Decimal("1.3"), Decimal("1.2"), False),
        (np.nan, np.nan, True),
        (np.nan, Decimal("1"), False),
        (Decimal("1"), np.nan, False),
    ]
    df = pd.DataFrame([dict(zip(fields, record)) for record in records])

    expected = df["expected"]
    # NOTE(review): the test name says "rel" but the call passes abs_tol;
    # confirm whether rel_tol was intended.
    actual = datacompy.columns_equal(df.a, df.b, abs_tol=0.001)

    assert_series_equal(expected, actual, check_names=False)