def example_failing_verification():
    """Verify a DataFrame that is expected to violate the TDDA_FILE constraints.

    Builds a small two-column DataFrame, verifies it against the constraints
    in TDDA_FILE, reports the outcome, and prints the verification both as a
    string and as a DataFrame.

    Returns:
        int: 0 when verification failed with exactly 7 failures and 5 passes;
        1 when the counts differ or when the data unexpectedly verified.
    """
    n_failures = 0
    # float('nan') is the same missing-value marker as NumPy's NaN, without
    # going through the ``pd.np`` alias, which recent pandas no longer has.
    nan = float('nan')
    df = pd.DataFrame({'a': [0, 1, 2, 10, nan],
                       'b': ['one', 'one', 'two', 'three', nan]})
    v = verify_df(df, TDDA_FILE)
    if v.failures > 0:
        print('Correctly failed to verify dataframe that does not satisify '
              'all the constraints in %s' % TDDA_FILE)
        # Either count being off is a mismatch, hence ``or``; with ``and``
        # the problem was only reported when both counts were wrong.
        if v.failures != 7 or v.passes != 5:
            print('However, expected 7 failures and 5 passes.\n'
                  'Actual: Failures: %d, Passes: %s.\n'
                  '*** Not great!' % (v.failures, v.passes))
            n_failures = 1
    elif v.failures == 0:
        print('*** Incorrectly verified dataframe that should have failed '
              'against constraints in\n %s.' % TDDA_FILE, file=sys.stderr)
        n_failures = 1
    print('\nRESULT AS A STRING:\n')
    print(str(v))
    print('\nRESULT AS A DATAFRAME:\n')
    print(v.to_frame())
    print('\n')
    return n_failures
def example_failing_verification():
    """Verify a DataFrame that is expected to violate the TDDA_FILE constraints.

    Builds a small two-column DataFrame, verifies it against the constraints
    in TDDA_FILE, reports the outcome, and prints the verification both as a
    string and as a DataFrame.

    Returns:
        int: 0 when verification failed with exactly 7 failures and 5 passes;
        1 when the counts differ or when the data unexpectedly verified.
    """
    n_failures = 0
    # ``np.nan`` is the spelling that works on every NumPy release; the
    # ``np.NaN`` alias was removed in NumPy 2.0.
    df = pd.DataFrame({'a': [0, 1, 2, 10, np.nan],
                       'b': ['one', 'one', 'two', 'three', np.nan]})
    v = verify_df(df, TDDA_FILE)
    if v.failures > 0:
        print('Correctly failed to verify dataframe that does not satisify '
              'all the constraints in %s' % TDDA_FILE)
        # Either count being off is a mismatch, hence ``or``; with ``and``
        # the problem was only reported when both counts were wrong.
        if v.failures != 7 or v.passes != 5:
            print('However, expected 7 failures and 5 passes.\n'
                  'Actual: Failures: %d, Passes: %s.\n'
                  '*** Not great!' % (v.failures, v.passes))
            n_failures = 1
    elif v.failures == 0:
        print('*** Incorrectly verified dataframe that should have failed '
              'against constraints in\n %s.' % TDDA_FILE, file=sys.stderr)
        n_failures = 1
    print('\nRESULT AS A STRING:\n')
    print(str(v))
    print('\nRESULT AS A DATAFRAME:\n')
    print(v.to_frame())
    print('\n')
    return n_failures
def testElements92rex(self):
    """Check elements92.csv against the elements92rex.tdda constraints.

    The data is expected to satisfy every constraint: 78 passes, 0 failures.
    """
    frame = pd.read_csv(os.path.join(TESTDATA_DIR, 'elements92.csv'))
    tdda_path = os.path.join(TESTDATA_DIR, 'elements92rex.tdda')
    verification = verify_df(frame, tdda_path)
    self.assertEqual(verification.passes, 78)
    self.assertEqual(verification.failures, 0)
def test_input_csvs_meet_constraints(self):
    """Assert that every raw input DataFrame satisfies its constraints.

    For each name in ``self.filenames`` the matching DataFrame from
    ``self.raw_dfs_dict`` is verified against the constraints file from
    ``self.constraint_paths``; any constraint failure fails the test.
    """
    self.failures = {}
    for csv_name in self.filenames:
        raw_df = self.raw_dfs_dict[csv_name]
        outcome = verify_df(raw_df, self.constraint_paths[csv_name])
        assert outcome.failures == 0
def testDDD_df(self):
    """Check ddd.csv, as loaded by pandas, against the ddd.tdda constraints."""
    frame = pd.read_csv(os.path.join(TESTDATA_DIR, 'ddd.csv'))
    tdda_path = os.path.join(TESTDATA_DIR, 'ddd.tdda')
    verification = verify_df(frame, tdda_path)
    # Three failures are expected, all caused by how the pandas CSV reader
    # types the columns:
    #   - 'elevens' will have been read as an int
    #   - the date columns will have been read as strings
    self.assertEqual(verification.passes, 58)
    self.assertEqual(verification.failures, 3)
def testElements118rex(self):
    """Check elements118.csv against the elements92rex.tdda constraints.

    Expects 61 passes and 17 failures, and compares the field-level report
    (sorted by field) with the reference file 'elements118rex.df'.
    """
    frame = pd.read_csv(os.path.join(TESTDATA_DIR, 'elements118.csv'))
    tdda_path = os.path.join(TESTDATA_DIR, 'elements92rex.tdda')
    verification = verify_df(frame, tdda_path, report='fields')
    self.assertEqual(verification.passes, 61)
    self.assertEqual(verification.failures, 17)

    report = verification.to_dataframe()
    # Sort so the rendered text does not depend on the report's row order.
    report.sort_values('field', inplace=True)
    rendered = report.to_string()
    self.assertStringCorrect(rendered, 'elements118rex.df')
def _validate(self, data, constraints):
    """Verify ``data`` against ``constraints`` using strict type checking.

    Returns None when every constraint is satisfied.

    Raises:
        KeyError: if the verification reports one or more failures; the
            message includes the failure count.
    """
    outcome = verify_df(data, constraints, type_checking='strict')
    if outcome.failures == 0:
        return
    message = ("One or more columns were not fitting the validation "
               "constraints: failures: {}")
    raise KeyError(message.format(outcome.failures))
def testDetectElements118rexToFile(self):
    """Run detection for elements118.csv and check the file it writes.

    Verifies against elements92rex.tdda with detection output sent to a file
    in ``self.tmp_dir`` (output field 'Z'), expecting 61 passes and 17
    failures and a detection file matching 'elements118rex_detect.csv'.
    """
    frame = pd.read_csv(os.path.join(TESTDATA_DIR, 'elements118.csv'))
    tdda_path = os.path.join(TESTDATA_DIR, 'elements92rex.tdda')
    detect_path = os.path.join(self.tmp_dir, 'elements118rex_detect.csv')
    verification = verify_df(
        frame,
        tdda_path,
        report='fields',
        detect_outpath=detect_path,
        detect_output_fields=['Z'],
    )
    self.assertEqual(verification.passes, 61)
    self.assertEqual(verification.failures, 17)
    self.assertFileCorrect(detect_path, 'elements118rex_detect.csv')
def example_positive_verification():
    """Verify a DataFrame that is expected to satisfy the TDDA_FILE constraints.

    Returns:
        int: 0 when the DataFrame verified with no failures, 1 otherwise.
    """
    n_failures = 0
    # ``np.nan`` is the spelling that works on every NumPy release; the
    # ``np.NaN`` alias was removed in NumPy 2.0.
    df = pd.DataFrame({'a': [2, 4], 'b': ['one', np.nan]})
    v = verify_df(df, TDDA_FILE)
    if v.failures == 0:
        print('Correctly verified dataframe against constraints in %s.'
              % TDDA_FILE)
    else:
        print('*** Unexpectedly failed to verify dataframe against constraints'
              ' in %s.\nSomething is wrong!' % TDDA_FILE, file=sys.stderr)
        # Show the full verification so the failing constraints are visible.
        print(v)
        n_failures = 1
    return n_failures
def example_positive_verification():
    """Verify a DataFrame that is expected to satisfy the TDDA_FILE constraints.

    Returns:
        int: 0 when the DataFrame verified with no failures, 1 otherwise.
    """
    n_failures = 0
    # float('nan') is the same missing-value marker as NumPy's NaN, without
    # going through the ``pd.np`` alias, which recent pandas no longer has.
    df = pd.DataFrame({'a': [2, 4], 'b': ['one', float('nan')]})
    v = verify_df(df, TDDA_FILE)
    if v.failures == 0:
        print('Correctly verified dataframe against constraints in %s.'
              % TDDA_FILE)
    else:
        print('*** Unexpectedly failed to verify dataframe against constraints'
              ' in %s.\nSomething is wrong!' % TDDA_FILE, file=sys.stderr)
        # Show the full verification so the failing constraints are visible.
        print(v)
        n_failures = 1
    return n_failures
def verify_df_from_file(df_path, constraints_path, verbose=True, **kwargs):
    """Load a DataFrame from ``df_path`` and verify it against constraints.

    Args:
        df_path: path of the data to load; ``'-'`` or ``None`` means the data
            is read from standard input instead.
        constraints_path: path of the constraints file. When ``None`` and
            ``df_path`` ends in ``.csv`` or ``.feather``, the same path with a
            ``.tdda`` extension is used.
        verbose: when true, print the verification result.
        **kwargs: passed through to ``verify_df``.

    Returns:
        The object returned by ``verify_df``.

    Exits the process with status 1 (after a message on stderr) when no
    constraints path is given and none can be derived.
    """
    if df_path == '-' or df_path is None:
        # Buffer all of stdin so it can be handed on like a file.
        df_path = StringIO(sys.stdin.read())

    if constraints_path is None and not isinstance(df_path, StringIO):
        stem, extension = os.path.splitext(df_path)
        if extension in ('.csv', '.feather'):
            constraints_path = stem + '.tdda'

    if constraints_path is None:
        print('No constraints file specified.', file=sys.stderr)
        sys.exit(1)

    verification = verify_df(load_df(df_path), constraints_path, **kwargs)
    if verbose:
        print(verification)
    return verification
def verify_df_from_file(df_path, constraints_path, verbose=True, **kwargs):
    """Verify the data stored at ``df_path`` against a constraints file.

    Args:
        df_path: location of the data; ``'-'`` or ``None`` reads the data
            from standard input.
        constraints_path: location of the constraints. If ``None``, it is
            derived from a ``.csv``/``.feather`` ``df_path`` by swapping the
            extension for ``.tdda``.
        verbose: print the verification result when true.
        **kwargs: forwarded unchanged to ``verify_df``.

    Returns:
        Whatever ``verify_df`` returns for the loaded data.

    If no constraints path is supplied or derivable, a message is written to
    stderr and the process exits with status 1.
    """
    from_stdin = df_path == '-' or df_path is None
    if from_stdin:
        df_path = StringIO(sys.stdin.read())

    if constraints_path is None:
        # Only a real path (not buffered stdin) can imply a constraints file.
        if not isinstance(df_path, StringIO):
            root, ext = os.path.splitext(df_path)
            if ext in ('.csv', '.feather'):
                constraints_path = root + '.tdda'
        if constraints_path is None:
            print('No constraints file specified.', file=sys.stderr)
            sys.exit(1)

    loaded = load_df(df_path)
    result = verify_df(loaded, constraints_path, **kwargs)
    if verbose:
        print(result)
    return result
# accounts_verify_25k_against_1k.py from __future__ import print_function import pandas as pd from tdda.constraints.pd.constraints import verify_df df = pd.read_csv('testdata/accounts25k.csv') v = verify_df(df, 'accounts1k.tdda') vdf = v.to_frame() print(vdf)
import pandas as pd

from tdda.constraints.pd.constraints import verify_df

# Small example frame with one missing value in column 'b'.
# float('nan') is the same missing-value marker as NumPy's NaN, without going
# through the ``pd.np`` alias, which recent pandas no longer has.
df = pd.DataFrame({'a': [2, 4], 'b': ['one', float('nan')]})

v = verify_df(df, 'example_constraints.tdda')

# Summary counts, then the verification as text and as a DataFrame.
print('Passes: %d' % v.passes)
print('Failures: %d\n\n\n' % v.failures)
print(str(v))
print('\n\n')
print(v.to_frame())
# elements_verify_118_against_92.py from __future__ import print_function import pandas as pd from tdda.constraints.pd.constraints import verify_df df = pd.read_csv('testdata/elements118.csv') print(verify_df(df, 'elements92.tdda'))
import numpy as np
import pandas as pd

from tdda.constraints.pd.constraints import verify_df

# Small example frame with one missing value in column 'b'.
# ``np.nan`` is the spelling that works on every NumPy release; the
# ``np.NaN`` alias was removed in NumPy 2.0.
df = pd.DataFrame({'a': [2, 4], 'b': ['one', np.nan]})

v = verify_df(df, 'example_constraints.tdda')

# Summary counts, then the verification as text and as a DataFrame.
print('Passes: %d' % v.passes)
print('Failures: %d\n\n\n' % v.failures)
print(str(v))
print('\n\n')
print(v.to_frame())
def find_with_tdda(df, show=True):
    """Run TDDA detection on ``df`` against 'constraints.tdda'.

    Args:
        df: the DataFrame to check.
        show: passed on to ``show_df`` together with the detected records.

    Returns:
        The result of ``detected()`` on the verification object.
    """
    detection_options = {
        'detect': True,
        'detect_per_constraint': True,
        'detect_output_fields': [],
    }
    verification = verify_df(df, 'constraints.tdda', **detection_options)
    bad_records = verification.detected()
    show_df(bad_records, 'BAD RECORDS (FOUND WITH TDDA)', show, all_cols=True)
    return bad_records
# accounts_verify_25k.py from __future__ import print_function import pandas as pd from tdda.constraints.pd.constraints import verify_df df = pd.read_csv('testdata/accounts25k.csv') print(verify_df(df, 'accounts25k.tdda'))
def verify_df_from_file(df_path, constraints_path, verbose=True, **kwargs):
    """Load the data at ``df_path`` and verify it against ``constraints_path``.

    Args:
        df_path: passed to ``load_df`` to obtain the DataFrame.
        constraints_path: constraints location handed to ``verify_df``.
        verbose: when true, print the verification result.
        **kwargs: forwarded unchanged to ``verify_df``.

    Returns:
        The object returned by ``verify_df``.
    """
    result = verify_df(load_df(df_path), constraints_path, **kwargs)
    if verbose:
        print(result)
    return result
# accounts_verify_25k_against_1k_feather.py from __future__ import print_function import pandas as pd from tdda.constraints.pd.constraints import verify_df df = pd.read_csv('testdata/accounts25k.csv') verification = verify_df(df, 'testdata/accounts1k.tdda') print('Basic Verification:') print(verification) print('\n') print('Verification DataFrame:') dfv = verification.to_frame() print(dfv)
import os
import pandas as pd
import sys

from tdda.constraints.pd.constraints import verify_df

# Input data, the constraints it must meet, and where the outcome is written.
CONSTRAINTS_DIR = '../data/interim/constraints_initial_csvs/'
inpath = '../data/processed/wrangled_dataframe.csv'
constraint_path = CONSTRAINTS_DIR + 'wrangled_dataframe_constraints.tdda'
outpath = CONSTRAINTS_DIR + 'wrangled_verification.tdda'

wrangled = pd.read_csv(inpath, low_memory=False)
verification = verify_df(wrangled, constraint_path)

print('Constraints passing: %d\n' % verification.passes)
print('Constraints failing: %d\n' % verification.failures)

# On failure, show the details both as text and as a DataFrame.
if verification.failures > 0:
    print('\n', str(verification))
    print('\n', verification.to_frame())

# Record a one-off summary of the outcome in the output file.
if verification.failures == 0:
    summary = ('Success!'
               '\n'
               f'{inpath} meets all the constraints of {constraint_path}.')
else:
    summary = 'There was at least one failure.'

with open(outpath, 'w') as report:
    report.write(summary)
# elements_verify_118.py from __future__ import print_function import pandas as pd from tdda.constraints.pd.constraints import verify_df df = pd.read_csv('testdata/elements118.csv') print(verify_df(df, 'elements118.tdda'))
def test_wrangled_csv_meets_constraints(self):
    """Assert that the wrangled CSV satisfies its constraints file.

    Reads the CSV at ``self.raw_csv_paths['wrangled']`` and verifies it
    against ``self.constraint_paths['wrangled_csv']``; any constraint failure
    fails the test.
    """
    csv_path = self.raw_csv_paths['wrangled']
    frame = pd.read_csv(csv_path, low_memory=False)
    outcome = verify_df(frame, self.constraint_paths['wrangled_csv'])
    assert outcome.failures == 0
# elements_verify_118_against_92_feather.py from __future__ import print_function import pandas as pd from tdda.constraints.pd.constraints import verify_df df = pd.read_csv('testdata/elements118.csv') verification = verify_df(df, 'testdata/elements92.tdda') print('Basic Verification:') print(verification) print('\n') print('Verification DataFrame:') dfv = verification.to_frame() print(dfv)
# elements_verify_118.py from __future__ import print_function import pandas as pd from tdda.constraints.pd.constraints import verify_df df = pd.read_csv('testdata/elements118.csv') verification = verify_df(df, 'testdata/elements92.tdda') print('Basic Verification:') print(verification) print('\n') print('Verification DataFrame:') dfv = verification.to_frame() print(dfv)