def test_start_to_end_no_duplication_between_batches(self):
    """For each table using the 'start_to_end' search type, fetch the
    primary-key columns over a test window in batches and verify that
    concatenating the batches produces no duplicate rows.

    Requires raw AEMO data cached at E:/raw_aemo_data.
    """
    time_format = '%Y/%m/%d %H:%M:%S'
    for table_name in processing_info_maps.search_type.keys():
        if processing_info_maps.search_type[table_name] != 'start_to_end':
            continue
        print('Validating start_to_end type for table {}'.format(
            table_name))
        # Fix: the original first assigned defaults.nem_data_model_start_time
        # and then immediately overwrote it, making that assignment dead
        # code. Keep only the effective hard-coded window start.
        start_test_window = '2018/01/01 00:00:00'
        end_time = datetime.strptime('2018/05/01 00:00:00', time_format)
        if table_name == 'FCAS_4_SECOND':
            # 4-second FCAS data is very large, so use a one-day window.
            start_test_window = '2015/01/01 00:00:00'
            end_time = datetime.strptime('2015/01/02 00:00:00', time_format)
        # Fix: start_time was previously parsed before the FCAS_4_SECOND
        # override, which left an inverted window for that table
        # (start 2018-01-01 > end 2015-01-02). Parse it after the override
        # so start_time, start_search and end_time are consistent.
        start_time = datetime.strptime(start_test_window, time_format)
        start_search = datetime.strptime(start_test_window, time_format)
        data_tables = data_fetch_methods.dynamic_data_fetch_loop(
            start_search=start_search, start_time=start_time,
            end_time=end_time, table_name=table_name,
            raw_data_location='E:/raw_aemo_data',
            select_columns=defaults.table_primary_keys[table_name],
            date_filter=None, search_type='start_to_end')
        all_data = pd.concat(data_tables, sort=False)
        # Restricting to primary-key columns means any duplicated row is a
        # genuine duplicate record across batches.
        contains_duplicates = all_data.duplicated().any()
        self.assertEqual(False, contains_duplicates,
                         'table {}'.format(table_name))
        print('Type valid, no duplicates found.')
def test_last_contains_data_from_first(self):
    """For tables using the 'end' search type, verify that the final batch
    contains all the data seen in an earlier batch (i.e. later batches
    accumulate earlier data rather than dropping it).

    Requires raw AEMO data cached at E:/raw_aemo_data.
    """
    time_format = '%Y/%m/%d %H:%M:%S'
    for table_name in processing_info_maps.search_type.keys():
        if processing_info_maps.search_type[table_name] != 'end':
            continue
        start_test_window = defaults.nem_data_model_start_time
        start_time = datetime.strptime(start_test_window, time_format)
        end_time = datetime.strptime('2018/01/01 00:00:00', time_format)
        start_search = datetime.strptime(start_test_window, time_format)
        data_tables = data_fetch_methods.dynamic_data_fetch_loop(
            start_search=start_search, start_time=start_time,
            end_time=end_time, table_name=table_name,
            raw_data_location='E:/raw_aemo_data', select_columns=None,
            date_filter=None, search_type='end')
        # Index 35 picks an arbitrary early batch; assumes the window
        # yields at least 36 batches — TODO confirm against fetch loop.
        first_data_table = data_tables[
            35].loc[:, defaults.table_primary_keys[table_name]]
        last_data_table = data_tables[-1]
        # Left-merge on the primary keys: early-batch rows absent from the
        # last batch end up with null non-key columns.
        comp = pd.merge(first_data_table, last_data_table, 'left',
                        defaults.table_primary_keys[table_name])
        non_primary_col = [
            col for col in defaults.table_columns[table_name]
            if col not in defaults.table_primary_keys[table_name]
        ][0]
        missing_from_last = comp[comp[non_primary_col].isnull()]
        # Fix: the original asserted missing_from_last was NOT empty, the
        # opposite of what the test name describes. If the last batch
        # contains the first batch's data, nothing should be missing.
        self.assertEqual(True, missing_from_last.empty)
def test_all_no_duplication_between_batches_with_finalise_step(self):
    """Verify that, for tables using the 'all' search type, running the
    finalise duplicate-removal step over the concatenated batches leaves
    no duplicate primary-key rows.

    Requires raw AEMO data cached at E:/raw_aemo_data.
    """
    time_format = '%Y/%m/%d %H:%M:%S'
    for table_name, search_type in processing_info_maps.search_type.items():
        if search_type != 'all':
            continue
        print('Testing duplicate removal for table {}'.format(
            table_name))
        window_start = defaults.nem_data_model_start_time
        start_time = datetime.strptime(window_start, time_format)
        end_time = datetime.strptime('2018/01/01 00:00:00', time_format)
        start_search = datetime.strptime(window_start, time_format)
        batches = data_fetch_methods.dynamic_data_fetch_loop(
            start_search=start_search, start_time=start_time,
            end_time=end_time, table_name=table_name,
            raw_data_location='E:/raw_aemo_data',
            select_columns=defaults.table_primary_keys[table_name],
            date_filter=None, search_type='all')
        combined = pd.concat(batches, sort=False)
        # Apply the same clean-up used by the dynamic fetch pipeline before
        # checking for duplicates.
        combined = query_wrapers.drop_duplicates_by_primary_key(
            combined, start_time, table_name)
        contains_duplicates = combined.duplicated().any()
        self.assertEqual(False, contains_duplicates)
        print('Type valid, no duplicates found.')
def test_all_no_duplication_between_batches(self):
    """Verify that tables using the 'all' search type produce no duplicate
    primary-key rows across batches, skipping tables known to contain
    duplicates that are handled by the separate finalise step.

    Requires raw AEMO data cached at E:/raw_aemo_data.
    """
    # Tables exempted because their source data contains duplicates; a
    # finalise step in the dynamic fetch pipeline removes them.
    known_duplicate_tables = [
        'GENCONDATA', 'SPDCONNECTIONPOINTCONSTRAINT',
        'SPDINTERCONNECTORCONSTRAINT', 'DUDETAILSUMMARY', 'LOSSMODEL',
        'LOSSFACTORMODEL', 'MNSP_DAYOFFER', 'MNSP_PEROFFER',
        'MNSP_INTERCONNECTOR', 'INTERCONNECTOR',
        'INTERCONNECTORCONSTRAINT', 'DUDETAIL', 'MARKET_PRICE_THRESHOLDS'
    ]
    time_format = '%Y/%m/%d %H:%M:%S'
    for table_name, search_type in processing_info_maps.search_type.items():
        if search_type != 'all':
            continue
        print('Validating all type for table {}'.format(table_name))
        if table_name in known_duplicate_tables:
            print(
                '{} is known to contain duplicate entries and is exempted from this test, a finalise '
                'data processing step is included in dynamic data fetch to clean up these duplicates.'
                .format(table_name))
            continue
        window_start = defaults.nem_data_model_start_time
        start_time = datetime.strptime(window_start, time_format)
        end_time = datetime.strptime('2018/01/01 00:00:00', time_format)
        start_search = datetime.strptime(window_start, time_format)
        batches = data_fetch_methods.dynamic_data_fetch_loop(
            start_search=start_search, start_time=start_time,
            end_time=end_time, table_name=table_name,
            raw_data_location='E:/raw_aemo_data',
            select_columns=defaults.table_primary_keys[table_name],
            date_filter=None, search_type='all')
        combined = pd.concat(batches, sort=False)
        contains_duplicates = combined.duplicated().any()
        self.assertEqual(False, contains_duplicates,
                         'table {}'.format(table_name))
        print('Type valid, no duplicates found.')