def test_nested_create_gather_updates_query(self, project_id):
    """Tests QueryCreator's ability to create a query with nested fields
    to gather updates.

    Builds a QueryCreator against the nested test schema and compares the
    generated gather-updates query to a golden query file on disk.

    Args:
        project_id(str): ID of the project that holds the test BQ tables.

    Raises:
        Exception: If project_id was not supplied on the test command line.
    """
    if not project_id:
        raise Exception(
            'Test needs project_id to pass. '
            'Add --project_id={your project ID} to test command')
    abs_path = os.path.abspath(os.path.dirname(__file__))
    schema_path = os.path.join(abs_path,
                               'test_schemas/test_nested_schema.json')
    test_query_creator = query_creator.QueryCreator(
        schema_path=schema_path,
        user_id_field_name=self.user_id_field_name,
        ingest_timestamp_field_name=self.ingest_timestamp_field_name,
        project_id=project_id,
        dataset_id=self.dataset_id,
        updates_table_id=self.user_info_updates_id,
        temp_updates_table_id=self.temp_user_info_updates_id,
        final_table_id=self.user_info_final_id)
    gather_updates_query = test_query_creator.create_gather_updates_query()
    # Load the expected query. The golden file is itself a format template:
    # the first placeholder receives the project ID, and passing '{0:s}' as
    # the second argument re-inserts a literal '{0:s}' so that placeholder
    # survives this .format() call.
    test_gather_updates_query_path = os.path.join(
        abs_path, 'test_queries/test_nested_gather_updates_query.txt')
    with open(test_gather_updates_query_path, 'r') as input_file:
        expected_gather_updates_query = input_file.read().format(
            project_id, '{0:s}')
    assert gather_updates_query == expected_gather_updates_query
def test_second_update(self, project_id):
    """Tests UserInfoUpdater ability to run an update after at least one
    has been run prior.

    Seeds the updates/temp/final tables with the state expected after
    test_initial_update(), runs a second gather/merge cycle, and compares
    both result tables against expected CSV fixtures.

    Args:
        project_id(str): ID of the project that holds the test BQ tables.

    Raises:
        Exception: If project_id was not supplied on the test command line.
    """
    # Guard added for consistency with the sibling tests: fail fast with a
    # helpful message instead of an opaque BigQuery error downstream.
    if not project_id:
        raise Exception(
            'Test needs project_id to pass. '
            'Add --project_id={your project ID} to test command')
    # Load a second set of user updates to user_info_updates table to
    # simulate a second iteration.
    self.load_csv_to_bq(filename='test_data/user_info_updates_data_2.csv',
                        table=self.dataset_ref.table(
                            self.user_info_updates_id))
    # Load data into the temp table and final table so that they will
    # contain the same data they did at the end of the
    # test_initial_update() test.
    self.load_csv_to_bq(
        filename='test_data/temp_user_info_updates_expected_1.csv',
        table=self.dataset_ref.table(self.temp_user_info_updates_id))
    self.load_csv_to_bq(
        filename='test_data/user_info_final_expected_1.csv',
        table=self.dataset_ref.table(self.user_info_final_id),
    )
    # Run the UserInfoUpdater on the second set of updates.
    test_updater = user_info_updater.UserInfoUpdater(
        project_id, self.dataset_id, self.user_info_updates_id,
        self.temp_user_info_updates_id, self.user_info_final_id)
    update_query_creator = query_creator.QueryCreator(
        schema_path=self.schema_path,
        user_id_field_name='userId',
        ingest_timestamp_field_name='ingestTimestamp',
        project_id=project_id,
        dataset_id=self.dataset_id,
        updates_table_id=self.user_info_updates_id,
        temp_updates_table_id=self.temp_user_info_updates_id,
        final_table_id=self.user_info_final_id)
    gather_updates_query = update_query_creator.create_gather_updates_query()
    test_updater.gather_updates(gather_updates_query)
    merge_updates_query = update_query_creator.create_merge_query()
    test_updater.merge_updates(merge_updates_query)
    # Query the temp table to test that the gather_updates() function worked
    temp_table_query_config = bigquery.QueryJobConfig()
    temp_table_query_config.use_legacy_sql = False
    temp_table_query = self.bq_client.query(
        query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
            project_id, self.dataset_id, self.temp_user_info_updates_id),
        job_config=temp_table_query_config,
        location='US')
    temp_table_query.result()
    # Sort by userId so row ordering cannot affect the frame comparison.
    temp_table_results_df = temp_table_query.to_dataframe() \
        .sort_values(by=['userId']).reset_index(drop=True)
    # Gather expected results for comparison
    abs_path = os.path.abspath(os.path.dirname(__file__))
    expected_temp_data_file = os.path.join(
        abs_path, 'test_data/temp_user_info_updates_expected_2.csv')
    expected_temp_table_df = pd.read_csv(expected_temp_data_file)
    # convert ingestTimestamp to datetime
    expected_temp_table_df['ingestTimestamp'] = pd.to_datetime(
        expected_temp_table_df['ingestTimestamp'])
    # Compare results
    pd.testing.assert_frame_equal(temp_table_results_df,
                                  expected_temp_table_df)
    # Query the final table to test that the merge_updates() function worked
    final_table_query_config = bigquery.QueryJobConfig()
    final_table_query_config.use_legacy_sql = False
    final_table_query = self.bq_client.query(
        query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
            project_id, self.dataset_id, self.user_info_final_id),
        job_config=final_table_query_config,
        location='US')
    final_table_query.result()
    final_table_results_df = final_table_query.to_dataframe() \
        .sort_values(by=['userId']).reset_index(drop=True)
    # Gather expected results for comparison
    expected_final_data_file = os.path.join(
        abs_path, 'test_data/user_info_final_expected_2.csv')
    expected_final_table_df = pd.read_csv(expected_final_data_file)
    # convert ingestTimestamp to datetime
    expected_final_table_df['ingestTimestamp'] = pd.to_datetime(
        expected_final_table_df['ingestTimestamp'])
    # Compare results
    pd.testing.assert_frame_equal(final_table_results_df,
                                  expected_final_table_df)
def test_initial_update(self, project_id):
    """Tests UserInfoUpdater ability to run an initial update.

    Loads the first batch of updates into an otherwise-empty set of tables,
    runs one gather/merge cycle, and compares the temp and final tables
    against expected CSV fixtures.

    Args:
        project_id(str): ID of the project that holds the test BQ tables.

    Raises:
        Exception: If project_id was not supplied on the test command line.
    """
    if not project_id:
        raise Exception(
            'Test needs project_id to pass. '
            'Add --project_id={your project ID} to test command')
    # Load the first set of user updates to user_info_updates table.
    # All other tables should be empty at this point.
    self.load_csv_to_bq(filename='test_data/user_info_updates_data_1.csv',
                        table=self.dataset_ref.table(
                            self.user_info_updates_id))
    # Run the UserInfoUpdater on the first set of updates.
    test_updater = user_info_updater.UserInfoUpdater(
        project_id, self.dataset_id, self.user_info_updates_id,
        self.temp_user_info_updates_id, self.user_info_final_id)
    # Get Queries to Run
    update_query_creator = query_creator.QueryCreator(
        schema_path=self.schema_path,
        user_id_field_name='userId',
        ingest_timestamp_field_name='ingestTimestamp',
        project_id=project_id,
        dataset_id=self.dataset_id,
        updates_table_id=self.user_info_updates_id,
        temp_updates_table_id=self.temp_user_info_updates_id,
        final_table_id=self.user_info_final_id)
    gather_updates_query = update_query_creator.create_gather_updates_query()
    test_updater.gather_updates(gather_updates_query)
    merge_updates_query = update_query_creator.create_merge_query()
    test_updater.merge_updates(merge_updates_query)
    # Query the temp table to test that the gather_updates() function worked
    temp_table_query_config = bigquery.QueryJobConfig()
    temp_table_query_config.use_legacy_sql = False
    temp_table_query = self.bq_client.query(
        query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
            project_id, self.dataset_id, self.temp_user_info_updates_id),
        job_config=temp_table_query_config,
        location='US')
    temp_table_query.result()
    # Sort by userId so row ordering cannot affect the frame comparison.
    temp_table_results_df = temp_table_query.to_dataframe() \
        .sort_values(by=['userId']).reset_index(drop=True)
    # Gather expected results for comparison
    abs_path = os.path.abspath(os.path.dirname(__file__))
    expected_temp_data_file = os.path.join(
        abs_path, 'test_data/temp_user_info_updates_expected_1.csv')
    expected_temp_table_df = pd.read_csv(expected_temp_data_file)
    # convert ingestTimestamp to datetime
    expected_temp_table_df['ingestTimestamp'] = pd.to_datetime(
        expected_temp_table_df['ingestTimestamp'])
    # Compare results
    pd.testing.assert_frame_equal(temp_table_results_df,
                                  expected_temp_table_df)
    # Query the final table to test that the merge_updates() function worked
    final_table_query_config = bigquery.QueryJobConfig()
    final_table_query_config.use_legacy_sql = False
    final_table_query = self.bq_client.query(
        query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
            project_id, self.dataset_id, self.user_info_final_id),
        job_config=final_table_query_config,
        location='US')
    final_table_query.result()
    final_table_results_df = final_table_query.to_dataframe() \
        .sort_values(by=['userId']).reset_index(drop=True)
    # Gather expected results for comparison
    expected_final_data_file = os.path.join(
        abs_path, 'test_data/user_info_final_expected_1.csv')
    expected_final_table_df = pd.read_csv(expected_final_data_file)
    # convert ingestTimestamp to datetime
    expected_final_table_df['ingestTimestamp'] = pd.to_datetime(
        expected_final_table_df['ingestTimestamp'])
    # Compare results
    pd.testing.assert_frame_equal(final_table_results_df,
                                  expected_final_table_df)
def test_nested_data_user_update(self, project_id):
    """Tests UserInfoUpdater ability to run an update on nested user data.

    Seeds the tables from nested JSON fixtures to simulate a previous run,
    performs one gather/merge cycle, and compares both result tables
    against expected JSON fixtures.

    Args:
        project_id(str): ID of the project that holds the test BQ tables.

    Raises:
        Exception: If project_id was not supplied on the test command line.
    """
    if not project_id:
        raise Exception(
            'Test needs project_id to pass. '
            'Add --project_id={your project ID} to test command')
    # Load a set of user updates to nested user_info_updates table.
    self.load_json_to_bq(
        filename='test_data/nested_data/user_info_updates_data.json',
        table=self.dataset_ref.table(self.user_info_updates_id))
    # Load data into the temp table and final table to simulate a previous
    # run.
    self.load_json_to_bq(filename='test_data/nested_data/'
                         'temp_user_info_updates_initial.json',
                         table=self.dataset_ref.table(
                             self.temp_user_info_updates_id))
    self.load_json_to_bq(
        filename='test_data/nested_data/user_info_final_initial.json',
        table=self.dataset_ref.table(self.user_info_final_id))
    # Run the UserInfoUpdater on the second set of updates.
    test_updater = user_info_updater.UserInfoUpdater(
        project_id, self.dataset_id, self.user_info_updates_id,
        self.temp_user_info_updates_id, self.user_info_final_id)
    update_query_creator = query_creator.QueryCreator(
        schema_path=self.schema_path,
        user_id_field_name='userId',
        ingest_timestamp_field_name='ingestTimestamp',
        project_id=project_id,
        dataset_id=self.dataset_id,
        updates_table_id=self.user_info_updates_id,
        temp_updates_table_id=self.temp_user_info_updates_id,
        final_table_id=self.user_info_final_id)
    gather_updates_query = update_query_creator.create_gather_updates_query()
    test_updater.gather_updates(gather_updates_query)
    merge_updates_query = update_query_creator.create_merge_query()
    test_updater.merge_updates(merge_updates_query)
    # Read the schema once; its field order is used to reorder both
    # expected DataFrames below, since read_json() sorts columns
    # alphabetically.
    with open(self.schema_path, 'r') as f:
        json_schema = json.loads(f.read())
    col_list = [str(col['name']) for col in json_schema['fields']]
    # Query the temp table to test that the gather_updates() function worked
    temp_table_query_config = bigquery.QueryJobConfig()
    temp_table_query_config.use_legacy_sql = False
    temp_table_query = self.bq_client.query(
        query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
            project_id, self.dataset_id, self.temp_user_info_updates_id),
        job_config=temp_table_query_config,
        location='US')
    temp_table_query.result()
    # Sort by userId so row ordering cannot affect the frame comparison.
    temp_table_results_df = temp_table_query.to_dataframe() \
        .sort_values(by=['userId']).reset_index(drop=True)
    # Gather expected results for comparison
    abs_path = os.path.abspath(os.path.dirname(__file__))
    expected_temp_data_file = os.path.join(
        abs_path, 'test_data/nested_data/temp_user_info_updates_expected.json')
    expected_temp_table_df = pd.read_json(expected_temp_data_file)
    # Reorder columns since read_json() reads them alphabetically
    expected_temp_table_df = expected_temp_table_df[col_list]
    # convert ingestTimestamp to datetime
    expected_temp_table_df['ingestTimestamp'] = pd.to_datetime(
        expected_temp_table_df['ingestTimestamp'])
    # Compare results
    pd.testing.assert_frame_equal(temp_table_results_df,
                                  expected_temp_table_df)
    # Query the final table to test that the merge_updates() function worked
    final_table_query_config = bigquery.QueryJobConfig()
    final_table_query_config.use_legacy_sql = False
    final_table_query = self.bq_client.query(
        query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
            project_id, self.dataset_id, self.user_info_final_id),
        job_config=final_table_query_config,
        location='US')
    final_table_query.result()
    final_table_results_df = final_table_query.to_dataframe() \
        .sort_values(by=['userId']).reset_index(drop=True)
    # Gather expected results for comparison
    expected_final_data_file = os.path.join(
        abs_path, 'test_data/nested_data/user_info_final_expected.json')
    expected_final_table_df = pd.read_json(expected_final_data_file)
    # Reorder columns since read_json() reads them alphabetically
    expected_final_table_df = expected_final_table_df[col_list]
    # convert ingestTimestamp to datetime
    expected_final_table_df['ingestTimestamp'] = pd.to_datetime(
        expected_final_table_df['ingestTimestamp'])
    # Compare results
    pd.testing.assert_frame_equal(final_table_results_df,
                                  expected_final_table_df)