def __init__(self, settings: AthenaToCassandraSettings):
    """Resolve the S3 data location backing the Athena source table and
    delegate the actual transfer to the S3 -> Cassandra base class.

    Args:
        settings (AthenaToCassandraSettings): source/destination settings
            for the transfer
    """
    self.__settings = settings
    self._athena = AthenaUtil(
        database=settings.source_database,
        conn=AwsConnectionManager(settings.source_connection_settings))
    # Athena tables are just metadata over S3 data — find the bucket/prefix
    # that actually holds the parquet files.
    bucket, key = self._athena.get_table_data_location(settings.source_table)
    super().__init__(
        S3ToCassandraSettings(
            source_bucket=bucket,
            source_key_prefix=key,
            source_connection_settings=settings.source_connection_settings,
            destination_keyspace=settings.destination_keyspace,
            destination_table=settings.destination_table,
            destination_table_primary_keys=settings.destination_table_primary_keys,
            destination_table_partition_key=settings.destination_table_partition_key,
            destination_table_options_statement=settings.destination_table_options_statement,
            destination_batch_size=settings.destination_batch_size,
            destination_connection_settings=settings.destination_connection_settings,
        ))
def test__should__get_correct_estimations__with__etl_get_parallel_payloads(
        self):
    """Parallel payload estimation should shard the AdWords result pages
    evenly across 3 workers, and the Athena table should then be creatable.

    Integration test: requires live AWS credentials, an S3_TEST_BUCKET env
    var, AdWords secrets in env vars, and a local ../../secrets.py file.
    """
    aws_setting = AwsConnectionSettings(
        region="us-east-1",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    target_bucket = os.getenv('S3_TEST_BUCKET')
    target_key_prefix = "something/test"
    # Load secrets via env vars. execfile() was removed in Python 3; exec
    # the file's contents instead (its side effects land in os.environ).
    with open("../../secrets.py") as secrets_file:
        exec(secrets_file.read())
    adwords_settings = GoogleAdWordsConnectionSettings(
        client_id=os.getenv("adwords_client_id"),
        user_agent="Tester",
        client_customer_id=os.getenv("adwords_client_customer_id"),
        secrets_manager=GoogleAdWordsSecretsManager())
    target_table = "test_adwords_to_athena_table_creation"
    etl_settings = AdWordsToAthenaSettings(
        source_query_fragment=ServiceQueryBuilder().Select('Id').OrderBy(
            'Id'),
        source_service="AdGroupAdService",
        source_service_version="v201809",
        source_connection_settings=adwords_settings,
        target_bucket=target_bucket,
        target_key_prefix=target_key_prefix,
        target_connection_settings=aws_setting,
        target_database="dev",
        target_table=target_table,
        target_table_ddl_progress=True,
        is_partitioned_table=True,
        partition_values=[("abc", "def"), ("pqr", 123)])
    etl = AdWordsToAthena(etl_settings)
    actual_payloads = etl.get_parallel_payloads(page_size=1000,
                                                number_of_workers=3)
    # NOTE(review): page counts below depend on the live account's ad
    # count at test time — brittle against data drift.
    expected_payloads = [{
        'number_of_pages': 393,
        'page_size': 1000,
        'start_index': 0,
        'worker': 0
    }, {
        'number_of_pages': 393,
        'page_size': 1000,
        'start_index': 393000,
        'worker': 1
    }, {
        'number_of_pages': 393,
        'page_size': 1000,
        'start_index': 786000,
        'worker': 2
    }]
    self.assertListEqual(expected_payloads, actual_payloads)
    etl.create_athena_table()
    conn = AwsConnectionManager(aws_setting)
    au = AthenaUtil("dev", conn)
    actual = au.get_glue_table_metadata(target_table)
    print(actual)
def test__should__create_table__with__a_general_report(self):
    """Transfer a campaign-negative-keywords report to S3, create the
    Athena table with partitions, and verify 10 rows (+ header) come back.

    Integration test: requires live AWS credentials, an S3_TEST_BUCKET env
    var, AdWords secrets in env vars, and a local ../../secrets.py file.
    """
    aws_setting = AwsConnectionSettings(
        region="us-east-1",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    target_bucket = os.getenv('S3_TEST_BUCKET')
    target_key_prefix = "something/test"
    # Load secrets via env vars. execfile() was removed in Python 3; exec
    # the file's contents instead (its side effects land in os.environ).
    with open("../../secrets.py") as secrets_file:
        exec(secrets_file.read())
    adwords_settings = GoogleAdWordsConnectionSettings(
        client_id=os.getenv("adwords_client_id"),
        user_agent="Tester",
        client_customer_id=os.getenv("adwords_client_customer_id"),
        secrets_manager=GoogleAdWordsSecretsManager())
    target_table = "test_adwords_negative_report"
    etl_settings = AdWordsReportsToAthenaSettings(
        source_query=(ReportQueryBuilder().Select(
            'AccountDescriptiveName', 'CampaignId', 'CampaignName',
            'CampaignStatus', 'Id', 'KeywordMatchType', 'Criteria').From(
                'CAMPAIGN_NEGATIVE_KEYWORDS_PERFORMANCE_REPORT').Build()),
        source_include_zero_impressions=True,
        source_connection_settings=adwords_settings,
        target_bucket=target_bucket,
        target_key_prefix=target_key_prefix,
        target_connection_settings=aws_setting,
        target_database="dev",
        target_table=target_table,
        target_table_ddl_progress=True,
        is_partitioned_table=True,
        partition_values=[("abc", "def"), ("pqr", 123)],
        target_file_prefix="data",
        transformation_field_type_mask=None)
    etl = AdWordsReportsToAthena(etl_settings)
    etl.transfer()
    etl.create_athena_table()
    etl.add_partitions()
    au = AthenaUtil(database="dev",
                    conn=AwsConnectionManager(aws_setting),
                    output_bucket=os.environ["S3_TEST_BUCKET"])
    actual = au.run_query(query_string="""
        select * from dev.test_adwords_negative_report limit 10
        """,
                          return_result=True)
    print(actual)
    # Athena's ResultSet includes the header row, hence 10 data rows -> 11.
    expected = 11
    self.assertEqual(expected, len(actual["ResultSet"]["Rows"]))
class AthenaToCassandra(S3ToCassandra):
    """Transfer the parquet data backing an Athena table from S3 to
    Cassandra.

    Args:
        settings (AthenaToCassandraSettings): the settings around the etl
            to be executed
    """

    def __init__(self, settings: AthenaToCassandraSettings):
        self.__settings = settings
        self._athena = AthenaUtil(
            database=settings.source_database,
            conn=AwsConnectionManager(settings.source_connection_settings))
        # An Athena table is just metadata over S3 — resolve the bucket and
        # prefix that hold the actual data files.
        bucket, key = self._athena.get_table_data_location(
            settings.source_table)
        # NOTE(review): no destination_table_partition_key is forwarded
        # here — confirm S3ToCassandraSettings defaults it.
        super().__init__(
            S3ToCassandraSettings(
                source_bucket=bucket,
                source_key_prefix=key,
                source_connection_settings=settings.source_connection_settings,
                destination_keyspace=settings.destination_keyspace,
                destination_table=settings.destination_table,
                destination_table_primary_keys=settings.destination_table_primary_keys,
                destination_table_options_statement=settings.destination_table_options_statement,
                destination_batch_size=settings.destination_batch_size,
                destination_connection_settings=settings.destination_connection_settings,
            ))
def _get_athena_util(self):
    """Return the AthenaUtil for the source database, building and caching
    it on first use."""
    if self._athena is None:
        connection = AwsConnectionManager(
            self.__settings.source_connection_settings)
        self._athena = AthenaUtil(
            database=self.__settings.source_database, conn=connection)
    return self._athena
def test__get_table_columns__should_return_tuple_when_partition_is_present(
        self):
    """get_table_columns should split glue metadata into a
    (data_columns, partition_columns) tuple when partition keys exist."""
    data_columns = [
        {"Name": "first_column", "Type": "string"},
        {"Name": "second_column", "Type": "string"},
    ]
    partition_columns = [
        {"Name": "first_partition_column", "Type": "string"},
        {"Name": "second_partition_column", "Type": "string"},
    ]
    # Stub out the glue metadata call so no AWS access is needed.
    stub_util = Mock()
    stub_util.get_glue_table_metadata.return_value = {
        "Table": {
            "Name": "test",
            "DatabaseName": "test",
            "StorageDescriptor": {
                "Columns": [
                    {"Name": "first_column", "Type": "string"},
                    {"Name": "second_column", "Type": "string"},
                ],
            },
            "PartitionKeys": [
                {"Name": "first_partition_column", "Type": "string"},
                {"Name": "second_partition_column", "Type": "string"},
            ],
        }
    }
    actual = AthenaUtil.get_table_columns(stub_util, "test")
    self.assertEqual(actual, (data_columns, partition_columns))
def _get_athena_util(self) -> AthenaUtil:
    """Return a cached AthenaUtil, creating it on first call with a unique
    per-instance S3 output key so concurrent runs don't collide."""
    if self._athena is None:
        import uuid
        settings = self.__settings
        self._athena = AthenaUtil(
            database=settings.source_database,
            conn=AwsConnectionManager(settings.connection_settings),
            output_key="athena_results/{}".format(uuid.uuid4().hex),
            output_bucket=settings.target_s3_bucket,
        )
    return self._athena
def test__get_table_data_location__should_return_an_s3_location(self):
    """get_table_data_location should split the glue Location URL into a
    (bucket, key_prefix) tuple."""
    mock_au = Mock()
    expected = ("abc", "def/pqr/")
    # Stub the *public* metadata accessor. The sibling get_table_columns
    # test (and live callers) use get_glue_table_metadata without a leading
    # underscore; configuring a differently-named attribute on a plain Mock
    # is silently accepted, and the code under test would then receive a
    # fresh auto-created Mock instead of this payload.
    mock_au.get_glue_table_metadata.return_value = {
        'Table': {
            'StorageDescriptor': {
                'Location': "s3://abc/def/pqr/",
            }
        }
    }
    actual = AthenaUtil.get_table_data_location(mock_au, "test")
    self.assertEqual(actual, expected)
def test__sql_inspector__should__append_explain_statement_to_query(self):
    """Extracting tables from a canned EXPLAIN output should yield one
    schema/table entry per referenced table."""
    query = """SELECT col1, col2 FROM foo.bar INNER JOIN baz.qux"""
    inspector = SqlInspector(
        query=query,
        athena_util=AthenaUtil(database="test", conn=None))
    # Inject a pre-baked explain result so no query is actually run.
    inspector.query_explaination = (
        explained_queries.SIMPLE_SELECT_EXPLAINED_QUERY)
    inspector.extract_tables_from_explaination()
    assert inspector.table_schema_entries == [
        {"schemaName": "foo", "tableName": "bar"},
        {"schemaName": "baz", "tableName": "qux"},
    ]
def setUpClass(cls):
    # Shared AthenaUtil fixture for all tests in this class. conn=None is
    # presumably acceptable because these tests never execute real queries
    # — verify the exercised helpers are pure.
    cls.au = AthenaUtil(database="test", conn=None)
def test__should__create_table__with__geo_performance_report(self):
    """Transfer a GEO_PERFORMANCE_REPORT to S3, create the partitioned
    Athena table, and verify 10 rows (+ header row) can be queried back.

    Integration test: requires live AWS credentials (custom env var names),
    an S3_TEST_BUCKET env var, AdWords secrets, and ../../secrets.py.
    """
    aws_setting = AwsConnectionSettings(
        region="ap-southeast-2",
        secrets_manager=AwsSecretsManager(
            access_key_id_var="SOME_CUSTOM_AWS_ACCESS_KEY_ID",
            secret_access_key_var="SOME_CUSTOM_AWS_SECRET_ACCESS_KEY",
            use_session_token=True,
            aws_session_token_var="SOME_CUSTOM_AWS_SESSION_TOKEN"),
        profile=None)
    target_bucket = "test-bucket"
    target_key_prefix = "something/test"
    # Load secrets via env vars. execfile() was removed in Python 3; exec
    # the file's contents instead (its side effects land in os.environ).
    with open("../../secrets.py") as secrets_file:
        exec(secrets_file.read())
    adwords_settings = GoogleAdWordsConnectionSettings(
        client_id=os.getenv("adwords_client_id"),
        user_agent="Tester",
        client_customer_id="1111111111",
        secrets_manager=GoogleAdWordsSecretsManager())
    target_table = "test_adwords_geo_performance_report"
    etl_settings = AdWordsReportsToAthenaSettings(
        source_query=(
            ReportQueryBuilder().Select(
                # Attributes
                'AccountDescriptiveName',
                'CampaignId',
                'CityCriteriaId',
                'CountryCriteriaId',
                'CustomerDescriptiveName',
                'ExternalCustomerId',
                'IsTargetingLocation',
                'MetroCriteriaId',
                'MostSpecificCriteriaId',
                'RegionCriteriaId',
                # Segments
                'Date',
                # Metrics
                'Impressions',
                'Clicks',
                'ConversionRate',
                'Conversions',
                'ConversionValue',
                'Cost',
                'CostPerConversion').From('GEO_PERFORMANCE_REPORT').During(
                    start_date="20200601", end_date="20200701").Build()),
        source_include_zero_impressions=False,
        source_connection_settings=adwords_settings,
        target_bucket=target_bucket,
        target_key_prefix=target_key_prefix,
        target_connection_settings=aws_setting,
        target_database="dev",
        target_table=target_table,
        target_table_ddl_progress=True,
        is_partitioned_table=True,
        partition_values=[("abc", "def"), ("pqr", 123)],
        target_file_prefix="data",
        # np.int was a deprecated alias for the builtin int and was removed
        # in NumPy 1.24 — use int directly (identical behavior).
        transformation_field_type_mask={
            "country__territory": int,
            "region": int,
            "most_specific_location": int
        })
    etl = AdWordsReportsToAthena(etl_settings)
    etl.transfer()
    etl.create_athena_table()
    etl.add_partitions()
    au = AthenaUtil(database="dev",
                    conn=AwsConnectionManager(aws_setting),
                    output_bucket=os.environ["S3_TEST_BUCKET"])
    actual = au.run_query(query_string="""
        select * from dev.test_adwords_geo_performance_report limit 10
        """,
                          return_result=True)
    print(actual)
    # Athena's ResultSet includes the header row, hence 10 data rows -> 11.
    expected = 11
    self.assertEqual(expected, len(actual["ResultSet"]["Rows"]))
def _get_athena_util(self):
    """Build a fresh AthenaUtil pointed at the target database, writing
    query output to the target bucket."""
    connection = AwsConnectionManager(
        settings=self.__settings.target_connection_settings)
    return AthenaUtil(
        database=self.__settings.target_database,
        conn=connection,
        output_bucket=self.__settings.target_bucket)
def _drop_temporary_table(self) -> None:
    """Drop the temporary staging table used during the transfer."""
    athena = AthenaUtil(
        database=self.__settings.temporary_database,
        conn=AwsConnectionManager(self.__settings.connection_settings))
    athena.drop_table(self.__settings.temporary_table)