def __init__(self, settings: AthenaToCassandraSettings):
     self.__settings = settings
     self._athena = AthenaUtil(
         database=self.__settings.source_database,
         conn=AwsConnectionManager(
             self.__settings.source_connection_settings))
     (bucket, key) = self._athena.get_table_data_location(
         self.__settings.source_table)
     super().__init__(
         S3ToCassandraSettings(
             source_bucket=bucket,
             source_key_prefix=key,
             source_connection_settings=self.__settings.
             source_connection_settings,
             destination_keyspace=self.__settings.destination_keyspace,
             destination_table=self.__settings.destination_table,
             destination_table_primary_keys=self.__settings.
             destination_table_primary_keys,
             destination_table_partition_key=self.__settings.
             destination_table_partition_key,
             destination_table_options_statement=self.__settings.
             destination_table_options_statement,
             destination_batch_size=self.__settings.destination_batch_size,
             destination_connection_settings=self.__settings.
             destination_connection_settings,
         ))
Exemplo n.º 2
0
    def test__should__get_correct_estimations__with__etl_get_parallel_payloads(
            self):
        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        target_bucket = os.getenv('S3_TEST_BUCKET')
        target_key_prefix = "something/test"

        # Load secrets via env vars
        execfile("../../secrets.py")
        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())
        target_table = "test_adwords_to_athena_table_creation"
        etl_settings = AdWordsToAthenaSettings(
            source_query_fragment=ServiceQueryBuilder().Select('Id').OrderBy(
                'Id'),
            source_service="AdGroupAdService",
            source_service_version="v201809",
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_connection_settings=aws_setting,
            target_database="dev",
            target_table=target_table,
            target_table_ddl_progress=True,
            is_partitioned_table=True,
            partition_values=[("abc", "def"), ("pqr", 123)])
        etl = AdWordsToAthena(etl_settings)

        actual_payloads = etl.get_parallel_payloads(page_size=1000,
                                                    number_of_workers=3)
        expected_payloads = [{
            'number_of_pages': 393,
            'page_size': 1000,
            'start_index': 0,
            'worker': 0
        }, {
            'number_of_pages': 393,
            'page_size': 1000,
            'start_index': 393000,
            'worker': 1
        }, {
            'number_of_pages': 393,
            'page_size': 1000,
            'start_index': 786000,
            'worker': 2
        }]
        self.assertListEqual(expected_payloads, actual_payloads)
        etl.create_athena_table()
        conn = AwsConnectionManager(aws_setting)
        au = AthenaUtil("dev", conn)
        actual = au.get_glue_table_metadata(target_table)
        print(actual)
Exemplo n.º 3
0
    def test__should__create_table__with__a_general_report(self):
        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        target_bucket = os.getenv('S3_TEST_BUCKET')
        target_key_prefix = "something/test"

        # Load secrets via env vars
        execfile("../../secrets.py")
        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())
        target_table = "test_adwords_negative_report"
        etl_settings = AdWordsReportsToAthenaSettings(
            source_query=(ReportQueryBuilder().Select(
                'AccountDescriptiveName', 'CampaignId', 'CampaignName',
                'CampaignStatus', 'Id', 'KeywordMatchType', 'Criteria').From(
                    'CAMPAIGN_NEGATIVE_KEYWORDS_PERFORMANCE_REPORT').Build()),
            source_include_zero_impressions=True,
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_connection_settings=aws_setting,
            target_database="dev",
            target_table=target_table,
            target_table_ddl_progress=True,
            is_partitioned_table=True,
            partition_values=[("abc", "def"), ("pqr", 123)],
            target_file_prefix="data",
            transformation_field_type_mask=None)
        etl = AdWordsReportsToAthena(etl_settings)
        etl.transfer()
        etl.create_athena_table()
        etl.add_partitions()

        au = AthenaUtil(database="dev",
                        conn=AwsConnectionManager(aws_setting),
                        output_bucket=os.environ["S3_TEST_BUCKET"])
        actual = au.run_query(query_string="""
        select * from dev.test_adwords_negative_report limit 10
        """,
                              return_result=True)
        print(actual)
        expected = 11

        self.assertEqual(expected, len(actual["ResultSet"]["Rows"]))
class AthenaToCassandra(S3ToCassandra):
    """
    Class to transfer parquet data from s3 to Cassandra
    Args:
        settings (AthenaToCassandraSettings): the settings around the etl to be executed
    """
    def __init__(self, settings: AthenaToCassandraSettings):
        self.__settings = settings
        self._athena = AthenaUtil(
            database=self.__settings.source_database,
            conn=AwsConnectionManager(
                self.__settings.source_connection_settings))
        (bucket, key) = self._athena.get_table_data_location(
            self.__settings.source_table)
        super().__init__(
            S3ToCassandraSettings(
                source_bucket=bucket,
                source_key_prefix=key,
                source_connection_settings=self.__settings.
                source_connection_settings,
                destination_keyspace=self.__settings.destination_keyspace,
                destination_table=self.__settings.destination_table,
                destination_table_primary_keys=self.__settings.
                destination_table_primary_keys,
                destination_table_options_statement=self.__settings.
                destination_table_options_statement,
                destination_batch_size=self.__settings.destination_batch_size,
                destination_connection_settings=self.__settings.
                destination_connection_settings,
            ))
Exemplo n.º 5
0
 def _get_athena_util(self):
     if self._athena is None:
         self._athena = AthenaUtil(
             database=self.__settings.source_database,
             conn=AwsConnectionManager(
                 self.__settings.source_connection_settings))
     return self._athena
Exemplo n.º 6
0
    def test__get_table_columns__should_return_tuple_when_partition_is_present(
            self):
        mock_au = Mock()
        expected = (
            [
                {
                    "Name": "first_column",
                    "Type": "string"
                },
                {
                    "Name": "second_column",
                    "Type": "string"
                },
            ],
            [
                {
                    "Name": "first_partition_column",
                    "Type": "string"
                },
                {
                    "Name": "second_partition_column",
                    "Type": "string"
                },
            ],
        )
        #
        mock_au.get_glue_table_metadata.return_value = {
            "Table": {
                "Name":
                "test",
                "DatabaseName":
                "test",
                "StorageDescriptor": {
                    "Columns": [
                        {
                            "Name": "first_column",
                            "Type": "string"
                        },
                        {
                            "Name": "second_column",
                            "Type": "string"
                        },
                    ],
                },
                "PartitionKeys": [
                    {
                        "Name": "first_partition_column",
                        "Type": "string"
                    },
                    {
                        "Name": "second_partition_column",
                        "Type": "string"
                    },
                ],
            }
        }

        actual = AthenaUtil.get_table_columns(mock_au, "test")
        self.assertEqual(actual, expected)
Exemplo n.º 7
0
 def _get_athena_util(self) -> AthenaUtil:
     if self._athena is None:
         import uuid
         self._athena = AthenaUtil(
             database=self.__settings.source_database,
             conn=AwsConnectionManager(self.__settings.connection_settings),
             output_key=f"athena_results/{uuid.uuid4().hex}",
             output_bucket=self.__settings.target_s3_bucket,
         )
     return self._athena
Exemplo n.º 8
0
 def test__get_table_data_location__should_return_an_s3_location(self):
     mock_au = Mock()
     expected = ("abc", "def/pqr/")
     #
     mock_au._get_glue_table_metadata.return_value = {
         'Table': {
             'StorageDescriptor': {
                 'Location': "s3://abc/def/pqr/",
             }
         }
     }
     actual = AthenaUtil.get_table_data_location(mock_au, "test")
     self.assertEqual(actual, expected)
Exemplo n.º 9
0
    def test__sql_inspector__should__append_explain_statement_to_query(self):
        expected_values = [
            {
                "schemaName": "foo",
                "tableName": "bar"
            },
            {
                "schemaName": "baz",
                "tableName": "qux"
            },
        ]

        query = """SELECT col1, col2 FROM foo.bar INNER JOIN baz.qux"""

        sql_inspector = SqlInspector(query=query,
                                     athena_util=AthenaUtil(database="test",
                                                            conn=None))

        sql_inspector.query_explaination = (
            explained_queries.SIMPLE_SELECT_EXPLAINED_QUERY)

        sql_inspector.extract_tables_from_explaination()

        assert sql_inspector.table_schema_entries == expected_values
Exemplo n.º 10
0
 def setUpClass(cls):
     cls.au = AthenaUtil(database="test", conn=None)
Exemplo n.º 11
0
    def test__should__create_table__with__geo_performance_report(self):
        aws_setting = AwsConnectionSettings(
            region="ap-southeast-2",
            secrets_manager=AwsSecretsManager(
                access_key_id_var="SOME_CUSTOM_AWS_ACCESS_KEY_ID",
                secret_access_key_var="SOME_CUSTOM_AWS_SECRET_ACCESS_KEY",
                use_session_token=True,
                aws_session_token_var="SOME_CUSTOM_AWS_SESSION_TOKEN"),
            profile=None)
        target_bucket = "test-bucket"
        target_key_prefix = "something/test"

        # Load secrets via env vars
        execfile("../../secrets.py")
        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id="1111111111",
            secrets_manager=GoogleAdWordsSecretsManager())
        target_table = "test_adwords_geo_performance_report"
        etl_settings = AdWordsReportsToAthenaSettings(
            source_query=(
                ReportQueryBuilder().Select(
                    # Attributes
                    'AccountDescriptiveName',
                    'CampaignId',
                    'CityCriteriaId',
                    'CountryCriteriaId',
                    'CustomerDescriptiveName',
                    'ExternalCustomerId',
                    'IsTargetingLocation',
                    'MetroCriteriaId',
                    'MostSpecificCriteriaId',
                    'RegionCriteriaId',

                    # Segments
                    'Date',

                    # Metrics
                    'Impressions',
                    'Clicks',
                    'ConversionRate',
                    'Conversions',
                    'ConversionValue',
                    'Cost',
                    'CostPerConversion').From('GEO_PERFORMANCE_REPORT').During(
                        start_date="20200601", end_date="20200701").Build()),
            source_include_zero_impressions=False,
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_connection_settings=aws_setting,
            target_database="dev",
            target_table=target_table,
            target_table_ddl_progress=True,
            is_partitioned_table=True,
            partition_values=[("abc", "def"), ("pqr", 123)],
            target_file_prefix="data",
            transformation_field_type_mask={
                "country__territory": np.int,
                "region": np.int,
                "most_specific_location": np.int
            })
        etl = AdWordsReportsToAthena(etl_settings)
        etl.transfer()
        etl.create_athena_table()
        etl.add_partitions()

        au = AthenaUtil(database="dev",
                        conn=AwsConnectionManager(aws_setting),
                        output_bucket=os.environ["S3_TEST_BUCKET"])
        actual = au.run_query(query_string="""
            select * from dev.test_adwords_geo_performance_report limit 10
            """,
                              return_result=True)
        print(actual)
        expected = 11

        self.assertEqual(expected, len(actual["ResultSet"]["Rows"]))
Exemplo n.º 12
0
 def _get_athena_util(self):
     return AthenaUtil(
         database=self.__settings.target_database,
         conn=AwsConnectionManager(
             settings=self.__settings.target_connection_settings),
         output_bucket=self.__settings.target_bucket)
Exemplo n.º 13
0
 def _drop_temporary_table(self) -> None:
     au = AthenaUtil(
         database=self.__settings.temporary_database,
         conn=AwsConnectionManager(self.__settings.connection_settings)
     )
     au.drop_table(self.__settings.temporary_table)