Example #1
    @classmethod
    def setUpClass(cls):
        cls.bucket = "TEST_BUCKET"
        conn = AwsConnectionManager(
            AwsConnectionSettings(region="us-east-1",
                                  secrets_manager=AwsSecretsManager(),
                                  profile=None))
        cls.s3 = S3Util(conn=conn, bucket=cls.bucket)
Example #2
    def test__list_source_files__should_list_the_correct_source_files_as_an_array(self):
        test_bucket = "test"
        test_source_key_prefix = "test_for_s3_to_cassandra"
        aws_conn_settings = AwsConnectionSettings(
            region="ap-southeast-2",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        s3 = S3Util(conn=AwsConnectionManager(aws_conn_settings), bucket=test_bucket)
        s3.create_bucket()
        for itr in range(3):
            s3.upload_file(self.sample_file_location, f"{test_source_key_prefix}/{itr}.abc")
        expected = [
            f"{test_source_key_prefix}/0.abc",
            f"{test_source_key_prefix}/1.abc",
            f"{test_source_key_prefix}/2.abc", ]

        util = S3ToCassandra(S3ToCassandraSettings(
            source_bucket=test_bucket,
            source_key_prefix=test_source_key_prefix,
            source_connection_settings=aws_conn_settings,
            destination_keyspace="test",
            destination_table="test",
            destination_table_primary_keys=[],
            destination_table_options_statement="",
            destination_batch_size=2,
            destination_connection_settings=Mock(),
        ))

        actual = util.list_source_files()
        self.assertListEqual(actual, expected)
Example #3
    def test_the_etl_should_download_first_file_correctly(self):
        test_bucket = "SOME_OTHER_BUCKET"
        test_key = "some/key"
        aws_conn = AwsConnectionSettings(region="ap-southeast-2",
                                         secrets_manager=AwsSecretsManager(),
                                         profile=None)
        s3_util = S3Util(conn=AwsConnectionManager(aws_conn),
                         bucket=test_bucket)
        df1 = pd.DataFrame(dict(A=range(10)),
                           index=pd.date_range('20130101',
                                               periods=10,
                                               freq='d'))
        df2 = pd.DataFrame(dict(A=range(10, 20)),
                           index=pd.date_range('20130111',
                                               periods=10,
                                               freq='d'))

        s3_util.create_bucket()
        s3_util.upload_dataframe_as_parquet(df1, test_key, "df1")
        s3_util.upload_dataframe_as_parquet(df2, test_key, "df2")

        settings = S3ToDataFrameSettings(source_bucket=test_bucket,
                                         source_key_prefix=test_key,
                                         source_connection_settings=aws_conn)

        etl = S3ToDataFrame(settings)

        expected = pd.DataFrame(dict(A=range(20)),
                                index=pd.date_range('20130101',
                                                    periods=20,
                                                    freq='d'))
        assert_frame_equal(expected, etl.get_all_files_as_data_frame())
Example #4
    def test_the_etl_should_list_files_correctly(self):
        test_bucket = "TEST_BUCKET_ITR"
        test_key = "some/key"
        aws_conn = AwsConnectionSettings(region="ap-southeast-2",
                                         secrets_manager=AwsSecretsManager(),
                                         profile=None)
        s3_util = S3Util(conn=AwsConnectionManager(aws_conn),
                         bucket=test_bucket)
        df1 = pd.DataFrame(dict(A=range(10000)),
                           index=pd.date_range('20130101',
                                               periods=10000,
                                               freq='s'))
        df2 = pd.DataFrame(dict(A=range(10000)),
                           index=pd.date_range('20140101',
                                               periods=10000,
                                               freq='s'))
        df3 = pd.DataFrame(dict(A=range(10000)),
                           index=pd.date_range('20150101',
                                               periods=10000,
                                               freq='s'))

        s3_util.create_bucket()
        s3_util.upload_dataframe_as_parquet(df1, test_key, "df1")
        s3_util.upload_dataframe_as_parquet(df2, test_key, "df2")
        s3_util.upload_dataframe_as_parquet(df3, test_key, "df3")

        settings = S3ToDataFrameSettings(source_bucket=test_bucket,
                                         source_key_prefix=test_key,
                                         source_connection_settings=aws_conn)

        etl = S3ToDataFrame(settings)

        assert_frame_equal(df1, etl.next())
        assert_frame_equal(df2, etl.next())
        assert_frame_equal(df3, etl.next())
Example #5
    def integration_test_should__load_sheet_to_athena__when_using_sheetUtil(
            self):
        with open('../resources/key-file.json', 'r') as f:
            obj = json.load(f)

        GoogleSheetToAthena(
            GoogleSheetsToAthenaSettings(
                source_workbook_url='https://docs.google.com/spreadsheets/d'
                '/1W1vICFsHacKyrzCBLfsQM/edit?usp=sharing',
                source_sheet='spec_example',
                source_row_range=None,
                source_field_names_row_number=5,
                source_field_types_row_number=4,
                source_data_start_row_number=6,
                source_connection_settings=GoogleApiConnectionSettings(
                    secrets_manager=GoogleApiSecretsManager(
                        source=DictKeyValueSource({"key_json": obj}),
                        key_json_var="key_json")),
                manual_partition_key_value={
                    "column": "start_date",
                    "value": "2020-03-11"
                },
                target_database='dev',
                target_table_name='test_sheets_example_v2',
                target_s3_bucket='au-com-hipages-data-scratchpad',
                target_s3_dir='sheets_example_v2',
                target_connection_settings=AwsConnectionSettings(
                    region='us-east-1',
                    profile='default',
                    secrets_manager=None),
                target_table_ddl_progress=True)).load_sheet_to_athena()
Example #6
    def test_should__copy_file_from_one_bucket_to_another__when_valid_locations_are_given(
            self):
        dest_bucket = "TEST_BUCKET_DEST"
        conn = AwsConnectionManager(
            AwsConnectionSettings(region="ap-southeast-2",
                                  secrets_manager=AwsSecretsManager(),
                                  profile=None))
        s3_util_for_destination = S3Util(conn=conn, bucket=dest_bucket)
        s3_util_for_source = self.s3

        s3_util_for_source.create_bucket()
        s3_util_for_destination.create_bucket()

        tmp_file_path = "/tmp/testfile.txt"
        dirname = os.path.dirname(tmp_file_path)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        with open(tmp_file_path, "w+") as file:
            file.write("Test file content")

        s3_util_for_source.upload_file(tmp_file_path, "test/testfile.txt")

        s3_util_for_source.move_recursive_to_different_bucket(
            source_key_prefix="test/",
            destination_bucket_name=dest_bucket,
            destination_key_prefix="{}/test_copy/".format(dest_bucket))
        actual = s3_util_for_destination.read_lines_as_list("test_copy")[0]

        expected = "Test file content"
        self.assertEqual(actual, expected)
Example #7
def create_batch_s3_uploader(batch_s3_uploader_config=None):
    """
    Factory method to generate a Kafka batch uploader

    Args:
        batch_s3_uploader_config (BatchS3UploaderConfig): Configuration
            object for the s3 uploader
    Returns (KafkaS3BatchExporter): Instantiated Exporter

    """

    if batch_s3_uploader_config is None:
        batch_s3_uploader_config = BatchS3UploaderConfig()

    aws_region = os.environ.get("AWS_DEFAULT_REGION")

    conn = AwsConnectionManager(
        AwsConnectionSettings(region=aws_region,
                              secrets_manager=AwsSecretsManager(),
                              profile=None))

    s3_client = S3Util(conn, batch_s3_uploader_config.bucket)

    return KafkaS3BatchExporter(batch_s3_uploader_config.root_path, s3_client,
                                batch_s3_uploader_config.ts_col_nm,
                                batch_s3_uploader_config.partition_key_nm)
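
For context, a minimal usage sketch of this factory follows. It is hypothetical: it relies on the no-argument BatchS3UploaderConfig() default visible in the function body, and the credential environment variables that AwsSecretsManager reads are assumptions.

import os

# Assumption: AwsSecretsManager picks its credentials up from env vars,
# so export them (and the region) before calling the factory.
os.environ.setdefault("AWS_DEFAULT_REGION", "ap-southeast-2")

# With no argument the factory falls back to a default BatchS3UploaderConfig()
exporter = create_batch_s3_uploader()

# An explicit config object can also be injected, which helps in tests:
# exporter = create_batch_s3_uploader(my_batch_s3_uploader_config)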
Example #8
    def test_the_transformation_works(self):
        aws_conn = AwsConnectionSettings(region="us-east-1",
                                         secrets_manager=None,
                                         profile="default")
        execfile("../../secrets.py")

        cassandra_conn_setting = Mock()

        settings = AthenaToAdWordsOfflineConversionSettings(
            source_database=os.getenv("dummy_athena_database"),
            source_table=os.getenv("dummy_athena_table"),
            source_connection_settings=aws_conn,
            etl_identifier="xxxx",
            destination_batch_size=100,
            etl_state_manager_connection=cassandra_conn_setting,
            etl_state_manager_keyspace="test",
            transformation_column_mapping={
                'abc': 'googleClickId',
                'def1': 'conversionName',
                'def2': 'conversionTime',
                'def4': 'conversionValue'
            },
            destination_connection_settings=GoogleAdWordsConnectionSettings(
                client_id=os.getenv("adwords_client_id"),
                user_agent="Tester",
                client_customer_id=os.getenv("adwords_client_customer_id"),
                secrets_manager=GoogleAdWordsSecretsManager()),
        )
        etl = AthenaToAdWordsOfflineConversion(settings)

        df = DataFrame([{
            "abc": "123",
            "def1": "123",
            "def2": "123",
            "def3": "123",
            "def4": "123",
            "def5": "123",
        }, {
            "abc": "222",
            "def1": "333",
            "def2": "444",
            "def3": "333",
            "def4": "333",
            "def5": "333",
        }])
        result = etl._data_frame_to_destination_dict(df)
        expected = [{
            'conversionName': '123',
            'conversionTime': '123',
            'conversionValue': '123',
            'googleClickId': '123'
        }, {
            'conversionName': '333',
            'conversionTime': '444',
            'conversionValue': '333',
            'googleClickId': '222'
        }]
        self.assertListEqual(result, expected)
Example #9
    def test__should__get_correct_estimations__with__etl_get_parallel_payloads(
            self):
        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        target_bucket = os.getenv('S3_TEST_BUCKET')
        target_key_prefix = "something/test"

        # Load secrets via env vars
        execfile("../../secrets.py")
        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())
        target_table = "test_adwords_to_athena_table_creation"
        etl_settings = AdWordsToAthenaSettings(
            source_query_fragment=ServiceQueryBuilder().Select('Id').OrderBy(
                'Id'),
            source_service="AdGroupAdService",
            source_service_version="v201809",
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_connection_settings=aws_setting,
            target_database="dev",
            target_table=target_table,
            target_table_ddl_progress=True,
            is_partitioned_table=True,
            partition_values=[("abc", "def"), ("pqr", 123)])
        etl = AdWordsToAthena(etl_settings)

        actual_payloads = etl.get_parallel_payloads(page_size=1000,
                                                    number_of_workers=3)
        expected_payloads = [{
            'number_of_pages': 393,
            'page_size': 1000,
            'start_index': 0,
            'worker': 0
        }, {
            'number_of_pages': 393,
            'page_size': 1000,
            'start_index': 393000,
            'worker': 1
        }, {
            'number_of_pages': 393,
            'page_size': 1000,
            'start_index': 786000,
            'worker': 2
        }]
        self.assertListEqual(expected_payloads, actual_payloads)
        etl.create_athena_table()
        conn = AwsConnectionManager(aws_setting)
        au = AthenaUtil("dev", conn)
        actual = au.get_glue_table_metadata(target_table)
        print(actual)
Example #10
    def test_full_integration_with_local_cassandra(self):
        aws_conn = AwsConnectionSettings(region="us-east-1",
                                         secrets_manager=None,
                                         profile="default")
        execfile("../../secrets.py")

        compose = DockerCompose(filepath=os.path.dirname(base.__file__))
        with compose:
            host = compose.get_service_host("cassandra", 9042)
            port = int(compose.get_service_port("cassandra", 9042))

            cassandra_conn_setting = CassandraConnectionSettings(
                cluster_ips=[host],
                port=port,
                load_balancing_policy=DCAwareRoundRobinPolicy(),
                secrets_manager=CassandraSecretsManager(
                    source=DictKeyValueSource({
                        "CASSANDRA_USERNAME": "",
                        "CASSANDRA_PASSWORD": "",
                    })),
            )

            verify_container_is_up(cassandra_conn_setting)

            settings = AthenaToAdWordsOfflineConversionSettings(
                source_database=os.getenv("dummy_athena_database"),
                source_table=os.getenv("dummy_athena_table"),
                source_connection_settings=aws_conn,
                etl_identifier="test",
                destination_batch_size=100,
                etl_state_manager_connection=cassandra_conn_setting,
                etl_state_manager_keyspace="test",
                transformation_column_mapping={
                    'google_click_id': 'googleClickId',
                    'conversion_name': 'conversionName',
                    'conversion_time': 'conversionTime',
                    'conversion_value': 'conversionValue',
                    'conversion_currency_code': 'conversionCurrencyCode'
                },
                destination_connection_settings=GoogleAdWordsConnectionSettings(
                    client_id=os.getenv("adwords_client_id"),
                    user_agent="Tester",
                    client_customer_id=os.getenv("adwords_client_customer_id"),
                    secrets_manager=GoogleAdWordsSecretsManager()),
            )
            etl = AthenaToAdWordsOfflineConversion(settings)
            files_actual = etl.list_source_files()

            act = etl.upload_all()
            self.assertListEqual(act, [])
Example #11
    def test__should__create_s3_file_for_the_given_indices(self):
        # Load secrets via env vars
        execfile("../../secrets.py")

        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        target_bucket = os.getenv('S3_TEST_BUCKET')
        target_key_prefix = "something/test"
        conn = AwsConnectionManager(aws_setting)
        s3u = S3Util(conn=conn, bucket=target_bucket)
        s3u.delete_recursive(target_key_prefix)
        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())

        adword_to_s3_util = AdWordsToS3(settings=AdWordsToS3Settings(
            source_query_fragment=ServiceQueryBuilder().Select(
                # Attributes
                'BaseAdGroupId',
                'Id',
                'CampaignId',
                'CampaignName',
                'Name',
                'Status',
                'ContentBidCriterionTypeGroup',
                'BaseCampaignId',
                'TrackingUrlTemplate',
                'FinalUrlSuffix',
                'UrlCustomParameters',
                'AdGroupType').OrderBy('Id'),
            source_service="AdGroupService",
            source_service_version="v201809",
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_file_prefix=None,
            target_connection_settings=aws_setting))
        adword_to_s3_util.build_query(start_index=35000,
                                      page_size=1000,
                                      num_iterations=1)
        adword_to_s3_util.transfer_all()
        actual = s3u.get_keys(target_key_prefix)
        expected = [
            f"{target_key_prefix}/index_35000__35999.parquet"
        ]

        self.assertListEqual(expected, actual)
Example #12
    def test__should__get_correct_estimations__with__etl_get_parallel_payloads(
            self):
        # Load secrets via env vars
        execfile("../../secrets.py")

        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        target_bucket = os.getenv('S3_TEST_BUCKET')
        target_key_prefix = "something/test"

        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())

        etl_settings = AdWordsToS3Settings(
            source_query_fragment=ServiceQueryBuilder().Select('Id').OrderBy(
                'Id'),
            source_service="AdGroupAdService",
            source_service_version="v201809",
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_file_prefix=None,
            target_connection_settings=aws_setting)
        etl = AdWordsToS3(etl_settings)

        actual_payloads = etl.get_parallel_payloads(page_size=1000,
                                                    number_of_workers=3)
        expected_payloads = [{
            'worker': 0,
            'start_index': 0,
            'number_of_pages': 435,
            'page_size': 1000
        }, {
            'worker': 1,
            'start_index': 435000,
            'number_of_pages': 435,
            'page_size': 1000
        }, {
            'worker': 2,
            'start_index': 870000,
            'number_of_pages': 435,
            'page_size': 1000
        }]

        self.assertListEqual(expected_payloads, actual_payloads)
Example #13
    def test__should__create_table__with__a_general_report(self):
        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        target_bucket = os.getenv('S3_TEST_BUCKET')
        target_key_prefix = "something/test"

        # Load secrets via env vars
        execfile("../../secrets.py")
        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())
        target_table = "test_adwords_negative_report"
        etl_settings = AdWordsReportsToAthenaSettings(
            source_query=(ReportQueryBuilder().Select(
                'AccountDescriptiveName', 'CampaignId', 'CampaignName',
                'CampaignStatus', 'Id', 'KeywordMatchType', 'Criteria').From(
                    'CAMPAIGN_NEGATIVE_KEYWORDS_PERFORMANCE_REPORT').Build()),
            source_include_zero_impressions=True,
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_connection_settings=aws_setting,
            target_database="dev",
            target_table=target_table,
            target_table_ddl_progress=True,
            is_partitioned_table=True,
            partition_values=[("abc", "def"), ("pqr", 123)],
            target_file_prefix="data",
            transformation_field_type_mask=None)
        etl = AdWordsReportsToAthena(etl_settings)
        etl.transfer()
        etl.create_athena_table()
        etl.add_partitions()

        au = AthenaUtil(database="dev",
                        conn=AwsConnectionManager(aws_setting),
                        output_bucket=os.environ["S3_TEST_BUCKET"])
        actual = au.run_query(query_string="""
        select * from dev.test_adwords_negative_report limit 10
        """,
                              return_result=True)
        print(actual)
        expected = 11

        self.assertEqual(expected, len(actual["ResultSet"]["Rows"]))
Example #14
def transfer_files():
    """
    Transfers files from one bucket to another.

    Returns: None

    """
    # These AWS settings assume that your AWS access keys are in the standard env vars
    aws_setting = AwsConnectionSettings(region="ap-southeast-2",
                                        secrets_manager=AwsSecretsManager(),
                                        profile=None)

    # If you want to use the AWS profiles stored by the aws cli tool, uncomment the following code:
    # aws_setting = AwsConnectionSettings(
    #     region="us-east-1",
    #     secrets_manager=None,
    #     profile="default")

    # Define the ETL instance
    etl = S3ToS3FileCopy(
        source=S3SourceSettings(
            bucket="my_source_bucket",
            key_prefix="source/prefix",
            suffix=None,
            connection_settings=aws_setting,
        ),
        sink=S3SinkSettings(
            bucket="my_target_bucket",
            connection_settings=aws_setting,
        ),
        transformers=[
            AddTargetS3KeyTransformer(target_key_prefix="target/prefix")
        ],
    )
    # Check the files that will be transferred
    files = etl.list_source_files()
    print(files)

    # If you want to transfer files in a loop
    while etl.has_next():
        etl.execute_next()

    # Reset the source state
    etl.reset_source()

    # If you want to transfer all files sequentially
    etl.execute_all()
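
A minimal invocation sketch for the function above, assuming AwsSecretsManager reads the standard AWS credential environment variables (Example #21 shows the variable names are configurable, so treat the names below as assumptions):

if __name__ == "__main__":
    # Assumption: credentials are exported beforehand, e.g.
    #   export AWS_ACCESS_KEY_ID=...
    #   export AWS_SECRET_ACCESS_KEY=...
    transfer_files()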
Example #15
    def test__transfer_file__should_work(self):
        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        source_bucket = "TEST_SOURCE_BUCKET"
        target_bucket = "TEST_TARGET_BUCKET"
        conn = AwsConnectionManager(aws_setting)
        s3_util_for_destination = S3Util(conn=conn, bucket=target_bucket)
        s3_util_for_source = S3Util(conn=conn, bucket=source_bucket)

        s3_util_for_source.create_bucket()
        s3_util_for_destination.create_bucket()

        file = NamedTemporaryFile("w+", delete=False)
        file.write("Test file content")
        file.flush()

        s3_util_for_source.upload_file(file.name,
                                       "source/prefix/test_file.txt")

        file2 = NamedTemporaryFile("w+", delete=False)
        file2.write("Test file content")
        file2.flush()
        s3_util_for_source.upload_file(file2.name,
                                       "source/prefix/txt_file.parquet")
        etl = S3ToS3FileCopy(
            source=s3.S3SourceSettings(
                bucket=source_bucket,
                key_prefix="source/prefix",
                suffix=".txt",
                connection_settings=aws_setting,
            ),
            sink=s3.S3SinkSettings(
                bucket=target_bucket,
                connection_settings=aws_setting,
            ),
            transformers=[
                AddTargetS3KeyTransformer(target_key_prefix="target/prefix")
            ],
        )
        etl.execute_next()
        actual = s3_util_for_destination.get_keys("")
        expected_destination_keys = ['target/prefix/test_file.txt']
        self.assertListEqual(expected_destination_keys, actual)
Example #16
    def test_the_etl_should_work(self):
        test_database = "xxx"
        test_table = "xxx"
        aws_conn = AwsConnectionSettings(region="ap-southeast-2",
                                         secrets_manager=None,
                                         profile="default")

        settings = AthenaToDataFrameSettings(
            source_database=test_database,
            source_table=test_table,
            source_connection_settings=aws_conn)

        etl = AthenaToDataFrame(settings)

        expected_s3_files = ['xxx']
        self.assertEqual(expected_s3_files, etl.list_source_files())

        self.assertEqual((47848, 28), etl.next().shape)

        self.assertEqual((47848, 28), etl.get_all_files_as_data_frame().shape)
Example #17
    def test__should__transfer_correct_amount_of_files__with__one_parallel_fragment(
            self):
        # Load secrets via env vars
        execfile("../../secrets.py")
        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        target_bucket = os.getenv('S3_TEST_BUCKET')
        target_key_prefix = "tmp/test/hip_data_tools/adwords_to_s3/test"
        conn = AwsConnectionManager(aws_setting)
        s3u = S3Util(conn=conn, bucket=target_bucket)
        s3u.delete_recursive(target_key_prefix)
        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())

        etl_settings = AdWordsToS3Settings(
            source_query_fragment=ServiceQueryBuilder().Select('Id').OrderBy(
                'Id'),
            source_service="AdGroupAdService",
            source_service_version="v201809",
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_file_prefix=None,
            target_connection_settings=aws_setting)
        etl = AdWordsToS3(etl_settings)
        etl.build_query(start_index=0, page_size=5, num_iterations=2)

        etl.transfer_all()

        actual = s3u.get_keys(target_key_prefix)
        print(actual)
        expected = [
            'tmp/test/hip_data_tools/adwords_to_s3/test/index_0__4.parquet',
            'tmp/test/hip_data_tools/adwords_to_s3/test/index_5__9.parquet'
        ]
        self.assertListEqual(expected, actual)
Example #18
    def test__transfer_all_files__should_work_without_transformers(self):
        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        source_bucket = "TEST_SOURCE_BUCKET"
        target_bucket = "TEST_TARGET_BUCKET"
        conn = AwsConnectionManager(aws_setting)
        s3_util_for_destination = S3Util(conn=conn, bucket=target_bucket)
        s3_util_for_source = S3Util(conn=conn, bucket=source_bucket)

        s3_util_for_source.create_bucket()
        s3_util_for_destination.create_bucket()

        for itr in range(10):
            file = NamedTemporaryFile("w+", delete=False)
            file.write("Test file content")
            file.flush()
            s3_util_for_source.upload_file(
                file.name, f"source/prefix/txt_file{itr}.parquet")

        etl = S3ToS3FileCopy(
            source=s3.S3SourceSettings(
                bucket=source_bucket,
                key_prefix="source/prefix",
                suffix=None,
                connection_settings=aws_setting,
            ),
            sink=s3.S3SinkSettings(
                bucket=target_bucket,
                connection_settings=aws_setting,
            ),
        )
        etl.execute_all()
        actual = s3_util_for_destination.get_keys("")
        expected_destination_keys = [
            f'source/prefix/txt_file{itr}.parquet' for itr in range(10)
        ]
        self.assertListEqual(expected_destination_keys, actual)
Example #19
    def test_adwords_upload_with_duplicates_in_same_batch(self):
        aws_conn = AwsConnectionSettings(region="us-east-1",
                                         secrets_manager=None,
                                         profile="default")
        execfile("../../secrets.py")

        compose = DockerCompose(filepath=os.path.dirname(base.__file__))
        with compose:
            host = compose.get_service_host("cassandra", 9042)
            port = int(compose.get_service_port("cassandra", 9042))

            cassandra_conn_setting = CassandraConnectionSettings(
                cluster_ips=[host],
                port=port,
                load_balancing_policy=DCAwareRoundRobinPolicy(),
                secrets_manager=CassandraSecretsManager(
                    source=DictKeyValueSource({
                        "CASSANDRA_USERNAME": "",
                        "CASSANDRA_PASSWORD": "",
                    })),
            )

            verify_container_is_up(cassandra_conn_setting)

            settings = AthenaToAdWordsOfflineConversionSettings(
                source_database=os.getenv("dummy_athena_database"),
                source_table=os.getenv("dummy_athena_table"),
                source_connection_settings=aws_conn,
                etl_identifier="xxxx",
                destination_batch_size=100,
                etl_state_manager_connection=cassandra_conn_setting,
                etl_state_manager_keyspace="test",
                transformation_column_mapping={
                    'googleClickId': 'googleClickId',
                    'conversionName': 'conversionName',
                    'conversionTime': 'conversionTime',
                    'conversionValue': 'conversionValue',
                    'conversionCurrencyCode': 'conversionCurrencyCode'
                },
                destination_connection_settings=GoogleAdWordsConnectionSettings(
                    client_id=os.getenv("adwords_client_id"),
                    user_agent="Tester",
                    client_customer_id=os.getenv("adwords_client_customer_id"),
                    secrets_manager=GoogleAdWordsSecretsManager()),
            )
            etl = AthenaToAdWordsOfflineConversion(settings)
            test_df = DataFrame([
                {
                    'googleClickId': 'xxx',
                    'conversionName': 'claim_attempts_testing',
                    'conversionTime': '20200309 074357 UTC',
                    'conversionValue': 17.0,
                    'conversionCurrencyCode': 'AUD',
                },
                {
                    'googleClickId': "Cj0KCQiAqY3zBRDQARIsAJeCVxOIyZ8avQ0he3WIpHPwV6hRn"
                                     "-8Y2gDrUBJcc95tDdLcE35TK1mhhmIaAgZGEALw_wcB",
                    'conversionName': 'claim_attempts_testing',
                    'conversionTime': '20200309 074353 UTC',
                    'conversionValue': 17.0,
                    'conversionCurrencyCode': 'AUD',
                },
                {
                    'googleClickId': "Cj0KCQiAqY3zBRDQARIsAJeCVxOIyZ8avQ0he3WIpHPwV6hRn"
                                     "-8Y2gDrUBJcc95tDdLcE35TK1mhhmIaAgZGEALw_wcB",
                    'conversionName': 'claim_attempts_testing',
                    'conversionTime': '20200309 074353 UTC',  # Duplicate with same time
                    'conversionValue': 14.0,
                    'conversionCurrencyCode': 'AUD',
                },
                {
                    'googleClickId': "Cj0KCQiAqY3zBRDQARIsAJeCVxOIyZ8avQ0he3WIpHPwV6hRn"
                                     "-8Y2gDrUBJcc95tDdLcE35TK1mhhmIaAgZGEALw_wcB",
                    'conversionName': 'claim_attempts_testing',
                    'conversionTime': '20200309 084353 UTC',  # Duplicate with diff time
                    'conversionValue': 14.0,
                    'conversionCurrencyCode': 'AUD',
                },
                {
                    'googleClickId': "EAIaIQobChMI6oiGy_vz5wIVkjUrCh3IcgAuEAAYASAAEgLRk_D_BwE",
                    'conversionName': "claim_attempts_testing",
                    'conversionTime': '20200309 023001 UTC',
                    'conversionValue': 17.0,
                    'conversionCurrencyCode': 'AUD',
                },
            ])

            actual = etl._process_data_frame(test_df)
            expected = [
                # The duplicate with the same conversion time is picked out as an issue
                {
                    'error': "Current State 'EtlStates.Processing' cannot transition to "
                             "'EtlStates.Processing'",
                    'data': {
                        'googleClickId': 'Cj0KCQiAqY3zBRDQARIsAJeCVxOIyZ8avQ0he3WIpHPwV6hRn'
                                         '-8Y2gDrUBJcc95tDdLcE35TK1mhhmIaAgZGEALw_wcB',
                        'conversionName': 'claim_attempts_testing',
                        'conversionTime': '20200309 074353 UTC',
                        'conversionValue': 14.0,
                        'conversionCurrencyCode': 'AUD'
                    }
                },
            ]

            self.assertListEqual(actual, expected)
Example #20
    def test_multiple_runs_of_same_data_and_verify_deduplication(self):
        aws_conn = AwsConnectionSettings(region="us-east-1",
                                         secrets_manager=None,
                                         profile="default")
        execfile("../../secrets.py")

        compose = DockerCompose(filepath=os.path.dirname(base.__file__))
        with compose:
            host = compose.get_service_host("cassandra", 9042)
            port = int(compose.get_service_port("cassandra", 9042))

            cassandra_conn_setting = CassandraConnectionSettings(
                cluster_ips=[host],
                port=port,
                load_balancing_policy=DCAwareRoundRobinPolicy(),
                secrets_manager=CassandraSecretsManager(
                    source=DictKeyValueSource({
                        "CASSANDRA_USERNAME": "",
                        "CASSANDRA_PASSWORD": "",
                    })),
            )

            verify_container_is_up(cassandra_conn_setting)

            settings = AthenaToAdWordsOfflineConversionSettings(
                source_database=os.getenv("dummy_athena_database"),
                source_table=os.getenv("dummy_athena_table"),
                source_connection_settings=aws_conn,
                etl_identifier="test",
                destination_batch_size=100,
                etl_state_manager_connection=cassandra_conn_setting,
                etl_state_manager_keyspace="test",
                transformation_column_mapping={
                    'google_click_id': 'googleClickId',
                    'conversion_name': 'conversionName',
                    'conversion_time': 'conversionTime',
                    'conversion_value': 'conversionValue',
                    'conversion_currency_code': 'conversionCurrencyCode'
                },
                destination_connection_settings=GoogleAdWordsConnectionSettings(
                    client_id=os.getenv("adwords_client_id"),
                    user_agent="Tester",
                    client_customer_id=os.getenv("adwords_client_customer_id"),
                    secrets_manager=GoogleAdWordsSecretsManager()),
            )
            etl = AthenaToAdWordsOfflineConversion(settings)
            source_data = [
                {
                    'google_click_id': 'theFirst',
                    'conversion_name': 'claim_attempts_testing',
                    'conversion_time': '20200309 074357 UTC',
                    'conversion_value': 17.0,
                    'conversion_currency_code': 'AUD',
                },
                {
                    'google_click_id': 'failedSecond',
                    'conversion_name': 'claim_attempts_testing',
                    'conversion_time': '20200309 074357 UTC',
                    'conversion_value': 17.0,
                    'conversion_currency_code': 'AUD',
                },
            ]
            test_df = DataFrame(source_data)
            # Mock _upload_conversions so no actual data is transmitted to AdWords
            etl._upload_conversions = MagicMock(return_value=([
                {
                    'googleClickId': 'theFirst',
                    'conversionName': 'claim_attempts_testing',
                    'conversionTime': '20200309 074357 UTC',
                    'conversionValue': 17.0,
                    'conversionCurrencyCode': 'AUD',
                },
            ], [
                {
                    'fieldPath': 'operations[0].operand',
                    'fieldPathElements': [
                        {'field': 'operations', 'index': 0},
                        {'field': 'operand', 'index': None},
                    ],
                    'trigger': None,
                    'errorString': 'OfflineConversionError.UNPARSEABLE_GCLID',
                    'ApiError.Type': 'OfflineConversionError',
                    'reason': 'UNPARSEABLE_GCLID',
                    'data': {
                        'googleClickId': 'failedSecond',
                        'conversionName': 'claim_attempts_testing',
                        'conversionTime': '20200309 074357 UTC',
                        'conversionValue': 17.0,
                        'conversionCurrencyCode': 'AUD',
                    },
                },
            ]))
            first_actual = etl._process_data_frame(test_df)
            self.assertListEqual(first_actual, [])

            # Repeat the process to cause duplicates
            actual = etl._process_data_frame(test_df)
            expected = [{
                'data': {
                    'conversionCurrencyCode': 'AUD',
                    'conversionName': 'claim_attempts_testing',
                    'conversionTime': '20200309 074357 UTC',
                    'conversionValue': 17.0,
                    'googleClickId': 'theFirst'
                },
                'error': 'Current state is not Ready'
            }, {
                'data': {
                    'conversionCurrencyCode': 'AUD',
                    'conversionName': 'claim_attempts_testing',
                    'conversionTime': '20200309 074357 UTC',
                    'conversionValue': 17.0,
                    'googleClickId': 'failedSecond'
                },
                'error': 'Current state is not Ready'
            }]

            self.assertListEqual(actual, expected)
Example #21
    def test__should__create_table__with__geo_performance_report(self):
        aws_setting = AwsConnectionSettings(
            region="ap-southeast-2",
            secrets_manager=AwsSecretsManager(
                access_key_id_var="SOME_CUSTOM_AWS_ACCESS_KEY_ID",
                secret_access_key_var="SOME_CUSTOM_AWS_SECRET_ACCESS_KEY",
                use_session_token=True,
                aws_session_token_var="SOME_CUSTOM_AWS_SESSION_TOKEN"),
            profile=None)
        target_bucket = "test-bucket"
        target_key_prefix = "something/test"

        # Load secrets via env vars
        execfile("../../secrets.py")
        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id="1111111111",
            secrets_manager=GoogleAdWordsSecretsManager())
        target_table = "test_adwords_geo_performance_report"
        etl_settings = AdWordsReportsToAthenaSettings(
            source_query=(
                ReportQueryBuilder().Select(
                    # Attributes
                    'AccountDescriptiveName',
                    'CampaignId',
                    'CityCriteriaId',
                    'CountryCriteriaId',
                    'CustomerDescriptiveName',
                    'ExternalCustomerId',
                    'IsTargetingLocation',
                    'MetroCriteriaId',
                    'MostSpecificCriteriaId',
                    'RegionCriteriaId',

                    # Segments
                    'Date',

                    # Metrics
                    'Impressions',
                    'Clicks',
                    'ConversionRate',
                    'Conversions',
                    'ConversionValue',
                    'Cost',
                    'CostPerConversion').From('GEO_PERFORMANCE_REPORT').During(
                        start_date="20200601", end_date="20200701").Build()),
            source_include_zero_impressions=False,
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_connection_settings=aws_setting,
            target_database="dev",
            target_table=target_table,
            target_table_ddl_progress=True,
            is_partitioned_table=True,
            partition_values=[("abc", "def"), ("pqr", 123)],
            target_file_prefix="data",
            transformation_field_type_mask={
                "country__territory": int,
                "region": int,
                "most_specific_location": int
            })
        etl = AdWordsReportsToAthena(etl_settings)
        etl.transfer()
        etl.create_athena_table()
        etl.add_partitions()

        au = AthenaUtil(database="dev",
                        conn=AwsConnectionManager(aws_setting),
                        output_bucket=os.environ["S3_TEST_BUCKET"])
        actual = au.run_query(query_string="""
            select * from dev.test_adwords_geo_performance_report limit 10
            """,
                              return_result=True)
        print(actual)
        expected = 11

        self.assertEqual(expected, len(actual["ResultSet"]["Rows"]))