def test__list_source_files__should_list_the_correct_source_files_as_an_array(self):
    test_bucket = "test"
    test_source_key_prefix = "test_for_s3_to_cassandra"
    aws_conn_settings = AwsConnectionSettings(
        region="ap-southeast-2",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    s3 = S3Util(conn=AwsConnectionManager(aws_conn_settings), bucket=test_bucket)
    s3.create_bucket()
    for itr in range(3):
        s3.upload_file(self.sample_file_location, f"{test_source_key_prefix}/{itr}.abc")
    expected = [
        f"{test_source_key_prefix}/0.abc",
        f"{test_source_key_prefix}/1.abc",
        f"{test_source_key_prefix}/2.abc",
    ]
    util = S3ToCassandra(S3ToCassandraSettings(
        source_bucket=test_bucket,
        source_key_prefix=test_source_key_prefix,
        source_connection_settings=aws_conn_settings,
        destination_keyspace="test",
        destination_table="test",
        destination_table_primary_keys=[],
        destination_table_options_statement="",
        destination_batch_size=2,
        destination_connection_settings=Mock(),
    ))
    actual = util.list_source_files()
    self.assertListEqual(actual, expected)
def test__aws_secrets_manager_should_instantiate_with_sensible_defaults(self):
    os.environ["AWS_ACCESS_KEY_ID"] = "abc"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "def"
    actual = AwsSecretsManager()
    self.assertEqual(actual.aws_access_key_id, "abc")
    self.assertEqual(actual.aws_secret_access_key, "def")
def test_the_etl_should_download_first_file_correctly(self):
    test_bucket = "SOME_OTHER_BUCKET"
    test_key = "some/key"
    aws_conn = AwsConnectionSettings(
        region="ap-southeast-2",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    s3_util = S3Util(conn=AwsConnectionManager(aws_conn), bucket=test_bucket)
    df1 = pd.DataFrame(
        dict(A=range(10)),
        index=pd.date_range('20130101', periods=10, freq='d'))
    df2 = pd.DataFrame(
        dict(A=range(10, 20)),
        index=pd.date_range('20130111', periods=10, freq='d'))
    s3_util.create_bucket()
    s3_util.upload_dataframe_as_parquet(df1, test_key, "df1")
    s3_util.upload_dataframe_as_parquet(df2, test_key, "df2")
    settings = S3ToDataFrameSettings(
        source_bucket=test_bucket,
        source_key_prefix=test_key,
        source_connection_settings=aws_conn)
    etl = S3ToDataFrame(settings)
    expected = pd.DataFrame(
        dict(A=range(20)),
        index=pd.date_range('20130101', periods=20, freq='d'))
    assert_frame_equal(expected, etl.get_all_files_as_data_frame())
def test_the_etl_should_list_files_correctly(self):
    test_bucket = "TEST_BUCKET_ITR"
    test_key = "some/key"
    aws_conn = AwsConnectionSettings(
        region="ap-southeast-2",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    s3_util = S3Util(conn=AwsConnectionManager(aws_conn), bucket=test_bucket)
    df1 = pd.DataFrame(
        dict(A=range(10000)),
        index=pd.date_range('20130101', periods=10000, freq='s'))
    df2 = pd.DataFrame(
        dict(A=range(10000)),
        index=pd.date_range('20140101', periods=10000, freq='s'))
    df3 = pd.DataFrame(
        dict(A=range(10000)),
        index=pd.date_range('20150101', periods=10000, freq='s'))
    s3_util.create_bucket()
    s3_util.upload_dataframe_as_parquet(df1, test_key, "df1")
    s3_util.upload_dataframe_as_parquet(df2, test_key, "df2")
    s3_util.upload_dataframe_as_parquet(df3, test_key, "df3")
    settings = S3ToDataFrameSettings(
        source_bucket=test_bucket,
        source_key_prefix=test_key,
        source_connection_settings=aws_conn)
    etl = S3ToDataFrame(settings)
    assert_frame_equal(df1, etl.next())
    assert_frame_equal(df2, etl.next())
    assert_frame_equal(df3, etl.next())
def test_should__copy_file_from_one_bucket_to_another__when_valid_locations_are_given(self):
    dest_bucket = "TEST_BUCKET_DEST"
    conn = AwsConnectionManager(
        AwsConnectionSettings(
            region="ap-southeast-2",
            secrets_manager=AwsSecretsManager(),
            profile=None))
    s3_util_for_destination = S3Util(conn=conn, bucket=dest_bucket)
    s3_util_for_source = self.s3
    s3_util_for_source.create_bucket()
    s3_util_for_destination.create_bucket()
    tmp_file_path = "/tmp/testfile.txt"
    dirname = os.path.dirname(tmp_file_path)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(tmp_file_path, "w+") as file:
        file.write("Test file content")
    s3_util_for_source.upload_file(tmp_file_path, "test/testfile.txt")
    s3_util_for_source.move_recursive_to_different_bucket(
        source_key_prefix="test/",
        destination_bucket_name=dest_bucket,
        destination_key_prefix="{}/test_copy/".format(dest_bucket))
    actual = s3_util_for_destination.read_lines_as_list("test_copy")[0]
    expected = "Test file content"
    self.assertEqual(actual, expected)
def setUpClass(cls):
    cls.bucket = "TEST_BUCKET"
    conn = AwsConnectionManager(
        AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None))
    cls.s3 = S3Util(conn=conn, bucket=cls.bucket)
def create_batch_s3_uploader(batch_s3_uploader_config=None):
    """
    Factory method to generate a Kafka batch uploader
    Args:
        batch_s3_uploader_config (BatchS3UploaderConfig): Configuration object for the s3 uploader
    Returns (KafkaS3BatchExporter): Instantiated Exporter
    """
    batch_s3_uploader_config = batch_s3_uploader_config \
        if batch_s3_uploader_config is not None else BatchS3UploaderConfig()
    aws_region = os.environ.get("AWS_DEFAULT_REGION")
    conn = AwsConnectionManager(
        AwsConnectionSettings(
            region=aws_region,
            secrets_manager=AwsSecretsManager(),
            profile=None))
    s3_client = S3Util(conn, batch_s3_uploader_config.bucket)
    return KafkaS3BatchExporter(
        batch_s3_uploader_config.root_path,
        s3_client,
        batch_s3_uploader_config.ts_col_nm,
        batch_s3_uploader_config.partition_key_nm)
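# Usage sketch for the factory above (illustrative, not from the original
# source): it assumes AWS_DEFAULT_REGION and the standard AWS credential
# environment variables are set, and that the default BatchS3UploaderConfig
# is valid in that environment.
#
#     exporter = create_batch_s3_uploader()
#     custom_exporter = create_batch_s3_uploader(BatchS3UploaderConfig())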
def test__should__get_correct_estimations__with__etl_get_parallel_payloads(self):
    aws_setting = AwsConnectionSettings(
        region="us-east-1",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    target_bucket = os.getenv('S3_TEST_BUCKET')
    target_key_prefix = "something/test"
    # Load secrets via env vars
    execfile("../../secrets.py")
    adwords_settings = GoogleAdWordsConnectionSettings(
        client_id=os.getenv("adwords_client_id"),
        user_agent="Tester",
        client_customer_id=os.getenv("adwords_client_customer_id"),
        secrets_manager=GoogleAdWordsSecretsManager())
    target_table = "test_adwords_to_athena_table_creation"
    etl_settings = AdWordsToAthenaSettings(
        source_query_fragment=ServiceQueryBuilder().Select('Id').OrderBy('Id'),
        source_service="AdGroupAdService",
        source_service_version="v201809",
        source_connection_settings=adwords_settings,
        target_bucket=target_bucket,
        target_key_prefix=target_key_prefix,
        target_connection_settings=aws_setting,
        target_database="dev",
        target_table=target_table,
        target_table_ddl_progress=True,
        is_partitioned_table=True,
        partition_values=[("abc", "def"), ("pqr", 123)])
    etl = AdWordsToAthena(etl_settings)
    actual_payloads = etl.get_parallel_payloads(page_size=1000, number_of_workers=3)
    expected_payloads = [
        {'number_of_pages': 393, 'page_size': 1000, 'start_index': 0, 'worker': 0},
        {'number_of_pages': 393, 'page_size': 1000, 'start_index': 393000, 'worker': 1},
        {'number_of_pages': 393, 'page_size': 1000, 'start_index': 786000, 'worker': 2},
    ]
    self.assertListEqual(expected_payloads, actual_payloads)
    etl.create_athena_table()
    conn = AwsConnectionManager(aws_setting)
    au = AthenaUtil("dev", conn)
    actual = au.get_glue_table_metadata(target_table)
    print(actual)
def test__should__create_s3_file_for_the_given_indices(self):
    # Load secrets via env vars
    execfile("../../secrets.py")
    aws_setting = AwsConnectionSettings(
        region="us-east-1",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    target_bucket = os.getenv('S3_TEST_BUCKET')
    target_key_prefix = "something/test"
    conn = AwsConnectionManager(aws_setting)
    s3u = S3Util(conn=conn, bucket=target_bucket)
    s3u.delete_recursive(target_key_prefix)
    adwords_settings = GoogleAdWordsConnectionSettings(
        client_id=os.getenv("adwords_client_id"),
        user_agent="Tester",
        client_customer_id=os.getenv("adwords_client_customer_id"),
        secrets_manager=GoogleAdWordsSecretsManager())
    adword_to_s3_util = AdWordsToS3(settings=AdWordsToS3Settings(
        source_query_fragment=ServiceQueryBuilder().Select(
            # Attributes
            'BaseAdGroupId',
            'Id',
            'CampaignId',
            'CampaignName',
            'Name',
            'Status',
            'ContentBidCriterionTypeGroup',
            'BaseCampaignId',
            'TrackingUrlTemplate',
            'FinalUrlSuffix',
            'UrlCustomParameters',
            'AdGroupType').OrderBy('Id'),
        source_service="AdGroupService",
        source_service_version="v201809",
        source_connection_settings=adwords_settings,
        target_bucket=target_bucket,
        target_key_prefix=target_key_prefix,
        target_file_prefix=None,
        target_connection_settings=aws_setting))
    adword_to_s3_util.build_query(start_index=35000, page_size=1000, num_iterations=1)
    adword_to_s3_util.transfer_all()
    actual = s3u.get_keys(target_key_prefix)
    expected = [
        'tmp/test/hip_data_tools/adwords_to_s3/test/index_35000__35999.parquet'
    ]
    self.assertListEqual(expected, actual)
def test__should__get_correct_estimations__with__etl_get_parallel_payloads(self):
    # Load secrets via env vars
    execfile("../../secrets.py")
    aws_setting = AwsConnectionSettings(
        region="us-east-1",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    target_bucket = os.getenv('S3_TEST_BUCKET')
    target_key_prefix = "something/test"
    adwords_settings = GoogleAdWordsConnectionSettings(
        client_id=os.getenv("adwords_client_id"),
        user_agent="Tester",
        client_customer_id=os.getenv("adwords_client_customer_id"),
        secrets_manager=GoogleAdWordsSecretsManager())
    etl_settings = AdWordsToS3Settings(
        source_query_fragment=ServiceQueryBuilder().Select('Id').OrderBy('Id'),
        source_service="AdGroupAdService",
        source_service_version="v201809",
        source_connection_settings=adwords_settings,
        target_bucket=target_bucket,
        target_key_prefix=target_key_prefix,
        target_file_prefix=None,
        target_connection_settings=aws_setting)
    etl = AdWordsToS3(etl_settings)
    actual_payloads = etl.get_parallel_payloads(page_size=1000, number_of_workers=3)
    expected_payloads = [
        {'worker': 0, 'start_index': 0, 'number_of_pages': 435, 'page_size': 1000},
        {'worker': 1, 'start_index': 435000, 'number_of_pages': 435, 'page_size': 1000},
        {'worker': 2, 'start_index': 870000, 'number_of_pages': 435, 'page_size': 1000},
    ]
    self.assertListEqual(expected_payloads, actual_payloads)
def test__should__create_table__with__a_general_report(self):
    aws_setting = AwsConnectionSettings(
        region="us-east-1",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    target_bucket = os.getenv('S3_TEST_BUCKET')
    target_key_prefix = "something/test"
    # Load secrets via env vars
    execfile("../../secrets.py")
    adwords_settings = GoogleAdWordsConnectionSettings(
        client_id=os.getenv("adwords_client_id"),
        user_agent="Tester",
        client_customer_id=os.getenv("adwords_client_customer_id"),
        secrets_manager=GoogleAdWordsSecretsManager())
    target_table = "test_adwords_negative_report"
    etl_settings = AdWordsReportsToAthenaSettings(
        source_query=(ReportQueryBuilder()
                      .Select(
                          'AccountDescriptiveName',
                          'CampaignId',
                          'CampaignName',
                          'CampaignStatus',
                          'Id',
                          'KeywordMatchType',
                          'Criteria')
                      .From('CAMPAIGN_NEGATIVE_KEYWORDS_PERFORMANCE_REPORT')
                      .Build()),
        source_include_zero_impressions=True,
        source_connection_settings=adwords_settings,
        target_bucket=target_bucket,
        target_key_prefix=target_key_prefix,
        target_connection_settings=aws_setting,
        target_database="dev",
        target_table=target_table,
        target_table_ddl_progress=True,
        is_partitioned_table=True,
        partition_values=[("abc", "def"), ("pqr", 123)],
        target_file_prefix="data",
        transformation_field_type_mask=None)
    etl = AdWordsReportsToAthena(etl_settings)
    etl.transfer()
    etl.create_athena_table()
    etl.add_partitions()
    au = AthenaUtil(
        database="dev",
        conn=AwsConnectionManager(aws_setting),
        output_bucket=os.environ["S3_TEST_BUCKET"])
    actual = au.run_query(
        query_string="""
            select * from dev.test_adwords_negative_report limit 10
        """,
        return_result=True)
    print(actual)
    expected = 11
    self.assertEqual(expected, len(actual["ResultSet"]["Rows"]))
def transfer_files():
    """
    Transfers files from one bucket to another
    Returns: None
    """
    # These AWS settings assume that you have your AWS access keys in the standard env vars
    aws_setting = AwsConnectionSettings(
        region="ap-southeast-2",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    # If you want to use the AWS profiles stored by the aws cli tool, uncomment the following code:
    # aws_setting = AwsConnectionSettings(
    #     region="us-east-1",
    #     secrets_manager=None,
    #     profile="default")
    # Define the ETL instance
    etl = S3ToS3FileCopy(
        source=S3SourceSettings(
            bucket="my_source_bucket",
            key_prefix="source/prefix",
            suffix=None,
            connection_settings=aws_setting,
        ),
        sink=S3SinkSettings(
            bucket="my_target_bucket",
            connection_settings=aws_setting,
        ),
        transformers=[
            AddTargetS3KeyTransformer(target_key_prefix="target/prefix")
        ],
    )
    # Check the files that will be transferred
    files = etl.list_source_files()
    print(files)
    # If you want to transfer files in a loop
    while etl.has_next():
        etl.execute_next()
    # Reset the source state
    etl.reset_source()
    # If you want to transfer all files sequentially
    etl.execute_all()
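# Usage sketch (an assumption, not part of the original example): the function
# above takes no arguments, so once the buckets named in its settings exist and
# AWS credentials are exported as environment variables it can be run directly:
#
#     if __name__ == "__main__":
#         transfer_files()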
def test__transfer_file__should_work(self):
    aws_setting = AwsConnectionSettings(
        region="us-east-1",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    source_bucket = "TEST_SOURCE_BUCKET"
    target_bucket = "TEST_TARGET_BUCKET"
    conn = AwsConnectionManager(aws_setting)
    s3_util_for_destination = S3Util(conn=conn, bucket=target_bucket)
    s3_util_for_source = S3Util(conn=conn, bucket=source_bucket)
    s3_util_for_source.create_bucket()
    s3_util_for_destination.create_bucket()
    file = NamedTemporaryFile("w+", delete=False)
    file.write("Test file content")
    s3_util_for_source.upload_file(file.name, "source/prefix/test_file.txt")
    file2 = NamedTemporaryFile("w+", delete=False)
    file2.write("Test file content")
    s3_util_for_source.upload_file(file2.name, "source/prefix/txt_file.parquet")
    etl = S3ToS3FileCopy(
        source=s3.S3SourceSettings(
            bucket=source_bucket,
            key_prefix="source/prefix",
            suffix=".txt",
            connection_settings=aws_setting,
        ),
        sink=s3.S3SinkSettings(
            bucket=target_bucket,
            connection_settings=aws_setting,
        ),
        transformers=[
            AddTargetS3KeyTransformer(target_key_prefix="target/prefix")
        ],
    )
    etl.execute_next()
    actual = s3_util_for_destination.get_keys("")
    expected_destination_keys = ['target/prefix/test_file.txt']
    self.assertListEqual(expected_destination_keys, actual)
def test__should__transfer_correct_amount_of_files__with__one_parallel_fragment(self):
    # Load secrets via env vars
    execfile("../../secrets.py")
    aws_setting = AwsConnectionSettings(
        region="us-east-1",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    target_bucket = os.getenv('S3_TEST_BUCKET')
    target_key_prefix = "tmp/test/hip_data_tools/adwords_to_s3/test"
    conn = AwsConnectionManager(aws_setting)
    s3u = S3Util(conn=conn, bucket=target_bucket)
    s3u.delete_recursive(target_key_prefix)
    adwords_settings = GoogleAdWordsConnectionSettings(
        client_id=os.getenv("adwords_client_id"),
        user_agent="Tester",
        client_customer_id=os.getenv("adwords_client_customer_id"),
        secrets_manager=GoogleAdWordsSecretsManager())
    etl_settings = AdWordsToS3Settings(
        source_query_fragment=ServiceQueryBuilder().Select('Id').OrderBy('Id'),
        source_service="AdGroupAdService",
        source_service_version="v201809",
        source_connection_settings=adwords_settings,
        target_bucket=target_bucket,
        target_key_prefix=target_key_prefix,
        target_file_prefix=None,
        target_connection_settings=aws_setting)
    etl = AdWordsToS3(etl_settings)
    etl.build_query(start_index=0, page_size=5, num_iterations=2)
    etl.transfer_all()
    actual = s3u.get_keys(target_key_prefix)
    print(actual)
    expected = [
        'tmp/test/hip_data_tools/adwords_to_s3/test/index_0__4.parquet',
        'tmp/test/hip_data_tools/adwords_to_s3/test/index_5__9.parquet'
    ]
    self.assertListEqual(expected, actual)
def test__transfer_all_files__should_work_without_transformers(self):
    aws_setting = AwsConnectionSettings(
        region="us-east-1",
        secrets_manager=AwsSecretsManager(),
        profile=None)
    source_bucket = "TEST_SOURCE_BUCKET"
    target_bucket = "TEST_TARGET_BUCKET"
    conn = AwsConnectionManager(aws_setting)
    s3_util_for_destination = S3Util(conn=conn, bucket=target_bucket)
    s3_util_for_source = S3Util(conn=conn, bucket=source_bucket)
    s3_util_for_source.create_bucket()
    s3_util_for_destination.create_bucket()
    for itr in range(10):
        file = NamedTemporaryFile("w+", delete=False)
        file.write("Test file content")
        s3_util_for_source.upload_file(
            file.name, f"source/prefix/txt_file{itr}.parquet")
    etl = S3ToS3FileCopy(
        source=s3.S3SourceSettings(
            bucket=source_bucket,
            key_prefix="source/prefix",
            suffix=None,
            connection_settings=aws_setting,
        ),
        sink=s3.S3SinkSettings(
            bucket=target_bucket,
            connection_settings=aws_setting,
        ),
    )
    etl.execute_all()
    actual = s3_util_for_destination.get_keys("")
    expected_destination_keys = [
        f'source/prefix/txt_file{itr}.parquet' for itr in range(10)
    ]
    self.assertListEqual(expected_destination_keys, actual)
def func():
    AwsSecretsManager(access_key_id_var="SOMEUNKNOWNVAR")
def test__should__create_table__with__geo_performance_report(self):
    aws_setting = AwsConnectionSettings(
        region="ap-southeast-2",
        secrets_manager=AwsSecretsManager(
            access_key_id_var="SOME_CUSTOM_AWS_ACCESS_KEY_ID",
            secret_access_key_var="SOME_CUSTOM_AWS_SECRET_ACCESS_KEY",
            use_session_token=True,
            aws_session_token_var="SOME_CUSTOM_AWS_SESSION_TOKEN"),
        profile=None)
    target_bucket = "test-bucket"
    target_key_prefix = "something/test"
    # Load secrets via env vars
    execfile("../../secrets.py")
    adwords_settings = GoogleAdWordsConnectionSettings(
        client_id=os.getenv("adwords_client_id"),
        user_agent="Tester",
        client_customer_id="1111111111",
        secrets_manager=GoogleAdWordsSecretsManager())
    target_table = "test_adwords_geo_performance_report"
    etl_settings = AdWordsReportsToAthenaSettings(
        source_query=(ReportQueryBuilder()
                      .Select(
                          # Attributes
                          'AccountDescriptiveName',
                          'CampaignId',
                          'CityCriteriaId',
                          'CountryCriteriaId',
                          'CustomerDescriptiveName',
                          'ExternalCustomerId',
                          'IsTargetingLocation',
                          'MetroCriteriaId',
                          'MostSpecificCriteriaId',
                          'RegionCriteriaId',
                          # Segments
                          'Date',
                          # Metrics
                          'Impressions',
                          'Clicks',
                          'ConversionRate',
                          'Conversions',
                          'ConversionValue',
                          'Cost',
                          'CostPerConversion')
                      .From('GEO_PERFORMANCE_REPORT')
                      .During(start_date="20200601", end_date="20200701")
                      .Build()),
        source_include_zero_impressions=False,
        source_connection_settings=adwords_settings,
        target_bucket=target_bucket,
        target_key_prefix=target_key_prefix,
        target_connection_settings=aws_setting,
        target_database="dev",
        target_table=target_table,
        target_table_ddl_progress=True,
        is_partitioned_table=True,
        partition_values=[("abc", "def"), ("pqr", 123)],
        target_file_prefix="data",
        transformation_field_type_mask={
            "country__territory": np.int,
            "region": np.int,
            "most_specific_location": np.int
        })
    etl = AdWordsReportsToAthena(etl_settings)
    etl.transfer()
    etl.create_athena_table()
    etl.add_partitions()
    au = AthenaUtil(
        database="dev",
        conn=AwsConnectionManager(aws_setting),
        output_bucket=os.environ["S3_TEST_BUCKET"])
    actual = au.run_query(
        query_string="""
            select * from dev.test_adwords_geo_performance_report limit 10
        """,
        return_result=True)
    print(actual)
    expected = 11
    self.assertEqual(expected, len(actual["ResultSet"]["Rows"]))