Example #1
    def test_s3_delete_prefix(self):
        bucket = "testbucket"
        key_pattern = "path/data"
        n_keys = 3
        keys = [key_pattern + str(i) for i in range(n_keys)]

        conn = boto3.client('s3')
        conn.create_bucket(Bucket=bucket)
        for k in keys:
            conn.upload_fileobj(Bucket=bucket,
                                Key=k,
                                Fileobj=io.BytesIO(b"input"))

        # The objects should be detected before the DELETE action is taken
        objects_in_dest_bucket = conn.list_objects(Bucket=bucket,
                                                   Prefix=key_pattern)
        assert len(objects_in_dest_bucket['Contents']) == n_keys
        assert sorted(
            x['Key']
            for x in objects_in_dest_bucket['Contents']) == sorted(keys)

        op = S3DeleteObjectsOperator(task_id="test_task_s3_delete_prefix",
                                     bucket=bucket,
                                     prefix=key_pattern)
        op.execute(None)

        # There should be no object found in the bucket created earlier
        assert 'Contents' not in conn.list_objects(Bucket=bucket,
                                                   Prefix=key_pattern)
Example #2
    def test_s3_delete_single_object(self):
        bucket = "testbucket"
        key = "path/data.txt"

        conn = boto3.client('s3')
        conn.create_bucket(Bucket=bucket)
        conn.upload_fileobj(Bucket=bucket, Key=key, Fileobj=io.BytesIO(b"input"))

        # The object should be detected before the DELETE action is taken
        objects_in_dest_bucket = conn.list_objects(Bucket=bucket, Prefix=key)
        self.assertEqual(len(objects_in_dest_bucket['Contents']), 1)
        self.assertEqual(objects_in_dest_bucket['Contents'][0]['Key'], key)

        op = S3DeleteObjectsOperator(task_id="test_task_s3_delete_single_object", bucket=bucket, keys=key)
        op.execute(None)

        # There should be no object found in the bucket created earlier
        self.assertFalse('Contents' in conn.list_objects(Bucket=bucket, Prefix=key))
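The two tests above create buckets and objects through a plain boto3 client, so they are only meaningful against a mocked S3. A minimal harness sketch, assuming moto and a recent amazon provider (the class name and import paths are assumptions; older Airflow versions use different module paths):

import io
import unittest

import boto3
from moto import mock_s3  # moto >= 5 renames this decorator to mock_aws

from airflow.providers.amazon.aws.operators.s3 import S3DeleteObjectsOperator


@mock_s3  # every boto3 call inside the class hits moto's in-memory S3, not AWS
class TestS3DeleteObjectsOperator(unittest.TestCase):
    ...  # the test methods from Examples #1 and #2 go here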
Example #3
    # [START howto_operator_salesforce_to_s3_transfer]
    upload_salesforce_data_to_s3_landing = SalesforceToS3Operator(
        # task_id assumed; the call header was cut off in the original snippet
        task_id="upload_salesforce_data_to_s3_landing",
        salesforce_query="SELECT Id, Name, Company, Phone, Email, LastModifiedDate, IsActive FROM Customers",
        s3_bucket_name="landing-bucket",
        s3_key=f"{BASE_PATH}/{FILE_NAME}",
        salesforce_conn_id="salesforce",
        replace=True,
    )
    # [END howto_operator_salesforce_to_s3_transfer]

    date_prefixes = "{{ execution_date.strftime('%Y/%m/%d') }}"

    store_to_s3_data_lake = S3CopyObjectOperator(
        task_id="store_to_s3_data_lake",
        source_bucket_key=upload_salesforce_data_to_s3_landing.output["s3_uri"],
        dest_bucket_name="data_lake",
        dest_bucket_key=f"{BASE_PATH}/{date_prefixes}/{FILE_NAME}",
    )

    delete_data_from_s3_landing = S3DeleteObjectsOperator(
        task_id="delete_data_from_s3_landing",
        bucket=upload_salesforce_data_to_s3_landing.output["s3_bucket_name"],
        keys=upload_salesforce_data_to_s3_landing.output["s3_key"],
    )

    store_to_s3_data_lake >> delete_data_from_s3_landing

    # Task dependencies created via `XComArgs`:
    #   upload_salesforce_data_to_s3_landing >> store_to_s3_data_lake
    #   upload_salesforce_data_to_s3_landing >> delete_data_from_s3_landing
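Example #3 references BASE_PATH and FILE_NAME and sits inside a DAG context that was trimmed from the snippet. A minimal sketch of that scaffolding, with assumed constant values, dag_id, dates, and import paths (module paths vary across amazon provider versions):

from datetime import datetime

from airflow import DAG
from airflow.providers.amazon.aws.operators.s3 import (
    S3CopyObjectOperator,
    S3DeleteObjectsOperator,
)
from airflow.providers.amazon.aws.transfers.salesforce_to_s3 import SalesforceToS3Operator

BASE_PATH = "salesforce/customers"        # assumed value
FILE_NAME = "customer_daily_extract.csv"  # assumed value

with DAG(
    dag_id="example_salesforce_to_s3",  # assumed
    schedule_interval=None,             # assumed
    start_date=datetime(2021, 1, 1),    # assumed
    catchup=False,
) as dag:
    ...  # the three tasks shown above go here

Because store_to_s3_data_lake and delete_data_from_s3_landing both consume upload_salesforce_data_to_s3_landing.output, Airflow wires those upstream dependencies automatically; only the copy-before-delete ordering has to be declared explicitly with >>.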
Example #4
default_args = {
    # other default_args entries were cut off in this excerpt
    'retry_delay': timedelta(minutes=10)
}

with DAG('minio-fifa-spark-operator',
         default_args=default_args,
         schedule_interval='@daily',
         tags=['development', 's3', 'minio', 'spark-operator']) as dag:

    etl_fifa_spark_operator = SparkKubernetesOperator(
        task_id='etl_fifa_spark_operator',
        namespace='processing',
        application_file='etl-fifa.yaml',
        kubernetes_conn_id='minikube',
        do_xcom_push=True)

    monitor_spark_app_status = SparkKubernetesSensor(
        task_id='monitor_spark_app_status',
        namespace="processing",
        application_name="{{ task_instance.xcom_pull(task_ids='etl_fifa_spark_operator')['metadata']['name'] }}",
        kubernetes_conn_id="minikube")

    delete_s3_file_raw_zone = S3DeleteObjectsOperator(
        task_id='delete_s3_file_raw_zone',
        bucket=RAW_ZONE,
        keys='data.csv',
        aws_conn_id='minio',
        do_xcom_push=True)

    etl_fifa_spark_operator >> monitor_spark_app_status >> delete_s3_file_raw_zone
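As with the previous example, the imports were trimmed from Example #4's snippet. A sketch of what the file would need at the top, assuming current cncf.kubernetes and amazon provider module paths and a placeholder bucket name:

from datetime import timedelta

from airflow import DAG
from airflow.providers.amazon.aws.operators.s3 import S3DeleteObjectsOperator
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor

RAW_ZONE = 'raw-zone'  # assumed; the original defines RAW_ZONE outside the snippet

Note that the @daily schedule only produces runs once a start_date is set, presumably among the default_args entries cut from the excerpt.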