    def test_s3_delete_multiple_objects(self):
        bucket = "testbucket"
        key_pattern = "path/data"
        n_keys = 3
        keys = [key_pattern + str(i) for i in range(n_keys)]

        conn = boto3.client('s3')
        conn.create_bucket(Bucket=bucket)
        for k in keys:
            conn.upload_fileobj(Bucket=bucket,
                                Key=k,
                                Fileobj=io.BytesIO(b"input"))

        # The objects should be detected before the DELETE action is taken
        objects_in_dest_bucket = conn.list_objects(Bucket=bucket,
                                                   Prefix=key_pattern)
        self.assertEqual(len(objects_in_dest_bucket['Contents']), n_keys)
        self.assertEqual(
            sorted([x['Key'] for x in objects_in_dest_bucket['Contents']]),
            sorted(keys))

        t = S3DeleteObjectsOperator(
            task_id="test_task_s3_delete_multiple_objects",
            bucket=bucket,
            keys=keys)
        t.execute(None)

        # There should be no object found in the bucket created earlier
        self.assertFalse(
            'Contents' in conn.list_objects(Bucket=bucket, Prefix=key_pattern))

    def test_s3_delete_single_object(self):
        bucket = "testbucket"
        key = "path/data.txt"

        conn = boto3.client('s3')
        conn.create_bucket(Bucket=bucket)
        conn.upload_fileobj(Bucket=bucket,
                            Key=key,
                            Fileobj=io.BytesIO(b"input"))

        # The object should be detected before the DELETE action is taken
        objects_in_dest_bucket = conn.list_objects(Bucket=bucket, Prefix=key)
        self.assertEqual(len(objects_in_dest_bucket['Contents']), 1)
        self.assertEqual(objects_in_dest_bucket['Contents'][0]['Key'], key)

        t = S3DeleteObjectsOperator(
            task_id="test_task_s3_delete_single_object",
            bucket=bucket,
            keys=key)
        t.execute(None)

        # There should be no object found in the bucket created earlier
        self.assertFalse(
            'Contents' in conn.list_objects(Bucket=bucket, Prefix=key))
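
The two test methods above create buckets and objects with boto3 directly, so they assume S3 is being faked; a minimal sketch of a surrounding test class, assuming moto's mock_s3 decorator (class name and import paths are illustrative and vary by Airflow version), could look like this:

# Minimal sketch of a surrounding test harness, assuming moto fakes S3.
# The import path for S3DeleteObjectsOperator varies by Airflow version
# (e.g. airflow.providers.amazon.aws.operators.s3 in recent providers).
import io
import unittest

import boto3
from moto import mock_s3

from airflow.providers.amazon.aws.operators.s3 import S3DeleteObjectsOperator


@mock_s3
class TestS3DeleteObjectsOperator(unittest.TestCase):
    # the test_s3_delete_multiple_objects and test_s3_delete_single_object
    # methods shown above would be defined here
    pass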
        '0 12 8-14,22-28 * 6',  # https://airflow.apache.org/docs/stable/scheduler.html#dag-runs
        default_args=default_args,
        catchup=False  # set to False so historical (backfill) DAG runs are not created
) as dag:

    t0 = DummyOperator(task_id='start')

    for i in range(0, 10):  # generates 10 generate/copy/delete task chains
        generate_files = PythonOperator(
            task_id='generate_file_{0}_{1}'.format(
                name, i),  # note the task id is dynamic
            python_callable=upload_to_s3,
            op_kwargs={'file_name': i})

        copy_files = S3CopyObjectOperator(
            task_id='copy_{0}_file_{1}'.format(name, i),
            source_bucket_key='globetelecom/{0}_file_{1}_testfile_exist.txt'.format(name, i),
            dest_bucket_key='globetelecom/copy_{0}_file_{1}.txt'.format(name, i),
            source_bucket_name=BUCKET,
            dest_bucket_name=BUCKET,
            aws_conn_id=S3_CONN_ID)

        delete_files = S3DeleteObjectsOperator(
            task_id='delete_{0}_file_{1}'.format(name, i),
            keys='globetelecom/copy_{0}_file_{1}.txt'.format(name, i),  # delete the copy made above
            bucket=BUCKET,
            aws_conn_id=S3_CONN_ID)

        t0 >> generate_files >> copy_files >> delete_files  # Make sure this is indented inside the scope of the loop
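
The upload_to_s3 callable passed to PythonOperator above is not part of this snippet; a hypothetical sketch using Airflow's S3Hook, reusing the same BUCKET, S3_CONN_ID and name globals, might look like this (the key layout mirrors what copy_files expects):

# Hypothetical upload_to_s3 callable; the real implementation is not shown in
# this example. The S3Hook import path varies by Airflow version.
from airflow.providers.amazon.aws.hooks.s3 import S3Hook


def upload_to_s3(file_name, **kwargs):
    # write a small placeholder object so the copy and delete tasks have input
    hook = S3Hook(aws_conn_id=S3_CONN_ID)
    hook.load_string(
        string_data='generated file {0}'.format(file_name),
        key='globetelecom/{0}_file_{1}_testfile_exist.txt'.format(name, file_name),
        bucket_name=BUCKET,
        replace=True)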
Example #4
file_sensor = S3KeySensor(
    task_id='recap_cntrl_file_sensor',
    poke_interval=60,  # (seconds); checking file every 60 seconds
    timeout=60 * 60 * 18,  # timeout in 18 hours
    bucket_key="s3://vivek-mathew/recap-cntrl.txt",
    bucket_name=None,
    wildcard_match=False,
    dag=dag)

terminate_cluster = EmrTerminateJobFlowOperator(
    task_id="terminate_cluster",
    job_flow_id="{{ task_instance.xcom_pull('create_cluster', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag)

recap_file_delete = S3DeleteObjectsOperator(task_id="delete_recap_cntrl_file",
                                            bucket="vivek-mathew",
                                            keys="recap-cntrl.txt",
                                            dag=dag)

job_flow_file_delete = S3DeleteObjectsOperator(
    task_id="delete_job_flow_file_delete",
    bucket="vivek-mathew",
    keys="job-flow.txt",
    dag=dag)

create_cluster >> create_job_flow_variable >> create_job_flow_file >> file_sensor \
    >> terminate_cluster >> recap_file_delete >> job_flow_file_delete
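
The create_cluster task that terminate_cluster pulls its job flow id from is not shown here; a hypothetical sketch using EmrCreateJobFlowOperator (JOB_FLOW_OVERRIDES is an assumed dict of EMR cluster settings, and the import path varies by Airflow version) might be:

# Hypothetical create_cluster task; the real definition is not shown in this
# example. EmrCreateJobFlowOperator pushes the new job flow id to XCom as its
# return value, which is what terminate_cluster pulls above.
from airflow.providers.amazon.aws.operators.emr_create_job_flow import EmrCreateJobFlowOperator

create_cluster = EmrCreateJobFlowOperator(
    task_id='create_cluster',
    job_flow_overrides=JOB_FLOW_OVERRIDES,  # assumed dict of EMR cluster settings
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag)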
Example #5
    #         "location","favorite_count","retweet_count","sentiment","topic"
    #         ],
    #     region_name='us-east-1',
    #     s3_key='s3://j17devbucket/analyzed_tweet_data_' + timestamp + '.json',
    #     json_key='tweets'
    # )

    sentiment_results_to_dynamoDB = S3ToDynamoDBOperator(
        task_id='write_sentiment_to_dynamoDB',
        description='Writes sentiment results to dynamoDB',
        table_name='sentiment-results',
        table_keys=[
            "topic", "timestamp", "maxNegText", "maxPosText", "sentiment"
        ],
        region_name='us-east-1',
        s3_key='s3://j17devbucket/sentiment_results_' + timestamp + '.json',
        json_key='results')

    clean_up = S3DeleteObjectsOperator(
        task_id='clean_up_s3',
        description='Clean up files on s3',
        bucket='j17devbucket',
        keys=[
            'tweet_data.' + timestamp, 'cleaned_tweet_data.' + timestamp,
            'analyzed_tweet_data_' + timestamp + '.json',
            'sentiment_results_' + timestamp + '.json'
        ],
    )

    tweets_to_s3 >> etl_tweets >> get_sentiment >> summarize_sentiment >> sentiment_results_to_dynamoDB >> clean_up
    #tweets_to_s3 >> etl_tweets >> get_sentiment >> summarize_sentiment >> [tweets_to_dynamoDB,sentiment_results_to_dynamoDB] >> clean_up
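
Across these examples, keys is passed to S3DeleteObjectsOperator both as a single string and as a list of strings; the operator accepts either form. A minimal standalone sketch (bucket name, keys and connection id are placeholders):

# Standalone usage sketch; bucket, keys and aws_conn_id are placeholders, and
# the import path varies by Airflow/provider version.
from airflow.providers.amazon.aws.operators.s3 import S3DeleteObjectsOperator

delete_one = S3DeleteObjectsOperator(
    task_id='delete_one_object',
    bucket='my-bucket',
    keys='path/data.txt',  # a single key as a string
    aws_conn_id='aws_default')

delete_many = S3DeleteObjectsOperator(
    task_id='delete_many_objects',
    bucket='my-bucket',
    keys=['path/data0', 'path/data1'],  # or a list of keys
    aws_conn_id='aws_default')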