Example #1
    def setUp(self):
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
        dag = DAG('test_dag_id', default_args=args)
        self.dag = dag
        self.sql = 'SELECT 1'
        self.hook = AwsDynamoDBHook(
            aws_conn_id='aws_default', region_name='us-east-1')
    def execute(self, context):
        self.log.info("Going to start S3 to Dynamo operator")
        dynamo_hook = AwsDynamoDBHook(
            aws_conn_id=self.dynamodb_conn_id,
            table_keys=self.table_keys,
            table_name=self.table_name,
            region_name=self.region_name,
        )
        s3_hook = S3Hook(self.s3_conn_id)

        self.log.info("Downloading s3 file")
        source_obj = s3_hook.get_key(self.s3_key, self.s3_bucket)
        with NamedTemporaryFile() as source_file:
            with open(source_file.name, "wb") as opened_source_file:
                source_obj.download_fileobj(opened_source_file)

            self.log.info("Writing file to Dynamo")
            with open(source_file.name, "r") as opened_source_file:
                reader = opened_source_file.readlines()
                json_data = [
                    self.clean_empty(json.loads(x, parse_float=Decimal))
                    for x in reader
                ]
                dynamo_hook.write_batch_data(json_data)

        self.log.info("Finished S3 to Dynamo operator")
        return True
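The operator above calls self.clean_empty before writing, but that helper is not part of the snippet. A minimal sketch of such a method, assuming its job is to strip empty values (which DynamoDB rejects) from each decoded record:

    def clean_empty(self, value):
        # Hypothetical helper (not shown in the original snippet): recursively
        # drop empty strings and empty containers so write_batch_data does not
        # send attribute values DynamoDB refuses to store.
        if isinstance(value, dict):
            return {k: self.clean_empty(v) for k, v in value.items()
                    if v not in ("", None, [], {})}
        if isinstance(value, list):
            return [self.clean_empty(v) for v in value
                    if v not in ("", None, [], {})]
        return value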
Example #3
def persist_data(**kwargs):
    hook = AwsDynamoDBHook(
        table_name="TABLE_NAME",  #TABLE_NAME
        aws_conn_id='aws_default')
    faceIndexDetails = kwargs['ti'].xcom_pull(key='FaceIndexDetails')
    thumbnailDetails = kwargs['ti'].xcom_pull(key='ThumbnailDetails')
    conf = kwargs['dag_run'].conf
    dynamoItem = {
        "UserId": conf["userId"],
        "s3Bucket": conf["s3Bucket"],
        "s3Key": conf["s3Key"],
        "faceId": faceIndexDetails['FaceId'],
        "thumbnail": thumbnailDetails['thumbnail']
    }
    items = [dynamoItem]
    hook.write_batch_data(items)
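A hedged usage sketch for wiring persist_data into a DAG (the task_id and the dag object are illustrative, not taken from the snippet): because the function reads kwargs['ti'] and kwargs['dag_run'], pre-2.0 Airflow needs provide_context=True.

from airflow.operators.python_operator import PythonOperator

persist_data_task = PythonOperator(
    task_id='persist_data',        # illustrative task id
    python_callable=persist_data,
    provide_context=True,          # injects ti, dag_run, etc. into **kwargs
    dag=dag,                       # assumes a DAG object named `dag` exists
)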
    def setUp(self):
        configuration.load_test_config()
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
        dag = DAG('test_dag_id', default_args=args)
        self.dag = dag
        self.sql = 'SELECT 1'
        self.hook = AwsDynamoDBHook(
            aws_conn_id='aws_default', region_name='us-east-1')
    def execute(self, context):
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)

        logging.info('Extracting data from Hive')
        logging.info(self.sql)

        data = hive.get_pandas_df(self.sql, schema=self.schema)
        dynamodb = AwsDynamoDBHook(aws_conn_id=self.aws_conn_id,
                                   table_name=self.table_name, table_keys=self.table_keys, region_name=self.region_name)

        logging.info('Inserting rows into dynamodb')

        if self.pre_process is None:
            dynamodb.write_batch_data(
                json.loads(data.to_json(orient='records')))
        else:
            dynamodb.write_batch_data(
                self.pre_process(data=data, args=self.pre_process_args, kwargs=self.pre_process_kwargs))

        logging.info('Done.')
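The pre_process hook is invoked as pre_process(data=..., args=..., kwargs=...); a minimal sketch of such a callable, assuming the goal is to turn the DataFrame into DynamoDB-safe records (floats re-parsed as Decimal):

from decimal import Decimal
import json


def to_decimal_records(data, args=None, kwargs=None):
    # Illustrative pre_process callable (not part of the original snippet):
    # DynamoDB rejects Python floats, so round-trip the DataFrame through
    # JSON and parse the numbers back as Decimal before the batch write.
    return json.loads(data.to_json(orient='records'), parse_float=Decimal)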
    def execute(self, context):
        self.log.info("Going to start Xcom DynamoDB Key Operator")
        dynamo_hook = AwsDynamoDBHook(
            aws_conn_id=self.dynamodb_conn_id,
            table_keys=self.table_keys,
            table_name=self.table_name,
            region_name=self.region_name,
        )
        condition = Key(self.partition_key_condition[0]).eq(
            self.partition_key_condition[1])
        if self.sort_key_condition:
            condition = condition & self.sort_key_condition

        conn = dynamo_hook.get_conn()
        table = conn.Table(self.table_name)
        response = table.query(KeyConditionExpression=condition)
        if response["Items"]:
            self.log.info("Got response, validating")
            return [x[self.return_key] for x in response["Items"]]
        else:
            self.log.info("Responde didn't return results")
            return False
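For reference, the conditions this operator expects could be built like this (the key names and values below are illustrative assumptions, not taken from the snippet):

from boto3.dynamodb.conditions import Key

# partition_key_condition is a (key_name, value) pair that the operator turns
# into Key(name).eq(value); sort_key_condition is an optional pre-built boto3
# condition that gets AND-ed onto it.
partition_key_condition = ('UserId', 'user-123')          # illustrative
sort_key_condition = Key('CreatedAt').gt('2020-01-01')    # illustrative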
Example #7
    def execute(self, context):
        table = AwsDynamoDBHook().get_conn().Table(self.dynamodb_table_name)
        scan_kwargs = copy(self.dynamodb_scan_kwargs) if self.dynamodb_scan_kwargs else {}
        err = None
        f = NamedTemporaryFile()
        try:
            f = self._scan_dynamodb_and_upload_to_s3(f, scan_kwargs, table)
        except Exception as e:
            err = e
            raise e
        finally:
            if err is None:
                _upload_file_to_s3(f, self.s3_bucket_name, self.s3_key_prefix)
            f.close()
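This snippet relies on self._scan_dynamodb_and_upload_to_s3, which is not shown. A minimal sketch, assuming its job is simply to paginate the scan and write items to the temp file (the real operator also rotates flushed files to S3 by size, which this sketch omits):

    def _scan_dynamodb_and_upload_to_s3(self, temp_file, scan_kwargs, table):
        # Paginate table.scan() via LastEvaluatedKey and append each item as
        # one JSON line to the temporary file; the S3 upload itself happens
        # in execute() above.
        import json
        while True:
            response = table.scan(**scan_kwargs)
            for item in response['Items']:
                temp_file.write(json.dumps(item, default=str).encode('utf-8'))
                temp_file.write(b'\n')
            last_key = response.get('LastEvaluatedKey')
            if last_key is None:
                break
            scan_kwargs['ExclusiveStartKey'] = last_key
        return temp_file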
Example #8
    def execute(self, context):
        s3 = S3Hook(aws_conn_id=self.aws_conn_id)

        dynamodb = AwsDynamoDBHook(aws_conn_id=self.aws_conn_id,
                                   table_name=self.table_name,
                                   table_keys=self.table_keys,
                                   region_name=self.region_name)

        if not s3.check_for_key(self.s3_key):
            raise AirflowException("The source key {0} does not exist".format(
                self.s3_key))

        s3_key_object = s3.get_key(self.s3_key)
        s3_key_json = json.loads(
            s3_key_object.get()['Body'].read().decode('utf-8'))
        json_list = s3_key_json[self.json_key]

        json_list = self._convert_float_to_decimal(json_list)

        logging.info('Inserting rows into dynamodb table %s', self.table_name)
        dynamodb.write_batch_data(json_list)
        logging.info('Finished inserting %d rows into dynamodb table %s',
                     len(json_list), self.table_name)
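The _convert_float_to_decimal helper used above is not included in the snippet; a minimal sketch, assuming it just re-parses the records with Decimal numbers (boto3's DynamoDB resource refuses plain floats):

    def _convert_float_to_decimal(self, records):
        # Hypothetical helper (not shown in the original): serialize the
        # records back to JSON and re-parse them so every float becomes a
        # Decimal before write_batch_data is called.
        from decimal import Decimal
        return json.loads(json.dumps(records), parse_float=Decimal)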
    def test_insert_batch_items_dynamodb_table(self):

        hook = AwsDynamoDBHook(aws_conn_id='aws_default',
                               table_name='test_airflow', table_keys=['id'], region_name='us-east-1')

        # this table needs to be created in production
        table = hook.get_conn().create_table(
            TableName='test_airflow',
            KeySchema=[
                {
                    'AttributeName': 'id',
                    'KeyType': 'HASH'
                },
            ],
            AttributeDefinitions=[
                {
                    'AttributeName': 'id',
                    'AttributeType': 'S'
                }
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            }
        )

        table = hook.get_conn().Table('test_airflow')

        items = [{'id': str(uuid.uuid4()), 'name': 'airflow'}
                 for _ in range(10)]

        hook.write_batch_data(items)

        table.meta.client.get_waiter(
            'table_exists').wait(TableName='test_airflow')
        self.assertEqual(table.item_count, 10)
    def test_insert_batch_items_dynamodb_table(self):

        hook = AwsDynamoDBHook(aws_conn_id='aws_default',
                               table_name='test_airflow',
                               table_keys=['id'],
                               region_name='us-east-1')

        # this table needs to be created in production
        table = hook.get_conn().create_table(TableName='test_airflow',
                                             KeySchema=[
                                                 {
                                                     'AttributeName': 'id',
                                                     'KeyType': 'HASH'
                                                 },
                                             ],
                                             AttributeDefinitions=[{
                                                 'AttributeName': 'id',
                                                 'AttributeType': 'S'
                                             }],
                                             ProvisionedThroughput={
                                                 'ReadCapacityUnits': 10,
                                                 'WriteCapacityUnits': 10
                                             })

        table = hook.get_conn().Table('test_airflow')

        items = [{
            'id': str(uuid.uuid4()),
            'name': 'airflow'
        } for _ in range(10)]

        hook.write_batch_data(items)

        table.meta.client.get_waiter('table_exists').wait(
            TableName='test_airflow')
        self.assertEqual(table.item_count, 10)
Example #11
    def execute(self, context):
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)

        self.log.info('Extracting data from Hive')
        self.log.info(self.sql)

        data = hive.get_pandas_df(self.sql, schema=self.schema)
        dynamodb = AwsDynamoDBHook(aws_conn_id=self.aws_conn_id,
                                   table_name=self.table_name,
                                   table_keys=self.table_keys,
                                   region_name=self.region_name)

        self.log.info('Inserting rows into dynamodb')

        if self.pre_process is None:
            dynamodb.write_batch_data(
                json.loads(data.to_json(orient='records')))
        else:
            dynamodb.write_batch_data(
                self.pre_process(data=data,
                                 args=self.pre_process_args,
                                 kwargs=self.pre_process_kwargs))

        self.log.info('Done.')
    def test_get_conn_returns_a_boto3_connection(self):
        hook = AwsDynamoDBHook(aws_conn_id='aws_default')
        self.assertIsNotNone(hook.get_conn())
Example #13
    def test_get_conn_returns_a_boto3_connection(self):
        hook = AwsDynamoDBHook(aws_conn_id='aws_default')
        self.assertIsNotNone(hook.get_conn())
Example #14
class TestHiveToDynamoDBTransferOperator(unittest.TestCase):

    def setUp(self):
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
        dag = DAG('test_dag_id', default_args=args)
        self.dag = dag
        self.sql = 'SELECT 1'
        self.hook = AwsDynamoDBHook(
            aws_conn_id='aws_default', region_name='us-east-1')

    @staticmethod
    def process_data(data, *args, **kwargs):
        return json.loads(data.to_json(orient='records'))

    @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present')
    @mock_dynamodb2
    def test_get_conn_returns_a_boto3_connection(self):
        hook = AwsDynamoDBHook(aws_conn_id='aws_default')
        self.assertIsNotNone(hook.get_conn())

    @mock.patch('airflow.hooks.hive_hooks.HiveServer2Hook.get_pandas_df',
                return_value=pd.DataFrame(data=[('1', 'sid')], columns=['id', 'name']))
    @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present')
    @mock_dynamodb2
    def test_get_records_with_schema(self, mock_get_pandas_df):
        # this table needs to be created in production
        self.hook.get_conn().create_table(
            TableName='test_airflow',
            KeySchema=[
                {
                    'AttributeName': 'id',
                    'KeyType': 'HASH'
                },
            ],
            AttributeDefinitions=[
                {
                    'AttributeName': 'id',
                    'AttributeType': 'S'
                }
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            }
        )

        operator = airflow.contrib.operators.hive_to_dynamodb.HiveToDynamoDBTransferOperator(
            sql=self.sql,
            table_name="test_airflow",
            task_id='hive_to_dynamodb_check',
            table_keys=['id'],
            dag=self.dag)

        operator.execute(None)

        table = self.hook.get_conn().Table('test_airflow')
        table.meta.client.get_waiter(
            'table_exists').wait(TableName='test_airflow')
        self.assertEqual(table.item_count, 1)

    @mock.patch('airflow.hooks.hive_hooks.HiveServer2Hook.get_pandas_df',
                return_value=pd.DataFrame(data=[('1', 'sid'), ('1', 'gupta')], columns=['id', 'name']))
    @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present')
    @mock_dynamodb2
    def test_pre_process_records_with_schema(self, mock_get_pandas_df):
        # this table needs to be created in production
        self.hook.get_conn().create_table(
            TableName='test_airflow',
            KeySchema=[
                {
                    'AttributeName': 'id',
                    'KeyType': 'HASH'
                },
            ],
            AttributeDefinitions=[
                {
                    'AttributeName': 'id',
                    'AttributeType': 'S'
                }
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            }
        )

        operator = airflow.contrib.operators.hive_to_dynamodb.HiveToDynamoDBTransferOperator(
            sql=self.sql,
            table_name='test_airflow',
            task_id='hive_to_dynamodb_check',
            table_keys=['id'],
            pre_process=self.process_data,
            dag=self.dag)

        operator.execute(None)

        table = self.hook.get_conn().Table('test_airflow')
        table.meta.client.get_waiter('table_exists').wait(TableName='test_airflow')
        self.assertEqual(table.item_count, 1)
class HiveToDynamoDBTransferOperatorTest(unittest.TestCase):

    def setUp(self):
        configuration.load_test_config()
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
        dag = DAG('test_dag_id', default_args=args)
        self.dag = dag
        self.sql = 'SELECT 1'
        self.hook = AwsDynamoDBHook(
            aws_conn_id='aws_default', region_name='us-east-1')

    def process_data(self, data, *args, **kwargs):
        return json.loads(data.to_json(orient='records'))

    @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present')
    @mock_dynamodb2
    def test_get_conn_returns_a_boto3_connection(self):
        hook = AwsDynamoDBHook(aws_conn_id='aws_default')
        self.assertIsNotNone(hook.get_conn())

    @mock.patch('airflow.hooks.hive_hooks.HiveServer2Hook.get_pandas_df',
                return_value=pd.DataFrame(data=[('1', 'sid')], columns=['id', 'name']))
    @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present')
    @mock_dynamodb2
    def test_get_records_with_schema(self, get_results_mock):

        # this table needs to be created in production
        table = self.hook.get_conn().create_table(
            TableName='test_airflow',
            KeySchema=[
                {
                    'AttributeName': 'id',
                    'KeyType': 'HASH'
                },
            ],
            AttributeDefinitions=[
                {
                    'AttributeName': 'id',
                    'AttributeType': 'S'
                }
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            }
        )

        operator = airflow.contrib.operators.hive_to_dynamodb.HiveToDynamoDBTransferOperator(
            sql=self.sql,
            table_name="test_airflow",
            task_id='hive_to_dynamodb_check',
            table_keys=['id'],
            dag=self.dag)

        operator.execute(None)

        table = self.hook.get_conn().Table('test_airflow')
        table.meta.client.get_waiter(
            'table_exists').wait(TableName='test_airflow')
        self.assertEqual(table.item_count, 1)

    @mock.patch('airflow.hooks.hive_hooks.HiveServer2Hook.get_pandas_df',
                return_value=pd.DataFrame(data=[('1', 'sid'), ('1', 'gupta')], columns=['id', 'name']))
    @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present')
    @mock_dynamodb2
    def test_pre_process_records_with_schema(self, get_results_mock):

        # this table needs to be created in production
        table = self.hook.get_conn().create_table(
            TableName='test_airflow',
            KeySchema=[
                {
                    'AttributeName': 'id',
                    'KeyType': 'HASH'
                },
            ],
            AttributeDefinitions=[
                {
                    'AttributeName': 'id',
                    'AttributeType': 'S'
                }
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            }
        )

        operator = airflow.contrib.operators.hive_to_dynamodb.HiveToDynamoDBTransferOperator(
            sql=self.sql,
            table_name='test_airflow',
            task_id='hive_to_dynamodb_check',
            table_keys=['id'],
            pre_process=self.process_data,
            dag=self.dag)

        operator.execute(None)

        table = self.hook.get_conn().Table('test_airflow')
        table.meta.client.get_waiter(
            'table_exists').wait(TableName='test_airflow')
        self.assertEqual(table.item_count, 1)