def setUp(self): args = {'owner': 'airflow', 'start_date': DEFAULT_DATE} dag = DAG('test_dag_id', default_args=args) self.dag = dag self.sql = 'SELECT 1' self.hook = AwsDynamoDBHook( aws_conn_id='aws_default', region_name='us-east-1')
def execute(self, context): self.log.info("Going to start S3 to Dynamo operator") dynamo_hook = AwsDynamoDBHook( aws_conn_id=self.dynamodb_conn_id, table_keys=self.table_keys, table_name=self.table_name, region_name=self.region_name, ) s3_hook = S3Hook(self.s3_conn_id) self.log.info("Downloading s3 file") source_obj = s3_hook.get_key(self.s3_key, self.s3_bucket) with NamedTemporaryFile() as source_file: with open(source_file.name, "wb") as opened_source_file: source_obj.download_fileobj(opened_source_file) self.log.info("Writing file to Dynamo") with open(source_file.name, "r") as opened_source_file: reader = opened_source_file.readlines() json_data = [ self.clean_empty(json.loads(x, parse_float=Decimal)) for x in reader ] dynamo_hook.write_batch_data(json_data) self.log.info("Finished S3 to Dynamo operator") return True
def persist_data(**kwargs): hook = AwsDynamoDBHook( table_name="TABLE_NAME", #TABLE_NAME aws_conn_id='aws_default') faceIndexDetails = kwargs['ti'].xcom_pull(key='FaceIndexDetails') thumbnailDetails = kwargs['ti'].xcom_pull(key='ThumbnailDetails') conf = kwargs['dag_run'].conf dynamoItem = { "UserId": conf["userId"], "s3Bucket": conf["s3Bucket"], "s3Key": conf["s3Key"], "faceId": faceIndexDetails['FaceId'], "thumbnail": thumbnailDetails['thumbnail'] } items = [dynamoItem] hook.write_batch_data(items)
def setUp(self): configuration.load_test_config() args = {'owner': 'airflow', 'start_date': DEFAULT_DATE} dag = DAG('test_dag_id', default_args=args) self.dag = dag self.sql = 'SELECT 1' self.hook = AwsDynamoDBHook( aws_conn_id='aws_default', region_name='us-east-1')
def execute(self, context): hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id) logging.info('Extracting data from Hive') logging.info(self.sql) data = hive.get_pandas_df(self.sql, schema=self.schema) dynamodb = AwsDynamoDBHook(aws_conn_id=self.aws_conn_id, table_name=self.table_name, table_keys=self.table_keys, region_name=self.region_name) logging.info('Inserting rows into dynamodb') if self.pre_process is None: dynamodb.write_batch_data( json.loads(data.to_json(orient='records'))) else: dynamodb.write_batch_data( self.pre_process(data=data, args=self.pre_process_args, kwargs=self.pre_process_kwargs)) logging.info('Done.')
def execute(self, context): self.log.info("Going to start Xcom DynamoDB Key Operator") dynamo_hook = AwsDynamoDBHook( aws_conn_id=self.dynamodb_conn_id, table_keys=self.table_keys, table_name=self.table_name, region_name=self.region_name, ) condition = Key(self.partition_key_condition[0]).eq( self.partition_key_condition[1]) if self.sort_key_condition: condition = condition & self.sort_key_condition conn = dynamo_hook.get_conn() table = conn.Table(self.table_name) response = table.query(KeyConditionExpression=condition) if response["Items"]: self.log.info("Got response, validating") return [x[self.return_key] for x in response["Items"]] else: self.log.info("Responde didn't return results") return False
def execute(self, context): table = AwsDynamoDBHook().get_conn().Table(self.dynamodb_table_name) scan_kwargs = copy(self.dynamodb_scan_kwargs) if self.dynamodb_scan_kwargs else {} err = None f = NamedTemporaryFile() try: f = self._scan_dynamodb_and_upload_to_s3(f, scan_kwargs, table) except Exception as e: err = e raise e finally: if err is None: _upload_file_to_s3(f, self.s3_bucket_name, self.s3_key_prefix) f.close()
def execute(self, context): s3 = S3Hook(aws_conn_id=self.aws_conn_id) dynamodb = AwsDynamoDBHook(aws_conn_id=self.aws_conn_id, table_name=self.table_name, table_keys=self.table_keys, region_name=self.region_name) if not s3.check_for_key(self.s3_key): raise AirflowException("The source key {0} does not exist".format( self.s3_key)) s3_key_object = s3.get_key(self.s3_key) s3_key_json = json.loads( s3_key_object.get()['Body'].read().decode('utf-8')) json_list = s3_key_json[self.json_key] json_list = self._convert_float_to_decimal(json_list) logging.info('Inserting rows into dynamodb table %s', self.table_name) dynamodb.write_batch_data(json_list) logging.info('Finished inserting %d rows into dynamodb table %s', len(json_list), self.table_name)
def test_insert_batch_items_dynamodb_table(self): hook = AwsDynamoDBHook(aws_conn_id='aws_default', table_name='test_airflow', table_keys=['id'], region_name='us-east-1') # this table needs to be created in production table = hook.get_conn().create_table( TableName='test_airflow', KeySchema=[ { 'AttributeName': 'id', 'KeyType': 'HASH' }, ], AttributeDefinitions=[ { 'AttributeName': 'name', 'AttributeType': 'S' } ], ProvisionedThroughput={ 'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10 } ) table = hook.get_conn().Table('test_airflow') items = [{'id': str(uuid.uuid4()), 'name': 'airflow'} for _ in range(10)] hook.write_batch_data(items) table.meta.client.get_waiter( 'table_exists').wait(TableName='test_airflow') self.assertEqual(table.item_count, 10)
def test_insert_batch_items_dynamodb_table(self): hook = AwsDynamoDBHook(aws_conn_id='aws_default', table_name='test_airflow', table_keys=['id'], region_name='us-east-1') # this table needs to be created in production table = hook.get_conn().create_table(TableName='test_airflow', KeySchema=[ { 'AttributeName': 'id', 'KeyType': 'HASH' }, ], AttributeDefinitions=[{ 'AttributeName': 'name', 'AttributeType': 'S' }], ProvisionedThroughput={ 'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10 }) table = hook.get_conn().Table('test_airflow') items = [{ 'id': str(uuid.uuid4()), 'name': 'airflow' } for _ in range(10)] hook.write_batch_data(items) table.meta.client.get_waiter('table_exists').wait( TableName='test_airflow') self.assertEqual(table.item_count, 10)
def execute(self, context): hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id) self.log.info('Extracting data from Hive') self.log.info(self.sql) data = hive.get_pandas_df(self.sql, schema=self.schema) dynamodb = AwsDynamoDBHook(aws_conn_id=self.aws_conn_id, table_name=self.table_name, table_keys=self.table_keys, region_name=self.region_name) self.log.info('Inserting rows into dynamodb') if self.pre_process is None: dynamodb.write_batch_data( json.loads(data.to_json(orient='records'))) else: dynamodb.write_batch_data( self.pre_process(data=data, args=self.pre_process_args, kwargs=self.pre_process_kwargs)) self.log.info('Done.')
def test_get_conn_returns_a_boto3_connection(self): hook = AwsDynamoDBHook(aws_conn_id='aws_default') self.assertIsNotNone(hook.get_conn())
def test_get_conn_returns_a_boto3_connection(self): hook = AwsDynamoDBHook(aws_conn_id='aws_default') self.assertIsNotNone(hook.get_conn())
class TestHiveToDynamoDBTransferOperator(unittest.TestCase): def setUp(self): args = {'owner': 'airflow', 'start_date': DEFAULT_DATE} dag = DAG('test_dag_id', default_args=args) self.dag = dag self.sql = 'SELECT 1' self.hook = AwsDynamoDBHook( aws_conn_id='aws_default', region_name='us-east-1') @staticmethod def process_data(data, *args, **kwargs): return json.loads(data.to_json(orient='records')) @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present') @mock_dynamodb2 def test_get_conn_returns_a_boto3_connection(self): hook = AwsDynamoDBHook(aws_conn_id='aws_default') self.assertIsNotNone(hook.get_conn()) @mock.patch('airflow.hooks.hive_hooks.HiveServer2Hook.get_pandas_df', return_value=pd.DataFrame(data=[('1', 'sid')], columns=['id', 'name'])) @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present') @mock_dynamodb2 def test_get_records_with_schema(self, mock_get_pandas_df): # this table needs to be created in production self.hook.get_conn().create_table( TableName='test_airflow', KeySchema=[ { 'AttributeName': 'id', 'KeyType': 'HASH' }, ], AttributeDefinitions=[ { 'AttributeName': 'id', 'AttributeType': 'S' } ], ProvisionedThroughput={ 'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10 } ) operator = airflow.contrib.operators.hive_to_dynamodb.HiveToDynamoDBTransferOperator( sql=self.sql, table_name="test_airflow", task_id='hive_to_dynamodb_check', table_keys=['id'], dag=self.dag) operator.execute(None) table = self.hook.get_conn().Table('test_airflow') table.meta.client.get_waiter( 'table_exists').wait(TableName='test_airflow') self.assertEqual(table.item_count, 1) @mock.patch('airflow.hooks.hive_hooks.HiveServer2Hook.get_pandas_df', return_value=pd.DataFrame(data=[('1', 'sid'), ('1', 'gupta')], columns=['id', 'name'])) @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present') @mock_dynamodb2 def test_pre_process_records_with_schema(self, mock_get_pandas_df): # this table needs to be created in production self.hook.get_conn().create_table( TableName='test_airflow', KeySchema=[ { 'AttributeName': 'id', 'KeyType': 'HASH' }, ], AttributeDefinitions=[ { 'AttributeName': 'id', 'AttributeType': 'S' } ], ProvisionedThroughput={ 'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10 } ) operator = airflow.contrib.operators.hive_to_dynamodb.HiveToDynamoDBTransferOperator( sql=self.sql, table_name='test_airflow', task_id='hive_to_dynamodb_check', table_keys=['id'], pre_process=self.process_data, dag=self.dag) operator.execute(None) table = self.hook.get_conn().Table('test_airflow') table.meta.client.get_waiter('table_exists').wait(TableName='test_airflow') self.assertEqual(table.item_count, 1)
class HiveToDynamoDBTransferOperatorTest(unittest.TestCase): def setUp(self): configuration.load_test_config() args = {'owner': 'airflow', 'start_date': DEFAULT_DATE} dag = DAG('test_dag_id', default_args=args) self.dag = dag self.sql = 'SELECT 1' self.hook = AwsDynamoDBHook( aws_conn_id='aws_default', region_name='us-east-1') def process_data(self, data, *args, **kwargs): return json.loads(data.to_json(orient='records')) @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present') @mock_dynamodb2 def test_get_conn_returns_a_boto3_connection(self): hook = AwsDynamoDBHook(aws_conn_id='aws_default') self.assertIsNotNone(hook.get_conn()) @mock.patch('airflow.hooks.hive_hooks.HiveServer2Hook.get_pandas_df', return_value=pd.DataFrame(data=[('1', 'sid')], columns=['id', 'name'])) @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present') @mock_dynamodb2 def test_get_records_with_schema(self, get_results_mock): # this table needs to be created in production table = self.hook.get_conn().create_table( TableName='test_airflow', KeySchema=[ { 'AttributeName': 'id', 'KeyType': 'HASH' }, ], AttributeDefinitions=[ { 'AttributeName': 'name', 'AttributeType': 'S' } ], ProvisionedThroughput={ 'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10 } ) operator = airflow.contrib.operators.hive_to_dynamodb.HiveToDynamoDBTransferOperator( sql=self.sql, table_name="test_airflow", task_id='hive_to_dynamodb_check', table_keys=['id'], dag=self.dag) operator.execute(None) table = self.hook.get_conn().Table('test_airflow') table.meta.client.get_waiter( 'table_exists').wait(TableName='test_airflow') self.assertEqual(table.item_count, 1) @mock.patch('airflow.hooks.hive_hooks.HiveServer2Hook.get_pandas_df', return_value=pd.DataFrame(data=[('1', 'sid'), ('1', 'gupta')], columns=['id', 'name'])) @unittest.skipIf(mock_dynamodb2 is None, 'mock_dynamodb2 package not present') @mock_dynamodb2 def test_pre_process_records_with_schema(self, get_results_mock): # this table needs to be created in production table = self.hook.get_conn().create_table( TableName='test_airflow', KeySchema=[ { 'AttributeName': 'id', 'KeyType': 'HASH' }, ], AttributeDefinitions=[ { 'AttributeName': 'name', 'AttributeType': 'S' } ], ProvisionedThroughput={ 'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10 } ) operator = airflow.contrib.operators.hive_to_dynamodb.HiveToDynamoDBTransferOperator( sql=self.sql, table_name='test_airflow', task_id='hive_to_dynamodb_check', table_keys=['id'], pre_process=self.process_data, dag=self.dag) operator.execute(None) table = self.hook.get_conn().Table('test_airflow') table.meta.client.get_waiter( 'table_exists').wait(TableName='test_airflow') self.assertEqual(table.item_count, 1)