Exemplo n.º 1
0
    def test_get_partitions(self, mock_get_conn):
        # Stub the chain get_conn() -> client -> paginator so that paginating
        # yields a single page holding one partition.
        pages = [{'Partitions': [{'Values': ['2015-01-01']}]}]
        paginator = mock.Mock()
        paginator.paginate.return_value = pages
        client = mock.Mock()
        client.get_paginator.return_value = paginator
        mock_get_conn.return_value = client

        glue_hook = AwsGlueCatalogHook(region_name="us-east-1")
        partitions = glue_hook.get_partitions(
            'db', 'tbl', expression='foo=bar', page_size=2, max_items=3)

        # Partition values come back as a set of tuples.
        self.assertEqual(partitions, {('2015-01-01',)})
        client.get_paginator.assert_called_once_with('get_partitions')
        # The paging arguments must be forwarded inside PaginationConfig.
        expected_pagination = {'PageSize': 2, 'MaxItems': 3}
        paginator.paginate.assert_called_once_with(
            DatabaseName='db',
            TableName='tbl',
            Expression='foo=bar',
            PaginationConfig=expected_pagination)
    def test_get_partitions(self, mock_get_conn):
        # Stub the chain get_conn() -> client -> paginator so that paginating
        # yields a single page holding one partition.
        pages = [{'Partitions': [{'Values': ['2015-01-01']}]}]
        paginator = mock.Mock()
        paginator.paginate.return_value = pages
        client = mock.Mock()
        client.get_paginator.return_value = paginator
        mock_get_conn.return_value = client

        glue_hook = AwsGlueCatalogHook(region_name="us-east-1")
        partitions = glue_hook.get_partitions(
            'db', 'tbl', expression='foo=bar', page_size=2, max_items=3)

        # Partition values come back as a set of tuples.
        self.assertEqual(partitions, {('2015-01-01',)})
        client.get_paginator.assert_called_once_with('get_partitions')
        # The paging arguments must be forwarded inside PaginationConfig.
        expected_pagination = {'PageSize': 2, 'MaxItems': 3}
        paginator.paginate.assert_called_once_with(
            DatabaseName='db',
            TableName='tbl',
            Expression='foo=bar',
            PaginationConfig=expected_pagination)
Exemplo n.º 3
0
    def test_check_for_partition(self, mock_get_partitions):
        # A non-empty result from get_partitions must make the check truthy.
        mock_get_partitions.return_value = {('2018-01-01',)}
        glue_hook = AwsGlueCatalogHook(region_name="us-east-1")

        found = glue_hook.check_for_partition('db', 'tbl', 'expr')

        self.assertTrue(found)
        # The lookup is expected to be capped at a single item.
        mock_get_partitions.assert_called_once_with(
            'db', 'tbl', 'expr', max_items=1)
Exemplo n.º 4
0
def aws_crawler_function(ds, **kwargs):
    """
    Start the AWS Glue crawler named by the ``crawler_name`` keyword argument.

    Intended to be called from the python operator, so nothing here runs until
    the airflow scheduler executes the task.

    :param ds: accepted as the first argument but not used by this function.
    :param kwargs: must contain ``crawler_name``; a missing key raises KeyError.
    :return: whatever the Glue client's ``start_crawler`` call returns.
    """
    name = kwargs['crawler_name']
    print('crawler_name is ', name)
    # Build the hook and fetch its client in a single step.
    glue_client = AwsGlueCatalogHook(
        aws_conn_id='aws_default', region_name='us-east-1').get_conn()
    response = glue_client.start_crawler(Name=name)
    return response
Exemplo n.º 5
0
    def __init__(self, *args, partner: str, **kwargs):
        """Set up the sensor for one partner's raw events Glue table.

        Parameters
        ----------
        partner
            The partner; stored as the Glue table name to check.
        args, kwargs
            Standard Airflow sensor arguments. ``poke_interval`` and
            ``timeout`` may be passed to override the defaults used here.
        """
        # Merge the defaults with the caller's kwargs instead of hard-coding
        # them as keywords: previously, passing ``poke_interval`` or
        # ``timeout`` raised "got multiple values for keyword argument".
        # ``kwargs`` itself is left untouched for ``PMIDAG.get_dag`` below.
        sensor_kwargs = {
            'poke_interval': 3 * 60,
            'timeout': SourcePartitionsSensor.TIMEOUT_HOURS * 3600,
            **kwargs,
        }
        super().__init__(*args, **sensor_kwargs)

        dag = PMIDAG.get_dag(**kwargs)

        self._database = dag.env_config['raw_events_glue_db_name']
        self._table = partner

        self._glue_hook = AwsGlueCatalogHook(aws_conn_id=dag.aws_conn_id)
Exemplo n.º 6
0
    def get_hook(self):
        """Build a Glue catalog hook from this object's connection settings.

        Returns:
            AwsGlueCatalogHook: A new hook created with ``self.aws_conn_id``
            and ``self.region_name``.
        """
        glue_hook = AwsGlueCatalogHook(
            aws_conn_id=self.aws_conn_id,
            region_name=self.region_name,
        )
        return glue_hook
Exemplo n.º 7
0
    def get_hook(self):
        """
        Return the AwsGlueCatalogHook, creating and caching it on first use
        """
        # Reuse the hook stored by an earlier call, if there is one.
        if hasattr(self, 'hook'):
            return self.hook

        # The import is kept local so it only runs when a hook must be built.
        from airflow.contrib.hooks.aws_glue_catalog_hook import AwsGlueCatalogHook
        self.hook = AwsGlueCatalogHook(
            aws_conn_id=self.aws_conn_id,
            region_name=self.region_name,
        )
        return self.hook
Exemplo n.º 8
0
class SourcePartitionsSensor(BaseSensorOperator):
    """Sensor implementation for source partitions availability.

    The sensor makes sure that data for all 24 hours of the given data date are
    available in the raw events data source. It does it by checking the presence
    of the corresponding partitions in the partner's raw events Glue table.

    Parameters
    ----------
    partner
        The partner; also used as the name of the Glue table to check.
    args, kwargs
        Standard Airflow sensor arguments. ``poke_interval`` and ``timeout``
        may be passed to override the defaults set by this class.

    """

    ui_color = '#C5CAE9'

    # Default sensor timeout, in hours (multiplied by 3600 in __init__).
    TIMEOUT_HOURS = 3

    @apply_defaults
    def __init__(self, *args, partner: str, **kwargs):
        # Merge the defaults with the caller's kwargs instead of hard-coding
        # them as keywords: previously, passing ``poke_interval`` or
        # ``timeout`` raised "got multiple values for keyword argument".
        # ``kwargs`` itself is left untouched for ``PMIDAG.get_dag`` below.
        sensor_kwargs = {
            'poke_interval': 3 * 60,
            'timeout': SourcePartitionsSensor.TIMEOUT_HOURS * 3600,
            **kwargs,
        }
        super().__init__(*args, **sensor_kwargs)

        dag = PMIDAG.get_dag(**kwargs)

        self._database = dag.env_config['raw_events_glue_db_name']
        self._table = partner

        self._glue_hook = AwsGlueCatalogHook(aws_conn_id=dag.aws_conn_id)

    def poke(self, context: Mapping[str, Any]) -> bool:
        """Check whether all 24 hourly partitions of the data date exist.

        Parameters
        ----------
        context
            Task context; only ``context['ds']`` is read and it is parsed as
            a dash-separated ``year-month-day`` string.

        Returns
        -------
        bool
            ``True`` when the Glue query returns exactly 24 partitions.
        """

        # get partitions matching the data date
        # NOTE(review): if PMIDAG.DATA_DATE_TZ is a pytz timezone, passing it
        # through ``tzinfo=`` may not give the intended UTC offset -- confirm.
        utcdate_base = datetime(*(int(p) for p in context['ds'].split('-')),
                                tzinfo=PMIDAG.DATA_DATE_TZ).astimezone(
                                    timezone.utc)
        # Map each UTC date touched by the data date to its hours, in order.
        partitions: Dict[str, List[str]] = {}
        for hour in range(24):
            utcdatehour = utcdate_base + timedelta(hours=hour)
            utcdate = utcdatehour.strftime('%Y%m%d')
            utchour = utcdatehour.strftime('%H')
            partitions.setdefault(utcdate, []).append(utchour)

        # build test expression
        # Hours within one UTC date are consecutive, so the first and last
        # entries bound the whole range for that date.
        expression = ' OR '.join((
            f"(utcdate = '{utcdate}' AND utchour BETWEEN '{utchours[0]}' AND '{utchours[-1]}')"
        ) for utcdate, utchours in partitions.items())

        # query partitions
        self.log.info("Poking for table %s.%s, expression %s", self._database,
                      self._table, expression)
        partition_descs = self._glue_hook.get_partitions(
            database_name=self._database,
            table_name=self._table,
            expression=expression,
            max_items=24)

        # check if all 24 hours available
        return len(partition_descs) == 24
Exemplo n.º 9
0
 def test_region(self):
     # The region given to the constructor must be exposed as region_name.
     glue_hook = AwsGlueCatalogHook(region_name="us-west-2")
     actual_region = glue_hook.region_name
     self.assertEqual(actual_region, 'us-west-2')
Exemplo n.º 10
0
    def test_get_partitions_empty(self, mock_get_conn):
        # The hook reaches the paginator through get_conn(), so the stub has
        # to hang off mock_get_conn.return_value. Setting it directly on
        # mock_get_conn (as before) configured an attribute the hook never
        # touches, leaving the empty response unused.
        mock_conn = mock_get_conn.return_value
        mock_conn.get_paginator.return_value.paginate.return_value = []
        hook = AwsGlueCatalogHook(region_name="us-east-1")

        self.assertEqual(hook.get_partitions('db', 'tbl'), set())
        # Prove that the stubbed paginator was the one actually requested.
        mock_conn.get_paginator.assert_called_once_with('get_partitions')
Exemplo n.º 11
0
 def test_get_conn_returns_a_boto3_connection(self):
     # get_conn() must hand back something other than None.
     glue_hook = AwsGlueCatalogHook(region_name="us-east-1")
     connection = glue_hook.get_conn()
     self.assertIsNotNone(connection)
Exemplo n.º 12
0
 def test_conn_id(self):
     # The connection id given to the constructor must be kept on the hook.
     glue_hook = AwsGlueCatalogHook(
         aws_conn_id='my_aws_conn_id', region_name="us-east-1")
     actual_conn_id = glue_hook.aws_conn_id
     self.assertEqual(actual_conn_id, 'my_aws_conn_id')
Exemplo n.º 13
0
class TestAwsGlueCatalogHook(unittest.TestCase):
    """Unit tests for AwsGlueCatalogHook, each run under ``mock_glue``."""

    @mock_glue
    def setUp(self):
        # Glue client used by the table tests to create fixtures, plus a hook
        # for the same region.
        self.client = boto3.client('glue', region_name='us-east-1')
        self.hook = AwsGlueCatalogHook(region_name="us-east-1")

    @mock_glue
    def test_get_conn_returns_a_boto3_connection(self):
        hook = AwsGlueCatalogHook(region_name="us-east-1")
        self.assertIsNotNone(hook.get_conn())

    @mock_glue
    def test_conn_id(self):
        hook = AwsGlueCatalogHook(aws_conn_id='my_aws_conn_id',
                                  region_name="us-east-1")
        self.assertEqual(hook.aws_conn_id, 'my_aws_conn_id')

    @mock_glue
    def test_region(self):
        hook = AwsGlueCatalogHook(region_name="us-west-2")
        self.assertEqual(hook.region_name, 'us-west-2')

    @mock_glue
    @mock.patch.object(AwsGlueCatalogHook, 'get_conn')
    def test_get_partitions_empty(self, mock_get_conn):
        # The hook reaches the paginator through get_conn(), so the stub has
        # to hang off mock_get_conn.return_value. Setting it directly on
        # mock_get_conn (as before) configured an attribute the hook never
        # touches, leaving the empty response unused.
        mock_conn = mock_get_conn.return_value
        mock_conn.get_paginator.return_value.paginate.return_value = []
        hook = AwsGlueCatalogHook(region_name="us-east-1")

        self.assertEqual(hook.get_partitions('db', 'tbl'), set())
        # Prove that the stubbed paginator was the one actually requested.
        mock_conn.get_paginator.assert_called_once_with('get_partitions')

    @mock_glue
    @mock.patch.object(AwsGlueCatalogHook, 'get_conn')
    def test_get_partitions(self, mock_get_conn):
        # One page holding one partition, served through the stubbed chain
        # get_conn() -> client -> paginator.
        response = [{'Partitions': [{'Values': ['2015-01-01']}]}]
        mock_paginator = mock.Mock()
        mock_paginator.paginate.return_value = response
        mock_conn = mock.Mock()
        mock_conn.get_paginator.return_value = mock_paginator
        mock_get_conn.return_value = mock_conn
        hook = AwsGlueCatalogHook(region_name="us-east-1")
        result = hook.get_partitions('db',
                                     'tbl',
                                     expression='foo=bar',
                                     page_size=2,
                                     max_items=3)

        self.assertEqual(result, {('2015-01-01', )})
        mock_conn.get_paginator.assert_called_once_with('get_partitions')
        # The paging arguments must be forwarded inside PaginationConfig.
        mock_paginator.paginate.assert_called_once_with(DatabaseName='db',
                                                        TableName='tbl',
                                                        Expression='foo=bar',
                                                        PaginationConfig={
                                                            'PageSize': 2,
                                                            'MaxItems': 3
                                                        })

    @mock_glue
    @mock.patch.object(AwsGlueCatalogHook, 'get_partitions')
    def test_check_for_partition(self, mock_get_partitions):
        # A non-empty result from get_partitions must make the check truthy.
        mock_get_partitions.return_value = {('2018-01-01', )}
        hook = AwsGlueCatalogHook(region_name="us-east-1")

        self.assertTrue(hook.check_for_partition('db', 'tbl', 'expr'))
        mock_get_partitions.assert_called_once_with('db',
                                                    'tbl',
                                                    'expr',
                                                    max_items=1)

    @mock_glue
    @mock.patch.object(AwsGlueCatalogHook, 'get_partitions')
    def test_check_for_partition_false(self, mock_get_partitions):
        # An empty result from get_partitions must make the check falsy.
        mock_get_partitions.return_value = set()
        hook = AwsGlueCatalogHook(region_name="us-east-1")

        self.assertFalse(hook.check_for_partition('db', 'tbl', 'expr'))

    @mock_glue
    def test_get_table_exists(self):
        self.client.create_database(DatabaseInput={'Name': DB_NAME})
        self.client.create_table(DatabaseName=DB_NAME, TableInput=TABLE_INPUT)

        result = self.hook.get_table(DB_NAME, TABLE_NAME)

        self.assertEqual(result['Name'], TABLE_INPUT['Name'])
        self.assertEqual(result['StorageDescriptor']['Location'],
                         TABLE_INPUT['StorageDescriptor']['Location'])

    @mock_glue
    def test_get_table_not_exists(self):
        self.client.create_database(DatabaseInput={'Name': DB_NAME})
        self.client.create_table(DatabaseName=DB_NAME, TableInput=TABLE_INPUT)

        # Asking for a table that was never created must raise.
        with self.assertRaises(Exception):
            self.hook.get_table(DB_NAME, 'dummy_table')

    @mock_glue
    def test_get_table_location(self):
        self.client.create_database(DatabaseInput={'Name': DB_NAME})
        self.client.create_table(DatabaseName=DB_NAME, TableInput=TABLE_INPUT)

        result = self.hook.get_table_location(DB_NAME, TABLE_NAME)
        self.assertEqual(result, TABLE_INPUT['StorageDescriptor']['Location'])
Exemplo n.º 14
0
 def setUp(self):
     # Shared fixtures: a Glue client and a hook bound to the same region.
     region = 'us-east-1'
     self.client = boto3.client('glue', region_name=region)
     self.hook = AwsGlueCatalogHook(region_name=region)
    def test_check_for_partition_false(self, mock_get_partitions):
        # An empty result from get_partitions must make the check falsy.
        mock_get_partitions.return_value = set()
        glue_hook = AwsGlueCatalogHook(region_name="us-east-1")

        found = glue_hook.check_for_partition('db', 'tbl', 'expr')
        self.assertFalse(found)
Exemplo n.º 16
0
    def test_check_for_partition_false(self, mock_get_partitions):
        # An empty result from get_partitions must make the check falsy.
        mock_get_partitions.return_value = set()
        glue_hook = AwsGlueCatalogHook(region_name="us-east-1")

        found = glue_hook.check_for_partition('db', 'tbl', 'expr')
        self.assertFalse(found)
class TestAwsGlueCatalogHook(unittest.TestCase):
    """Unit tests for AwsGlueCatalogHook, each run under ``mock_glue``."""

    @mock_glue
    def setUp(self):
        # Glue client used by the table tests to create fixtures, plus a hook
        # for the same region.
        self.client = boto3.client('glue', region_name='us-east-1')
        self.hook = AwsGlueCatalogHook(region_name="us-east-1")

    @mock_glue
    def test_get_conn_returns_a_boto3_connection(self):
        hook = AwsGlueCatalogHook(region_name="us-east-1")
        self.assertIsNotNone(hook.get_conn())

    @mock_glue
    def test_conn_id(self):
        hook = AwsGlueCatalogHook(aws_conn_id='my_aws_conn_id', region_name="us-east-1")
        self.assertEqual(hook.aws_conn_id, 'my_aws_conn_id')

    @mock_glue
    def test_region(self):
        hook = AwsGlueCatalogHook(region_name="us-west-2")
        self.assertEqual(hook.region_name, 'us-west-2')

    @mock_glue
    @mock.patch.object(AwsGlueCatalogHook, 'get_conn')
    def test_get_partitions_empty(self, mock_get_conn):
        # The hook reaches the paginator through get_conn(), so the stub has
        # to hang off mock_get_conn.return_value. Setting it directly on
        # mock_get_conn (as before) configured an attribute the hook never
        # touches, leaving the empty response unused.
        mock_conn = mock_get_conn.return_value
        mock_conn.get_paginator.return_value.paginate.return_value = []
        hook = AwsGlueCatalogHook(region_name="us-east-1")

        self.assertEqual(hook.get_partitions('db', 'tbl'), set())
        # Prove that the stubbed paginator was the one actually requested.
        mock_conn.get_paginator.assert_called_once_with('get_partitions')

    @mock_glue
    @mock.patch.object(AwsGlueCatalogHook, 'get_conn')
    def test_get_partitions(self, mock_get_conn):
        # One page holding one partition, served through the stubbed chain
        # get_conn() -> client -> paginator.
        response = [{
            'Partitions': [{
                'Values': ['2015-01-01']
            }]
        }]
        mock_paginator = mock.Mock()
        mock_paginator.paginate.return_value = response
        mock_conn = mock.Mock()
        mock_conn.get_paginator.return_value = mock_paginator
        mock_get_conn.return_value = mock_conn
        hook = AwsGlueCatalogHook(region_name="us-east-1")
        result = hook.get_partitions('db',
                                     'tbl',
                                     expression='foo=bar',
                                     page_size=2,
                                     max_items=3)

        self.assertEqual(result, {('2015-01-01',)})
        mock_conn.get_paginator.assert_called_once_with('get_partitions')
        # The paging arguments must be forwarded inside PaginationConfig.
        mock_paginator.paginate.assert_called_once_with(DatabaseName='db',
                                                        TableName='tbl',
                                                        Expression='foo=bar',
                                                        PaginationConfig={
                                                            'PageSize': 2,
                                                            'MaxItems': 3})

    @mock_glue
    @mock.patch.object(AwsGlueCatalogHook, 'get_partitions')
    def test_check_for_partition(self, mock_get_partitions):
        # A non-empty result from get_partitions must make the check truthy.
        mock_get_partitions.return_value = {('2018-01-01',)}
        hook = AwsGlueCatalogHook(region_name="us-east-1")

        self.assertTrue(hook.check_for_partition('db', 'tbl', 'expr'))
        mock_get_partitions.assert_called_once_with('db', 'tbl', 'expr', max_items=1)

    @mock_glue
    @mock.patch.object(AwsGlueCatalogHook, 'get_partitions')
    def test_check_for_partition_false(self, mock_get_partitions):
        # An empty result from get_partitions must make the check falsy.
        mock_get_partitions.return_value = set()
        hook = AwsGlueCatalogHook(region_name="us-east-1")

        self.assertFalse(hook.check_for_partition('db', 'tbl', 'expr'))

    @mock_glue
    def test_get_table_exists(self):
        self.client.create_database(
            DatabaseInput={
                'Name': DB_NAME
            }
        )
        self.client.create_table(
            DatabaseName=DB_NAME,
            TableInput=TABLE_INPUT
        )

        result = self.hook.get_table(DB_NAME, TABLE_NAME)

        self.assertEqual(result['Name'], TABLE_INPUT['Name'])
        self.assertEqual(result['StorageDescriptor']['Location'],
                         TABLE_INPUT['StorageDescriptor']['Location'])

    @mock_glue
    def test_get_table_not_exists(self):
        self.client.create_database(
            DatabaseInput={
                'Name': DB_NAME
            }
        )
        self.client.create_table(
            DatabaseName=DB_NAME,
            TableInput=TABLE_INPUT
        )

        # Asking for a table that was never created must raise.
        with self.assertRaises(Exception):
            self.hook.get_table(DB_NAME, 'dummy_table')

    @mock_glue
    def test_get_table_location(self):
        self.client.create_database(
            DatabaseInput={
                'Name': DB_NAME
            }
        )
        self.client.create_table(
            DatabaseName=DB_NAME,
            TableInput=TABLE_INPUT
        )

        result = self.hook.get_table_location(DB_NAME, TABLE_NAME)
        self.assertEqual(result, TABLE_INPUT['StorageDescriptor']['Location'])
    def test_check_for_partition(self, mock_get_partitions):
        # A non-empty result from get_partitions must make the check truthy.
        mock_get_partitions.return_value = {('2018-01-01',)}
        glue_hook = AwsGlueCatalogHook(region_name="us-east-1")

        found = glue_hook.check_for_partition('db', 'tbl', 'expr')

        self.assertTrue(found)
        # The lookup is expected to be capped at a single item.
        mock_get_partitions.assert_called_once_with(
            'db', 'tbl', 'expr', max_items=1)
 def setUp(self):
     # Shared fixtures: a Glue client and a hook bound to the same region.
     region = 'us-east-1'
     self.client = boto3.client('glue', region_name=region)
     self.hook = AwsGlueCatalogHook(region_name=region)
 def test_get_conn_returns_a_boto3_connection(self):
     # get_conn() must hand back something other than None.
     glue_hook = AwsGlueCatalogHook(region_name="us-east-1")
     connection = glue_hook.get_conn()
     self.assertIsNotNone(connection)
    def test_get_partitions_empty(self, mock_get_conn):
        # The hook reaches the paginator through get_conn(), so the stub has
        # to hang off mock_get_conn.return_value. Setting it directly on
        # mock_get_conn (as before) configured an attribute the hook never
        # touches, leaving the empty response unused.
        mock_conn = mock_get_conn.return_value
        mock_conn.get_paginator.return_value.paginate.return_value = []
        hook = AwsGlueCatalogHook(region_name="us-east-1")

        self.assertEqual(hook.get_partitions('db', 'tbl'), set())
        # Prove that the stubbed paginator was the one actually requested.
        mock_conn.get_paginator.assert_called_once_with('get_partitions')