Пример #1
0
    def _get_partition_watermarks(self, table, tableRef, partitions):
        """Build the low and high watermarks of a time-partitioned table.

        The partition with the smallest ``partition_id`` becomes the low
        watermark and the one with the largest becomes the high watermark.

        :param table: table resource dict; must contain ``timePartitioning``
        :param tableRef: dict with ``projectId``, ``datasetId``, ``tableId``
        :param partitions: non-empty collection of objects exposing
            ``partition_id`` and ``epoch_created``
        :return: ``(low_watermark, high_watermark)``
        """
        # Use the partitioning column when one is named, otherwise fall back
        # to the ``_PARTITIONTIME`` name.
        partitioning = table["timePartitioning"]
        field = partitioning["field"] if "field" in partitioning else "_PARTITIONTIME"

        def by_partition_id(partition):
            return partition.partition_id

        def build(partition, part_type):
            # ``epoch_created`` is converted with fromtimestamp (local time).
            created = datetime.datetime.fromtimestamp(float(partition.epoch_created))
            part_name = "{field}={partition_id}".format(
                field=field, partition_id=partition.partition_id)
            return Watermark(
                created.strftime("%Y-%m-%d %H:%M:%S"),
                "bigquery",
                tableRef["datasetId"],
                tableRef["tableId"],
                part_name,
                part_type=part_type,
                cluster=tableRef["projectId"],
            )

        low_wm = build(min(partitions, key=by_partition_id), "low_watermark")
        high_wm = build(max(partitions, key=by_partition_id), "high_watermark")
        return low_wm, high_wm
Пример #2
0
    def _get_partition_watermarks(
            self, table: Dict[str, Any], tableRef: Dict[str, str],
            partitions: List[PartitionInfo]) -> Tuple[Watermark, Watermark]:
        """
        Derive the low and high watermark of a time-partitioned table.

        The low watermark comes from the partition with the smallest
        ``partition_id``, the high watermark from the one with the largest.

        :param table: table resource dict; must contain ``timePartitioning``
        :param tableRef: dict with ``projectId``, ``datasetId``, ``tableId``
        :param partitions: non-empty list of partitions
        :return: ``(low_watermark, high_watermark)``
        """
        time_partitioning = table['timePartitioning']
        # Fall back to ``_PARTITIONTIME`` when no partitioning column is named.
        if 'field' in time_partitioning:
            field = time_partitioning['field']
        else:
            field = '_PARTITIONTIME'

        def _as_watermark(partition: PartitionInfo, part_type: str) -> Watermark:
            created_at = datetime.datetime.fromtimestamp(
                float(partition.epoch_created))
            return Watermark(
                created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'bigquery',
                tableRef['datasetId'],
                tableRef['tableId'],
                '{field}={partition_id}'.format(
                    field=field, partition_id=partition.partition_id),
                part_type=part_type,
                cluster=tableRef['projectId'])

        oldest = min(partitions, key=lambda p: p.partition_id)
        low_wm = _as_watermark(oldest, 'low_watermark')

        newest = max(partitions, key=lambda p: p.partition_id)
        high_wm = _as_watermark(newest, 'high_watermark')

        return low_wm, high_wm
Пример #3
0
 def test_index_with_data(self) -> None:
     """An index that holds data yields a low and a high watermark."""
     extractor = self._get_extractor([self.index_with_data_1])

     created_at = datetime.fromtimestamp(1641863003).strftime(
         '%Y-%m-%d %H:%M:%S')
     # (epoch seconds of the boundary, watermark type), low first.
     boundaries = [(1641863055, 'low_watermark'),
                   (1641949455, 'high_watermark')]
     expected = [
         Watermark(
             database='elasticsearch',
             cluster='cluster_name',
             schema='schema_name',
             table_name='index_with_data_1',
             create_time=created_at,
             part_name=
             f"time={datetime.fromtimestamp(epoch).strftime('%Y-%m-%d')}",
             part_type=part_type)
         for epoch, part_type in boundaries
     ]
     self._extract_and_compare(extractor, expected)
    def setUp(self) -> None:
        """Create the watermark under test plus the expected graph and serialized results."""
        super(TestWatermark, self).setUp()

        created_at = '2017-09-18T00:00:00'
        partition_value = '2017-09-18/feature_id=9'

        self.watermark = Watermark(
            create_time=created_at,
            database=DATABASE,
            schema=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            part_type=PART_TYPE,
            part_name=NESTED_PART,
        )

        # The watermark key is the table key followed by the partition type.
        self.start_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/'
        self.end_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}'

        self.expected_node_result = GraphNode(
            key=self.start_key,
            label='Watermark',
            attributes={
                'partition_key': 'ds',
                'partition_value': partition_value,
                'create_time': created_at,
            },
        )
        self.expected_serialized_node_results = [{
            NODE_KEY: self.start_key,
            NODE_LABEL: 'Watermark',
            'partition_key': 'ds',
            'partition_value': partition_value,
            'create_time': created_at,
        }]

        self.expected_relation_result = GraphRelationship(
            start_label='Watermark',
            end_label='Table',
            start_key=self.start_key,
            end_key=self.end_key,
            type='BELONG_TO_TABLE',
            reverse_type='WATERMARK',
            attributes={},
        )
        self.expected_serialized_relation_results = [{
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: 'Watermark',
            RELATION_END_KEY: self.end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK',
        }]
    def _retrieve_tables(self, dataset):
        # type: (Any) -> Any
        """Yield low/high watermarks for every matching table of ``dataset``.

        Partitioned tables yield their two watermarks as soon as they are
        seen. Date-sharded tables (same prefix, date suffix) are accumulated
        across ALL result pages and yielded once at the end, so shards that
        span several pages produce exactly one low and one high watermark
        carrying the overall minimum and maximum suffix.

        :param dataset: dataset reference passed to the table-list pager
        :return: generator of watermark records
        """
        sharded_table_watermarks = {}

        for page in self._page_table_list_results(dataset):
            if 'tables' not in page:
                continue

            for table in page['tables']:
                tableRef = table['tableReference']
                if not self._is_table_match_regex(tableRef):
                    continue
                table_id = tableRef['tableId']

                # BigQuery tables that have 8 digits as last characters are
                # considered date range tables and are grouped together in the UI.
                # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
                # We use these suffixes to determine high and low watermarks
                if self._is_sharded_table(table_id):
                    suffix = table_id[-BigQueryWatermarkExtractor.DATE_LENGTH:]
                    prefix = table_id[:-BigQueryWatermarkExtractor.DATE_LENGTH]

                    seen = sharded_table_watermarks.get(prefix)
                    if seen is None:
                        # The first shard seen supplies the table metadata.
                        sharded_table_watermarks[prefix] = {'high': suffix, 'low': suffix, 'table': table}
                    else:
                        seen['low'] = min(seen['low'], suffix)
                        seen['high'] = max(seen['high'], suffix)
                    continue

                partitions = self._get_partitions(table, tableRef)
                if not partitions:
                    continue
                low, high = self._get_partition_watermarks(table, tableRef, partitions)
                yield low
                yield high

        # Emit sharded watermarks only after every page has been read. Doing
        # this inside the page loop re-emitted all accumulated shards for each
        # page and exposed partial low/high values.
        for prefix, td in sharded_table_watermarks.items():
            table = td['table']
            tableRef = table['tableReference']
            # creationTime is divided by 1000 before conversion to a timestamp.
            create_time = datetime.datetime.fromtimestamp(
                float(table['creationTime']) / 1000).strftime('%Y-%m-%d %H:%M:%S')

            yield Watermark(
                create_time,
                'bigquery',
                tableRef['datasetId'],
                prefix,
                '__table__={partition_id}'.format(partition_id=td['low']),
                part_type="low_watermark",
                cluster=tableRef['projectId']
            )

            yield Watermark(
                create_time,
                'bigquery',
                tableRef['datasetId'],
                prefix,
                '__table__={partition_id}'.format(partition_id=td['high']),
                part_type="high_watermark",
                cluster=tableRef['projectId']
            )
    def setUp(self):
        # type: () -> None
        """Create the watermark under test and the expected node/relation dicts."""
        super(TestWatermark, self).setUp()
        self.watermark = Watermark(
            create_time='2017-09-18T00:00:00',
            database=DATABASE,
            schema=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            part_type=PART_TYPE,
            part_name=NESTED_PART,
        )

        # Expected keys are built from the lower-cased identifiers.
        table_key = '{database}://{cluster}.{schema}/{table}'.format(
            database=DATABASE.lower(),
            cluster=CLUSTER.lower(),
            schema=SCHEMA.lower(),
            table=TABLE.lower())
        watermark_key = '{database}://{cluster}.{schema}/{table}/{part_type}/'.format(
            database=DATABASE.lower(),
            cluster=CLUSTER.lower(),
            schema=SCHEMA.lower(),
            table=TABLE.lower(),
            part_type=PART_TYPE.lower())

        self.expected_node_result = {
            NODE_KEY: watermark_key,
            NODE_LABEL: 'Watermark',
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00',
        }

        self.expected_relation_result = {
            RELATION_START_KEY: watermark_key,
            RELATION_START_LABEL: 'Watermark',
            RELATION_END_KEY: table_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK',
        }
Пример #7
0
    def _get_extract_iter(self) -> Iterator[Union[Watermark, None]]:
        """
        Yield a low and a high watermark for every index that has both a
        creation date and watermark bounds.

        Indices lacking either value are skipped. The creation date and the
        bounds are divided by 1000 before being converted with
        ``datetime.fromtimestamp``, i.e. they are treated as epoch milliseconds.
        """
        for index_name, index_metadata in self._get_indexes().items():
            # Both lookups are performed for every index before deciding to skip.
            creation_date: Optional[float] = self._get_index_creation_date(index_metadata)
            bounds: Optional[Tuple[float, float]] = self._get_index_watermark_bounds(index_name=index_name)

            if bounds is None:
                lower = upper = None
            else:
                lower, upper = bounds[0], bounds[1]

            if creation_date is None or lower is None or upper is None:
                continue

            created_at = datetime.fromtimestamp(creation_date / 1000).strftime('%Y-%m-%d %H:%M:%S')
            # Format both bounds up front so a conversion failure surfaces
            # before anything is yielded for this index.
            low_day = datetime.fromtimestamp(lower / 1000).strftime('%Y-%m-%d')
            high_day = datetime.fromtimestamp(upper / 1000).strftime('%Y-%m-%d')

            for day, part_type in ((low_day, 'low_watermark'), (high_day, 'high_watermark')):
                yield Watermark(
                    database=self.database,
                    cluster=self.cluster,
                    schema=self.schema,
                    table_name=index_name,
                    create_time=created_at,
                    part_name=f'{self._time_field}={day}',
                    part_type=part_type
                )
class TestWatermark(unittest.TestCase):
    def setUp(self) -> None:
        super(TestWatermark, self).setUp()
        self.watermark = Watermark(create_time='2017-09-18T00:00:00',
                                   database=DATABASE,
                                   schema=SCHEMA,
                                   table_name=TABLE,
                                   cluster=CLUSTER,
                                   part_type=PART_TYPE,
                                   part_name=NESTED_PART)
        self.start_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/'
        self.end_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}'
        self.expected_node_result = GraphNode(key=self.start_key,
                                              label='Watermark',
                                              attributes={
                                                  'partition_key':
                                                  'ds',
                                                  'partition_value':
                                                  '2017-09-18/feature_id=9',
                                                  'create_time':
                                                  '2017-09-18T00:00:00'
                                              })

        self.expected_serialized_node_result = {
            NODE_KEY: self.start_key,
            NODE_LABEL: 'Watermark',
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00'
        }

        self.expected_relation_result = GraphRelationship(
            start_label='Watermark',
            end_label='Table',
            start_key=self.start_key,
            end_key=self.end_key,
            type='BELONG_TO_TABLE',
            reverse_type='WATERMARK',
            attributes={})

        self.expected_serialized_relation_result = {
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: 'Watermark',
            RELATION_END_KEY: self.end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK'
        }

    def test_get_watermark_model_key(self) -> None:
        watermark = self.watermark.get_watermark_model_key()
        self.assertEqual(
            watermark, f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/')

    def test_get_metadata_model_key(self) -> None:
        metadata = self.watermark.get_metadata_model_key()
        self.assertEqual(metadata, f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}')

    def test_create_nodes(self) -> None:
        nodes = self.watermark.create_nodes()
        self.assertEquals(len(nodes), 1)

        self.assertEquals(nodes[0], self.expected_node_result)
        self.assertEqual(neo4_serializer.serialize_node(nodes[0]),
                         self.expected_serialized_node_result)

    def test_create_nodes_neptune(self) -> None:
        nodes = self.watermark.create_nodes()

        expected_serialized_node_result = {
            NEPTUNE_HEADER_ID:
            self.start_key,
            NEPTUNE_HEADER_LABEL:
            'Watermark',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB,
            'partition_key:String(single)':
            'ds',
            'partition_value:String(single)':
            '2017-09-18/feature_id=9',
            'create_time:String(single)':
            '2017-09-18T00:00:00'
        }

        serialized_node = neptune_serializer.convert_node(nodes[0])
        self.assertDictEqual(expected_serialized_node_result, serialized_node)

    def test_create_relation(self) -> None:
        relation = self.watermark.create_relation()
        self.assertEquals(len(relation), 1)
        self.assertEquals(relation[0], self.expected_relation_result)
        self.assertEqual(neo4_serializer.serialize_relationship(relation[0]),
                         self.expected_serialized_relation_result)

    def test_create_relation_neptune(self) -> None:
        relation = self.watermark.create_relation()
        serialized_relation = neptune_serializer.convert_relationship(
            relation[0])
        expected = [{
            NEPTUNE_HEADER_ID:
            "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id=self.start_key,
                to_vertex_id=self.end_key,
                label='BELONG_TO_TABLE'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM:
            self.start_key,
            NEPTUNE_RELATIONSHIP_HEADER_TO:
            self.end_key,
            NEPTUNE_HEADER_LABEL:
            'BELONG_TO_TABLE',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB
        }, {
            NEPTUNE_HEADER_ID:
            "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id=self.end_key,
                to_vertex_id=self.start_key,
                label='WATERMARK'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM:
            self.end_key,
            NEPTUNE_RELATIONSHIP_HEADER_TO:
            self.start_key,
            NEPTUNE_HEADER_LABEL:
            'WATERMARK',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB
        }]

        self.assertListEqual(serialized_relation, expected)

    def test_create_next_node(self) -> None:
        next_node = self.watermark.create_next_node()
        self.assertEquals(neo4_serializer.serialize_node(next_node),
                          self.expected_serialized_node_result)

    def test_create_next_relation(self) -> None:
        next_relation = self.watermark.create_next_relation()
        self.assertEquals(
            neo4_serializer.serialize_relationship(next_relation),
            self.expected_serialized_relation_result)
class TestWatermark(unittest.TestCase):

    def setUp(self):
        # type: () -> None
        super(TestWatermark, self).setUp()
        self.watermark = Watermark(create_time='2017-09-18T00:00:00',
                                   database=DATABASE,
                                   schema_name=SCHEMA,
                                   table_name=TABLE,
                                   cluster=CLUSTER,
                                   part_type=PART_TYPE,
                                   part_name=NESTED_PART)

        self.expected_node_result = {
            NODE_KEY: '{database}://{cluster}.{schema}/{table}/{part_type}/'
            .format(
                database=DATABASE.lower(),
                cluster=CLUSTER.lower(),
                schema=SCHEMA.lower(),
                table=TABLE.lower(),
                part_type=PART_TYPE.lower()),
            NODE_LABEL: 'Watermark',
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00'
        }

        self.expected_relation_result = {
            RELATION_START_KEY: '{database}://{cluster}.{schema}/{table}/{part_type}/'
            .format(
                database=DATABASE.lower(),
                cluster=CLUSTER.lower(),
                schema=SCHEMA.lower(),
                table=TABLE.lower(),
                part_type=PART_TYPE.lower()),
            RELATION_START_LABEL: 'Watermark',
            RELATION_END_KEY: '{database}://{cluster}.{schema}/{table}'
            .format(
                database=DATABASE.lower(),
                cluster=CLUSTER.lower(),
                schema=SCHEMA.lower(),
                table=TABLE.lower()),
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK'
        }

    def test_get_watermark_model_key(self):
        # type: () -> None
        watermark = self.watermark.get_watermark_model_key()
        self.assertEquals(
            watermark, '{database}://{cluster}.{schema}/{table}/{part_type}/'
            .format(database=DATABASE.lower(),
                    cluster=CLUSTER.lower(),
                    schema=SCHEMA.lower(),
                    table=TABLE.lower(),
                    part_type=PART_TYPE.lower()))

    def test_get_metadata_model_key(self):
        # type: () -> None
        metadata = self.watermark.get_metadata_model_key()
        self.assertEquals(metadata, '{database}://{cluster}.{schema}/{table}'
                          .format(database=DATABASE.lower(),
                                  cluster=CLUSTER.lower(),
                                  schema=SCHEMA.lower(),
                                  table=TABLE.lower()))

    def test_create_nodes(self):
        # type: () -> None
        nodes = self.watermark.create_nodes()
        self.assertEquals(len(nodes), 1)
        self.assertEquals(nodes[0], self.expected_node_result)

    def test_create_relation(self):
        # type: () -> None
        relation = self.watermark.create_relation()
        self.assertEquals(len(relation), 1)
        self.assertEquals(relation[0], self.expected_relation_result)

    def test_create_next_node(self):
        # type: () -> None
        next_node = self.watermark.create_next_node()
        self.assertEquals(next_node, self.expected_node_result)

    def test_create_next_relation(self):
        # type: () -> None
        next_relation = self.watermark.create_next_relation()
        self.assertEquals(next_relation, self.expected_relation_result)
Пример #10
0
    def _retrieve_tables(self, dataset: DatasetRef) -> Iterator[Watermark]:
        """
        Yield low/high watermarks for the tables of ``dataset`` that were
        created before the configured cut-off time.

        Partitioned tables yield their two watermarks as soon as they are
        seen. Date-sharded tables (same prefix, date at the start of the
        suffix) are accumulated across ALL result pages and yielded once at
        the end, so shards that span several pages produce exactly one low
        and one high watermark carrying the overall minimum and maximum date.

        :param dataset: dataset reference passed to the table-list pager
        :return: iterator of watermark records
        """
        sharded_table_watermarks: Dict[str, Dict[str, Union[str, Any]]] = {}
        cutoff_time_in_epoch = timegm(
            time.strptime(self.cutoff_time,
                          BigQueryWatermarkExtractor.DATE_TIME_FORMAT))

        for page in self._page_table_list_results(dataset):
            if 'tables' not in page:
                continue

            for table in page['tables']:
                tableRef = table['tableReference']
                table_id = tableRef['tableId']
                # creationTime is divided by 1000 before being compared with
                # the cut-off expressed in epoch seconds.
                table_creation_time = float(table['creationTime']) / 1000
                # only extract watermark metadata for tables created before the cut-off time
                if not table_creation_time < cutoff_time_in_epoch:
                    continue

                # BigQuery tables that have numeric suffix starts with a date are
                # considered date range tables.
                # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
                # We use these dates in the suffixes to determine high and low watermarks
                if self._is_sharded_table(table_id):
                    suffix = self._get_sharded_table_suffix(table_id)
                    prefix = table_id[:-len(suffix)]
                    date = suffix[:BaseBigQueryExtractor.DATE_LENGTH]

                    seen = sharded_table_watermarks.get(prefix)
                    if seen is None:
                        # The first shard seen supplies the table metadata.
                        sharded_table_watermarks[prefix] = {
                            'high': date,
                            'low': date,
                            'table': table
                        }
                    else:
                        seen['low'] = min(seen['low'], date)
                        seen['high'] = max(seen['high'], date)
                    continue

                partitions = self._get_partitions(table, tableRef)
                if not partitions:
                    continue
                low, high = self._get_partition_watermarks(
                    table, tableRef, partitions)
                yield low
                yield high

        # Emit sharded watermarks only after every page has been read. Doing
        # this inside the page loop re-emitted all accumulated shards for each
        # page and exposed partial low/high values.
        for prefix, td in sharded_table_watermarks.items():
            table = td['table']
            tableRef = table['tableReference']
            create_time = datetime.datetime.fromtimestamp(
                float(table['creationTime']) /
                1000).strftime('%Y-%m-%d %H:%M:%S')

            yield Watermark(create_time,
                            'bigquery',
                            tableRef['datasetId'],
                            prefix,
                            f'__table__={td["low"]}',
                            part_type="low_watermark",
                            cluster=tableRef['projectId'])

            yield Watermark(create_time,
                            'bigquery',
                            tableRef['datasetId'],
                            prefix,
                            f'__table__={td["high"]}',
                            part_type="high_watermark",
                            cluster=tableRef['projectId'])
class TestWatermark(unittest.TestCase):
    def setUp(self) -> None:
        super(TestWatermark, self).setUp()
        self.watermark = Watermark(create_time='2017-09-18T00:00:00',
                                   database=DATABASE,
                                   schema=SCHEMA,
                                   table_name=TABLE,
                                   cluster=CLUSTER,
                                   part_type=PART_TYPE,
                                   part_name=NESTED_PART)
        start_key = '{database}://{cluster}.{schema}/{table}/{part_type}/'.format(
            database=DATABASE,
            cluster=CLUSTER,
            schema=SCHEMA,
            table=TABLE,
            part_type=PART_TYPE)
        end_key = '{database}://{cluster}.{schema}/{table}'.format(
            database=DATABASE, cluster=CLUSTER, schema=SCHEMA, table=TABLE)
        self.expected_node_result = GraphNode(key=start_key,
                                              label='Watermark',
                                              attributes={
                                                  'partition_key':
                                                  'ds',
                                                  'partition_value':
                                                  '2017-09-18/feature_id=9',
                                                  'create_time':
                                                  '2017-09-18T00:00:00'
                                              })

        self.expected_serialized_node_result = {
            NODE_KEY: start_key,
            NODE_LABEL: 'Watermark',
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00'
        }

        self.expected_relation_result = GraphRelationship(
            start_label='Watermark',
            end_label='Table',
            start_key=start_key,
            end_key=end_key,
            type='BELONG_TO_TABLE',
            reverse_type='WATERMARK',
            attributes={})

        self.expected_serialized_relation_result = {
            RELATION_START_KEY: start_key,
            RELATION_START_LABEL: 'Watermark',
            RELATION_END_KEY: end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK'
        }

    def test_get_watermark_model_key(self) -> None:
        watermark = self.watermark.get_watermark_model_key()
        self.assertEqual(
            watermark,
            '{database}://{cluster}.{schema}/{table}/{part_type}/'.format(
                database=DATABASE,
                cluster=CLUSTER,
                schema=SCHEMA,
                table=TABLE,
                part_type=PART_TYPE))

    def test_get_metadata_model_key(self) -> None:
        metadata = self.watermark.get_metadata_model_key()
        self.assertEqual(
            metadata,
            '{database}://{cluster}.{schema}/{table}'.format(database=DATABASE,
                                                             cluster=CLUSTER,
                                                             schema=SCHEMA,
                                                             table=TABLE))

    def test_create_nodes(self) -> None:
        nodes = self.watermark.create_nodes()
        self.assertEquals(len(nodes), 1)

        self.assertEquals(nodes[0], self.expected_node_result)
        self.assertEqual(neo4_serializer.serialize_node(nodes[0]),
                         self.expected_serialized_node_result)

    def test_create_relation(self) -> None:
        relation = self.watermark.create_relation()
        self.assertEquals(len(relation), 1)
        self.assertEquals(relation[0], self.expected_relation_result)
        self.assertEqual(neo4_serializer.serialize_relationship(relation[0]),
                         self.expected_serialized_relation_result)

    def test_create_next_node(self) -> None:
        next_node = self.watermark.create_next_node()
        self.assertEquals(neo4_serializer.serialize_node(next_node),
                          self.expected_serialized_node_result)

    def test_create_next_relation(self) -> None:
        next_relation = self.watermark.create_next_relation()
        self.assertEquals(
            neo4_serializer.serialize_relationship(next_relation),
            self.expected_serialized_relation_result)
Пример #12
0
    def create_table_watermarks(
        self, table: ScrapedTableMetadata
    ) -> Optional[List[Tuple[Watermark, Watermark]]]:  # noqa c901
        """
        Build the high/low watermark pairs for the partition columns of ``table``.

        Returns None when the table has no details or no partition columns. Otherwise returns a list
        holding one ``(high, low)`` tuple per partition column whose data type is int, float, date or
        datetime. A bound that could not be fetched is left as an empty string.
        """
        def _can_show_partitions(t: ScrapedTableMetadata) -> bool:
            """Probe SHOW PARTITIONS on the table; any failure is logged and treated as unsupported."""
            try:
                self.spark.sql(f'show partitions {t.schema}.{t.table}')
            except Exception as e:
                # pyspark.sql.utils.AnalysisException: SHOW PARTITIONS is not allowed on a table that is not partitioned
                LOGGER.warning(e)
                return False
            return True

        def _fetch_bounds(target: ScrapedTableMetadata,
                          column: str) -> Tuple[str, str]:
            """Return ``(max, min)`` of the partition column; failures are logged, not raised."""
            LOGGER.info(
                f'Fetching partition info for {column} in {target.schema}.{target.table}'
            )
            lowest = ""
            highest = ""

            def _edge_value(ascending: bool) -> str:
                # One SHOW PARTITIONS query per bound: sort, then take the first row.
                listing = self.spark.sql(
                    f'show partitions {target.schema}.{target.table}')
                return str(
                    listing.orderBy(column,
                                    ascending=ascending).first()[column])

            try:
                if can_show_partitions:
                    LOGGER.info('Using SHOW PARTITION')
                    lowest = _edge_value(True)
                    highest = _edge_value(False)
                else:
                    LOGGER.info('Using DESCRIBE EXTENDED')
                    stat_rows = self.spark.sql(
                        f'describe extended {target.schema}.{target.table} {column}'
                    ).collect()
                    stats = {}
                    for row in stat_rows:
                        if row['info_name'] in ['min', 'max']:
                            stats[row['info_name']] = row['info_value']
                    lowest = stats['min']
                    highest = stats['max']
            except Exception as e:
                LOGGER.warning(f'Failed fetching partition watermarks: {e}')
            # Note the order: the maximum comes first.
            return highest, lowest

        detail = table.table_detail
        if not detail:
            LOGGER.info(f'No table details found in {table}, skipping')
            return None

        if 'partitionColumns' not in detail or len(
                detail['partitionColumns']) < 1:
            LOGGER.info(f'No partitions found in {table}, skipping')
            return None

        can_show_partitions: bool = _can_show_partitions(table)

        if not can_show_partitions:
            # Column statistics must be computed before DESCRIBE EXTENDED is read for min/max.
            LOGGER.info('Analyzing table, this can take a while...')
            partition_columns = ','.join(detail['partitionColumns'])
            self.spark.sql(
                f"analyze table {table.schema}.{table.table} compute statistics for columns {partition_columns}"
            )

        # Watermarks only make sense for types with a notion of high and low;
        # a string partition such as a country has none.
        valid_types = ['int', 'float', 'date', 'datetime']
        columns_with_valid_type = [
            column.name for column in (table.columns or [])
            if str(column.data_type).lower() in valid_types
        ]

        watermark_pairs = []
        for partition_column in detail['partitionColumns']:
            if partition_column not in columns_with_valid_type:
                continue

            last, first = _fetch_bounds(table, partition_column)
            low = Watermark(create_time=detail['createdAt'],
                            database=self._db,
                            schema=table.schema,
                            table_name=table.table,
                            part_name=f'{partition_column}={first}',
                            part_type='low_watermark',
                            cluster=self._cluster)
            high = Watermark(create_time=detail['createdAt'],
                             database=self._db,
                             schema=table.schema,
                             table_name=table.table,
                             part_name=f'{partition_column}={last}',
                             part_type='high_watermark',
                             cluster=self._cluster)
            watermark_pairs.append((high, low))
        return watermark_pairs
class TestWatermark(unittest.TestCase):
    """Checks how a single ``Watermark`` fixture is keyed and serialized.

    ``setUp`` builds one watermark from the module-level constants; the tests
    then compare its model keys and the nodes, relations and records it yields
    (as rendered by ``neo4_serializer``, ``neptune_serializer`` and
    ``mysql_serializer``) against hand-written expectations.
    """

    def setUp(self) -> None:
        super().setUp()

        # NOTE(review): NESTED_PART is defined outside this class; the expected
        # values below imply it splits into key 'ds' and value
        # '2017-09-18/feature_id=9' -- confirm against the constant.
        self.watermark = Watermark(
            create_time='2017-09-18T00:00:00',
            database=DATABASE,
            schema=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            part_type=PART_TYPE,
            part_name=NESTED_PART
        )

        # Key of the watermark itself, and key of the table it hangs off.
        self.start_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/'
        self.end_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}'

        # Attributes shared by the raw node and its serialized form.
        node_attributes = {
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00'
        }

        self.expected_node_result = GraphNode(
            key=self.start_key,
            label='Watermark',
            attributes=dict(node_attributes)
        )
        self.expected_serialized_node_results = [
            {NODE_KEY: self.start_key, NODE_LABEL: 'Watermark', **node_attributes}
        ]

        self.expected_relation_result = GraphRelationship(
            start_label='Watermark',
            end_label='Table',
            start_key=self.start_key,
            end_key=self.end_key,
            type='BELONG_TO_TABLE',
            reverse_type='WATERMARK',
            attributes={}
        )
        self.expected_serialized_relation_results = [{
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: 'Watermark',
            RELATION_END_KEY: self.end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK'
        }]

    @staticmethod
    def _serialize_all(fetch_next, serialize) -> list:
        # Calls ``fetch_next`` repeatedly until it returns a falsy value and
        # passes every item through ``serialize`` as soon as it is produced,
        # returning the serialized items in order.
        serialized = []
        while True:
            item = fetch_next()
            if not item:
                return serialized
            serialized.append(serialize(item))

    def test_get_watermark_model_key(self) -> None:
        # The watermark key is the table key plus the partition type and a trailing slash.
        self.assertEqual(self.watermark.get_watermark_model_key(),
                         f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/')

    def test_get_metadata_model_key(self) -> None:
        # The metadata key identifies the owning table only.
        self.assertEqual(self.watermark.get_metadata_model_key(),
                         f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}')

    def test_create_nodes(self) -> None:
        actual = self._serialize_all(self.watermark.create_next_node,
                                     neo4_serializer.serialize_node)

        self.assertEqual(actual, self.expected_serialized_node_results)

    def test_create_nodes_neptune(self) -> None:
        # The expected id and metadata key are both the node key prefixed with its label.
        neptune_node_id = 'Watermark:' + self.start_key
        expected_serialized_node_results = [{
            NEPTUNE_HEADER_ID: neptune_node_id,
            METADATA_KEY_PROPERTY_NAME: neptune_node_id,
            NEPTUNE_HEADER_LABEL: 'Watermark',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            'partition_key:String(single)': 'ds',
            'partition_value:String(single)': '2017-09-18/feature_id=9',
            'create_time:String(single)': '2017-09-18T00:00:00'
        }]

        actual = self._serialize_all(self.watermark.create_next_node,
                                     neptune_serializer.convert_node)

        self.assertEqual(expected_serialized_node_results, actual)

    def test_create_relation(self) -> None:
        actual = self._serialize_all(self.watermark.create_next_relation,
                                     neo4_serializer.serialize_relationship)

        self.assertEqual(actual, self.expected_serialized_relation_results)

    def test_create_relation_neptune(self) -> None:
        actual = self._serialize_all(self.watermark.create_next_relation,
                                     neptune_serializer.convert_relationship)

        # Each relation is expected as a pair of edges: watermark -> table
        # and the reverse table -> watermark.
        watermark_vertex = "Watermark:" + self.start_key
        table_vertex = "Table:" + self.end_key
        edge_id_template = "{label}:{from_vertex_id}_{to_vertex_id}"
        forward_edge_id = edge_id_template.format(
            from_vertex_id=watermark_vertex,
            to_vertex_id=table_vertex,
            label='BELONG_TO_TABLE'
        )
        reverse_edge_id = edge_id_template.format(
            from_vertex_id=table_vertex,
            to_vertex_id=watermark_vertex,
            label='WATERMARK'
        )

        forward_edge = {
            NEPTUNE_HEADER_ID: forward_edge_id,
            METADATA_KEY_PROPERTY_NAME: forward_edge_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: watermark_vertex,
            NEPTUNE_RELATIONSHIP_HEADER_TO: table_vertex,
            NEPTUNE_HEADER_LABEL: 'BELONG_TO_TABLE',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        reverse_edge = {
            NEPTUNE_HEADER_ID: reverse_edge_id,
            METADATA_KEY_PROPERTY_NAME: reverse_edge_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: table_vertex,
            NEPTUNE_RELATIONSHIP_HEADER_TO: watermark_vertex,
            NEPTUNE_HEADER_LABEL: 'WATERMARK',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        expected = [[forward_edge, reverse_edge]]

        self.assertListEqual(actual, expected)

    def test_create_records(self) -> None:
        # The expected record carries the watermark key as 'rk' and the table key as 'table_rk'.
        expected = [{
            'rk': self.start_key,
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00',
            'table_rk': self.end_key
        }]

        actual = self._serialize_all(self.watermark.create_next_record,
                                     mysql_serializer.serialize_record)

        self.assertEqual(actual, expected)