Python Watermark.Watermark примеры, databuilder.models.watermark.Watermark.Watermark Python примеры использования

Пример #1

0

Показать файл

    def _get_partition_watermarks(self, table, tableRef, partitions):
        """Build the low and high watermarks of a partitioned table.

        The partition with the smallest ``partition_id`` produces the low
        watermark and the one with the largest ``partition_id`` produces the
        high watermark.  The partition column is taken from
        ``table["timePartitioning"]["field"]`` when that key exists and
        falls back to ``_PARTITIONTIME`` otherwise.

        :param table: table description containing a ``timePartitioning`` entry
        :param tableRef: mapping with ``projectId``, ``datasetId`` and ``tableId``
        :param partitions: objects exposing ``partition_id`` and ``epoch_created``
        :return: tuple ``(low_watermark, high_watermark)``
        """
        time_partitioning = table["timePartitioning"]
        field = (time_partitioning["field"]
                 if "field" in time_partitioning else "_PARTITIONTIME")

        def by_partition_id(partition):
            return partition.partition_id

        watermarks = []
        # Low first, then high: each bound is selected and converted before
        # the next one is looked at.
        for select, part_type in ((min, "low_watermark"),
                                  (max, "high_watermark")):
            partition = select(partitions, key=by_partition_id)
            created = datetime.datetime.fromtimestamp(
                float(partition.epoch_created))
            watermarks.append(Watermark(
                created.strftime("%Y-%m-%d %H:%M:%S"),
                "bigquery",
                tableRef["datasetId"],
                tableRef["tableId"],
                "{field}={partition_id}".format(
                    field=field, partition_id=partition.partition_id),
                part_type=part_type,
                cluster=tableRef["projectId"],
            ))

        low_wm, high_wm = watermarks
        return low_wm, high_wm

Пример #2

0

Показать файл

    def _get_partition_watermarks(
            self, table: Dict[str, Any], tableRef: Dict[str, str],
            partitions: List[PartitionInfo]) -> Tuple[Watermark, Watermark]:
        """
        Derive the low and high watermark of a partitioned table.

        The partitions with the smallest and the largest ``partition_id``
        become the low and the high watermark respectively. The partition
        column name is ``table['timePartitioning']['field']`` when present,
        otherwise ``_PARTITIONTIME``.

        :param table: table description holding a ``timePartitioning`` entry
        :param tableRef: reference with ``projectId``, ``datasetId``, ``tableId``
        :param partitions: partitions to pick the bounds from
        :return: ``(low_watermark, high_watermark)``
        """
        partitioning = table['timePartitioning']
        field = '_PARTITIONTIME'
        if 'field' in partitioning:
            field = partitioning['field']

        def _partition_id(partition: PartitionInfo) -> Any:
            return partition.partition_id

        def _as_watermark(partition: PartitionInfo,
                          part_type: str) -> Watermark:
            # Shared construction for both bounds; only the partition and
            # the watermark type differ.
            created_at = datetime.datetime.fromtimestamp(
                float(partition.epoch_created))
            return Watermark(
                created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'bigquery',
                tableRef['datasetId'],
                tableRef['tableId'],
                '{field}={partition_id}'.format(
                    field=field, partition_id=partition.partition_id),
                part_type=part_type,
                cluster=tableRef['projectId'])

        low_wm = _as_watermark(min(partitions, key=_partition_id),
                               "low_watermark")
        high_wm = _as_watermark(max(partitions, key=_partition_id),
                                "high_watermark")

        return low_wm, high_wm

Пример #3

0

Показать файл

 def test_index_with_data(self) -> None:
     """An index holding data must produce one low and one high watermark."""
     extractor = self._get_extractor([self.index_with_data_1])

     created_at = datetime.fromtimestamp(1641863003).strftime(
         '%Y-%m-%d %H:%M:%S')
     # (watermark type, epoch seconds of the expected bound), low before high.
     bounds = (
         ('low_watermark', 1641863055),
         ('high_watermark', 1641949455),
     )
     expected = [
         Watermark(
             database='elasticsearch',
             cluster='cluster_name',
             schema='schema_name',
             table_name='index_with_data_1',
             create_time=created_at,
             part_name=
             f"time={datetime.fromtimestamp(epoch).strftime('%Y-%m-%d')}",
             part_type=part_type)
         for part_type, epoch in bounds
     ]

     self._extract_and_compare(extractor, expected)

Пример #4

0

Показать файл

Файл: bigquery_watermark_extractor.py Проект: Victoriapm/whale

    def _retrieve_tables(self, dataset):
        # type: (Any) -> Any
        """
        Yield low and high watermarks for the tables of ``dataset``.

        Tables are read page by page from ``_page_table_list_results`` and
        filtered with ``_is_table_match_regex``. Two kinds of tables are
        handled:

        * Sharded tables (``_is_sharded_table``): the trailing ``DATE_LENGTH``
          characters of the table id are the shard suffix and the rest is the
          prefix. The smallest and largest suffix seen per prefix become the
          low and high watermark of that prefix.
        * All other tables: partitions come from ``_get_partitions`` and, when
          there are any, the watermarks from ``_get_partition_watermarks``.

        Sharded watermarks are emitted once, after every page has been read.
        Emitting them inside the page loop would yield incomplete bounds and
        repeat already emitted prefixes whenever the listing spans more than
        one page.

        :param dataset: forwarded unchanged to ``_page_table_list_results``
        :return: generator of Watermark objects
        """
        date_length = BigQueryWatermarkExtractor.DATE_LENGTH
        sharded_table_watermarks = {}

        for page in self._page_table_list_results(dataset):
            if 'tables' not in page:
                continue

            for table in page['tables']:
                tableRef = table['tableReference']
                if not self._is_table_match_regex(tableRef):
                    continue

                table_id = tableRef['tableId']

                # BigQuery tables that have 8 digits as last characters are
                # considered date range tables and are grouped together in the UI.
                # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
                # We use these suffixes to determine high and low watermarks
                if self._is_sharded_table(table_id):
                    suffix = table_id[-date_length:]
                    prefix = table_id[:-date_length]

                    bounds = sharded_table_watermarks.get(prefix)
                    if bounds is None:
                        # The first shard seen for a prefix also supplies the
                        # table metadata used when the watermark is emitted.
                        sharded_table_watermarks[prefix] = {'high': suffix, 'low': suffix, 'table': table}
                    else:
                        bounds['low'] = min(bounds['low'], suffix)
                        bounds['high'] = max(bounds['high'], suffix)
                    continue

                partitions = self._get_partitions(table, tableRef)
                if not partitions:
                    continue
                low, high = self._get_partition_watermarks(table, tableRef, partitions)
                yield low
                yield high

        # All pages consumed: the per-prefix bounds are final now.
        for prefix, td in sharded_table_watermarks.items():
            table = td['table']
            tableRef = table['tableReference']
            # creationTime is divided by 1000 before conversion, as in the original code.
            create_time = datetime.datetime.fromtimestamp(
                float(table['creationTime']) / 1000).strftime('%Y-%m-%d %H:%M:%S')

            yield Watermark(
                create_time,
                'bigquery',
                tableRef['datasetId'],
                prefix,
                '__table__={partition_id}'.format(partition_id=td['low']),
                part_type="low_watermark",
                cluster=tableRef['projectId']
            )

            yield Watermark(
                create_time,
                'bigquery',
                tableRef['datasetId'],
                prefix,
                '__table__={partition_id}'.format(partition_id=td['high']),
                part_type="high_watermark",
                cluster=tableRef['projectId']
            )

Пример #5

0

Показать файл

Файл: test_watermark.py Проект: ponizvezdochka/amundsendatabuilder

    def setUp(self) -> None:
        """Create the Watermark under test and the node/relation fixtures it is compared with."""
        super(TestWatermark, self).setUp()

        self.watermark = Watermark(
            create_time='2017-09-18T00:00:00',
            database=DATABASE,
            schema=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            part_type=PART_TYPE,
            part_name=NESTED_PART,
        )
        self.start_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/'
        self.end_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}'

        # Attributes expected on the watermark node; the serialized form
        # carries the same pairs after the key and label entries.
        node_attributes = {
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00',
        }
        self.expected_node_result = GraphNode(
            key=self.start_key,
            label='Watermark',
            attributes=node_attributes,
        )
        self.expected_serialized_node_results = [
            {
                NODE_KEY: self.start_key,
                NODE_LABEL: 'Watermark',
                **node_attributes,
            },
        ]

        self.expected_relation_result = GraphRelationship(
            start_label='Watermark',
            end_label='Table',
            start_key=self.start_key,
            end_key=self.end_key,
            type='BELONG_TO_TABLE',
            reverse_type='WATERMARK',
            attributes={},
        )
        self.expected_serialized_relation_results = [
            {
                RELATION_START_KEY: self.start_key,
                RELATION_START_LABEL: 'Watermark',
                RELATION_END_KEY: self.end_key,
                RELATION_END_LABEL: 'Table',
                RELATION_TYPE: 'BELONG_TO_TABLE',
                RELATION_REVERSE_TYPE: 'WATERMARK',
            },
        ]

Пример #6

0

Показать файл

Файл: test_watermark.py Проект: saiharish97/amundsendatabuilder

    def setUp(self):
        # type: () -> None
        """Create the Watermark under test and the expected node/relation dictionaries."""
        super(TestWatermark, self).setUp()

        self.watermark = Watermark(
            create_time='2017-09-18T00:00:00',
            database=DATABASE,
            schema=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            part_type=PART_TYPE,
            part_name=NESTED_PART,
        )

        # Both keys are built from the lower-cased identifiers; the watermark
        # key additionally carries the lower-cased partition type.
        key_fields = dict(
            database=DATABASE.lower(),
            cluster=CLUSTER.lower(),
            schema=SCHEMA.lower(),
            table=TABLE.lower(),
        )
        watermark_key = '{database}://{cluster}.{schema}/{table}/{part_type}/'.format(
            part_type=PART_TYPE.lower(), **key_fields)
        table_key = '{database}://{cluster}.{schema}/{table}'.format(**key_fields)

        self.expected_node_result = {
            NODE_KEY: watermark_key,
            NODE_LABEL: 'Watermark',
            'partition_key': 'ds',
            'partition_value': '2017-09-18/feature_id=9',
            'create_time': '2017-09-18T00:00:00',
        }

        self.expected_relation_result = {
            RELATION_START_KEY: watermark_key,
            RELATION_START_LABEL: 'Watermark',
            RELATION_END_KEY: table_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK',
        }

Пример #7

0

Показать файл

    def _get_extract_iter(self) -> Iterator[Union[Watermark, None]]:
        """
        Yield a low and a high watermark for every index that has both a creation date and watermark bounds.

        Indices whose creation date, lower bound or upper bound is unavailable are skipped.
        """
        def _format_ms(epoch_ms: float, pattern: str) -> str:
            # The values are divided by 1000 before conversion, exactly as in the inline original.
            return datetime.fromtimestamp(epoch_ms / 1000).strftime(pattern)

        all_indices: Dict = self._get_indexes()

        for index_name, index_metadata in all_indices.items():
            created_ms: Optional[float] = self._get_index_creation_date(index_metadata)
            bounds: Optional[Tuple[float, float]] = self._get_index_watermark_bounds(index_name=index_name)

            lower_ms: Optional[float] = None
            upper_ms: Optional[float] = None
            if bounds is not None:
                lower_ms = bounds[0]
                upper_ms = bounds[1]

            if created_ms is None or lower_ms is None or upper_ms is None:
                continue

            # Format everything up front so nothing is yielded if a conversion fails.
            creation_date_str: str = _format_ms(created_ms, '%Y-%m-%d %H:%M:%S')
            watermark_min_str: str = _format_ms(lower_ms, '%Y-%m-%d')
            watermark_max_str: str = _format_ms(upper_ms, '%Y-%m-%d')

            partitions = (
                (f'{self._time_field}={watermark_min_str}', 'low_watermark'),
                (f'{self._time_field}={watermark_max_str}', 'high_watermark'),
            )
            for part_name, part_type in partitions:
                yield Watermark(
                    database=self.database,
                    cluster=self.cluster,
                    schema=self.schema,
                    table_name=index_name,
                    create_time=creation_date_str,
                    part_name=part_name,
                    part_type=part_type
                )

Пример #8

0

Показать файл

    def _retrieve_tables(self, dataset: DatasetRef) -> Iterator[Watermark]:
        """
        Yield low and high watermarks for the tables of ``dataset``.

        Only tables whose creation time lies before ``self.cutoff_time``
        (parsed with ``BigQueryWatermarkExtractor.DATE_TIME_FORMAT``) are
        considered. Two kinds of tables are handled:

        * Sharded tables (``_is_sharded_table``): the suffix returned by
          ``_get_sharded_table_suffix`` is cut off to obtain the prefix, and
          the first ``BaseBigQueryExtractor.DATE_LENGTH`` characters of the
          suffix are the shard date. The smallest and largest date per prefix
          become the low and high watermark of that prefix.
        * All other tables: partitions come from ``_get_partitions`` and, when
          there are any, the watermarks from ``_get_partition_watermarks``.

        Sharded watermarks are emitted once, after every page has been read.
        Emitting them inside the page loop would yield incomplete bounds and
        repeat already emitted prefixes whenever the listing spans more than
        one page.

        :param dataset: forwarded unchanged to ``_page_table_list_results``
        :return: iterator of Watermark objects
        """
        sharded_table_watermarks: Dict[str, Dict[str, Union[str, Any]]] = {}
        cutoff_time_in_epoch = timegm(
            time.strptime(self.cutoff_time,
                          BigQueryWatermarkExtractor.DATE_TIME_FORMAT))

        for page in self._page_table_list_results(dataset):
            if 'tables' not in page:
                continue

            for table in page['tables']:
                tableRef = table['tableReference']
                table_id = tableRef['tableId']
                table_creation_time = float(table['creationTime']) / 1000
                # only extract watermark metadata for tables created before the cut-off time
                if table_creation_time < cutoff_time_in_epoch:
                    # BigQuery tables that have numeric suffix starts with a date are
                    # considered date range tables.
                    # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
                    # We use these dates in the suffixes to determine high and low watermarks
                    if self._is_sharded_table(table_id):
                        suffix = self._get_sharded_table_suffix(table_id)
                        prefix = table_id[:-len(suffix)]
                        date = suffix[:BaseBigQueryExtractor.DATE_LENGTH]

                        bounds = sharded_table_watermarks.get(prefix)
                        if bounds is None:
                            # The first shard seen for a prefix also supplies
                            # the table metadata used on emission.
                            sharded_table_watermarks[prefix] = {
                                'high': date,
                                'low': date,
                                'table': table
                            }
                        else:
                            bounds['low'] = min(bounds['low'], date)
                            bounds['high'] = max(bounds['high'], date)
                    else:
                        partitions = self._get_partitions(table, tableRef)
                        if not partitions:
                            continue
                        low, high = self._get_partition_watermarks(
                            table, tableRef, partitions)
                        yield low
                        yield high

        # All pages consumed: the per-prefix bounds are final now.
        for prefix, td in sharded_table_watermarks.items():
            table = td['table']
            tableRef = table['tableReference']
            # creationTime is divided by 1000 before conversion, as above.
            create_time = datetime.datetime.fromtimestamp(
                float(table['creationTime']) /
                1000).strftime('%Y-%m-%d %H:%M:%S')

            yield Watermark(create_time,
                            'bigquery',
                            tableRef['datasetId'],
                            prefix,
                            f'__table__={td["low"]}',
                            part_type="low_watermark",
                            cluster=tableRef['projectId'])

            yield Watermark(create_time,
                            'bigquery',
                            tableRef['datasetId'],
                            prefix,
                            f'__table__={td["high"]}',
                            part_type="high_watermark",
                            cluster=tableRef['projectId'])

Пример #9

0

Показать файл

    def create_table_watermarks(
        self, table: ScrapedTableMetadata
    ) -> Optional[List[Tuple[Watermark, Watermark]]]:  # noqa c901
        """
        Build watermark pairs from the extreme values of the table's partition columns.

        Returns None when the table has no details or no partition columns. Otherwise returns a list with
        one (high, low) tuple per partition column whose data type is int, float, date or datetime.
        When fetching the bounds fails, the failure is logged and empty strings are used as values.
        """
        def _is_show_partitions_supported(t: ScrapedTableMetadata) -> bool:
            # Probe by running the statement; any failure is logged and read as "not supported".
            try:
                self.spark.sql(f'show partitions {t.schema}.{t.table}')
            except Exception as e:
                # pyspark.sql.utils.AnalysisException: SHOW PARTITIONS is not allowed on a table that is not partitioned
                LOGGER.warning(e)
                return False
            else:
                return True

        def _fetch_minmax(table: ScrapedTableMetadata,
                          partition_column: str) -> Tuple[str, str]:
            # NOTE: returns (max, min), in that order.
            LOGGER.info(
                f'Fetching partition info for {partition_column} in {table.schema}.{table.table}'
            )
            lowest = ""
            highest = ""
            try:
                if is_show_partitions_supported:
                    LOGGER.info('Using SHOW PARTITION')
                    ascending_rows = self.spark.sql(
                        f'show partitions {table.schema}.{table.table}'
                    ).orderBy(partition_column, ascending=True)
                    lowest = str(ascending_rows.first()[partition_column])
                    descending_rows = self.spark.sql(
                        f'show partitions {table.schema}.{table.table}'
                    ).orderBy(partition_column, ascending=False)
                    highest = str(descending_rows.first()[partition_column])
                else:
                    LOGGER.info('Using DESCRIBE EXTENDED')
                    described = self.spark.sql(
                        f'describe extended {table.schema}.{table.table} {partition_column}'
                    ).collect()
                    stats = {
                        row['info_name']: row['info_value']
                        for row in described
                        if row['info_name'] in ['min', 'max']
                    }
                    lowest = stats['min']
                    highest = stats['max']
            except Exception as e:
                # Best effort: whatever was fetched before the failure is kept.
                LOGGER.warning(f'Failed fetching partition watermarks: {e}')
            return highest, lowest

        details = table.table_detail
        if not details:
            LOGGER.info(f'No table details found in {table}, skipping')
            return None

        if 'partitionColumns' not in details or len(details['partitionColumns']) < 1:
            LOGGER.info(f'No partitions found in {table}, skipping')
            return None

        partition_names = details['partitionColumns']

        # Read by _fetch_minmax through its closure, so it must be set before the loop below.
        is_show_partitions_supported: bool = _is_show_partitions_supported(table)

        if not is_show_partitions_supported:
            LOGGER.info('Analyzing table, this can take a while...')
            partition_columns = ','.join(partition_names)
            self.spark.sql(
                f"analyze table {table.schema}.{table.table} compute statistics for columns {partition_columns}"
            )

        # It makes little sense to get watermarks from a string value, with no concept of high and low.
        # Just imagine a dataset with a partition by country...
        valid_types = ['int', 'float', 'date', 'datetime']
        _table_columns = table.columns if table.columns else []
        columns_with_valid_type = [
            column.name
            for column in _table_columns
            if str(column.data_type).lower() in valid_types
        ]

        r = []
        for partition_column in partition_names:
            if partition_column not in columns_with_valid_type:
                continue

            last, first = _fetch_minmax(table, partition_column)
            # Everything except the partition name and type is identical for both watermarks.
            shared = dict(create_time=details['createdAt'],
                          database=self._db,
                          schema=table.schema,
                          table_name=table.table,
                          cluster=self._cluster)
            low = Watermark(part_name=f'{partition_column}={first}',
                            part_type='low_watermark',
                            **shared)
            high = Watermark(part_name=f'{partition_column}={last}',
                             part_type='high_watermark',
                             **shared)
            r.append((high, low))
        return r

Python Watermark.Watermark примеры использования