def _get_partition_watermarks(self, table, tableRef, partitions):
    """
    Build the low and high watermarks of a time-partitioned table.

    The partition with the smallest ``partition_id`` becomes the low
    watermark and the one with the largest ``partition_id`` becomes the
    high watermark.

    :param table: table resource; ``table["timePartitioning"]`` is read to
        find the partitioning field
    :param tableRef: table reference supplying ``projectId``, ``datasetId``
        and ``tableId``
    :param partitions: non-empty sequence of objects exposing
        ``partition_id`` and ``epoch_created`` (``min``/``max`` raise
        ``ValueError`` on an empty sequence)
    :return: tuple ``(low_watermark, high_watermark)``
    """
    # Tables without an explicit partitioning field use "_PARTITIONTIME".
    field = table["timePartitioning"].get("field", "_PARTITIONTIME")

    def _partition_id(partition):
        return partition.partition_id

    def _to_watermark(partition, part_type):
        # epoch_created is handed to fromtimestamp(), i.e. read as epoch
        # seconds and rendered in the local timezone.
        created = datetime.datetime.fromtimestamp(
            float(partition.epoch_created))
        return Watermark(
            created.strftime("%Y-%m-%d %H:%M:%S"),
            "bigquery",
            tableRef["datasetId"],
            tableRef["tableId"],
            "{field}={partition_id}".format(
                field=field, partition_id=partition.partition_id),
            part_type=part_type,
            cluster=tableRef["projectId"],
        )

    low_wm = _to_watermark(
        min(partitions, key=_partition_id), "low_watermark")
    high_wm = _to_watermark(
        max(partitions, key=_partition_id), "high_watermark")
    return low_wm, high_wm
def _get_partition_watermarks(self,
                              table: Dict[str, Any],
                              tableRef: Dict[str, str],
                              partitions: List[PartitionInfo]
                              ) -> Tuple[Watermark, Watermark]:
    """
    Derive the watermark pair of a time-partitioned table.

    :param table: table resource whose ``timePartitioning`` entry names the
        partitioning field (``_PARTITIONTIME`` is used when it names none)
    :param tableRef: reference holding ``projectId``, ``datasetId`` and
        ``tableId`` of the table
    :param partitions: partitions of the table; must not be empty because
        ``min``/``max`` raise ``ValueError`` on an empty list
    :return: the low watermark followed by the high watermark
    """
    partitioning = table['timePartitioning']
    field = partitioning['field'] if 'field' in partitioning else '_PARTITIONTIME'

    def _describe(partition: PartitionInfo, part_type: str) -> Watermark:
        # The creation epoch is formatted through fromtimestamp(), so it is
        # treated as seconds and rendered in local time.
        create_time = datetime.datetime.fromtimestamp(
            float(partition.epoch_created)).strftime('%Y-%m-%d %H:%M:%S')
        return Watermark(create_time,
                         'bigquery',
                         tableRef['datasetId'],
                         tableRef['tableId'],
                         f'{field}={partition.partition_id}',
                         part_type=part_type,
                         cluster=tableRef['projectId'])

    # Partitions are ranked by their id, not by their creation time.
    earliest = min(partitions, key=lambda p: p.partition_id)
    low_wm = _describe(earliest, 'low_watermark')

    latest = max(partitions, key=lambda p: p.partition_id)
    high_wm = _describe(latest, 'high_watermark')

    return low_wm, high_wm
def test_index_with_data(self) -> None:
    """
    Check the watermarks extracted for the ``index_with_data_1`` fixture.

    One low and one high watermark are expected; both share the same
    creation time and differ only in partition name and type.
    """
    extractor = self._get_extractor([self.index_with_data_1])

    created_at = datetime.fromtimestamp(1641863003).strftime('%Y-%m-%d %H:%M:%S')
    # (watermark type, epoch seconds of the expected bound)
    bounds = (
        ('low_watermark', 1641863055),
        ('high_watermark', 1641949455),
    )

    expected = []
    for part_type, epoch in bounds:
        day = datetime.fromtimestamp(epoch).strftime('%Y-%m-%d')
        expected.append(
            Watermark(
                database='elasticsearch',
                cluster='cluster_name',
                schema='schema_name',
                table_name='index_with_data_1',
                create_time=created_at,
                part_name=f"time={day}",
                part_type=part_type,
            )
        )

    self._extract_and_compare(extractor, expected)
def _retrieve_tables(self, dataset):
    # type: (Any) -> Any
    """
    Generate the watermarks of every table in ``dataset`` that passes the
    table regex filter.

    Partitioned tables yield their low and high watermark as soon as they
    are visited. Date-sharded tables are first grouped by prefix across all
    result pages; their watermarks are yielded once every page has been
    read.

    :param dataset: dataset handed to ``_page_table_list_results``
    :return: generator of Watermark objects
    """
    shard_groups = {}

    for page in self._page_table_list_results(dataset):
        if 'tables' not in page:
            continue

        for table in page['tables']:
            tableRef = table['tableReference']
            if not self._is_table_match_regex(tableRef):
                continue

            table_id = tableRef['tableId']

            if not self._is_sharded_table(table_id):
                partitions = self._get_partitions(table, tableRef)
                if partitions:
                    low, high = self._get_partition_watermarks(table, tableRef, partitions)
                    yield low
                    yield high
                continue

            # Table ids ending in an 8 digit date (e.g. ga_sessions_20190101,
            # ga_sessions_20190102, ...) form one date range table; the
            # smallest and largest suffix of a group are its watermarks.
            date_length = BigQueryWatermarkExtractor.DATE_LENGTH
            suffix = table_id[-date_length:]
            prefix = table_id[:-date_length]

            group = shard_groups.get(prefix)
            if group is None:
                # The first shard seen also provides the table metadata
                # (creation time, references) used for the whole group.
                shard_groups[prefix] = {'high': suffix, 'low': suffix, 'table': table}
            else:
                group['low'] = min(group['low'], suffix)
                group['high'] = max(group['high'], suffix)

    for prefix, group in shard_groups.items():
        table = group['table']
        tableRef = table['tableReference']
        # creationTime is divided by 1000 before being read as an epoch
        # in seconds.
        create_time = datetime.datetime.fromtimestamp(
            float(table['creationTime']) / 1000).strftime('%Y-%m-%d %H:%M:%S')

        for bound, part_type in (('low', 'low_watermark'), ('high', 'high_watermark')):
            yield Watermark(
                create_time,
                'bigquery',
                tableRef['datasetId'],
                prefix,
                '__table__={partition_id}'.format(partition_id=group[bound]),
                part_type=part_type,
                cluster=tableRef['projectId']
            )
def setUp(self) -> None:
    """
    Create the watermark under test together with the node and relation
    (raw and serialized) that the test cases compare against.
    """
    super(TestWatermark, self).setUp()

    create_time = '2017-09-18T00:00:00'
    label = 'Watermark'

    self.watermark = Watermark(
        create_time=create_time,
        database=DATABASE,
        schema=SCHEMA,
        table_name=TABLE,
        cluster=CLUSTER,
        part_type=PART_TYPE,
        part_name=NESTED_PART,
    )

    self.start_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}/{PART_TYPE}/'
    self.end_key = f'{DATABASE}://{CLUSTER}.{SCHEMA}/{TABLE}'

    # Attributes shared by the node and by its serialized form.
    node_attributes = {
        'partition_key': 'ds',
        'partition_value': '2017-09-18/feature_id=9',
        'create_time': create_time,
    }

    self.expected_node_result = GraphNode(
        key=self.start_key,
        label=label,
        attributes=dict(node_attributes),
    )
    self.expected_serialized_node_results = [
        {NODE_KEY: self.start_key, NODE_LABEL: label, **node_attributes}
    ]

    self.expected_relation_result = GraphRelationship(
        start_label=label,
        end_label='Table',
        start_key=self.start_key,
        end_key=self.end_key,
        type='BELONG_TO_TABLE',
        reverse_type='WATERMARK',
        attributes={},
    )
    self.expected_serialized_relation_results = [
        {
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: label,
            RELATION_END_KEY: self.end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: 'BELONG_TO_TABLE',
            RELATION_REVERSE_TYPE: 'WATERMARK',
        }
    ]
def setUp(self):
    # type: () -> None
    """
    Create the watermark under test and the serialized node and relation
    dictionaries the test cases compare against.

    The expected keys are built from the lower-cased identifiers.
    """
    super(TestWatermark, self).setUp()

    self.watermark = Watermark(
        create_time='2017-09-18T00:00:00',
        database=DATABASE,
        schema=SCHEMA,
        table_name=TABLE,
        cluster=CLUSTER,
        part_type=PART_TYPE,
        part_name=NESTED_PART
    )

    lowered = {
        'database': DATABASE.lower(),
        'cluster': CLUSTER.lower(),
        'schema': SCHEMA.lower(),
        'table': TABLE.lower(),
        'part_type': PART_TYPE.lower(),
    }
    # str.format ignores keyword arguments the template does not reference,
    # so the same mapping serves both key templates.
    watermark_key = '{database}://{cluster}.{schema}/{table}/{part_type}/'.format(**lowered)
    table_key = '{database}://{cluster}.{schema}/{table}'.format(**lowered)

    self.expected_node_result = {
        NODE_KEY: watermark_key,
        NODE_LABEL: 'Watermark',
        'partition_key': 'ds',
        'partition_value': '2017-09-18/feature_id=9',
        'create_time': '2017-09-18T00:00:00'
    }

    self.expected_relation_result = {
        RELATION_START_KEY: watermark_key,
        RELATION_START_LABEL: 'Watermark',
        RELATION_END_KEY: table_key,
        RELATION_END_LABEL: 'Table',
        RELATION_TYPE: 'BELONG_TO_TABLE',
        RELATION_REVERSE_TYPE: 'WATERMARK'
    }
def _get_extract_iter(self) -> Iterator[Union[Watermark, None]]:
    """
    Yield a low and a high watermark for each index.

    An index is skipped when its creation date, its watermark bounds, or
    either individual bound is unavailable (``None``).
    """

    def _from_millis(epoch_millis: float, pattern: str) -> str:
        # Values are divided by 1000 before being read as epoch seconds;
        # the result is rendered in the local timezone.
        return datetime.fromtimestamp(epoch_millis / 1000).strftime(pattern)

    indices: Dict = self._get_indexes()

    for index_name, index_metadata in indices.items():
        creation_date: Optional[float] = self._get_index_creation_date(index_metadata)
        bounds: Optional[Tuple[float, float]] = self._get_index_watermark_bounds(index_name=index_name)

        if creation_date is None or bounds is None:
            continue
        lower, upper = bounds[0], bounds[1]
        if lower is None or upper is None:
            continue

        # Format everything up front so that nothing is yielded for an
        # index whose timestamps cannot be converted.
        created_at = _from_millis(creation_date, '%Y-%m-%d %H:%M:%S')
        lower_day = _from_millis(lower, '%Y-%m-%d')
        upper_day = _from_millis(upper, '%Y-%m-%d')

        for day, part_type in ((lower_day, 'low_watermark'), (upper_day, 'high_watermark')):
            yield Watermark(
                database=self.database,
                cluster=self.cluster,
                schema=self.schema,
                table_name=index_name,
                create_time=created_at,
                part_name=f'{self._time_field}={day}',
                part_type=part_type
            )
def _retrieve_tables(self, dataset: DatasetRef) -> Iterator[Watermark]:
    """
    Generate watermarks for the tables of ``dataset`` that were created
    before ``self.cutoff_time``.

    Partitioned tables yield their low and high watermark while the table
    list is being paged. Date-sharded tables are grouped by prefix first and
    yield one low and one high watermark per group after the last page.

    :param dataset: dataset whose tables are listed
    :return: iterator over the extracted watermarks
    """
    shard_groups: Dict[str, Dict[str, Union[str, Any]]] = {}

    # The cut-off string is parsed with DATE_TIME_FORMAT and converted with
    # timegm(), i.e. interpreted as UTC.
    cutoff_struct = time.strptime(self.cutoff_time,
                                  BigQueryWatermarkExtractor.DATE_TIME_FORMAT)
    cutoff_time_in_epoch = timegm(cutoff_struct)

    for page in self._page_table_list_results(dataset):
        if 'tables' not in page:
            continue

        for table in page['tables']:
            tableRef = table['tableReference']
            table_id = tableRef['tableId']
            created_epoch = float(table['creationTime']) / 1000

            # Only tables created before the cut-off time are considered.
            if not (created_epoch < cutoff_time_in_epoch):
                continue

            if not self._is_sharded_table(table_id):
                partitions = self._get_partitions(table, tableRef)
                if partitions:
                    low, high = self._get_partition_watermarks(
                        table, tableRef, partitions)
                    yield from (low, high)
                continue

            # Tables whose numeric suffix starts with a date
            # (e.g. ga_sessions_20190101, ga_sessions_20190102, ...) are date
            # range tables; the dates in the suffixes give the watermarks.
            suffix = self._get_sharded_table_suffix(table_id)
            prefix = table_id[:-len(suffix)]
            date = suffix[:BaseBigQueryExtractor.DATE_LENGTH]

            group = shard_groups.get(prefix)
            if group is None:
                # The first shard seen supplies the metadata of the group.
                shard_groups[prefix] = {'high': date, 'low': date, 'table': table}
            else:
                group['low'] = min(group['low'], date)
                group['high'] = max(group['high'], date)

    for prefix, group in shard_groups.items():
        table = group['table']
        tableRef = table['tableReference']
        create_time = datetime.datetime.fromtimestamp(
            float(table['creationTime']) / 1000).strftime('%Y-%m-%d %H:%M:%S')

        for bound, part_type in (('low', 'low_watermark'), ('high', 'high_watermark')):
            yield Watermark(create_time,
                            'bigquery',
                            tableRef['datasetId'],
                            prefix,
                            f'__table__={group[bound]}',
                            part_type=part_type,
                            cluster=tableRef['projectId'])
def create_table_watermarks(self, table: ScrapedTableMetadata) -> Optional[List[Tuple[Watermark, Watermark]]]:  # noqa c901
    """
    Create the watermark objects reflecting the highest and lowest values
    of the partition columns of ``table``.

    Only partition columns typed int, float, date or datetime are
    considered. If the minimum or maximum cannot be fetched, the failure is
    logged and the corresponding watermark carries an empty value.

    :param table: scraped table whose partition columns are inspected
    :return: ``None`` when the table has no details or no partition columns,
        otherwise a list with one ``(high, low)`` watermark tuple per
        eligible partition column
    """
    if not table.table_detail:
        LOGGER.info(f'No table details found in {table}, skipping')
        return None

    if 'partitionColumns' not in table.table_detail or len(
            table.table_detail['partitionColumns']) < 1:
        LOGGER.info(f'No partitions found in {table}, skipping')
        return None

    qualified_name = f'{table.schema}.{table.table}'

    def _show_partitions_works() -> bool:
        try:
            self.spark.sql(f'show partitions {qualified_name}')
        except Exception as e:
            # pyspark.sql.utils.AnalysisException: SHOW PARTITIONS is not allowed on a table that is not partitioned
            LOGGER.warning(e)
            return False
        return True

    def _outermost_partition(partition_column: str, ascending: bool) -> str:
        # First row of SHOW PARTITIONS once ordered by the column.
        ordered = self.spark.sql(f'show partitions {qualified_name}').orderBy(
            partition_column, ascending=ascending)
        return str(ordered.first()[partition_column])

    def _fetch_minmax(partition_column: str) -> Tuple[str, str]:
        # Returns (max, min); a value that could not be fetched stays "".
        LOGGER.info(
            f'Fetching partition info for {partition_column} in {qualified_name}'
        )
        min_water = ""
        max_water = ""
        try:
            if is_show_partitions_supported:
                LOGGER.info('Using SHOW PARTITION')
                min_water = _outermost_partition(partition_column, ascending=True)
                max_water = _outermost_partition(partition_column, ascending=False)
            else:
                LOGGER.info('Using DESCRIBE EXTENDED')
                part_info = self.spark.sql(
                    f'describe extended {qualified_name} {partition_column}'
                ).collect()
                minmax = {
                    row['info_name']: row['info_value']
                    for row in part_info
                    if row['info_name'] in ['min', 'max']
                }
                min_water = minmax['min']
                max_water = minmax['max']
        except Exception as e:
            LOGGER.warning(f'Failed fetching partition watermarks: {e}')
        return max_water, min_water

    def _build_watermark(part_name: str, part_type: str) -> Watermark:
        return Watermark(create_time=table.table_detail['createdAt'],
                         database=self._db,
                         schema=table.schema,
                         table_name=table.table,
                         part_name=part_name,
                         part_type=part_type,
                         cluster=self._cluster)

    is_show_partitions_supported: bool = _show_partitions_works()
    if not is_show_partitions_supported:
        # The DESCRIBE EXTENDED fallback reads column min/max statistics,
        # so they are computed first.
        LOGGER.info('Analyzing table, this can take a while...')
        partition_columns = ','.join(table.table_detail['partitionColumns'])
        self.spark.sql(
            f"analyze table {qualified_name} compute statistics for columns {partition_columns}"
        )

    # It makes little sense to get watermarks from a string value, with no
    # concept of high and low. Just imagine a dataset with a partition by
    # country...
    valid_types = ['int', 'float', 'date', 'datetime']
    columns_with_valid_type = [
        column.name
        for column in (table.columns or [])
        if str(column.data_type).lower() in valid_types
    ]

    watermark_pairs: List[Tuple[Watermark, Watermark]] = []
    for partition_column in table.table_detail['partitionColumns']:
        if partition_column not in columns_with_valid_type:
            continue
        last, first = _fetch_minmax(partition_column)
        low = _build_watermark(f'{partition_column}={first}', 'low_watermark')
        high = _build_watermark(f'{partition_column}={last}', 'high_watermark')
        watermark_pairs.append((high, low))
    return watermark_pairs