def test_make_column_schema_for_jaql_formula_should_process_all_parts(
        self, mock_make_column_schema_for_jaql):
    """The formula column must hold one subcolumn per formula part."""
    # Each bracketed name used by the formula has a matching context entry.
    formula_context = {
        '[OrderDateYears]': {
            'dim': '[Orders.OrderDate (Calendar)]',
            'level': 'years',
        },
        '[CountOrderID]': {
            'dim': '[Orders.OrderID]',
            'agg': 'count',
        },
    }
    jaql_metadata = {
        'formula': 'AVG([OrderDateYears], [CountOrderID])',
        'context': formula_context,
        'title': 'AVG test',
    }

    # The mocked per-part factory hands back an empty column.
    mock_make_column_schema_for_jaql.return_value = \
        datacatalog.ColumnSchema()

    formula_column = self.__factory \
        ._DataCatalogEntryFactory__make_column_schema_for_jaql_formula(
            jaql_metadata)

    self.assertEqual('formula', formula_column.column)
    self.assertEqual('array', formula_column.type)
    self.assertEqual('The AVG test formula', formula_column.description)
    self.assertEqual(2, len(formula_column.subcolumns))
def test_make_column_schema_for_jaql_filter_by_should_process_all_fields(
        self, mock_make_column_schema_for_jaql):
    """A nested ``filter.by`` must become a single-subcolumn column."""
    nested_filter_by = {
        'dim': '[TableB.ColumnB]',
        'datatype': 'numeric',
        'agg': 'sum',
        'title': 'Test Table and Column B',
    }
    jaql_metadata = {
        'dim': '[TableA.ColumnA]',
        'datatype': 'text',
        'title': 'Test Table and Column A',
        'filter': {
            'by': nested_filter_by,
        },
    }

    # The mocked nested-column factory hands back an empty column.
    mock_make_column_schema_for_jaql.return_value = \
        datacatalog.ColumnSchema()

    filter_by_column = self.__factory \
        ._DataCatalogEntryFactory__make_column_schema_for_jaql_filter_by(
            jaql_metadata)

    self.assertEqual('filterBy', filter_by_column.column)
    self.assertEqual('array', filter_by_column.type)
    self.assertEqual('The Test Table and Column A nested filter',
                     filter_by_column.description)
    self.assertEqual(1, len(filter_by_column.subcolumns))
def __make_filters_column_for_widget(
        cls, widget_metadata: Dict[str, Any]) -> Optional[ColumnSchema]:
    """Build the column that lists the filters of a widget.

    :param widget_metadata: widget dict; panels are read from
        ``widget_metadata['metadata']['panels']``.
    :return: an ``array`` column with one subcolumn per item of the first
        panel named ``constants.WIDGET_FILTERS_PANEL_NAME``, or ``None``
        when there are no panels, no such panel, or no items in it.
    """
    widget_details = widget_metadata.get('metadata')
    if not widget_details or not widget_details.get('panels'):
        return None

    # Only the first panel carrying the filters name is considered.
    filter_items = None
    for panel in widget_details['panels']:
        if panel.get('name') == constants.WIDGET_FILTERS_PANEL_NAME:
            filter_items = panel.get('items')
            break

    if not filter_items:
        return None

    widget_filters_column = datacatalog.ColumnSchema()
    widget_filters_column.column = constants.ENTRY_COLUMN_FILTERS
    widget_filters_column.type = 'array'
    widget_filters_column.description = 'The Widget filters'

    for filter_item in filter_items:
        jaql_column = cls.__make_column_schema_for_jaql(
            filter_item.get('jaql'))
        widget_filters_column.subcolumns.append(jaql_column)

    if widget_filters_column.subcolumns:
        return widget_filters_column
    return None
def make_entry_for_tables(self, table, table_container_name):
    """Create a Data Catalog entry from a table dict.

    :param table: table dict; its ``name`` and ``columns`` keys are
        required, ``desc``, ``create_time`` and ``update_time`` are
        optional.
    :param table_container_name: name joined with the table name (using
        ``__``) to build the entry id.
    :return: entry_id, entry
    """
    table_name = table['name']
    entry_id = self._format_id(
        '{}__{}'.format(table_container_name, table_name))

    entry = datacatalog.Entry()
    entry.user_specified_type = \
        self.__metadata_definition['table_def']['type']
    entry.user_specified_system = self.__entry_group_id
    entry.display_name = self._format_display_name(table_name)
    entry.name = datacatalog.DataCatalogClient.entry_path(
        self.__project_id, self.__location_id, self.__entry_group_id,
        entry_id)

    # A missing (NA) description is stored as an empty string.
    table_desc = table.get('desc')
    entry.description = '' if pd.isna(table_desc) else table_desc

    entry.linked_resource = '//{}//{}'.format(
        self.__metadata_host_server, self._format_id(table_name))

    create_time, update_time = \
        DataCatalogEntryFactory.__convert_source_system_timestamp_fields(
            table.get('create_time'), table.get('update_time'))

    # Timestamps are only written when both values are available.
    if create_time and update_time:
        created_timestamp = timestamp_pb2.Timestamp()
        created_timestamp.FromSeconds(create_time)
        entry.source_system_timestamps.create_time = created_timestamp

        updated_timestamp = timestamp_pb2.Timestamp()
        updated_timestamp.FromSeconds(update_time)
        entry.source_system_timestamps.update_time = updated_timestamp

    column_schemas = []
    for table_column in table['columns']:
        column_desc = table_column.get('desc')
        if pd.isna(column_desc):
            column_desc = ''
        column_schema = datacatalog.ColumnSchema(
            column=self._format_id(table_column['name']),
            description=column_desc,
            type=DataCatalogEntryFactory.__format_entry_column_type(
                table_column['type']))
        column_schemas.append(column_schema)
    entry.schema.columns.extend(column_schemas)

    return entry_id, entry
def make_entry_for_table(self, table_metadata, database_name):
    """Create a Data Catalog entry for a hive table.

    :param table_metadata: table object; its ``name``, ``create_time``
        and ``table_storages`` attributes are read here.
    :param database_name: name joined with the table name (using ``__``)
        to build the entry id.
    :return: entry_id, entry
    """
    table_name = table_metadata.name
    # Force lowercase since hive is case insensitive
    entry_id = '{}__{}'.format(database_name, table_name).lower()

    entry = datacatalog.Entry()
    entry.user_specified_type = 'table'
    entry.user_specified_system = 'hive'
    entry.display_name = table_name
    entry.name = datacatalog.DataCatalogClient.entry_path(
        self.__project_id, self.__location_id, self.__entry_group_id,
        entry_id)

    # For now we are using the first table_storage relationship,
    # with table partitions we might have to deal
    # with more than one record
    first_storage = table_metadata.table_storages[0]
    entry.linked_resource = '//{}//{}'.format(
        self.__metadata_host_server, first_storage.location)

    created_timestamp = timestamp_pb2.Timestamp()
    created_timestamp.FromSeconds(table_metadata.create_time)
    entry.source_system_timestamps.create_time = created_timestamp

    update_time_seconds = \
        DataCatalogEntryFactory. \
        __extract_update_time_from_table_metadata(table_metadata)
    if update_time_seconds is None:
        # No update time available: reuse the creation timestamp.
        entry.source_system_timestamps.update_time = created_timestamp
    else:
        updated_timestamp = timestamp_pb2.Timestamp()
        updated_timestamp.FromSeconds(update_time_seconds)
        entry.source_system_timestamps.update_time = updated_timestamp

    entry.schema.columns.extend([
        datacatalog.ColumnSchema(
            column=storage_column.name,
            type=DataCatalogEntryFactory.__format_entry_column_type(
                storage_column.type),
            description=storage_column.comment)
        for storage_column in first_storage.columns
    ])

    return entry_id, entry
def test_make_schema_for_widget_make_filters_column(
        self, mock_make_filters_column_for_widget):
    """The widget schema must expose the column built for its filters."""
    widget_metadata = {'metadata': {'panels': [{}]}}

    filters_column = datacatalog.ColumnSchema()
    filters_column.column = 'test'
    mock_make_filters_column_for_widget.return_value = filters_column

    widget_schema = self.__factory \
        ._DataCatalogEntryFactory__make_schema_for_widget(widget_metadata)

    # The filters factory must receive the widget metadata as given.
    mock_make_filters_column_for_widget.assert_called_once_with(
        widget_metadata)
    self.assertEqual(filters_column, widget_schema.columns[0])
def make_entry_for_table(self, table_metadata, database_name):
    """Create a Data Catalog entry for a hive table.

    :param table_metadata: table object; its ``name``, ``create_time``
        and ``table_storages`` attributes are read here.
    :param database_name: passed with the table metadata to the entry id
        builder.
    :return: entry_id, entry
    """
    entry_id = self.__make_entry_id_for_table(database_name,
                                              table_metadata)

    entry = datacatalog.Entry()
    entry.user_specified_type = 'table'
    entry.user_specified_system = 'hive'
    entry.display_name = self._format_display_name(table_metadata.name)
    entry.name = datacatalog.DataCatalogClient.entry_path(
        self.__project_id, self.__location_id, self.__entry_group_id,
        entry_id)

    # Only the first storage record of the table is used.
    first_storage = table_metadata.table_storages[0]
    raw_linked_resource = '//{}//{}'.format(self.__metadata_host_server,
                                            first_storage.location)
    entry.linked_resource = \
        self._format_linked_resource(raw_linked_resource)

    created_timestamp = timestamp_pb2.Timestamp()
    created_timestamp.FromSeconds(table_metadata.create_time)
    entry.source_system_timestamps.create_time = created_timestamp

    update_time_seconds = \
        DataCatalogEntryFactory. \
        __extract_update_time_from_table_metadata(table_metadata)
    if update_time_seconds is None:
        # No update time available: reuse the creation timestamp.
        entry.source_system_timestamps.update_time = created_timestamp
    else:
        updated_timestamp = timestamp_pb2.Timestamp()
        updated_timestamp.FromSeconds(update_time_seconds)
        entry.source_system_timestamps.update_time = updated_timestamp

    column_schemas = [
        datacatalog.ColumnSchema(
            column=storage_column.name,
            type=DataCatalogEntryFactory.__format_entry_column_type(
                storage_column.type),
            description=storage_column.comment)
        for storage_column in first_storage.columns
    ]
    entry.schema.columns.extend(column_schemas)

    return entry_id, entry
def test_make_column_schema_for_jaql_should_set_all_available_fields(
        self, mock_make_column_schema_for_jaql_formula,
        mock_make_column_schema_for_jaql_filter_by):
    """Name and type come from the JAQL; both nested factories run once."""
    jaql_metadata = {'datatype': 'datetime', 'title': 'TEST'}

    formula_column = datacatalog.ColumnSchema()
    formula_column.column = 'formula'
    mock_make_column_schema_for_jaql_formula.return_value = formula_column

    filter_by_column = datacatalog.ColumnSchema()
    filter_by_column.column = 'filterBy'
    mock_make_column_schema_for_jaql_filter_by.return_value = \
        filter_by_column

    jaql_column = self.__factory \
        ._DataCatalogEntryFactory__make_column_schema_for_jaql(
            jaql_metadata)

    self.assertEqual('TEST', jaql_column.column)
    self.assertEqual('datetime', jaql_column.type)
    mock_make_column_schema_for_jaql_formula.assert_called_once_with(
        jaql_metadata)
    mock_make_column_schema_for_jaql_filter_by.assert_called_once_with(
        jaql_metadata)
def test_make_schema_for_dashboard_should_make_filters_column(
        self, mock_make_column_schema_for_jaql):
    """Dashboard filters must end up as subcolumns of ``filters``."""
    jaql_metadata = {'datatype': 'datetime', 'title': 'TEST'}
    dashboard_metadata = {'filters': [{'jaql': jaql_metadata}]}

    jaql_column = datacatalog.ColumnSchema()
    mock_make_column_schema_for_jaql.return_value = jaql_column

    dashboard_schema = self.__factory \
        ._DataCatalogEntryFactory__make_schema_for_dashboard(
            dashboard_metadata)

    filters_column = dashboard_schema.columns[0]
    self.assertEqual('filters', filters_column.column)
    mock_make_column_schema_for_jaql.assert_called_once_with(jaql_metadata)
    self.assertEqual(jaql_column,
                     dashboard_schema.columns[0].subcolumns[0])
def __make_column_schema_for_jaql_filter_by(
        cls, jaql_metadata: Dict[str, Any]) -> Optional[ColumnSchema]:
    """Build the column describing the nested ``filter by`` of a JAQL.

    :param jaql_metadata: JAQL dict; the nested definition is read from
        its ``constants.JAQL_FILTER_FIELD_NAME`` entry, under the
        ``constants.JAQL_FILTER_BY_FIELD_NAME`` key.
    :return: an ``array`` column whose single subcolumn is built from the
        nested definition, or ``None`` when the filter or its nested
        definition is missing or empty.
    """
    # An absent or empty filter behaves like one without a nested part.
    nested_definition = (
        jaql_metadata.get(constants.JAQL_FILTER_FIELD_NAME) or
        {}).get(constants.JAQL_FILTER_BY_FIELD_NAME)
    if not nested_definition:
        return None

    title = jaql_metadata.get('title')

    filter_by_column = datacatalog.ColumnSchema()
    filter_by_column.column = constants.ENTRY_COLUMN_FILTER_BY
    filter_by_column.type = 'array'
    filter_by_column.description = f'The {title} nested filter'

    nested_column = cls.__make_column_schema_for_jaql(nested_definition)
    filter_by_column.subcolumns.append(nested_column)

    return filter_by_column
def __make_column_schema_for_jaql_formula(
        cls, jaql_metadata: Dict[str, Any]) -> Optional[ColumnSchema]:
    """Build the column describing the formula of a JAQL.

    :param jaql_metadata: JAQL dict; the formula text and its context are
        read from the ``constants.JAQL_FORMULA_FIELD_NAME`` and
        ``constants.JAQL_CONTEXT_FIELD_NAME`` entries.
    :return: an ``array`` column with one subcolumn per bracketed formula
        part that can be resolved in the context, or ``None`` when the
        formula or the context is missing or empty.
    """
    formula = jaql_metadata.get(constants.JAQL_FORMULA_FIELD_NAME)
    context = jaql_metadata.get(constants.JAQL_CONTEXT_FIELD_NAME)
    if not (formula and context):
        return None

    column = datacatalog.ColumnSchema()
    column.column = constants.ENTRY_COLUMN_FORMULA
    column.type = 'array'
    column.description = \
        f'The {jaql_metadata.get("title")} formula'

    # Every ``[name]`` token in the formula text is a candidate part; the
    # context is keyed by the token including its brackets.
    for part in re.findall(r'\[(.*?)]', formula):
        part_metadata = context.get(f'[{part}]')
        if part_metadata is None:
            # A token without a context entry cannot be described; passing
            # None on would fail in the nested column factory, so skip it.
            continue
        column.subcolumns.append(
            cls.__make_column_schema_for_jaql(part_metadata))

    return column
def __create_schema(cls, entry, columns):
    """Fill ``entry.schema.columns`` from a list of column dicts.

    :param entry: entry whose ``schema.columns`` is extended in place.
    :param columns: iterable of dicts (may be empty or ``None``); each
        one is expected to carry its attributes under ``data`` ->
        ``attributes``. Items without ``data``, without a resolvable data
        type, or without a name are ignored.
    """
    normalizer = attr_normalizer.DataCatalogAttributeNormalizer

    column_schemas = []
    for column in columns or []:
        column_data = column.get('data')
        if not column_data:
            continue

        attributes = column_data.get('attributes')
        data_type = normalizer.get_column_data_type(attributes)
        raw_name = attributes.get('name')
        comment = attributes.get('comment')
        if not (data_type and raw_name):
            continue

        column_schemas.append(
            datacatalog.ColumnSchema(
                column=normalizer.format_name(raw_name),
                description=comment,
                type=data_type))

    entry.schema.columns.extend(column_schemas)
def __make_column_schema_for_jaql(
        cls, jaql_metadata: Dict[str, Any]) -> ColumnSchema:
    """Build the column that represents a single JAQL definition.

    The column name is the formatted ``title``; the type is the JAQL
    ``datatype``, falling back to ``type`` and then to ``'unknown'``.
    Formula and nested-filter columns are attached as subcolumns, in that
    order, when their factories return a truthy column.

    :param jaql_metadata: the JAQL dict to describe.
    :return: the resulting column.
    """
    strings_helper = \
        sisense_connector_strings_helper.SisenseConnectorStringsHelper

    jaql_column = datacatalog.ColumnSchema()
    jaql_column.column = strings_helper.format_column_name(
        jaql_metadata.get('title'))

    column_type = jaql_metadata.get('datatype')
    if not column_type:
        column_type = jaql_metadata.get('type') or 'unknown'
    jaql_column.type = column_type

    formula_column = cls.__make_column_schema_for_jaql_formula(
        jaql_metadata)
    if formula_column:
        jaql_column.subcolumns.append(formula_column)

    nested_filter_column = cls.__make_column_schema_for_jaql_filter_by(
        jaql_metadata)
    if nested_filter_column:
        jaql_column.subcolumns.append(nested_filter_column)

    return jaql_column
def __make_schema_for_dashboard(
        cls, dashboard_metadata: Dict[str, Any]) -> Optional[Schema]:
    """Build the schema of a dashboard entry from its filters.

    :param dashboard_metadata: dashboard dict; its filters are read from
        the ``constants.DASHBOARD_FILTERS_FIELD_NAME`` entry.
    :return: a schema whose single ``array`` column holds one subcolumn
        per dashboard filter, or ``None`` when the dashboard has no
        filters.
    """
    # Read the filters once, through the same key for both the guard and
    # the loop, so the two can never disagree on where the filters live.
    dashboard_filters = dashboard_metadata.get(
        constants.DASHBOARD_FILTERS_FIELD_NAME)
    if not dashboard_filters:
        return None

    filters_column = datacatalog.ColumnSchema()
    filters_column.column = constants.ENTRY_COLUMN_FILTERS
    filters_column.type = 'array'
    filters_column.description = 'The Dashboard filters'

    for dashboard_filter in dashboard_filters:
        filters_column.subcolumns.append(
            cls.__make_column_schema_for_jaql(
                dashboard_filter.get('jaql')))

    schema = datacatalog.Schema()
    schema.columns.append(filters_column)

    return schema
def test_make_filters_column_for_widget_should_return_column(
        self, mock_make_column_schema_for_jaql):
    """Items of the ``filters`` panel must become filter subcolumns."""
    jaql_metadata = {'datatype': 'datetime', 'title': 'TEST'}
    filters_panel = {
        'name': 'filters',
        'items': [{
            'jaql': jaql_metadata
        }]
    }
    widget_metadata = {'metadata': {'panels': [filters_panel]}}

    jaql_column = datacatalog.ColumnSchema()
    mock_make_column_schema_for_jaql.return_value = jaql_column

    filters_column = self.__factory \
        ._DataCatalogEntryFactory__make_filters_column_for_widget(
            widget_metadata)

    mock_make_column_schema_for_jaql.assert_called_once_with(jaql_metadata)
    self.assertEqual(jaql_column, filters_column.subcolumns[0])
def __make_fields_column_for_widget(
        cls, widget_metadata: Dict[str, Any]) -> Optional[ColumnSchema]:
    """Build the column that lists the fields of a widget.

    Every panel except the one named
    ``constants.WIDGET_FILTERS_PANEL_NAME`` contributes its items.

    :param widget_metadata: widget dict; panels are read from
        ``widget_metadata['metadata']['panels']``.
    :return: an ``array`` column with one subcolumn per panel item, or
        ``None`` when there are no panels or no items in them.
    """
    panels = (widget_metadata.get('metadata') or {}).get('panels')
    if not panels:
        return None

    fields_column = datacatalog.ColumnSchema()
    fields_column.column = constants.ENTRY_COLUMN_FIELDS
    fields_column.type = 'array'
    fields_column.description = 'The Widget fields'

    for panel in panels:
        if panel.get('name') == constants.WIDGET_FILTERS_PANEL_NAME:
            continue
        # A panel may have no ``items`` key (or a null one); iterate an
        # empty list in that case instead of failing on ``None``.
        for item in panel.get('items') or []:
            fields_column.subcolumns.append(
                cls.__make_column_schema_for_jaql(item.get('jaql')))

    return fields_column if fields_column.subcolumns else None
def make_entry_for_tables(self, table, table_container_name):
    """Create a Data Catalog entry from a table dict.

    :param table: table dict; its ``name`` and ``columns`` keys are
        required, ``desc``, ``create_time``, ``update_time`` and the
        ``constants.TABLE_TYPE_KEY`` entry are optional.
    :param table_container_name: name joined with the table name (using
        ``__``) to build the entry id; also part of the linked resource.
    :return: entry_id, entry
    """
    table_name = table['name']
    entry_id = self._format_id(
        '{}__{}'.format(table_container_name, table_name))

    entry = datacatalog.Entry()

    # some RDBMS' store views and tables definitions in the same
    # system table, and the name is not user friendly, so we only
    # keep it if it's a VIEW type.
    source_type = table.get(constants.TABLE_TYPE_KEY)
    is_view = bool(source_type) and \
        source_type.lower() == constants.VIEW_TYPE_VALUE
    if is_view:
        entry.user_specified_type = source_type.lower()
    else:
        entry.user_specified_type = \
            self.__metadata_definition['table_def']['type']

    entry.user_specified_system = self.__entry_group_id
    entry.display_name = self._format_display_name(table_name)
    entry.name = datacatalog.DataCatalogClient.entry_path(
        self.__project_id, self.__location_id, self.__entry_group_id,
        entry_id)

    # A missing (NA) description is stored as an empty string.
    table_desc = table.get('desc')
    entry.description = '' if pd.isna(table_desc) else table_desc

    entry.linked_resource = '{}/{}/{}'.format(
        self.__entry_resource_url_prefix, table_container_name,
        self._format_id(table_name))

    create_time, update_time = \
        DataCatalogEntryFactory.__convert_source_system_timestamp_fields(
            table.get('create_time'), table.get('update_time'))

    # Timestamps are only written when both values are available.
    if create_time and update_time:
        created_timestamp = timestamp_pb2.Timestamp()
        created_timestamp.FromSeconds(create_time)
        entry.source_system_timestamps.create_time = created_timestamp

        updated_timestamp = timestamp_pb2.Timestamp()
        updated_timestamp.FromSeconds(update_time)
        entry.source_system_timestamps.update_time = updated_timestamp

    column_schemas = []
    for table_column in table['columns']:
        column_desc = table_column.get('desc')
        if pd.isna(column_desc):
            column_desc = ''
        column_schema = datacatalog.ColumnSchema(
            column=self._format_id(table_column['name']),
            description=column_desc,
            type=DataCatalogEntryFactory.__format_entry_column_type(
                table_column['type']))
        column_schemas.append(column_schema)
    entry.schema.columns.extend(column_schemas)

    return entry_id, entry
def create_column_schema(cls, name, column_type, description, mode=None):
    """Build a ``ColumnSchema`` from the given values.

    :param name: value for the ``column`` field.
    :param column_type: value for the ``type`` field.
    :param description: value for the ``description`` field.
    :param mode: value for the ``mode`` field; defaults to ``None``.
    :return: the new ``datacatalog.ColumnSchema``.
    """
    schema_fields = {
        'column': name,
        'type': column_type,
        'description': description,
        'mode': mode,
    }
    return datacatalog.ColumnSchema(**schema_fields)