class ModelAttribute(SchematicsModel):
    """Attribute of a model: a taxon plus the TEL transformation that yields its value."""

    taxon: str = NonEmptyStringType(required=True)
    identifier: bool = BooleanType(default=False)

    tel_transformation: str = NonEmptyStringType(required=True)
    """
    TEL expression describing the transformation of this model attribute.
    """

    quantity_type: ValueQuantityType = EnumType(ValueQuantityType, default=ValueQuantityType.scalar)

    @memoized_property
    def taxon_memoized(self):
        """
        Memoized view of ``taxon``; the value is cached on the instance after the first access.

        :return: value of ``self.taxon``
        """
        return self.taxon

    @memoized_property
    def identifier_memoized(self):
        """
        Memoized view of ``identifier``; the value is cached on the instance after the first access.

        :return: value of ``self.identifier``
        """
        return self.identifier

    def __repr__(self):
        """Delegate the textual representation to ``serialize_class_with_props``."""
        return serialize_class_with_props(self)

    def __hash__(self):
        """Hash the attribute by the string form of its (memoized) taxon."""
        taxon_text = str(self.taxon_memoized)
        return hash(taxon_text)
class TaxonTaxonFilterClause(FilterClause):
    """Filter clause comparing two columns with a simple operator."""

    taxon: str = StringType(required=True, min_length=1)
    """Name of the column on the left side"""

    right_taxon: str = StringType(required=True, min_length=1)
    """Name of the column on the right side"""

    operator: SimpleFilterOperator = EnumType(SimpleFilterOperator, required=True)
    """Operator placed between the two columns"""

    def generate(self, ctx, query, taxon_model_info_map: Dict[str, TaxonModelInfo]) -> ClauseElement:
        """Build the clause by delegating to ``_generate_simple_operator_clause`` (``query`` is not used here)."""
        return _generate_simple_operator_clause(ctx, self, taxon_model_info_map)

    def get_taxon_slugs(self) -> Set[str]:
        """Return the slugs of both columns referenced by this clause."""
        slugs = {self.taxon}
        slugs.add(self.right_taxon)
        return slugs

    @staticmethod
    def _claim_polymorphic(data: Dict[str, Any]) -> Optional[Type['FilterClause']]:
        """Claim ``data`` for this class when its type resolves to ``FilterClauseType.TAXON_TAXON``."""
        return FilterClause._detect_filter_clause_type(
            data,
            expected_type=FilterClauseType.TAXON_TAXON,
            filter_clause=TaxonTaxonFilterClause,
        )
class TaxonValueFilterClause(FilterClause):
    """Filter clause comparing one column with one value using a simple operator."""

    taxon: str = StringType(required=True, min_length=1)
    """Name of the column used in the clause"""

    operator: SimpleFilterOperator = EnumType(SimpleFilterOperator, required=True)
    """Operator placed between the column and the value"""

    value: Optional[Union[str, float, bool]] = UnionNoConversionType((IntType, FloatType, StringType, BooleanType))
    """Value the column is compared with"""

    def generate(self, ctx, query, taxon_model_info_map: Dict[str, TaxonModelInfo]) -> ClauseElement:
        """Build the clause by delegating to ``_generate_simple_operator_clause`` (``query`` is not used here)."""
        return _generate_simple_operator_clause(ctx, self, taxon_model_info_map)

    def get_taxon_slugs(self) -> Set[str]:
        """Return a set holding the single column slug of this clause."""
        slugs: Set[str] = set()
        slugs.add(self.taxon)
        return slugs

    @staticmethod
    def _claim_polymorphic(data: Dict[str, Any]) -> Optional[Type['FilterClause']]:
        """Claim ``data`` for this class when its type resolves to ``FilterClauseType.TAXON_VALUE``."""
        return FilterClause._detect_filter_clause_type(
            data,
            expected_type=FilterClauseType.TAXON_VALUE,
            filter_clause=TaxonValueFilterClause,
        )
class Scope(schematics.Model):
    """Scope of a Husky request: company, project, model visibility and optional pre-aggregation filters."""

    company_id: str = StringType(required=True)
    """
    In the past, company_id=None stood for the global scope. Now the support company id should be used,
    the same way as with taxons.
    """

    project_id: Optional[str] = StringType()

    model_visibility: ModelVisibility = EnumType(ModelVisibility, default=ModelVisibility.available)
    """
    Production consumers should never set this field.
    Set it to experimental when you want to test new changes / models.
    """

    preaggregation_filters = PolyModelType(get_all_filter_clauses())
    """Optional pre-aggregation filters determining the scope of this Husky request"""

    @property
    def all_filters(self):
        """
        Current pre-aggregation scope filters (same object as ``preaggregation_filters``).
        """
        return self.preaggregation_filters

    def __hash__(self) -> int:
        """Hash built from project id, company id and model visibility, joined with underscores."""
        key_parts = (self.project_id, self.company_id, self.model_visibility)
        return hash('_'.join(map(str, key_parts)))
class ApiScope(schematics.Model):
    """Scope structure with optional company id, project id, model visibility and pre-aggregation filters."""

    company_id: str = StringType()

    project_id: Optional[str] = StringType()

    model_visibility: ModelVisibility = EnumType(ModelVisibility, default=ModelVisibility.available)
    """
    Production consumers should never set this field.
    Set it to experimental when you want to test new changes / models.
    """

    preaggregation_filters = PolyModelType(get_all_filter_clauses())
    """Pre-aggregation filters determining the scope of this Husky request"""
class TaxonArrayFilterClause(FilterClause):
    """Filter clause representing an array-like SQL clause (such as IN)."""

    taxon: str = StringType(required=True)
    """Name of the column used in the clause"""

    operator: ArrayFilterOperator = EnumType(ArrayFilterOperator, required=True)
    """Array operator used by the clause"""

    value: Iterable[Union[str, float]] = ListType(StringType(required=True, min_length=1), min_size=1)
    """Values the column is matched against"""

    def generate(self, ctx, query, taxon_model_info_map: Dict[str, TaxonModelInfo]) -> ClauseElement:
        """
        Build the SQLAlchemy expression for this clause.

        :param ctx: not used by this implementation
        :param query: not used by this implementation
        :param taxon_model_info_map: lookup of taxon model info, indexed by ``self.taxon``
        :return: membership expression, wrapped in ``not_`` for ``NOT_IN``
        :raises UnknownOperator: when the operator is neither ``IN`` nor ``NOT_IN``
        """
        info = taxon_model_info_map[self.taxon]
        column = literal_column(info.taxon_sql_accessor)
        taxon_is_array = info.is_array

        if self.operator is ArrayFilterOperator.IN:
            negated = False
        elif self.operator is ArrayFilterOperator.NOT_IN:
            negated = True
        else:
            raise UnknownOperator(self)

        if taxon_is_array:
            # For an array taxon the IN operator is replaced by an overlap check
            # between the requested values and the taxon value (list).
            membership = func.arrays_overlap(func.array_construct(self.value), column)
        else:
            membership = column.in_(self.sql_value)

        return not_(membership) if negated else membership

    def get_taxon_slugs(self) -> Set[str]:
        """Return a set holding the single column slug of this clause."""
        slugs: Set[str] = set()
        slugs.add(self.taxon)
        return slugs

    @property
    def sql_value(self):
        """Values of the clause, each wrapped with ``literal``."""
        return list(map(literal, self.value))

    @staticmethod
    def _claim_polymorphic(data: Dict[str, Any]) -> Optional[Type['FilterClause']]:
        """Claim ``data`` for this class when its type resolves to ``FilterClauseType.TAXON_ARRAY``."""
        return FilterClause._detect_filter_clause_type(
            data,
            expected_type=FilterClauseType.TAXON_ARRAY,
            filter_clause=TaxonArrayFilterClause,
        )
class FilterClause(Model):
    """Base for simple and nested filter clauses.

    NOTE: Class Model is already an abstract class, so this class does not have to declare it again.
    """

    type: FilterClauseType = EnumType(FilterClauseType, required=True)
    """ Kind of the filter clause """

    @abstractmethod
    def generate(self, ctx, query: Select, taxon_model_info_map: Dict[str, TaxonModelInfo]) -> ClauseElement:
        """ Produce the SQL Alchemy representation of this filter clause"""
        raise NotImplementedError('Not implemented')

    @abstractmethod
    def get_taxon_slugs(self) -> Set[str]:
        """ Collect all taxons used in this filter clause (including every clause nested within it) """
        raise NotImplementedError('Not implemented')

    @staticmethod
    @abstractmethod
    def _claim_polymorphic(data: Dict[str, Any]) -> Optional[Type['FilterClause']]:
        """
        Hook used by PolyModelType to decide which model should represent the structure during conversion.
        Using .claim_polymorphic() would also work, but that method would have to know about all possible
        models, which is not ideal, so this protected method is overridden instead.

        :param data: Data to be converted
        :return: Model class representing the data
        """
        raise NotImplementedError('Not implemented')

    @staticmethod
    def _detect_filter_clause_type(
        data: Dict[str, Any], expected_type: FilterClauseType, filter_clause: Type['FilterClause']
    ) -> Optional[Type['FilterClause']]:
        """
        Return ``filter_clause`` when ``data['type']`` resolves to ``expected_type``.

        :param data: Data to be converted
        :param expected_type: Clause type the candidate class stands for
        :param filter_clause: Candidate class returned on a match
        :return: ``filter_clause`` on a match; ``None`` when the 'type' key is missing or does not match
        """
        if 'type' in data:
            detected_type = EnumHelper.from_value_safe(FilterClauseType, data['type'])
            if detected_type == expected_type:
                return filter_clause

        return None
class GroupFilterClause(FilterClause):
    """Filter clause grouping several filter clauses connected by a logical operator."""

    logical_operator: LogicalOperator = EnumType(LogicalOperator, required=True)
    """ Logical operator placed between the clauses"""

    clauses: List[FilterClause] = ListType(
        PolyModelType(['GroupFilterClause', TaxonValueFilterClause, TaxonTaxonFilterClause, TaxonArrayFilterClause]),
        required=True,
        min_size=1,
    )
    """ Clauses belonging to the group """

    negate: bool = BooleanType(default=False)
    """ Whether the whole group of clauses is negated """

    def generate(self, ctx, query: Select, taxon_model_info_map: Dict[str, TaxonModelInfo]) -> ClauseElement:
        """
        Build the SQLAlchemy expression for the whole group.

        :param ctx: passed through to the nested clauses
        :param query: passed through to the nested clauses
        :param taxon_model_info_map: passed through to the nested clauses
        :return: self-grouped combination of the nested clauses, wrapped in ``not_`` when ``negate`` is set
        :raises UnknownOperator: when the logical operator has no entry in ``GROUP_OPERATORS_FUNCTIONS``
        """
        operator_key = self.logical_operator
        if operator_key not in GROUP_OPERATORS_FUNCTIONS:
            raise UnknownOperator(self)

        combine = GROUP_OPERATORS_FUNCTIONS[operator_key]
        # The nested clauses are handed over as one lazy generator argument.
        generated = (nested.generate(ctx, query, taxon_model_info_map) for nested in self.clauses)
        grouped = combine(generated).self_group()

        return not_(grouped) if self.negate else grouped

    def get_taxon_slugs(self) -> Set[str]:
        """Return the union of taxon slugs used by all nested clauses."""
        slugs: Set[str] = set()
        for nested in self.clauses:
            slugs.update(nested.get_taxon_slugs())
        return slugs

    @staticmethod
    def _claim_polymorphic(data: Dict[str, Any]) -> Optional[Type['FilterClause']]:
        """Claim ``data`` for this class when its type resolves to ``FilterClauseType.GROUP``."""
        return FilterClause._detect_filter_clause_type(
            data,
            expected_type=FilterClauseType.GROUP,
            filter_clause=GroupFilterClause,
        )
class ModelWithEnum(SchematicsModel):
    """Model with two ``MyEnum`` fields: ``e`` without a default and ``default`` defaulting to ``MyEnum.b``."""

    e = EnumType(MyEnum)

    default = EnumType(MyEnum, default=MyEnum.b)
class TaxonDataOrder(schematics.Model):
    """Ordering entry pairing a taxon with an order type; both fields are required."""

    taxon: str = StringType(required=True, min_length=1)

    type: TaxonOrderType = EnumType(TaxonOrderType, required=True)
class ComparisonConfig(schematics.Model):
    """Comparison settings: an optional list of taxons and a scope defaulting to ``ComparisonScopeType.company``."""

    taxons: Optional[List[TaxonExpressionStr]] = ListType(NonEmptyStringType)

    scope: ComparisonScopeType = EnumType(ComparisonScopeType, default=ComparisonScopeType.company)
class ModelJoin(SchematicsModel):
    """Join defined on one model and referencing another model."""

    join_type: JoinType = EnumType(JoinType, required=True)

    relationship: Relationship = EnumType(Relationship, required=True)

    direction: Optional[JoinDirection] = EnumType(JoinDirection, required=False)
    """
    Explicit definition of the direction in which the join edge can be traversed.
    ModelJoin is defined on a model and references to a model.
    When no direction is defined, the system infers the allowed direction from the relationship type
    (for backward compatibility issues)

    - 'both', the join edge can be traversed from both models (defined and referenced)
    - 'outgoing', the join edge can be traversed from defined model only
    - 'incoming', the join edge can be traversed from referenced model to defined model only
    """

    to_model = NonEmptyStringType(required=True)

    taxons: Optional[List[str]] = ListType(NonEmptyStringType)
    """
    Taxons on which the two models should be joined.
    Later, more customizable joins can be supported, even joining different taxons on each other.
    """

    @memoized_property
    def join_type_memoized(self):
        """
        Memoized view of ``join_type``; the value is cached on the instance after the first access.

        :return: value of ``self.join_type``
        """
        return self.join_type

    @memoized_property
    def relationship_memoized(self):
        """
        Memoized view of ``relationship``; the value is cached on the instance after the first access.

        :return: value of ``self.relationship``
        """
        return self.relationship

    @memoized_property
    def direction_memoized(self):
        """
        Memoized view of ``direction``; the value is cached on the instance after the first access.

        :return: value of ``self.direction``
        """
        return self.direction

    @memoized_property
    def to_model_memoized(self):
        """
        Memoized view of ``to_model``; the value is cached on the instance after the first access.

        :return: value of ``self.to_model``
        """
        return self.to_model

    @memoized_property
    def taxons_memoized(self):
        """
        Memoized view of ``taxons``; the value is cached on the instance after the first access.

        :return: value of ``self.taxons``
        """
        return self.taxons
class HuskyModel(SchematicsModel):
    """Model definition: its attributes, joins, data source and fully qualified physical name."""

    name: str = StringType(required=True, min_length=3)

    # Factories (``dict`` / ``list``) are used as defaults instead of literal instances so that
    # no mutable default object is created once at class-definition time and shared.
    attributes: Dict[str, ModelAttribute] = DictType(ModelType(ModelAttribute), default=dict)
    """
    Key is name of the attribute.
    For usage, use attributes_memoized or attributes_by_taxon_memoized.
    """

    joins: List[ModelJoin] = ListType(ModelType(ModelJoin), default=list)
    """
    List of possible joins on other models
    """

    data_sources: List[str] = ListType(StringType(min_length=3), default=list, max_size=1)
    """
    Explicitly defined data sources.
    """

    model_type: Optional[HuskyModelType] = EnumType(HuskyModelType)
    """
    Optional attribute which defines type of the model explicitly
    """

    visibility: ModelVisibility = EnumType(ModelVisibility, default=ModelVisibility.hidden)

    company_id: str = NonEmptyStringType(required=True)

    project_id: Optional[str] = NonEmptyStringType(required=False)

    _alias: Optional[str] = None
    """
    Unique alias used in SQL for this model (if None, use full object name to reference columns)
    """

    fully_qualified_name_parts: Optional[List[str]] = ListType(NonEmptyStringType(required=True))
    """
    All parts of the fully qualified name. Can contain 2..N values, depending on the actual federated database.

    Example:
    - physical data source (always first)
    - database name
    - schema name
    - table name
    """

    def __repr__(self):
        """Delegate the textual representation to ``serialize_class_with_props``."""
        return serialize_class_with_props(self)

    @property
    def number_of_identifiers(self) -> int:
        """Number of attributes flagged as identifiers."""
        return len(self.identifier_attributes)

    @property
    def identifier_attributes(self) -> Set[ModelAttribute]:
        """Attributes whose ``identifier`` flag is set."""
        return {attr for attr in self.attributes_memoized.values() if attr.identifier_memoized}

    @property
    def identifier_taxon_slugs(self) -> Set[str]:
        """Taxon slugs of all identifier attributes."""
        return {attr.taxon for attr in self.identifier_attributes}

    @property
    def physical_data_source(self) -> str:
        """
        Gets name of physical data source (the first part of the fully qualified name).

        :raises ValueError: when the fully qualified name parts are missing or empty
        """
        # An empty list has no first part either; report it the same way as a missing value
        # instead of leaking an IndexError.
        if not self.fully_qualified_name_parts:
            raise ValueError('Missing physical data source')

        return self.fully_qualified_name_parts[0]

    @memoized_property
    def time_granularity(self) -> Optional[TimeGranularity]:
        """
        Time granularity of model's data (if it can be inferred).

        Day granularity is reported when the model has the date taxon, hour granularity when it has
        the date-hour taxon (both prefixed with the model's virtual data source), otherwise None.
        NOTE(review): the result is memoized, so it presumably does not reflect attributes added or
        removed after the first access - confirm whether that matters for callers.
        """
        date_taxon_slug = prefix_with_virtual_data_source(self.data_source, TaxonSlugs.DATE)
        date_hour_taxon_slug = prefix_with_virtual_data_source(self.data_source, TaxonSlugs.DATE_HOUR)

        if self.has_taxon(date_taxon_slug):
            return TimeGranularity.day
        elif self.has_taxon(date_hour_taxon_slug):
            return TimeGranularity.hour

        return None

    def full_object_name(self, ctx: HuskyQueryContext) -> str:
        """
        Full name of the database object, including db and schema name.

        The first part (physical data source) is skipped; the remaining parts are quoted for the
        dialect of ``ctx`` and joined with dots.

        :param ctx: Husky query context providing the dialect
        :raises GenericModelException: when no name part remains after skipping the first one
        """
        assert self.fully_qualified_name_parts

        full_object_name = '.'.join(
            [quote_identifier(part, ctx.dialect) for part in self.fully_qualified_name_parts[1:]]
        )

        # sanity check that we have ANY name
        if not full_object_name:
            raise GenericModelException(
                'You are working with federated model so you need to turn on the appropriate feature flag',
                self.name,
                ExceptionErrorCode.FDQ_FLAG_REQUIRED,
            )

        return full_object_name

    @property
    def table_alias(self) -> Optional[str]:
        """
        Optional table alias to keep reference to the model unique (in case it is joined multiple times in query)
        """
        return self._alias

    @table_alias.setter
    def table_alias(self, alias: str):
        """
        Setter for optional table alias

        :param alias: New table alias
        """
        self._alias = alias

    @table_alias.deleter
    def table_alias(self):
        """
        Removes table alias
        """
        self._alias = None

    def unique_object_name(self, ctx: HuskyQueryContext) -> str:
        """
        Unique model reference within query: the table alias when set, the full object name otherwise.

        :param ctx: Husky query context
        """
        identifier = self.full_object_name(ctx) if self.table_alias is None else self.table_alias
        return identifier

    @property
    def graph_name(self) -> str:
        """
        Unique name in graph: the table alias when set, the model name otherwise.
        """
        return self.name if self.table_alias is None else self.table_alias

    @property
    def is_entity(self) -> bool:
        """
        Returns if the model is entity or not.
        Uses the explicit model type when set; otherwise derived from model name at runtime.
        """
        if self.model_type is not None:
            return self.model_type is HuskyModelType.ENTITY

        return 'entity' in self.name.lower()

    @memoized_property
    def data_source(self) -> str:
        """
        Returns data source. Newly, all models have exactly one data source.
        """
        return self.data_sources[0]

    def get_attribute_by_taxon(self, taxon_slug: str) -> ModelAttribute:
        """
        Look up the attribute representing the given taxon.

        :param taxon_slug: Taxon slug
        :raises AttributeNotFound: when the model has no attribute with that taxon
        """
        attribute = self.attributes_by_taxon_memoized.get(taxon_slug)
        if attribute is not None:
            return attribute

        raise AttributeNotFound(f'Attribute with taxon {taxon_slug} not found.')

    def has_taxon(self, taxon: str) -> bool:
        """Tell whether the model has an attribute with the given taxon."""
        try:
            self.get_attribute_by_taxon(taxon)
        except AttributeNotFound:
            return False

        return True

    @memoized_property
    def attributes_memoized(self):
        """
        Note the memoized property. It is cached on the instance after the first access.

        :return: value of ``self.attributes``
        """
        return self.attributes

    @memoized_property
    def attributes_by_taxon_memoized(self) -> Dict[str, ModelAttribute]:
        """
        Note the memoized property. It is cached on the instance after the first access.

        :return: attributes indexed by their taxon
        """
        return {attr.taxon_memoized: attr for attr in self.attributes_memoized.values()}

    @memoized_property
    def joins_memoized(self):
        """
        Note the memoized property. It is cached on the instance after the first access.

        :return: value of ``self.joins``
        """
        return self.joins

    @property
    def taxons(self) -> Set[str]:
        """Taxons of all attributes on the model."""
        return set(self.attributes_by_taxon_memoized)

    def taxon_sql_accessor(
        self,
        ctx: HuskyQueryContext,
        taxon_slug: str,
        cast_array: bool = False,
        model_tel_dialect: Optional[ModelTelDialect] = None,
    ) -> str:
        """
        Helper function that returns full sql accessor to given taxon on the model

        :param ctx: Husky query context
        :param taxon_slug Original taxon slug
        :param cast_array Automatically handle arrays by casting them to string (default is False)
        :param model_tel_dialect Initialized model TEL dialect, if there is one (we use it to check for cyclic reference).
        :raises AttributeNotFound: when the model has no attribute with the given taxon
        """
        attribute = self.get_attribute_by_taxon(taxon_slug)

        # let TEL grammar to render the SQL transformation
        # on purpose, we dont use 'column' variable here, because we dont really rely on column_name attribute here
        tel_dialect = model_tel_dialect
        if tel_dialect is None:
            # no initialized tel visitor is provided so create a generic one
            tel_dialect = ModelTelDialect(
                unique_object_name=self.unique_object_name(ctx),
                virtual_data_source=self.data_sources[0],
                model=self,
            )

        # render the TEL transformation
        parsed_expression = tel_dialect.render(attribute.tel_transformation, ctx, {})
        sql_accessor = compile_query(parsed_expression.sql(ctx.dialect), ctx.dialect)

        # we cast arrays to varchar, if requested
        if cast_array and attribute.quantity_type is ValueQuantityType.array:
            sql_accessor = f'CAST({sql_accessor} AS VARCHAR)'

        return sql_accessor

    def __hash__(self):
        """Hash the model by its unique object name computed with ``SNOWFLAKE_HUSKY_CONTEXT``."""
        return hash(self.unique_object_name(SNOWFLAKE_HUSKY_CONTEXT))

    def add_attribute(self, model_attribute: ModelAttribute):
        """
        Adds attribute to a model. It should be used only in very edge-cases.
        One of the use cases is dynamically adding attributes when working with normalized values.

        The attribute is stored under its taxon in the attributes and in both memoized views.

        :param model_attribute: Model attribute
        """
        self.attributes[model_attribute.taxon] = model_attribute
        self.attributes_memoized[model_attribute.taxon] = model_attribute
        self.attributes_by_taxon_memoized[model_attribute.taxon] = model_attribute

    def remove_attribute(self, taxon_slug: str):
        """
        Removes attribute from model by taxon slug

        ``attributes`` is keyed by attribute name, which does not have to equal the taxon slug, so
        besides the entry stored under ``taxon_slug`` itself every entry whose attribute has this
        taxon is dropped as well.

        :param taxon_slug: Taxon.slug
        """
        # Collect the keys first so the dict is not mutated while it is being iterated.
        stale_keys = {key for key, attr in self.attributes.items() if attr.taxon == taxon_slug}
        stale_keys.add(taxon_slug)

        for key in stale_keys:
            self.attributes.pop(key, None)
            self.attributes_memoized.pop(key, None)

        self.attributes_by_taxon_memoized.pop(taxon_slug, None)

    def add_join(self, model_join: ModelJoin):
        """Append a join to the model's joins."""
        self.joins.append(model_join)
class BlendingQueryInfo(schematics.Model):
    """
    Query info object for blending requests.
    """

    uuid: str = StringType(required=True)

    start_time: datetime = DateTimeType()
    """
    Timestamp of the moment when Husky started processing the blending request
    """

    status: str = StringType(choices=['success', 'fail'], default='success')
    """
    Status of the request
    """

    error: Optional[str] = StringType()
    """
    Traceback to the error
    """

    data_request: BlendingDataRequest = ModelType(BlendingDataRequest)
    """
    Full API request
    """

    origin_information: Optional[Dict[str, Any]] = DictType(BaseType)
    """
    Contains additional (and optional) information about request origin
    """

    # Factories (``list`` / the model classes themselves) are used as defaults instead of instances,
    # so no mutable default object is built once at import time and shared.
    subrequests_info: List[QueryInfo] = ListType(ModelType(QueryInfo), default=list)
    """
    List of QueryInfo objects for subrequests
    """

    comparison_subrequests_info: List[QueryInfo] = ListType(ModelType(QueryInfo), default=list)
    """
    List of QueryInfo objects for comparison subrequests
    """

    internal_metrics: QueryInternalMetrics = ModelType(QueryInternalMetrics, default=QueryInternalMetrics)
    """
    Internal measurements. Mostly timings and counts
    """

    definition: QueryDefinition = ModelType(QueryDefinition, default=QueryDefinition)
    """
    Definition of the Query
    Similar to actual API request, but should have more stable structure, and be independent of API changes
    """

    original_request_str: str = StringType(required=False)
    """
    Original request as it came to API serialized into string.
    """

    query_runtime: HuskyQueryRuntime = EnumType(HuskyQueryRuntime)

    @staticmethod
    def create(
        data_request: BlendingDataRequest,
        husky_context: HuskyQueryContext,
        origin_information: Optional[Dict[str, Any]] = None,
    ):
        """
        Create the query info for a blending request.

        The start time is taken from ``datetime.utcnow()`` and the query runtime from ``husky_context``;
        all values are handed over as trusted data.

        :param data_request: Blending data request stored as ``data_request``
        :param husky_context: Husky query context supplying ``query_runtime``
        :param origin_information: Optional information about the request origin
        :return: New ``BlendingQueryInfo`` instance
        """
        trusted_values = dict(
            data_request=data_request,
            start_time=datetime.utcnow(),
            origin_information=origin_information,
            query_runtime=husky_context.query_runtime,
        )

        return BlendingQueryInfo(trusted_data=trusted_values)