class Hub(SatelliteOwner, register_as="hub"):
    """Satellite owner registered under the name ``"hub"``.

    A hub reports itself as its only entry in ``hubs`` and uses the
    ``"dim"`` prefix when building its star-schema table name.
    """

    star_prefix = "dim"

    # Stored under the document key "key_type"; exposed as a ColumnType
    # through the ``key_type`` property below.
    _key_type: str = wysdom.UserProperty(str, name="key_type")

    static_columns: Dict[str, SatelliteColumn] = wysdom.UserProperty(
        wysdom.SchemaDict(SatelliteColumn), persist_defaults=True, default={})

    @property
    def key_type(self) -> ColumnType:
        """Return the configured key type wrapped in a ``ColumnType``."""
        return ColumnType(self._key_type)

    @property
    def key_length(self) -> int:
        """Return the serialized length of this hub's key type."""
        return self.key_type.serialized_length

    @property
    def hubs(self) -> Dict[str, VaultObject]:
        """Return a single-entry dict mapping this hub's name to itself."""
        return {self.name: self}

    @property
    def satellites_containing_keys(self) -> Dict[str, VaultObject]:
        """Return the project satellites that carry keys for this hub.

        A satellite is included when any of the following holds:

        * its parent is this hub,
        * its parent is one of the links returned by ``links``,
        * it lists this hub's name among its ``referenced_hubs``.
        """
        own_key = self.key
        # Loop-invariant: build the list of link keys once instead of
        # re-scanning the project's links for every satellite.
        link_keys = [link.key for link in self.links.values()]

        matching = {}
        for key, sat in self.project.satellites.items():
            # Resolve the parent once per satellite.
            parent_key = sat.parent.key
            if (
                parent_key == own_key
                or parent_key in link_keys
                or self.name in sat.referenced_hubs
            ):
                matching[key] = sat
        return matching

    @property
    def links(self) -> Dict[str, VaultObject]:
        """Return the project links whose ``unique_hubs`` include this hub."""
        return {
            key: link
            for key, link in self.project.links.items()
            if self.name in link.unique_hubs
        }

    def hub_key_columns(self, satellite: Satellite) -> Dict[str, List[HubKeyColumn]]:
        """Return the hub key column supplied by ``satellite`` for this hub.

        :param satellite: Satellite whose name is used to build the
            ``sat_<name>`` source label of the returned column.
        :return: A single-entry dict keyed by this hub's name.
        """
        return {
            self.name: [HubKeyColumn(self.key_column_name, f'sat_{satellite.name}')]
        }

    def generate_key(self, from_table):
        """Return the column of ``from_table`` named ``self.key_name``."""
        return from_table.c[self.key_name]

    # TODO: Should this be in HubModel?
    def prepare_key_for_link(self, alias, from_table):
        """Return the serialized key expression for this hub under ``alias``.

        :param alias: Alias used to derive the key column name.
        :param from_table: Table whose ``c`` collection holds that column.
        """
        key_column = from_table.c[self.alias_key_name(alias)]
        return self.key_type.serialize_column_expression(key_column)

    @property
    def link_key_columns(self):
        """Return an empty list; this class defines no link key columns."""
        return []

    def validate(self) -> None:
        """No validation is performed here."""
        pass
class VaultObjectReference(wysdom.UserObject):
    """Reference to a vault object, identified by its ``type`` and ``name``."""

    type: str = wysdom.UserProperty(str)
    name: str = wysdom.UserProperty(str)

    @LazyProperty
    def key(self) -> VaultObjectKey:
        """Return the ``VaultObjectKey`` built from ``type`` and ``name``."""
        object_type = self.type
        object_name = self.name
        return VaultObjectKey(object_type, object_name)
class Column(wysdom.UserObject):
    """Column definition whose type is stored as a string."""

    # Stored under the document key "type"; read it through ``type`` below.
    _type: str = wysdom.UserProperty(str, name="type")

    @property
    def type(self) -> ColumnType:
        """Return the stored type string wrapped in a ``ColumnType``."""
        raw_type = self._type
        return ColumnType(raw_type)
class SatelliteSQLPipeline(SatellitePipeline, register_as="sql"):
    """Satellite pipeline registered under the name ``"sql"``."""

    type: str = wysdom.UserProperty(wysdom.SchemaConst('sql'))
    sql: str = wysdom.UserProperty(str, name="sql")
    load_dt: Optional[str] = wysdom.UserProperty(str, optional=True)
    deleted_ind: Optional[str] = wysdom.UserProperty(str, optional=True)
    dependencies: List[SatellitePipelineDependency] = wysdom.UserProperty(
        wysdom.SchemaArray(SatellitePipelineDependency), default=[])

    @property
    def key_columns(self) -> Dict[str, str]:
        """Return the key column mapping for this pipeline.

        Explicitly configured ``key_columns`` win when non-empty. Otherwise
        every hub alias of the satellite's parent maps to itself.
        """
        configured = self._key_columns
        if configured:
            return configured

        parent_hubs = self.satellite.parent.hubs
        return {alias: alias for alias in parent_hubs.keys()}
class SatellitePipelineDependency(wysdom.UserObject, Generic[DependencyType]):
    """A pipeline's dependency on another project object, by type and name."""

    name: str = wysdom.UserProperty(str)
    type: str = wysdom.UserProperty(str)
    view: Optional[str] = wysdom.UserProperty(str, optional=True)

    @property
    def object_reference_key(self) -> VaultObjectKey:
        """Return the ``VaultObjectKey`` for the referenced object."""
        return VaultObjectKey(self.type, self.name)

    @property
    def project(self) -> Project:
        """Return the ``project`` attribute of the enclosing wysdom document."""
        owning_document = wysdom.document(self)
        return owning_document.project

    @property
    def object_reference(self) -> DependencyType:
        """Look up the referenced object in the project by type and name."""
        return self.project[self.type, self.name]

    def validate(self) -> None:
        """Check that the referenced object exists in the project.

        :raises KeyError: If the reference key is not in the project.
        """
        reference_key = self.object_reference_key
        if reference_key in self.project:
            return
        raise KeyError(
            f"Cannot find {reference_key} in project.")
class SatelliteSourcePipeline(
    SatellitePipeline,
    register_as="source"
):
    """Satellite pipeline registered as ``"source"``, fed by one ``Source``."""

    type: str = wysdom.UserProperty(wysdom.SchemaConst('source'))
    # Name of the source object, stored under the document key "source".
    _source: str = wysdom.UserProperty(str, name="source")

    @property
    def key_columns(self) -> Dict[str, str]:
        """Return the key column mapping for this pipeline.

        Explicitly configured ``key_columns`` win when non-empty. Otherwise
        the parent's hub aliases are paired positionally with the source's
        column names; ``zip`` stops at the shorter of the two.
        """
        configured = self._key_columns
        if configured:
            return configured

        hub_aliases = self.satellite.parent.hubs.keys()
        source_column_names = self.source.columns.keys()
        return dict(zip(hub_aliases, source_column_names))

    @property
    def source(self) -> Source:
        """Return the ``Source`` object named by this pipeline."""
        # TODO: Refactor so this definitely returns Source, not VaultObject
        source_obj = self.project["source", self._source]
        assert isinstance(source_obj, Source)
        return source_obj

    @property
    def dependencies(self) -> List[SatellitePipelineDependency]:
        """Return a one-element list holding the dependency on the source."""
        dom_info = wysdom.dom.DOMInfo(
            document=wysdom.document(self), parent=self)
        source_dependency = SatellitePipelineDependency(
            {'name': self._source, 'type': 'source'},
            json_dom_info=dom_info
        )
        return [source_dependency]
class Config(wysdom.UserObject, wysdom.ReadsJSON, wysdom.ReadsYAML):
    """Configuration object that can be saved as YAML in ``~/.jetavator``."""

    model_path: str = ConfigProperty(str, default_function=lambda self: os.getcwd())
    schema: str = ConfigProperty(str)
    skip_deploy: bool = ConfigProperty(bool, default=False)
    environment_type: str = ConfigProperty(str, default="local_spark")
    session: SessionConfig = ConfigProperty(SessionConfig, default={}, persist_defaults=True)
    services: Dict[str, ServiceConfig] = ConfigProperty(
        wysdom.SchemaDict(ServiceConfig), default={}, persist_defaults=True)
    compute: str = ConfigProperty(str)
    registry: str = ConfigProperty(str)
    drop_schema_if_exists: bool = ConfigProperty(bool, default=False)

    @LazyProperty
    def secret_lookup(self) -> SecretLookup:
        """Return the ``SecretLookup`` subclass instance named in the config."""
        return SecretLookup.registered_subclass_instance(
            self._secret_lookup_name)

    # Stored under the document key "secret_lookup".
    _secret_lookup_name: str = wysdom.UserProperty(str, name="secret_lookup")

    def reset_session(self):
        """Clear the ``session`` section of the configuration."""
        self.session.clear()

    def __str__(self):
        """Return the configuration rendered as block-style YAML."""
        return yaml.dump(self.to_builtin(), default_flow_style=False)

    @classmethod
    def config_dir(cls):
        """Return the path of the ``.jetavator`` directory in the user's home."""
        return os.path.join(str(Path.home()), '.jetavator')

    @classmethod
    def config_file(cls):
        """Return the path of ``config.yml`` inside ``config_dir()``."""
        return os.path.join(cls.config_dir(), 'config.yml')

    @classmethod
    def make_config_dir(cls):
        """Create the config directory if it does not already exist."""
        # exist_ok avoids the check-then-create race of testing
        # os.path.exists() before calling makedirs().
        os.makedirs(cls.config_dir(), exist_ok=True)

    def save(self):
        """Write the configuration, minus ``session``, to ``config_file()``."""
        config_dict = self.to_builtin()
        # Don't save session specific config info
        config_dict.pop('session', None)
        self.make_config_dir()
        with open(self.config_file(), 'w') as f:
            f.write(yaml.dump(config_dict, default_flow_style=False))
class SatellitePipeline(wysdom.UserObject, RegistersSubclasses, ABC):
    """Abstract base for satellite pipelines.

    Subclasses must provide ``dependencies`` and ``key_columns``.
    """

    type: str = wysdom.UserProperty(str)
    performance_hints: PerformanceHints = wysdom.UserProperty(
        PerformanceHints, persist_defaults=True, default={})
    # Explicit key column mapping, stored under the document key "key_columns".
    _key_columns: Dict[str, str] = wysdom.UserProperty(
        wysdom.SchemaDict(str),
        name="key_columns",
        persist_defaults=True,
        default={})

    @property
    def satellite(self) -> SatelliteABC:
        """Return the wysdom parent of this pipeline.

        :raises TypeError: If the parent is not a ``SatelliteABC``.
        """
        # TODO: Improve the type checking here?
        owner = wysdom.parent(self)
        if not isinstance(owner, SatelliteABC):
            raise TypeError('Parent is not a subclass of SatelliteABC')
        return owner

    @property
    def project(self) -> Project:
        """Return the project of the owning satellite."""
        return self.satellite.project

    @property
    @abstractmethod
    def dependencies(self) -> List[SatellitePipelineDependency]:
        """Return the dependencies of this pipeline (abstract)."""
        raise NotImplementedError

    @property
    @abstractmethod
    def key_columns(self) -> Dict[str, str]:
        """Return the key column mapping of this pipeline (abstract)."""
        pass

    def validate(self) -> None:
        """Validate each dependency in turn."""
        for dependency in self.dependencies:
            dependency.validate()
class VaultObject(wysdom.UserObject, wysdom.RegistersSubclasses, ABC):
    """Abstract base for project objects backed by an ``ObjectDefinition``."""

    name: str = wysdom.UserProperty(str)
    type: str = wysdom.UserProperty(str)

    optional_yaml_properties = []

    def __init__(self, project: ProjectABC, sqlalchemy_object: ObjectDefinition) -> None:
        """Attach the object to ``project`` and load its definition.

        :param project: Project this object belongs to.
        :param sqlalchemy_object: Definition record supplying ``definition``.
        """
        # Both attributes must be set before super().__init__, because
        # ``self.definition`` reads ``_sqlalchemy_object``.
        self.project = project
        self._sqlalchemy_object = sqlalchemy_object
        super().__init__(self.definition)

    def __repr__(self) -> str:
        """Return ``ClassName(name)``."""
        return f'{type(self).__name__}({self.name})'

    @classmethod
    def subclass_instance(cls, project: ProjectABC, definition: ObjectDefinition) -> VaultObject:
        """Build the subclass registered under ``definition.type``."""
        registered_name = definition.type
        return cls.registered_subclass_instance(registered_name, project, definition)

    @LazyProperty
    def key(self) -> VaultObjectKey:
        """Return the ``VaultObjectKey`` built from ``type`` and ``name``."""
        return VaultObjectKey(self.type, self.name)

    @property
    def definition(self) -> Dict[str, Any]:
        """Return the ``definition`` of the underlying definition record."""
        return self._sqlalchemy_object.definition

    def export_sqlalchemy_object(self) -> ObjectDefinition:
        """Stamp the definition record with the current time and return it.

        :raises ValueError: If the record's version differs from the
            project's version.
        """
        record = self._sqlalchemy_object
        project_version = str(self.project.version)
        if record.version != project_version:
            raise ValueError(
                "ObjectDefinition version must match project version "
                "and cannot be updated.")
        record.deploy_dt = str(datetime.now())
        return record

    @abstractmethod
    def validate(self) -> None:
        """Validate this object (abstract)."""
        pass

    @property
    def compute_service(self) -> ComputeServiceABC:
        """Return the project's compute service."""
        return self.project.compute_service

    @property
    def full_name(self) -> str:
        """Return ``<type>_<name>``."""
        return f'{self.type}_{self.name}'

    @property
    def checksum(self) -> str:
        """Return the definition record's checksum as a string."""
        return str(self._sqlalchemy_object.checksum)

    @property
    def dependent_satellites(self) -> List[VaultObject]:
        """Return the project satellites whose pipeline depends on this object.

        A dependency matches when both its type and name equal this
        object's type and name.
        """
        def depends_on_this(satellite) -> bool:
            for dependency in satellite.pipeline.dependencies:
                if dependency.type == self.type and dependency.name == self.name:
                    return True
            return False

        dependents = []
        for satellite in self.project.satellites.values():
            if depends_on_this(satellite):
                dependents.append(satellite)
        return dependents
class SatelliteOwner(VaultObject, ABC, register_as="satellite_owner"):
    """Abstract base for vault objects that can be the parent of satellites.

    Provides the naming helpers for key and hash columns and tables.
    Subclasses must supply ``hubs``, ``satellites_containing_keys``,
    ``generate_key``, ``link_key_columns``, ``key_type``, ``star_prefix``
    and ``validate``.
    """

    key_length: int = None

    options: List[str] = wysdom.UserProperty(wysdom.SchemaArray(str), default=[])
    exclude_from_star_schema: bool = wysdom.UserProperty(bool, default=False)

    @property
    @abstractmethod
    def hubs(self) -> Dict[str, VaultObject]:
        """Return the hubs of this owner, keyed by alias (abstract)."""
        pass

    @property
    def satellites(self) -> Dict[str, SatelliteABC]:
        """Return the project satellites whose parent is this object."""
        return {
            satellite.name: satellite
            for satellite in self.project.satellites.values()
            if satellite.parent.key == self.key
        }

    @property
    def star_satellites(self) -> Dict[str, SatelliteABC]:
        """Return own satellites not flagged ``exclude_from_star_schema``."""
        return {
            satellite.name: satellite
            for satellite in self.satellites.values()
            if not satellite.exclude_from_star_schema
        }

    @property
    @abstractmethod
    def satellites_containing_keys(self) -> Dict[str, SatelliteABC]:
        """Return the satellites that carry keys for this owner (abstract)."""
        pass

    @property
    def satellite_columns(self) -> Dict[str, SatelliteColumn]:
        """Return the columns of all star satellites, keyed by column name.

        If two satellites define the same column name, the one iterated
        later replaces the earlier entry.
        """
        return {
            column_name: column
            for satellite in self.star_satellites.values()
            for column_name, column in satellite.columns.items()
        }

    @property
    def key_column_name(self) -> str:
        """Return ``<type>_<name>_key``."""
        return f"{self.type}_{self.name}_key"

    @property
    def hash_column_name(self) -> str:
        """Return ``<type>_<name>_hash``."""
        return f"{self.type}_{self.name}_hash"

    @property
    def hashed_columns(self) -> Dict[str, SatelliteColumn]:
        """Return the same mapping as ``satellite_columns``."""
        return self.satellite_columns

    def hub_key_columns(self, satellite) -> Dict[str, List[HubKeyColumn]]:
        """Return hub key columns for ``satellite``, keyed by hub name.

        Not implemented here; subclasses are expected to override it.

        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def option(self, option_name: str) -> bool:
        """Return whether ``option_name`` is listed in ``options``."""
        return any(option == option_name for option in self.options)

    @abstractmethod
    def validate(self) -> None:
        """Validate this object (abstract)."""
        pass

    def alias_key_name(self, alias):
        """Return ``<type>_<alias>_key``."""
        return f"{self.type}_{alias}_key"

    def alias_hash_key_name(self, alias):
        """Return ``<type>_<alias>_hash``."""
        return f"{self.type}_{alias}_hash"

    @property
    def key_name(self):
        """Return the key column name using this object's own name as alias."""
        return self.alias_key_name(self.name)

    @property
    def hash_key_name(self):
        """Return the hash column name using this object's own name as alias."""
        return self.alias_hash_key_name(self.name)

    def alias_primary_key_name(self, alias):
        """Return the hash key name if ``hash_key`` is set, else the key name."""
        if self.option("hash_key"):
            return self.alias_hash_key_name(alias)
        else:
            return self.alias_key_name(alias)

    @abstractmethod
    def generate_key(self, from_table):
        """Return the key expression built from ``from_table`` (abstract)."""
        pass

    @property
    @abstractmethod
    def link_key_columns(self):
        """Return the link key columns of this owner (abstract)."""
        pass

    @property
    @abstractmethod
    def key_type(self) -> ColumnType:
        """Return the column type of this owner's key (abstract)."""
        pass

    # TODO: Move SQLAlchemy column generation to sql_model
    def alias_key_column(self, alias):
        """Return a non-nullable key column named for ``alias``."""
        return Column(self.alias_key_name(alias),
                      self.key_type.sqlalchemy_type,
                      nullable=False)

    def alias_hash_key_column(self, alias):
        """Return a non-nullable ``CHAR(32)`` hash column named for ``alias``."""
        return Column(self.alias_hash_key_name(alias),
                      CHAR(32),
                      nullable=False)

    def alias_key_columns(self, alias):
        """Return the key columns for ``alias``.

        With the ``hash_key`` option the hash column comes first, followed
        by the key column; otherwise only the key column is returned.
        """
        if self.option("hash_key"):
            return [
                self.alias_hash_key_column(alias),
                self.alias_key_column(alias)
            ]
        else:
            return [self.alias_key_column(alias)]

    def alias_primary_key_column(self, alias):
        """Return the hash column if ``hash_key`` is set, else the key column."""
        if self.option("hash_key"):
            return self.alias_hash_key_column(alias)
        else:
            return self.alias_key_column(alias)

    @property
    def table_name(self) -> str:
        """Return ``vault_<type>_<name>``."""
        return f"vault_{self.type}_{self.name}"

    @property
    def star_table_name(self) -> str:
        """Return ``star_<star_prefix>_<name>``."""
        return f"star_{self.star_prefix}_{self.name}"

    @property
    @abstractmethod
    def star_prefix(self):
        """Return the prefix used in ``star_table_name`` (abstract)."""
        pass
class SourceColumn(Column):
    """Column of a source, with nullability and a primary-key flag."""

    nullable: bool = wysdom.UserProperty(bool)
    # Defaults to False when omitted.
    pk: Optional[bool] = wysdom.UserProperty(bool, default=False)
class Satellite(SatelliteABC, register_as="satellite"):
    """Vault object registered as ``"satellite"``.

    A satellite has a parent satellite owner, a set of columns and a
    pipeline that populates it.
    """

    # Reference to the parent object, stored under the document key "parent".
    _parent: VaultObjectReference = wysdom.UserProperty(
        VaultObjectReference, name="parent")
    columns: Dict[str, SatelliteColumn] = wysdom.UserProperty(
        wysdom.SchemaDict(SatelliteColumn))
    pipeline: SatellitePipeline = wysdom.UserProperty(SatellitePipeline)
    exclude_from_star_schema: bool = wysdom.UserProperty(bool, default=False)

    @property
    def parent(self) -> SatelliteOwner:
        """Look up the parent object in the project."""
        parent_key = self._parent.key
        return self.project[parent_key]

    @property
    def hub_reference_columns(self) -> Dict[str, SatelliteColumn]:
        """Return the columns that have a truthy ``hub_reference``."""
        referencing = {}
        for column_name, column in self.columns.items():
            if column.hub_reference:
                referencing[column_name] = column
        return referencing

    @property
    def referenced_hubs(self) -> Dict[str, SatelliteOwner]:
        """Return the hubs named by ``hub_reference`` columns, keyed by name."""
        hub_names = VaultObjectSet(
            column.hub_reference
            for column in self.hub_reference_columns.values())
        return {
            hub_name: self.project["hub", hub_name]
            for hub_name in hub_names
        }

    @property
    def full_name(self) -> str:
        """Return ``sat_<name>``."""
        return f'sat_{self.name}'

    @property
    def hub_key_columns(self) -> Dict[str, List[HubKeyColumn]]:
        """Return hub key columns keyed by hub name.

        Starts from the parent's columns for this satellite. Columns from
        ``hub_reference_columns`` are appended unless the pipeline's
        ``no_update_referenced_hubs`` hint is set.
        """
        # check if this can be safely refactored to
        # a function hub_key_columns(self, hub_name)
        columns = self.parent.hub_key_columns(self)

        reference_columns = self.hub_reference_columns
        if not reference_columns:
            return columns
        if self.pipeline.performance_hints.no_update_referenced_hubs:
            return columns

        for column_name, column in reference_columns.items():
            hub_name = column.hub_reference
            key_column = HubKeyColumn(column_name, f'hub_{hub_name}')
            columns.setdefault(hub_name, []).append(key_column)
        return columns

    @LazyProperty
    def input_keys(self) -> VaultObjectSet[SatelliteOwner]:
        """Return the output keys of all satellites this pipeline depends on."""
        def upstream_owners():
            for dep in self.pipeline.dependencies:
                upstream = dep.object_reference
                # Only satellite dependencies contribute keys.
                if isinstance(upstream, Satellite):
                    yield from upstream.output_keys

        return VaultObjectSet(upstream_owners())

    @LazyProperty
    def produced_keys(self) -> VaultObjectSet[SatelliteOwner]:
        """Return the owners whose keys this satellite produces itself.

        Contains the hubs named in ``hub_key_columns`` unless
        ``no_update_hubs`` is set. The parent is added when it is
        registered as ``'link'`` and ``no_update_links`` is not set.
        """
        hints = self.pipeline.performance_hints

        if hints.no_update_hubs:
            keys = VaultObjectSet()
        else:
            keys = VaultObjectSet(
                self.project.hubs[name]
                for name in self.hub_key_columns)

        parent_is_link = self.parent.registered_name == 'link'
        if parent_is_link and not hints.no_update_links:
            keys.add(self.parent)
        return keys

    @LazyProperty
    def output_keys(self) -> VaultObjectSet[SatelliteOwner]:
        """Return the union of ``produced_keys`` and ``input_keys``."""
        return self.produced_keys | self.input_keys

    def dependent_satellites_by_owner(self, satellite_owner) -> List[Satellite]:
        """Return the satellite dependencies that output ``satellite_owner``.

        :param satellite_owner: Owner matched by identity (``is``) against
            each dependency's ``output_keys``.
        """
        matches = []
        for dep in self.pipeline.dependencies:
            upstream = dep.object_reference
            if not isinstance(upstream, Satellite):
                continue
            for output_key in upstream.output_keys:
                if output_key is satellite_owner:
                    matches.append(upstream)
        return matches

    def validate(self) -> None:
        """Check that the parent exists, then validate the pipeline.

        :raises KeyError: If the parent key is not in the project.
        """
        parent_key = self._parent.key
        if parent_key not in self.project:
            raise KeyError(f"Could not find parent object {parent_key}")
        self.pipeline.validate()

    @property
    def satellite_columns(self):
        """Return one nullable ``Column`` per entry in ``columns``."""
        table_columns = []
        for column_name, column in self.columns.items():
            table_columns.append(
                Column(column_name,
                       column.type.sqlalchemy_type,
                       nullable=True))
        return table_columns

    @property
    def table_name(self):
        """Return ``vault_sat_<name>``."""
        return f"vault_sat_{self.name}"
class PerformanceHints(wysdom.UserObject):
    """Boolean pipeline hints, each defaulting to ``False``."""

    # The annotations are ``bool`` to match the declared property type;
    # they were previously ``str``, which contradicted the bool schema.
    no_update_hubs: bool = wysdom.UserProperty(bool, default=False)
    no_update_links: bool = wysdom.UserProperty(bool, default=False)
    no_update_referenced_hubs: bool = wysdom.UserProperty(bool, default=False)
class SatelliteColumn(Column):
    """Column of a satellite, optionally referencing a hub by name."""

    # Defaults to True when omitted.
    nullable: bool = wysdom.UserProperty(bool, default=True)
    # Optional hub name; absent when the column references no hub.
    hub_reference: Optional[str] = wysdom.UserProperty(str, optional=True)
    # Defaults to False when omitted.
    index: bool = wysdom.UserProperty(bool, default=False)
class SparkDeltaStorageConfig(StorageServiceConfig):
    """Storage service config whose ``type`` is fixed to ``'spark_delta'``."""

    type: str = wysdom.UserProperty(
        wysdom.SchemaConst('spark_delta')
    )
class Link(SatelliteOwner, register_as="link"):
    """Satellite owner registered as ``"link"``, joining one or more hubs.

    ``link_hubs`` maps each alias to a hub name. Several aliases may point
    at the same hub. Uses the ``"fact"`` prefix for its star-schema table.
    """

    star_prefix = "fact"

    # TODO: Rename link_hubs to hubs
    _link_hubs: Dict[str, str] = wysdom.UserProperty(
        wysdom.SchemaDict(str), name='link_hubs')

    @property
    def hubs(self) -> Dict[str, Hub]:
        """Return the hub objects of this link, keyed by alias."""
        return {
            k: self.project['hub', v]
            for k, v in self._link_hubs.items()
        }

    @property
    def satellites_containing_keys(self) -> Dict[str, Satellite]:
        """Return ``star_satellites``."""
        return self.star_satellites

    @property
    def key_length(self) -> int:
        """Return the combined key length of all hubs plus separators.

        Each hub contributes its key length plus one; one is subtracted at
        the end, so there is one extra unit between adjacent hub keys.
        """
        return sum(
            hub.key_length + 1
            for hub in self.hubs.values()
        ) - 1

    @property
    def key_type(self) -> ColumnType:
        """Return a ``CHAR`` column type of ``key_length`` characters."""
        return ColumnType(f"CHAR({self.key_length})")

    @property
    def unique_hubs(self) -> Dict[str, Hub]:
        """Return the distinct hubs of this link, keyed by hub name.

        Hubs appear in the order they are first seen in ``hubs``.
        """
        # dict.fromkeys de-duplicates while keeping insertion order; a set
        # would make the result's order vary with string hash randomisation.
        unique_names = dict.fromkeys(hub.name for hub in self.hubs.values())
        return {
            hub_name: self.project["hub", hub_name]
            for hub_name in unique_names
        }

    # NOTE(review): the values built below are lists of HubKeyColumn, not
    # single HubKeyColumn objects as the return annotation states.
    def hub_key_columns(self, satellite) -> Dict[str, HubKeyColumn]:
        """Return the hub key columns of this link, grouped by hub name.

        One ``HubKeyColumn`` is appended per alias, so a hub referenced
        under several aliases gets several entries.

        :param satellite: Not used by this implementation.
        """
        columns = {}
        for alias, hub in self.hubs.items():
            columns.setdefault(hub.name, []).append(
                HubKeyColumn(f'hub_{alias}_key', f'hub_{hub.name}'))
        return columns

    def generate_key(self, from_table):
        """Return the composite key expression for this link.

        The hubs' prepared keys are concatenated in ``hubs`` order, with a
        ``char(SEPARATOR)`` expression between adjacent components.

        :param from_table: Table passed to each hub's
            ``prepare_key_for_link``.
        :raises StopIteration: If the link has no hubs.
        """
        key_components = iter([
            hub.prepare_key_for_link(hub_alias, from_table)
            for hub_alias, hub in self.hubs.items()
        ])
        composite_key = next(key_components)
        for column in key_components:
            composite_key = composite_key.concat(
                func.char(literal_column(str(SEPARATOR)))
            ).concat(column)
        return composite_key

    @property
    def link_key_columns(self):
        """Return one key column per hub alias."""
        return [
            hub.alias_key_column(hub_alias)
            for hub_alias, hub in self.hubs.items()
        ]

    def validate(self) -> None:
        """Check that every hub named in ``link_hubs`` exists in the project.

        :raises KeyError: On the first hub name not found.
        """
        for k, v in self._link_hubs.items():
            if ('hub', v) not in self.project:
                raise KeyError(
                    f"Cannot find referenced hub {v} in object {self.key}"
                )
class Source(VaultObject, register_as="source"):
    """Vault object registered as ``"source"``.

    Describes a source's columns and builds the table definition used to
    hold its data, including two system columns.
    """

    # Names of the system columns added to every source table.
    DELETED_INDICATOR_SYSTEM_COLUMN = "jetavator_deleted_ind"
    LOAD_TIMESTAMP_SYSTEM_COLUMN = "jetavator_load_dt"

    columns: Dict[str, SourceColumn] = wysdom.UserProperty(
        wysdom.SchemaDict(SourceColumn))

    # Class-level default; load_csvs() rebinds this on the instance.
    csv_files: List[str] = []
    deleted_indicator_column: Optional[str] = wysdom.UserProperty(
        str, optional=True)
    load_timestamp_column: Optional[str] = wysdom.UserProperty(str, optional=True)
    date_format: Optional[str] = wysdom.UserProperty(str, optional=True)
    timestamp_format: Optional[str] = wysdom.UserProperty(str, optional=True)

    @property
    def primary_key_columns(self) -> Dict[str, SourceColumn]:
        """Return the columns whose ``pk`` flag is truthy."""
        return {k: v for k, v in self.columns.items() if v.pk}

    def validate(self) -> None:
        """No validation is performed here."""
        pass

    @LazyProperty
    def create_table_statement(self) -> CreateTable:
        """Return the ``CreateTable`` construct for ``table``."""
        return CreateTable(self.table)

    # TODO: Move to sql_model?
    @LazyProperty
    def table(self) -> Table:
        """Return a ``Table`` named ``full_name`` with fresh ``MetaData``."""
        return Table(self.full_name, MetaData(), *self._table_columns())

    def load_csvs(
            self,
            csv_files: List[FilePath]
            # , assume_schema_integrity=False
    ) -> None:
        """Loads a list of CSV files into a single named Source

        :param csv_files: List of paths on disk of the CSV files
        """
        self.csv_files = csv_files

    def _table_columns(self) -> List[Column]:
        """Return the source columns followed by the system columns."""
        # TODO: Spark/Hive does not allow PKs. Make this configurable per engine?
        use_primary_key = False
        return [
            *self._source_columns(use_primary_key),
            *self._date_columns(use_primary_key)
        ]

    def _source_columns(self, use_primary_key: bool = True) -> List[Column]:
        """Return one nullable ``Column`` per entry in ``columns``.

        :param use_primary_key: When true, columns flagged ``pk`` are
            marked as primary key.
        """
        return [
            Column(column_name,
                   column.type.sqlalchemy_type,
                   nullable=True,
                   primary_key=(use_primary_key and column.pk))
            for column_name, column in self.columns.items()
        ]

    @staticmethod
    def _date_columns(use_primary_key: bool = True) -> List[Column]:
        """Return the load-timestamp and deleted-indicator system columns.

        :param use_primary_key: When true, the load-timestamp column is
            marked as primary key.
        """
        return [
            # Use the class constants rather than repeating the literals.
            Column(Source.LOAD_TIMESTAMP_SYSTEM_COLUMN,
                   DateTime(),
                   nullable=True,
                   primary_key=use_primary_key),
            Column(
                Source.DELETED_INDICATOR_SYSTEM_COLUMN,
                # TODO: Loading as integer saves space in CSVs.
                # Does this make sense for other file formats?
                # Is there a more general solution?
                Integer(),
                nullable=True,
                default=0)
        ]