def get_ownership_to_set(
    graph: DataHubGraph, urn: str, mce_ownership: Optional[OwnershipClass]
) -> Optional[OwnershipClass]:
    if not mce_ownership or not mce_ownership.owners:
        # nothing to add, no need to consult server
        return None
    assert mce_ownership
    server_ownership = graph.get_ownership(entity_urn=urn)
    if server_ownership:
        # compute patch
        # we only include owners who are not present in the server ownership
        # if owner ids match, but the ownership type differs, we prefer the transformer's opinion
        owners_to_add: List[OwnerClass] = []
        needs_update = False
        server_owner_ids = [o.owner for o in server_ownership.owners]
        for owner in mce_ownership.owners:
            if owner.owner not in server_owner_ids:
                owners_to_add.append(owner)
            else:
                # we need to check if the type matches, and if it doesn't, update it
                for server_owner in server_ownership.owners:
                    if (
                        owner.owner == server_owner.owner
                        and owner.type != server_owner.type
                    ):
                        server_owner.type = owner.type
                        needs_update = True

        if owners_to_add or needs_update:
            mce_ownership.owners = server_ownership.owners + owners_to_add
            return mce_ownership
        else:
            return None
    else:
        return mce_ownership

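# A hypothetical usage sketch (not part of the library): merge a proposed owner
# set into whatever the server already has and emit the patch only when the
# helper returns something to write. Assumes a connected DataHubGraph `graph`,
# OwnerClass/OwnershipClass/OwnershipTypeClass from datahub.metadata.schema_classes,
# and a recent SDK whose MetadataChangeProposalWrapper infers entityType and
# changeType from the URN and aspect.
from datahub.emitter.mcp import MetadataChangeProposalWrapper

example_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,realestate_db.sales,PROD)"  # placeholder
proposed_ownership = OwnershipClass(
    owners=[OwnerClass(owner="urn:li:corpuser:jdoe", type=OwnershipTypeClass.DATAOWNER)]
)

patched_ownership = get_ownership_to_set(graph, example_urn, proposed_ownership)
if patched_ownership:
    graph.emit_mcp(
        MetadataChangeProposalWrapper(entityUrn=example_urn, aspect=patched_ownership)
    )
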
class PipelineContext:
    def __init__(
        self,
        run_id: str,
        datahub_api: Optional[DatahubClientConfig] = None,
        pipeline_name: Optional[str] = None,
        dry_run: bool = False,
        preview_mode: bool = False,
    ) -> None:
        self.run_id = run_id
        self.graph = DataHubGraph(datahub_api) if datahub_api is not None else None
        self.pipeline_name = pipeline_name
        self.dry_run_mode = dry_run
        self.preview_mode = preview_mode
        self.reporters: Dict[str, Committable] = dict()
        self.checkpointers: Dict[str, Committable] = dict()
        self._set_dataset_urn_to_lower_if_needed()

    def _set_dataset_urn_to_lower_if_needed(self) -> None:
        # TODO: Get rid of this function once lower-casing is the standard.
        if self.graph:
            server_config = self.graph.get_config()
            if server_config and server_config.get("datasetUrnNameCasing"):
                set_dataset_urn_to_lower(True)

    def register_checkpointer(self, committable: Committable) -> None:
        if committable.name in self.checkpointers:
            raise IndexError(
                f"Checkpointing provider {committable.name} already registered."
            )
        self.checkpointers[committable.name] = committable

    def register_reporter(self, committable: Committable) -> None:
        if committable.name in self.reporters:
            raise IndexError(
                f"Reporting provider {committable.name} already registered."
            )
        self.reporters[committable.name] = committable

    def get_reporters(self) -> Iterable[Committable]:
        for committable in self.reporters.values():
            yield committable

    def get_committables(self) -> Iterable[Tuple[str, Committable]]:
        for reporting_item_commitable in self.reporters.items():
            yield reporting_item_commitable
        for checkpointing_item_commitable in self.checkpointers.items():
            yield checkpointing_item_commitable

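# A small construction sketch with placeholder values: without a datahub_api
# config the context has no graph connection; with one, it connects to the given
# GMS server. DatahubClientConfig is assumed to be imported from
# datahub.ingestion.graph.client.
offline_ctx = PipelineContext(run_id="demo-run")
assert offline_ctx.graph is None

connected_ctx = PipelineContext(
    run_id="demo-run",
    datahub_api=DatahubClientConfig(server="http://localhost:8080"),
    pipeline_name="demo_pipeline",
    dry_run=True,
)
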
def create(
    cls, config_dict: Dict[str, Any], ctx: PipelineContext
) -> IngestionStateProvider:
    if ctx.graph:
        return cls(ctx.graph)
    elif config_dict is None:
        raise ConfigurationError("Missing provider configuration")
    else:
        provider_config = DatahubIngestionStateProviderConfig.parse_obj(config_dict)
        if provider_config.datahub_api:
            graph = DataHubGraph(provider_config.datahub_api)
            return cls(graph)
        else:
            raise ConfigurationError(
                "Missing datahub_api. Provide either a global one or under the state_provider."
            )

def create(
    cls, config_dict: Dict[str, Any], ctx: PipelineContext, name: str
) -> IngestionCheckpointingProviderBase:
    if ctx.graph:
        # Use the pipeline-level graph if set
        return cls(ctx.graph, name)
    elif config_dict is None:
        raise ConfigurationError("Missing provider configuration.")
    else:
        provider_config = DatahubIngestionStateProviderConfig.parse_obj(config_dict)
        if provider_config.datahub_api:
            graph = DataHubGraph(provider_config.datahub_api)
            return cls(graph, name)
        else:
            raise ConfigurationError(
                "Missing datahub_api. Provide either a global one or under the state_provider."
            )

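# Rough call-site sketch: the provider class name below is illustrative — any
# class exposing this create() classmethod behaves the same way. Either the
# pipeline's shared graph is reused, or a provider-local datahub_api block is parsed.
ctx_with_graph = PipelineContext(
    run_id="demo-run",
    datahub_api=DatahubClientConfig(server="http://localhost:8080"),
)
provider = DatahubIngestionCheckpointingProvider.create(
    config_dict={}, ctx=ctx_with_graph, name="datahub"  # ctx.graph is set, config ignored
)

standalone_provider = DatahubIngestionCheckpointingProvider.create(
    config_dict={"datahub_api": {"server": "http://localhost:8080"}},
    ctx=PipelineContext(run_id="demo-run"),  # no pipeline-level graph here
    name="datahub",
)
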
import logging
from typing import Optional

from datahub.emitter.mce_builder import make_dataset_urn, make_user_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import (
    OwnerClass,
    OwnershipClass,
    OwnershipTypeClass,
)

logging.basicConfig(level=logging.INFO)

# Inputs -> owner, ownership_type, dataset
owner_to_add = make_user_urn("jdoe")
ownership_type = OwnershipTypeClass.DATAOWNER
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

# Some objects to help with conditional pathways later
owner_class_to_add = OwnerClass(owner=owner_to_add, type=ownership_type)
ownership_to_add = OwnershipClass(owners=[owner_class_to_add])

# First we get the current owners
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
current_owners: Optional[OwnershipClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="ownership",
    aspect_type=OwnershipClass,
)

need_write = False
if current_owners:
    if (owner_to_add, ownership_type) not in [
        (x.owner, x.type) for x in current_owners.owners
    ]:
        # owners exist, but this owner is not present in the current owners
        current_owners.owners.append(owner_class_to_add)
        need_write = True
else:
    # no ownership aspect on the server yet, start from the one pre-built above
    current_owners = ownership_to_add
    need_write = True

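# Sketch of the write-back step that would follow (assumes the variables built
# above and a recent SDK where MetadataChangeProposalWrapper infers the entity
# and change types):
from datahub.emitter.mcp import MetadataChangeProposalWrapper

if need_write:
    event = MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=current_owners)
    graph.emit(event)
    logging.info(f"Owner {owner_to_add} added to dataset {dataset_urn}")
else:
    logging.info(f"Owner {owner_to_add} already exists for {dataset_urn}, omitting write")
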
from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import (
    EditableSchemaFieldInfoClass,
    EditableSchemaMetadataClass,
    GlobalTagsClass,
    TagAssociationClass,
)


def get_simple_field_path_from_v2_field_path(field_path: str) -> str:
    """A helper function to extract simple . path notation from the v2 field path"""
    if field_path.startswith("[version=2.0]"):
        # this is a v2 field path, strip the type annotation tokens
        tokens = [
            t for t in field_path.split(".") if not (t.startswith("[") or t.endswith("]"))
        ]
        path = ".".join(tokens)
        return path
    else:
        # not a v2, we assume this is a simple path
        return field_path


# Inputs -> the column, dataset and the tag to set
column = "address.zipcode"
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")
tag_to_add = make_tag_urn("location")

# First we get the current editable schema metadata
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
current_editable_schema_metadata = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="editableSchemaMetadata",
    aspect_type=EditableSchemaMetadataClass,
)

# Some pre-built objects to help all the conditional pathways
tag_association_to_add = TagAssociationClass(tag=tag_to_add)
tags_aspect_to_set = GlobalTagsClass(tags=[tag_association_to_add])
field_info_to_set = EditableSchemaFieldInfoClass(
    fieldPath=column, globalTags=tags_aspect_to_set
)

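# Hypothetical lookup using the helper above: if the server already has editable
# schema metadata, find the entry for our column by comparing simplified paths.
matching_field_info = None
if current_editable_schema_metadata:
    for field_info in current_editable_schema_metadata.editableSchemaFieldInfo:
        if get_simple_field_path_from_v2_field_path(field_info.fieldPath) == column:
            matching_field_info = field_info
            break
# if matching_field_info is None, field_info_to_set from above would be used to
# start a fresh editableSchemaFieldInfo entry for the column
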
import time

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import (
    AuditStampClass,
    EditableDatasetPropertiesClass,
    InstitutionalMemoryMetadataClass,
)

link_description = "This is the definition of what real estate means"
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

# Some helpful variables to fill out objects later
now = int(time.time() * 1000)  # milliseconds since epoch
current_timestamp = AuditStampClass(time=now, actor="urn:li:corpuser:ingestion")
institutional_memory_element = InstitutionalMemoryMetadataClass(
    url=link_to_add,
    description=link_description,
    createStamp=current_timestamp,
)

# First we get the current editable dataset properties
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(config=DatahubClientConfig(server=gms_endpoint))
current_editable_properties = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="editableDatasetProperties",
    aspect_type=EditableDatasetPropertiesClass,
)

need_write = False
if current_editable_properties:
    if documentation_to_add != current_editable_properties.description:
        current_editable_properties.description = documentation_to_add
        need_write = True
else:
    # create a brand new editable dataset properties aspect
    current_editable_properties = EditableDatasetPropertiesClass(
        description=documentation_to_add
    )
    need_write = True

import logging
from typing import Optional

from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
    AuditStampClass,
    ChangeTypeClass,
    GlossaryTermAssociationClass,
    GlossaryTermsClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# First we get the current terms
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")
current_terms: Optional[GlossaryTermsClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="glossaryTerms",
    aspect_type=GlossaryTermsClass,
)

term_to_add = make_term_urn("Classification.HighlyConfidential")
term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add)
# an audit stamp that basically says we have no idea when these terms were added to this dataset
# change the time value to (time.time() * 1000) if you want to specify the current time of running this code as the time
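# A sketch of how these pieces could be combined (the unknown-time audit stamp
# follows the comment above; the wrapper call assumes a recent SDK that infers
# entity and change types):
from datahub.emitter.mcp import MetadataChangeProposalWrapper

unknown_audit_stamp = AuditStampClass(time=0, actor="urn:li:corpuser:ingestion")

need_write = False
if current_terms:
    if term_to_add not in [x.urn for x in current_terms.terms]:
        # terms exist, but this term is not present in the current terms
        current_terms.terms.append(term_association_to_add)
        need_write = True
else:
    # create a brand new glossary terms aspect
    current_terms = GlossaryTermsClass(
        terms=[term_association_to_add],
        auditStamp=unknown_audit_stamp,
    )
    need_write = True

if need_write:
    graph.emit(MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=current_terms))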