def get_ownership_to_set(
    graph: DataHubGraph, urn: str, mce_ownership: Optional[OwnershipClass]
) -> Optional[OwnershipClass]:
    if not mce_ownership or not mce_ownership.owners:
        # nothing to add, no need to consult server
        return None
    assert mce_ownership
    server_ownership = graph.get_ownership(entity_urn=urn)
    if server_ownership:
        # compute patch
        # we only include owners who are not present in the server ownership
        # if owner ids match, but the ownership type differs, we prefer the transformer's opinion
        owners_to_add: List[OwnerClass] = []
        needs_update = False
        server_owner_ids = [o.owner for o in server_ownership.owners]
        for owner in mce_ownership.owners:
            if owner.owner not in server_owner_ids:
                owners_to_add.append(owner)
            else:
                # we need to check if the type matches, and if it doesn't, update it
                for server_owner in server_ownership.owners:
                    if (
                        owner.owner == server_owner.owner
                        and owner.type != server_owner.type
                    ):
                        server_owner.type = owner.type
                        needs_update = True

        if owners_to_add or needs_update:
            mce_ownership.owners = server_ownership.owners + owners_to_add
            return mce_ownership
        else:
            return None
    else:
        return mce_ownership

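# A hypothetical usage sketch (not part of the library): merge a proposed owner
# set into whatever the server already has and emit the patch only when the
# helper returns something to write. Assumes a connected DataHubGraph `graph`,
# OwnerClass/OwnershipClass/OwnershipTypeClass from datahub.metadata.schema_classes,
# and a recent SDK whose MetadataChangeProposalWrapper infers entityType and
# changeType from the URN and aspect.
from datahub.emitter.mcp import MetadataChangeProposalWrapper

example_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,realestate_db.sales,PROD)"  # placeholder
proposed_ownership = OwnershipClass(
    owners=[OwnerClass(owner="urn:li:corpuser:jdoe", type=OwnershipTypeClass.DATAOWNER)]
)

patched_ownership = get_ownership_to_set(graph, example_urn, proposed_ownership)
if patched_ownership:
    graph.emit_mcp(
        MetadataChangeProposalWrapper(entityUrn=example_urn, aspect=patched_ownership)
    )
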
class PipelineContext:
    def __init__(
        self,
        run_id: str,
        datahub_api: Optional[DatahubClientConfig] = None,
        pipeline_name: Optional[str] = None,
        dry_run: bool = False,
        preview_mode: bool = False,
    ) -> None:
        self.run_id = run_id
        self.graph = DataHubGraph(datahub_api) if datahub_api is not None else None
        self.pipeline_name = pipeline_name
        self.dry_run_mode = dry_run
        self.preview_mode = preview_mode
        self.reporters: Dict[str, Committable] = dict()
        self.checkpointers: Dict[str, Committable] = dict()
        self._set_dataset_urn_to_lower_if_needed()

    def _set_dataset_urn_to_lower_if_needed(self) -> None:
        # TODO: Get rid of this function once lower-casing is the standard.
        if self.graph:
            server_config = self.graph.get_config()
            if server_config and server_config.get("datasetUrnNameCasing"):
                set_dataset_urn_to_lower(True)

    def register_checkpointer(self, committable: Committable) -> None:
        if committable.name in self.checkpointers:
            raise IndexError(
                f"Checkpointing provider {committable.name} already registered."
            )
        self.checkpointers[committable.name] = committable

    def register_reporter(self, committable: Committable) -> None:
        if committable.name in self.reporters:
            raise IndexError(
                f"Reporting provider {committable.name} already registered."
            )
        self.reporters[committable.name] = committable

    def get_reporters(self) -> Iterable[Committable]:
        for committable in self.reporters.values():
            yield committable

    def get_committables(self) -> Iterable[Tuple[str, Committable]]:
        for reporting_item_commitable in self.reporters.items():
            yield reporting_item_commitable
        for checkpointing_item_commitable in self.checkpointers.items():
            yield checkpointing_item_commitable

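# A small construction sketch with placeholder values: without a datahub_api
# config the context has no graph connection; with one, it connects to the given
# GMS server. DatahubClientConfig is assumed to be imported from
# datahub.ingestion.graph.client.
offline_ctx = PipelineContext(run_id="demo-run")
assert offline_ctx.graph is None

connected_ctx = PipelineContext(
    run_id="demo-run",
    datahub_api=DatahubClientConfig(server="http://localhost:8080"),
    pipeline_name="demo_pipeline",
    dry_run=True,
)
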
def create(
    cls, config_dict: Dict[str, Any], ctx: PipelineContext
) -> IngestionStateProvider:
    if ctx.graph:
        return cls(ctx.graph)
    elif config_dict is None:
        raise ConfigurationError("Missing provider configuration")
    else:
        provider_config = DatahubIngestionStateProviderConfig.parse_obj(config_dict)
        if provider_config.datahub_api:
            graph = DataHubGraph(provider_config.datahub_api)
            return cls(graph)
        else:
            raise ConfigurationError(
                "Missing datahub_api. Provide either a global one or under the state_provider."
            )

def create(
    cls, config_dict: Dict[str, Any], ctx: PipelineContext, name: str
) -> IngestionCheckpointingProviderBase:
    if ctx.graph:
        # Use the pipeline-level graph if set
        return cls(ctx.graph, name)
    elif config_dict is None:
        raise ConfigurationError("Missing provider configuration.")
    else:
        provider_config = DatahubIngestionStateProviderConfig.parse_obj(config_dict)
        if provider_config.datahub_api:
            graph = DataHubGraph(provider_config.datahub_api)
            return cls(graph, name)
        else:
            raise ConfigurationError(
                "Missing datahub_api. Provide either a global one or under the state_provider."
            )

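# Rough call-site sketch: the provider class name below is illustrative — any
# class exposing this create() classmethod behaves the same way. Either the
# pipeline's shared graph is reused, or a provider-local datahub_api block is parsed.
ctx_with_graph = PipelineContext(
    run_id="demo-run",
    datahub_api=DatahubClientConfig(server="http://localhost:8080"),
)
provider = DatahubIngestionCheckpointingProvider.create(
    config_dict={}, ctx=ctx_with_graph, name="datahub"  # ctx.graph is set, config ignored
)

standalone_provider = DatahubIngestionCheckpointingProvider.create(
    config_dict={"datahub_api": {"server": "http://localhost:8080"}},
    ctx=PipelineContext(run_id="demo-run"),  # no pipeline-level graph here
    name="datahub",
)
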
import logging
from typing import Optional

from datahub.emitter.mce_builder import make_dataset_urn, make_user_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import (
    OwnerClass,
    OwnershipClass,
    OwnershipTypeClass,
)

logging.basicConfig(level=logging.INFO)

# Inputs -> owner, ownership_type, dataset
owner_to_add = make_user_urn("jdoe")
ownership_type = OwnershipTypeClass.DATAOWNER
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

# Some objects to help with conditional pathways later
owner_class_to_add = OwnerClass(owner=owner_to_add, type=ownership_type)
ownership_to_add = OwnershipClass(owners=[owner_class_to_add])

# First we get the current owners
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
current_owners: Optional[OwnershipClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="ownership",
    aspect_type=OwnershipClass,
)

need_write = False
if current_owners:
    if (owner_to_add, ownership_type) not in [
        (x.owner, x.type) for x in current_owners.owners
    ]:
        # owners exist, but this owner is not present in the current owners
        current_owners.owners.append(owner_class_to_add)
        need_write = True
else:
    # no ownership aspect on the server yet, start from the one pre-built above
    current_owners = ownership_to_add
    need_write = True

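# Sketch of the write-back step that would follow (assumes the variables built
# above and a recent SDK where MetadataChangeProposalWrapper infers the entity
# and change types):
from datahub.emitter.mcp import MetadataChangeProposalWrapper

if need_write:
    event = MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=current_owners)
    graph.emit(event)
    logging.info(f"Owner {owner_to_add} added to dataset {dataset_urn}")
else:
    logging.info(f"Owner {owner_to_add} already exists for {dataset_urn}, omitting write")
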
from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import (
    EditableSchemaFieldInfoClass,
    EditableSchemaMetadataClass,
    GlobalTagsClass,
    TagAssociationClass,
)


def get_simple_field_path_from_v2_field_path(field_path: str) -> str:
    """A helper function to extract simple . path notation from the v2 field path"""
    if field_path.startswith("[version=2.0]"):
        # this is a v2 field path, strip the type annotation tokens
        tokens = [
            t for t in field_path.split(".") if not (t.startswith("[") or t.endswith("]"))
        ]
        path = ".".join(tokens)
        return path
    else:
        # not a v2, we assume this is a simple path
        return field_path


# Inputs -> the column, dataset and the tag to set
column = "address.zipcode"
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")
tag_to_add = make_tag_urn("location")

# First we get the current editable schema metadata
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
current_editable_schema_metadata = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="editableSchemaMetadata",
    aspect_type=EditableSchemaMetadataClass,
)

# Some pre-built objects to help all the conditional pathways
tag_association_to_add = TagAssociationClass(tag=tag_to_add)
tags_aspect_to_set = GlobalTagsClass(tags=[tag_association_to_add])
field_info_to_set = EditableSchemaFieldInfoClass(
    fieldPath=column, globalTags=tags_aspect_to_set
)

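# Hypothetical lookup using the helper above: if the server already has editable
# schema metadata, find the entry for our column by comparing simplified paths.
matching_field_info = None
if current_editable_schema_metadata:
    for field_info in current_editable_schema_metadata.editableSchemaFieldInfo:
        if get_simple_field_path_from_v2_field_path(field_info.fieldPath) == column:
            matching_field_info = field_info
            break
# if matching_field_info is None, field_info_to_set from above would be used to
# start a fresh editableSchemaFieldInfo entry for the column
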
import time

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.metadata.schema_classes import (
    AuditStampClass,
    EditableDatasetPropertiesClass,
    InstitutionalMemoryMetadataClass,
)

link_description = "This is the definition of what real estate means"
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

# Some helpful variables to fill out objects later
now = int(time.time() * 1000)  # milliseconds since epoch
current_timestamp = AuditStampClass(time=now, actor="urn:li:corpuser:ingestion")
institutional_memory_element = InstitutionalMemoryMetadataClass(
    url=link_to_add,
    description=link_description,
    createStamp=current_timestamp,
)

# First we get the current editable dataset properties
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(config=DatahubClientConfig(server=gms_endpoint))
current_editable_properties = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="editableDatasetProperties",
    aspect_type=EditableDatasetPropertiesClass,
)

need_write = False
if current_editable_properties:
    if documentation_to_add != current_editable_properties.description:
        current_editable_properties.description = documentation_to_add
        need_write = True
else:
    # create a brand new editable dataset properties aspect
    current_editable_properties = EditableDatasetPropertiesClass(
        description=documentation_to_add
    )
    need_write = True

import logging
from typing import Optional

from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
    AuditStampClass,
    ChangeTypeClass,
    GlossaryTermAssociationClass,
    GlossaryTermsClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# First we get the current terms
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")
current_terms: Optional[GlossaryTermsClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="glossaryTerms",
    aspect_type=GlossaryTermsClass,
)

term_to_add = make_term_urn("Classification.HighlyConfidential")
term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add)
# an audit stamp that basically says we have no idea when these terms were added to this dataset
# change the time value to (time.time() * 1000) if you want to specify the current time of running this code as the time
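# A sketch of how these pieces could be combined (the unknown-time audit stamp
# follows the comment above; the wrapper call assumes a recent SDK that infers
# entity and change types):
from datahub.emitter.mcp import MetadataChangeProposalWrapper

unknown_audit_stamp = AuditStampClass(time=0, actor="urn:li:corpuser:ingestion")

need_write = False
if current_terms:
    if term_to_add not in [x.urn for x in current_terms.terms]:
        # terms exist, but this term is not present in the current terms
        current_terms.terms.append(term_association_to_add)
        need_write = True
else:
    # create a brand new glossary terms aspect
    current_terms = GlossaryTermsClass(
        terms=[term_association_to_add],
        auditStamp=unknown_audit_stamp,
    )
    need_write = True

if need_write:
    graph.emit(MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=current_terms))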