    qualified_name="process_xyz",
    typeName="process_with_steps",
    guid=-1003,
    relationshipAttributes={
        "steps": [
            step01.to_json(minimum=True),
            step02.to_json(minimum=True),
            step03.to_json(minimum=True),
        ]
    },
    attributes={
        "inputs": [input01.to_json(minimum=True)],
        "outputs": [output01.to_json(minimum=True)]
    })

# Create a batch of entities to be uploaded to Purview
batch = [step01, step02, step03, parent, input01, output01]

# Upload the types
typeResults = client.upload_typedefs(
    entityDefs=[processWithSteps, processSteps],
    relationshipDefs=[relationship],
    force_update=True)

# Upload the entities
results = client.upload_entities(batch)

# Print the results of the entities upload
print(json.dumps(results, indent=2))

print("Successfully created types and entities!")
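# NOTE: Not part of the original sample. A minimal sketch of reading the new
# process back by qualified name to confirm the upload; this assumes
# client.get_entity supports the qualifiedName/typeName lookup form.
fetched = client.get_entity(
    qualifiedName="process_xyz",
    typeName="process_with_steps")
print(json.dumps(fetched, indent=2))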
"type": column_lineage_process_entity.name, "name": "query", "isContainer": False, "cardinality": "SINGLE", "isLegacyAttribute": True }, endDef2={ "type": table_process_entity.name, "name": "columnLineages", "isContainer": True, "cardinality": "SET", "isLegacyAttribute": False }) # Output composite entity output = { "entityDefs": [ column_lineage_process_entity.to_json(), table_process_entity.to_json() ], "relationshipDefs": [table_process_column_lineage_relationship.to_json()] } print(json.dumps(output, indent=2)) input(">>>>Ready to upload?") upload_results = client.upload_typedefs(output) print(json.dumps(upload_results, indent=2))
"name": "columns", "isContainer": True, "cardinality": "SET", "isLegacyAttribute": False }, endDef2={ "type": "custom_spark_dataframe_column", "name": "dataframe", "isContainer": False, "cardinality": "SINGLE", "isLegacyAttribute": False }) typedef_results = client.upload_typedefs( { "entityDefs": [type_spark_df, type_spark_columns, type_spark_job], "relationshipDefs": [spark_column_to_df_relationship] }, force_update=True) print(typedef_results) # COMMAND ---------- # No we actually do some databricks work df = spark.read.csv("/databricks-datasets/flights/departuredelays.csv", header=True, inferSchema=True) # COMMAND ---------- # Do some transformations
"cardinality": "SET", "isLegacyAttribute": False }, endDef2={ "type": "custom_spark_dataframe_column", "name": "dataframe", "isContainer": False, "cardinality": "SINGLE", "isLegacyAttribute": False }) typedef_results = client.upload_typedefs( { "entityDefs": [ type_spark_df.to_json(), type_spark_columns.to_json(), type_spark_job.to_json() ], "relationshipDefs": [spark_column_to_df_relationship.to_json()] }, force_update=True) print(typedef_results) # COMMAND ---------- # No we actually do some databricks work df = spark.read.csv("/databricks-datasets/flights/departuredelays.csv", header=True, inferSchema=True) # COMMAND ----------
"name": "columns", "isContainer": True, "cardinality": "SET", "isLegacyAttribute": False, }, endDef2={ "type": column_entity_def.name, "name": "table", "isContainer": False, "cardinality": "SINGLE", "isLegacyAttribute": False }) # Upload the results upload_results = client.upload_typedefs( entityDefs=[column_entity_def, table_entity_def], relationshipDefs=[table_column_relationship], force_update=True) # With all the types and relationships defined, we can create entities. # We can use a GuidTracker to always get a unique negative number gt = GuidTracker() table_entity = AtlasEntity( name="sample_table", qualified_name="pyapacheatlas://sample_tablepyapacheatlas_custom_type", typeName="pyapacheatlas_demo_table", guid=gt.get_guid()) # Add two columns. They must include the "relationshipAttribute" attribute. column01 = AtlasEntity( name="column01",
# SETUP: This is just setting up the excel file for you
file_path = "./demo_custom_type_and_entity_upload.xlsx"
excel_config = ExcelConfiguration()
excel_reader = ExcelReader(excel_config)

# Create an empty excel template to be populated
excel_reader.make_template(file_path)

# This is just a helper to fill in some demo data
fill_in_type_workbook(file_path, excel_config)
fill_in_entity_workbook(file_path, excel_config)

# ACTUAL WORK: This parses our excel file and creates a batch to upload
typedefs = excel_reader.parse_entity_defs(file_path)
entities = excel_reader.parse_bulk_entities(file_path)

# This is what is getting sent to your Atlas server
# print(json.dumps(typedefs, indent=2))
# print(json.dumps(entities, indent=2))

type_results = client.upload_typedefs(typedefs, force_update=True)
entity_results = client.upload_entities(entities)

print(json.dumps(type_results, indent=2))
print("\n")
print(json.dumps(entity_results, indent=2))

print(
    "Completed type and bulk upload successfully!\nSearch for exampledataset to see your results."
)
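# NOTE: Not part of the original sample. A hedged sketch of checking the
# results from code instead of the portal search box; this assumes your
# PurviewClient version provides the search_entities generator.
for hit in client.search_entities("exampledataset"):
    print(hit.get("qualifiedName"), hit.get("entityType"))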
1, "valuesMaxCount": 1, "isUnique": False, "isIndexable": False, "includeInNotification": False }]) # Alternatively, you can get all atlas types via... # atlas_type_defs = client.get_all_typedefs() input(">>>>Ready to upload type definitions?") # Upload scaffolded type defs and view the results of upload _upload_typedef = client.upload_typedefs(atlas_type_defs, force_update=True) print(json.dumps(_upload_typedef, indent=2)) input(">>>>Review the above results to see what was uploaded.") # Generate the atlas entities! excel_results = excel_reader.parse_lineages(file_path, atlas_type_defs, use_column_mapping=True) print("Results from excel transformation") print(json.dumps(excel_results, indent=2)) input(">>>>Review the above results to see what your excel file contained")
attributes={ "inputs": [input01.to_json(minimum=True)], "outputs": [output01.to_json(minimum=True)] }) # Create a batch of entities to be uploaded as json/dicts batch = [ step01.to_json(), step02.to_json(), step03.to_json(), parent.to_json(), input01.to_json(), output01.to_json() ] # Upload the types typeResults = client.upload_typedefs( { "entityDefs": [processWithSteps.to_json(), processSteps.to_json()], "relationshipDefs": [relationship.to_json()] }, force_update=True) # Upload the entities results = client.upload_entities({"entities": batch}) # Print the results of the entities upload print(json.dumps(results, indent=2)) print("Successfully created types and entities!")
    account_name=os.environ.get("PURVIEW_NAME", ""),
    authentication=oauth
)

# We need a custom process entity type that contains the definition for
# a columnMapping attribute.
procType = EntityTypeDef(
    "ProcessWithColumnMapping",
    superTypes=["Process"],
    attributeDefs=[
        AtlasAttributeDef("columnMapping")
    ]
)

# Upload the type definition
type_results = client.upload_typedefs(entityDefs=[procType], force_update=True)
print(json.dumps(type_results, indent=2))

# Set up a guid tracker to make it easier to generate negative guids
gt = GuidTracker()

# Now we can create the entities; we will have two inputs and one output
colMapInput01 = AtlasEntity(
    "Input for Column Mapping",
    "hive_table",
    "pyapacheatlas://colMapInput01",
    guid=gt.get_guid()
)
colMapInput02 = AtlasEntity(
    "Second Input for Column Mapping",
    "hive_table",
# Set up the new entity types to capture delta lake tables and databricks jobs

# Databricks Table
databricks_table_type = EntityTypeDef(
    name="databricks_table",
    attributeDefs=[
        AtlasAttributeDef(name="format",
                          defaultValue="parquet",
                          isOptional=True).to_json(),
        AtlasAttributeDef(name="location", isOptional=True).to_json(),
        AtlasAttributeDef(name="num_files", isOptional=True).to_json(),
        AtlasAttributeDef(name="size", isOptional=True).to_json()
    ],
    superTypes=["DataSet"],
    options={"schemaElementAttribute": "columns"})

typedef_results = client.upload_typedefs(
    {"entityDefs": [databricks_table_type.to_json()]}, force_update=True)
print(typedef_results)

# COMMAND ----------

# DBTITLE 1,databricks-column entity type
# Databricks Column
databricks_column_type = EntityTypeDef(
    name="databricks_column",
    attributeDefs=[AtlasAttributeDef(name="data_type")],
    superTypes=["DataSet"],
)

typedef_results = client.upload_typedefs(
    {"entityDefs": [databricks_column_type.to_json()]}, force_update=True)
print(typedef_results)
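# NOTE: Not part of the original notebook cell. A minimal sketch of creating an
# instance of the databricks_table type defined above; the qualified name, guid,
# and attribute values here are illustrative only.
example_table = AtlasEntity(
    name="example_delta_table",
    typeName="databricks_table",
    qualified_name="pyapacheatlas://example_delta_table",
    guid=-100,
    attributes={"format": "delta", "location": "dbfs:/example/path"}
)
# It could then be registered with client.upload_entities([example_table]).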
# Create an entity type definition with three columns (column1, 2, 3)
# with column1 required.
edef = EntityTypeDef(
    name="pyapacheatlas_create_type_def_sample",
    attributeDefs=[
        AtlasAttributeDef("column1", typeName="string", isOptional=False),
        AtlasAttributeDef("column2", typeName="int"),
        AtlasAttributeDef("column3", typeName="array<string>", cardinality="SET"),
    ],
    superTypes=["DataSet"])

# Do the upload
results = client.upload_typedefs(entityDefs=[edef], force_update=True)

# Just for demonstration purposes, get the entity type def.
get_results = client.get_typedef(
    TypeCategory.ENTITY,
    name="pyapacheatlas_create_type_def_sample")

print("# Results from getting created type def:")
print(json.dumps(get_results, indent=2))

# Creating an instance of this custom type
actual_entity = AtlasEntity(
    name="instance_of_pyapacheatlas_create_type_def_sample",
    qualified_name="pyapacheatlas://instance_of_pyapacheatlas_create_type_def_sample",
    typeName="pyapacheatlas_create_type_def_sample",
    attributes={
        "column1": "abc",
import os
import json

from pyapacheatlas.auth import ServicePrincipalAuthentication
from pyapacheatlas.core import PurviewClient, AtlasEntity, AtlasProcess

print(os.environ.get('AZURE_TENANT_ID', ''))

oauth = ServicePrincipalAuthentication(
    tenant_id=os.environ.get('AZURE_TENANT_ID', ''),
    client_id=os.environ.get('AZURE_CLIENT_ID', ''),
    client_secret=os.environ.get('AZURE_CLIENT_SECRET', ''))

client = PurviewClient(account_name=os.environ.get('PURVIEW_CATALOG_NAME', ''),
                       authentication=oauth)

client.upload_typedefs(json.load(
    open('./pyapacheatlas_mysql_typedefs_v2.json', 'r')),
    force_update=True)
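# NOTE: Not part of the original script. The contents of the typedefs file are
# not shown here; as a hedged illustration only, upload_typedefs accepts an
# AtlasTypesDef-shaped dict along these lines (the type name below is
# invented for the example, not taken from the file):
example_typedefs = {
    "entityDefs": [
        {
            "name": "example_mysql_table",
            "superTypes": ["DataSet"],
            "attributeDefs": []
        }
    ],
    "relationshipDefs": []
}
# The same dict shape could be passed directly:
# client.upload_typedefs(example_typedefs, force_update=True)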