    } for t in terms
}

# TODO: Apply your own business logic to massage the terms in different ways,
# based on how you might say "Master Data" in the glossary but have it
# on a table as "m-data" (as an example).

# For every term, search across all dataset entities (so no process entities).
for term in terms:
    primary_display_text = term["displayText"]
    term_guid = term["termGuid"]
    search_query = client.search_entities(
        query=f"{primary_display_text}*",
        search_filter={"typeName": "DataSet", "includeSubTypes": True}
    )

    lowest_seen_score = 99
    threshold = args.threshold
    search_intermediate_results = []

    # Iterate over the search results for each term.
    # Discover all of the entities that are relevant by cutting the search
    # off at a specific relevance threshold (default is 3.0).
    for batch in search_query:
        for entity in batch:
            if entity["typeName"] == "AtlasGlossaryTerm":
                continue
            search_score = entity['@search.score']
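            # Hedged sketch (not part of the original sample, which is cut off
            # here): one plausible way to apply the relevance cutoff described
            # above, assuming matches should only be collected while their
            # score meets the threshold, while tracking the lowest score seen.
            if search_score >= threshold:
                lowest_seen_score = min(lowest_seen_score, search_score)
                search_intermediate_results.append(entity)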
auth = ServicePrincipalAuthentication(
    tenant_id=v_tenant_id,
    client_id=v_client_id,
    client_secret=v_client_secret
)

# Create a client to connect to your service.
client = PurviewClient(
    account_name=v_data_catalog_name,
    authentication=auth
)
guid = GuidTracker()

# COMMAND ----------

# Search for the entity you want to delete
import json
import os

search = client.search_entities("loan_risk_data.csv")

for page in search:
    print(json.dumps(page, indent=2))

# COMMAND ----------

# MAGIC %md
# MAGIC ##### 3. Bulk delete up to 50 entities

# COMMAND ----------

import json
import os

search = client.search_entities("databricksassets")

for page in search:
    print(page["id"])
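# COMMAND ----------

# Hedged sketch (not part of the original notebook): one way the bulk delete
# itself could be performed, assuming client.delete_entity accepts a list of
# guids. The 50-item batching mirrors the limit noted in the markdown cell.
guids_to_delete = [page["id"] for page in client.search_entities("databricksassets")]

for i in range(0, len(guids_to_delete), 50):
    batch = guids_to_delete[i:i + 50]
    results = client.delete_entity(guid=batch)
    print(json.dumps(results, indent=2))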
The response is a Python generator that allows you to page through the
search results. For each page in the search results, you have a list of
search entities that can be iterated over.
"""

# Authenticate against your Atlas server
oauth = ServicePrincipalAuthentication(
    tenant_id=os.environ.get("TENANT_ID", ""),
    client_id=os.environ.get("CLIENT_ID", ""),
    client_secret=os.environ.get("CLIENT_SECRET", "")
)
client = PurviewClient(
    account_name=os.environ.get("PURVIEW_NAME", ""),
    authentication=oauth
)

# Assuming you have an entity with the word "demo" in the name or description
search = client.search_entities("demo")

# Alternative search methods include...

# Searching across a given attribute:
# Search only the name (or qualifiedName) field for values beginning with "demo".
# You must include a wildcard character (*) at the end; wildcards at the
# beginning or middle are not supported.
# search = client.search_entities("name:demo*")
# search = client.search_entities("qualifiedName:demo*")

# Searching within a given type, optionally including subtypes:
# Provide a search filter that specifies the typeName and whether
# you want to include subtypes of that type or not.
# filter_setup = {"typeName": "DataSet", "includeSubTypes": True}
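# Minimal usage sketch (not part of the original sample): iterate over the
# generator's pages and the entities inside each page, as described in the
# docstring above. json.dumps is used only to make the output readable.
import json

for page in search:
    for entity in page:
        print(json.dumps(entity, indent=2))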
for dir in dirs.rdd.collect():
    # print(dir.notebook_path.split('/')[-1])
    nbname = dir.notebook_path.split('/')[-1]
    df = spark.read.text(notebook_mnt + "/" + nbname + ".py")
    cnt = df.filter("lower(value) like '%create %' or lower(value) like '%merge %' or lower(value) like '%insert %' or lower(value) like '%spark.write%'").count()
    if cnt > 0:
        notebook_list.append(dir.notebook_path)

notebook_list

# COMMAND ----------

# Get Databricks tables in Purview
import json
import os

tables = []
search = client.search_entities("databricks_table")
for page in search:
    tblln = []
    tname = page["qualifiedName"].split('/')[-1]
    tblln.insert(0, tname.lower())
    tblln.insert(1, page["qualifiedName"])
    tables.append(tblln)

# COMMAND ----------

# Get ADLS files in Purview
import json
import os

adlsfiles = []
filter_setup = {"typeName": "azure_datalake_gen2_path"}
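# Hedged sketch (not part of the original notebook, which is cut off here):
# the ADLS file collection presumably mirrors the table block above, passing
# filter_setup as the search filter and recording the lowercased file name
# plus its qualifiedName. The wildcard query ("*") is an assumption used only
# to illustrate the pattern.
search = client.search_entities("*", search_filter=filter_setup)
for page in search:
    fname = page["qualifiedName"].split('/')[-1]
    adlsfiles.append([fname.lower(), page["qualifiedName"]])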