def processItem(anItem, itemCount): ''' put your code here - that does something with the item ''' itemId = anItem["id"] itemName = edcutils.getFactValue(anItem, "core.name") print("\titem " + str(itemCount) + "=" + itemId + " name=" + itemName)
def processItem(anItem, itemCount): ''' put your code here - that does something with the item for this example, just print the it and name @note python 2.7 does not allow us to specify the parameter type... ''' itemId = anItem["id"] itemName = edcutils.getFactValue(anItem, "core.name") print("\titem " + str(itemCount) + "=" + itemId + " name=" + itemName)
def getSchemaContents(schemaName, schemaType, resourceName): """ given a schema name, schema class type (e.g. hanadb is different) and resource name, find the schema object then execute a relationships call to get the schema tables & columns (parent/child links) note: some models separate primary key columns from regular columns note: some models have different relationships (e.g. sap hana db) returns a dictionary of all tables & columns for the schema & the id of the schema object key=table val=tableid key=table.column val=columnid """ print("\tgetSchemaContents for:" + schemaName + " resource=" + resourceName) # schemaDict returned key=TABLE.COLUMN value=column id schemaDict = {} tableNames = {} # url = catalogServer + "/access/2/catalog/data/objects" url = edcHelper.baseUrl + "/access/2/catalog/data/objects" query = (f'+core.resourceName:"{resourceName}"' + f' +core.classType:"{schemaType}"' + f' +core.name:"{schemaName}"') parameters = {"q": query, "offset": 0, "pageSize": 1} print("\tquery=" + query) schemaId = None tableCount = 0 columnCount = 0 # make the call to find the schema object response = edcHelper.session.get(url, params=parameters, timeout=3) print(f"session get finished: {response.status_code}") rc = response.status_code if rc != 200: print("error reading object: rc=" + str(rc) + " response:" + str(response.json)) if rc == 401: print("\t401:Possible Missing/bad credentials") print(str(response)) return # get the total # of objects returned (first part of the json resultset) totalObjects = response.json()["metadata"]["totalCount"] print("\tobjects returned: " + str(totalObjects)) for item in response.json()["items"]: schemaId = item["id"] schemaName = edcutils.getFactValue(item, "core.name") # get the tables & columns print("\tfound schema: " + schemaName + " id=" + schemaId) lineageURL = edcHelper.baseUrl + "/access/2/catalog/data/relationships" lineageParms = { "seed": schemaId, "association": "core.ParentChild", "depth": "2", "direction": "OUT", "includeAttribute": {"core.name", "core.classType"}, "includeTerms": "false", "removeDuplicateAggregateLinks": "false", } print("\tGET child rels for schema: " + lineageURL + " parms=" + str(lineageParms)) # get using uid/pwd lineageResp = edcHelper.session.get( lineageURL, params=lineageParms, ) lineageStatus = lineageResp.status_code print("\tlineage resp=" + str(lineageStatus)) if lineageStatus != 200: print(f"error getting schema contents (tables) rc={rc}" f" response:{response.json}") if rc == 401: print("\t401:Possible Missing/bad credentials") print(str(response)) return if lineageResp.text.startswith("{items:"): # bug (10.2.0 & 10.2.1) - the items collection should be "items" lineageJson = lineageResp.text.replace("items", '"items"', 1) else: lineageJson = lineageResp.text # relsJson = json.loads(lineageJson.replace('items', '"items"')) relsJson = json.loads(lineageJson) # print(len(relsJson)) for lineageItem in relsJson["items"]: # print('\t\t' + str(lineageItem)) inId = lineageItem.get("inId") outId = lineageItem.get("outId") # print('new inId===' + inId + " outId=" + outId) # print(edcutils.getFactValue(lineageItem["inEmbedded"], "core.name")) assocId = lineageItem.get("associationId") # print("\t\t" + inId + " assoc=" + assocId) # if assocId=='com.infa.ldm.relational.SchemaTable': if assocId.endswith(".SchemaTable"): # note - custom lineage does not need table and column # count the tables & store table names tableCount += 1 # tableName = inId.split('/')[-1] tableName = edcutils.getFactValue(lineageItem["inEmbedded"], "core.name").lower() # store the table name (for lookup when processing the columns) # key=id, val=name tableNames[inId] = tableName schemaDict[tableName] = inId # if assocId=='com.infa.ldm.relational.TableColumn': if assocId.endswith(".TableColumn") or assocId.endswith( ".TablePrimaryKeyColumn"): # columnName = inId.split('/')[-1] columnCount += 1 columnName = edcutils.getFactValue(lineageItem["inEmbedded"], "core.name").lower() tableName = tableNames[outId].lower() # print("column=" + tableName + "." + columnName) schemaDict[tableName + "." + columnName] = inId print("\tgetSchema: returning " + str(columnCount) + " columns, in " + str(tableCount) + " tables") return schemaDict, schemaId
def main(): """ main starts here - run the query processing all items - find all com.infa.ldm.relational.ExternalDatabase objects - for each - call processExternalDB """ print("ExternalDBLinker started") start_time = time.time() tableLinksCreated = 0 columnLinksCreated = 0 errorsFound = 0 """ logger = logging.getLogger('externalDBLinker') logger.setLevel('DEBUG') file_log_handler = logging.FileHandler('logfile.log') logger.addHandler(file_log_handler) stderr_log_handler = logging.StreamHandler() logger.addHandler(stderr_log_handler) formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') file_log_handler.setFormatter(formatter) stderr_log_handler.setFormatter(formatter) logger.info('Info message') logger.error('Error message') """ columnHeader = [ "Association", "From Connection", "To Connection", "From Object", "To Object", ] if str(platform.python_version()).startswith("2.7"): fCSVFile = open(outputFile, "w") else: fCSVFile = open(outputFile, "w", newline="", encoding="utf-8") print("custom lineage file initialized. " + outputFile) colWriter = csv.writer(fCSVFile) colWriter.writerow(columnHeader) print("executing query to find all external DB objects: " + url + " q=" + parameters.get("q")) resp = requests.get(url, params=parameters, headers=header, auth=HTTPBasicAuth(uid, pwd)) status = resp.status_code print("extDB query rc=" + str(status)) resultJson = resp.json() total = resultJson["metadata"]["totalCount"] print("external db objects found... " + str(total)) currentDB = 0 # for each externalDatabase object for extDBItem in resultJson["items"]: itemId = extDBItem["id"] currentDB += 1 print("processing database: " + itemId + " " + str(currentDB) + " of " + str(total)) itemType = edcutils.getFactValue(extDBItem, "core.classType") itemName = edcutils.getFactValue(extDBItem, "core.name") resourceName = edcutils.getFactValue(extDBItem, "core.resourceName") resourceType = edcutils.getFactValue(extDBItem, "core.resourceType") tabLinks, colLinks, errors = processExternalDB(itemId, itemType, itemName, resourceType, resourceName, colWriter) tableLinksCreated += tabLinks columnLinksCreated += colLinks errorsFound += errors sys.stdout.flush() fCSVFile.close print("finished!") print("table links: created=" + str(tableLinksCreated)) print("column links: created=" + str(columnLinksCreated)) print("errors found: " + str(errorsFound)) # end of main() print("Finished - run time = %s seconds ---" % (time.time() - start_time)) # call the resource create/update/load function to get the data imported into EDC if executeEDCImport: edcutils.createOrUpdateAndExecuteResource( catalogServer, uid, pwd, lineageResourceName, lineageResourceTemplate, csvFileName, outputFile, waitToComplete, )
def processExternalDB(dbId, classType, dbName, resType, resName, colWriter): """ dbId = the id of the external database object classType = the classtype of the external database oject dbNAme - the name of the external db object resType - the resource type e.g Oracle, SQLServer that created the external db object resName - the resource name that created the external db object colWriter - the csv File object to write any lineage entries process outline:- get lineage to the table level (2 parent/child hops) from the external database for each table, find the actual table that should be linked (case insensitive) if only 1 table was found (otherwise - print an error message) link the table and all columns """ dbUnknown = False # counters for # of links created tabLinks = 0 colLinks = 0 errors = 0 # 'External' can be used if the database name is not known (sqlserver use case) if dbName == "External": dbUnknown = True print("\tthe database for the externally referenced object is unknown") # note: if the database type is oracle, we can't trust the name of the database # since it will contain the dblink name, not the externally referenced db # so we will treat it as unknown and just try to find the schema/table ref'd if resType == "Oracle": dbUnknown = True print("\ttype=" + classType + " name=" + dbName + " id=" + dbId + " unknownDB:" + str(dbUnknown)) # get the lineage for the database object lineageURL = catalogServer + "/access/2/catalog/data/relationships" lineageParms = { "seed": dbId, "association": "core.ParentChild", "depth": "2", "direction": "OUT", "includeAttribute": {"core.name", "core.classType"}, "includeTerms": "false", "removeDuplicateAggregateLinks": "false", } print("\tLineage query for: " + dbName + " params=" + str(lineageParms)) lineageResp = requests.get(lineageURL, params=lineageParms, headers=header, auth=HTTPBasicAuth(uid, pwd)) lineageStatus = lineageResp.status_code print("\tlineage rc=" + str(lineageStatus)) lineageJson = lineageResp.text # bug in the relationships api call - the items collection sould be # "items" (with quotes) if lineageJson.startswith("{items"): # bug in 10.2.1 and before # replace items with "items" - for just the first occurrence # (note: this is fixed after 10.2.1 lineageJson = json.loads(lineageJson.replace("items", '"items"', 1)) else: # 10.2.1u1 + json is escaped properly lineageJson = json.loads(lineageJson) # for each item in the lineage resultset for lineageItem in lineageJson["items"]: inId = lineageItem.get("inId") assocId = lineageItem.get("associationId") schemaName = "" # print("\t" + inId + " assoc=" + assocId) if assocId == "com.infa.ldm.relational.ExternalSchemaTable": # find the table... schemaName = inId.split("/")[-2] inEmbedded = lineageItem.get("inEmbedded") tableName = edcutils.getFactValue(inEmbedded, "core.name") print("\tprocessing table=" + tableName + " schema=" + schemaName + " db=" + dbName + " id=" + inId) # format the query to find the actual table q = ( "core.classType:(com.infa.ldm.relational.Table or com.infa.ldm.relational.View) and core.name_lc_exact:" + tableName) if dbUnknown and schemaName == "": q = q + " and core.resourceName:" + resName # if dbUnknown==False: # q=q+ ' and ' + dbName q = q + ' and core.resourceType:"' + resType + '"' tableSearchParms = {"q": q, "offset": 0, "pageSize": 100} print("\t\tquery=" + str(tableSearchParms)) # find the table - with name tableName tResp = requests.get( url, params=tableSearchParms, headers=header, auth=HTTPBasicAuth(uid, pwd), ) tStatus = tResp.status_code print("\t\tquery rc=" + str(tStatus)) # print("\t\t\t\t" + str(tResp.json())) foundTabId = "" tableMatchCount = 0 # possible matching tables for tableItem in tResp.json()["items"]: fromTableId = tableItem["id"] fromSchemaName = fromTableId.split("/")[-2].lower() # foundTabId='' # foundTabId = tableItem["id"] # print("rs-" + fromTableId + " compared with " + inId); if fromTableId != inId and fromTableId.count("/") == 4: # filter out comparing against itself + # counting the / chars - 4 = normal db, 5=external db (disregard) print("\t\tchecking " + fromTableId) # print("could be this one...." + foundTabId + " << " + inId) theName = fromTableId.split("/")[-1] # the schema could be empty - in that case it should match dbo # unless we have a way of knowing what the default schema is if theName.lower() == tableName.lower() and ( schemaName.lower() == fromSchemaName.lower() or (schemaName == "" and fromSchemaName == "dbo")): tableMatchCount += 1 foundTabId = fromTableId print("\t\ttable name matches..." + theName + "==" + tableName + " " + inId + " " + foundTabId + " count/" + str(fromTableId.count("/"))) else: print( "\t\tno match...schema match 1:" + theName + " 2:" + tableName + " 3:" + schemaName + " 4:" + fromSchemaName + ":" + str(schemaName.lower() == fromSchemaName.lower())) # else: # print("skipping this one"); print("\t\ttotal matching tables=" + str(tableMatchCount) + " inId:" + inId) if tableMatchCount == 1: # ok - we have a single table to match # link at the table level - then get the columns and link them too # print("linking from actual tab - to external ref tab") print("\t\tlinking tables " + foundTabId + " -->> " + inId) # get the columns for the ext Table # (will be a reduced set - only the linked columns) extCols = getColumnsForTable(inId) tabCols = getColumnsForTable(foundTabId) # link the table level edcutils.exportLineageLink(foundTabId, inId, "core.DataSetDataFlow", colWriter) tabLinks += 1 tabColsLinked = 0 # match the columns on the left/right side for toCol, toId in extCols.items(): # check if the toCol (the name) exists in the tabCols dict # print('\t\t\tchecking toCol:' + toCol + tabCols.get(toCol)) fromId = tabCols.get(toCol) if fromId is not None: # print('\t\t\tlinking columns...' + fromId + ' --->>>' + toId) edcutils.exportLineageLink(fromId, toId, "core.DirectionalDataFlow", colWriter) tabColsLinked += 1 colLinks += 1 else: print("\t\t\tError: cannot find column " + toCol + " in table " + inId) errors += 1 print("\t\t\text cols:" + str(len(extCols)) + " tablCols:" + str(len(tabCols)) + " linked=" + str(tabColsLinked)) # print("\t\t\tcolumns linked=" + str(tabColsLinked)) else: print("\t\tmutlple possible matches found (" + str(tableMatchCount) + ") no links will be created") # flush the console buffer - for tailing the stdout log sys.stdout.flush() print("external database: " + dbName + " processed: tab/col links created: " + str(tabLinks) + "/" + str(colLinks) + " errors:" + str(errors)) print("") return tabLinks, colLinks, errors
def getSchemaContents(schemaName, schemaType, resourceName): """ given a schema name, schema class type (e.g. hanadb is different) and resource name find the schema object then execute a relationships call to get the schema tables & columns (parent/child links) note: some models separate primary key columns from regular columns note: some models have different relationships (e.g. sap hana db) returns a dictionary of all tables & columns for the schema & the id of the schema object key=table val=tableid key=table.column val=columnid """ print('\tgetSchemaContents for:' + schemaName + " resource=" + resourceName) # schemaDict returned key=TABLE.COLUMN value=column id schemaDict = {} tableNames = {} url = catalogServer + '/access/2/catalog/data/objects' query = "core.resourceName:" + resourceName + " and core.classType:" + schemaType + " and core.name_lc_exact:" + schemaName parameters = {'q': query, 'offset': 0, 'pageSize': 1} # header using uid/pwd (no Authorization) header = {"Accept": "application/json"} # header using Authorization - no need to use uid/pwd in the get call #header = {"Accept": "application/json", "Authorization": authCredentials} print("\tquery=" + query) #print("\theader=" + str(header)) schemaId = None tableCount = 0 columnCount = 0 # make the call to find the schema object response = requests.get(url, params=parameters, headers=header, auth=HTTPBasicAuth(uid, pwd)) #response = requests.get(url,params=parameters,headers=header) rc = response.status_code if rc != 200: print("error reading object: rc=" + str(rc) + " response:" + str(response.json)) if rc == 401: print( "\t401:Possible Missing/bad credentials - or server not found/responding" ) print(str(response)) return # get the total # of objects returned (first part of the json resultset) totalObjects = response.json()['metadata']['totalCount'] print("\tobjects returned: " + str(totalObjects)) for item in response.json()['items']: schemaId = item["id"] schemaName = edcutils.getFactValue(item, 'core.name') # get the tables & columns print("\tfound schema: " + schemaName + " id=" + schemaId) lineageURL = catalogServer + '/access/2/catalog/data/relationships' lineageParms = { "seed": schemaId, "association": "core.ParentChild", "depth": "2", "direction": "OUT", "includeAttribute": {'core.name', 'core.classType'}, "includeTerms": "false", "removeDuplicateAggregateLinks": "false" } print("\tGET child rels for schema: " + lineageURL + " parms=" + str(lineageParms)) # get using uid/pwd lineageResp = requests.get(lineageURL, params=lineageParms, headers=header, auth=HTTPBasicAuth(uid, pwd)) # credentials are in the header #lineageResp = requests.get(lineageURL,params=lineageParms,headers=header) lineageStatus = lineageResp.status_code print("\tlineage resp=" + str(lineageStatus)) if lineageStatus != 200: print("error getting schema contents (tables) rc=" + str(rc) + " response:" + str(response.json)) if rc == 401: print( "\t401:Possible Missing/bad credentials - or server not found/responding" ) print(str(response)) return if lineageResp.text.startswith('{items:'): # bug (10.2.0 & 10.2.1) - the items collection should be "items" lineageJson = lineageResp.text.replace('items', '"items"', 1) else: lineageJson = lineageResp.text #relsJson = json.loads(lineageJson.replace('items', '"items"')) relsJson = json.loads(lineageJson) #print(len(relsJson)) for lineageItem in relsJson["items"]: #print('\t\t' + str(lineageItem)) inId = lineageItem.get("inId") outId = lineageItem.get("outId") #print('new inId===' + inId + " outId=" + outId) #print(edcutils.getFactValue(lineageItem["inEmbedded"], "core.name")) assocId = lineageItem.get("associationId") #print("\t\t" + inId + " assoc=" + assocId) #if assocId=='com.infa.ldm.relational.SchemaTable': if assocId.endswith('.SchemaTable'): # note - custom lineage does not need table and column - count the tables & store table names tableCount += 1 #tableName = inId.split('/')[-1] tableName = edcutils.getFactValue(lineageItem["inEmbedded"], "core.name").lower() # store the table name (for lookup when processing the columns) key-id, val=name tableNames[inId] = tableName schemaDict[tableName] = inId #if assocId=='com.infa.ldm.relational.TableColumn': if assocId.endswith('.TableColumn') or assocId.endswith( ".TablePrimaryKeyColumn"): #columnName = inId.split('/')[-1] columnCount += 1 columnName = edcutils.getFactValue(lineageItem["inEmbedded"], "core.name").lower() tableName = tableNames[outId].lower() #print("column=" + tableName + "." + columnName) schemaDict[tableName + "." + columnName] = inId print("\tgetSchema: returning " + str(columnCount) + " columns, in " + str(tableCount) + " tables") return schemaDict, schemaId
def main(): """ main starts here - run the query processing all items - find all com.infa.ldm.relational.ExternalDatabase objects - for each - call processExternalDB """ print("ExternalDBLinker started") start_time = time.time() args = args, unknown = parser.parse_known_args() # setup edc session and catalog url - with auth in the session header, # by using system vars or command-line args edcHelper.initUrlAndSessionFromEDCSettings() print(f"command-line args parsed = {args} ") tableLinksCreated = 0 columnLinksCreated = 0 errorsFound = 0 columnHeader = [ "Association", "From Connection", "To Connection", "From Object", "To Object", ] outputFile = args.outDir + "/" + args.csvFileName fullpath = os.path.abspath(outputFile) fCSVFile = open(outputFile, "w", newline="", encoding="utf-8") from pathlib import Path print("custom lineage file initialized. " + outputFile + " RELATIVE=" +fullpath) colWriter = csv.writer(fCSVFile) colWriter.writerow(columnHeader) parameters = { "q": "core.classType:com.infa.ldm.relational.ExternalDatabase", "offset": 0, "pageSize": 1000, } url = edcHelper.baseUrl + "/access/2/catalog/data/objects" print( "executing query to find all external DB objects: " f"{url} q={parameters.get('q')} {parameters}" ) resp = edcHelper.session.get(url, params=parameters) status = resp.status_code print("extDB query rc=" + str(status)) if status != 200: print(f"error - expecting 200 rc, got {status} - message={resp.json()}") return resultJson = resp.json() total = resultJson["metadata"]["totalCount"] print(f"external db objects found... {total}") currentDB = 0 # for each externalDatabase object for extDBItem in resultJson["items"]: itemId = extDBItem["id"] currentDB += 1 print(f"processing database: {itemId} {currentDB} of {total}") itemType = edcutils.getFactValue(extDBItem, "core.classType") itemName = edcutils.getFactValue(extDBItem, "core.name") resourceName = edcutils.getFactValue(extDBItem, "core.resourceName") resourceType = edcutils.getFactValue(extDBItem, "core.resourceType") tabLinks, colLinks, errors = processExternalDB( itemId, itemType, itemName, resourceType, resourceName, colWriter ) tableLinksCreated += tabLinks columnLinksCreated += colLinks errorsFound += errors sys.stdout.flush() fCSVFile.close() print("finished!") print("table links: created=" + str(tableLinksCreated)) print("column links: created=" + str(columnLinksCreated)) print("errors found: " + str(errorsFound)) # end of main() print("Finished - run time = %s seconds ---" % (time.time() - start_time)) # call the resource create/update/load function to get the data imported into EDC if args.edcimport and tableLinksCreated > 0: edcutils.createOrUpdateAndExecuteResourceUsingSession( edcHelper.baseUrl, edcHelper.session, args.lineageResourceName, args.lineageResourceTemplate, args.csvFileName, fullpath, waitToComplete, "LineageScanner" )
def main(): ''' main starts here - run the query processing all items - find all com.infa.ldm.relational.ExternalDatabase objects - for each - call processExternalDB ''' print("ExternalDBLinker started") start_time = time.time() tableLinksCreated = 0 columnLinksCreated = 0 errorsFound = 0 columnHeader = [ "Association", "From Connection", "To Connection", "From Object", "To Object" ] if str(platform.python_version()).startswith("2.7"): fCSVFile = open(csvFileName, "w") else: fCSVFile = open(csvFileName, "w", newline='', encoding='utf-8') print("custom lineage file initialized. " + csvFileName) colWriter = csv.writer(fCSVFile) colWriter.writerow(columnHeader) print("executing query to find all external DB objects: " + url + ' q=' + parameters.get('q')) resp = requests.get(url, params=parameters, headers=header, auth=HTTPBasicAuth(uid, pwd)) status = resp.status_code print('extDB query rc=' + str(status)) resultJson = resp.json() total = resultJson['metadata']['totalCount'] print("external db objects found... " + str(total)) currentDB = 0 # for each externalDatabase object for extDBItem in resultJson["items"]: itemId = extDBItem["id"] currentDB += 1 print("processing database: " + itemId + " " + str(currentDB) + ' of ' + str(total)) itemType = edcutils.getFactValue(extDBItem, "core.classType") itemName = edcutils.getFactValue(extDBItem, "core.name") resourceName = edcutils.getFactValue(extDBItem, "core.resourceName") resourceType = edcutils.getFactValue(extDBItem, "core.resourceType") tabLinks, colLinks, errors = processExternalDB(itemId, itemType, itemName, resourceType, resourceName, colWriter) tableLinksCreated += tabLinks columnLinksCreated += colLinks errorsFound += errors fCSVFile.close print("finished!") print("table links: created=" + str(tableLinksCreated)) print("column links: created=" + str(columnLinksCreated)) print("errors found: " + str(errorsFound)) # end of main() print("Finished - run time = %s seconds ---" % (time.time() - start_time))
def getSchemaContents(schemaName, schemaType, resourceName, substitute_chars): """ given a schema name, schema class type (e.g. hanadb is different) and resource name, find the schema object then execute a relationships call to get the schema tables & columns (parent/child links) note: some models separate primary key columns from regular columns note: some models have different relationships (e.g. sap hana db) returns a dictionary of all tables & columns for the schema & the id of the schema object key=table val=tableid key=table.column val=columnid """ print("\tgetSchemaContents for:" + schemaName + " resource=" + resourceName) # schemaDict returned key=TABLE.COLUMN value=column id schemaDict = {} tableNames = {} # url = catalogServer + "/access/2/catalog/data/objects" url = edcHelper.baseUrl + "/access/2/catalog/data/objects" query = (f'+core.resourceName:"{resourceName}"' + f' +core.classType:"{schemaType}"' + f' +core.name:"{schemaName}"' ) parameters = {"q": query, "offset": 0, "pageSize": 1} print("\tquery=" + query) schemaId = None tableCount = 0 columnCount = 0 # make the call to find the schema object response = edcHelper.session.get(url, params=parameters, timeout=3) print(f"session get finished: {response.status_code}") rc = response.status_code if rc != 200: print("error reading object: rc=" + str(rc) + " response:" + str(response.json)) if rc == 401: print( "\t401:Possible Missing/bad credentials" ) print(str(response)) return # get the total # of objects returned (first part of the json resultset) totalObjects = response.json()["metadata"]["totalCount"] print("\tobjects returned: " + str(totalObjects)) for item in response.json()["items"]: schemaId = item["id"] schemaNameFound = edcutils.getFactValue(item, "core.name") # get the tables & columns # check to see if schemaName found is an exact match # (for situations where there are other schemas with the same prefix) # e.g. search for "PUBLIC" will also return "PUBLIC_TEST" print("\tfound schema: " + schemaNameFound + " id=" + schemaId + " type=" + schemaType) if schemaNameFound != schemaName: print(f"schema {schemaNameFound} does not exactly match {schemaName}, skipping") continue lineageURL = edcHelper.baseUrl + "/access/2/catalog/data/relationships" depth = 2 # for SAP Hana Calculation views - the package might contain sub-packages # set the depth to >2 if schemaType == "com.infa.ldm.relational.SAPHanaPackage": depth = 10 print(f"\tNote: SAP Hana Package used for datasource (schema)\n\t\tsetting depth={depth} for relationships query") lineageParms = { "seed": schemaId, "association": "core.ParentChild", "depth": depth, "direction": "OUT", "includeAttribute": {"core.name", "core.classType"}, "includeTerms": "false", "removeDuplicateAggregateLinks": "false", } print( "\tGET child rels for schema: " + lineageURL + " parms=" + str(lineageParms) ) # get using uid/pwd lineageResp = edcHelper.session.get( lineageURL, params=lineageParms, ) lineageStatus = lineageResp.status_code print("\tlineage resp=" + str(lineageStatus)) if lineageStatus != 200: print( f"error getting schema contents (tables) rc={rc}" f" response:{response.json}" ) if rc == 401: print( "\t401:Possible Missing/bad credentials" ) print(str(response)) return if lineageResp.text.startswith("{items:"): # bug (10.2.0 & 10.2.1) - the items collection should be "items" lineageJson = lineageResp.text.replace("items", '"items"', 1) else: lineageJson = lineageResp.text # relsJson = json.loads(lineageJson.replace('items', '"items"')) relsJson = json.loads(lineageJson) # print(len(relsJson)) for lineageItem in relsJson["items"]: # print('\t\t' + str(lineageItem)) inId = lineageItem.get("inId") outId = lineageItem.get("outId") # print('new inId===' + inId + " outId=" + outId) # print(edcutils.getFactValue(lineageItem["inEmbedded"], "core.name")) assocId = lineageItem.get("associationId") # print("\t\t" + inId + " assoc=" + assocId) # if assocId=='com.infa.ldm.relational.SchemaTable': if (assocId.endswith(".SchemaTable") or assocId == "com.infa.adapter.snowflake.PackageFlatRecord_table" or assocId == "com.infa.ldm.relational.SAPHanaPackageCalculationView"): # note - custom lineage does not need table and column # count the tables & store table names tableCount += 1 # tableName = inId.split('/')[-1] # tableName = edcutils.getFactValue( # lineageItem["inEmbedded"], "core.name" # ).lower() # store the table name (for lookup when processing the columns) # key=id, val=name # tableNames[inId] = tableName # schemaDict[tableName] = inId # if assocId=='com.infa.ldm.relational.TableColumn': if (assocId.endswith(".TableColumn") or assocId.endswith(".TablePrimaryKeyColumn") or assocId == "com.infa.adapter.snowflake.FlatRecord_tableField" or assocId == "com.infa.ldm.relational.CalculationViewAttribute"): # columnName = inId.split('/')[-1] columnCount += 1 columnName = edcutils.getFactValue( lineageItem["inEmbedded"], "core.name" ).lower() # check if key exists?? possible bug or different order from relationships # check for substitutions columnName = (substitute_name(columnName, substitute_chars)).lower() # get table name from id (split and get the parent - won't work if a table / in the name) tableName = outId.split("/")[-1].lower() # print(f"table::: {outId}") # print(outId.split("/")) # tableName = tableNames[outId].lower() # print("column=" + tableName + "." + columnName) schemaDict[tableName + "." + columnName] = inId schemaDict[tableName] = outId print( "\tgetSchema: returning " + str(columnCount) + " columns, in " + str(tableCount) + " tables" ) return schemaDict, schemaId
def main(): """ main starts here - run the query processing all items """ print("ExternalDBLinker started") start_time = time.time() args = args, unknown = parser.parse_known_args() # setup edc session and catalog url - with auth in the session header, # by using system vars or command-line args edcHelper.initUrlAndSessionFromEDCSettings() print(f"command-line args parsed = {args} ") tableLinksCreated = 0 columnLinksCreated = 0 errorsFound = 0 columnHeader = [ "Association", "From Connection", "To Connection", "From Object", "To Object", ] outputFile = args.outDir + "/" + args.csvFileName fullpath = os.path.abspath(outputFile) fCSVFile = open(outputFile, "w", newline="", encoding="utf-8") from pathlib import Path print("custom lineage file initialized. " + outputFile + " RELATIVE=" +fullpath) colWriter = csv.writer(fCSVFile) colWriter.writerow(columnHeader) query = f"+core.classType:com.infa.ldm.relational.View +core.name:\"{args.viewname}\"" parameters = { "q": query, "offset": 0, "pageSize": 1000, } url = edcHelper.baseUrl + "/access/2/catalog/data/objects" print( f"executing query to find view named {args.viewname}: " f"{url} q={parameters.get('q')} {parameters}" ) resp = edcHelper.session.get(url, params=parameters) status = resp.status_code print("extDB query rc=" + str(status)) if status != 200: print(f"error - expecting 200 rc, got {status} - message={resp.json()}") return resultJson = resp.json() total = resultJson["metadata"]["totalCount"] print(f"external db objects found... {total}") id_list = list() # for each externalDatabase object for view in resultJson["items"]: itemId = view["id"] # print(f"\tview={itemId}") for dstlink in view['dstLinks']: print(f"\t{dstlink}") print(f"{dstlink.get('id')}") if dstlink['classType'] == 'com.infa.ldm.relational.ViewColumn': id_list.append(dstlink['id']) print(f"view columns found - used for lineage seed ids...{len(id_list)}") # get the lineage for the database object lineageURL = edcHelper.baseUrl + "/access/2/catalog/data/relationships" lineageParms = { "seed": id_list, "association": "core.DirectionalDataFlow", "depth": "1", "direction": "IN", "includeAttribute": {"core.name", "core.classType"}, "includeTerms": "false", "removeDuplicateAggregateLinks": "false", } print(f"\tLineage query for: {args.viewname} params={lineageParms}") lineageResp = edcHelper.session.get( lineageURL, params=lineageParms, ) lineageStatus = lineageResp.status_code print(f"\tlineage rc={lineageStatus}") # print(f"\n\n--------") # print(lineageResp.json()) columnHeader = [ "left_id", "left_name", "left_type", "right_id", "right_name", "right_type", ] outputFile = args.outDir + "/" + args.csvFileName fullpath = os.path.abspath(outputFile) fCSVFile = open(outputFile, "w", newline="", encoding="utf-8") print("csv file initialized. " + outputFile) colWriter = csv.writer(fCSVFile) colWriter.writerow(columnHeader) # dump the lineage result to file (for documentation/understanding) jsonFile = args.outDir + "/view_column_lineage.json" with open(jsonFile, "w") as json_file: json.dump(lineageResp.json(), json_file, indent=4,) # iterate over the lineage results and write to a csv file lineage_json = lineageResp.json() for lineageitem in lineage_json["items"]: outid = lineageitem["outId"] inid = lineageitem["inId"] outname = edcutils.getFactValue(lineageitem["outEmbedded"], "core.name") outclass = edcutils.getFactValue(lineageitem["outEmbedded"], "core.classType") inname = edcutils.getFactValue(lineageitem["inEmbedded"], "core.name") inclass = edcutils.getFactValue(lineageitem["inEmbedded"], "core.classType") print(f"a link to process - {outid} {outname} {outclass} -> {inid} {inname} {inclass}") colWriter.writerow([outid, outname, outclass, inid, inname, inclass]) fCSVFile.close()
def processExternalDB(dbId, classType, dbName, resType, resName, colWriter): """ dbId = the id of the external database object classType = the classtype of the external database oject dbNAme - the name of the external db object resType - the resource type e.g Oracle, SQLServer that created the external db object resName - the resource name that created the external db object colWriter - the csv File object to write any lineage entries process outline:- get lineage to the table level (2 parent/child hops) from the external database for each table, find the actual table that should be linked (case insensitive) if only 1 table was found (otherwise - print an error message) link the table and all columns """ dbUnknown = False # counters for # of links created tabLinks = 0 colLinks = 0 errors = 0 tables_with_no_links = 0 db_mapping = dict() # 'External' can be used if the database name is not known (sqlserver use case) if dbName == "External": dbUnknown = True print("\tthe database for the externally referenced object is unknown") # lookup to see of the database name is in the mapping lookup table dblookupName = dbName if dbName.startswith("@"): dblookupName = dbName[1:] if dblookupName in mem.dblinks: db_mapping = mem.dblinks[dblookupName] print( f"\tactual name for db found in dblinks using lookup: {db_mapping}" ) dbUnknown = False else: dbUnknown = True # note: if the database type is oracle, we can't trust the name of the database # since it will contain the dblink name, not the externally referenced db # so we will treat it as unknown and just try to find the schema/table ref'd # note - better handled by lookup references... (above) # if resType == "Oracle": # dbUnknown = True print(f"\ttype={classType} name={dbName} id={dbId} unknownDB:{dbUnknown}") # get the lineage for the database object lineageURL = edcHelper.baseUrl + "/access/2/catalog/data/relationships" lineageParms = { "seed": dbId, "association": "core.ParentChild", "depth": "2", "direction": "OUT", "includeAttribute": {"core.name", "core.classType"}, "includeTerms": "false", "removeDuplicateAggregateLinks": "false", } print(f"\tLineage query for: {dbName} params={lineageParms}") lineageResp = edcHelper.session.get( lineageURL, params=lineageParms, ) lineageStatus = lineageResp.status_code print(f"\tlineage rc={lineageStatus}") lineageJson = lineageResp.text # bug in the relationships api call - the items collection sould be # "items" (with quotes) if lineageJson.startswith("{items"): # bug in 10.2.1 and before # replace items with "items" - for just the first occurrence # (note: this is fixed after 10.2.1 lineageJson = json.loads(lineageJson.replace("items", '"items"', 1)) else: # 10.2.1u1 + json is escaped properly lineageJson = json.loads(lineageJson) # for each item in the lineage resultset for lineageItem in lineageJson["items"]: inId = lineageItem.get("inId") assocId = lineageItem.get("associationId") inEmbedded = lineageItem.get("inEmbedded") schemaName = "" schema_substituted = False if assocId == "com.infa.ldm.relational.ExternalSchemaTable": # find the table... schemaName = inId.split("/")[-2] regex_pattern = re.compile(".*\[\d+\].+\[\d+\]") if regex_pattern.match(schemaName): print( f"\tfound un-known schema name reference id= {schemaName}") if "schema" in db_mapping: print( f"\treplaceing schema={schemaName} with {db_mapping['schema']}" ) schemaName = db_mapping["schema"] schema_substituted = True inEmbedded = lineageItem.get("inEmbedded") tableName = edcutils.getFactValue(inEmbedded, "core.name") print(f"\tprocessing table={tableName} schema={schemaName}" f" db={dbName} id={inId}") # format the query to find the actual table # "core.classType:(com.infa.ldm.relational.Table or com.infa.ldm.relational.View) and core.name_lc_exact:" q = ( 'core.classType:(com.infa.ldm.relational.Table or com.infa.ldm.relational.View) and core.name:"' + tableName.lower() + '"') if dbUnknown and schemaName == "": q = q + " and core.resourceName:" + resName # if dbUnknown==False: # q=q+ ' and ' + dbName q = q + ' and core.resourceType:"' + resType + '"' tableSearchParms = {"q": q, "offset": 0, "pageSize": 100} print("\t\tquery=" + str(tableSearchParms)) # find the table - with name tableName tResp = edcHelper.session.get( edcHelper.baseUrl + "/access/2/catalog/data/objects", params=tableSearchParms, ) tStatus = tResp.status_code print("\t\tquery rc=" + str(tStatus)) # print("\t\t\t\t" + str(tResp.json())) foundTabId = "" tableMatchCount = 0 # possible matching tables for tableItem in tResp.json()["items"]: fromTableId = tableItem["id"] fromSchemaName = fromTableId.split("/")[-2].lower() # foundTabId='' # foundTabId = tableItem["id"] # print("rs-" + fromTableId + " compared with " + inId); if fromTableId != inId and fromTableId.count("/") == 4: # filter out comparing against itself + # counting the / chars - 4 = normal db, 5=external db (disregard) print("\t\tchecking " + fromTableId) # print("could be this one...." + foundTabId + " << " + inId) theName = fromTableId.split("/")[-1] # the schema could be empty - in that case it should match dbo # unless we have a way of knowing what the default schema is if theName.lower() == tableName.lower() and ( schemaName.lower() == fromSchemaName.lower() or (schemaName == "" and fromSchemaName == "dbo")): # table and schema names match # check if there is a dblink lookup that has a database & if that matches too if schema_substituted: fromdbName = fromTableId.split("/")[-3].lower() linkedDbName = db_mapping["database"] print( f"\t\t\tchecking for matching database for dblink does {fromdbName} = {linkedDbName}?: {fromdbName.lower() == linkedDbName.lower()}" ) if fromdbName.lower() == linkedDbName.lower(): # make the link print("\t\t\tok to make the link....") tableMatchCount += 1 foundTabId = fromTableId print( f"\t\ttable name matches...{theName}=={tableName} {inId} {foundTabId} " f" count/{fromTableId.count('/')}") else: print( "\t\t\tdatabase name does not match - not linkable" ) else: tableMatchCount += 1 foundTabId = fromTableId print( f"\t\ttable name matches...{theName}=={tableName} {inId} {foundTabId} " f" count/{fromTableId.count('/')}") else: print( "\t\t\tno match... linked table name:" + theName + " ref table:" + tableName + " ref shema:" + schemaName + " from schema:" + fromSchemaName + " schema names match? : " + str(schemaName.lower() == fromSchemaName.lower()) + " schema_substituted: " + str(schema_substituted) + " db_mapping: " + str(db_mapping)) # else: # print("skipping this one"); print("\t\ttotal matching tables=" + str(tableMatchCount) + " inId:" + inId) if tableMatchCount == 1: # ok - we have a single table to match # link at the table level - then get the columns and link them too # print("linking from actual tab - to external ref tab") print("\t\tlinking tables " + foundTabId + " -->> " + inId) # get the columns for the ext Table # (will be a reduced set - only the linked columns) extCols = getColumnsForTable(inId) tabCols = getColumnsForTable(foundTabId) # link the table level edcutils.exportLineageLink(foundTabId, inId, "core.DataSetDataFlow", colWriter) tabLinks += 1 tabColsLinked = 0 # match the columns on the left/right side for toCol, toId in extCols.items(): # check if the toCol (the name) exists in the tabCols dict # print('\t\t\tchecking toCol:' + toCol + tabCols.get(toCol)) fromId = tabCols.get(toCol) if fromId is not None: # print('\t\t\tlinking columns...' + fromId + ' --->>>' + toId) edcutils.exportLineageLink(fromId, toId, "core.DirectionalDataFlow", colWriter) tabColsLinked += 1 colLinks += 1 else: print( f"\t\t\tError: cannot find column {toCol} in table {inId}" ) errors += 1 print(f"\t\t\text cols:{len(extCols)} tablCols:{len(tabCols)}" f" linked={tabColsLinked}") # print("\t\t\tcolumns linked=" + str(tabColsLinked)) else: tables_with_no_links += 1 print( f"\t\t{tableMatchCount} links found) no links will be created" ) # flush the console buffer - for tailing the stdout log sys.stdout.flush() print( f"external database: {dbName} processed: tab/col links created: " f"{tabLinks}/{colLinks} errors:{errors} tables not linked={tables_with_no_links}" ) mem.tables_not_linked += tables_with_no_links print("") return tabLinks, colLinks, errors
def findAndUpdateCatalog(databaseName, schemaName, datasetName, attrDict): ''' given a list of properties - find an object in the catalog - and compare the attributes found, update if necessary ''' #datasetName = 'CRM_CUSTOMER_MAIN' print("\tready to find and update catalog object: " + datasetName + " attrs=" + str(attrDict)) print("\tready to find and update catalog schema: " + schemaName) print("\tready to find and update catalog database: " + databaseName) header = {"Accept": "application/json"} #query = "+core.resourceType: " + resourceType + " +core.allclassTypes:com.infa.ldm.relational.Schema +core.name:" + schemaName + "+core.allclassTypes:com.infa.ldm.relational.Table +core.name:" + datasetName #query = "+core.allclassTypes:com.infa.ldm.relational.Table +core.name:" + datasetName query = "id:*/" + databaseName + "/" + schemaName + "/" + datasetName parameters = {'offset': "0", 'pageSize': "200", 'q': query} # execute catalog rest call, for a page of results print('\texecuting search: q=' + query) #resp = requests.get(catalogServer + '/access/2/catalog/data/objects', params=parameters, headers=header, auth=HTTPBasicAuth(uid,pwd), verify=sslCert) resp = requests.get(catalogServer + '/access/2/catalog/data/objects', params=parameters, headers=header, auth=HTTPBasicAuth(uid, pwd)) status = resp.status_code if status != 200: # some error - e.g. catalog not running, or bad credentials print("error! " + str(status) + str(resp.json())) return resultJson = resp.json() print('\tJSON response: q=' + str(resultJson)) total = resultJson['metadata']['totalCount'] #print(total) if total == 0: print("\t" + str(total) + " item(s) not found - catalog object will not be updated") return if total != 1: print("\t" + str(total) + " more than 1 result found, all of them will be updated") itemId = "" itemName = "" itemHref = "" # get the eTag eTag = resp.headers['ETag'] print("\titem found: eTag=" + eTag) for foundItem in resultJson["items"]: itemId = foundItem["id"] itemName = edcutils.getFactValue(foundItem, "core.name") itemHref = foundItem["href"] # get the facts collection (to be updated for any new/upated attributes itemFacts = foundItem['facts'] print("\tid=" + itemId) print("\tname=" + itemName) # print("\tfacts=" + str(itemFacts)) # flag to determine if an updated is necessary updateObject = False # activity counts newFactCount = 0 updFactCount = 0 for excelAttrName, excelAttrVal in attrDict.items(): print("\t\tchecking excel attr=" + excelAttrName) doesAttrExist = excelAttrName in custAttrDict print("\t\t\tattr exists in EDC? " + str(doesAttrExist)) if doesAttrExist: # check if the content matches edcAttrVal = edcutils.getFactValue( foundItem, custAttrDict.get(excelAttrName)) print("\t\t\t edc Attribute Value " + edcAttrVal) if (edcAttrVal == ''): newFactCount += 1 print("\t\t\tnew attribute value...") newFact = { 'attributeId': custAttrDict.get(excelAttrName), 'value': excelAttrVal, 'label': excelAttrName } print("\t\t\t\t\t" + str(newFact)) itemFacts.append(newFact) updateObject = True else: print("\t\t\texisting value='" + edcAttrVal + "'=='" + excelAttrVal + "' " + str(edcAttrVal == excelAttrVal)) # # compare the values - if match, do nothing # if different - update if (edcAttrVal == excelAttrVal): print("\t\t\tvalues match - no update needed") else: updFactCount += 1 updateObject = True # update the actual value... print("\t\t\treplacing: attr=" + custAttrDict.get(excelAttrName) + " from=" + edcAttrVal + " with " + excelAttrVal) replaceExistingAttrValue( foundItem, custAttrDict.get(excelAttrName), excelAttrVal) else: # attribute does not exist print("\t\t\tattr '" + excelAttrName + "' does not exist - use ldmadmin ui to create it") # print message if nothing to do & return if not updateObject: print("\tno updates needed for object") return # object should be updated print("\tobject should be updated - new facts:" + str(newFactCount) + " updated facts:" + str(updFactCount)) # uncomment these 2 lines to help test concurrent updates - while the sleep is happening, manually update the object # this should cause a 412 error message (ifMatch <> eTag) #print("sleeping for 20 seconds - to simulate concurrent updates") #time.sleep(20) # prepare to write the item to the catalog putParms = { 'Content-Type': 'application/json', 'Accept': 'application/json', 'If-Match': eTag } putUrl = catalogServer + '/access' + itemHref print("\tready to update item: url=" + putUrl + " parm=" + str(putParms)) # make the PUT request (execute the updaate #r = requests.put(putUrl, data=json.dumps(foundItem), headers=putParms, auth=HTTPBasicAuth(uid,pwd), verify=sslCert) r = requests.put(putUrl, data=json.dumps(foundItem), headers=putParms, auth=HTTPBasicAuth(uid, pwd)) putStatus = r.status_code print("\treturn code=" + str(putStatus)) if putStatus == 200: newEtag = r.headers['eTag'] print("\t\trequest completed: status=%d new eTag %s" % (putStatus, newEtag)) else: print("update failed." + str(r.json()))