def getForeignKeys(self, columnseparator='|'):
    """Fetch all explicit foreign keys of the configured MySQL catalog.

    Multi-column keys are returned as a single row per constraint with the
    column names concatenated by ``columnseparator`` (and split back into
    lists client-side).

    :param columnseparator: separator placed between the column names of a
        multi-column key; must not occur in any column name.
    :return: list of ForeignKey objects with type 'explicit'.
    """
    retval = []
    # NOTE(review): relies on pymysql's connection context manager yielding
    # a cursor (pre-1.0 behaviour) -- confirm against the pinned pymysql
    # version; 1.0+ yields the connection instead.
    with pymysql.connect(host=self.db_host, user=self.db_user,
                         passwd=self.db_password, db=self.db_catalog) as cursor:
        # The schema name is bound as a query parameter instead of being
        # interpolated into the SQL text.  SEPARATOR requires a literal, so
        # columnseparator is still format()-ed in -- do not pass untrusted
        # separators.
        #
        # BUGFIX 1: CONSTRAINT_NAME added to GROUP BY -- previously two
        # distinct FKs between the same pair of tables were merged into one
        # row with their columns concatenated and a nondeterministic
        # constraint name selected.
        # BUGFIX 2: ORDER BY ORDINAL_POSITION inside GROUP_CONCAT -- without
        # it the two concatenated column lists are not guaranteed to be
        # positionally aligned for multi-column keys.
        cursor.execute("""
            SELECT TABLE_CATALOG, TABLE_SCHEMA, REFERENCED_TABLE_SCHEMA,
                   TABLE_NAME, REFERENCED_TABLE_NAME, CONSTRAINT_NAME,
                   GROUP_CONCAT(COLUMN_NAME
                                ORDER BY ORDINAL_POSITION SEPARATOR '{0}'),
                   GROUP_CONCAT(REFERENCED_COLUMN_NAME
                                ORDER BY ORDINAL_POSITION SEPARATOR '{0}')
            FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
            WHERE TABLE_SCHEMA = %s
              AND REFERENCED_TABLE_NAME IS NOT NULL
              AND REFERENCED_TABLE_SCHEMA IS NOT NULL
              AND REFERENCED_COLUMN_NAME IS NOT NULL
            GROUP BY CONSTRAINT_NAME, TABLE_NAME, TABLE_SCHEMA,
                     REFERENCED_TABLE_NAME, REFERENCED_TABLE_SCHEMA
        """.format(columnseparator), (self.db_catalog,))
        retval = [
            ForeignKey(db_catalog=d[1], pkdb_schema='', fkdb_schema='',
                       pktablename=d[3], fktablename=d[4], keyname=d[5],
                       pk_columns=d[6].split(columnseparator),
                       fk_columns=d[7].split(columnseparator),
                       type='explicit')
            for d in cursor.fetchall()
        ]
    return retval
def getForeignKeys(self, columnseparator='|'):
    """Fetch all explicit foreign keys of the configured SQL Server catalog.

    Each constraint is returned as one row; the participating column names
    of both sides are concatenated with ``columnseparator`` via
    STUFF(... FOR XML PATH('')) and split back into lists client-side.

    :param columnseparator: separator placed between column names of a
        multi-column key; must not occur in any column name.
    :return: list of ForeignKey objects with type 'explicit'.
    """
    retval = []
    with pymssql.connect(self.db_host, self.db_user, self.db_password, self.db_catalog) as conn:
        with conn.cursor() as cursor:
            # BUGFIX: the CTE now carries fkc.constraint_column_id ([colorder])
            # and both STUFF subqueries ORDER BY it.  FOR XML PATH('') without
            # ORDER BY gives no ordering guarantee, so pkcolumns/fkcolumns of
            # a multi-column key could come back misaligned.
            cursor.execute("""
                with cte1 as (
                    SELECT obj.name AS FK_NAME,
                           pksch.name AS [pkschema],
                           fksch.name AS [fkschema],
                           tab1.name AS [table],
                           col1.name AS [column],
                           tab2.name AS [referenced_table],
                           col2.name AS [referenced_column],
                           fkc.constraint_column_id AS [colorder]
                    FROM sys.foreign_key_columns fkc
                    INNER JOIN sys.objects obj ON obj.object_id = fkc.constraint_object_id
                    INNER JOIN sys.tables tab1 ON tab1.object_id = fkc.parent_object_id
                    INNER JOIN sys.tables tab2 ON tab2.object_id = fkc.referenced_object_id
                    INNER JOIN sys.schemas pksch ON tab1.schema_id = pksch.schema_id
                    INNER JOIN sys.schemas fksch ON tab2.schema_id = fksch.schema_id
                    INNER JOIN sys.columns col1 ON col1.column_id = parent_column_id AND col1.object_id = tab1.object_id
                    INNER JOIN sys.columns col2 ON col2.column_id = referenced_column_id AND col2.object_id = tab2.object_id
                )
                SELECT G.pkschema, G.fkschema, G.[table] as pktable,
                       G.referenced_table as fktable, G.FK_NAME,
                       stuff((select cast('{0}' as varchar(max)) + U.[column]
                              from cte1 U WHERE U.FK_NAME = G.FK_NAME
                              order by U.[colorder]
                              for xml path('')), 1, 1, '') AS pkcolumns,
                       stuff((select cast('{0}' as varchar(max)) + U.[referenced_column]
                              from cte1 U WHERE U.FK_NAME = G.FK_NAME
                              order by U.[colorder]
                              for xml path('')), 1, 1, '') AS fkcolumns
                FROM cte1 G
                GROUP BY G.FK_NAME, G.pkschema, G.fkschema, G.[table], G.referenced_table
            """.format(columnseparator))
            retval = [
                ForeignKey(db_catalog=self.db_catalog, pkdb_schema=d[0],
                           fkdb_schema=d[1], pktablename=d[2],
                           fktablename=d[3], keyname=d[4],
                           pk_columns=d[5].split(columnseparator),
                           fk_columns=d[6].split(columnseparator),
                           type='explicit')
                for d in cursor.fetchall()
            ]
    return retval
def discoverfks(self, theta):
    """Discover implicit (undeclared) foreign keys between tables.

    Phase 1 compares a bottom-k sketch of every column against every
    primary-key column; candidates whose inclusion coefficient reaches
    ``theta`` survive.  Phase 2 assembles per-column candidates of
    multi-column PKs into full multi-column candidates (cartesian product
    per table), re-checks inclusion on the combined columns, and finally
    scores every surviving candidate by summed Earth Mover's Distance
    between quantile histograms of candidate and key columns.

    :param theta: inclusion threshold in [0, 1] a candidate's sketch must
        reach against the key's sketch.
    :return: list of (ForeignKey, emdscore) tuples sorted ascending by
        score; a score of -1 marks a failed or NaN EMD computation.
    """
    # phase 1
    fs = []  # single-column candidates: ([candidate_column], pk)
    fm = []  # multi-column candidates:  ([candidate_columns...], pk)
    # bottom-k sketches indexed on (<schemaname>, <tablename>, <columnname>)
    bksketches = {}
    quantiles = {}
    s = {}
    # calculate bottom-k sketch for all columns and store in <bksketches>
    for column in self.columns:
        bksketches[(column.db_schema, column.tablename, column.columnname)] = self.bottomksketch(
            self.getDataFn(column.db_schema, column.columnname, column.tablename))
    # BUGFIX: build a fresh list instead of extending self.pksingle in
    # place, which corrupted that attribute for any later call.
    pkall = self.pksingle + self.pkmulti
    for pk in pkall:  # foreach primary key (single and multi)
        pkcollst = pk.db_columns.split(self.colseparator)
        n = len(pkcollst)
        for keycolumn_name in pkcollst:  # foreach column in primary key
            for candidate in self.columns:  # foreach column as foreign key candidate
                if self.inclusion(
                        bksketches[(candidate.db_schema, candidate.tablename, candidate.columnname)],
                        bksketches[(pk.db_schema, pk.tablename, keycolumn_name)]) >= theta and (
                        candidate.tablename != pk.tablename):
                    if n == 1:  # single column pk
                        fs.append(([candidate], pk))
                    if n > 1:  # multi column pk
                        # dictionary s indexes on (<pk name>, <pk column>) where the
                        # pk name is generic (concatenation of the columnnames),
                        # e.g. ('id|name', 'id') and ('id|name', 'name') are the two
                        # entries in s for PK 'id|name'.  For each entry we store a
                        # list of candidate columns found in other tables.
                        if (pk.db_columns, keycolumn_name) not in s:
                            s[(pk.db_columns, keycolumn_name)] = []
                        s[(pk.db_columns, keycolumn_name)].append(candidate)
        if n > 1:
            # sketch/histogram of the combined multi-column key itself
            bksketches[(pk.db_schema, pk.tablename, pk.db_columns)] = self.bottomksketch(
                self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
            quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = self.quantilehistogram(
                self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
    # phase 2
    # fks: dictionary that indexes on (<foreignkey table>, <primary key column>);
    # value: candidate columns in <foreignkey table> for <primary key column>
    # TBD: remove the table loop
    fks = {}
    for kvp in s:
        spkcolname = kvp[1]
        for e in s[kvp]:
            key = (e.tablename, spkcolname)
            if key not in fks:
                fks[key] = []
            fks[key].append(e)
    for pkm in self.pkmulti:
        pkcollst = pkm.db_columns.split(self.colseparator)
        # for each table in the database, check if we have candidates in fks
        # for every column of this PK; if we do: take the cartesian product
        # and store the combinations in fm
        for table in self.tables:
            tname = table.tablename
            L = []
            for pkcolumn in pkcollst:
                key = (tname, pkcolumn)
                if key not in fks:
                    continue
                L.append(fks[key])
            if len(L) == len(pkcollst):
                for prod in self.cartesian(L):
                    fm.append((prod, pkm))
    # BUGFIX: filter into a new list instead of calling fm.remove() while
    # iterating fm, which skipped the element following every removal.
    surviving = []
    for flst, pk in fm:
        fcols = [c.columnname for c in flst]
        fschema = flst[0].db_schema  # TBD: ugly indices here
        ftable = flst[0].tablename   # TBD: and here
        fsample = self.bottomksketch(self.getDataFn(fschema, fcols, ftable))
        if self.inclusion(fsample, bksketches[(pk.db_schema, pk.tablename, pk.db_columns)]) >= theta:
            quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = self.quantilehistogram(
                self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
            quantiles[(fschema, ftable, "|".join(fcols))] = self.quantilehistogram(
                self.getDataFn(fschema, fcols, ftable))
            surviving.append((flst, pk))
    fm = surviving
    for flst, pk in fs:
        # only index zero because every fs entry has exactly one candidate column
        quantiles[(flst[0].db_schema, flst[0].tablename, flst[0].columnname)] = self.quantilehistogram(
            self.getDataFn(flst[0].db_schema, flst[0].columnname, flst[0].tablename))
    result = []
    fall = fs + fm
    for f, pk in fall:
        fcols = [c.columnname for c in f]
        fschema = f[0].db_schema  # TBD: ugly indices here
        ftable = f[0].tablename   # TBD: and here
        fkkey = (fschema, ftable, "|".join(fcols))
        pkkey = (pk.db_schema, pk.tablename, pk.db_columns)
        # None histograms mark empty columns -- skip those pairs
        if quantiles[fkkey] is not None and quantiles[pkkey] is not None:
            qfk = quantiles[fkkey]
            qpk = quantiles[pkkey]
            emdscore = 0
            try:
                for i in range(len(qfk)):
                    fkhist = qfk[i][0]
                    pkhist = qpk[i][0]
                    fkbins = qfk[i][1]
                    pkbins = qpk[i][1]
                    emdscore += emd.emd(fkhist, pkhist, fkbins[0:-1], pkbins[0:-1])
                # NOTE(review): dividing by len(qfk[0]) (size of one
                # (hist, bins) entry) looks like it was meant to be len(qfk);
                # kept as-is to preserve scoring -- TODO confirm.
                emdscore = emdscore / len(qfk[0])
            except Exception:  # BUGFIX: narrowed from bare except
                emdscore = -1
            if math.isnan(emdscore):
                emdscore = -1
            nfk = ForeignKey(db_catalog=pk.db_catalog, pkdb_schema=pk.db_schema,
                             fkdb_schema=fschema, pktablename=pk.tablename,
                             fktablename=ftable, fk_columns=fcols,
                             keyname='implicit_fk', type='implicit')
            nfk.pk_columns = pk.db_columns
            nfk.score = emdscore
            result.append((nfk, emdscore))
    return sorted(result, key=lambda kvp: kvp[1], reverse=False)
def single(self, fk, statistics=None):
    """Convert one reflected foreign-key mapping into a ForeignKey object.

    :param fk: mapping with keys 'db_catalog', 'srcschema',
        'referred_schema', 'srctable', 'referred_table',
        'constrained_columns', 'referred_columns' and 'name'
        (presumably produced by schema reflection -- TODO confirm producer).
    :param statistics: unused; kept for interface compatibility.
    :return: populated ForeignKey with score 1.0 and type 'EXPLICIT'.
    """
    # BUGFIX: removed stray debug print(fk)
    obj = ForeignKey()
    obj.db_catalog = fk['db_catalog']
    obj.pkdb_schema = fk['srcschema']
    obj.fkdb_schema = fk['referred_schema']
    obj.pktablename = fk['srctable']
    obj.fktablename = fk['referred_table']
    # multi-column keys are stored as one '|'-separated string
    obj.pk_columns = '|'.join(fk['constrained_columns'])
    obj.fk_columns = '|'.join(fk['referred_columns'])
    obj.keyname = fk['name']
    # NOTE(review): other code paths in this file use lowercase 'explicit';
    # kept uppercase to avoid breaking consumers matching the exact string
    # -- TODO unify.
    obj.type = 'EXPLICIT'
    obj.score = 1.0
    obj.comment = ''
    obj.tags = ''
    obj.date_added = datetime.datetime.now()
    return obj
def discoverfks(self, theta):
    """Discover implicit (undeclared) foreign keys between tables.

    Phase 1: compare each column's bottom-k sketch with each primary-key
    column's sketch; keep candidates whose inclusion coefficient reaches
    ``theta``.  Phase 2: combine per-column candidates of multi-column PKs
    (cartesian product per table), re-check inclusion on the combined
    columns, then score every surviving candidate via summed Earth Mover's
    Distance over quantile histograms.

    :param theta: inclusion threshold in [0, 1].
    :return: list of (ForeignKey, emdscore) tuples sorted ascending by
        score; -1 marks a failed or NaN EMD computation.
    """
    # phase 1
    fs = []  # single-column candidates: ([candidate_column], pk)
    fm = []  # multi-column candidates:  ([candidate_columns...], pk)
    # bottom-k sketches for each column, indexed on (<schemaname>, <tablename>, <columnname>)
    bksketches = {}
    quantiles = {}
    s = {}
    # calculate bottom-k sketch for all columns and store in dictionary <bksketches>
    for column in self.columns:
        bksketches[(column.db_schema, column.tablename, column.columnname)] = self.bottomksketch(
            self.getDataFn(column.db_schema, column.columnname, column.tablename))
    # BUGFIX: concatenate into a new list; self.pksingle.extend(...) would
    # mutate the attribute and double-process keys on a subsequent call.
    pkall = self.pksingle + self.pkmulti
    for pk in pkall:  # foreach primary key (single and multi)
        pkcollst = pk.db_columns.split(self.colseparator)
        n = len(pkcollst)
        for keycolumn_name in pkcollst:  # foreach column in primary key
            for candidate in self.columns:  # foreach column as foreign key candidate
                this = bksketches[(candidate.db_schema, candidate.tablename, candidate.columnname)]
                that = bksketches[(pk.db_schema, pk.tablename, keycolumn_name)]
                if self.inclusion(this, that) >= theta and (candidate.tablename != pk.tablename):
                    if n == 1:  # single column pk
                        fs.append(([candidate], pk))
                    if n > 1:  # multi column pk
                        # dictionary s indexes on (<pk name>, <pk column>) where the pk
                        # name is generic (concatenation of the columnnames), e.g.
                        # ('id|name', 'id') and ('id|name', 'name') are the two entries
                        # for PK 'id|name'.  For each entry we store a list of
                        # candidate columns found in other tables.
                        if (pk.db_columns, keycolumn_name) not in s:
                            s[(pk.db_columns, keycolumn_name)] = []
                        s[(pk.db_columns, keycolumn_name)].append(candidate)
        if n > 1:
            # sketch/histogram of the combined multi-column key itself
            bksketches[(pk.db_schema, pk.tablename, pk.db_columns)] = self.bottomksketch(
                self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
            quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = self.quantilehistogram(
                self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
    # phase 2
    # fks: dictionary that indexes on (<foreignkey table>, <primary key column>);
    # value: candidate columns in <foreignkey table> for <primary key column>
    # TBD: remove the table loop
    fks = {}
    for kvp in s:
        spkcolname = kvp[1]
        for e in s[kvp]:
            key = (e.tablename, spkcolname)
            if key not in fks:
                fks[key] = []
            fks[key].append(e)
    for pkm in self.pkmulti:
        pkcollst = pkm.db_columns.split(self.colseparator)
        # for each table, check if we have candidates in fks for every column
        # of this PK; if so take the cartesian product and store in fm
        for table in self.tables:
            tname = table.tablename
            L = []
            for pkcolumn in pkcollst:
                key = (tname, pkcolumn)
                if key not in fks:
                    continue
                L.append(fks[key])
            if len(L) == len(pkcollst):
                for prod in self.cartesian(L):
                    fm.append((prod, pkm))
    # BUGFIX: collect survivors into a new list; fm.remove() inside the loop
    # skipped the element directly after every removed entry.
    surviving = []
    for flst, pk in fm:
        fcols = [c.columnname for c in flst]
        fschema = flst[0].db_schema  # TBD: ugly indices here
        ftable = flst[0].tablename   # TBD: and here
        fsample = self.bottomksketch(self.getDataFn(fschema, fcols, ftable))
        if self.inclusion(fsample, bksketches[(pk.db_schema, pk.tablename, pk.db_columns)]) >= theta:
            quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = self.quantilehistogram(
                self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
            quantiles[(fschema, ftable, "|".join(fcols))] = self.quantilehistogram(
                self.getDataFn(fschema, fcols, ftable))
            surviving.append((flst, pk))
    fm = surviving
    for flst, pk in fs:
        # only index zero because every fs entry has exactly one candidate column
        quantiles[(flst[0].db_schema, flst[0].tablename, flst[0].columnname)] = self.quantilehistogram(
            self.getDataFn(flst[0].db_schema, flst[0].columnname, flst[0].tablename))
    result = []
    fall = fs + fm
    for f, pk in fall:
        fcols = [cdict.columnname for cdict in f]
        fschema = f[0].db_schema  # TBD: ugly indices here
        ftable = f[0].tablename   # TBD: and here
        fkkey = (fschema, ftable, "|".join(fcols))
        pkkey = (pk.db_schema, pk.tablename, pk.db_columns)
        # None histograms mark empty columns -- skip those pairs
        if quantiles[fkkey] is not None and quantiles[pkkey] is not None:
            qfk = quantiles[fkkey]
            qpk = quantiles[pkkey]
            emdscore = 0
            try:
                for i in range(len(qfk)):
                    fkhist = qfk[i][0]
                    pkhist = qpk[i][0]
                    fkbins = qfk[i][1]
                    pkbins = qpk[i][1]
                    emdscore += emd.emd(fkhist, pkhist, fkbins[0:-1], pkbins[0:-1])
                # NOTE(review): len(qfk[0]) is the size of a single
                # (hist, bins) entry; len(qfk) was probably intended.
                # Kept as-is to preserve scoring -- TODO confirm.
                emdscore = emdscore / len(qfk[0])
            except Exception:  # BUGFIX: narrowed from bare except
                emdscore = -1
            if math.isnan(emdscore):
                emdscore = -1
            nfk = ForeignKey(db_catalog=pk.db_catalog, pkdb_schema=pk.db_schema,
                             fkdb_schema=fschema, pktablename=pk.tablename,
                             fktablename=ftable, fk_columns=fcols,
                             keyname='implicit_fk', type='implicit')
            nfk.pk_columns = pk.db_columns
            nfk.score = emdscore
            result.append((nfk, emdscore))
    return sorted(result, key=lambda kvp: kvp[1], reverse=False)