def getPrimaryKeys(self, columnseparator='|'): retval = [] with pymysql.connect(host=self.db_host, user=self.db_user, passwd=self.db_password, db=self.db_catalog) as cursor: cursor.execute(""" SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, GROUP_CONCAT(COLUMN_NAME SEPARATOR '{0}') FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE WHERE CONSTRAINT_NAME = 'PRIMARY' and TABLE_SCHEMA = '{1}' GROUP BY TABLE_NAME, TABLE_SCHEMA """.format(columnseparator, self.db_catalog)) retval = [ PrimaryKey(db_catalog=d[1], db_schema='', tablename=d[2], keyname='expk_' + str(d[2]), db_columns=d[3].split(columnseparator), type='explicit') for d in cursor.fetchall() ] return retval
def getPrimaryKeys(self, columnseparator='|'): retval = [] with pymssql.connect(self.db_host, self.db_user, self.db_password, self.db_catalog) as conn: with conn.cursor() as cursor: cursor.execute(""" with cte1 as ( SELECT Tab.TABLE_CATALOG, Tab.TABLE_SCHEMA, Tab.TABLE_NAME, Col.CONSTRAINT_NAME, Col.COLUMN_NAME FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS Tab, INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE Col WHERE Col.Constraint_Name = Tab.Constraint_Name AND Col.Table_Name = Tab.Table_Name AND Constraint_Type = 'PRIMARY KEY' ) SELECT G.TABLE_CATALOG, G.TABLE_SCHEMA, G.TABLE_NAME, G.CONSTRAINT_NAME, stuff( ( SELECT cast('{0}' as varchar(max)) + U.COLUMN_NAME FROM cte1 U WHERE U.CONSTRAINT_NAME = G.CONSTRAINT_NAME and U.TABLE_CATALOG = G.TABLE_CATALOG and U.TABLE_SCHEMA = G.TABLE_SCHEMA and U.TABLE_NAME = G.TABLE_NAME for xml path('') ), 1, 1, '') AS PKCOLS FROM cte1 G WHERE G.TABLE_CATALOG = '{1}' GROUP BY G.CONSTRAINT_NAME, G.TABLE_CATALOG, G.TABLE_SCHEMA, G.TABLE_NAME """.format(columnseparator, self.db_catalog)) retval = [ PrimaryKey(db_catalog=d[0], db_schema=d[1], tablename=d[2], keyname=d[3], db_columns=d[4].split(columnseparator), type='explicit') for d in cursor.fetchall() ] return retval
def doOneTable(self, table): excluded_fields = ['text', 'decimal', 'float', 'binary'] retval = [] try: data = {} counts = {} columnnames = [] alldata = [] columns = [ c for c in table.columns if c.type not in excluded_fields ] columnnames = [ c.name for c in columns ] alldata = self.getDataForSelectedColumns(table) for column in table.columns: idx = columnnames.index(column.name) data[column.name] = [ row[idx] for row in alldata ] if len(data) > 0: for columnname in columnnames: values = [str(d) for d in data[columnname]] c = Counter(values) counts[columnname] = dict(c) singlekeys = itertools.combinations(columnnames, 1) pks = [] useless = [] candidateCombi = {} numrows = len(alldata) for combi in singlekeys: key = combi[0] dictionary = counts[key] if len(dictionary.keys()) == 1: useless.append(key) if len(dictionary.keys()) == numrows: candidateCombi[combi] = len(dictionary.keys()) pks.append(key) for key in useless: columnnames.remove(key) duokeys = itertools.combinations(columnnames, 2) triplekeys = [] if len(columnnames) <= 10: triplekeys = itertools.combinations(columnnames, 3) for combi in itertools.chain(duokeys, triplekeys): n = 1 overlap = [val for val in combi if val in useless] overlap = overlap or [val for val in combi if val in pks] if len(overlap) == 0: for key in combi: n = n * len(counts[key].keys()) if n >= numrows: candidateCombi[combi] = n sortedCombis = sorted(candidateCombi.items(), key=operator.itemgetter(1), reverse=True) rankedCombis = [] for length in range(1, len(columnnames)): for combi in sortedCombis: if len(combi[0]) == length: rankedCombis.append(combi) del counts combiCounts = {} validCombis = [] for combi in rankedCombis: validCombis.append(combi[0]) combiCounts[combi[0]] = sortedcontainers.SortedList() for row in alldata: for combi in validCombis: values = [] for key in combi: idx = columnnames.index(key) value = row[idx] values.append(str(value)) if len(combi) > 0: if not values in combiCounts[combi]: combiCounts[combi].add(values) else: validCombis.remove(combi) for validCombi in validCombis: pk = PrimaryKey() pk.db_catalog = table.info['db_catalog'] pk.db_schema = table.info['schemaname'] pk.tablename = table.name pk.db_columns = self.colseparator.join(list(validCombi)) pk.keyname = 'detected PK' pk.type = 'IMPLICIT' pk.score = 1.0 retval.append(pk) except Exception as e: print('could not process: ' + table.name) print(e) return retval
def single(self, pk, statistics=None): obj = PrimaryKey() obj.db_catalog = pk['db_catalog'] obj.schema = pk['schemaname'] obj.tablename = pk['tablename'] obj.db_columns = '|'.join(pk['constrained_columns']) obj.keyname = pk['name'] obj.type = 'EXPLICIT' obj.score = 1.0 obj.comment = '' obj.tags = '' obj.date_added = datetime.datetime.now() return obj
def discoverpks(self): excluded_fields = ['text'] retval = [] for table in self.tables: print(table) try: data = {} counts = {} tablecolumns = [] # tablecoldict = {} columnnames = [] alldata = [] tablecolumns = [ i for i in self.columns if i.db_catalog == table.db_catalog and i.db_schema == table.db_schema and i.tablename == table.tablename ] columnnames = [ i.columnname for i in tablecolumns if i.datatype not in excluded_fields ] alldata = self.getDataFn(table) for column in tablecolumns: columnname = column.columnname # tablecoldict[columnname] = column data[columnname] = [i[columnname] for i in alldata] if len(data) > 0: for column in tablecolumns: values = data[column.columnname] c = Counter(values) counts[column.columnname] = dict(c) singlekeys = itertools.combinations(columnnames, 1) pks = [] useless = [] candidateCombi = {} numrows = len(alldata) for combi in singlekeys: key = combi[0] dictionary = counts[key] if len(dictionary.keys()) == 1: useless.append(key) if len(dictionary.keys()) == numrows: candidateCombi[combi] = len(dictionary.keys()) pks.append(key) for key in useless: columnnames.remove(key) duokeys = itertools.combinations(columnnames, 2) triplekeys = [] if len(columnnames) <= 10: triplekeys = itertools.combinations(columnnames, 3) for combi in itertools.chain(duokeys, triplekeys): n = 1 overlap = [val for val in combi if val in useless] overlap = overlap or [ val for val in combi if val in pks ] if len(overlap) == 0: for key in combi: n = n * len(counts[key].keys()) if n >= numrows: candidateCombi[combi] = n sortedCombis = sorted(candidateCombi.items(), key=operator.itemgetter(1), reverse=True) rankedCombis = [] for length in range(1, len(columnnames)): for combi in sortedCombis: if len(combi[0]) == length: rankedCombis.append(combi) del counts combiCounts = {} validCombis = [] for combi in rankedCombis: validCombis.append(combi[0]) combiCounts[combi[0]] = sortedcontainers.SortedList() for rowdict in alldata: for combi in validCombis: values = [] for key in combi: value = rowdict[key] values.append(str(value)) if len(combi) > 0: if not values in combiCounts[combi]: combiCounts[combi].add(values) else: validCombis.remove(combi) for validCombi in validCombis: pk = PrimaryKey(db_catalog=table.db_catalog, db_schema=table.db_schema, tablename=table.tablename, db_columns=list(validCombi), keyname='implicit primary key', type='implicit') retval.append(pk) except Exception as e: print('could not process: ' + str(table)) print(e) return retval