Пример #1
0
    def getPrimaryKeys(self, columnseparator='|'):
        retval = []
        with pymysql.connect(host=self.db_host,
                             user=self.db_user,
                             passwd=self.db_password,
                             db=self.db_catalog) as cursor:
            cursor.execute("""
				SELECT
					TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, GROUP_CONCAT(COLUMN_NAME SEPARATOR '{0}')
				FROM 
					INFORMATION_SCHEMA.KEY_COLUMN_USAGE
				WHERE 
					CONSTRAINT_NAME = 'PRIMARY' and TABLE_SCHEMA = '{1}'
				GROUP BY
					TABLE_NAME, TABLE_SCHEMA
			""".format(columnseparator, self.db_catalog))
            retval = [
                PrimaryKey(db_catalog=d[1],
                           db_schema='',
                           tablename=d[2],
                           keyname='expk_' + str(d[2]),
                           db_columns=d[3].split(columnseparator),
                           type='explicit') for d in cursor.fetchall()
            ]
        return retval
Пример #2
0
    def getPrimaryKeys(self, columnseparator='|'):
        retval = []
        with pymssql.connect(self.db_host, self.db_user, self.db_password,
                             self.db_catalog) as conn:
            with conn.cursor() as cursor:
                cursor.execute("""
					with cte1 as (
						SELECT
							Tab.TABLE_CATALOG,
							Tab.TABLE_SCHEMA,
							Tab.TABLE_NAME,
							Col.CONSTRAINT_NAME,
							Col.COLUMN_NAME 
						FROM 
							INFORMATION_SCHEMA.TABLE_CONSTRAINTS Tab, 
							INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE Col 
						WHERE 
							Col.Constraint_Name = Tab.Constraint_Name
							AND Col.Table_Name = Tab.Table_Name
							AND Constraint_Type = 'PRIMARY KEY'
					)    

					SELECT
						G.TABLE_CATALOG,
						G.TABLE_SCHEMA,
						G.TABLE_NAME,
						G.CONSTRAINT_NAME,
						stuff(
						(
							SELECT cast('{0}' as varchar(max)) + U.COLUMN_NAME
							FROM cte1 U
							WHERE U.CONSTRAINT_NAME = G.CONSTRAINT_NAME and U.TABLE_CATALOG = G.TABLE_CATALOG and U.TABLE_SCHEMA = G.TABLE_SCHEMA and U.TABLE_NAME = G.TABLE_NAME
							for xml path('')
						), 1, 1, '') AS PKCOLS
					FROM
						cte1 G
					WHERE
						G.TABLE_CATALOG = '{1}'
					GROUP BY
						G.CONSTRAINT_NAME, G.TABLE_CATALOG, G.TABLE_SCHEMA, G.TABLE_NAME
				""".format(columnseparator, self.db_catalog))
                retval = [
                    PrimaryKey(db_catalog=d[0],
                               db_schema=d[1],
                               tablename=d[2],
                               keyname=d[3],
                               db_columns=d[4].split(columnseparator),
                               type='explicit') for d in cursor.fetchall()
                ]
        return retval
Пример #3
0
    def doOneTable(self, table):
        excluded_fields = ['text', 'decimal', 'float', 'binary']
        retval = []

        try:
            data = {}
            counts = {}
            columnnames = []
            alldata = []

            columns = [ c for c in table.columns if c.type not in excluded_fields ]
            columnnames = [ c.name for c in columns ]

            alldata = self.getDataForSelectedColumns(table)

            for column in table.columns:
                idx = columnnames.index(column.name)
                data[column.name] = [ row[idx] for row in alldata ]

            if len(data) > 0:
                for columnname in columnnames:
                    values = [str(d) for d in data[columnname]]
                    c = Counter(values)
                    counts[columnname] = dict(c)

                singlekeys = itertools.combinations(columnnames, 1)
                pks = []
                useless = []
                candidateCombi = {}
                numrows = len(alldata)

                for combi in singlekeys:
                    key = combi[0]
                    dictionary = counts[key]
                    if len(dictionary.keys()) == 1:
                        useless.append(key)
                    if len(dictionary.keys()) == numrows:
                        candidateCombi[combi] = len(dictionary.keys())
                        pks.append(key)

                for key in useless:
                    columnnames.remove(key)

                duokeys = itertools.combinations(columnnames, 2)
                triplekeys = []
                if len(columnnames) <= 10:
                    triplekeys = itertools.combinations(columnnames, 3)

                for combi in itertools.chain(duokeys, triplekeys):
                    n = 1
                    overlap = [val for val in combi if val in useless]
                    overlap = overlap or [val for val in combi if val in pks]
                    if len(overlap) == 0:
                        for key in combi:
                            n = n * len(counts[key].keys())
                        if n >= numrows:
                            candidateCombi[combi] = n

                sortedCombis = sorted(candidateCombi.items(), key=operator.itemgetter(1), reverse=True)
                rankedCombis = []
                for length in range(1, len(columnnames)):
                    for combi in sortedCombis:
                        if len(combi[0]) == length:
                            rankedCombis.append(combi)

                del counts

                combiCounts = {}
                validCombis = []
                for combi in rankedCombis:
                    validCombis.append(combi[0])
                    combiCounts[combi[0]] = sortedcontainers.SortedList()

                for row in alldata:
                    for combi in validCombis:
                        values = []
                        for key in combi:
                            idx = columnnames.index(key)
                            value = row[idx]
                            values.append(str(value))
                        if len(combi) > 0:
                            if not values in combiCounts[combi]:
                                combiCounts[combi].add(values)
                            else:
                                validCombis.remove(combi)

                for validCombi in validCombis:
                    pk = PrimaryKey()
                    pk.db_catalog = table.info['db_catalog']
                    pk.db_schema = table.info['schemaname']
                    pk.tablename = table.name
                    pk.db_columns = self.colseparator.join(list(validCombi))
                    pk.keyname = 'detected PK'
                    pk.type = 'IMPLICIT'
                    pk.score = 1.0
                    retval.append(pk)
        except Exception as e:
            print('could not process: ' + table.name)
            print(e)

        return retval
Пример #4
0
 def single(self, pk, statistics=None):
     obj = PrimaryKey()
     obj.db_catalog = pk['db_catalog']
     obj.schema = pk['schemaname']
     obj.tablename = pk['tablename']
     obj.db_columns = '|'.join(pk['constrained_columns'])
     obj.keyname = pk['name']
     obj.type = 'EXPLICIT'
     obj.score = 1.0
     obj.comment = ''
     obj.tags = ''
     obj.date_added = datetime.datetime.now()
     return obj
Пример #5
0
    def discoverpks(self):
        excluded_fields = ['text']
        retval = []

        for table in self.tables:
            print(table)

            try:
                data = {}
                counts = {}
                tablecolumns = []
                # tablecoldict = {}
                columnnames = []
                alldata = []

                tablecolumns = [
                    i for i in self.columns
                    if i.db_catalog == table.db_catalog and i.db_schema ==
                    table.db_schema and i.tablename == table.tablename
                ]
                columnnames = [
                    i.columnname for i in tablecolumns
                    if i.datatype not in excluded_fields
                ]
                alldata = self.getDataFn(table)

                for column in tablecolumns:
                    columnname = column.columnname

                    # tablecoldict[columnname] = column
                    data[columnname] = [i[columnname] for i in alldata]

                if len(data) > 0:
                    for column in tablecolumns:
                        values = data[column.columnname]
                        c = Counter(values)
                        counts[column.columnname] = dict(c)

                    singlekeys = itertools.combinations(columnnames, 1)
                    pks = []
                    useless = []
                    candidateCombi = {}
                    numrows = len(alldata)

                    for combi in singlekeys:
                        key = combi[0]
                        dictionary = counts[key]
                        if len(dictionary.keys()) == 1:
                            useless.append(key)
                        if len(dictionary.keys()) == numrows:
                            candidateCombi[combi] = len(dictionary.keys())
                            pks.append(key)

                    for key in useless:
                        columnnames.remove(key)

                    duokeys = itertools.combinations(columnnames, 2)
                    triplekeys = []
                    if len(columnnames) <= 10:
                        triplekeys = itertools.combinations(columnnames, 3)

                    for combi in itertools.chain(duokeys, triplekeys):
                        n = 1
                        overlap = [val for val in combi if val in useless]
                        overlap = overlap or [
                            val for val in combi if val in pks
                        ]
                        if len(overlap) == 0:
                            for key in combi:
                                n = n * len(counts[key].keys())
                            if n >= numrows:
                                candidateCombi[combi] = n

                    sortedCombis = sorted(candidateCombi.items(),
                                          key=operator.itemgetter(1),
                                          reverse=True)
                    rankedCombis = []
                    for length in range(1, len(columnnames)):
                        for combi in sortedCombis:
                            if len(combi[0]) == length:
                                rankedCombis.append(combi)

                    del counts

                    combiCounts = {}
                    validCombis = []
                    for combi in rankedCombis:
                        validCombis.append(combi[0])
                        combiCounts[combi[0]] = sortedcontainers.SortedList()

                    for rowdict in alldata:
                        for combi in validCombis:
                            values = []
                            for key in combi:
                                value = rowdict[key]
                                values.append(str(value))
                            if len(combi) > 0:
                                if not values in combiCounts[combi]:
                                    combiCounts[combi].add(values)
                                else:
                                    validCombis.remove(combi)

                    for validCombi in validCombis:
                        pk = PrimaryKey(db_catalog=table.db_catalog,
                                        db_schema=table.db_schema,
                                        tablename=table.tablename,
                                        db_columns=list(validCombi),
                                        keyname='implicit primary key',
                                        type='implicit')
                        retval.append(pk)
            except Exception as e:
                print('could not process: ' + str(table))
                print(e)

        return retval