def unify(cur):
    """ Fold newly reduced tables into the unified table.

    When a new table comes in, reduce it using reduce_log() and then run
    this function to incorporate it into the unified table, creating a
    matching partition for each new table.
    """
    cur.execute("USE reduced_log")
    cur.execute('SHOW TABLES')
    existing = set(name for name, in cur.fetchall())
    cur.execute("""SELECT PARTITION_NAME FROM INFORMATION_SCHEMA.PARTITIONS WHERE TABLE_SCHEMA = 'reduced_log' AND TABLE_NAME = 'unified'""")
    already_merged = set(name for name, in cur.fetchall())
    # Bookkeeping tables are never merged into unified
    special = set(['unified', 'users', 'servers', 'unified_users', 'unified_servers'])
    for table in sorted(existing - already_merged - special):
        # Carve a dedicated partition for this table out of the catch-all
        # 'other' partition, then copy the rows in
        print_and_execute("""ALTER TABLE unified REORGANIZE PARTITION other INTO ({0}, PARTITION other VALUES LESS THAN MAXVALUE)""".format(partition_from_str(table)), cur)
        print_and_execute("INSERT INTO unified SELECT * FROM {0}".format(table), cur)
def create_unified(cur):
    """ Regenerate the reduced_log.unified table from scratch.

    When the reduced_log.unified table does not exist, or when the schema
    changes, run this function to regenerate it.
    @Precondition: all the tables in @initial_tables must be in the reduced log
    """
    cur.execute("USE reduced_log")
    # Get list of 2 initial tables
    cur.execute("SHOW TABLES")
    seed_tables = [name for name, in cur.fetchall()][:2]
    # Seed unified with the union of the first two reduced tables
    union_sql = " UNION ALL ".join("SELECT * FROM {0}".format(t) for t in seed_tables)
    print_and_execute("CREATE TABLE unified {0}".format(union_sql), cur)
    # Index the columns the rest of the tool groups and filters on
    for index_sql in ("ALTER TABLE unified ADD INDEX (userid)",
                      "ALTER TABLE unified ADD INDEX (serverid)",
                      "ALTER TABLE unified ADD INDEX (event_time)"):
        print_and_execute(index_sql, cur)
    # One range partition per seed table, plus a catch-all for future rows
    partition_clauses = ", ".join(partition_from_str(t) for t in seed_tables)
    print_and_execute("ALTER TABLE unified PARTITION BY RANGE( TO_DAYS(event_time) ) ( "
                      + partition_clauses
                      + ", PARTITION other VALUES LESS THAN MAXVALUE"
                      + ")", cur)
def create_new_temp_table(self):
    """ Materialize the next temp table by applying the current filter
    (self.fil) to the current table. """
    source = self.current_table_name()
    target = self.next_table_name()
    # Create main table
    self.cur.execute("DROP TABLE IF EXISTS {0}".format(target))
    print_and_execute("CREATE TEMPORARY TABLE {0} AS {1}".format(target, self.fil.sql(source)), self.cur)
    # Index the id columns
    for column in ('userid', 'serverid'):
        print_and_execute("ALTER TABLE {0} ADD INDEX ({1})".format(target, column), self.cur)
    # Remember which filter produced this table
    self.last_used_fil = self.fil
def reduce_log(tablename, cur):
    """Reduce one general_log table into reduced_log.

    Streams every 'Execute'/'Query' row of general_log.@tablename, cleans
    and canonicalizes each query, assigns integer ids to any new users and
    servers, writes the reduced rows to a tab-separated temp file, and
    bulk-loads that file into a freshly created reduced_log.@tablename
    table. New users/servers are appended to the users/servers tables.
    """
    print >>sys.stderr, "Reducing general_log.{0} and storing into reduced_log".format(tablename)
    print >>sys.stderr, "Selecting results..."
    cur.execute("USE reduced_log")
    # Load the existing user -> id and server -> id maps so new names get
    # ids that continue from the current maximum
    print_and_execute("SELECT user, userid FROM users", cur)
    users = dict(cur.fetchall())
    usernum = max(users.values()) + 1 if users.values() else 0 # first open usernum
    newusers = []
    print_and_execute("SELECT server, serverid FROM servers", cur)
    servers = dict(cur.fetchall())
    servernum = max(servers.values()) + 1 if servers.values() else 0 # first open servernum
    newservers = []
    cur.execute('USE general_log')
    print_and_execute("""SELECT * FROM {0} WHERE command_type IN ('Execute', 'Query')""".format(tablename), cur)
    print >>sys.stderr, "Selected results, cleaning queries and writing temp file..."
    temp_filename = '{0}_reduced.tmp'.format(tablename)
    outfile = open(temp_filename, 'w')
    # Iterate the cursor directly to avoid materializing the whole result
    for event_time, user_host, thread_id, server_id, command_type, query in cur:
        cleaned_query = clean(query, reserved_words)
        # Clean the query some more: remove numlists, replace constants
        cleaned_query = numlist_re.sub(numlist_sub_fcn, cleaned_query)
        try:
            user, server, cleaned_query = reducer.accept(user_host, cleaned_query)
        except TypeError:
            # NOTE(review): presumably reducer.accept returns None for rows
            # to skip, making the 3-way unpack raise TypeError — confirm
            continue
        cleaned_query, vals = repl_constants(cleaned_query)
        vals = ' ~ '.join(vals)
        # Assign fresh ids to first-seen users/servers
        if user not in users:
            users[user] = usernum
            newusers.append(user)
            usernum += 1
        if server not in servers:
            servers[server] = servernum
            newservers.append(server)
            servernum += 1
        # Classify the query by its leading keyword
        if cleaned_query.startswith('INSERT INTO'):
            query_type = 'INSERT'
            if values_re.search(cleaned_query):
                #TODO: count number of rows inserted. Nontrivial because of parens, commas, quotes, etc.
                cleaned_query = insert_re.match(cleaned_query).group(0) + ' <values>'
                vals = ''
        elif cleaned_query.startswith('SELECT'):
            query_type = 'SELECT'
        elif cleaned_query.startswith('CREATE TABLE'):
            # Replacing schemas with length + hash doesn't help much.
            # There aren't many create table statements (~1%)
            # cleaned_query = re.sub(r'\(.*\)',
            #     lambda x: '<schema len={0}, hash={1}>'.format(x.group().count(',')+1, x.group().__hash__()),
            #     cleaned_query)
            query_type = 'CREATE_TABLE'
        elif cleaned_query.startswith('SET'):
            query_type = 'SET'
        elif cleaned_query.startswith('LOAD DATA'):
            query_type = 'LOAD'
        elif cleaned_query.startswith('ALTER'):
            query_type = 'ALTER'
        else:
            query_type = 'OTHER'
        #we ignore server_id because it's always 0...
        cleaned_query = repr(cleaned_query)[1:-1] #deal with \n and others
        final = event_time, users[user], servers[server], thread_id, query_type, cleaned_query, vals
        print >>outfile, '\t'.join(str(s) for s in final)
    outfile.close()
    print >>sys.stderr, "Wrote temp file, loading data into {0} table...".format(tablename)
    cur.execute("USE reduced_log")
    # querytypes is a module-level string holding the ENUM value list
    cur.execute("""CREATE TABLE {0} (event_time DATETIME, userid INT, serverid INT, thread_id INT(11), query_type ENUM{1}, query MEDIUMTEXT, vals MEDIUMTEXT, INDEX (userid), INDEX (serverid), INDEX (event_time) )""".format(tablename, querytypes))
    cur.execute("LOAD DATA LOCAL INFILE '{0}' INTO TABLE {1}".format(temp_filename, tablename))
    os.remove(temp_filename)
    print >>sys.stderr, "Loaded data and removed temp file. Adding into users table..."
    # NOTE(review): names are interpolated directly into SQL rather than
    # parameterized — acceptable only if log data is trusted; a name with a
    # quote character would break these statements
    for user in newusers:
        cur.execute("INSERT INTO users VALUES ('{0}', {1})".format(user, users[user]))
    print >>sys.stderr, "Added into users table, adding into servers table..."
    for server in newservers:
        cur.execute("INSERT INTO servers VALUES ('{0}', {1})".format(server, servers[server]))
    # NOTE(review): `db` is not defined in this function — presumably a
    # module-level connection object; verify
    db.commit()
    print >>sys.stderr, "Added into servers table. Defining time functions..."
    # This redefines the time fcns for every table reduced, but that's a small cost
    define_time_functions(cur)
    print >>sys.stderr, "Defined time functions. Reduction complete"
def query_profile(tablename, numtop, period, cur):
    """ Generate profiles of the queries in @tablename.

    @period names the SQL time-bucketing function to apply (my_<period>).
    Returns a 6-tuple:
      peruser_divided  - {user: [(time, {query_type: count}), ...]} sorted by time
      peruser_alltime  - {user: {query_type: count}} over the whole table
      full_divided     - [(time, {query_type: count}), ...] sorted by time
      full_alltime     - {query_type: count} over the whole table
      full_topqueries  - top @numtop queries overall, each paired with its
                         (vals, count) list sorted by descending count
      peruser_topqueries - [(user, top @numtop queries for that user)],
                         users sorted by descending total count
    """
    # We are assuming the db already has the time functions defined.
    # This is one of the actions in the create reduced log table
    # define_time_functions(cur)
    print_and_execute("""SELECT user, time, query_type, count FROM (SELECT userid, my_{1}(event_time) AS time, query_type, count(*) AS count FROM {0} GROUP BY userid, time, query_type ) AS sth NATURAL JOIN users """.format(tablename, period), cur)
    peruser_divided = defaultdict(dict)
    peruser_alltime = dict()
    full_divided = dict()
    full_alltime = defaultdict(int)
    # Accumulate per-user and overall counts, both time-bucketed and total
    for user, time, query_type, count in cur.fetchall():
        # print user, time, query_type, count
        if time not in peruser_divided[user]:
            peruser_divided[user][time] = defaultdict(int)
        peruser_divided[user][time][query_type] = count
        if user not in peruser_alltime:
            peruser_alltime[user] = defaultdict(int)
        peruser_alltime[user][query_type] += count
        if time not in full_divided:
            full_divided[time] = defaultdict(int)
        full_divided[time][query_type] += count
        full_alltime[query_type] += count
    #sort them by time and make into (ordered) list of tuples
    full_divided = sorted([(k, v) for (k, v) in full_divided.iteritems()], key=itemgetter(0))
    for user in peruser_divided.keys():
        peruser_divided[user] = sorted([(k, v) for (k, v) in peruser_divided[user].iteritems()], key=itemgetter(0))
    print_and_execute("""SELECT user, query, vals FROM {0} NATURAL JOIN users """.format(tablename), cur)
    # Count occurrences of each (query, vals) pair, overall and per user
    full_topqueries = defaultdict(dict)
    peruser_topqueries = defaultdict(dict)
    for user, query, vals in cur.fetchall():
        full_query = full_topqueries[query]
        if vals not in full_query:
            full_query[vals] = 0
        full_query[vals] += 1
        peruser_user = peruser_topqueries[user]
        if query not in peruser_user:
            peruser_user[query] = {}
        if vals not in peruser_user[query]:
            peruser_user[query][vals] = 0
        peruser_user[query][vals] += 1
    print "stored queries"
    # For each user: rank queries by total count, keep the top @numtop,
    # and sort each query's vals by frequency
    for user in peruser_topqueries: #takes forever
        peruser_topqueries[user] = map(lambda x: (x[0], sorted(x[1].iteritems(), key=itemgetter(1), reverse=True)),
            sorted(peruser_topqueries[user].iteritems(),
                key=lambda x: sum(ct for val, ct in x[1].iteritems()),
                reverse=True)
            )[:numtop]
    print "sorted each user"
    #takes no time:
    # Order users by their total query count, busiest first
    peruser_topqueries = sorted(peruser_topqueries.iteritems(),
        key = lambda x: sum( sum(ct for val, ct in valcts) for query, valcts in x[1] ),
        reverse = True)
    print "sorted peruser_topqueries"
    #takes a long time:
    # Same ranking as the per-user pass, but over the whole table
    full_topqueries = map(lambda x: (x[0], sorted(x[1].iteritems(), key=itemgetter(1), reverse=True)),
        sorted([(q, c) for (q, c) in full_topqueries.iteritems()],
            key = lambda x: sum(count for vals, count in x[1].iteritems()),
            reverse=True))[:numtop]
    print "sorted full_topqueries"
    return peruser_divided, peruser_alltime, full_divided, full_alltime, full_topqueries, peruser_topqueries
def run(self, target_db, source_db = 'general_log'):
    """ Reduce each pending table in @source_db into @target_db.

    For every table reported by _tables_to_reduce(): select its rows
    (with the configured selector WHERE clause and sort order), run them
    through each group of processors, dump the surviving rows to a
    tab-separated temp file, and LOAD DATA it into a freshly created
    table of the same name in @target_db.

    NOTE(review): an identical run() implementation appears again at the
    end of this file — likely a duplicate paste; confirm and deduplicate.
    """
    sort_schema, sql_sort_by = self._sort_schema()
    # WHERE clause: AND of all selector expressions, or empty if none
    sql_where = ( "WHERE " + " AND ".join('(' + sel + ')' for sel in self.selectors) ) \
        if self.selectors else ''
    # Extracts the declared output columns, in order, from a row dict
    final_selector = itemgetter(*[colname for colname, coltype in self.outputs])
    # DictCursor makes each fetched row a column-name -> value dict
    conn = mysql.connect(**{"host": "localhost", "user": "******", "passwd": "", "unix_socket": "/u1/vbar/mysql/thesock", "cursorclass": DictCursor})
    cur = conn.cursor()
    for table in self._tables_to_reduce(target_db, source_db):
        cur.execute("USE {0}".format(source_db))
        sql_full = "SELECT * FROM {0} {1} {2}".format(table, sql_where, sql_sort_by)
        print_and_execute(sql_full, cur)
        # Keep only rows that pass every prefilter (Python 2 filter -> list)
        rows = filter(self._all_prefilters_pass, cur.fetchall())
        # TODO: can avoid using DictCursor by changing the above line
        for group in sort_schema:
            newrows = []
            # A group may require the rows pre-sorted on one column
            if group[0].sort_column:
                rows.sort(key=itemgetter(group[0].sort_column), reverse=group[0].sort_reverse)
            for row in rows:
                try:
                    # Each processor adds its output columns to the row;
                    # raising SkipRowException drops the row entirely
                    for processor in group:
                        row.update(zip(processor.outputs, processor.process(row)))
                    newrows.append(row)
                except SkipRowException:
                    continue
            print "Done with one group of processors"
            rows = newrows
        print "All done processing, writing temp file and loading into table"
        temp_filename = '{0}.tmp'.format(table)
        with open(temp_filename, 'w') as outfile:
            print >>outfile, '\n'.join('\t'.join(str(x) for x in final_selector(row)) for row in rows)
        cur.execute("USE {0}".format(target_db))
        cur.execute("CREATE TABLE {0} (".format(table) + \
            ',\n'.join("{0} {1}".format(col, typ) for col, typ in self.outputs) + \
            ")")
        cur.execute("LOAD DATA LOCAL INFILE '{0}' INTO TABLE {1}"
            .format(temp_filename, table))
        os.remove(temp_filename)
    conn.commit()
    cur.close()
    conn.close()
def create_checkbox_lists(self, initial=False):
    """ Removes the current user and server checkbox panels from the window,
    if they exist (if they don't, they will be None, from __init__())
    creates new ones with data from the current table, then adds them to
    the window again. If @initial is True, only the last partition (month)
    of the 'unified' table will be used for counts, but all users and
    servers will still be shown
    @initial - if this is the first time the checkbox lists are being
    generated, it works a bit differently: the counts are taken from just
    the last partition of the unified table so that startup doesn't take
    forever. We then also need to grab the names of other users who didn't
    appear in this first partition
    """
    # Remove current user and server checkbox panels from the window
    if self.user_panel:
        self.window.remove(self.user_panel)
    if self.server_panel:
        self.window.remove(self.server_panel)
    # Create user filter checkboxes
    if initial:
        print_and_execute("""SELECT PARTITION_NAME FROM INFORMATION_SCHEMA.PARTITIONS WHERE TABLE_SCHEMA = 'reduced_log' AND TABLE_NAME = 'unified'""", self.cur)
        # there will be each month, then the 'other' partition, so we want to
        # select from the 2nd to last partition
        # NOTE(review): [-2] assumes at least two partitions exist — confirm
        last_partition = [x for x, in self.cur.fetchall()][-2]
        table_to_use = "unified PARTITION({0})".format(last_partition)
    else:
        table_to_use = self.current_table_name()
    # Per-user row counts, busiest users first
    print_and_execute("""SELECT userid, user, count FROM (SELECT userid, COUNT(*) AS count FROM {0} GROUP BY userid ) AS sth NATURAL JOIN users ORDER BY count DESC """.format(table_to_use), self.cur)
    userlist = [x for x in self.cur.fetchall()]
    x_pos = 0
    y_pos = 0
    y_spacing = 20
    self.user_checkboxes = {}
    size_width = 220
    extent_width = size_width
    # One checkbox per user, stacked vertically; track the widest label
    for userid, user, count in userlist:
        # replacing '_' with '_ ' presumably lets long names wrap — confirm
        self.user_checkboxes[user] = CheckBox("{0} ({1})".format(user.replace('_', '_ '), count),
            position = (x_pos, y_pos), value = True)
        extent_width = max(self.user_checkboxes[user].size[0], extent_width)
        y_pos += y_spacing
    # Get users that didn't appear in the last partition, if @initial
    if initial:
        self.cur.execute("SELECT user, userid FROM users")
        for user, userid in self.cur.fetchall():
            if user in self.user_checkboxes:
                continue
            # Users absent from the sampled partition get a "(0)" count
            self.user_checkboxes[user] = CheckBox(user.replace('_', '_ ') + " (0)",
                position = (x_pos, y_pos), value = True)
            extent_width = max(self.user_checkboxes[user].size[0], extent_width)
            y_pos += y_spacing
    # Add the user checkboxes to a ScrollableView:
    self.user_panel = ScrollableView(size = (size_width, 150),
        extent = (extent_width, max(150, y_pos)),
        scrolling = 'v' if extent_width <= size_width else 'hv')
    for cbox in self.user_checkboxes.values():
        self.user_panel.add(cbox)
    # Add the panel to the window
    # NOTE(review): `top` and `horiz_sp` are module-level layout values;
    # `left = component + offset` presumably places this panel to the
    # right of that component in this GUI toolkit — confirm
    self.window.place(self.user_panel, top = top, left = self.query_type_panel + horiz_sp)
    # Create server filter checkboxes
    print_and_execute("""SELECT serverid, server, count FROM (SELECT serverid, COUNT(*) AS count FROM {0} GROUP BY serverid ) as sth NATURAL JOIN servers ORDER BY count DESC """.format(table_to_use), self.cur)
    serverlist = [x for x in self.cur.fetchall()]
    x_pos = 0
    y_pos = 0
    y_spacing = 20
    self.server_checkboxes = {}
    size_width = 300
    extent_width = size_width
    for serverid, server, count in serverlist:
        self.server_checkboxes[server] = CheckBox("{0} ({1})".format(server, count),
            position = (x_pos, y_pos), value = True)
        extent_width = max(self.server_checkboxes[server].size[0], extent_width)
        y_pos += y_spacing
    # Same top-up as for users: servers missing from the sampled partition
    if initial:
        self.cur.execute("SELECT server FROM servers")
        for server, in self.cur.fetchall():
            if server in self.server_checkboxes:
                continue
            self.server_checkboxes[server] = CheckBox(server.replace('_', '_ ') + " (0)",
                position = (x_pos, y_pos), value = True)
            extent_width = max(self.server_checkboxes[server].size[0], extent_width)
            y_pos += y_spacing
    # Add the server checkboxes to a ScrollableView
    self.server_panel = ScrollableView(size = (size_width, 150),
        extent = (extent_width, max(150, y_pos)),
        scrolling = 'v' if extent_width <= size_width else 'hv')
    for cbox in self.server_checkboxes.values():
        self.server_panel.add(cbox)
    # Add the server panel to the window
    self.window.place(self.server_panel, top = top, left=self.user_panel + 10)
def run(self, target_db, source_db='general_log'): sort_schema, sql_sort_by = self._sort_schema() sql_where = ( "WHERE " + " AND ".join('(' + sel + ')' for sel in self.selectors) ) \ if self.selectors else '' final_selector = itemgetter( *[colname for colname, coltype in self.outputs]) conn = mysql.connect( **{ "host": "localhost", "user": "******", "passwd": "", "unix_socket": "/u1/vbar/mysql/thesock", "cursorclass": DictCursor }) cur = conn.cursor() for table in self._tables_to_reduce(target_db, source_db): cur.execute("USE {0}".format(source_db)) sql_full = "SELECT * FROM {0} {1} {2}".format( table, sql_where, sql_sort_by) print_and_execute(sql_full, cur) rows = filter(self._all_prefilters_pass, cur.fetchall()) # TODO: can avoid using DictCursor by changing the above line for group in sort_schema: newrows = [] if group[0].sort_column: rows.sort(key=itemgetter(group[0].sort_column), reverse=group[0].sort_reverse) for row in rows: try: for processor in group: row.update( zip(processor.outputs, processor.process(row))) newrows.append(row) except SkipRowException: continue print "Done with one group of processors" rows = newrows print "All done processing, writing temp file and loading into table" temp_filename = '{0}.tmp'.format(table) with open(temp_filename, 'w') as outfile: print >> outfile, '\n'.join('\t'.join( str(x) for x in final_selector(row)) for row in rows) cur.execute("USE {0}".format(target_db)) cur.execute("CREATE TABLE {0} (".format(table) + \ ',\n'.join("{0} {1}".format(col, typ) for col, typ in self.outputs) + \ ")") cur.execute("LOAD DATA LOCAL INFILE '{0}' INTO TABLE {1}".format( temp_filename, table)) os.remove(temp_filename) conn.commit() cur.close() conn.close()