def main(): cutme = read_in_args_to_cut() db = connect_db() args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg") args = {} done = 0 for (command_id, arg) in args_cursor.fetchall(): if arg in cutme: continue done += 1 if done % 100 == 0: sys.stderr.write(str(done) + " done\n") sys.stderr.flush() if arg == '"': continue cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id]) cmd = cmds_cursor.fetchall()[0][0] if not arg in args: args[arg] = {} if not cmd in args[arg]: args[arg][cmd] = 0 args[arg][cmd] += 1 db.close() cmds = list(set(chain.from_iterable([v.keys() for v in args.values()]))) cmds_idxs = zip(cmds, range(len(cmds))) for (cmd, idx) in cmds_idxs: print json.dumps((cmd, idx))
def main(): db = connect_db() #args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg") args_cursor = db.execute("select command_id, arg from (select command_id, arg from commands, args, templates, queries where queries.id = commands.query_id and queries.id = args.query_id and templates.query_id = queries.id group by arg, command, template) order by arg") args = {} done = 0 for (command_id, arg) in args_cursor.fetchall(): done += 1 if done % 100 == 0: sys.stderr.write(str(done) + " done\n") sys.stderr.flush() if arg == '"': continue arg = transform_argument(arg) cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id]) cmd = cmds_cursor.fetchall()[0][0] cmd = transform_command(cmd) if not arg in args: args[arg] = {} if not cmd in args[arg]: args[arg][cmd] = 0 args[arg][cmd] += 1 db.close() max_term_freq = {} for (arg, cmd_cnts) in args.iteritems(): for (cmd, cnt) in cmd_cnts.iteritems(): if not cmd in max_term_freq: max_term_freq[cmd] = 0 #max_term_freq[cmd] = max(max_term_freq[cmd], cnt) max_term_freq[cmd] += cnt #cmds = list(set(chain.from_iterable([v.keys() for v in args.values()]))) cmds_idxs = zip(max_term_freq.items(), range(len(max_term_freq))) for ((cmd, f), idx) in cmds_idxs: print json.dumps((cmd, f, idx))
def main(): cutme = read_in_args_to_cut() db = connect_db() args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg") args = {} done = 0 for (command_id, arg) in args_cursor.fetchall(): if arg in cutme: continue done += 1 if done % 100 == 0: sys.stderr.write(str(done) + " done\n") sys.stderr.flush() if arg == '"': continue cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id]) cmd = cmds_cursor.fetchall()[0][0] if not arg in args: args[arg] = {} if not cmd in args[arg]: args[arg][cmd] = 0 args[arg][cmd] += 1 db.commit() for (arg, cmd_counts) in args.iteritems(): print json.dumps([arg, cmd_counts.items()]) db.close()
def main(): db = connect_db() # args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg") args_cursor = db.execute( "select command_id, arg from (select command_id, arg from commands, args, templates, queries where queries.id = commands.query_id and queries.id = args.query_id and templates.query_id = queries.id group by arg, command, template) order by arg" ) args = {} done = 0 for (command_id, arg) in args_cursor.fetchall(): done += 1 if done % 100 == 0: sys.stderr.write(str(done) + " done\n") sys.stderr.flush() if arg == '"': continue cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id]) cmd = cmds_cursor.fetchall()[0][0] if not arg in args: args[arg] = {} if not cmd in args[arg]: args[arg][cmd] = 0 args[arg][cmd] += 1 db.commit() cmds = list(set(chain.from_iterable([v.keys() for v in args.values()]))) cmds_idxs = zip(cmds, range(len(cmds))) for (cmd, idx) in cmds_idxs: print json.dumps((cmd, idx))
def main(): cut = read_in_args_to_cut() db = connect_db() args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg") args = {} done = 0 for (command_id, arg) in args_cursor.fetchall(): arg = arg.strip() done = progress(done) if arg == '"': continue if arg in cut: continue new_arg = transform_arg(arg) if new_arg != arg: continue cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id]) cmd = cmds_cursor.fetchall()[0][0] cmd = transform_cmd(cmd) if cmd == "search": continue if not arg in args: args[arg] = {} if not cmd in args[arg]: args[arg][cmd] = 0 args[arg][cmd] += 1 db.commit() cmds = list(set(chain.from_iterable([v.keys() for v in args.values()]))) cmds_idxs = zip(cmds, range(len(cmds))) for (cmd, idx) in cmds_idxs: print json.dumps((cmd, idx))
def main(): db = connect_db() select_cursor = db.execute("SELECT id, name FROM users") for (id, name) in select_cursor.fetchall(): matches = re.findall('[\d]{9}', name) if len(matches) > 0: update_user_type_as_machine(db, id) matches = re.findall('[a-z]{2}[\d]{5}', name) if len(matches) > 0: update_user_type_as_machine(db, id) matches = re.findall('[a-z]{1}[\d]{6}', name) if len(matches) > 0: update_user_type_as_machine(db, id) matches = re.findall('\d[a-z]{5}\d', name) if len(matches) > 0: update_user_type_as_machine(db, id) matches = re.findall('[a-z]{2}[\d]{3}[a-z]', name) if len(matches) > 0: update_user_type_as_machine(db, id) matches = re.findall('[\d][a-z]{5}\d', name) if len(matches) > 0: update_user_type_as_machine(db, id) matches = re.findall('savedsearch', name) if len(matches) > 0: update_user_type_as_machine(db, id) matches = re.findall('[\d]{8}', name) if len(matches) > 0: update_user_type_as_machine(db, id) db.close()
def main(): cut = read_in_args_to_cut() db = connect_db() args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg") args = {} done = 0 for (command_id, arg) in args_cursor.fetchall(): arg = arg.strip() done = progress(done) if arg == '"': continue if arg in cut: continue new_arg = transform_arg(arg) if new_arg != arg: continue cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id]) cmd = cmds_cursor.fetchall()[0][0] cmd = transform_cmd(cmd) if cmd == "search": continue if not arg in args: args[arg] = {} if not cmd in args[arg]: args[arg][cmd] = 0 args[arg][cmd] += 1 db.commit() for (arg, cmd_counts) in args.iteritems(): print json.dumps([arg, cmd_counts.items()])
def main(): db = connect_db() select_cursor = db.execute("SELECT id, text FROM queries WHERE id<?", [31651]) for (id, query) in select_cursor.fetchall(): if query.find(' rex ') > -1: delete_template_commands_args(db, id) db.close()
def main(): counts = [] #labels = [] db = connect_db() #cursor = db.execute("SELECT count(*) AS cnt, arg FROM args GROUP BY arg ORDER BY cnt DESC") cursor = db.execute("select count(distinct queries.user_id) as cnt, arg from args, queries where args.query_id = queries.id group by arg order by cnt desc") for (cnt, arg) in cursor.fetchall(): counts.append(cnt) #labels.append(arg) db.close() #counts = np.log(counts) ind = range(max(counts)+1) #ind = np.logspace(0, np.log10(max(counts)), num=20) #ind = [0] + list(ind) #ind = np.linspace(0, max(counts), num=20) #print ind total = len(counts) cdf = [float(len(filter(lambda x: x <= i, counts)))/float(total) for i in ind] #print zip(ind, cdf) plt.plot(ind, cdf) plt.ylabel("Percent of arguments with counts less than or equal to C") plt.xlabel("C") plt.xscale("log") plt.grid(True) plt.show()
def main(): db = connect_db() cursor = db.execute("SELECT command, text FROM commands, queries \ WHERE commands.query_id = queries.id \ AND operation_type=?", ["FilterSelection"]) types = defaultdict(int) total = 0 for (command, query) in cursor.fetchall(): total += 1 added_to_something = False if command.find("DEDUP") == 0: types["filter out duplicates"] += 1 added_to_something = True if command.find("HEAD") == 0: types["filter on index"] += 1 added_to_something = True if command.find("REGEX") == 0: c = command.lstrip("REGEX") types["filter on string match"] += 1 added_to_something = True if command == "SEARCH" or command == "SEARCH(SOURCE)": types["filter on string match"] += 1 added_to_something = True else: c = command.strip("SEARCH") c = c.strip("WHERE") s = get_filter_stage(query) if c.find("(AND") == 0 or c.find("(EQ") == 0: types["filter on multiple conditions"] += 1 added_to_something = True if c.find("(OR") == 0: types["filter on multiple conditions"] += 1 added_to_something = True if c.find("(NOT") == 0 or c.find("(NE") == 0: types["filter on multiple conditions"] += 1 added_to_something = True if c.find("(GT") > -1 or c.find("GE") > -1 or c.find("(LT") > -1 or c.find("(LE") > -1: types["filter with numerical bound"] += 1 added_to_something = True if c.find("(ISNULL") > -1 or c.find("(ISNOTNULL") > -1: types["filter on nullness"] += 1 added_to_something = True if c.find("SOURCETYPE") > -1 or c.find("SOURCE") > -1 or c.find("HOST") > -1 or c.find("EVENTTYPE") > -1 or query.find("index") > -1: types["filter on source"] += 1 added_to_something = True if c.find("EARLIEST") > -1 or c.find("LATEST") > -1: types["filter on time"] += 1 added_to_something = True if re.match(time_pattern,s): types["filter on time"] += 1 added_to_something = True if not added_to_something: print "Missed a case!", command db.close() print_results(types, total)
def main(): db = connect_db() select_cursor = db.cursor() select_cursor.execute("SELECT command, id FROM commands") for (command, id) in select_cursor.fetchall(): category = lookup_category(command) print "Put command", command, "in category", category update_cursor = db.cursor() update_cursor.execute("UPDATE commands SET operation_type=? WHERE id=?", [category, id]) db.commit() db.close()
def read_in_sessions(): sessions = defaultdict(list) db = connect_db() select_cursor = db.execute("SELECT session_id, query_id, template FROM templated_sessions ORDER BY session_id") for (session_id, query_id, template) in select_cursor.fetchall(): time_cursor = db.execute("SELECT time FROM queries WHERE id=?", [query_id]) time = float(time_cursor.fetchall()[0][0]) template = ParseTreeNode.from_dict(json.loads(template)) sessions[session_id].append((time, query_id, template)) db.close() return sessions
def main(): db = connect_db() cursor = db.execute("select count(distinct users.name) as cnt, template from templates, users, queries where templates.query_id = queries.id and queries.user_id = users.id group by template order by cnt desc") data = cursor.fetchall() for (cnt, template) in data: d = json.loads(template) template = ParseTreeNode.from_dict(d) template.print_tree() print "count: ", cnt print "" print "" db.close()
def main(): db = connect_db() big_sessions_cursor = db.execute("SELECT session_id, COUNT(session_id) AS cnt FROM queries GROUP BY session_id ORDER BY cnt DESC") for (session_id, queries_per_session) in big_sessions_cursor.fetchall(): if queries_per_session > 30: session_owner_cursor = db.execute("SELECT user_id FROM sessions WHERE id=? LIMIT 1", [session_id]) user_id = session_owner_cursor.fetchall()[0][0] owner_name_cursor = db.execute("SELECT name, user_type FROM users WHERE id=? LIMIT 1", [user_id]) for (name, user_type) in owner_name_cursor.fetchall(): if user_type == "person": print ','.join([str(name), str(session_id), str(queries_per_session)]) db.close()
def load_and_print_redundant_edges(): db = connect_db() cursor = db.execute("SELECT arg, query_id FROM args") curr = prev = "" query_set = set() for (arg, query_id) in cursor.fetchall(): curr = query_id if not curr == prev: print_edges(list(query_set)) query_set.clear() query_set.add(arg) prev = curr db.close()
def main(): db = connect_db() arg_clusters = read_in_arg_clusters() cluster_labels = read_in_cluster_labels() arg_labels = make_arg_labels(arg_clusters, cluster_labels) for (arg, labels) in arg_labels.iteritems(): if len(labels) > 1: print "adding to", arg, "id -2 label multiple" multiple_cursor = db.cursor() multiple_cursor.execute("UPDATE args SET clique_id=?, clique_label=? WHERE arg=?", [-1, "multiple", arg]) db.commit() if len(labels) == 1: cluster_id = min(arg_clusters[arg]) label = labels.pop() print "adding to", arg, "id", cluster_id, "label", label single_cursor = db.cursor() single_cursor.execute("UPDATE args SET clique_id=?, clique_label=? WHERE arg=?", [cluster_id, label, arg]) db.commit() db.close()
def main(): db = connect_db() args_cursor = db.execute("SELECT count(id) as cnt, clique_label, command_id FROM args GROUP BY clique_label, command_id") argrps = {} done = 0 for (cnt, label, cmdid) in args_cursor.fetchall(): done += 1 if done % 100 == 0: sys.stderr.write(str(done) + " done\n") sys.stderr.flush() cmds_cursor = db.execute("SELECT category FROM commands WHERE id=?", [cmdid]) category = cmds_cursor.fetchall()[0][0] if not label in argrps: argrps[label] = {} if not category in argrps[label]: argrps[label][category] = 0 argrps[label][category] += 1 db.commit() for (arglabel, cmdgrps) in argrps.iteritems(): print json.dumps([arglabel, cmdgrps.items()])
def main(): db = connect_db() done = 0 select_cursor = db.execute("SELECT min(id), text FROM queries WHERE id>? GROUP BY text", [0]) schemas = {} for (id, query) in select_cursor.fetchall(): done += 1 if done % 10 == 0: sys.stderr.write(str(done) + " done\n") sys.stderr.flush() schema = queryutils.extract_schema(query) if not schema is None: schemas[id] = (text, schema) print "query:", id print "schema:" print schema print users_by_query_text = {} for (id, (text, schema)) for schemas.iteritems():
def main(): done = 0 db = connect_db() select_cursor = db.execute("SELECT min(id), text FROM queries WHERE id>? GROUP BY text", [0]) for (id, query) in select_cursor.fetchall(): done += 1 if done % 100 == 0: sys.stderr.write(str(done) + " done\n") sys.stderr.flush() check_cursor = db.execute("SELECT * FROM commands \ WHERE query_id=?", [str(id)]) if len(check_cursor.fetchall()) == 0: parsetree = queryutils.parse_query(query) if parsetree is None: last_query = query last_failed = True continue commands_and_args = parsetree.command_arg_tuple_list() insert_commands_and_args(db, id, commands_and_args) db.close()
def read_in_templatized_sessions(): session_queries = defaultdict(list) session_templates = defaultdict(list) db = connect_db() cursor = db.execute("SELECT queries.session_id, templates.query_id, templates.template \ FROM queries, templates, users \ WHERE queries.id = templates.query_id \ AND users.id = queries.user_id \ AND users.user_type = ?", ["person"]) num_queries = 0 for (session_id, query_id, template) in cursor.fetchall(): session_templates[session_id].append((query_id, template)) num_queries += 1 print "Formed session_templates dict with length", len(session_templates) print "\t and with number of queries", num_queries complete_sessions = {} for (session_id, query_templates) in session_templates.iteritems(): cursor = db.execute("SELECT session_id, id FROM queries \ WHERE session_id = ?", [session_id]) queries = [] for (sid, qid) in cursor.fetchall(): queries.append(qid) if len(queries) == len(query_templates): print "Have a complete session of length", len(query_templates) complete_sessions[session_id] = query_templates for (query_id, template) in query_templates: print "This is happening." insert_cursor = db.cursor() insert_cursor.execute("INSERT INTO templated_sessions \ (session_id, query_id, template) \ VALUES (?,?,?)", [session_id, query_id, template]) db.commit() db.close() return complete_sessions
def main(): done = 0 commands_plus_counts = defaultdict(int) last_query = "" db = connect_db() select_cursor = db.execute("SELECT template FROM templates where template not null") for (template) in select_cursor.fetchall(): done += 1 if done % 100 == 0: sys.stderr.write(str(done) + " done\n") sys.stderr.flush() d = json.loads(template) # convert to parse tree node here t = ParseTreeNode.from_dict(d) commands_plus = t.get_commands_plus_list() for c in commands_plus: commands_plus_counts[c] += 1 top_commands_plus = sorted(commands_plus.items(), key=lambda x: x[1], reverse=True)[:20] for (commands_plus, counts) in top_commands_plus: print counts, commands_plus db.close()
def main(): counts = [] #labels = [] db = connect_db() #cursor = db.execute("SELECT count(*) AS cnt, arg FROM args GROUP BY arg ORDER BY cnt DESC") cursor = db.execute("select count(distinct queries.user_id) as cnt, arg from args, queries where args.query_id = queries.id group by arg order by cnt desc") for (cnt, arg) in cursor.fetchall(): counts.append(cnt) #labels.append(arg) db.close() ind = range(len(counts)) counts = np.log(counts) print counts[0:20] #width = 1.0 #bars = plt.bar(ind, counts, width, color="green", alpha=0.5) plt.plot(ind, counts) plt.ylabel("log( Number of times argument appears )") plt.xlabel("Argument rank") #plt.xticks(rotation=90) #plt.xticks([i+width/2 for i in ind], labels) plt.grid(True) plt.show()
def main(): db = connect_db() #args_cursor = db.execute("SELECT command_id, arg FROM args ORDER BY arg") args_cursor = db.execute("select command_id, arg from (select command_id, arg from commands, args, templates, queries where queries.id = commands.query_id and queries.id = args.query_id and templates.query_id = queries.id group by arg, command, template) order by arg") args = {} done = 0 for (command_id, arg) in args_cursor.fetchall(): done += 1 if done % 100 == 0: sys.stderr.write(str(done) + " done\n") sys.stderr.flush() if arg == '"': continue cmds_cursor = db.execute("SELECT command FROM commands WHERE id=?", [command_id]) cmd = cmds_cursor.fetchall()[0][0] if not arg in args: args[arg] = {} if not cmd in args[arg]: args[arg][cmd] = 0 args[arg][cmd] += 1 db.commit() for (arg, cmd_counts) in args.iteritems(): print json.dumps([arg, cmd_counts.items()]) db.close()
def main(): done = 0 db = connect_db() #select_cursor = db.execute("SELECT id, text FROM queries WHERE id>?", [273347]) select_cursor = db.execute("SELECT min(id), text FROM queries WHERE id>? GROUP BY text", [0]) for (id, query) in select_cursor.fetchall(): done += 1 if done % 10 == 0: sys.stderr.write(str(done) + " done\n") sys.stderr.flush() check_cursor = db.execute("SELECT * FROM templates \ WHERE query_id=?", [str(id)]) if len(check_cursor.fetchall()) == 0: template = queryutils.extract_template(query) if template is None: last_query = query last_failed = True continue s = template.dumps() insert_cursor = db.cursor() insert_cursor.execute("INSERT INTO templates (query_id, template) \ VALUES (?,?)", [id, s]) db.commit() db.close()
def read_data_from_db(): counts = [] optypes = [] db = connect_db() cursor = db.execute("select count(*) as count, operation_type from commands group by operation_type order by count") for (cnt, operation_type) in cursor.fetchall(): if operation_type == "FilterSelection": operation_type = "Filter" if operation_type == "InputtingSelection": operation_type = "Input" if operation_type == "TransformingProjection": operation_type = "Transform" if operation_type == "Projection": operation_type = "Project" if operation_type == "ExtendedProjection": operation_type = "Extend" if operation_type == "WindowingProjection": operation_type = "Window" if operation_type == "Aggregation": operation_type = "Aggregate" counts.append(cnt) optypes.append(operation_type) db.close() return counts, optypes
def load_db(): global db db = connect_db() load_main() db.close()
def execute_db_script(script): with closing(connect_db()) as db: with app.open_resource(script) as f: db.cursor().executescript(f.read()) db.commit()
def extract_context(cleanarg, rawlist): c = Context(cleanarg) db = connect_db() number_commands = 0 #print for arg in rawlist: #print "arg: ", arg cursor = db.execute("SELECT command, text FROM queries, commands, args \ WHERE queries.id = args.query_id \ AND args.command_id = commands.id \ AND arg=?", [arg]) for (command, query) in cursor.fetchall(): #print "\tcmd:", command number_commands += 1 set = False set_field = False if command.find("SEARCH") == 0 or command.find("WHERE") == 0: for comparator in ["=", ">", "<", "!=", "=="]: if query.find(arg + comparator) > -1 or query.find(arg + " " + comparator) > -1: c.grp_filtered_on_as_field += 1 set = True set_field = True if not set_field: c.grp_filtered_on_as_value += 1 set = True if command.find("DEDUP") == 0: c.grp_filtered_on_as_field += 1 set = True if command.find("TOP") == 0: try: int(c.arg) c.grp_argument_to_option += 1 set = True except: c.grp_argument_to_top += 1 set = True if command.find("SORT") == 0: try: int(arg) c.grp_argument_to_option += 1 set = True except: if query.find("=" + arg) == -1: c.grp_sorted_by += 1 set = True if command.find("FIELDS(PLUS)") == 0 or command.find("TABLE") == 0 or command.find("EXPORT") == 0: c.grp_projected += 1 set = True if command.find("FIELDS(MINUS)") == 0: c.grp_unprojected += 1 set = True if command.find("INPUTLOOKUP") == 0 or command.find("ABSTRACT") == 0: if query.find("=" + arg) > -1: c.grp_argument_to_option += 1 set = True if command.find("HEAD") == 0: c.grp_argument_to_option += 1 set = True if command.find("STATS") == 0 or command.find("TIMECHART") == 0 or command.find("CHART") == 0: for function in ["count", "min", "avg", "max", "sum", "c", "values", "range", "last", "distinct_count", "dc", "mode", "var"]: if query.find(function + " " + arg) > -1 or query.find(function + "(" + arg + ")") > -1 or query.find(function + " (" + arg + ")") > -1: c.grp_argument_to_aggregation += 1 set = True if (command.find("STATS") == 0 or command.find("EVAL") == 0 or command.find("CHART") == 0 or command.find("TIMECHART") == 0 or command.find("RENAME") == 0) and command.find("AS") > -1: if query.find("as " + arg) > -1 or query.find("AS " + arg) > -1: c.grp_argument_renamed_as += 1 set = True if (command.find("STATS") == 0 or command.find("EVAL") == 0 or command.find("CHART") == 0 or command.find("TIMECHART") == 0) and command.find("OVER") > -1: if query.find("over " + arg) > -1 or query.find("OVER " + arg) > -1: c.grp_field_used_as_function_domain += 1 set = True if (command.find("STATS") == 0 or command.find("EVAL") == 0 or command.find("CHART") == 0 or command.find("TIMECHART") == 0) and command.find("BY") > -1: if query.find("by " + arg) > -1 or query.find("BY " + arg) > -1 or query.find("by") < query.rfind(arg): c.grp_grouped_by += 1 set = True if command.find("EVAL") > -1 and (command.find("DIVIDES") > -1 or command.find("TIMES") > -1 or command.find("PLUS") > -1): c.grp_argument_to_arithmetic_transformation += 1 set = True if command.find("EVAL") > -1: if query.find(arg + "=") > -1: c.grp_field_used_in_conditional += 1 set = True else: c.grp_value_used_in_other_transformation += 1 #print command set = True if command.find("MULTIKV") == 0 or command.find("CONVERT") == 0 or command.find("BUCKET") == 0 or command.find("REX") == 0 or command.find("REPLACE") == 0 or command.find("REGEX") == 0 or command.find("MAKEMV") == 0 or command.find("MVEXPAND") == 0: c.grp_field_used_in_other_transformation += 1 #print command set = True if command.find("TIMECHART") == 0: if query.find("=" + arg) > -1 or query.find("= " + arg) > -1: try: int(arg) set = True c.grp_argument_to_option += 1 except: pass if arg == "false" and not set: c.grp_argument_to_option += 1 set = True if not set: print "\t\t\tNo case for this one!" db.close() if c.number_set() < number_commands: print "Missed a case!", arg exit() return c
def main(): len_one_sessions = 0 generalizing_parsetrees = 0 specializing_parsetrees = 0 identical_parsetrees = 0 total = 0 inds = [] gens = [] specs = [] others = [] sessions = read_in_sessions() for (session_id, queries) in sessions.iteritems(): queries = sorted(queries, key=lambda x: x[0]) if len(queries) == 1: len_one_sessions += 1 elif is_identical_parsetrees(queries): identical_parsetrees += 1 inds.append((session_id, queries)) elif is_generalizing_parsetrees(queries): generalizing_parsetrees += 1 gens.append((session_id, queries)) elif is_specializing_parsetrees(queries): specializing_parsetrees += 1 specs.append((session_id, queries)) else: others.append((session_id, queries)) total += 1 other = total - len_one_sessions - generalizing_parsetrees - specializing_parsetrees - identical_parsetrees print "Length one sessions: ", len_one_sessions print "Generalizing parsetrees sessions: ", generalizing_parsetrees print "Specializing parsetree sessions: ", specializing_parsetrees print "Identical parsetree sessions: ", identical_parsetrees print "Other (unknown): ", other print "Total: ", total print # Print inds info. inds_avg_time_length = 0. inds_min_time_length = 1e12 inds_max_time_length = 0. inds_avg_item_length = 0. inds_min_item_length = 1e12 inds_max_item_length = 0. for (sid, templates) in inds: first = True for (time, qid, template) in templates: if first: first = False firsttime = time time_length = time - firsttime item_length = len(templates) inds_max_time_length = max(time_length, inds_max_time_length) inds_min_time_length = min(time_length, inds_min_time_length) inds_avg_time_length += time_length inds_max_item_length = max(item_length, inds_max_item_length) inds_min_item_length = min(item_length, inds_min_item_length) inds_avg_item_length += item_length inds_avg_time_length = inds_avg_time_length / len(inds) inds_avg_item_length = inds_avg_item_length / len(inds) print "Identical sessions max time length:", inds_max_time_length print "Identical sessions min time length:", inds_min_time_length print "Identical sessions avg time length:", inds_avg_time_length print print "Identical sessions max item length:", inds_max_item_length print "Identical sessions min item length:", inds_min_item_length print "Identical sessions avg item length:", inds_avg_item_length print # Print gens info. gens_avg_time_length = 0. gens_min_time_length = 1e12 gens_max_time_length = 0. gens_avg_item_length = 0. gens_min_item_length = 1e12 gens_max_item_length = 0. for (sid, templates) in gens: first = True for (time, qid, template) in templates: if first: first = False firsttime = time time_length = time - firsttime item_length = len(templates) gens_max_time_length = max(time_length, gens_max_time_length) gens_min_time_length = min(time_length, gens_min_time_length) gens_avg_time_length += time_length gens_max_item_length = max(item_length, gens_max_item_length) gens_min_item_length = min(item_length, gens_min_item_length) gens_avg_item_length += item_length gens_avg_time_length = gens_avg_time_length / len(gens) gens_avg_item_length = gens_avg_item_length / len(gens) print "Generalizing sessions max time length:", gens_max_time_length print "Generalizing sessions min time length:", gens_min_time_length print "Generalizing sessions avg time length:", gens_avg_time_length print print "Generalizing sessions max item length:", gens_max_item_length print "Generalizing sessions min item length:", gens_min_item_length print "Generalizing sessions avg item length:", gens_avg_item_length print # Print specs info. specs_avg_time_length = 0. specs_min_time_length = 1e12 specs_max_time_length = 0. specs_avg_item_length = 0. specs_min_item_length = 1e12 specs_max_item_length = 0. for (sid, templates) in specs: first = True for (time, qid, template) in templates: if first: first = False firsttime = time length = time - firsttime item_length = len(templates) specs_max_time_length = max(length, specs_max_time_length) specs_min_time_length = min(length, specs_min_time_length) specs_avg_time_length += specs_avg_time_length specs_max_item_length = max(item_length, specs_max_item_length) specs_min_item_length = min(item_length, specs_min_item_length) specs_avg_item_length += item_length specs_avg_time_length = length / len(specs) specs_avg_item_length = specs_avg_item_length / len(specs) print "Specializing sessions max time length:", specs_max_time_length print "Specializing sessions min time length:", specs_min_time_length print "Specializing sessions avg time length:", specs_avg_time_length print print "Specializing sessions max item length:", specs_max_item_length print "Specializing sessions min item length:", specs_min_item_length print "Specializing sessions avg item length:", specs_avg_item_length print # Print others. db = connect_db() for (sid, templates) in others: print for (time, qid, template) in templates: qcursor = db.execute("SELECT text FROM queries WHERE id=?", [qid]) query = qcursor.fetchall()[0][0] print query.strip() #template.print_tree() print print "__________________________________________________________________________" db.close()
def main(): db = connect_db() cursor = db.execute("SELECT command, text FROM commands, queries \ WHERE commands.query_id = queries.id \ AND operation_type=?", ["ExtendedProjection"]) types = defaultdict(int) total = 0 for (command, query) in cursor.fetchall(): if command == "EVAL" or command == "EVAL(EQ)": continue total += 1 added_to_something = False if command.find("REX") == 0: types["string extraction"] += 1 added_to_something = True if command.find("EVAL") == 0: if command.find("CASE") > -1: types["conditional"] += 1 added_to_something = True if command.find("COALESCE") > -1: types["conditional"] += 1 added_to_something = True if command.find("CONCAT") > -1: types["string manipulation"] += 1 added_to_something = True if command.find("DIVIDES") > -1: types["arithmetic"] += 1 added_to_something = True if command.find("FLOOR") > -1: types["arithmetic"] += 1 added_to_something = True if command.find("IF") > -1: types["conditional"] += 1 added_to_something = True if command.find("LIKE") > -1: types["string match"] += 1 added_to_something = True if command.find("MATCH") > -1: types["string match"] += 1 added_to_something = True if command.find("MINUS") > -1: types["arithmetic"] += 1 added_to_something = True if command.find("MVCOUNT") > -1: types["multivalue"] += 1 added_to_something = True if command.find("MVINDEX") > -1: types["multivalue"] += 1 added_to_something = True if command.find("PLUS") > -1: types["arithmetic"] += 1 added_to_something = True if command.find("REPLACE") > -1: types["string concatenation"] += 1 added_to_something = True if command.find("ROUND") > -1: types["arithmetic"] += 1 added_to_something = True if command.find("SPLIT") > -1: types["string concatenation"] += 1 added_to_something = True if command.find("STRPTIME") > -1: types["time manipulation"] += 1 added_to_something = True if command.find("SUBSTR") > -1: types["string manipulation"] += 1 added_to_something = True if command.find("TIMES") > -1: types["arithmetic"] += 1 added_to_something = True if command.find("TONUMBER") > -1: types["conversion"] += 1 added_to_something = True if command.find("TRIM") > -1: types["string manipulation"] += 1 added_to_something = True if command.find("UPPER") > -1: types["string manipulation"] += 1 added_to_something = True if not added_to_something: print "Missed a case!", command cursor = db.execute("SELECT command, text FROM commands, queries \ WHERE commands.query_id = queries.id \ AND operation_type=?", ["Projection"]) for (command, query) in cursor.fetchall(): total += 1 added_to_something = False if command.find("MINUS") > -1: types["unproject"] += 1 added_to_something = True else: types["project"] += 1 added_to_something = True if not added_to_something: print "Missed a case!", command cursor = db.execute("SELECT command, text FROM commands, queries \ WHERE commands.query_id = queries.id \ AND operation_type=?", ["TransformingProjection"]) for (command, query) in cursor.fetchall(): total += 1 added_to_something = False if command.find("BUCKET") == 0: types["bucket"] += 1 added_to_something = True if command.find("CONVERT") == 0: types["conversion"] += 1 added_to_something = True if command.find("FILLNULL") == 0: types["fill null"] += 1 added_to_something = True if command.find("MAKEMV") == 0: types["multivalue"] += 1 added_to_something = True if command.find("MULTIKV") == 0: types["string extraction"] += 1 added_to_something = True if command.find("REPLACE") == 0: types["string manipulation"] += 1 added_to_something = True if not added_to_something: print "Missed a case!", command db.close() print_results(types, total)