def test_attach1(self): # enable USDT probe from given PID and verifier generated BPF programs u = USDT(pid=int(self.app.pid)) u.enable_probe(probe="probe", fn_name="do_trace") b = BPF(text=self.bpf_text, usdt_contexts=[u]) # processing events self.probe_value_1 = 0 self.probe_value_2 = 0 self.probe_value_3 = 0 self.probe_value_other = 0 def print_event(cpu, data, size): result = ct.cast(data, ct.POINTER(ct.c_int)).contents if result.value == 1: self.probe_value_1 = 1 elif result.value == 2: self.probe_value_2 = 1 elif result.value == 3: self.probe_value_3 = 1 else: self.probe_value_other = 1 b["event"].open_perf_buffer(print_event) for i in range(10): b.perf_buffer_poll() self.assertTrue(self.probe_value_1 != 0) self.assertTrue(self.probe_value_2 != 0) self.assertTrue(self.probe_value_3 != 0) self.assertTrue(self.probe_value_other == 0)
def print_usdt(pid, lib): reader = USDT(path=lib, pid=pid) probes_seen = [] for probe in reader.enumerate_probes(): probe_name = probe.short_name() if not args.filter or fnmatch.fnmatch(probe_name, args.filter): if probe_name in probes_seen: continue probes_seen.append(probe_name) print_usdt_details(probe)
def print_usdt(pid, lib): reader = USDT(path=lib, pid=pid) probes_seen = [] for probe in reader.enumerate_probes(): probe_name = probe.short_name() if not args.filter or fnmatch.fnmatch(probe_name, args.filter): if probe_name in probes_seen: continue probes_seen.append(probe_name) if args.variables: print(probe) else: print("%s %s:%s" % (probe.bin_path, probe.provider, probe.name))
def _enable_probes(self): self.usdts = [] for pid in self.targets: usdt = USDT(pid=pid) for event in self.events: try: usdt.enable_probe(event, "%s_%s" % (self.language, event)) except Exception: # This process might not have a recent version of the USDT # probes enabled, or might have been compiled without USDT # probes at all. The process could even have been shut down # and the pid been recycled. We have to gracefully handle # the possibility that we can't attach probes to it at all. pass self.usdts.append(usdt)
def _find_usdt_probe(self): target = Probe.pid if Probe.pid and Probe.pid != -1 else Probe.tgid self.usdt = USDT(path=self.library, pid=target) for probe in self.usdt.enumerate_probes(): if probe.name == self.usdt_name: return # Found it, will enable later self._bail("unrecognized USDT probe %s" % self.usdt_name)
def _enable_probes(self): self.usdts = [] for pid in self.targets: try: usdt = USDT(pid=pid) except USDTException: # avoid race condition on pid going away. print("failed to instrument %d" % pid, file=sys.stderr) continue for event in self.events: try: usdt.enable_probe(event, "%s_%s" % (self.language, event)) except Exception: # This process might not have a recent version of the USDT # probes enabled, or might have been compiled without USDT # probes at all. The process could even have been shut down # and the pid been recycled. We have to gracefully handle # the possibility that we can't attach probes to it at all. pass self.usdts.append(usdt)
def do_object_creation_probes(): # load BPF program bpf_text = """ #include <uapi/linux/ptrace.h> struct key_t { char objectname[128]; }; BPF_HASH(objmap, struct key_t, u64); int do_count(struct pt_regs *ctx) { struct key_t key = {}; bpf_probe_read(&key.objectname, sizeof(key.objectname), (void *)(unsigned long)ctx->r13); objmap.increment(key); return 0; } """ u = USDT(pid=args.pid) u.enable_probe(probe="object__create", fn_name="do_count") b = BPF(text=bpf_text, usdt=u) print("Press Ctrl-C to display/exit") try: sleep(99999999) except KeyboardInterrupt: pass data = b.get_table("objmap") data = sorted(data.items(), key=lambda kv: kv[1].value) for key, value in data: print("\t%-10s %s" % \ (str(value.value), str(key.objectname.decode())))
def _generate_functions(self, template): self.usdt = None text = "" if self.type == "p" and not self.library: functions = BPF.get_kprobe_functions(self.pattern) verify_limit(len(functions)) for function in functions: text += self._add_function(template, function) elif self.type == "p" and self.library: # uprobes are tricky because the same function may have multiple # addresses, and the same address may be mapped to multiple # functions. We aren't allowed to create more than one uprobe # per address, so track unique addresses and ignore functions that # map to an address that we've already seen. Also ignore functions # that may repeat multiple times with different addresses. addresses, functions = (set(), set()) functions_and_addresses = BPF.get_user_functions_and_addresses( self.library, self.pattern) verify_limit(len(functions_and_addresses)) for function, address in functions_and_addresses: if address in addresses or function in functions: continue addresses.add(address) functions.add(function) text += self._add_function(template, function) elif self.type == "t": tracepoints = BPF.get_tracepoints(self.pattern) verify_limit(len(tracepoints)) for tracepoint in tracepoints: text += self._add_function(template, tracepoint) elif self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) matches = [] for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): matches.append(probe.name) verify_limit(len(matches)) for match in matches: new_func = "trace_count_%d" % self.matched text += self._add_function(template, match) self.usdt.enable_probe(match, new_func) if debug: print(self.usdt.get_text()) return text
"-DHHVM_TRACING_MAX_STACK_FRAMES={}".format(200), "-DHHVM_TRACING_FILE_NAME_MAX={}".format(128), "-DHHVM_TRACING_CLASS_NAME_MAX={}".format(128), "-DHHVM_TRACING_FUNCTION_MAX={}".format(128), ] bpf_text = "" with open("../strobelight_hhvm_structs.h", "r") as structs_file: bpf_text += structs_file.read() + "\n" with open("../strobelight_hhvm_stacks_usdt_probe.c", "r") as stacks_file: bpf_text += stacks_file.read() + "\n" usdts = [] for pid in args.pids: usdt = USDT(pid=pid) usdt.enable_probe("hhvm_stack", "on_hhvm_event_hook") print("Enabled tracing on {}\n".format(pid)) usdts.append(usdt) b = BPF(text=bpf_text, usdt_contexts=usdts, cflags=cflags) class HackSymbol: """Struct from strobelight_hhvm_structs.h""" def __init__(self, bcc_obj): self.line = bcc_obj.line encoding = "utf-8" self.file_name = bcc_obj.file_name.decode(encoding) self.class_name = bcc_obj.class_name.decode(encoding) self.function = bcc_obj.function.decode(encoding)
return 0; } """ if not args.server: text += probe.replace("PROBE", "client").replace("TYPE", "CLIENT") if not args.client: text += probe.replace("PROBE", "server").replace("TYPE", "SERVER") CLIENT = 0 SERVER = 1 if args.stack: text = "#define NEED_STACK\n" + text usdt = USDT(pid=args.pid) if not args.server: usdt.enable_probe("http__client__request", "client_start") usdt.enable_probe("http__client__response", "client_end") if not args.client: usdt.enable_probe("http__server__request", "server_start") usdt.enable_probe("http__server__response", "server_end") bpf = BPF(text=text, usdt_contexts=[usdt], debug=DEBUG_PREPROCESSOR if args.debug else 0) class Data(ct.Structure): _fields_ = [ ("start_ns", ct.c_ulong), ("duration_ns", ct.c_ulong), ("port", ct.c_int), ("type", ct.c_int),
class Probe(object): probe_count = 0 streq_index = 0 max_events = None event_count = 0 first_ts = 0 use_localtime = True tgid = -1 pid = -1 @classmethod def configure(cls, args): cls.max_events = args.max_events cls.use_localtime = not args.offset cls.first_ts = Time.monotonic_time() cls.tgid = args.tgid or -1 cls.pid = args.pid or -1 def __init__(self, probe, string_size, kernel_stack, user_stack): self.usdt = None self.streq_functions = "" self.raw_probe = probe self.string_size = string_size self.kernel_stack = kernel_stack self.user_stack = user_stack Probe.probe_count += 1 self._parse_probe() self.probe_num = Probe.probe_count self.probe_name = "probe_%s_%d" % (self._display_function(), self.probe_num) def __str__(self): return "%s:%s:%s FLT=%s ACT=%s/%s" % ( self.probe_type, self.library, self._display_function(), self.filter, self.types, self.values, ) def is_default_action(self): return self.python_format == "" def _bail(self, error): raise ValueError("error in probe '%s': %s" % (self.raw_probe, error)) def _parse_probe(self): text = self.raw_probe # Everything until the first space is the probe specifier first_space = text.find(" ") spec = text[:first_space] if first_space >= 0 else text self._parse_spec(spec) if first_space >= 0: text = text[first_space:].lstrip() else: text = "" # If we now have a (, wait for the balanced closing ) and that # will be the predicate self.filter = None if len(text) > 0 and text[0] == "(": balance = 1 for i in range(1, len(text)): if text[i] == "(": balance += 1 if text[i] == ")": balance -= 1 if balance == 0: self._parse_filter(text[: i + 1]) text = text[i + 1 :] break if self.filter is None: self._bail("unmatched end of predicate") if self.filter is None: self.filter = "1" # The remainder of the text is the printf action self._parse_action(text.lstrip()) def _parse_spec(self, spec): parts = spec.split(":") # Two special cases: 'func' means 'p::func', 'lib:func' means # 'p:lib:func'. Other combinations need to provide an empty # value between delimiters, e.g. 'r::func' for a kretprobe on # the function func. if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] if len(parts[0]) == 0: self.probe_type = "p" elif parts[0] in ["p", "r", "t", "u"]: self.probe_type = parts[0] else: self._bail("probe type must be '', 'p', 't', 'r', " + "or 'u', but got '%s'" % parts[0]) if self.probe_type == "t": self.tp_category = parts[1] self.tp_event = parts[2] self.library = "" # kernel self.function = "" # from TRACEPOINT_PROBE elif self.probe_type == "u": self.library = parts[1] self.usdt_name = parts[2] self.function = "" # no function, just address # We will discover the USDT provider by matching on # the USDT name in the specified library self._find_usdt_probe() else: self.library = parts[1] self.function = parts[2] def _find_usdt_probe(self): target = Probe.pid if Probe.pid and Probe.pid != -1 else Probe.tgid self.usdt = USDT(path=self.library, pid=target) for probe in self.usdt.enumerate_probes(): if probe.name == self.usdt_name: return # Found it, will enable later self._bail("unrecognized USDT probe %s" % self.usdt_name) def _parse_filter(self, filt): self.filter = self._rewrite_expr(filt) def _parse_types(self, fmt): for match in re.finditer(r"[^%]%(s|u|d|llu|lld|hu|hd|x|llx|c|K|U)", fmt): self.types.append(match.group(1)) fmt = re.sub(r"([^%]%)(u|d|llu|lld|hu|hd)", r"\1d", fmt) fmt = re.sub(r"([^%]%)(x|llx)", r"\1x", fmt) fmt = re.sub("%K|%U", "%s", fmt) self.python_format = fmt.strip('"') def _parse_action(self, action): self.values = [] self.types = [] self.python_format = "" if len(action) == 0: return action = action.strip() match = re.search(r"(\".*?\"),?(.*)", action) if match is None: self._bail('expected format string in "s') self.raw_format = match.group(1) self._parse_types(self.raw_format) for part in re.split('(?<!"),', match.group(2)): part = self._rewrite_expr(part) if len(part) > 0: self.values.append(part) aliases = { "retval": "PT_REGS_RC(ctx)", "arg1": "PT_REGS_PARM1(ctx)", "arg2": "PT_REGS_PARM2(ctx)", "arg3": "PT_REGS_PARM3(ctx)", "arg4": "PT_REGS_PARM4(ctx)", "arg5": "PT_REGS_PARM5(ctx)", "arg6": "PT_REGS_PARM6(ctx)", "$uid": "(unsigned)(bpf_get_current_uid_gid() & 0xffffffff)", "$gid": "(unsigned)(bpf_get_current_uid_gid() >> 32)", "$pid": "(unsigned)(bpf_get_current_pid_tgid() & 0xffffffff)", "$tgid": "(unsigned)(bpf_get_current_pid_tgid() >> 32)", "$cpu": "bpf_get_smp_processor_id()", } def _generate_streq_function(self, string): fname = "streq_%d" % Probe.streq_index Probe.streq_index += 1 self.streq_functions += """ static inline bool %s(char const *ignored, unsigned long str) { char needle[] = %s; char haystack[sizeof(needle)]; bpf_probe_read(&haystack, sizeof(haystack), (void *)str); for (int i = 0; i < sizeof(needle); ++i) { if (needle[i] != haystack[i]) { return false; } } return true; } """ % ( fname, string, ) return fname def _rewrite_expr(self, expr): for alias, replacement in Probe.aliases.items(): # For USDT probes, we replace argN values with the # actual arguments for that probe obtained using # bpf_readarg_N macros emitted at BPF construction. if alias.startswith("arg") and self.probe_type == "u": continue expr = expr.replace(alias, replacement) matches = re.finditer('STRCMP\\(("[^"]+\\")', expr) for match in matches: string = match.group(1) fname = self._generate_streq_function(string) expr = expr.replace("STRCMP", fname, 1) return expr p_type = { "u": ct.c_uint, "d": ct.c_int, "llu": ct.c_ulonglong, "lld": ct.c_longlong, "hu": ct.c_ushort, "hd": ct.c_short, "x": ct.c_uint, "llx": ct.c_ulonglong, "c": ct.c_ubyte, "K": ct.c_ulonglong, "U": ct.c_ulonglong, } def _generate_python_field_decl(self, idx, fields): field_type = self.types[idx] if field_type == "s": ptype = ct.c_char * self.string_size else: ptype = Probe.p_type[field_type] fields.append(("v%d" % idx, ptype)) def _generate_python_data_decl(self): self.python_struct_name = "%s_%d_Data" % (self._display_function(), self.probe_num) fields = [ ("timestamp_ns", ct.c_ulonglong), ("tgid", ct.c_uint), ("pid", ct.c_uint), ("comm", ct.c_char * 16), # TASK_COMM_LEN ] for i in range(0, len(self.types)): self._generate_python_field_decl(i, fields) if self.kernel_stack: fields.append(("kernel_stack_id", ct.c_int)) if self.user_stack: fields.append(("user_stack_id", ct.c_int)) return type(self.python_struct_name, (ct.Structure,), dict(_fields_=fields)) c_type = { "u": "unsigned int", "d": "int", "llu": "unsigned long long", "lld": "long long", "hu": "unsigned short", "hd": "short", "x": "unsigned int", "llx": "unsigned long long", "c": "char", "K": "unsigned long long", "U": "unsigned long long", } fmt_types = c_type.keys() def _generate_field_decl(self, idx): field_type = self.types[idx] if field_type == "s": return "char v%d[%d];\n" % (idx, self.string_size) if field_type in Probe.fmt_types: return "%s v%d;\n" % (Probe.c_type[field_type], idx) self._bail("unrecognized format specifier %s" % field_type) def _generate_data_decl(self): # The BPF program will populate values into the struct # according to the format string, and the Python program will # construct the final display string. self.events_name = "%s_events" % self.probe_name self.struct_name = "%s_data_t" % self.probe_name self.stacks_name = "%s_stacks" % self.probe_name stack_table = "BPF_STACK_TRACE(%s, 1024);" % self.stacks_name if (self.kernel_stack or self.user_stack) else "" data_fields = "" for i, field_type in enumerate(self.types): data_fields += " " + self._generate_field_decl(i) kernel_stack_str = " int kernel_stack_id;" if self.kernel_stack else "" user_stack_str = " int user_stack_id;" if self.user_stack else "" text = """ struct %s { u64 timestamp_ns; u32 tgid; u32 pid; char comm[TASK_COMM_LEN]; %s %s %s }; BPF_PERF_OUTPUT(%s); %s """ return text % (self.struct_name, data_fields, kernel_stack_str, user_stack_str, self.events_name, stack_table) def _generate_field_assign(self, idx): field_type = self.types[idx] expr = self.values[idx].strip() text = "" if self.probe_type == "u" and expr[0:3] == "arg": text = (" u64 %s = 0;\n" + " bpf_usdt_readarg(%s, ctx, &%s);\n") % (expr, expr[3], expr) if field_type == "s": return ( text + """ if (%s != 0) { bpf_probe_read(&__data.v%d, sizeof(__data.v%d), (void *)%s); } """ % (expr, idx, idx, expr) ) if field_type in Probe.fmt_types: return text + " __data.v%d = (%s)%s;\n" % (idx, Probe.c_type[field_type], expr) self._bail("unrecognized field type %s" % field_type) def _generate_usdt_filter_read(self): text = "" if self.probe_type == "u": for arg, _ in Probe.aliases.items(): if not (arg.startswith("arg") and (arg in self.filter)): continue arg_index = int(arg.replace("arg", "")) arg_ctype = self.usdt.get_probe_arg_ctype(self.usdt_name, arg_index) if not arg_ctype: self._bail("Unable to determine type of {} " "in the filter".format(arg)) text += """ {} {}_filter; bpf_usdt_readarg({}, ctx, &{}_filter); """.format( arg_ctype, arg, arg_index, arg ) self.filter = self.filter.replace(arg, "{}_filter".format(arg)) return text def generate_program(self, include_self): data_decl = self._generate_data_decl() # kprobes don't have built-in pid filters, so we have to add # it to the function body: if len(self.library) == 0 and Probe.pid != -1: pid_filter = ( """ if (__pid != %d) { return 0; } """ % Probe.pid ) elif len(self.library) == 0 and Probe.tgid != -1: pid_filter = ( """ if (__tgid != %d) { return 0; } """ % Probe.tgid ) elif not include_self: pid_filter = ( """ if (__tgid == %d) { return 0; } """ % os.getpid() ) else: pid_filter = "" prefix = "" signature = "struct pt_regs *ctx" data_fields = "" for i, expr in enumerate(self.values): data_fields += self._generate_field_assign(i) if self.probe_type == "t": heading = "TRACEPOINT_PROBE(%s, %s)" % (self.tp_category, self.tp_event) ctx_name = "args" else: heading = "int %s(%s)" % (self.probe_name, signature) ctx_name = "ctx" stack_trace = "" if self.user_stack: stack_trace += """ __data.user_stack_id = %s.get_stackid( %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK );""" % ( self.stacks_name, ctx_name, ) if self.kernel_stack: stack_trace += """ __data.kernel_stack_id = %s.get_stackid( %s, BPF_F_REUSE_STACKID );""" % ( self.stacks_name, ctx_name, ) text = ( heading + """ { u64 __pid_tgid = bpf_get_current_pid_tgid(); u32 __tgid = __pid_tgid >> 32; u32 __pid = __pid_tgid; // implicit cast to u32 for bottom half %s %s %s if (!(%s)) return 0; struct %s __data = {0}; __data.timestamp_ns = bpf_ktime_get_ns(); __data.tgid = __tgid; __data.pid = __pid; bpf_get_current_comm(&__data.comm, sizeof(__data.comm)); %s %s %s.perf_submit(%s, &__data, sizeof(__data)); return 0; } """ ) text = text % ( pid_filter, prefix, self._generate_usdt_filter_read(), self.filter, self.struct_name, data_fields, stack_trace, self.events_name, ctx_name, ) return self.streq_functions + data_decl + "\n" + text @classmethod def _time_off_str(cls, timestamp_ns): return "%.6f" % (1e-9 * (timestamp_ns - cls.first_ts)) def _display_function(self): if self.probe_type == "p" or self.probe_type == "r": return self.function elif self.probe_type == "u": return self.usdt_name else: # self.probe_type == 't' return self.tp_event def print_stack(self, bpf, stack_id, tgid): if stack_id < 0: print(" %d" % stack_id) return stack = list(bpf.get_table(self.stacks_name).walk(stack_id)) for addr in stack: print(" %016x %s" % (addr, bpf.sym(addr, tgid))) def _format_message(self, bpf, tgid, values): # Replace each %K with kernel sym and %U with user sym in tgid kernel_placeholders = [i for i in xrange(0, len(self.types)) if self.types[i] == "K"] user_placeholders = [i for i in xrange(0, len(self.types)) if self.types[i] == "U"] for kp in kernel_placeholders: values[kp] = bpf.ksymaddr(values[kp]) for up in user_placeholders: values[up] = bpf.symaddr(values[up], tgid) return self.python_format % tuple(values) def print_event(self, bpf, cpu, data, size): # Cast as the generated structure type and display # according to the format string in the probe. event = ct.cast(data, ct.POINTER(self.python_struct)).contents values = map(lambda i: getattr(event, "v%d" % i), range(0, len(self.values))) msg = self._format_message(bpf, event.tgid, values) time = strftime("%H:%M:%S") if Probe.use_localtime else Probe._time_off_str(event.timestamp_ns) print( "%-8s %-6d %-6d %-12s %-16s %s" % (time[:8], event.tgid, event.pid, event.comm, self._display_function(), msg) ) if self.kernel_stack: self.print_stack(bpf, event.kernel_stack_id, -1) if self.user_stack: self.print_stack(bpf, event.user_stack_id, event.tgid) if self.user_stack or self.kernel_stack: print("") Probe.event_count += 1 if Probe.max_events is not None and Probe.event_count >= Probe.max_events: exit() def attach(self, bpf, verbose): if len(self.library) == 0: self._attach_k(bpf) else: self._attach_u(bpf) self.python_struct = self._generate_python_data_decl() callback = partial(self.print_event, bpf) bpf[self.events_name].open_perf_buffer(callback) def _attach_k(self, bpf): if self.probe_type == "r": bpf.attach_kretprobe(event=self.function, fn_name=self.probe_name) elif self.probe_type == "p": bpf.attach_kprobe(event=self.function, fn_name=self.probe_name) # Note that tracepoints don't need an explicit attach def _attach_u(self, bpf): libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: self._bail("unable to find library %s" % self.library) if self.probe_type == "u": pass # Was already enabled by the BPF constructor elif self.probe_type == "r": bpf.attach_uretprobe(name=libpath, sym=self.function, fn_name=self.probe_name, pid=Probe.pid) else: bpf.attach_uprobe(name=libpath, sym=self.function, fn_name=self.probe_name, pid=Probe.pid)
timestampp = temp.lookup(&pid); if (!timestampp) return 0; u64 delta = bpf_ktime_get_ns() - *timestampp; if (delta/1000000 < %d) return 0; delta /= %d; latency.increment(bpf_log2l(delta)); temp.delete(&pid); return 0; } """ % (args.threshold, 1000000 if args.milliseconds else 1000) usdt = USDT(pid=int(dbpid)) usdt.enable_probe("query__start", "probe_start") usdt.enable_probe("query__done", "probe_end") bpf = BPF(text=program, usdt_contexts=[usdt]) if args.verbose: print(usdt.get_text()) print(program) print( "Tracing database queries slower than %dms for PID %d... Ctrl+C to quit." % (args.threshold, dbpid)) try: sleep(999999999999) except KeyboardInterrupt:
# load BPF program bpf_text = """ #include <uapi/linux/ptrace.h> int do_trace(struct pt_regs *ctx) { uint64_t addr; char path[128]={0}; bpf_usdt_readarg(6, ctx, &addr); bpf_probe_read(&path, sizeof(path), (void *)addr); bpf_trace_printk("path:%s\\n", path); return 0; }; """ # enable USDT probe from given PID u = USDT(pid=int(pid)) u.enable_probe(probe="http__server__request", fn_name="do_trace") if debug: print(u.get_text()) print(bpf_text) # initialize BPF b = BPF(text=bpf_text, usdt_contexts=[u]) # header print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "ARGS")) # format output while 1: try: (task, pid, cpu, flags, ts, msg) = b.trace_fields()
def enable_usdt_probe(self, probe_name, fn_name): if not self.usdt_ctx: self.usdt_ctx = USDT(pid=self.args.pid) self.usdt_ctx.enable_probe(probe_name, fn_name)
hash_leaf->average = hash_leaf->total / hash_leaf->count; return 0; } """ bpf_text = bpf_text.replace("SAMPLE_COUNT", str(this_count)) bpf_text = bpf_text.replace("FILTER_STRING", this_filter) if this_filter: bpf_text = bpf_text.replace("FILTER", "if (!filter(start_data.input)) { return 0; }") else: bpf_text = bpf_text.replace("FILTER", "") # Create USDT context print("Attaching probes to pid %d" % this_pid) usdt_ctx = USDT(pid=this_pid) usdt_ctx.enable_probe(probe="operation_start", fn_name="trace_operation_start") usdt_ctx.enable_probe(probe="operation_end", fn_name="trace_operation_end") # Create BPF context, load BPF program bpf_ctx = BPF(text=bpf_text, usdt_contexts=[usdt_ctx], debug=debugLevel) print("Tracing... Hit Ctrl-C to end.") lat_hash = bpf_ctx.get_table("lat_hash") while (1): try: sleep(this_interval) except KeyboardInterrupt: exit()
default=1) parser.add_argument('-d', type=int, dest='duration_seconds', help='Recording duration in seconds', default=10) return parser args = get_arg_parser().parse_args() print(prog % (next_power_of_two(args.sampling_frequency) - 1)) pid = args.pid usdt = USDT(path=args.lib_jvm_path, pid=args.pid) usdt.enable_probe(probe="object__alloc", fn_name="trace_alloc") bpf = BPF(text=prog % (next_power_of_two(args.sampling_frequency) - 1), usdt_contexts=[usdt]) time.sleep(float(str(args.duration_seconds))) if len(bpf["tids"]) == 0: print("No data found - are DTrace probes enabled in running process?") stack_traces = bpf["stack_traces"] all_stacks = [] stack_counts = dict() for k, v in bpf["counts"].iteritems(): stack = []
class Probe(object): probe_count = 0 streq_index = 0 max_events = None event_count = 0 first_ts = 0 print_time = False use_localtime = True time_field = False print_cpu = False print_address = False tgid = -1 pid = -1 page_cnt = None @classmethod def configure(cls, args): cls.max_events = args.max_events cls.print_time = args.timestamp or args.time cls.use_localtime = not args.timestamp cls.time_field = cls.print_time and (not cls.use_localtime) cls.print_cpu = args.print_cpu cls.print_address = args.address cls.first_ts = BPF.monotonic_time() cls.tgid = args.tgid or -1 cls.pid = args.pid or -1 cls.page_cnt = args.buffer_pages cls.bin_cmp = args.bin_cmp def __init__(self, probe, string_size, kernel_stack, user_stack): self.usdt = None self.streq_functions = "" self.raw_probe = probe self.string_size = string_size self.kernel_stack = kernel_stack self.user_stack = user_stack Probe.probe_count += 1 self._parse_probe() self.probe_num = Probe.probe_count self.probe_name = "probe_%s_%d" % \ (self._display_function(), self.probe_num) self.probe_name = re.sub(r'[^A-Za-z0-9_]', '_', self.probe_name) # compiler can generate proper codes for function # signatures with "syscall__" prefix if self.is_syscall_kprobe: self.probe_name = "syscall__" + self.probe_name[6:] def __str__(self): return "%s:%s:%s FLT=%s ACT=%s/%s" % (self.probe_type, self.library, self._display_function(), self.filter, self.types, self.values) def is_default_action(self): return self.python_format == "" def _bail(self, error): raise ValueError("error in probe '%s': %s" % (self.raw_probe, error)) def _parse_probe(self): text = self.raw_probe # There might be a function signature preceding the actual # filter/print part, or not. Find the probe specifier first -- # it ends with either a space or an open paren ( for the # function signature part. # opt. signature # probespec | rest # --------- ---------- -- (spec, sig, rest) = re.match(r'([^ \t\(]+)(\([^\(]*\))?(.*)', text).groups() self._parse_spec(spec) # Remove the parens self.signature = sig[1:-1] if sig else None if self.signature and self.probe_type in ['u', 't']: self._bail("USDT and tracepoint probes can't have " + "a function signature; use arg1, arg2, " + "... instead") text = rest.lstrip() # If we now have a (, wait for the balanced closing ) and that # will be the predicate self.filter = None if len(text) > 0 and text[0] == "(": balance = 1 for i in range(1, len(text)): if text[i] == "(": balance += 1 if text[i] == ")": balance -= 1 if balance == 0: self._parse_filter(text[:i + 1]) text = text[i + 1:] break if self.filter is None: self._bail("unmatched end of predicate") if self.filter is None: self.filter = "1" # The remainder of the text is the printf action self._parse_action(text.lstrip()) def _parse_spec(self, spec): parts = spec.split(":") # Two special cases: 'func' means 'p::func', 'lib:func' means # 'p:lib:func'. Other combinations need to provide an empty # value between delimiters, e.g. 'r::func' for a kretprobe on # the function func. if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] if len(parts[0]) == 0: self.probe_type = "p" elif parts[0] in ["p", "r", "t", "u"]: self.probe_type = parts[0] else: self._bail("probe type must be '', 'p', 't', 'r', " + "or 'u', but got '%s'" % parts[0]) if self.probe_type == "t": self.tp_category = parts[1] self.tp_event = parts[2] self.library = "" # kernel self.function = "" # from TRACEPOINT_PROBE elif self.probe_type == "u": self.library = ':'.join(parts[1:-1]) self.usdt_name = parts[-1] self.function = "" # no function, just address # We will discover the USDT provider by matching on # the USDT name in the specified library self._find_usdt_probe() else: self.library = ':'.join(parts[1:-1]) self.function = parts[-1] # only x64 syscalls needs checking, no other syscall wrapper yet. self.is_syscall_kprobe = False if self.probe_type == "p" and len(self.library) == 0 and \ self.function[:10] == "__x64_sys_": self.is_syscall_kprobe = True def _find_usdt_probe(self): target = Probe.pid if Probe.pid and Probe.pid != -1 \ else Probe.tgid self.usdt = USDT(path=self.library, pid=target) for probe in self.usdt.enumerate_probes(): if probe.name == self.usdt_name.encode('ascii'): return # Found it, will enable later self._bail("unrecognized USDT probe %s" % self.usdt_name) def _parse_filter(self, filt): self.filter = self._rewrite_expr(filt) def _parse_types(self, fmt): for match in re.finditer( r'[^%]%(s|u|d|lu|llu|ld|lld|hu|hd|x|lx|llx|c|K|U)', fmt): self.types.append(match.group(1)) fmt = re.sub(r'([^%]%)(u|d|lu|llu|ld|lld|hu|hd)', r'\1d', fmt) fmt = re.sub(r'([^%]%)(x|lx|llx)', r'\1x', fmt) fmt = re.sub('%K|%U', '%s', fmt) self.python_format = fmt.strip('"') def _parse_action(self, action): self.values = [] self.types = [] self.python_format = "" if len(action) == 0: return action = action.strip() match = re.search(r'(\".*?\"),?(.*)', action) if match is None: self._bail("expected format string in \"s") self.raw_format = match.group(1) self._parse_types(self.raw_format) for part in re.split('(?<!"),', match.group(2)): part = self._rewrite_expr(part) if len(part) > 0: self.values.append(part) aliases_arg = { "arg1": "PT_REGS_PARM1(ctx)", "arg2": "PT_REGS_PARM2(ctx)", "arg3": "PT_REGS_PARM3(ctx)", "arg4": "PT_REGS_PARM4(ctx)", "arg5": "PT_REGS_PARM5(ctx)", "arg6": "PT_REGS_PARM6(ctx)", } aliases_indarg = { "arg1": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM1(_ctx))); _val;})", "arg2": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM2(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM2(_ctx))); _val;})", "arg3": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM3(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM3(_ctx))); _val;})", "arg4": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM4(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM4(_ctx))); _val;})", "arg5": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM5(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM5(_ctx))); _val;})", "arg6": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM6(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM6(_ctx))); _val;})", } aliases_common = { "retval": "PT_REGS_RC(ctx)", "$uid": "(unsigned)(bpf_get_current_uid_gid() & 0xffffffff)", "$gid": "(unsigned)(bpf_get_current_uid_gid() >> 32)", "$pid": "(unsigned)(bpf_get_current_pid_tgid() & 0xffffffff)", "$tgid": "(unsigned)(bpf_get_current_pid_tgid() >> 32)", "$cpu": "bpf_get_smp_processor_id()", "$task" : "((struct task_struct *)bpf_get_current_task())" } def _generate_streq_function(self, string): fname = "streq_%d" % Probe.streq_index Probe.streq_index += 1 self.streq_functions += """ static inline bool %s(char const *ignored, uintptr_t str) { char needle[] = %s; char haystack[sizeof(needle)]; bpf_probe_read(&haystack, sizeof(haystack), (void *)str); for (int i = 0; i < sizeof(needle) - 1; ++i) { if (needle[i] != haystack[i]) { return false; } } return true; } """ % (fname, string) return fname def _rewrite_expr(self, expr): if self.is_syscall_kprobe: for alias, replacement in Probe.aliases_indarg.items(): expr = expr.replace(alias, replacement) else: for alias, replacement in Probe.aliases_arg.items(): # For USDT probes, we replace argN values with the # actual arguments for that probe obtained using # bpf_readarg_N macros emitted at BPF construction. if self.probe_type == "u": continue expr = expr.replace(alias, replacement) for alias, replacement in Probe.aliases_common.items(): expr = expr.replace(alias, replacement) if self.bin_cmp: STRCMP_RE = 'STRCMP\\(\"([^"]+)\\"' else: STRCMP_RE = 'STRCMP\\(("[^"]+\\")' matches = re.finditer(STRCMP_RE, expr) for match in matches: string = match.group(1) fname = self._generate_streq_function(string) expr = expr.replace("STRCMP", fname, 1) return expr p_type = {"u": ct.c_uint, "d": ct.c_int, "lu": ct.c_ulong, "ld": ct.c_long, "llu": ct.c_ulonglong, "lld": ct.c_longlong, "hu": ct.c_ushort, "hd": ct.c_short, "x": ct.c_uint, "lx": ct.c_ulong, "llx": ct.c_ulonglong, "c": ct.c_ubyte, "K": ct.c_ulonglong, "U": ct.c_ulonglong} def _generate_python_field_decl(self, idx, fields): field_type = self.types[idx] if field_type == "s": ptype = ct.c_char * self.string_size else: ptype = Probe.p_type[field_type] fields.append(("v%d" % idx, ptype)) def _generate_python_data_decl(self): self.python_struct_name = "%s_%d_Data" % \ (self._display_function(), self.probe_num) fields = [] if self.time_field: fields.append(("timestamp_ns", ct.c_ulonglong)) if self.print_cpu: fields.append(("cpu", ct.c_int)) fields.extend([ ("tgid", ct.c_uint), ("pid", ct.c_uint), ("comm", ct.c_char * 16) # TASK_COMM_LEN ]) for i in range(0, len(self.types)): self._generate_python_field_decl(i, fields) if self.kernel_stack: fields.append(("kernel_stack_id", ct.c_int)) if self.user_stack: fields.append(("user_stack_id", ct.c_int)) return type(self.python_struct_name, (ct.Structure,), dict(_fields_=fields)) c_type = {"u": "unsigned int", "d": "int", "lu": "unsigned long", "ld": "long", "llu": "unsigned long long", "lld": "long long", "hu": "unsigned short", "hd": "short", "x": "unsigned int", "lx": "unsigned long", "llx": "unsigned long long", "c": "char", "K": "unsigned long long", "U": "unsigned long long"} fmt_types = c_type.keys() def _generate_field_decl(self, idx): field_type = self.types[idx] if field_type == "s": return "char v%d[%d];\n" % (idx, self.string_size) if field_type in Probe.fmt_types: return "%s v%d;\n" % (Probe.c_type[field_type], idx) self._bail("unrecognized format specifier %s" % field_type) def _generate_data_decl(self): # The BPF program will populate values into the struct # according to the format string, and the Python program will # construct the final display string. self.events_name = "%s_events" % self.probe_name self.struct_name = "%s_data_t" % self.probe_name self.stacks_name = "%s_stacks" % self.probe_name stack_table = "BPF_STACK_TRACE(%s, 1024);" % self.stacks_name \ if (self.kernel_stack or self.user_stack) else "" data_fields = "" for i, field_type in enumerate(self.types): data_fields += " " + \ self._generate_field_decl(i) time_str = "u64 timestamp_ns;" if self.time_field else "" cpu_str = "int cpu;" if self.print_cpu else "" kernel_stack_str = " int kernel_stack_id;" \ if self.kernel_stack else "" user_stack_str = " int user_stack_id;" \ if self.user_stack else "" text = """ struct %s { %s %s u32 tgid; u32 pid; char comm[TASK_COMM_LEN]; %s %s %s }; BPF_PERF_OUTPUT(%s); %s """ return text % (self.struct_name, time_str, cpu_str, data_fields, kernel_stack_str, user_stack_str, self.events_name, stack_table) def _generate_field_assign(self, idx): field_type = self.types[idx] expr = self.values[idx].strip() text = "" if self.probe_type == "u" and expr[0:3] == "arg": arg_index = int(expr[3]) arg_ctype = self.usdt.get_probe_arg_ctype( self.usdt_name, arg_index - 1) text = (" %s %s = 0;\n" + " bpf_usdt_readarg(%s, ctx, &%s);\n") \ % (arg_ctype, expr, expr[3], expr) if field_type == "s": return text + """ if (%s != 0) { void *__tmp = (void *)%s; bpf_probe_read(&__data.v%d, sizeof(__data.v%d), __tmp); } """ % (expr, expr, idx, idx) if field_type in Probe.fmt_types: return text + " __data.v%d = (%s)%s;\n" % \ (idx, Probe.c_type[field_type], expr) self._bail("unrecognized field type %s" % field_type) def _generate_usdt_filter_read(self): text = "" if self.probe_type != "u": return text for arg, _ in Probe.aliases_arg.items(): if not (arg in self.filter): continue arg_index = int(arg.replace("arg", "")) arg_ctype = self.usdt.get_probe_arg_ctype( self.usdt_name, arg_index - 1) if not arg_ctype: self._bail("Unable to determine type of {} " "in the filter".format(arg)) text += """ {} {}_filter; bpf_usdt_readarg({}, ctx, &{}_filter); """.format(arg_ctype, arg, arg_index, arg) self.filter = self.filter.replace( arg, "{}_filter".format(arg)) return text def generate_program(self, include_self): data_decl = self._generate_data_decl() if Probe.pid != -1: pid_filter = """ if (__pid != %d) { return 0; } """ % Probe.pid # uprobes can have a built-in tgid filter passed to # attach_uprobe, hence the check here -- for kprobes, we # need to do the tgid test by hand: elif len(self.library) == 0 and Probe.tgid != -1: pid_filter = """ if (__tgid != %d) { return 0; } """ % Probe.tgid elif not include_self: pid_filter = """ if (__tgid == %d) { return 0; } """ % os.getpid() else: pid_filter = "" prefix = "" signature = "struct pt_regs *ctx" if self.signature: signature += ", " + self.signature data_fields = "" for i, expr in enumerate(self.values): data_fields += self._generate_field_assign(i) if self.probe_type == "t": heading = "TRACEPOINT_PROBE(%s, %s)" % \ (self.tp_category, self.tp_event) ctx_name = "args" else: heading = "int %s(%s)" % (self.probe_name, signature) ctx_name = "ctx" time_str = """ __data.timestamp_ns = bpf_ktime_get_ns();""" if self.time_field else "" cpu_str = """ __data.cpu = bpf_get_smp_processor_id();""" if self.print_cpu else "" stack_trace = "" if self.user_stack: stack_trace += """ __data.user_stack_id = %s.get_stackid( %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK );""" % (self.stacks_name, ctx_name) if self.kernel_stack: stack_trace += """ __data.kernel_stack_id = %s.get_stackid( %s, BPF_F_REUSE_STACKID );""" % (self.stacks_name, ctx_name) text = heading + """ { u64 __pid_tgid = bpf_get_current_pid_tgid(); u32 __tgid = __pid_tgid >> 32; u32 __pid = __pid_tgid; // implicit cast to u32 for bottom half %s %s %s if (!(%s)) return 0; struct %s __data = {0}; %s %s __data.tgid = __tgid; __data.pid = __pid; bpf_get_current_comm(&__data.comm, sizeof(__data.comm)); %s %s %s.perf_submit(%s, &__data, sizeof(__data)); return 0; } """ text = text % (pid_filter, prefix, self._generate_usdt_filter_read(), self.filter, self.struct_name, time_str, cpu_str, data_fields, stack_trace, self.events_name, ctx_name) return self.streq_functions + data_decl + "\n" + text @classmethod def _time_off_str(cls, timestamp_ns): return "%.6f" % (1e-9 * (timestamp_ns - cls.first_ts)) def _display_function(self): if self.probe_type == 'p' or self.probe_type == 'r': return self.function elif self.probe_type == 'u': return self.usdt_name else: # self.probe_type == 't' return self.tp_event def print_stack(self, bpf, stack_id, tgid): if stack_id < 0: print(" %d" % stack_id) return stack = list(bpf.get_table(self.stacks_name).walk(stack_id)) for addr in stack: print(" ", end="") if Probe.print_address: print("%16x " % addr, end="") print("%s" % (bpf.sym(addr, tgid, show_module=True, show_offset=True))) def _format_message(self, bpf, tgid, values): # Replace each %K with kernel sym and %U with user sym in tgid kernel_placeholders = [i for i, t in enumerate(self.types) if t == 'K'] user_placeholders = [i for i, t in enumerate(self.types) if t == 'U'] for kp in kernel_placeholders: values[kp] = bpf.ksym(values[kp], show_offset=True) for up in user_placeholders: values[up] = bpf.sym(values[up], tgid, show_module=True, show_offset=True) return self.python_format % tuple(values) def print_event(self, bpf, cpu, data, size): # Cast as the generated structure type and display # according to the format string in the probe. event = ct.cast(data, ct.POINTER(self.python_struct)).contents values = map(lambda i: getattr(event, "v%d" % i), range(0, len(self.values))) msg = self._format_message(bpf, event.tgid, values) if Probe.print_time: time = strftime("%H:%M:%S") if Probe.use_localtime else \ Probe._time_off_str(event.timestamp_ns) print("%-8s " % time[:8], end="") if Probe.print_cpu: print("%-3s " % event.cpu, end="") print("%-7d %-7d %-15s %-16s %s" % (event.tgid, event.pid, event.comm.decode('utf-8', 'replace'), self._display_function(), msg)) if self.kernel_stack: self.print_stack(bpf, event.kernel_stack_id, -1) if self.user_stack: self.print_stack(bpf, event.user_stack_id, event.tgid) if self.user_stack or self.kernel_stack: print("") Probe.event_count += 1 if Probe.max_events is not None and \ Probe.event_count >= Probe.max_events: exit() def attach(self, bpf, verbose): if len(self.library) == 0: self._attach_k(bpf) else: self._attach_u(bpf) self.python_struct = self._generate_python_data_decl() callback = partial(self.print_event, bpf) bpf[self.events_name].open_perf_buffer(callback, page_cnt=self.page_cnt) def _attach_k(self, bpf): if self.probe_type == "r": bpf.attach_kretprobe(event=self.function, fn_name=self.probe_name) elif self.probe_type == "p": bpf.attach_kprobe(event=self.function, fn_name=self.probe_name) # Note that tracepoints don't need an explicit attach def _attach_u(self, bpf): libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: self._bail("unable to find library %s" % self.library) if self.probe_type == "u": pass # Was already enabled by the BPF constructor elif self.probe_type == "r": bpf.attach_uretprobe(name=libpath, sym=self.function, fn_name=self.probe_name, pid=Probe.tgid) else: bpf.attach_uprobe(name=libpath, sym=self.function, fn_name=self.probe_name, pid=Probe.tgid)
class Probe(object): next_probe_index = 0 streq_index = 0 aliases = {"$PID": "(bpf_get_current_pid_tgid() >> 32)"} def _substitute_aliases(self, expr): if expr is None: return expr for alias, subst in Probe.aliases.items(): expr = expr.replace(alias, subst) return expr def _parse_signature(self): params = map(str.strip, self.signature.split(',')) self.param_types = {} for param in params: # If the type is a pointer, the * can be next to the # param name. Other complex types like arrays are not # supported right now. index = param.rfind('*') index = index if index != -1 else param.rfind(' ') param_type = param[0:index + 1].strip() param_name = param[index + 1:].strip() self.param_types[param_name] = param_type # Maintain list of user params. Then later decide to # switch to bpf_probe_read_kernel or bpf_probe_read_user. if "__user" in param_type.split(): self.probe_user_list.add(param_name) def _generate_entry(self): self.entry_probe_func = self.probe_func_name + "_entry" text = """ int PROBENAME(struct pt_regs *ctx SIGNATURE) { u64 __pid_tgid = bpf_get_current_pid_tgid(); u32 __pid = __pid_tgid; // lower 32 bits u32 __tgid = __pid_tgid >> 32; // upper 32 bits PID_FILTER COLLECT return 0; } """ text = text.replace("PROBENAME", self.entry_probe_func) text = text.replace("SIGNATURE", "" if len(self.signature) == 0 else ", " + self.signature) text = text.replace("PID_FILTER", self._generate_pid_filter()) collect = "" for pname in self.args_to_probe: param_hash = self.hashname_prefix + pname if pname == "__latency": collect += """ u64 __time = bpf_ktime_get_ns(); %s.update(&__pid, &__time); """ % param_hash else: collect += "%s.update(&__pid, &%s);\n" % \ (param_hash, pname) text = text.replace("COLLECT", collect) return text def _generate_entry_probe(self): # Any $entry(name) expressions result in saving that argument # when entering the function. self.args_to_probe = set() regex = r"\$entry\((\w+)\)" for expr in self.exprs: for arg in re.finditer(regex, expr): self.args_to_probe.add(arg.group(1)) for arg in re.finditer(regex, self.filter): self.args_to_probe.add(arg.group(1)) if any(map(lambda expr: "$latency" in expr, self.exprs)) or \ "$latency" in self.filter: self.args_to_probe.add("__latency") self.param_types["__latency"] = "u64" # nanoseconds for pname in self.args_to_probe: if pname not in self.param_types: raise ValueError("$entry(%s): no such param" % arg) self.hashname_prefix = "%s_param_" % self.probe_hash_name text = "" for pname in self.args_to_probe: # Each argument is stored in a separate hash that is # keyed by pid. text += "BPF_HASH(%s, u32, %s);\n" % \ (self.hashname_prefix + pname, self.param_types[pname]) text += self._generate_entry() return text def _generate_retprobe_prefix(self): # After we're done here, there are __%s_val variables for each # argument we needed to probe using $entry(name), and they all # have values (which isn't necessarily the case if we missed # the method entry probe). text = "" self.param_val_names = {} for pname in self.args_to_probe: val_name = "__%s_val" % pname text += "%s *%s = %s.lookup(&__pid);\n" % \ (self.param_types[pname], val_name, self.hashname_prefix + pname) text += "if (%s == 0) { return 0 ; }\n" % val_name self.param_val_names[pname] = val_name return text def _replace_entry_exprs(self): for pname, vname in self.param_val_names.items(): if pname == "__latency": entry_expr = "$latency" val_expr = "(bpf_ktime_get_ns() - *%s)" % vname else: entry_expr = "$entry(%s)" % pname val_expr = "(*%s)" % vname for i in range(0, len(self.exprs)): self.exprs[i] = self.exprs[i].replace( entry_expr, val_expr) self.filter = self.filter.replace(entry_expr, val_expr) def _attach_entry_probe(self): if self.is_user: self.bpf.attach_uprobe(name=self.library, sym=self.function, fn_name=self.entry_probe_func, pid=self.pid or -1) else: self.bpf.attach_kprobe(event=self.function, fn_name=self.entry_probe_func) def _bail(self, error): raise ValueError("error parsing probe '%s': %s" % (self.raw_spec, error)) def _validate_specifier(self): # Everything after '#' is the probe label, ignore it spec = self.raw_spec.split('#')[0] parts = spec.strip().split(':') if len(parts) < 3: self._bail("at least the probe type, library, and " + "function signature must be specified") if len(parts) > 6: self._bail("extraneous ':'-separated parts detected") if parts[0] not in ["r", "p", "t", "u"]: self._bail("probe type must be 'p', 'r', 't', or 'u'" + " but got '%s'" % parts[0]) if re.match(r"\S+\(.*\)", parts[2]) is None: self._bail(("function signature '%s' has an invalid " + "format") % parts[2]) def _parse_expr_types(self, expr_types): if len(expr_types) == 0: self._bail("no expr types specified") self.expr_types = expr_types.split(',') def _parse_exprs(self, exprs): if len(exprs) == 0: self._bail("no exprs specified") self.exprs = exprs.split(',') def _make_valid_identifier(self, ident): return re.sub(r'[^A-Za-z0-9_]', '_', ident) def __init__(self, tool, type, specifier): self.usdt_ctx = None self.streq_functions = "" self.pid = tool.args.pid self.cumulative = tool.args.cumulative or False self.raw_spec = specifier self.probe_user_list = set() self.bin_cmp = False self._validate_specifier() spec_and_label = specifier.split('#') self.label = spec_and_label[1] \ if len(spec_and_label) == 2 else None parts = spec_and_label[0].strip().split(':') self.type = type # hist or freq self.probe_type = parts[0] fparts = parts[2].split('(') self.function = fparts[0].strip() if self.probe_type == "t": self.library = "" # kernel self.tp_category = parts[1] self.tp_event = self.function elif self.probe_type == "u": self.library = parts[1] self.probe_func_name = self._make_valid_identifier( "%s_probe%d" % (self.function, Probe.next_probe_index)) self._enable_usdt_probe() else: self.library = parts[1] self.is_user = len(self.library) > 0 self.signature = fparts[1].strip()[:-1] self._parse_signature() # If the user didn't specify an expression to probe, we probe # the retval in a ret probe, or simply the value "1" otherwise. self.is_default_expr = len(parts) < 5 if not self.is_default_expr: self._parse_expr_types(parts[3]) self._parse_exprs(parts[4]) if len(self.exprs) != len(self.expr_types): self._bail("mismatched # of exprs and types") if self.type == "hist" and len(self.expr_types) > 1: self._bail("histograms can only have 1 expr") else: if not self.probe_type == "r" and self.type == "hist": self._bail("histograms must have expr") self.expr_types = \ ["u64" if not self.probe_type == "r" else "int"] self.exprs = \ ["1" if not self.probe_type == "r" else "$retval"] self.filter = "" if len(parts) != 6 else parts[5] self._substitute_exprs() # Do we need to attach an entry probe so that we can collect an # argument that is required for an exit (return) probe? def check(expr): keywords = ["$entry", "$latency"] return any(map(lambda kw: kw in expr, keywords)) self.entry_probe_required = self.probe_type == "r" and \ (any(map(check, self.exprs)) or check(self.filter)) self.probe_func_name = self._make_valid_identifier( "%s_probe%d" % (self.function, Probe.next_probe_index)) self.probe_hash_name = self._make_valid_identifier( "%s_hash%d" % (self.function, Probe.next_probe_index)) Probe.next_probe_index += 1 def _enable_usdt_probe(self): self.usdt_ctx = USDT(path=self.library, pid=self.pid) self.usdt_ctx.enable_probe( self.function, self.probe_func_name) def _substitute_exprs(self): def repl(expr): expr = self._substitute_aliases(expr) rdict = StrcmpRewrite.rewrite_expr(expr, self.bin_cmp, self.library, self.probe_user_list, self.streq_functions, Probe.streq_index) expr = rdict["expr"] self.streq_functions = rdict["streq_functions"] Probe.streq_index = rdict["probeid"] return expr.replace("$retval", "PT_REGS_RC(ctx)") for i in range(0, len(self.exprs)): self.exprs[i] = repl(self.exprs[i]) self.filter = repl(self.filter) def _is_string(self, expr_type): return expr_type == "char*" or expr_type == "char *" def _generate_hash_field(self, i): if self._is_string(self.expr_types[i]): return "struct __string_t v%d;\n" % i else: return "%s v%d;\n" % (self.expr_types[i], i) def _generate_usdt_arg_assignment(self, i): expr = self.exprs[i] if self.probe_type == "u" and expr[0:3] == "arg": arg_index = int(expr[3]) arg_ctype = self.usdt_ctx.get_probe_arg_ctype( self.function, arg_index - 1) return (" %s %s = 0;\n" + " bpf_usdt_readarg(%s, ctx, &%s);\n") \ % (arg_ctype, expr, expr[3], expr) else: return "" def _generate_field_assignment(self, i): text = self._generate_usdt_arg_assignment(i) if self._is_string(self.expr_types[i]): if self.is_user or \ self.exprs[i] in self.probe_user_list: probe_readfunc = "bpf_probe_read_user" else: probe_readfunc = "bpf_probe_read_kernel" return (text + " %s(&__key.v%d.s," + " sizeof(__key.v%d.s), (void *)%s);\n") % \ (probe_readfunc, i, i, self.exprs[i]) else: return text + " __key.v%d = %s;\n" % \ (i, self.exprs[i]) def _generate_hash_decl(self): if self.type == "hist": return "BPF_HISTOGRAM(%s, %s);" % \ (self.probe_hash_name, self.expr_types[0]) else: text = "struct %s_key_t {\n" % self.probe_hash_name for i in range(0, len(self.expr_types)): text += self._generate_hash_field(i) text += "};\n" text += "BPF_HASH(%s, struct %s_key_t, u64);\n" % \ (self.probe_hash_name, self.probe_hash_name) return text def _generate_key_assignment(self): if self.type == "hist": return self._generate_usdt_arg_assignment(0) + \ ("%s __key = %s;\n" % (self.expr_types[0], self.exprs[0])) else: text = "struct %s_key_t __key = {};\n" % \ self.probe_hash_name for i in range(0, len(self.exprs)): text += self._generate_field_assignment(i) return text def _generate_hash_update(self): if self.type == "hist": return "%s.increment(bpf_log2l(__key));" % \ self.probe_hash_name else: return "%s.increment(__key);" % self.probe_hash_name def _generate_pid_filter(self): # Kernel probes need to explicitly filter pid, because the # attach interface doesn't support pid filtering if self.pid is not None and not self.is_user: return "if (__tgid != %d) { return 0; }" % self.pid else: return "" def generate_text(self): program = "" probe_text = """ DATA_DECL """ + ( "TRACEPOINT_PROBE(%s, %s)" % (self.tp_category, self.tp_event) if self.probe_type == "t" else "int PROBENAME(struct pt_regs *ctx SIGNATURE)") + """ { u64 __pid_tgid = bpf_get_current_pid_tgid(); u32 __pid = __pid_tgid; // lower 32 bits u32 __tgid = __pid_tgid >> 32; // upper 32 bits PID_FILTER PREFIX KEY_EXPR if (!(FILTER)) return 0; COLLECT return 0; } """ prefix = "" signature = "" # If any entry arguments are probed in a ret probe, we need # to generate an entry probe to collect them if self.entry_probe_required: program += self._generate_entry_probe() prefix += self._generate_retprobe_prefix() # Replace $entry(paramname) with a reference to the # value we collected when entering the function: self._replace_entry_exprs() if self.probe_type == "p" and len(self.signature) > 0: # Only entry uprobes/kprobes can have user-specified # signatures. Other probes force it to (). signature = ", " + self.signature program += probe_text.replace("PROBENAME", self.probe_func_name) program = program.replace("SIGNATURE", signature) program = program.replace("PID_FILTER", self._generate_pid_filter()) decl = self._generate_hash_decl() key_expr = self._generate_key_assignment() collect = self._generate_hash_update() program = program.replace("DATA_DECL", decl) program = program.replace("KEY_EXPR", key_expr) program = program.replace("FILTER", "1" if len(self.filter) == 0 else self.filter) program = program.replace("COLLECT", collect) program = program.replace("PREFIX", prefix) return self.streq_functions + program def _attach_u(self): libpath = BPF.find_library(self.library) if libpath is None: libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: self._bail("unable to find library %s" % self.library) if self.probe_type == "r": self.bpf.attach_uretprobe(name=libpath, sym=self.function, fn_name=self.probe_func_name, pid=self.pid or -1) else: self.bpf.attach_uprobe(name=libpath, sym=self.function, fn_name=self.probe_func_name, pid=self.pid or -1) def _attach_k(self): if self.probe_type == "t": pass # Nothing to do for tracepoints elif self.probe_type == "r": self.bpf.attach_kretprobe(event=self.function, fn_name=self.probe_func_name) else: self.bpf.attach_kprobe(event=self.function, fn_name=self.probe_func_name) def attach(self, bpf): self.bpf = bpf if self.probe_type == "u": return if self.is_user: self._attach_u() else: self._attach_k() if self.entry_probe_required: self._attach_entry_probe() def _v2s(self, v): # Most fields can be converted with plain str(), but strings # are wrapped in a __string_t which has an .s field if "__string_t" in type(v).__name__: return str(v.s) return str(v) def _display_expr(self, i): # Replace ugly latency calculation with $latency expr = self.exprs[i].replace( "(bpf_ktime_get_ns() - *____latency_val)", "$latency") # Replace alias values back with the alias name for alias, subst in Probe.aliases.items(): expr = expr.replace(subst, alias) # Replace retval expression with $retval expr = expr.replace("PT_REGS_RC(ctx)", "$retval") # Replace ugly (*__param_val) expressions with param name return re.sub(r"\(\*__(\w+)_val\)", r"\1", expr) def _display_key(self, key): if self.is_default_expr: if not self.probe_type == "r": return "total calls" else: return "retval = %s" % str(key.v0) else: # The key object has v0, ..., vk fields containing # the values of the expressions from self.exprs def str_i(i): key_i = self._v2s(getattr(key, "v%d" % i)) return "%s = %s" % \ (self._display_expr(i), key_i) return ", ".join(map(str_i, range(0, len(self.exprs)))) def display(self, top): data = self.bpf.get_table(self.probe_hash_name) if self.type == "freq": print(self.label or self.raw_spec) print("\t%-10s %s" % ("COUNT", "EVENT")) sdata = sorted(data.items(), key=lambda p: p[1].value) if top is not None: sdata = sdata[-top:] for key, value in sdata: # Print some nice values if the user didn't # specify an expression to probe if self.is_default_expr: if not self.probe_type == "r": key_str = "total calls" else: key_str = "retval = %s" % \ self._v2s(key.v0) else: key_str = self._display_key(key) print("\t%-10s %s" % (str(value.value), key_str)) elif self.type == "hist": label = self.label or (self._display_expr(0) if not self.is_default_expr else "retval") data.print_log2_hist(val_type=label) if not self.cumulative: data.clear() def __str__(self): return self.label or self.raw_spec
def _enable_usdt_probe(self): self.usdt_ctx = USDT(path=self.library, pid=self.pid) self.usdt_ctx.enable_probe( self.function, self.probe_func_name)
class Probe(object): def __init__(self, pattern, use_regex=False, pid=None): """Init a new probe. Init the probe from the pattern provided by the user. The supported patterns mimic the 'trace' and 'argdist' tools, but are simpler because we don't have to distinguish between probes and retprobes. func -- probe a kernel function lib:func -- probe a user-space function in the library 'lib' p::func -- same thing as 'func' p:lib:func -- same thing as 'lib:func' t:cat:event -- probe a kernel tracepoint u:lib:probe -- probe a USDT tracepoint """ parts = pattern.split(':') if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] elif len(parts) == 3: if parts[0] == "t": parts = ["t", "", "%s:%s" % tuple(parts[1:])] if parts[0] not in ["p", "t", "u"]: raise Exception("Type must be 'p', 't', or 'u', but got %s" % parts[0]) else: raise Exception("Too many ':'-separated components in pattern %s" % pattern) (self.type, self.library, self.pattern) = parts if not use_regex: self.pattern = self.pattern.replace('*', '.*') self.pattern = '^' + self.pattern + '$' if (self.type == "p" and self.library) or self.type == "u": libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: raise Exception("unable to find library %s" % self.library) self.library = libpath self.pid = pid self.matched = 0 self.trace_functions = {} # map location number to function name def is_kernel_probe(self): return self.type == "t" or (self.type == "p" and self.library == "") def attach(self): if self.type == "p" and not self.library: for index, function in self.trace_functions.items(): self.bpf.attach_kprobe(event=function, fn_name="trace_count_%d" % index, pid=self.pid or -1) elif self.type == "p" and self.library: for index, function in self.trace_functions.items(): self.bpf.attach_uprobe(name=self.library, sym=function, fn_name="trace_count_%d" % index, pid=self.pid or -1) elif self.type == "t": for index, function in self.trace_functions.items(): self.bpf.attach_tracepoint(tp=function, fn_name="trace_count_%d" % index, pid=self.pid or -1) elif self.type == "u": pass # Nothing to do -- attach already happened in `load` if self.matched == 0: raise Exception("No functions matched by pattern %s" % self.pattern) def _add_function(self, template, probe_name): new_func = "trace_count_%d" % self.matched text = template.replace("PROBE_FUNCTION", new_func) text = text.replace("LOCATION", str(self.matched)) self.trace_functions[self.matched] = probe_name self.matched += 1 return text def _generate_functions(self, template): self.usdt = None text = "" if self.type == "p" and not self.library: functions = BPF.get_kprobe_functions(self.pattern) verify_limit(len(functions)) for function in functions: text += self._add_function(template, function) elif self.type == "p" and self.library: # uprobes are tricky because the same function may have multiple # addresses, and the same address may be mapped to multiple # functions. We aren't allowed to create more than one uprobe # per address, so track unique addresses and ignore functions that # map to an address that we've already seen. Also ignore functions # that may repeat multiple times with different addresses. addresses, functions = (set(), set()) functions_and_addresses = BPF.get_user_functions_and_addresses( self.library, self.pattern) verify_limit(len(functions_and_addresses)) for function, address in functions_and_addresses: if address in addresses or function in functions: continue addresses.add(address) functions.add(function) text += self._add_function(template, function) elif self.type == "t": tracepoints = BPF.get_tracepoints(self.pattern) verify_limit(len(tracepoints)) for tracepoint in tracepoints: text += self._add_function(template, tracepoint) elif self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) matches = [] for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): matches.append(probe.name) verify_limit(len(matches)) for match in matches: new_func = "trace_count_%d" % self.matched text += self._add_function(template, match) self.usdt.enable_probe(match, new_func) if debug: print(self.usdt.get_text()) return text def load(self): trace_count_text = """ int PROBE_FUNCTION(void *ctx) { FILTER int loc = LOCATION; u64 *val = counts.lookup(&loc); if (!val) { return 0; // Should never happen, # of locations is known } (*val)++; return 0; } """ bpf_text = """#include <uapi/linux/ptrace.h> BPF_TABLE("array", int, u64, counts, NUMLOCATIONS); """ # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.pid: trace_count_text = trace_count_text.replace( 'FILTER', """u32 pid = bpf_get_current_pid_tgid() >> 32; if (pid != %d) { return 0; }""" % self.pid) else: trace_count_text = trace_count_text.replace('FILTER', '') bpf_text += self._generate_functions(trace_count_text) bpf_text = bpf_text.replace("NUMLOCATIONS", str(len(self.trace_functions))) if debug: print(bpf_text) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else []) self.clear() # Initialize all array items to zero def counts(self): return self.bpf["counts"] def clear(self): counts = self.bpf["counts"] for location, _ in list(self.trace_functions.items()): counts[counts.Key(location)] = counts.Leaf()
def load(self): ctx_name = "ctx" stack_trace = "" if self.user_stack: stack_trace += """ key.user_stack_id = stack_traces.get_stackid( %s, BPF_F_USER_STACK );""" % (ctx_name) else: stack_trace += "key.user_stack_id = -1;" if self.kernel_stack: stack_trace += """ key.kernel_stack_id = stack_traces.get_stackid( %s, 0 );""" % (ctx_name) else: stack_trace += "key.kernel_stack_id = -1;" trace_count_text = """ int trace_count(void *ctx) { FILTER struct key_t key = {}; key.tgid = GET_TGID; STORE_COMM %s counts.increment(key); return 0; } """ trace_count_text = trace_count_text % (stack_trace) bpf_text = """ #include <uapi/linux/ptrace.h> #include <linux/sched.h> // struct key_t { // no pid (thread ID) so that we do not needlessly split this key u32 tgid; int kernel_stack_id; int user_stack_id; char name[TASK_COMM_LEN]; }; BPF_HASH(counts, struct key_t); BPF_STACK_TRACE(stack_traces, 1024); """ # filter text init filter_text = [] # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.is_kernel_probe() and self.pid: # pid filter filter_text.append( 'u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' + 'if (pid != %d) { return 0; }' % self.pid) if self.is_kernel_probe() and self.cpu: # cpu filter filter_text.append( 'struct task_struct *task; task = (struct task_struct*)bpf_get_current_task(); ' + 'if (task->cpu != %d) { return 0; }' % self.cpu) # trace_count_text = trace_count_text.replace('FILTER', '\n '.join(filter_text)) # Do per-pid statistics iff -P is provided if self.per_pid: # replace trace_count_text = trace_count_text.replace( 'GET_TGID', 'bpf_get_current_pid_tgid() >> 32') trace_count_text = trace_count_text.replace( 'STORE_COMM', 'bpf_get_current_comm(&key.name, sizeof(key.name));') else: # skip splitting on PID so these aggregate # together, and don't store the process name. trace_count_text = trace_count_text.replace( 'GET_TGID', '0xffffffff') trace_count_text = trace_count_text.replace('STORE_COMM', '') self.usdt = None # usdt if self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) # iterator all usdt of library above for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue # match pattern if re.match(self.pattern, probe.name): # This hack is required because the bpf_usdt_readarg # functions generated need different function names for # each attached probe. If we just stick to trace_count, # we'd get multiple bpf_usdt_readarg helpers with the same # name when enabling more than one USDT probe. new_func = "trace_count_%d" % self.matched bpf_text += trace_count_text.replace( "trace_count", new_func) self.usdt.enable_probe(probe.name, new_func) self.matched += 1 if debug: print(self.usdt.get_text()) else: bpf_text += trace_count_text if debug: print(bpf_text) # BPF self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else [])
def test_attach1(self): # enable USDT probe from given PID and verifier generated BPF programs u = USDT(pid=int(self.app.pid)) u.enable_probe(probe="probe_point_1", fn_name="do_trace1") u.enable_probe(probe="probe_point_2", fn_name="do_trace2") u.enable_probe(probe="probe_point_3", fn_name="do_trace3") u.enable_probe(probe="probe_point_4", fn_name="do_trace4") u.enable_probe(probe="probe_point_5", fn_name="do_trace5") b = BPF(text=self.bpf_text, usdt_contexts=[u], debug=4) # Event states for each event: # 0 - probe not caught, 1 - probe caught with correct value, # 2 - probe caught with incorrect value self.evt_st_1 = 0 self.evt_st_2 = 0 self.evt_st_3 = 0 # define output data structure in Python class Data1(ct.Structure): _fields_ = [("v1", ct.c_char), ("v2", ct.c_int)] class Data2(ct.Structure): _fields_ = [("v1", ct.c_int), ("v2", ct.c_char)] class Data3(ct.Structure): _fields_ = [("v1", ct.c_int), ("v2", ct.c_int)] class Data4(ct.Structure): _fields_ = [("v1", ct.c_ulonglong), ("v2", ct.c_char * 64)] class Data5(ct.Structure): _fields_ = [("v1", ct.c_char * 64), ("v2", ct.c_ulonglong)] def check_event_val(event, event_state, v1, v2, v3, v4): if ((event.v1 == v1 and event.v2 == v2) or (event.v1 == v3 and event.v2 == v4)): if (event_state == 0 or event_state == 1): return 1 return 2 def print_event1(cpu, data, size): event = ct.cast(data, ct.POINTER(Data1)).contents self.evt_st_1 = check_event_val(event, self.evt_st_1, b'\x0d', 40, b'\x08', 200) def print_event2(cpu, data, size): event = ct.cast(data, ct.POINTER(Data2)).contents # pretend we have two identical probe points to simplify the code self.evt_st_2 = check_event_val(event, self.evt_st_2, 5, b'\x04', 5, b'\x04') def print_event3(cpu, data, size): event = ct.cast(data, ct.POINTER(Data3)).contents self.evt_st_3 = check_event_val(event, self.evt_st_3, 200, 40, 8, 13) def print_event4(cpu, data, size): event = ct.cast(data, ct.POINTER(Data4)).contents print("%s" % event.v2) def print_event5(cpu, data, size): event = ct.cast(data, ct.POINTER(Data5)).contents print("%s" % event.v1) # loop with callback to print_event b["event1"].open_perf_buffer(print_event1) b["event2"].open_perf_buffer(print_event2) b["event3"].open_perf_buffer(print_event3) b["event4"].open_perf_buffer(print_event4) b["event5"].open_perf_buffer(print_event5) # three iterations to make sure we get some probes and have time to process them for i in range(3): b.perf_buffer_poll() self.assertTrue(self.evt_st_1 == 1 and self.evt_st_2 == 1 and self.evt_st_3 == 1)
info->num_calls += 1; info->total_ns += bpf_ktime_get_ns() - e->timestamp; sysentry.delete(&pid); return 0; } #endif // LATENCY #endif // SYSCALLS """.replace("READ_CLASS", read_class) \ .replace("READ_METHOD", read_method) \ .replace("PID_FILTER", "if ((pid >> 32) != %d) { return 0; }" % args.pid) \ .replace("DEFINE_NOLANG", "#define NOLANG" if not language else "") \ .replace("DEFINE_LATENCY", "#define LATENCY" if args.latency else "") \ .replace("DEFINE_SYSCALLS", "#define SYSCALLS" if args.syscalls else "") if language: usdt = USDT(pid=args.pid) usdt.enable_probe_or_bail(entry_probe, "trace_entry") if args.latency: usdt.enable_probe_or_bail(return_probe, "trace_return") else: usdt = None if args.ebpf or args.verbose: if args.verbose and usdt: print(usdt.get_text()) print(program) if args.ebpf: exit() bpf = BPF(text=program, usdt_contexts=[usdt] if usdt else []) if args.syscalls:
def load(self): ctx_name = "ctx" stack_trace = "" if self.user_stack: stack_trace += """ key.user_stack_id = stack_traces.get_stackid( %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK );""" % (ctx_name) else: stack_trace += "key.user_stack_id = -1;" if self.kernel_stack: stack_trace += """ key.kernel_stack_id = stack_traces.get_stackid( %s, BPF_F_REUSE_STACKID );""" % (ctx_name) else: stack_trace += "key.kernel_stack_id = -1;" trace_count_text = """ int trace_count(void *ctx) { FILTER struct key_t key = {}; key.tgid = GET_TGID; STORE_COMM %s counts.increment(key); return 0; } """ trace_count_text = trace_count_text % (stack_trace) bpf_text = """#include <uapi/linux/ptrace.h> #include <linux/sched.h> struct key_t { // no pid (thread ID) so that we do not needlessly split this key u32 tgid; int kernel_stack_id; int user_stack_id; char name[TASK_COMM_LEN]; }; BPF_HASH(counts, struct key_t); BPF_STACK_TRACE(stack_traces, 1024); """ # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.is_kernel_probe() and self.pid: trace_count_text = trace_count_text.replace( 'FILTER', ('u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' + 'if (pid != %d) { return 0; }') % (self.pid)) else: trace_count_text = trace_count_text.replace('FILTER', '') # We need per-pid statistics when tracing a user-space process, because # the meaning of the symbols depends on the pid. We also need them if # per-pid statistics were requested with -P, or for user stacks. if self.per_pid or not self.is_kernel_probe() or self.user_stack: trace_count_text = trace_count_text.replace( 'GET_TGID', 'bpf_get_current_pid_tgid() >> 32') trace_count_text = trace_count_text.replace( 'STORE_COMM', 'bpf_get_current_comm(&key.name, sizeof(key.name));') else: # kernel stacks only. skip splitting on PID so these aggregate # together, and don't store the process name. trace_count_text = trace_count_text.replace( 'GET_TGID', '0xffffffff') trace_count_text = trace_count_text.replace('STORE_COMM', '') self.usdt = None if self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): # This hack is required because the bpf_usdt_readarg # functions generated need different function names for # each attached probe. If we just stick to trace_count, # we'd get multiple bpf_usdt_readarg helpers with the same # name when enabling more than one USDT probe. new_func = "trace_count_%d" % self.matched bpf_text += trace_count_text.replace( "trace_count", new_func) self.usdt.enable_probe(probe.name, new_func) self.matched += 1 if debug: print(self.usdt.get_text()) else: bpf_text += trace_count_text if debug: print(bpf_text) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else [])
formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) parser.add_argument("-l", "--language", choices=languages, help="language to trace") parser.add_argument("pid", type=int, help="process id to attach to") parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode: print the BPF program (for debugging purposes)") parser.add_argument("-m", "--milliseconds", action="store_true", help="report times in milliseconds (default is microseconds)") parser.add_argument("-M", "--minimum", type=int, default=0, help="display only GCs longer than this many milliseconds") parser.add_argument("-F", "--filter", type=str, help="display only GCs whose description contains this text") args = parser.parse_args() usdt = USDT(pid=args.pid) program = """ struct gc_event_t { u64 probe_index; u64 elapsed_ns; u64 field1; u64 field2; u64 field3; u64 field4; char string1[32]; char string2[32]; }; struct entry_t { u64 start_ns; u64 field1;
class Probe(object): def __init__(self, pattern, kernel_stack, user_stack, use_regex=False, pid=None, per_pid=False): """Init a new probe. Init the probe from the pattern provided by the user. The supported patterns mimic the 'trace' and 'argdist' tools, but are simpler because we don't have to distinguish between probes and retprobes. func -- probe a kernel function lib:func -- probe a user-space function in the library 'lib' p::func -- same thing as 'func' p:lib:func -- same thing as 'lib:func' t:cat:event -- probe a kernel tracepoint u:lib:probe -- probe a USDT tracepoint """ self.kernel_stack = kernel_stack self.user_stack = user_stack parts = pattern.split(':') if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] elif len(parts) == 3: if parts[0] == "t": parts = ["t", "", "%s:%s" % tuple(parts[1:])] if parts[0] not in ["p", "t", "u"]: raise Exception("Type must be 'p', 't', or 'u', but got %s" % parts[0]) else: raise Exception("Too many ':'-separated components in pattern %s" % pattern) (self.type, self.library, self.pattern) = parts if not use_regex: self.pattern = self.pattern.replace('*', '.*') self.pattern = '^' + self.pattern + '$' if (self.type == "p" and self.library) or self.type == "u": libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: raise Exception("unable to find library %s" % self.library) self.library = libpath self.pid = pid self.per_pid = per_pid self.matched = 0 def is_kernel_probe(self): return self.type == "t" or (self.type == "p" and self.library == "") def attach(self): if self.type == "p": if self.library: self.bpf.attach_uprobe(name=self.library, sym_re=self.pattern, fn_name="trace_count", pid=self.pid or -1) self.matched = self.bpf.num_open_uprobes() else: self.bpf.attach_kprobe(event_re=self.pattern, fn_name="trace_count") self.matched = self.bpf.num_open_kprobes() elif self.type == "t": self.bpf.attach_tracepoint(tp_re=self.pattern, fn_name="trace_count") self.matched = self.bpf.num_open_tracepoints() elif self.type == "u": pass # Nothing to do -- attach already happened in `load` if self.matched == 0: raise Exception("No functions matched by pattern %s" % self.pattern) def load(self): ctx_name = "ctx" stack_trace = "" if self.user_stack: stack_trace += """ key.user_stack_id = stack_traces.get_stackid( %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK );""" % (ctx_name) else: stack_trace += "key.user_stack_id = -1;" if self.kernel_stack: stack_trace += """ key.kernel_stack_id = stack_traces.get_stackid( %s, BPF_F_REUSE_STACKID );""" % (ctx_name) else: stack_trace += "key.kernel_stack_id = -1;" trace_count_text = """ int trace_count(void *ctx) { FILTER struct key_t key = {}; key.tgid = GET_TGID; STORE_COMM %s counts.increment(key); return 0; } """ trace_count_text = trace_count_text % (stack_trace) bpf_text = """#include <uapi/linux/ptrace.h> #include <linux/sched.h> struct key_t { // no pid (thread ID) so that we do not needlessly split this key u32 tgid; int kernel_stack_id; int user_stack_id; char name[TASK_COMM_LEN]; }; BPF_HASH(counts, struct key_t); BPF_STACK_TRACE(stack_traces, 1024); """ # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.is_kernel_probe() and self.pid: trace_count_text = trace_count_text.replace( 'FILTER', ('u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' + 'if (pid != %d) { return 0; }') % (self.pid)) else: trace_count_text = trace_count_text.replace('FILTER', '') # We need per-pid statistics when tracing a user-space process, because # the meaning of the symbols depends on the pid. We also need them if # per-pid statistics were requested with -P, or for user stacks. if self.per_pid or not self.is_kernel_probe() or self.user_stack: trace_count_text = trace_count_text.replace( 'GET_TGID', 'bpf_get_current_pid_tgid() >> 32') trace_count_text = trace_count_text.replace( 'STORE_COMM', 'bpf_get_current_comm(&key.name, sizeof(key.name));') else: # kernel stacks only. skip splitting on PID so these aggregate # together, and don't store the process name. trace_count_text = trace_count_text.replace( 'GET_TGID', '0xffffffff') trace_count_text = trace_count_text.replace('STORE_COMM', '') self.usdt = None if self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): # This hack is required because the bpf_usdt_readarg # functions generated need different function names for # each attached probe. If we just stick to trace_count, # we'd get multiple bpf_usdt_readarg helpers with the same # name when enabling more than one USDT probe. new_func = "trace_count_%d" % self.matched bpf_text += trace_count_text.replace( "trace_count", new_func) self.usdt.enable_probe(probe.name, new_func) self.matched += 1 if debug: print(self.usdt.get_text()) else: bpf_text += trace_count_text if debug: print(bpf_text) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else [])
def test_uncache(self): """ Tests the utxocache:uncache tracepoint API. https://github.com/bitcoin/bitcoin/blob/master/doc/tracing.md#tracepoint-utxocacheuncache """ # To trigger an UTXO uncache from the cache, we create an invalid transaction # spending a not-cached, but existing UTXO. During transaction validation, this # the UTXO is added to the utxo cache, but as the transaction is invalid, it's # uncached again. self.log.info("testing the utxocache:uncache tracepoint API") # Retrieve the txid for the UTXO created in the first block. This UTXO is not # in our UTXO cache. EARLY_BLOCK_HEIGHT = 1 block_1_hash = self.nodes[0].getblockhash(EARLY_BLOCK_HEIGHT) block_1 = self.nodes[0].getblock(block_1_hash) block_1_coinbase_txid = block_1["tx"][0] # Create a transaction and invalidate it by changing the txid of the previous # output to the coinbase txid of the block at height 1. invalid_tx = self.wallet.create_self_transfer( from_node=self.nodes[0])["tx"] invalid_tx.vin[0].prevout.hash = int(block_1_coinbase_txid, 16) self.log.info("hooking into the utxocache:uncache tracepoint") ctx = USDT(path=str(self.options.bitcoind)) ctx.enable_probe(probe="utxocache:uncache", fn_name="trace_utxocache_uncache") bpf = BPF(text=utxocache_changes_program, usdt_contexts=[ctx], debug=0) # The handle_* function is a ctypes callback function called from C. When # we assert in the handle_* function, the AssertError doesn't propagate # back to Python. The exception is ignored. We manually count and assert # that the handle_* functions succeeded. EXPECTED_HANDLE_UNCACHE_SUCCESS = 1 handle_uncache_succeeds = 0 def handle_utxocache_uncache(_, data, __): nonlocal handle_uncache_succeeds event = ctypes.cast(data, ctypes.POINTER(UTXOCacheChange)).contents self.log.info(f"handle_utxocache_uncache(): {event}") assert_equal(block_1_coinbase_txid, bytes(event.txid[::-1]).hex()) assert_equal(0, event.index) # prevout index assert_equal(EARLY_BLOCK_HEIGHT, event.height) assert_equal(50 * COIN, event.value) assert_equal(True, event.is_coinbase) handle_uncache_succeeds += 1 bpf["utxocache_uncache"].open_perf_buffer(handle_utxocache_uncache) self.log.info( "testmempoolaccept the invalid transaction to trigger an UTXO-cache uncache") result = self.nodes[0].testmempoolaccept( [invalid_tx.serialize().hex()])[0] assert_equal(result["allowed"], False) bpf.perf_buffer_poll(timeout=100) bpf.cleanup() self.log.info( f"check that we successfully traced {EXPECTED_HANDLE_UNCACHE_SUCCESS} uncaches") assert_equal(EXPECTED_HANDLE_UNCACHE_SUCCESS, handle_uncache_succeeds)
class Probe(object): def __init__(self, pattern, kernel_stack, user_stack, use_regex=False, pid=None, per_pid=False): """Init a new probe. Init the probe from the pattern provided by the user. The supported patterns mimic the 'trace' and 'argdist' tools, but are simpler because we don't have to distinguish between probes and retprobes. func -- probe a kernel function lib:func -- probe a user-space function in the library 'lib' p::func -- same thing as 'func' p:lib:func -- same thing as 'lib:func' t:cat:event -- probe a kernel tracepoint u:lib:probe -- probe a USDT tracepoint """ self.kernel_stack = kernel_stack self.user_stack = user_stack parts = pattern.split(':') if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] elif len(parts) == 3: if parts[0] == "t": parts = ["t", "", "%s:%s" % tuple(parts[1:])] if parts[0] not in ["p", "t", "u"]: raise Exception("Type must be 'p', 't', or 'u', but got %s" % parts[0]) else: raise Exception("Too many ':'-separated components in pattern %s" % pattern) (self.type, self.library, self.pattern) = parts if not use_regex: self.pattern = self.pattern.replace('*', '.*') self.pattern = '^' + self.pattern + '$' if (self.type == "p" and self.library) or self.type == "u": libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: raise Exception("unable to find library %s" % self.library) self.library = libpath self.pid = pid self.per_pid = per_pid self.matched = 0 def is_kernel_probe(self): return self.type == "t" or (self.type == "p" and self.library == "") def attach(self): if self.type == "p": if self.library: self.bpf.attach_uprobe(name=self.library, sym_re=self.pattern, fn_name="trace_count", pid=self.pid or -1) self.matched = self.bpf.num_open_uprobes() else: self.bpf.attach_kprobe(event_re=self.pattern, fn_name="trace_count") self.matched = self.bpf.num_open_kprobes() elif self.type == "t": self.bpf.attach_tracepoint(tp_re=self.pattern, fn_name="trace_count") self.matched = self.bpf.num_open_tracepoints() elif self.type == "u": pass # Nothing to do -- attach already happened in `load` if self.matched == 0: raise Exception("No functions matched by pattern %s" % self.pattern) def load(self): ctx_name = "ctx" stack_trace = "" if self.user_stack: stack_trace += """ key.user_stack_id = stack_traces.get_stackid( %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK );""" % (ctx_name) else: stack_trace += "key.user_stack_id = -1;" if self.kernel_stack: stack_trace += """ key.kernel_stack_id = stack_traces.get_stackid( %s, BPF_F_REUSE_STACKID );""" % (ctx_name) else: stack_trace += "key.kernel_stack_id = -1;" trace_count_text = """ int trace_count(void *ctx) { FILTER struct key_t key = {}; key.tgid = GET_TGID; STORE_COMM %s counts.increment(key); return 0; } """ trace_count_text = trace_count_text % (stack_trace) bpf_text = """#include <uapi/linux/ptrace.h> #include <linux/sched.h> struct key_t { // no pid (thread ID) so that we do not needlessly split this key u32 tgid; int kernel_stack_id; int user_stack_id; char name[TASK_COMM_LEN]; }; BPF_HASH(counts, struct key_t); BPF_STACK_TRACE(stack_traces, 1024); """ # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.is_kernel_probe() and self.pid: trace_count_text = trace_count_text.replace('FILTER', ('u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' + 'if (pid != %d) { return 0; }') % (self.pid)) else: trace_count_text = trace_count_text.replace('FILTER', '') # We need per-pid statistics when tracing a user-space process, because # the meaning of the symbols depends on the pid. We also need them if # per-pid statistics were requested with -P, or for user stacks. if self.per_pid or not self.is_kernel_probe() or self.user_stack: trace_count_text = trace_count_text.replace('GET_TGID', 'bpf_get_current_pid_tgid() >> 32') trace_count_text = trace_count_text.replace('STORE_COMM', 'bpf_get_current_comm(&key.name, sizeof(key.name));') else: # kernel stacks only. skip splitting on PID so these aggregate # together, and don't store the process name. trace_count_text = trace_count_text.replace( 'GET_TGID', '0xffffffff') trace_count_text = trace_count_text.replace('STORE_COMM', '') self.usdt = None if self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): # This hack is required because the bpf_usdt_readarg # functions generated need different function names for # each attached probe. If we just stick to trace_count, # we'd get multiple bpf_usdt_readarg helpers with the same # name when enabling more than one USDT probe. new_func = "trace_count_%d" % self.matched bpf_text += trace_count_text.replace( "trace_count", new_func) self.usdt.enable_probe(probe.name, new_func) self.matched += 1 if debug: print(self.usdt.get_text()) else: bpf_text += trace_count_text if debug: print(bpf_text) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else [])
def do_gc_probes(): # load BPF program bpf_text = """ #include <uapi/linux/ptrace.h> #include <linux/sched.h> struct val_t { u32 pid; char comm[TASK_COMM_LEN]; u64 ts; }; struct data_t { u32 pid; u64 ts; u64 delta; char comm[TASK_COMM_LEN]; }; BPF_HASH(start, u32, struct val_t); BPF_PERF_OUTPUT(events); int do_entry(struct pt_regs *ctx) { if (!PT_REGS_PARM1(ctx)) return 0; struct val_t val = {}; u32 pid = bpf_get_current_pid_tgid(); if (bpf_get_current_comm(&val.comm, sizeof(val.comm)) == 0) { val.pid = bpf_get_current_pid_tgid(); val.ts = bpf_ktime_get_ns(); start.update(&pid, &val); } return 0; } int do_return(struct pt_regs *ctx) { struct val_t *valp; struct data_t data = {}; u64 delta; u32 pid = bpf_get_current_pid_tgid(); u64 tsp = bpf_ktime_get_ns(); valp = start.lookup(&pid); if (valp == 0) return 0; // missed start bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm); data.pid = valp->pid; data.delta = tsp - valp->ts; data.ts = tsp / 1000; if (data.delta > 0) events.perf_submit(ctx, &data, sizeof(data)); start.delete(&pid); return 0; } """ u = USDT(pid=args.pid) u.enable_probe(probe="gc__mark__begin", fn_name="do_entry") u.enable_probe(probe="gc__mark__end", fn_name="do_return") b = BPF(text=bpf_text ,usdt=u) # b.attach_uprobe(name="ruby", sym="gc_start_internal", fn_name="do_entry", pid=args.pid) # b.attach_uretprobe(name="ruby", sym="gc_start_internal", fn_name="do_return", pid=args.pid) print("%-9s %-6s %-16s %10s" % ("TIME", "PID", "COMM", "LATms")) b["events"].open_perf_buffer(print_gc_event) try: while 1: b.kprobe_poll() except: pass
if (!active) return 0; u64 *io_ns = io.lookup(&pid); if (io_ns) bpf_trace_printk("queryend: pid:%u, io time: %ul (ns)\\n", pid, *io_ns); else bpf_trace_printk("queryend: pid:%u, no io time\\n", pid); tracing.delete(&pid); io.delete(&pid); return 0; } """ u = USDT(path=args.bin_path) u.enable_probe(probe="query__start", fn_name="querystart") u.enable_probe(probe="query__done", fn_name="queryend") replacements = {} replacements["PLANID_FILTER"] = "" if args.planid is not None: replacements["PLANID_FILTER"] = \ "if (%s != %sULL) return 0;" % ("arg2", args.planid) for k, v in replacements.iteritems(): text=text.replace(k, v) debug = DEBUG_PREPROCESSOR|DEBUG_BPF if args.debug > 1 else 0
uint64_t addr; char query[128]; /* * Read the first argument from the query-start probe, which is the query. * The format of this probe is: * query-start(query, connectionid, database, user, host) * see: https://dev.mysql.com/doc/refman/5.7/en/dba-dtrace-ref-query.html */ bpf_usdt_readarg(1, ctx, &addr); bpf_trace_printk("%s\\n", addr); return 0; }; """ # enable USDT probe from given PID u = USDT(pid=int(pid)) u.enable_probe(probe="query__start", fn_name="do_trace") if debug: print(u.get_text()) print(bpf_text) # initialize BPF b = BPF(text=bpf_text, usdt_contexts=[u]) # header print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "QUERY")) # format output while 1: try: (task, pid, cpu, flags, ts, msg) = b.trace_fields()
def test_attach1(self): # Enable USDT probe from given PID and verifier generated BPF programs. u = USDT(pid=int(self.app.pid)) u.enable_probe(probe="probe_point_1", fn_name="do_trace1") u.enable_probe(probe="probe_point_2", fn_name="do_trace2") u2 = USDT(pid=int(self.app2.pid)) u2.enable_probe(probe="probe_point_2", fn_name="do_trace2") u2.enable_probe(probe="probe_point_3", fn_name="do_trace3") self.bpf_text = self.bpf_text.replace("FILTER", "pid == %d" % self.app.pid) b = BPF(text=self.bpf_text, usdt_contexts=[u, u2]) # Event states for each event: # 0 - probe not caught, 1 - probe caught with correct value, # 2 - probe caught with incorrect value self.evt_st_1 = 0 self.evt_st_2 = 0 self.evt_st_3 = 0 self.evt_st_4 = 0 self.evt_st_5 = 0 self.evt_st_6 = 0 def check_event_val(data, event_state, expected_val): result = ct.cast(data, ct.POINTER(ct.c_int)).contents if result.value == expected_val: if (event_state == 0 or event_state == 1): return 1 return 2 def print_event1(cpu, data, size): self.evt_st_1 = check_event_val(data, self.evt_st_1, 1) def print_event2(cpu, data, size): self.evt_st_2 = check_event_val(data, self.evt_st_2, 2) def print_event3(cpu, data, size): self.evt_st_3 = check_event_val(data, self.evt_st_3, 3) def print_event4(cpu, data, size): self.evt_st_4 = check_event_val(data, self.evt_st_4, 11) def print_event5(cpu, data, size): self.evt_st_5 = check_event_val(data, self.evt_st_5, 12) def print_event6(cpu, data, size): self.evt_st_6 = check_event_val(data, self.evt_st_6, 13) # loop with callback to print_event b["event1"].open_perf_buffer(print_event1) b["event2"].open_perf_buffer(print_event2) b["event3"].open_perf_buffer(print_event3) b["event4"].open_perf_buffer(print_event4) b["event5"].open_perf_buffer(print_event5) b["event6"].open_perf_buffer(print_event6) # three iterations to make sure we get some probes and have time to process them for i in range(5): b.perf_buffer_poll() # note that event1 and event4 do not really fire, so their state should be 0 # use separate asserts so that if test fails we know which one is the culprit self.assertTrue(self.evt_st_1 == 1) self.assertTrue(self.evt_st_2 == 1) self.assertTrue(self.evt_st_3 == 0) self.assertTrue(self.evt_st_4 == 0) self.assertTrue(self.evt_st_5 == 1) self.assertTrue(self.evt_st_6 == 1)
def load(self): trace_count_text = """ int trace_count(void *ctx) { FILTER struct key_t key = {}; key.pid = GET_PID; key.stackid = stack_traces.get_stackid(ctx, STACK_FLAGS); u64 zero = 0; u64 *val = counts.lookup_or_init(&key, &zero); (*val)++; return 0; } """ bpf_text = """#include <uapi/linux/ptrace.h> struct key_t { u32 pid; int stackid; }; BPF_HASH(counts, struct key_t); BPF_STACK_TRACE(stack_traces, 1024); """ # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.is_kernel_probe() and self.pid: trace_count_text = trace_count_text.replace('FILTER', ('u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' + 'if (pid != %d) { return 0; }') % (self.pid)) else: trace_count_text = trace_count_text.replace('FILTER', '') # We need per-pid statistics when tracing a user-space process, because # the meaning of the symbols depends on the pid. We also need them if # per-pid statistics were requested with -P. if self.per_pid or not self.is_kernel_probe(): trace_count_text = trace_count_text.replace('GET_PID', 'bpf_get_current_pid_tgid() >> 32') else: trace_count_text = trace_count_text.replace( 'GET_PID', '0xffffffff') stack_flags = 'BPF_F_REUSE_STACKID' if not self.is_kernel_probe(): stack_flags += '| BPF_F_USER_STACK' # can't do both U *and* K trace_count_text = trace_count_text.replace('STACK_FLAGS', stack_flags) self.usdt = None if self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): # This hack is required because the bpf_usdt_readarg # functions generated need different function names for # each attached probe. If we just stick to trace_count, # we'd get multiple bpf_usdt_readarg helpers with the same # name when enabling more than one USDT probe. new_func = "trace_count_%d" % self.matched bpf_text += trace_count_text.replace( "trace_count", new_func) self.usdt.enable_probe(probe.name, new_func) self.matched += 1 if debug: print(self.usdt.get_text()) else: bpf_text += trace_count_text if debug: print(bpf_text) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else [])
int endQueryOp(struct pt_regs *ctx) { struct endQueryOutput out = {}; void* summaryPtr = NULL; out.opCtx = NULL; bpf_usdt_readarg(1, ctx, &out.opCtx); if(!out.opCtx) return 0; bpf_usdt_readarg(2, ctx, &summaryPtr); if(!summaryPtr) return 0; bpf_probe_read(&out.summaryStats, sizeof(out.summaryStats), summaryPtr); bpf_usdt_readarg(3, ctx, &out.numResults); endQuery.perf_submit(ctx, &out, sizeof(out)); return 0; } """ probe = USDT(int(sys.argv[1])) probe.enable_probe(probe="findCmdPlan", fn_name="findCmdPlan") toAgg = USDT(int(sys.argv[1])) toAgg.enable_probe(probe="findToAgg", fn_name="findCmdToAgg") # beginQuery = USDT(int(sys.argv[1])) probe.enable_probe(probe="beginQueryOp", fn_name="beginQueryOp") probe.enable_probe(probe="endQueryOp", fn_name="endQueryOp") class BeginQuery(ct.Structure): _fields_ = [("nss", ct.c_char * 50), ("bson", ct.c_byte * 100), ("queryObjSz", ct.c_uint), ("nreturn", ct.c_longlong), ("nskip", ct.c_longlong)]
def test_attach1(self): # Enable USDT probe from given PID and verifier generated BPF programs. u = USDT(pid=int(self.app.pid)) u.enable_probe(probe="probe_point_1", fn_name="do_trace1") u.enable_probe(probe="probe_point_2", fn_name="do_trace2") u2 = USDT(pid=int(self.app2.pid)) u2.enable_probe(probe="probe_point_2", fn_name="do_trace2") u2.enable_probe(probe="probe_point_3", fn_name="do_trace3") self.bpf_text = self.bpf_text.replace("FILTER", "pid == %d" % self.app.pid) b = BPF(text=self.bpf_text, usdt_contexts=[u, u2]) # Event states for each event: # 0 - probe not caught, 1 - probe caught with correct value, # 2 - probe caught with incorrect value self.evt_st_1 = 0 self.evt_st_2 = 0 self.evt_st_3 = 0 self.evt_st_4 = 0 self.evt_st_5 = 0 self.evt_st_6 = 0 def check_event_val(data, event_state, expected_val): result = ct.cast(data, ct.POINTER(ct.c_int)).contents if result.value == expected_val: if (event_state == 0 or event_state == 1): return 1 return 2 def print_event1(cpu, data, size): self.evt_st_1 = check_event_val(data, self.evt_st_1, 1) def print_event2(cpu, data, size): self.evt_st_2 = check_event_val(data, self.evt_st_2, 2) def print_event3(cpu, data, size): self.evt_st_3 = check_event_val(data, self.evt_st_3, 3) def print_event4(cpu, data, size): self.evt_st_4 = check_event_val(data, self.evt_st_4, 11) def print_event5(cpu, data, size): self.evt_st_5 = check_event_val(data, self.evt_st_5, 12) def print_event6(cpu, data, size): self.evt_st_6 = check_event_val(data, self.evt_st_6, 13) # loop with callback to print_event b["event1"].open_perf_buffer(print_event1) b["event2"].open_perf_buffer(print_event2) b["event3"].open_perf_buffer(print_event3) b["event4"].open_perf_buffer(print_event4) b["event5"].open_perf_buffer(print_event5) b["event6"].open_perf_buffer(print_event6) # three iterations to make sure we get some probes and have time to process them for i in range(3): b.perf_buffer_poll() # note that event1 and event4 do not really fire, so their state should be 0 # use separate asserts so that if test fails we know which one is the culprit self.assertTrue(self.evt_st_1 == 1) self.assertTrue(self.evt_st_2 == 1) self.assertTrue(self.evt_st_3 == 0) self.assertTrue(self.evt_st_4 == 0) self.assertTrue(self.evt_st_5 == 1) self.assertTrue(self.evt_st_6 == 1)
#if MALLOC_TRACING u64 size; #else char name[50]; #endif }; struct val_t { u64 total_size; u64 num_allocs; }; BPF_HASH(allocs, struct key_t, struct val_t); """.replace("MALLOC_TRACING", "1" if args.language == "c" else "0") usdt = USDT(pid=args.pid) # # Java # if args.language == "java": program += """ int alloc_entry(struct pt_regs *ctx) { struct key_t key = {}; struct val_t *valp, zero = {}; u64 classptr = 0, size = 0; bpf_usdt_readarg(2, ctx, &classptr); bpf_usdt_readarg(4, ctx, &size); bpf_probe_read(&key.name, sizeof(key.name), (void *)classptr); valp = allocs.lookup_or_init(&key, &zero); valp->total_size += size;
info->num_calls += 1; info->total_ns += bpf_ktime_get_ns() - e->timestamp; sysentry.delete(&pid); return 0; } #endif // LATENCY #endif // SYSCALLS """.replace("READ_CLASS", read_class) \ .replace("READ_METHOD", read_method) \ .replace("PID_FILTER", "if ((pid >> 32) != %d) { return 0; }" % args.pid) \ .replace("DEFINE_NOLANG", "#define NOLANG" if not language else "") \ .replace("DEFINE_LATENCY", "#define LATENCY" if args.latency else "") \ .replace("DEFINE_SYSCALLS", "#define SYSCALLS" if args.syscalls else "") if language: usdt = USDT(pid=args.pid) usdt.enable_probe_or_bail(entry_probe, "trace_entry") if args.latency: usdt.enable_probe_or_bail(return_probe, "trace_return") else: usdt = None if args.verbose: if usdt: print(usdt.get_text()) print(program) bpf = BPF(text=program, usdt_contexts=[usdt] if usdt else []) if args.syscalls: syscall_regex = "^[Ss]y[Ss]_.*" bpf.attach_kprobe(event_re=syscall_regex, fn_name="syscall_entry")
def test_add_spent(self): """ Tests the utxocache:add utxocache:spent tracepoint API See https://github.com/bitcoin/bitcoin/blob/master/doc/tracing.md#tracepoint-utxocacheadd and https://github.com/bitcoin/bitcoin/blob/master/doc/tracing.md#tracepoint-utxocachespent """ self.log.info( "test the utxocache:add and utxocache:spent tracepoint API") self.log.info("create an unconfirmed transaction") self.wallet.send_self_transfer(from_node=self.nodes[0]) # We mine a block to trace changes (add/spent) to the active in-memory cache # of the UTXO set (see CoinsTip() of CCoinsViewCache). However, in some cases # temporary clones of the active cache are made. For example, during mining with # the generate RPC call, the block is first tested in TestBlockValidity(). There, # a clone of the active cache is modified during a test ConnectBlock() call. # These are implementation details we don't want to test here. Thus, after # mining, we invalidate the block, start the tracing, and then trace the cache # changes to the active utxo cache. self.log.info("mine and invalidate a block that is later reconsidered") block_hash = self.generate(self.wallet, 1)[0] self.nodes[0].invalidateblock(block_hash) self.log.info( "hook into the utxocache:add and utxocache:spent tracepoints") ctx = USDT(path=str(self.options.bitcoind)) ctx.enable_probe(probe="utxocache:add", fn_name="trace_utxocache_add") ctx.enable_probe(probe="utxocache:spent", fn_name="trace_utxocache_spent") bpf = BPF(text=utxocache_changes_program, usdt_contexts=[ctx], debug=0) # The handle_* function is a ctypes callback function called from C. When # we assert in the handle_* function, the AssertError doesn't propagate # back to Python. The exception is ignored. We manually count and assert # that the handle_* functions succeeded. EXPECTED_HANDLE_ADD_SUCCESS = 2 EXPECTED_HANDLE_SPENT_SUCCESS = 1 handle_add_succeeds = 0 handle_spent_succeeds = 0 expected_utxocache_spents = [] expected_utxocache_adds = [] def handle_utxocache_add(_, data, __): nonlocal handle_add_succeeds event = ctypes.cast(data, ctypes.POINTER(UTXOCacheChange)).contents self.log.info(f"handle_utxocache_add(): {event}") add = expected_utxocache_adds.pop(0) assert_equal(add["txid"], bytes(event.txid[::-1]).hex()) assert_equal(add["index"], event.index) assert_equal(add["height"], event.height) assert_equal(add["value"], event.value) assert_equal(add["is_coinbase"], event.is_coinbase) handle_add_succeeds += 1 def handle_utxocache_spent(_, data, __): nonlocal handle_spent_succeeds event = ctypes.cast(data, ctypes.POINTER(UTXOCacheChange)).contents self.log.info(f"handle_utxocache_spent(): {event}") spent = expected_utxocache_spents.pop(0) assert_equal(spent["txid"], bytes(event.txid[::-1]).hex()) assert_equal(spent["index"], event.index) assert_equal(spent["height"], event.height) assert_equal(spent["value"], event.value) assert_equal(spent["is_coinbase"], event.is_coinbase) handle_spent_succeeds += 1 bpf["utxocache_add"].open_perf_buffer(handle_utxocache_add) bpf["utxocache_spent"].open_perf_buffer(handle_utxocache_spent) # We trigger a block re-connection. This causes changes (add/spent) # to the UTXO-cache which in turn triggers the tracepoints. self.log.info("reconsider the previously invalidated block") self.nodes[0].reconsiderblock(block_hash) block = self.nodes[0].getblock(block_hash, 2) for (block_index, tx) in enumerate(block["tx"]): for vin in tx["vin"]: if "coinbase" not in vin: prevout_tx = self.nodes[0].getrawtransaction( vin["txid"], True) prevout_tx_block = self.nodes[0].getblockheader( prevout_tx["blockhash"]) spends_coinbase = "coinbase" in prevout_tx["vin"][0] expected_utxocache_spents.append({ "txid": vin["txid"], "index": vin["vout"], "height": prevout_tx_block["height"], "value": int(prevout_tx["vout"][vin["vout"]]["value"] * COIN), "is_coinbase": spends_coinbase, }) for (i, vout) in enumerate(tx["vout"]): if vout["scriptPubKey"]["type"] != "nulldata": expected_utxocache_adds.append({ "txid": tx["txid"], "index": i, "height": block["height"], "value": int(vout["value"] * COIN), "is_coinbase": block_index == 0, }) assert_equal(EXPECTED_HANDLE_ADD_SUCCESS, len(expected_utxocache_adds)) assert_equal(EXPECTED_HANDLE_SPENT_SUCCESS, len(expected_utxocache_spents)) bpf.perf_buffer_poll(timeout=200) bpf.cleanup() self.log.info( f"check that we successfully traced {EXPECTED_HANDLE_ADD_SUCCESS} adds and {EXPECTED_HANDLE_SPENT_SUCCESS} spent") assert_equal(0, len(expected_utxocache_adds)) assert_equal(0, len(expected_utxocache_spents)) assert_equal(EXPECTED_HANDLE_ADD_SUCCESS, handle_add_succeeds) assert_equal(EXPECTED_HANDLE_SPENT_SUCCESS, handle_spent_succeeds)
help='pid of process emitting probes') args = parser.parse_args() bpfs = dict() # These probes do not currently exist on any branch, and were used solely for my own testing. # You can use any probes that pass through a BSONObj size and a BSONObj char* in that order. probes = []#["query1", "query", "query1", "query1R", "queryR", "query2", "query3"] # necessary to have multiple bpfs, one for each object, # due to restriction on number of insns per bpf object. # copying out long strings requires a large number of insns, # and duplicating that for each entrypt function is prohibitively # large in terms of number of isns. for probe in probes: print(probe) usdt = USDT(pid=args.pid[0]) usdt.enable_probe(probe, fn_name="str_entry_{}".format(probe)) bpfs[probe] = BPF(text=code.replace("#PROBE#", probe), usdt_contexts=[usdt]) print(code) wrks = [] for probe in probes: print(probe) bpf = bpfs[probe] bpf["str_out_{}".format(probe)].open_perf_buffer(gen_callback(probe, bpfs)) wrks.append(WorkerThread(gen_work(probe, bpfs))) master = WorkerMaster(wrks) master.start_all() print("Ready")
class Probe(object): def __init__(self, pattern, use_regex=False, pid=None): """Init a new probe. Init the probe from the pattern provided by the user. The supported patterns mimic the 'trace' and 'argdist' tools, but are simpler because we don't have to distinguish between probes and retprobes. func -- probe a kernel function lib:func -- probe a user-space function in the library 'lib' /path:func -- probe a user-space function in binary '/path' p::func -- same thing as 'func' p:lib:func -- same thing as 'lib:func' t:cat:event -- probe a kernel tracepoint u:lib:probe -- probe a USDT tracepoint """ parts = pattern.split(':') if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] elif len(parts) == 3: if parts[0] == "t": parts = ["t", "", "%s:%s" % tuple(parts[1:])] if parts[0] not in ["p", "t", "u"]: raise Exception("Type must be 'p', 't', or 'u', but got %s" % parts[0]) else: raise Exception("Too many ':'-separated components in pattern %s" % pattern) (self.type, self.library, self.pattern) = parts if not use_regex: self.pattern = self.pattern.replace('*', '.*') self.pattern = '^' + self.pattern + '$' if (self.type == "p" and self.library) or self.type == "u": libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: raise Exception("unable to find library %s" % self.library) self.library = libpath self.pid = pid self.matched = 0 self.trace_functions = {} # map location number to function name def is_kernel_probe(self): return self.type == "t" or (self.type == "p" and self.library == "") def attach(self): if self.type == "p" and not self.library: for index, function in self.trace_functions.items(): self.bpf.attach_kprobe( event=function, fn_name="trace_count_%d" % index, pid=self.pid or -1) elif self.type == "p" and self.library: for index, function in self.trace_functions.items(): self.bpf.attach_uprobe( name=self.library, sym=function, fn_name="trace_count_%d" % index, pid=self.pid or -1) elif self.type == "t": for index, function in self.trace_functions.items(): self.bpf.attach_tracepoint( tp=function, fn_name="trace_count_%d" % index, pid=self.pid or -1) elif self.type == "u": pass # Nothing to do -- attach already happened in `load` def _add_function(self, template, probe_name): new_func = "trace_count_%d" % self.matched text = template.replace("PROBE_FUNCTION", new_func) text = text.replace("LOCATION", str(self.matched)) self.trace_functions[self.matched] = probe_name self.matched += 1 return text def _generate_functions(self, template): self.usdt = None text = "" if self.type == "p" and not self.library: functions = BPF.get_kprobe_functions(self.pattern) verify_limit(len(functions)) for function in functions: text += self._add_function(template, function) elif self.type == "p" and self.library: # uprobes are tricky because the same function may have multiple # addresses, and the same address may be mapped to multiple # functions. We aren't allowed to create more than one uprobe # per address, so track unique addresses and ignore functions that # map to an address that we've already seen. Also ignore functions # that may repeat multiple times with different addresses. addresses, functions = (set(), set()) functions_and_addresses = BPF.get_user_functions_and_addresses( self.library, self.pattern) verify_limit(len(functions_and_addresses)) for function, address in functions_and_addresses: if address in addresses or function in functions: continue addresses.add(address) functions.add(function) text += self._add_function(template, function) elif self.type == "t": tracepoints = BPF.get_tracepoints(self.pattern) verify_limit(len(tracepoints)) for tracepoint in tracepoints: text += self._add_function(template, tracepoint) elif self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) matches = [] for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): matches.append(probe.name) verify_limit(len(matches)) for match in matches: new_func = "trace_count_%d" % self.matched text += self._add_function(template, match) self.usdt.enable_probe(match, new_func) if debug: print(self.usdt.get_text()) return text def load(self): trace_count_text = """ int PROBE_FUNCTION(void *ctx) { FILTER int loc = LOCATION; u64 *val = counts.lookup(&loc); if (!val) { return 0; // Should never happen, # of locations is known } (*val)++; return 0; } """ bpf_text = """#include <uapi/linux/ptrace.h> BPF_ARRAY(counts, u64, NUMLOCATIONS); """ # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.pid: trace_count_text = trace_count_text.replace('FILTER', """u32 pid = bpf_get_current_pid_tgid() >> 32; if (pid != %d) { return 0; }""" % self.pid) else: trace_count_text = trace_count_text.replace('FILTER', '') bpf_text += self._generate_functions(trace_count_text) bpf_text = bpf_text.replace("NUMLOCATIONS", str(len(self.trace_functions))) if debug: print(bpf_text) if self.matched == 0: raise Exception("No functions matched by pattern %s" % self.pattern) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else []) self.clear() # Initialize all array items to zero def counts(self): return self.bpf["counts"] def clear(self): counts = self.bpf["counts"] for location, _ in list(self.trace_functions.items()): counts[counts.Key(location)] = counts.Leaf()
def test_flush(self): """ Tests the utxocache:flush tracepoint API. See https://github.com/bitcoin/bitcoin/blob/master/doc/tracing.md#tracepoint-utxocacheflush""" self.log.info("test the utxocache:flush tracepoint API") self.log.info("hook into the utxocache:flush tracepoint") ctx = USDT(path=str(self.options.bitcoind)) ctx.enable_probe(probe="utxocache:flush", fn_name="trace_utxocache_flush") bpf = BPF(text=utxocache_flushes_program, usdt_contexts=[ctx], debug=0) # The handle_* function is a ctypes callback function called from C. When # we assert in the handle_* function, the AssertError doesn't propagate # back to Python. The exception is ignored. We manually count and assert # that the handle_* functions succeeded. EXPECTED_HANDLE_FLUSH_SUCCESS = 3 handle_flush_succeeds = 0 possible_cache_sizes = set() expected_flushes = [] def handle_utxocache_flush(_, data, __): nonlocal handle_flush_succeeds event = ctypes.cast(data, ctypes.POINTER(UTXOCacheFlush)).contents self.log.info(f"handle_utxocache_flush(): {event}") expected = expected_flushes.pop(0) assert_equal(expected["mode"], FLUSHMODE_NAME[event.mode]) possible_cache_sizes.remove(event.size) # fails if size not in set # sanity checks only assert(event.memory > 0) assert(event.duration > 0) handle_flush_succeeds += 1 bpf["utxocache_flush"].open_perf_buffer(handle_utxocache_flush) self.log.info("stop the node to flush the UTXO cache") UTXOS_IN_CACHE = 104 # might need to be changed if the eariler tests are modified # A node shutdown causes two flushes. One that flushes UTXOS_IN_CACHE # UTXOs and one that flushes 0 UTXOs. Normally the 0-UTXO-flush is the # second flush, however it can happen that the order changes. possible_cache_sizes = {UTXOS_IN_CACHE, 0} flush_for_shutdown = {"mode": "ALWAYS", "for_prune": False} expected_flushes.extend([flush_for_shutdown, flush_for_shutdown]) self.stop_node(0) bpf.perf_buffer_poll(timeout=200) self.log.info("check that we don't expect additional flushes") assert_equal(0, len(expected_flushes)) assert_equal(0, len(possible_cache_sizes)) self.log.info("restart the node with -prune") self.start_node(0, ["-fastprune=1", "-prune=1"]) BLOCKS_TO_MINE = 350 self.log.info(f"mine {BLOCKS_TO_MINE} blocks to be able to prune") self.generate(self.wallet, BLOCKS_TO_MINE) # we added BLOCKS_TO_MINE coinbase UTXOs to the cache possible_cache_sizes = {BLOCKS_TO_MINE} expected_flushes.append( {"mode": "NONE", "for_prune": True, "size_fn": lambda x: x == BLOCKS_TO_MINE}) self.log.info(f"prune blockchain to trigger a flush for pruning") self.nodes[0].pruneblockchain(315) bpf.perf_buffer_poll(timeout=500) bpf.cleanup() self.log.info( f"check that we don't expect additional flushes and that the handle_* function succeeded") assert_equal(0, len(expected_flushes)) assert_equal(0, len(possible_cache_sizes)) assert_equal(EXPECTED_HANDLE_FLUSH_SUCCESS, handle_flush_succeeds)
./uthreads -l java 185 # trace Java threads in process 185 ./uthreads 12245 # trace only pthreads in process 12245 """ parser = argparse.ArgumentParser( description="Trace thread creation/destruction events in " + "high-level languages.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) parser.add_argument("-l", "--language", choices=languages + ["none"], help="language to trace (none for pthreads only)") parser.add_argument("pid", type=int, help="process id to attach to") parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode: print the BPF program (for debugging purposes)") args = parser.parse_args() usdt = USDT(pid=args.pid) program = """ struct thread_event_t { u64 runtime_id; u64 native_id; char type[8]; char name[80]; }; BPF_PERF_OUTPUT(threads); int trace_pthread(struct pt_regs *ctx) { struct thread_event_t te = {}; u64 start_routine = 0; char type[] = "pthread";
"-DHHVM_TRACING_MAX_STACK_FRAMES={}".format(200), "-DHHVM_TRACING_FILE_NAME_MAX={}".format(128), "-DHHVM_TRACING_CLASS_NAME_MAX={}".format(128), "-DHHVM_TRACING_FUNCTION_MAX={}".format(128), ] bpf_text = "" with open("../strobelight_hhvm_structs.h", "r") as structs_file: bpf_text += structs_file.read() + "\n" with open("./strobelight_hhvm_signal_latency.c", "r") as latency_file: bpf_text += latency_file.read() usdts = [] for pid in args.pids: usdt = USDT(pid=pid) usdt.enable_probe("hhvm_surprise", "on_hhvm_signal_handler") print("Enabled tracing on {}\n".format(pid)) usdts.append(usdt) bpf = BPF(text=bpf_text, usdt_contexts=usdts, cflags=cflags) hhvm_pids = bpf.get_table("hhvm_pids") for pid in args.pids: hhvm_pids[c_int(pid)] = c_int(1) # A generic perf event for driving the signal bpf.attach_perf_event( ev_type=PerfType.HARDWARE, ev_config=PerfHWConfig.CPU_CYCLES, fn_name="on_event",
def test_attach1(self): # enable USDT probe from given PID and verifier generated BPF programs u = USDT(pid=int(self.app.pid)) u.enable_probe(probe="probe_point_1", fn_name="do_trace1") u.enable_probe(probe="probe_point_2", fn_name="do_trace2") u.enable_probe(probe="probe_point_3", fn_name="do_trace3") u.enable_probe(probe="probe_point_4", fn_name="do_trace4") u.enable_probe(probe="probe_point_5", fn_name="do_trace5") b = BPF(text=self.bpf_text, usdt_contexts=[u], debug=4) # Event states for each event: # 0 - probe not caught, 1 - probe caught with correct value, # 2 - probe caught with incorrect value self.evt_st_1 = 0 self.evt_st_2 = 0 self.evt_st_3 = 0 # define output data structure in Python class Data1(ct.Structure): _fields_ = [("v1", ct.c_char), ("v2", ct.c_int)] class Data2(ct.Structure): _fields_ = [("v1", ct.c_int), ("v2", ct.c_char)] class Data3(ct.Structure): _fields_ = [("v1", ct.c_int), ("v2", ct.c_int)] class Data4(ct.Structure): _fields_ = [("v1", ct.c_ulonglong), ("v2", ct.c_char * 64)] class Data5(ct.Structure): _fields_ = [("v1", ct.c_char * 64), ("v2", ct.c_ulonglong)] def check_event_val(event, event_state, v1, v2, v3, v4): if ((event.v1 == v1 and event.v2 == v2) or (event.v1 == v3 and event.v2 == v4)): if (event_state == 0 or event_state == 1): return 1 return 2 def print_event1(cpu, data, size): event = ct.cast(data, ct.POINTER(Data1)).contents self.evt_st_1 = check_event_val(event, self.evt_st_1, b'\x0d', 40, b'\x08', 200) def print_event2(cpu, data, size): event = ct.cast(data, ct.POINTER(Data2)).contents # pretend we have two identical probe points to simplify the code self.evt_st_2 = check_event_val(event, self.evt_st_2, 5, b'\x04', 5, b'\x04') def print_event3(cpu, data, size): event = ct.cast(data, ct.POINTER(Data3)).contents self.evt_st_3 = check_event_val(event, self.evt_st_3, 200, 40, 8, 13) def print_event4(cpu, data, size): event = ct.cast(data, ct.POINTER(Data4)).contents print("%s" % event.v2) def print_event5(cpu, data, size): event = ct.cast(data, ct.POINTER(Data5)).contents print("%s" % event.v1) # loop with callback to print_event b["event1"].open_perf_buffer(print_event1) b["event2"].open_perf_buffer(print_event2) b["event3"].open_perf_buffer(print_event3) b["event4"].open_perf_buffer(print_event4) b["event5"].open_perf_buffer(print_event5) # three iterations to make sure we get some probes and have time to process them for i in range(3): b.kprobe_poll() self.assertTrue(self.evt_st_1 == 1 and self.evt_st_2 == 1 and self.evt_st_3 == 1)
""" parser = argparse.ArgumentParser( description="Trace thread creation/destruction events in " + "high-level languages.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) parser.add_argument("-l", "--language", choices=languages + ["none"], help="language to trace (none for pthreads only)") parser.add_argument("pid", type=int, help="process id to attach to") parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode: print the BPF program (for debugging purposes)") parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) args = parser.parse_args() usdt = USDT(pid=args.pid) program = """ struct thread_event_t { u64 runtime_id; u64 native_id; char type[8]; char name[80]; }; BPF_PERF_OUTPUT(threads); int trace_pthread(struct pt_regs *ctx) { struct thread_event_t te = {}; u64 start_routine = 0; char type[] = "pthread";
timestampp = temp.lookup(&pid); if (!timestampp) return 0; u64 delta = bpf_ktime_get_ns() - *timestampp; if (delta/1000000 < %d) return 0; delta /= %d; latency.increment(bpf_log2l(delta)); temp.delete(&pid); return 0; } """ % (args.threshold, 1000000 if args.milliseconds else 1000) usdt = USDT(pid=int(dbpid)) usdt.enable_probe("query__start", "probe_start") usdt.enable_probe("query__done", "probe_end") bpf = BPF(text=program, usdt_contexts=[usdt]) if args.verbose: print(usdt.get_text()) print(program) print("Tracing database queries slower than %dms for PID %d... Ctrl+C to quit." % (args.threshold, dbpid)) try: sleep(999999999999) except KeyboardInterrupt: pass
def main(): # # Don't like these globals, but ctx passing does not seem to work with the # existing open_ring_buffer() API :( # global b global options # # Argument parsing # parser = argparse.ArgumentParser() parser.add_argument("--buffer-page-count", help="Number of BPF ring buffer pages, default 1024", type=int, default=1024, metavar="NUMBER") parser.add_argument("-D", "--debug", help="Enable eBPF debugging", type=int, const=0x3f, default=0, nargs='?') parser.add_argument('-d', '--packet-decode', help='Display packet content in selected mode, ' 'default none', choices=['none', 'hex', 'decode'], default='none') parser.add_argument("-f", "--flow-key-size", help="Set maximum flow key size to capture, " "default 64", type=buffer_size_type, default=64, metavar="[64-2048]") parser.add_argument('-k', '--flow-key-decode', help='Display flow-key content in selected mode, ' 'default none', choices=['none', 'hex', 'nlraw'], default='none') parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID", help="ovs-vswitch's PID", type=int, default=None) parser.add_argument("-s", "--packet-size", help="Set maximum packet size to capture, " "default 64", type=buffer_size_type, default=64, metavar="[64-2048]") parser.add_argument("-w", "--pcap", metavar="PCAP_FILE", help="Write upcall packets to specified pcap file.", type=str, default=None) options = parser.parse_args() # # Find the PID of the ovs-vswitchd daemon if not specified. # if options.pid is None: for proc in psutil.process_iter(): if 'ovs-vswitchd' in proc.name(): if options.pid is not None: print("ERROR: Multiple ovs-vswitchd daemons running, " "use the -p option!") sys.exit(-1) options.pid = proc.pid # # Error checking on input parameters # if options.pid is None: print("ERROR: Failed to find ovs-vswitchd's PID!") sys.exit(-1) if options.pcap is not None: if exists(options.pcap): print("ERROR: Destination capture file \"{}\" already exists!". format(options.pcap)) sys.exit(-1) options.buffer_page_count = next_power_of_two(options.buffer_page_count) # # Attach the usdt probe # u = USDT(pid=int(options.pid)) try: u.enable_probe(probe="recv_upcall", fn_name="do_trace") except USDTException as e: print("ERROR: {}" "ovs-vswitchd!".format( (re.sub('^', ' ' * 7, str(e), flags=re.MULTILINE)).strip().replace( "--with-dtrace or --enable-dtrace", "--enable-usdt-probes"))) sys.exit(-1) # # Uncomment to see how arguments are decoded. # print(u.get_text()) # # # Attach probe to running process # source = ebpf_source.replace("<MAX_PACKET_VAL>", str(options.packet_size)) source = source.replace("<MAX_KEY_VAL>", str(options.flow_key_size)) source = source.replace("<BUFFER_PAGE_CNT>", str(options.buffer_page_count)) b = BPF(text=source, usdt_contexts=[u], debug=options.debug) # # Print header # print("{:<18} {:<4} {:<16} {:<10} {:<32} {:<4} {:<10} {:<10}".format( "TIME", "CPU", "COMM", "PID", "DPIF_NAME", "TYPE", "PKT_LEN", "FLOW_KEY_LEN")) # # Dump out all events # b['events'].open_ring_buffer(print_event) while 1: try: b.ring_buffer_poll() time.sleep(0.5) except KeyboardInterrupt: break dropcnt = b.get_table("dropcnt") for k in dropcnt.keys(): count = dropcnt.sum(k).value if k.value == 0 and count > 0: print("\nWARNING: Not all upcalls were captured, {} were dropped!" "\n Increase the BPF ring buffer size with the " "--buffer-page-count option.".format(count))
int command_failed(struct pt_regs *ctx) {{ struct failed_out out = {{}}; int err_code = 0; bpf_usdt_readarg(6, ctx, &err_code); error_hist.increment(err_code); const char* addr = NULL; bpf_usdt_readarg(2, ctx, &addr); bpf_probe_read_str(out.name, sizeof(out.name), addr); out.error_code = err_code; failed.perf_submit(ctx, &out, sizeof(out)); return 0; }} """.format(NUM_ERR_CODES=len(ERROR_CODES)) command_failed = USDT(pid=int(sys.argv[1])) command_failed.enable_probe(probe="commandFail", fn_name="command_failed") b = BPF(text = text, usdt_contexts=[command_failed]) command_to_errors = dict() def print_event(cpu, data, size): event = b["failed"].event(data) event_name = str(event.name, 'utf-8') if event_name not in command_to_errors: command_to_errors[event_name] = dict() if event.error_code not in command_to_errors[event_name]: command_to_errors[event_name][event.error_code] = 0 command_to_errors[event_name][event.error_code] += 1 b["failed"].open_perf_buffer(print_event)
bpf = BPF(text=program) bpf.attach_uprobe(name=args.path, sym=mysql_func_name, fn_name="query_start") bpf.attach_uretprobe(name=args.path, sym=mysql_func_name, fn_name="query_end") else: # USDT mode if not args.pids or len(args.pids) == 0: if args.db == "mysql": args.pids = map(int, subprocess.check_output( "pidof mysqld".split()).split()) elif args.db == "postgres": args.pids = map(int, subprocess.check_output( "pidof postgres".split()).split()) usdts = map(lambda pid: USDT(pid=pid), args.pids) for usdt in usdts: usdt.enable_probe("query__start", "query_start") usdt.enable_probe("query__done", "query_end") if args.verbose: print('\n'.join(map(lambda u: u.get_text(), usdts))) bpf = BPF(text=program, usdt_contexts=usdts) if args.verbose or args.ebpf: print(program) if args.ebpf: exit() start = BPF.monotonic_time()
class Tool(object): examples = """ Probe specifier syntax: {p,r,t,u}:{[library],category}:function(signature)[:type[,type...]:expr[,expr...][:filter]][#label] Where: p,r,t,u -- probe at function entry, function exit, kernel tracepoint, or USDT probe in exit probes: can use $retval, $entry(param), $latency library -- the library that contains the function (leave empty for kernel functions) category -- the category of the kernel tracepoint (e.g. net, sched) function -- the function name to trace (or tracepoint name) signature -- the function's parameters, as in the C header type -- the type of the expression to collect (supports multiple) expr -- the expression to collect (supports multiple) filter -- the filter that is applied to collected values label -- the label for this probe in the resulting output EXAMPLES: argdist -H 'p::__kmalloc(u64 size):u64:size' Print a histogram of allocation sizes passed to kmalloc argdist -p 1005 -C 'p:c:malloc(size_t size):size_t:size:size==16' Print a frequency count of how many times process 1005 called malloc with an allocation size of 16 bytes argdist -C 'r:c:gets():char*:(char*)$retval#snooped strings' Snoop on all strings returned by gets() argdist -H 'r::__kmalloc(size_t size):u64:$latency/$entry(size)#ns per byte' Print a histogram of nanoseconds per byte from kmalloc allocations argdist -C 'p::__kmalloc(size_t size, gfp_t flags):size_t:size:flags&GFP_ATOMIC' Print frequency count of kmalloc allocation sizes that have GFP_ATOMIC argdist -p 1005 -C 'p:c:write(int fd):int:fd' -T 5 Print frequency counts of how many times writes were issued to a particular file descriptor number, in process 1005, but only show the top 5 busiest fds argdist -p 1005 -H 'r:c:read()' Print a histogram of results (sizes) returned by read() in process 1005 argdist -C 'r::__vfs_read():u32:$PID:$latency > 100000' Print frequency of reads by process where the latency was >0.1ms argdist -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000' Print a histogram of read sizes that were longer than 1ms argdist -H \\ 'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1' Print a histogram of buffer sizes passed to write() across all processes, where the file descriptor was 1 (STDOUT) argdist -C 'p:c:fork()#fork calls' Count fork() calls in libc across all processes Can also use funccount.py, which is easier and more flexible argdist -H 't:block:block_rq_complete():u32:tp.nr_sector' Print histogram of number of sectors in completing block I/O requests argdist -C 't:irq:irq_handler_entry():int:tp.irq' Aggregate interrupts by interrupt request (IRQ) argdist -C 'u:pthread:pthread_start():u64:arg2' -p 1337 Print frequency of function addresses used as a pthread start function, relying on the USDT pthread_start probe in process 1337 argdist -H \\ 'p:c:sleep(u32 seconds):u32:seconds' \\ 'p:c:nanosleep(struct timespec *req):long:req->tv_nsec' Print histograms of sleep() and nanosleep() parameter values argdist -p 2780 -z 120 \\ -C 'p:c:write(int fd, char* buf, size_t len):char*:buf:fd==1' Spy on writes to STDOUT performed by process 2780, up to a string size of 120 characters """ def __init__(self): parser = argparse.ArgumentParser(description="Trace a " + "function and display a summary of its parameter values.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=Tool.examples) parser.add_argument("-p", "--pid", type=int, help="id of the process to trace (optional)") parser.add_argument("-z", "--string-size", default=80, type=int, help="maximum string size to read from char* arguments") parser.add_argument("-i", "--interval", default=1, type=int, help="output interval, in seconds") parser.add_argument("-n", "--number", type=int, dest="count", help="number of outputs") parser.add_argument("-v", "--verbose", action="store_true", help="print resulting BPF program code before executing") parser.add_argument("-T", "--top", type=int, help="number of top results to show (not applicable to " + "histograms)") parser.add_argument("-H", "--histogram", nargs="*", dest="histspecifier", metavar="specifier", help="probe specifier to capture histogram of " + "(see examples below)") parser.add_argument("-C", "--count", nargs="*", dest="countspecifier", metavar="specifier", help="probe specifier to capture count of " + "(see examples below)") parser.add_argument("-I", "--include", nargs="*", metavar="header", help="additional header files to include in the BPF program") self.args = parser.parse_args() self.usdt_ctx = None def _create_probes(self): self.probes = [] for specifier in (self.args.countspecifier or []): self.probes.append(Probe(self, "freq", specifier)) for histspecifier in (self.args.histspecifier or []): self.probes.append(Probe(self, "hist", histspecifier)) if len(self.probes) == 0: print("at least one specifier is required") exit() def enable_usdt_probe(self, probe_name, fn_name): if not self.usdt_ctx: self.usdt_ctx = USDT(pid=self.args.pid) self.usdt_ctx.enable_probe(probe_name, fn_name) def _generate_program(self): bpf_source = """ struct __string_t { char s[%d]; }; #include <uapi/linux/ptrace.h> """ % self.args.string_size for include in (self.args.include or []): bpf_source += "#include <%s>\n" % include bpf_source += BPF.generate_auto_includes( map(lambda p: p.raw_spec, self.probes)) bpf_source += Tracepoint.generate_decl() bpf_source += Tracepoint.generate_entry_probe() for probe in self.probes: bpf_source += probe.generate_text() if self.args.verbose: if self.usdt_ctx: print(self.usdt_ctx.get_text()) print(bpf_source) self.bpf = BPF(text=bpf_source, usdt=self.usdt_ctx) def _attach(self): Tracepoint.attach(self.bpf) for probe in self.probes: probe.attach(self.bpf) if self.args.verbose: print("open uprobes: %s" % self.bpf.open_uprobes) print("open kprobes: %s" % self.bpf.open_kprobes) def _main_loop(self): count_so_far = 0 while True: try: sleep(self.args.interval) except KeyboardInterrupt: exit() print("[%s]" % strftime("%H:%M:%S")) for probe in self.probes: probe.display(self.args.top) count_so_far += 1 if self.args.count is not None and \ count_so_far >= self.args.count: exit() def _close_probes(self): for probe in self.probes: probe.close() if self.args.verbose: print("closed probe: " + str(probe)) def run(self): try: self._create_probes() self._generate_program() self._attach() self._main_loop() except: if self.args.verbose: traceback.print_exc() elif sys.exc_info()[0] is not SystemExit: print(sys.exc_info()[1]) self._close_probes()
def _find_usdt_probe(self): self.usdt = USDT(path=self.library, pid=Probe.pid) for probe in self.usdt.enumerate_probes(): if probe.name == self.usdt_name: return # Found it, will enable later self._bail("unrecognized USDT probe %s" % self.usdt_name)
class Probe(object): next_probe_index = 0 streq_index = 0 aliases = {"$PID": "(bpf_get_current_pid_tgid() >> 32)"} def _substitute_aliases(self, expr): if expr is None: return expr for alias, subst in Probe.aliases.items(): expr = expr.replace(alias, subst) return expr def _parse_signature(self): params = map(str.strip, self.signature.split(',')) self.param_types = {} for param in params: # If the type is a pointer, the * can be next to the # param name. Other complex types like arrays are not # supported right now. index = param.rfind('*') index = index if index != -1 else param.rfind(' ') param_type = param[0:index + 1].strip() param_name = param[index + 1:].strip() self.param_types[param_name] = param_type def _generate_entry(self): self.entry_probe_func = self.probe_func_name + "_entry" text = """ int PROBENAME(struct pt_regs *ctx SIGNATURE) { u64 __pid_tgid = bpf_get_current_pid_tgid(); u32 __pid = __pid_tgid; // lower 32 bits u32 __tgid = __pid_tgid >> 32; // upper 32 bits PID_FILTER COLLECT return 0; } """ text = text.replace("PROBENAME", self.entry_probe_func) text = text.replace("SIGNATURE", "" if len(self.signature) == 0 else ", " + self.signature) text = text.replace("PID_FILTER", self._generate_pid_filter()) collect = "" for pname in self.args_to_probe: param_hash = self.hashname_prefix + pname if pname == "__latency": collect += """ u64 __time = bpf_ktime_get_ns(); %s.update(&__pid, &__time); """ % param_hash else: collect += "%s.update(&__pid, &%s);\n" % \ (param_hash, pname) text = text.replace("COLLECT", collect) return text def _generate_entry_probe(self): # Any $entry(name) expressions result in saving that argument # when entering the function. self.args_to_probe = set() regex = r"\$entry\((\w+)\)" for expr in self.exprs: for arg in re.finditer(regex, expr): self.args_to_probe.add(arg.group(1)) for arg in re.finditer(regex, self.filter): self.args_to_probe.add(arg.group(1)) if any(map(lambda expr: "$latency" in expr, self.exprs)) or \ "$latency" in self.filter: self.args_to_probe.add("__latency") self.param_types["__latency"] = "u64" # nanoseconds for pname in self.args_to_probe: if pname not in self.param_types: raise ValueError("$entry(%s): no such param" % arg) self.hashname_prefix = "%s_param_" % self.probe_hash_name text = "" for pname in self.args_to_probe: # Each argument is stored in a separate hash that is # keyed by pid. text += "BPF_HASH(%s, u32, %s);\n" % \ (self.hashname_prefix + pname, self.param_types[pname]) text += self._generate_entry() return text def _generate_retprobe_prefix(self): # After we're done here, there are __%s_val variables for each # argument we needed to probe using $entry(name), and they all # have values (which isn't necessarily the case if we missed # the method entry probe). text = "" self.param_val_names = {} for pname in self.args_to_probe: val_name = "__%s_val" % pname text += "%s *%s = %s.lookup(&__pid);\n" % \ (self.param_types[pname], val_name, self.hashname_prefix + pname) text += "if (%s == 0) { return 0 ; }\n" % val_name self.param_val_names[pname] = val_name return text def _replace_entry_exprs(self): for pname, vname in self.param_val_names.items(): if pname == "__latency": entry_expr = "$latency" val_expr = "(bpf_ktime_get_ns() - *%s)" % vname else: entry_expr = "$entry(%s)" % pname val_expr = "(*%s)" % vname for i in range(0, len(self.exprs)): self.exprs[i] = self.exprs[i].replace( entry_expr, val_expr) self.filter = self.filter.replace(entry_expr, val_expr) def _attach_entry_probe(self): if self.is_user: self.bpf.attach_uprobe(name=self.library, sym=self.function, fn_name=self.entry_probe_func, pid=self.pid or -1) else: self.bpf.attach_kprobe(event=self.function, fn_name=self.entry_probe_func) def _bail(self, error): raise ValueError("error parsing probe '%s': %s" % (self.raw_spec, error)) def _validate_specifier(self): # Everything after '#' is the probe label, ignore it spec = self.raw_spec.split('#')[0] parts = spec.strip().split(':') if len(parts) < 3: self._bail("at least the probe type, library, and " + "function signature must be specified") if len(parts) > 6: self._bail("extraneous ':'-separated parts detected") if parts[0] not in ["r", "p", "t", "u"]: self._bail("probe type must be 'p', 'r', 't', or 'u'" + " but got '%s'" % parts[0]) if re.match(r"\S+\(.*\)", parts[2]) is None: self._bail(("function signature '%s' has an invalid " + "format") % parts[2]) def _parse_expr_types(self, expr_types): if len(expr_types) == 0: self._bail("no expr types specified") self.expr_types = expr_types.split(',') def _parse_exprs(self, exprs): if len(exprs) == 0: self._bail("no exprs specified") self.exprs = exprs.split(',') def _make_valid_identifier(self, ident): return re.sub(r'[^A-Za-z0-9_]', '_', ident) def __init__(self, tool, type, specifier): self.usdt_ctx = None self.streq_functions = "" self.pid = tool.args.pid self.cumulative = tool.args.cumulative or False self.raw_spec = specifier self._validate_specifier() spec_and_label = specifier.split('#') self.label = spec_and_label[1] \ if len(spec_and_label) == 2 else None parts = spec_and_label[0].strip().split(':') self.type = type # hist or freq self.probe_type = parts[0] fparts = parts[2].split('(') self.function = fparts[0].strip() if self.probe_type == "t": self.library = "" # kernel self.tp_category = parts[1] self.tp_event = self.function elif self.probe_type == "u": self.library = parts[1] self.probe_func_name = self._make_valid_identifier( "%s_probe%d" % (self.function, Probe.next_probe_index)) self._enable_usdt_probe() else: self.library = parts[1] self.is_user = len(self.library) > 0 self.signature = fparts[1].strip()[:-1] self._parse_signature() # If the user didn't specify an expression to probe, we probe # the retval in a ret probe, or simply the value "1" otherwise. self.is_default_expr = len(parts) < 5 if not self.is_default_expr: self._parse_expr_types(parts[3]) self._parse_exprs(parts[4]) if len(self.exprs) != len(self.expr_types): self._bail("mismatched # of exprs and types") if self.type == "hist" and len(self.expr_types) > 1: self._bail("histograms can only have 1 expr") else: if not self.probe_type == "r" and self.type == "hist": self._bail("histograms must have expr") self.expr_types = \ ["u64" if not self.probe_type == "r" else "int"] self.exprs = \ ["1" if not self.probe_type == "r" else "$retval"] self.filter = "" if len(parts) != 6 else parts[5] self._substitute_exprs() # Do we need to attach an entry probe so that we can collect an # argument that is required for an exit (return) probe? def check(expr): keywords = ["$entry", "$latency"] return any(map(lambda kw: kw in expr, keywords)) self.entry_probe_required = self.probe_type == "r" and \ (any(map(check, self.exprs)) or check(self.filter)) self.probe_func_name = self._make_valid_identifier( "%s_probe%d" % (self.function, Probe.next_probe_index)) self.probe_hash_name = self._make_valid_identifier( "%s_hash%d" % (self.function, Probe.next_probe_index)) Probe.next_probe_index += 1 def _enable_usdt_probe(self): self.usdt_ctx = USDT(path=self.library, pid=self.pid) self.usdt_ctx.enable_probe( self.function, self.probe_func_name) def _generate_streq_function(self, string): fname = "streq_%d" % Probe.streq_index Probe.streq_index += 1 self.streq_functions += """ static inline bool %s(char const *ignored, char const *str) { char needle[] = %s; char haystack[sizeof(needle)]; bpf_probe_read(&haystack, sizeof(haystack), (void *)str); for (int i = 0; i < sizeof(needle) - 1; ++i) { if (needle[i] != haystack[i]) { return false; } } return true; } """ % (fname, string) return fname def _substitute_exprs(self): def repl(expr): expr = self._substitute_aliases(expr) matches = re.finditer('STRCMP\\(("[^"]+\\")', expr) for match in matches: string = match.group(1) fname = self._generate_streq_function(string) expr = expr.replace("STRCMP", fname, 1) return expr.replace("$retval", "PT_REGS_RC(ctx)") for i in range(0, len(self.exprs)): self.exprs[i] = repl(self.exprs[i]) self.filter = repl(self.filter) def _is_string(self, expr_type): return expr_type == "char*" or expr_type == "char *" def _generate_hash_field(self, i): if self._is_string(self.expr_types[i]): return "struct __string_t v%d;\n" % i else: return "%s v%d;\n" % (self.expr_types[i], i) def _generate_usdt_arg_assignment(self, i): expr = self.exprs[i] if self.probe_type == "u" and expr[0:3] == "arg": arg_index = int(expr[3]) arg_ctype = self.usdt_ctx.get_probe_arg_ctype( self.function, arg_index - 1) return (" %s %s = 0;\n" + " bpf_usdt_readarg(%s, ctx, &%s);\n") \ % (arg_ctype, expr, expr[3], expr) else: return "" def _generate_field_assignment(self, i): text = self._generate_usdt_arg_assignment(i) if self._is_string(self.expr_types[i]): return (text + " bpf_probe_read(&__key.v%d.s," + " sizeof(__key.v%d.s), (void *)%s);\n") % \ (i, i, self.exprs[i]) else: return text + " __key.v%d = %s;\n" % \ (i, self.exprs[i]) def _generate_hash_decl(self): if self.type == "hist": return "BPF_HISTOGRAM(%s, %s);" % \ (self.probe_hash_name, self.expr_types[0]) else: text = "struct %s_key_t {\n" % self.probe_hash_name for i in range(0, len(self.expr_types)): text += self._generate_hash_field(i) text += "};\n" text += "BPF_HASH(%s, struct %s_key_t, u64);\n" % \ (self.probe_hash_name, self.probe_hash_name) return text def _generate_key_assignment(self): if self.type == "hist": return self._generate_usdt_arg_assignment(0) + \ ("%s __key = %s;\n" % (self.expr_types[0], self.exprs[0])) else: text = "struct %s_key_t __key = {};\n" % \ self.probe_hash_name for i in range(0, len(self.exprs)): text += self._generate_field_assignment(i) return text def _generate_hash_update(self): if self.type == "hist": return "%s.increment(bpf_log2l(__key));" % \ self.probe_hash_name else: return "%s.increment(__key);" % self.probe_hash_name def _generate_pid_filter(self): # Kernel probes need to explicitly filter pid, because the # attach interface doesn't support pid filtering if self.pid is not None and not self.is_user: return "if (__tgid != %d) { return 0; }" % self.pid else: return "" def generate_text(self): program = "" probe_text = """ DATA_DECL """ + ( "TRACEPOINT_PROBE(%s, %s)" % (self.tp_category, self.tp_event) if self.probe_type == "t" else "int PROBENAME(struct pt_regs *ctx SIGNATURE)") + """ { u64 __pid_tgid = bpf_get_current_pid_tgid(); u32 __pid = __pid_tgid; // lower 32 bits u32 __tgid = __pid_tgid >> 32; // upper 32 bits PID_FILTER PREFIX if (!(FILTER)) return 0; KEY_EXPR COLLECT return 0; } """ prefix = "" signature = "" # If any entry arguments are probed in a ret probe, we need # to generate an entry probe to collect them if self.entry_probe_required: program += self._generate_entry_probe() prefix += self._generate_retprobe_prefix() # Replace $entry(paramname) with a reference to the # value we collected when entering the function: self._replace_entry_exprs() if self.probe_type == "p" and len(self.signature) > 0: # Only entry uprobes/kprobes can have user-specified # signatures. Other probes force it to (). signature = ", " + self.signature program += probe_text.replace("PROBENAME", self.probe_func_name) program = program.replace("SIGNATURE", signature) program = program.replace("PID_FILTER", self._generate_pid_filter()) decl = self._generate_hash_decl() key_expr = self._generate_key_assignment() collect = self._generate_hash_update() program = program.replace("DATA_DECL", decl) program = program.replace("KEY_EXPR", key_expr) program = program.replace("FILTER", "1" if len(self.filter) == 0 else self.filter) program = program.replace("COLLECT", collect) program = program.replace("PREFIX", prefix) return self.streq_functions + program def _attach_u(self): libpath = BPF.find_library(self.library) if libpath is None: libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: self._bail("unable to find library %s" % self.library) if self.probe_type == "r": self.bpf.attach_uretprobe(name=libpath, sym=self.function, fn_name=self.probe_func_name, pid=self.pid or -1) else: self.bpf.attach_uprobe(name=libpath, sym=self.function, fn_name=self.probe_func_name, pid=self.pid or -1) def _attach_k(self): if self.probe_type == "t": pass # Nothing to do for tracepoints elif self.probe_type == "r": self.bpf.attach_kretprobe(event=self.function, fn_name=self.probe_func_name) else: self.bpf.attach_kprobe(event=self.function, fn_name=self.probe_func_name) def attach(self, bpf): self.bpf = bpf if self.probe_type == "u": return if self.is_user: self._attach_u() else: self._attach_k() if self.entry_probe_required: self._attach_entry_probe() def _v2s(self, v): # Most fields can be converted with plain str(), but strings # are wrapped in a __string_t which has an .s field if "__string_t" in type(v).__name__: return str(v.s) return str(v) def _display_expr(self, i): # Replace ugly latency calculation with $latency expr = self.exprs[i].replace( "(bpf_ktime_get_ns() - *____latency_val)", "$latency") # Replace alias values back with the alias name for alias, subst in Probe.aliases.items(): expr = expr.replace(subst, alias) # Replace retval expression with $retval expr = expr.replace("PT_REGS_RC(ctx)", "$retval") # Replace ugly (*__param_val) expressions with param name return re.sub(r"\(\*__(\w+)_val\)", r"\1", expr) def _display_key(self, key): if self.is_default_expr: if not self.probe_type == "r": return "total calls" else: return "retval = %s" % str(key.v0) else: # The key object has v0, ..., vk fields containing # the values of the expressions from self.exprs def str_i(i): key_i = self._v2s(getattr(key, "v%d" % i)) return "%s = %s" % \ (self._display_expr(i), key_i) return ", ".join(map(str_i, range(0, len(self.exprs)))) def display(self, top): data = self.bpf.get_table(self.probe_hash_name) if self.type == "freq": print(self.label or self.raw_spec) print("\t%-10s %s" % ("COUNT", "EVENT")) sdata = sorted(data.items(), key=lambda p: p[1].value) if top is not None: sdata = sdata[-top:] for key, value in sdata: # Print some nice values if the user didn't # specify an expression to probe if self.is_default_expr: if not self.probe_type == "r": key_str = "total calls" else: key_str = "retval = %s" % \ self._v2s(key.v0) else: key_str = self._display_key(key) print("\t%-10s %s" % (str(value.value), key_str)) elif self.type == "hist": label = self.label or (self._display_expr(0) if not self.is_default_expr else "retval") data.print_log2_hist(val_type=label) if not self.cumulative: data.clear() def __str__(self): return self.label or self.raw_spec
class Probe(object): probe_count = 0 max_events = None event_count = 0 first_ts = 0 use_localtime = True pid = -1 @classmethod def configure(cls, args): cls.max_events = args.max_events cls.use_localtime = not args.offset cls.first_ts = Time.monotonic_time() cls.pid = args.pid or -1 def __init__(self, probe, string_size, kernel_stack, user_stack): self.usdt = None self.raw_probe = probe self.string_size = string_size self.kernel_stack = kernel_stack self.user_stack = user_stack Probe.probe_count += 1 self._parse_probe() self.probe_num = Probe.probe_count self.probe_name = "probe_%s_%d" % \ (self._display_function(), self.probe_num) def __str__(self): return "%s:%s:%s FLT=%s ACT=%s/%s" % (self.probe_type, self.library, self._display_function(), self.filter, self.types, self.values) def is_default_action(self): return self.python_format == "" def _bail(self, error): raise ValueError("error in probe '%s': %s" % (self.raw_probe, error)) def _parse_probe(self): text = self.raw_probe # Everything until the first space is the probe specifier first_space = text.find(' ') spec = text[:first_space] if first_space >= 0 else text self._parse_spec(spec) if first_space >= 0: text = text[first_space:].lstrip() else: text = "" # If we now have a (, wait for the balanced closing ) and that # will be the predicate self.filter = None if len(text) > 0 and text[0] == "(": balance = 1 for i in range(1, len(text)): if text[i] == "(": balance += 1 if text[i] == ")": balance -= 1 if balance == 0: self._parse_filter(text[:i+1]) text = text[i+1:] break if self.filter is None: self._bail("unmatched end of predicate") if self.filter is None: self.filter = "1" # The remainder of the text is the printf action self._parse_action(text.lstrip()) def _parse_spec(self, spec): parts = spec.split(":") # Two special cases: 'func' means 'p::func', 'lib:func' means # 'p:lib:func'. Other combinations need to provide an empty # value between delimiters, e.g. 'r::func' for a kretprobe on # the function func. if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] if len(parts[0]) == 0: self.probe_type = "p" elif parts[0] in ["p", "r", "t", "u"]: self.probe_type = parts[0] else: self._bail("probe type must be '', 'p', 't', 'r', " + "or 'u', but got '%s'" % parts[0]) if self.probe_type == "t": self.tp_category = parts[1] self.tp_event = parts[2] self.library = "" # kernel self.function = "" # generated from TRACEPOINT_PROBE elif self.probe_type == "u": self.library = parts[1] self.usdt_name = parts[2] self.function = "" # no function, just address # We will discover the USDT provider by matching on # the USDT name in the specified library self._find_usdt_probe() else: self.library = parts[1] self.function = parts[2] def _find_usdt_probe(self): self.usdt = USDT(path=self.library, pid=Probe.pid) for probe in self.usdt.enumerate_probes(): if probe.name == self.usdt_name: return # Found it, will enable later self._bail("unrecognized USDT probe %s" % self.usdt_name) def _parse_filter(self, filt): self.filter = self._replace_args(filt) def _parse_types(self, fmt): for match in re.finditer( r'[^%]%(s|u|d|llu|lld|hu|hd|x|llx|c|K|U)', fmt): self.types.append(match.group(1)) fmt = re.sub(r'([^%]%)(u|d|llu|lld|hu|hd)', r'\1d', fmt) fmt = re.sub(r'([^%]%)(x|llx)', r'\1x', fmt) fmt = re.sub('%K|%U', '%s', fmt) self.python_format = fmt.strip('"') def _parse_action(self, action): self.values = [] self.types = [] self.python_format = "" if len(action) == 0: return action = action.strip() match = re.search(r'(\".*\"),?(.*)', action) if match is None: self._bail("expected format string in \"s") self.raw_format = match.group(1) self._parse_types(self.raw_format) for part in match.group(2).split(','): part = self._replace_args(part) if len(part) > 0: self.values.append(part) aliases = { "retval": "PT_REGS_RC(ctx)", "arg1": "PT_REGS_PARM1(ctx)", "arg2": "PT_REGS_PARM2(ctx)", "arg3": "PT_REGS_PARM3(ctx)", "arg4": "PT_REGS_PARM4(ctx)", "arg5": "PT_REGS_PARM5(ctx)", "arg6": "PT_REGS_PARM6(ctx)", "$uid": "(unsigned)(bpf_get_current_uid_gid() & 0xffffffff)", "$gid": "(unsigned)(bpf_get_current_uid_gid() >> 32)", "$pid": "(unsigned)(bpf_get_current_pid_tgid() & 0xffffffff)", "$tgid": "(unsigned)(bpf_get_current_pid_tgid() >> 32)", "$cpu": "bpf_get_smp_processor_id()" } def _replace_args(self, expr): for alias, replacement in Probe.aliases.items(): # For USDT probes, we replace argN values with the # actual arguments for that probe obtained using special # bpf_readarg_N macros emitted at BPF construction. if alias.startswith("arg") and self.probe_type == "u": continue expr = expr.replace(alias, replacement) return expr p_type = { "u": ct.c_uint, "d": ct.c_int, "llu": ct.c_ulonglong, "lld": ct.c_longlong, "hu": ct.c_ushort, "hd": ct.c_short, "x": ct.c_uint, "llx": ct.c_ulonglong, "c": ct.c_ubyte, "K": ct.c_ulonglong, "U": ct.c_ulonglong } def _generate_python_field_decl(self, idx, fields): field_type = self.types[idx] if field_type == "s": ptype = ct.c_char * self.string_size else: ptype = Probe.p_type[field_type] fields.append(("v%d" % idx, ptype)) def _generate_python_data_decl(self): self.python_struct_name = "%s_%d_Data" % \ (self._display_function(), self.probe_num) fields = [ ("timestamp_ns", ct.c_ulonglong), ("pid", ct.c_uint), ("comm", ct.c_char * 16) # TASK_COMM_LEN ] for i in range(0, len(self.types)): self._generate_python_field_decl(i, fields) if self.kernel_stack: fields.append(("kernel_stack_id", ct.c_int)) if self.user_stack: fields.append(("user_stack_id", ct.c_int)) return type(self.python_struct_name, (ct.Structure,), dict(_fields_=fields)) c_type = { "u": "unsigned int", "d": "int", "llu": "unsigned long long", "lld": "long long", "hu": "unsigned short", "hd": "short", "x": "unsigned int", "llx": "unsigned long long", "c": "char", "K": "unsigned long long", "U": "unsigned long long" } fmt_types = c_type.keys() def _generate_field_decl(self, idx): field_type = self.types[idx] if field_type == "s": return "char v%d[%d];\n" % (idx, self.string_size) if field_type in Probe.fmt_types: return "%s v%d;\n" % (Probe.c_type[field_type], idx) self._bail("unrecognized format specifier %s" % field_type) def _generate_data_decl(self): # The BPF program will populate values into the struct # according to the format string, and the Python program will # construct the final display string. self.events_name = "%s_events" % self.probe_name self.struct_name = "%s_data_t" % self.probe_name self.stacks_name = "%s_stacks" % self.probe_name stack_table = "BPF_STACK_TRACE(%s, 1024);" % self.stacks_name \ if (self.kernel_stack or self.user_stack) else "" data_fields = "" for i, field_type in enumerate(self.types): data_fields += " " + \ self._generate_field_decl(i) kernel_stack_str = " int kernel_stack_id;" \ if self.kernel_stack else "" user_stack_str = " int user_stack_id;" \ if self.user_stack else "" text = """ struct %s { u64 timestamp_ns; u32 pid; char comm[TASK_COMM_LEN]; %s %s %s }; BPF_PERF_OUTPUT(%s); %s """ return text % (self.struct_name, data_fields, kernel_stack_str, user_stack_str, self.events_name, stack_table) def _generate_field_assign(self, idx): field_type = self.types[idx] expr = self.values[idx].strip() text = "" if self.probe_type == "u" and expr[0:3] == "arg": text = (" u64 %s = 0;\n" + " bpf_usdt_readarg(%s, ctx, &%s);\n") % \ (expr, expr[3], expr) if field_type == "s": return text + """ if (%s != 0) { bpf_probe_read(&__data.v%d, sizeof(__data.v%d), (void *)%s); } """ % (expr, idx, idx, expr) if field_type in Probe.fmt_types: return text + " __data.v%d = (%s)%s;\n" % \ (idx, Probe.c_type[field_type], expr) self._bail("unrecognized field type %s" % field_type) def _generate_usdt_filter_read(self): text = "" if self.probe_type == "u": for arg, _ in Probe.aliases.items(): if not (arg.startswith("arg") and (arg in self.filter)): continue arg_index = int(arg.replace("arg", "")) arg_ctype = self.usdt.get_probe_arg_ctype( self.usdt_name, arg_index) if not arg_ctype: self._bail("Unable to determine type of {} " "in the filter".format(arg)) text += """ {} {}_filter; bpf_usdt_readarg({}, ctx, &{}_filter); """.format(arg_ctype, arg, arg_index, arg) self.filter = self.filter.replace( arg, "{}_filter".format(arg)) return text def generate_program(self, include_self): data_decl = self._generate_data_decl() # kprobes don't have built-in pid filters, so we have to add # it to the function body: if len(self.library) == 0 and Probe.pid != -1: pid_filter = """ u32 __pid = bpf_get_current_pid_tgid(); if (__pid != %d) { return 0; } """ % Probe.pid elif not include_self: pid_filter = """ u32 __pid = bpf_get_current_pid_tgid(); if (__pid == %d) { return 0; } """ % os.getpid() else: pid_filter = "" prefix = "" signature = "struct pt_regs *ctx" data_fields = "" for i, expr in enumerate(self.values): data_fields += self._generate_field_assign(i) if self.probe_type == "t": heading = "TRACEPOINT_PROBE(%s, %s)" % \ (self.tp_category, self.tp_event) ctx_name = "args" else: heading = "int %s(%s)" % (self.probe_name, signature) ctx_name = "ctx" stack_trace = "" if self.user_stack: stack_trace += """ __data.user_stack_id = %s.get_stackid( %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK );""" % (self.stacks_name, ctx_name) if self.kernel_stack: stack_trace += """ __data.kernel_stack_id = %s.get_stackid( %s, BPF_F_REUSE_STACKID );""" % (self.stacks_name, ctx_name) text = heading + """ { %s %s %s if (!(%s)) return 0; struct %s __data = {0}; __data.timestamp_ns = bpf_ktime_get_ns(); __data.pid = bpf_get_current_pid_tgid(); bpf_get_current_comm(&__data.comm, sizeof(__data.comm)); %s %s %s.perf_submit(%s, &__data, sizeof(__data)); return 0; } """ text = text % (pid_filter, prefix, self._generate_usdt_filter_read(), self.filter, self.struct_name, data_fields, stack_trace, self.events_name, ctx_name) return data_decl + "\n" + text @classmethod def _time_off_str(cls, timestamp_ns): return "%.6f" % (1e-9 * (timestamp_ns - cls.first_ts)) def _display_function(self): if self.probe_type == 'p' or self.probe_type == 'r': return self.function elif self.probe_type == 'u': return self.usdt_name else: # self.probe_type == 't' return self.tp_event def print_stack(self, bpf, stack_id, pid): if stack_id < 0: print(" %d" % stack_id) return stack = list(bpf.get_table(self.stacks_name).walk(stack_id)) for addr in stack: print(" %016x %s" % (addr, bpf.sym(addr, pid))) def _format_message(self, bpf, pid, values): # Replace each %K with kernel sym and %U with user sym in pid kernel_placeholders = [i for i in xrange(0, len(self.types)) if self.types[i] == 'K'] user_placeholders = [i for i in xrange(0, len(self.types)) if self.types[i] == 'U'] for kp in kernel_placeholders: values[kp] = bpf.ksymaddr(values[kp]) for up in user_placeholders: values[up] = bpf.symaddr(values[up], pid) return self.python_format % tuple(values) def print_event(self, bpf, cpu, data, size): # Cast as the generated structure type and display # according to the format string in the probe. event = ct.cast(data, ct.POINTER(self.python_struct)).contents values = map(lambda i: getattr(event, "v%d" % i), range(0, len(self.values))) msg = self._format_message(bpf, event.pid, values) time = strftime("%H:%M:%S") if Probe.use_localtime else \ Probe._time_off_str(event.timestamp_ns) print("%-8s %-6d %-12s %-16s %s" % \ (time[:8], event.pid, event.comm[:12], self._display_function(), msg)) if self.user_stack: print(" User Stack Trace:") self.print_stack(bpf, event.user_stack_id, event.pid) if self.kernel_stack: print(" Kernel Stack Trace:") self.print_stack(bpf, event.kernel_stack_id, -1) if self.user_stack or self.kernel_stack: print("") Probe.event_count += 1 if Probe.max_events is not None and \ Probe.event_count >= Probe.max_events: exit() def attach(self, bpf, verbose): if len(self.library) == 0: self._attach_k(bpf) else: self._attach_u(bpf) self.python_struct = self._generate_python_data_decl() callback = partial(self.print_event, bpf) bpf[self.events_name].open_perf_buffer(callback) def _attach_k(self, bpf): if self.probe_type == "r": bpf.attach_kretprobe(event=self.function, fn_name=self.probe_name) elif self.probe_type == "p": bpf.attach_kprobe(event=self.function, fn_name=self.probe_name) # Note that tracepoints don't need an explicit attach def _attach_u(self, bpf): libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: self._bail("unable to find library %s" % self.library) if self.probe_type == "u": pass # Was already enabled by the BPF constructor elif self.probe_type == "r": bpf.attach_uretprobe(name=libpath, sym=self.function, fn_name=self.probe_name, pid=Probe.pid) else: bpf.attach_uprobe(name=libpath, sym=self.function, fn_name=self.probe_name, pid=Probe.pid)
if (delta >= """ + str(min_ns) + """) { // populate and emit data struct struct data_t data = {.pid = pid, .ts = sp->ts, .delta = delta}; bpf_probe_read(&data.query, sizeof(data.query), (void *)sp->query); events.perf_submit(ctx, &data, sizeof(data)); } start_tmp.delete(&pid); return 0; }; """ # enable USDT probe from given PID u = USDT(pid=pid) u.enable_probe(probe="query__start", fn_name="do_start") u.enable_probe(probe="query__done", fn_name="do_done") if debug: print(u.get_text()) print(bpf_text) # initialize BPF b = BPF(text=bpf_text, usdt_contexts=[u]) # header print("Tracing MySQL server queries for PID %d slower than %s ms..." % (pid, min_ms_text)) print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY")) class Data(ct.Structure):
from bcc import BPF, USDT bpf_source = """ #include <uapi/linux/ptrace.h> int trace_binary_exec(struct pt_regs *ctx) { u64 pid = bpf_get_current_pid_tgid(); bpf_trace_printk("New hello_usdt process running with PID: %d", pid); } """ usdt = USDT(path="./hello_usdt") usdt.enable_probe(probe="probe-main", fn_name="trace_binary_exec") bpf = BPF(text=bpf_source, usdt=usdt) bpf.trace_print()
def load(self): ctx_name = "ctx" stack_trace = "" if self.user_stack: stack_trace += """ key.user_stack_id = stack_traces.get_stackid( %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK );""" % (ctx_name) else: stack_trace += "key.user_stack_id = -1;" if self.kernel_stack: stack_trace += """ key.kernel_stack_id = stack_traces.get_stackid( %s, BPF_F_REUSE_STACKID );""" % (ctx_name) else: stack_trace += "key.kernel_stack_id = -1;" trace_count_text = """ int trace_count(void *ctx) { FILTER struct key_t key = {}; key.tgid = GET_TGID; STORE_COMM %s counts.increment(key); return 0; } """ trace_count_text = trace_count_text % (stack_trace) bpf_text = """#include <uapi/linux/ptrace.h> #include <linux/sched.h> struct key_t { // no pid (thread ID) so that we do not needlessly split this key u32 tgid; int kernel_stack_id; int user_stack_id; char name[TASK_COMM_LEN]; }; BPF_HASH(counts, struct key_t); BPF_STACK_TRACE(stack_traces, 1024); """ # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.is_kernel_probe() and self.pid: trace_count_text = trace_count_text.replace('FILTER', ('u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' + 'if (pid != %d) { return 0; }') % (self.pid)) else: trace_count_text = trace_count_text.replace('FILTER', '') # We need per-pid statistics when tracing a user-space process, because # the meaning of the symbols depends on the pid. We also need them if # per-pid statistics were requested with -P, or for user stacks. if self.per_pid or not self.is_kernel_probe() or self.user_stack: trace_count_text = trace_count_text.replace('GET_TGID', 'bpf_get_current_pid_tgid() >> 32') trace_count_text = trace_count_text.replace('STORE_COMM', 'bpf_get_current_comm(&key.name, sizeof(key.name));') else: # kernel stacks only. skip splitting on PID so these aggregate # together, and don't store the process name. trace_count_text = trace_count_text.replace( 'GET_TGID', '0xffffffff') trace_count_text = trace_count_text.replace('STORE_COMM', '') self.usdt = None if self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): # This hack is required because the bpf_usdt_readarg # functions generated need different function names for # each attached probe. If we just stick to trace_count, # we'd get multiple bpf_usdt_readarg helpers with the same # name when enabling more than one USDT probe. new_func = "trace_count_%d" % self.matched bpf_text += trace_count_text.replace( "trace_count", new_func) self.usdt.enable_probe(probe.name, new_func) self.matched += 1 if debug: print(self.usdt.get_text()) else: bpf_text += trace_count_text if debug: print(bpf_text) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else [])
operation_event.perf_submit(ctx, &end_data, sizeof(end_data)); return 0; } """ bpf_text = bpf_text.replace("FILTER_STRING", this_filter) if this_filter: bpf_text = bpf_text.replace( "FILTER", "if (!filter(start_data.input)) { return 0; }") else: bpf_text = bpf_text.replace("FILTER", "") # Create USDT context print("Attaching probes to pid %d" % this_pid) usdt_ctx = USDT(pid=this_pid) usdt_ctx.enable_probe(probe="operation_start", fn_name="trace_operation_start") usdt_ctx.enable_probe(probe="operation_end", fn_name="trace_operation_end") # Create BPF context, load BPF program bpf_ctx = BPF(text=bpf_text, usdt_contexts=[usdt_ctx], debug=debugLevel) # Define latency event and print function class OperationEventData(ct.Structure): _fields_ = [("operation_id", ct.c_ulonglong), ("input", ct.c_char * 64), ("output", ct.c_char * 64), ("start", ct.c_ulonglong), ("end", ct.c_ulonglong), ("duration", ct.c_ulonglong)] start = 0
class Probe(object): probe_count = 0 streq_index = 0 max_events = None event_count = 0 first_ts = 0 print_time = False use_localtime = True time_field = False print_cpu = False print_address = False tgid = -1 pid = -1 page_cnt = None build_id_enabled = False @classmethod def configure(cls, args): cls.max_events = args.max_events cls.print_time = args.timestamp or args.time cls.use_localtime = not args.timestamp cls.time_field = cls.print_time and (not cls.use_localtime) cls.print_cpu = args.print_cpu cls.print_address = args.address cls.first_ts = BPF.monotonic_time() cls.tgid = args.tgid or -1 cls.pid = args.pid or -1 cls.page_cnt = args.buffer_pages cls.bin_cmp = args.bin_cmp cls.build_id_enabled = args.sym_file_list is not None def __init__(self, probe, string_size, kernel_stack, user_stack): self.usdt = None self.streq_functions = "" self.raw_probe = probe self.string_size = string_size self.kernel_stack = kernel_stack self.user_stack = user_stack Probe.probe_count += 1 self._parse_probe() self.probe_num = Probe.probe_count self.probe_name = "probe_%s_%d" % \ (self._display_function(), self.probe_num) self.probe_name = re.sub(r'[^A-Za-z0-9_]', '_', self.probe_name) # compiler can generate proper codes for function # signatures with "syscall__" prefix if self.is_syscall_kprobe: self.probe_name = "syscall__" + self.probe_name[6:] def __str__(self): return "%s:%s:%s FLT=%s ACT=%s/%s" % (self.probe_type, self.library, self._display_function(), self.filter, self.types, self.values) def is_default_action(self): return self.python_format == "" def _bail(self, error): raise ValueError("error in probe '%s': %s" % (self.raw_probe, error)) def _parse_probe(self): text = self.raw_probe # There might be a function signature preceding the actual # filter/print part, or not. Find the probe specifier first -- # it ends with either a space or an open paren ( for the # function signature part. # opt. signature # probespec | rest # --------- ---------- -- (spec, sig, rest) = re.match(r'([^ \t\(]+)(\([^\(]*\))?(.*)', text).groups() self._parse_spec(spec) # Remove the parens self.signature = sig[1:-1] if sig else None if self.signature and self.probe_type in ['u', 't']: self._bail("USDT and tracepoint probes can't have " + "a function signature; use arg1, arg2, " + "... instead") text = rest.lstrip() # If we now have a (, wait for the balanced closing ) and that # will be the predicate self.filter = None if len(text) > 0 and text[0] == "(": balance = 1 for i in range(1, len(text)): if text[i] == "(": balance += 1 if text[i] == ")": balance -= 1 if balance == 0: self._parse_filter(text[:i + 1]) text = text[i + 1:] break if self.filter is None: self._bail("unmatched end of predicate") if self.filter is None: self.filter = "1" # The remainder of the text is the printf action self._parse_action(text.lstrip()) def _parse_spec(self, spec): parts = spec.split(":") # Two special cases: 'func' means 'p::func', 'lib:func' means # 'p:lib:func'. Other combinations need to provide an empty # value between delimiters, e.g. 'r::func' for a kretprobe on # the function func. if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] if len(parts[0]) == 0: self.probe_type = "p" elif parts[0] in ["p", "r", "t", "u"]: self.probe_type = parts[0] else: self._bail("probe type must be '', 'p', 't', 'r', " + "or 'u', but got '%s'" % parts[0]) if self.probe_type == "t": self.tp_category = parts[1] self.tp_event = parts[2] self.library = "" # kernel self.function = "" # from TRACEPOINT_PROBE elif self.probe_type == "u": self.library = ':'.join(parts[1:-1]) self.usdt_name = parts[-1] self.function = "" # no function, just address # We will discover the USDT provider by matching on # the USDT name in the specified library self._find_usdt_probe() else: self.library = ':'.join(parts[1:-1]) self.function = parts[-1] # only x64 syscalls needs checking, no other syscall wrapper yet. self.is_syscall_kprobe = False if self.probe_type == "p" and len(self.library) == 0 and \ self.function[:10] == "__x64_sys_": self.is_syscall_kprobe = True def _find_usdt_probe(self): target = Probe.pid if Probe.pid and Probe.pid != -1 \ else Probe.tgid self.usdt = USDT(path=self.library, pid=target) for probe in self.usdt.enumerate_probes(): if probe.name == self.usdt_name.encode('ascii'): return # Found it, will enable later self._bail("unrecognized USDT probe %s" % self.usdt_name) def _parse_filter(self, filt): self.filter = self._rewrite_expr(filt) def _parse_types(self, fmt): for match in re.finditer( r'[^%]%(s|u|d|lu|llu|ld|lld|hu|hd|x|lx|llx|c|K|U)', fmt): self.types.append(match.group(1)) fmt = re.sub(r'([^%]%)(u|d|lu|llu|ld|lld|hu|hd)', r'\1d', fmt) fmt = re.sub(r'([^%]%)(x|lx|llx)', r'\1x', fmt) fmt = re.sub('%K|%U', '%s', fmt) self.python_format = fmt.strip('"') def _parse_action(self, action): self.values = [] self.types = [] self.python_format = "" if len(action) == 0: return action = action.strip() match = re.search(r'(\".*?\"),?(.*)', action) if match is None: self._bail("expected format string in \"s") self.raw_format = match.group(1) self._parse_types(self.raw_format) for part in re.split('(?<!"),', match.group(2)): part = self._rewrite_expr(part) if len(part) > 0: self.values.append(part) aliases_arg = { "arg1": "PT_REGS_PARM1(ctx)", "arg2": "PT_REGS_PARM2(ctx)", "arg3": "PT_REGS_PARM3(ctx)", "arg4": "PT_REGS_PARM4(ctx)", "arg5": "PT_REGS_PARM5(ctx)", "arg6": "PT_REGS_PARM6(ctx)", } aliases_indarg = { "arg1": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM1(_ctx))); _val;})", "arg2": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM2(_ctx))); _val;})", "arg3": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM3(_ctx))); _val;})", "arg4": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM4(_ctx))); _val;})", "arg5": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM5(_ctx))); _val;})", "arg6": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM6(_ctx))); _val;})", } aliases_common = { "retval": "PT_REGS_RC(ctx)", "$uid": "(unsigned)(bpf_get_current_uid_gid() & 0xffffffff)", "$gid": "(unsigned)(bpf_get_current_uid_gid() >> 32)", "$pid": "(unsigned)(bpf_get_current_pid_tgid() & 0xffffffff)", "$tgid": "(unsigned)(bpf_get_current_pid_tgid() >> 32)", "$cpu": "bpf_get_smp_processor_id()", "$task" : "((struct task_struct *)bpf_get_current_task())" } def _generate_streq_function(self, string): fname = "streq_%d" % Probe.streq_index Probe.streq_index += 1 self.streq_functions += """ static inline bool %s(char const *ignored, uintptr_t str) { char needle[] = %s; char haystack[sizeof(needle)]; bpf_probe_read(&haystack, sizeof(haystack), (void *)str); for (int i = 0; i < sizeof(needle) - 1; ++i) { if (needle[i] != haystack[i]) { return false; } } return true; } """ % (fname, string) return fname def _rewrite_expr(self, expr): if self.is_syscall_kprobe: for alias, replacement in Probe.aliases_indarg.items(): expr = expr.replace(alias, replacement) else: for alias, replacement in Probe.aliases_arg.items(): # For USDT probes, we replace argN values with the # actual arguments for that probe obtained using # bpf_readarg_N macros emitted at BPF construction. if self.probe_type == "u": continue expr = expr.replace(alias, replacement) for alias, replacement in Probe.aliases_common.items(): expr = expr.replace(alias, replacement) if self.bin_cmp: STRCMP_RE = 'STRCMP\\(\"([^"]+)\\"' else: STRCMP_RE = 'STRCMP\\(("[^"]+\\")' matches = re.finditer(STRCMP_RE, expr) for match in matches: string = match.group(1) fname = self._generate_streq_function(string) expr = expr.replace("STRCMP", fname, 1) return expr p_type = {"u": ct.c_uint, "d": ct.c_int, "lu": ct.c_ulong, "ld": ct.c_long, "llu": ct.c_ulonglong, "lld": ct.c_longlong, "hu": ct.c_ushort, "hd": ct.c_short, "x": ct.c_uint, "lx": ct.c_ulong, "llx": ct.c_ulonglong, "c": ct.c_ubyte, "K": ct.c_ulonglong, "U": ct.c_ulonglong} def _generate_python_field_decl(self, idx, fields): field_type = self.types[idx] if field_type == "s": ptype = ct.c_char * self.string_size else: ptype = Probe.p_type[field_type] fields.append(("v%d" % idx, ptype)) def _generate_python_data_decl(self): self.python_struct_name = "%s_%d_Data" % \ (self._display_function(), self.probe_num) fields = [] if self.time_field: fields.append(("timestamp_ns", ct.c_ulonglong)) if self.print_cpu: fields.append(("cpu", ct.c_int)) fields.extend([ ("tgid", ct.c_uint), ("pid", ct.c_uint), ("comm", ct.c_char * 16) # TASK_COMM_LEN ]) for i in range(0, len(self.types)): self._generate_python_field_decl(i, fields) if self.kernel_stack: fields.append(("kernel_stack_id", ct.c_int)) if self.user_stack: fields.append(("user_stack_id", ct.c_int)) return type(self.python_struct_name, (ct.Structure,), dict(_fields_=fields)) c_type = {"u": "unsigned int", "d": "int", "lu": "unsigned long", "ld": "long", "llu": "unsigned long long", "lld": "long long", "hu": "unsigned short", "hd": "short", "x": "unsigned int", "lx": "unsigned long", "llx": "unsigned long long", "c": "char", "K": "unsigned long long", "U": "unsigned long long"} fmt_types = c_type.keys() def _generate_field_decl(self, idx): field_type = self.types[idx] if field_type == "s": return "char v%d[%d];\n" % (idx, self.string_size) if field_type in Probe.fmt_types: return "%s v%d;\n" % (Probe.c_type[field_type], idx) self._bail("unrecognized format specifier %s" % field_type) def _generate_data_decl(self): # The BPF program will populate values into the struct # according to the format string, and the Python program will # construct the final display string. self.events_name = "%s_events" % self.probe_name self.struct_name = "%s_data_t" % self.probe_name self.stacks_name = "%s_stacks" % self.probe_name stack_type = "BPF_STACK_TRACE" if self.build_id_enabled is False \ else "BPF_STACK_TRACE_BUILDID" stack_table = "%s(%s, 1024);" % (stack_type,self.stacks_name) \ if (self.kernel_stack or self.user_stack) else "" data_fields = "" for i, field_type in enumerate(self.types): data_fields += " " + \ self._generate_field_decl(i) time_str = "u64 timestamp_ns;" if self.time_field else "" cpu_str = "int cpu;" if self.print_cpu else "" kernel_stack_str = " int kernel_stack_id;" \ if self.kernel_stack else "" user_stack_str = " int user_stack_id;" \ if self.user_stack else "" text = """ struct %s { %s %s u32 tgid; u32 pid; char comm[TASK_COMM_LEN]; %s %s %s }; BPF_PERF_OUTPUT(%s); %s """ return text % (self.struct_name, time_str, cpu_str, data_fields, kernel_stack_str, user_stack_str, self.events_name, stack_table) def _generate_field_assign(self, idx): field_type = self.types[idx] expr = self.values[idx].strip() text = "" if self.probe_type == "u" and expr[0:3] == "arg": arg_index = int(expr[3]) arg_ctype = self.usdt.get_probe_arg_ctype( self.usdt_name, arg_index - 1) text = (" %s %s = 0;\n" + " bpf_usdt_readarg(%s, ctx, &%s);\n") \ % (arg_ctype, expr, expr[3], expr) if field_type == "s": return text + """ if (%s != 0) { void *__tmp = (void *)%s; bpf_probe_read(&__data.v%d, sizeof(__data.v%d), __tmp); } """ % (expr, expr, idx, idx) if field_type in Probe.fmt_types: return text + " __data.v%d = (%s)%s;\n" % \ (idx, Probe.c_type[field_type], expr) self._bail("unrecognized field type %s" % field_type) def _generate_usdt_filter_read(self): text = "" if self.probe_type != "u": return text for arg, _ in Probe.aliases_arg.items(): if not (arg in self.filter): continue arg_index = int(arg.replace("arg", "")) arg_ctype = self.usdt.get_probe_arg_ctype( self.usdt_name, arg_index - 1) if not arg_ctype: self._bail("Unable to determine type of {} " "in the filter".format(arg)) text += """ {} {}_filter; bpf_usdt_readarg({}, ctx, &{}_filter); """.format(arg_ctype, arg, arg_index, arg) self.filter = self.filter.replace( arg, "{}_filter".format(arg)) return text def generate_program(self, include_self): data_decl = self._generate_data_decl() if Probe.pid != -1: pid_filter = """ if (__pid != %d) { return 0; } """ % Probe.pid # uprobes can have a built-in tgid filter passed to # attach_uprobe, hence the check here -- for kprobes, we # need to do the tgid test by hand: elif len(self.library) == 0 and Probe.tgid != -1: pid_filter = """ if (__tgid != %d) { return 0; } """ % Probe.tgid elif not include_self: pid_filter = """ if (__tgid == %d) { return 0; } """ % os.getpid() else: pid_filter = "" prefix = "" signature = "struct pt_regs *ctx" if self.signature: signature += ", " + self.signature data_fields = "" for i, expr in enumerate(self.values): data_fields += self._generate_field_assign(i) if self.probe_type == "t": heading = "TRACEPOINT_PROBE(%s, %s)" % \ (self.tp_category, self.tp_event) ctx_name = "args" else: heading = "int %s(%s)" % (self.probe_name, signature) ctx_name = "ctx" time_str = """ __data.timestamp_ns = bpf_ktime_get_ns();""" if self.time_field else "" cpu_str = """ __data.cpu = bpf_get_smp_processor_id();""" if self.print_cpu else "" stack_trace = "" if self.user_stack: stack_trace += """ __data.user_stack_id = %s.get_stackid( %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK );""" % (self.stacks_name, ctx_name) if self.kernel_stack: stack_trace += """ __data.kernel_stack_id = %s.get_stackid( %s, BPF_F_REUSE_STACKID );""" % (self.stacks_name, ctx_name) text = heading + """ { u64 __pid_tgid = bpf_get_current_pid_tgid(); u32 __tgid = __pid_tgid >> 32; u32 __pid = __pid_tgid; // implicit cast to u32 for bottom half %s %s %s if (!(%s)) return 0; struct %s __data = {0}; %s %s __data.tgid = __tgid; __data.pid = __pid; bpf_get_current_comm(&__data.comm, sizeof(__data.comm)); %s %s %s.perf_submit(%s, &__data, sizeof(__data)); return 0; } """ text = text % (pid_filter, prefix, self._generate_usdt_filter_read(), self.filter, self.struct_name, time_str, cpu_str, data_fields, stack_trace, self.events_name, ctx_name) return self.streq_functions + data_decl + "\n" + text @classmethod def _time_off_str(cls, timestamp_ns): return "%.6f" % (1e-9 * (timestamp_ns - cls.first_ts)) def _display_function(self): if self.probe_type == 'p' or self.probe_type == 'r': return self.function elif self.probe_type == 'u': return self.usdt_name else: # self.probe_type == 't' return self.tp_event def print_stack(self, bpf, stack_id, tgid): if stack_id < 0: print(" %d" % stack_id) return stack = list(bpf.get_table(self.stacks_name).walk(stack_id)) for addr in stack: print(" ", end="") if Probe.print_address: print("%16x " % addr, end="") print("%s" % (bpf.sym(addr, tgid, show_module=True, show_offset=True))) def _format_message(self, bpf, tgid, values): # Replace each %K with kernel sym and %U with user sym in tgid kernel_placeholders = [i for i, t in enumerate(self.types) if t == 'K'] user_placeholders = [i for i, t in enumerate(self.types) if t == 'U'] for kp in kernel_placeholders: values[kp] = bpf.ksym(values[kp], show_offset=True) for up in user_placeholders: values[up] = bpf.sym(values[up], tgid, show_module=True, show_offset=True) return self.python_format % tuple(values) def print_event(self, bpf, cpu, data, size): # Cast as the generated structure type and display # according to the format string in the probe. event = ct.cast(data, ct.POINTER(self.python_struct)).contents values = map(lambda i: getattr(event, "v%d" % i), range(0, len(self.values))) msg = self._format_message(bpf, event.tgid, values) if Probe.print_time: time = strftime("%H:%M:%S") if Probe.use_localtime else \ Probe._time_off_str(event.timestamp_ns) print("%-8s " % time[:8], end="") if Probe.print_cpu: print("%-3s " % event.cpu, end="") print("%-7d %-7d %-15s %-16s %s" % (event.tgid, event.pid, event.comm.decode('utf-8', 'replace'), self._display_function(), msg)) if self.kernel_stack: self.print_stack(bpf, event.kernel_stack_id, -1) if self.user_stack: self.print_stack(bpf, event.user_stack_id, event.tgid) if self.user_stack or self.kernel_stack: print("") Probe.event_count += 1 if Probe.max_events is not None and \ Probe.event_count >= Probe.max_events: exit() def attach(self, bpf, verbose): if len(self.library) == 0: self._attach_k(bpf) else: self._attach_u(bpf) self.python_struct = self._generate_python_data_decl() callback = partial(self.print_event, bpf) bpf[self.events_name].open_perf_buffer(callback, page_cnt=self.page_cnt) def _attach_k(self, bpf): if self.probe_type == "r": bpf.attach_kretprobe(event=self.function, fn_name=self.probe_name) elif self.probe_type == "p": bpf.attach_kprobe(event=self.function, fn_name=self.probe_name) # Note that tracepoints don't need an explicit attach def _attach_u(self, bpf): libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: self._bail("unable to find library %s" % self.library) if self.probe_type == "u": pass # Was already enabled by the BPF constructor elif self.probe_type == "r": bpf.attach_uretprobe(name=libpath, sym=self.function, fn_name=self.probe_name, pid=Probe.tgid) else: bpf.attach_uprobe(name=libpath, sym=self.function, fn_name=self.probe_name, pid=Probe.tgid)
return 0; } """ bpf_text = bpf_text.replace("MAX_SAMPLE_SIZE", str(this_maxsamplesize)) bpf_text = bpf_text.replace("FILTER_STRING", this_filter) if this_filter: bpf_text = bpf_text.replace( "FILTER_STATEMENT", "if (!filter(start_data.input)) { return 0; }") else: bpf_text = bpf_text.replace("FILTER_STATEMENT", "") # Create USDT context print("lat_avg.py - Attaching probes to pid: %d; filter: %s" % (this_pid, this_filter)) usdt_ctx = USDT(pid=this_pid) if args.sdt: usdt_ctx.enable_probe(probe="usdt_sample_lib1_sdt:operation_start_sdt", fn_name="trace_operation_start") usdt_ctx.enable_probe(probe="usdt_sample_lib1_sdt:operation_end_sdt", fn_name="trace_operation_end") else: usdt_ctx.enable_probe(probe="usdt_sample_lib1:operation_start", fn_name="trace_operation_start") usdt_ctx.enable_probe(probe="usdt_sample_lib1:operation_end", fn_name="trace_operation_end") # Create BPF context, load BPF program bpf_ctx = BPF(text=bpf_text, usdt_contexts=[usdt_ctx], debug=debugLevel)