events.perf_submit(ctx, &data, sizeof(data)); } start_tmp.delete(&pid); return 0; }; """ # enable USDT probe from given PID u = USDT(pid=pid) u.enable_probe(probe="query__start", fn_name="do_start") u.enable_probe(probe="query__done", fn_name="do_done") if debug: print(u.get_text()) print(bpf_text) # initialize BPF b = BPF(text=bpf_text, usdt_contexts=[u]) # header print("Tracing MySQL server queries for PID %d slower than %s ms..." % (pid, min_ms_text)) print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY")) # process event start = 0 def print_event(cpu, data, size):
class Probe(object): def __init__(self, pattern, kernel_stack, user_stack, use_regex=False, pid=None, per_pid=False): """Init a new probe. Init the probe from the pattern provided by the user. The supported patterns mimic the 'trace' and 'argdist' tools, but are simpler because we don't have to distinguish between probes and retprobes. func -- probe a kernel function lib:func -- probe a user-space function in the library 'lib' p::func -- same thing as 'func' p:lib:func -- same thing as 'lib:func' t:cat:event -- probe a kernel tracepoint u:lib:probe -- probe a USDT tracepoint """ self.kernel_stack = kernel_stack self.user_stack = user_stack parts = pattern.split(':') if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] elif len(parts) == 3: if parts[0] == "t": parts = ["t", "", "%s:%s" % tuple(parts[1:])] if parts[0] not in ["p", "t", "u"]: raise Exception("Type must be 'p', 't', or 'u', but got %s" % parts[0]) else: raise Exception("Too many ':'-separated components in pattern %s" % pattern) (self.type, self.library, self.pattern) = parts if not use_regex: self.pattern = self.pattern.replace('*', '.*') self.pattern = '^' + self.pattern + '$' if (self.type == "p" and self.library) or self.type == "u": libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: raise Exception("unable to find library %s" % self.library) self.library = libpath self.pid = pid self.per_pid = per_pid self.matched = 0 def is_kernel_probe(self): return self.type == "t" or (self.type == "p" and self.library == "") def attach(self): if self.type == "p": if self.library: self.bpf.attach_uprobe(name=self.library, sym_re=self.pattern, fn_name="trace_count", pid=self.pid or -1) self.matched = self.bpf.num_open_uprobes() else: self.bpf.attach_kprobe(event_re=self.pattern, fn_name="trace_count") self.matched = self.bpf.num_open_kprobes() elif self.type == "t": self.bpf.attach_tracepoint(tp_re=self.pattern, fn_name="trace_count") self.matched = self.bpf.num_open_tracepoints() elif self.type == "u": pass # Nothing to do -- attach already happened in `load` if self.matched == 0: raise Exception("No functions matched by pattern %s" % self.pattern) def load(self): ctx_name = "ctx" stack_trace = "" if self.user_stack: stack_trace += """ key.user_stack_id = stack_traces.get_stackid( %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK );""" % (ctx_name) else: stack_trace += "key.user_stack_id = -1;" if self.kernel_stack: stack_trace += """ key.kernel_stack_id = stack_traces.get_stackid( %s, BPF_F_REUSE_STACKID );""" % (ctx_name) else: stack_trace += "key.kernel_stack_id = -1;" trace_count_text = """ int trace_count(void *ctx) { FILTER struct key_t key = {}; key.tgid = GET_TGID; STORE_COMM %s counts.increment(key); return 0; } """ trace_count_text = trace_count_text % (stack_trace) bpf_text = """#include <uapi/linux/ptrace.h> #include <linux/sched.h> struct key_t { // no pid (thread ID) so that we do not needlessly split this key u32 tgid; int kernel_stack_id; int user_stack_id; char name[TASK_COMM_LEN]; }; BPF_HASH(counts, struct key_t); BPF_STACK_TRACE(stack_traces, 1024); """ # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.is_kernel_probe() and self.pid: trace_count_text = trace_count_text.replace('FILTER', ('u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' + 'if (pid != %d) { return 0; }') % (self.pid)) else: trace_count_text = trace_count_text.replace('FILTER', '') # We need per-pid statistics when tracing a user-space process, because # the meaning of the symbols depends on the pid. We also need them if # per-pid statistics were requested with -P, or for user stacks. if self.per_pid or not self.is_kernel_probe() or self.user_stack: trace_count_text = trace_count_text.replace('GET_TGID', 'bpf_get_current_pid_tgid() >> 32') trace_count_text = trace_count_text.replace('STORE_COMM', 'bpf_get_current_comm(&key.name, sizeof(key.name));') else: # kernel stacks only. skip splitting on PID so these aggregate # together, and don't store the process name. trace_count_text = trace_count_text.replace( 'GET_TGID', '0xffffffff') trace_count_text = trace_count_text.replace('STORE_COMM', '') self.usdt = None if self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): # This hack is required because the bpf_usdt_readarg # functions generated need different function names for # each attached probe. If we just stick to trace_count, # we'd get multiple bpf_usdt_readarg helpers with the same # name when enabling more than one USDT probe. new_func = "trace_count_%d" % self.matched bpf_text += trace_count_text.replace( "trace_count", new_func) self.usdt.enable_probe(probe.name, new_func) self.matched += 1 if debug: print(self.usdt.get_text()) else: bpf_text += trace_count_text if debug: print(bpf_text) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else [])
class Probe(object): def __init__(self, pattern, use_regex=False, pid=None): """Init a new probe. Init the probe from the pattern provided by the user. The supported patterns mimic the 'trace' and 'argdist' tools, but are simpler because we don't have to distinguish between probes and retprobes. func -- probe a kernel function lib:func -- probe a user-space function in the library 'lib' /path:func -- probe a user-space function in binary '/path' p::func -- same thing as 'func' p:lib:func -- same thing as 'lib:func' t:cat:event -- probe a kernel tracepoint u:lib:probe -- probe a USDT tracepoint """ parts = pattern.split(':') if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] elif len(parts) == 3: if parts[0] == "t": parts = ["t", "", "%s:%s" % tuple(parts[1:])] if parts[0] not in ["p", "t", "u"]: raise Exception("Type must be 'p', 't', or 'u', but got %s" % parts[0]) else: raise Exception("Too many ':'-separated components in pattern %s" % pattern) (self.type, self.library, self.pattern) = parts if not use_regex: self.pattern = self.pattern.replace('*', '.*') self.pattern = '^' + self.pattern + '$' if (self.type == "p" and self.library) or self.type == "u": libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: raise Exception("unable to find library %s" % self.library) self.library = libpath self.pid = pid self.matched = 0 self.trace_functions = {} # map location number to function name def is_kernel_probe(self): return self.type == "t" or (self.type == "p" and self.library == "") def attach(self): if self.type == "p" and not self.library: for index, function in self.trace_functions.items(): self.bpf.attach_kprobe( event=function, fn_name="trace_count_%d" % index) elif self.type == "p" and self.library: for index, function in self.trace_functions.items(): self.bpf.attach_uprobe( name=self.library, sym=function, fn_name="trace_count_%d" % index, pid=self.pid or -1) elif self.type == "t": for index, function in self.trace_functions.items(): self.bpf.attach_tracepoint( tp=function, fn_name="trace_count_%d" % index) elif self.type == "u": pass # Nothing to do -- attach already happened in `load` def _add_function(self, template, probe_name): new_func = "trace_count_%d" % self.matched text = template.replace("PROBE_FUNCTION", new_func) text = text.replace("LOCATION", str(self.matched)) self.trace_functions[self.matched] = probe_name self.matched += 1 return text def _generate_functions(self, template): self.usdt = None text = "" if self.type == "p" and not self.library: functions = BPF.get_kprobe_functions(self.pattern) verify_limit(len(functions)) for function in functions: text += self._add_function(template, function) elif self.type == "p" and self.library: # uprobes are tricky because the same function may have multiple # addresses, and the same address may be mapped to multiple # functions. We aren't allowed to create more than one uprobe # per address, so track unique addresses and ignore functions that # map to an address that we've already seen. Also ignore functions # that may repeat multiple times with different addresses. addresses, functions = (set(), set()) functions_and_addresses = BPF.get_user_functions_and_addresses( self.library, self.pattern) verify_limit(len(functions_and_addresses)) for function, address in functions_and_addresses: if address in addresses or function in functions: continue addresses.add(address) functions.add(function) text += self._add_function(template, function) elif self.type == "t": tracepoints = BPF.get_tracepoints(self.pattern) verify_limit(len(tracepoints)) for tracepoint in tracepoints: text += self._add_function(template, tracepoint) elif self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) matches = [] for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): matches.append(probe.name) verify_limit(len(matches)) for match in matches: new_func = "trace_count_%d" % self.matched text += self._add_function(template, match) self.usdt.enable_probe(match, new_func) if debug: print(self.usdt.get_text()) return text def load(self): trace_count_text = """ int PROBE_FUNCTION(void *ctx) { FILTER int loc = LOCATION; u64 *val = counts.lookup(&loc); if (!val) { return 0; // Should never happen, # of locations is known } (*val)++; return 0; } """ bpf_text = """#include <uapi/linux/ptrace.h> BPF_ARRAY(counts, u64, NUMLOCATIONS); """ # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.pid: trace_count_text = trace_count_text.replace('FILTER', """u32 pid = bpf_get_current_pid_tgid() >> 32; if (pid != %d) { return 0; }""" % self.pid) else: trace_count_text = trace_count_text.replace('FILTER', '') bpf_text += self._generate_functions(trace_count_text) bpf_text = bpf_text.replace("NUMLOCATIONS", str(len(self.trace_functions))) if debug: print(bpf_text) if self.matched == 0: raise Exception("No functions matched by pattern %s" % self.pattern) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else []) self.clear() # Initialize all array items to zero def counts(self): return self.bpf["counts"] def clear(self): counts = self.bpf["counts"] for location, _ in list(self.trace_functions.items()): counts[counts.Key(location)] = counts.Leaf()
events.perf_submit(ctx, &data, sizeof(data)); } start_tmp.delete(&pid); return 0; }; """ # enable USDT probe from given PID u = USDT(pid=pid) u.enable_probe(probe="query__start", fn_name="do_start") u.enable_probe(probe="query__done", fn_name="do_done") if debug: print(u.get_text()) print(bpf_text) # initialize BPF b = BPF(text=bpf_text, usdt_contexts=[u]) # header print("Tracing MySQL server queries for PID %d slower than %s ms..." % (pid, min_ms_text)) print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY")) class Data(ct.Structure): _fields_ = [ ("pid", ct.c_ulonglong), ("ts", ct.c_ulonglong), ("delta", ct.c_ulonglong),
bpf_usdt_readarg(4, ctx, &native_id); bpf_probe_read(&te.name, sizeof(te.name), (void *)nameptr); te.runtime_id = id; te.native_id = native_id; __builtin_memcpy(&te.type, type, sizeof(te.type)); threads.perf_submit(ctx, &te, sizeof(te)); return 0; } """ program += template % ("trace_start", "start") program += template % ("trace_stop", "stop") usdt.enable_probe_or_bail("thread__start", "trace_start") usdt.enable_probe_or_bail("thread__stop", "trace_stop") if args.verbose: print(usdt.get_text()) print(program) bpf = BPF(text=program, usdt_contexts=[usdt]) print("Tracing thread events in process %d (language: %s)... Ctrl-C to quit." % (args.pid, args.language or "none")) print("%-8s %-16s %-8s %-30s" % ("TIME", "ID", "TYPE", "DESCRIPTION")) class ThreadEvent(ct.Structure): _fields_ = [ ("runtime_id", ct.c_ulonglong), ("native_id", ct.c_ulonglong), ("type", ct.c_char * 8), ("name", ct.c_char * 80), ]
probes.append(Probe("gc__start", "gc__done", "", end_save, lambda e: str.join(", ", [desc for desc, val in descs.items() if e.field1 & val != 0]))) else: print("No language detected; use -l to trace a language.") exit(1) for probe in probes: program += probe.generate() probe.attach() if args.verbose: print(usdt.get_text()) print(program) bpf = BPF(text=program, usdt_contexts=[usdt]) print("Tracing garbage collections in %s process %d... Ctrl-C to quit." % (language, args.pid)) time_col = "TIME (ms)" if args.milliseconds else "TIME (us)" print("%-8s %-8s %-40s" % ("START", time_col, "DESCRIPTION")) class GCEvent(ct.Structure): _fields_ = [ ("probe_index", ct.c_ulonglong), ("elapsed_ns", ct.c_ulonglong), ("field1", ct.c_ulonglong), ("field2", ct.c_ulonglong), ("field3", ct.c_ulonglong),
class Tool(object): examples = """ Probe specifier syntax: {p,r,t,u}:{[library],category}:function(signature)[:type[,type...]:expr[,expr...][:filter]][#label] Where: p,r,t,u -- probe at function entry, function exit, kernel tracepoint, or USDT probe in exit probes: can use $retval, $entry(param), $latency library -- the library that contains the function (leave empty for kernel functions) category -- the category of the kernel tracepoint (e.g. net, sched) function -- the function name to trace (or tracepoint name) signature -- the function's parameters, as in the C header type -- the type of the expression to collect (supports multiple) expr -- the expression to collect (supports multiple) filter -- the filter that is applied to collected values label -- the label for this probe in the resulting output EXAMPLES: argdist -H 'p::__kmalloc(u64 size):u64:size' Print a histogram of allocation sizes passed to kmalloc argdist -p 1005 -C 'p:c:malloc(size_t size):size_t:size:size==16' Print a frequency count of how many times process 1005 called malloc with an allocation size of 16 bytes argdist -C 'r:c:gets():char*:(char*)$retval#snooped strings' Snoop on all strings returned by gets() argdist -H 'r::__kmalloc(size_t size):u64:$latency/$entry(size)#ns per byte' Print a histogram of nanoseconds per byte from kmalloc allocations argdist -C 'p::__kmalloc(size_t size, gfp_t flags):size_t:size:flags&GFP_ATOMIC' Print frequency count of kmalloc allocation sizes that have GFP_ATOMIC argdist -p 1005 -C 'p:c:write(int fd):int:fd' -T 5 Print frequency counts of how many times writes were issued to a particular file descriptor number, in process 1005, but only show the top 5 busiest fds argdist -p 1005 -H 'r:c:read()' Print a histogram of results (sizes) returned by read() in process 1005 argdist -C 'r::__vfs_read():u32:$PID:$latency > 100000' Print frequency of reads by process where the latency was >0.1ms argdist -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000' Print a histogram of read sizes that were longer than 1ms argdist -H \\ 'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1' Print a histogram of buffer sizes passed to write() across all processes, where the file descriptor was 1 (STDOUT) argdist -C 'p:c:fork()#fork calls' Count fork() calls in libc across all processes Can also use funccount.py, which is easier and more flexible argdist -H 't:block:block_rq_complete():u32:tp.nr_sector' Print histogram of number of sectors in completing block I/O requests argdist -C 't:irq:irq_handler_entry():int:tp.irq' Aggregate interrupts by interrupt request (IRQ) argdist -C 'u:pthread:pthread_start():u64:arg2' -p 1337 Print frequency of function addresses used as a pthread start function, relying on the USDT pthread_start probe in process 1337 argdist -H \\ 'p:c:sleep(u32 seconds):u32:seconds' \\ 'p:c:nanosleep(struct timespec *req):long:req->tv_nsec' Print histograms of sleep() and nanosleep() parameter values argdist -p 2780 -z 120 \\ -C 'p:c:write(int fd, char* buf, size_t len):char*:buf:fd==1' Spy on writes to STDOUT performed by process 2780, up to a string size of 120 characters """ def __init__(self): parser = argparse.ArgumentParser(description="Trace a " + "function and display a summary of its parameter values.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=Tool.examples) parser.add_argument("-p", "--pid", type=int, help="id of the process to trace (optional)") parser.add_argument("-z", "--string-size", default=80, type=int, help="maximum string size to read from char* arguments") parser.add_argument("-i", "--interval", default=1, type=int, help="output interval, in seconds") parser.add_argument("-n", "--number", type=int, dest="count", help="number of outputs") parser.add_argument("-v", "--verbose", action="store_true", help="print resulting BPF program code before executing") parser.add_argument("-T", "--top", type=int, help="number of top results to show (not applicable to " + "histograms)") parser.add_argument("-H", "--histogram", nargs="*", dest="histspecifier", metavar="specifier", help="probe specifier to capture histogram of " + "(see examples below)") parser.add_argument("-C", "--count", nargs="*", dest="countspecifier", metavar="specifier", help="probe specifier to capture count of " + "(see examples below)") parser.add_argument("-I", "--include", nargs="*", metavar="header", help="additional header files to include in the BPF program") self.args = parser.parse_args() self.usdt_ctx = None def _create_probes(self): self.probes = [] for specifier in (self.args.countspecifier or []): self.probes.append(Probe(self, "freq", specifier)) for histspecifier in (self.args.histspecifier or []): self.probes.append(Probe(self, "hist", histspecifier)) if len(self.probes) == 0: print("at least one specifier is required") exit() def enable_usdt_probe(self, probe_name, fn_name): if not self.usdt_ctx: self.usdt_ctx = USDT(pid=self.args.pid) self.usdt_ctx.enable_probe(probe_name, fn_name) def _generate_program(self): bpf_source = """ struct __string_t { char s[%d]; }; #include <uapi/linux/ptrace.h> """ % self.args.string_size for include in (self.args.include or []): bpf_source += "#include <%s>\n" % include bpf_source += BPF.generate_auto_includes( map(lambda p: p.raw_spec, self.probes)) bpf_source += Tracepoint.generate_decl() bpf_source += Tracepoint.generate_entry_probe() for probe in self.probes: bpf_source += probe.generate_text() if self.args.verbose: if self.usdt_ctx: print(self.usdt_ctx.get_text()) print(bpf_source) self.bpf = BPF(text=bpf_source, usdt=self.usdt_ctx) def _attach(self): Tracepoint.attach(self.bpf) for probe in self.probes: probe.attach(self.bpf) if self.args.verbose: print("open uprobes: %s" % self.bpf.open_uprobes) print("open kprobes: %s" % self.bpf.open_kprobes) def _main_loop(self): count_so_far = 0 while True: try: sleep(self.args.interval) except KeyboardInterrupt: exit() print("[%s]" % strftime("%H:%M:%S")) for probe in self.probes: probe.display(self.args.top) count_so_far += 1 if self.args.count is not None and \ count_so_far >= self.args.count: exit() def _close_probes(self): for probe in self.probes: probe.close() if self.args.verbose: print("closed probe: " + str(probe)) def run(self): try: self._create_probes() self._generate_program() self._attach() self._main_loop() except: if self.args.verbose: traceback.print_exc() elif sys.exc_info()[0] is not SystemExit: print(sys.exc_info()[1]) self._close_probes()
class Probe(object): def __init__(self, pattern, kernel_stack, user_stack, use_regex=False, pid=None, per_pid=False, cpu=None): """Init a new probe. Init the probe from the pattern provided by the user. The supported patterns mimic the 'trace' and 'argdist' tools, but are simpler because we don't have to distinguish between probes and retprobes. func -- probe a kernel function lib:func -- probe a user-space function in the library 'lib' p::func -- same thing as 'func' p:lib:func -- same thing as 'lib:func' t:cat:event -- probe a kernel tracepoint u:lib:probe -- probe a USDT tracepoint """ self.kernel_stack = kernel_stack self.user_stack = user_stack parts = pattern.split(':') if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] elif len(parts) == 3: if parts[0] == "t": parts = ["t", "", "%s:%s" % tuple(parts[1:])] if parts[0] not in ["p", "t", "u"]: raise Exception("Type must be 'p', 't', or 'u', but got %s" % parts[0]) else: raise Exception("Too many ':'-separated components in pattern %s" % pattern) (self.type, self.library, self.pattern) = parts if not use_regex: self.pattern = self.pattern.replace('*', '.*') self.pattern = '^' + self.pattern + '$' if (self.type == "p" and self.library) or self.type == "u": libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: raise Exception("unable to find library %s" % self.library) self.library = libpath self.pid = pid self.per_pid = per_pid self.cpu = cpu self.matched = 0 def is_kernel_probe(self): return self.type == "t" or (self.type == "p" and self.library == "") def attach(self): if self.type == "p": if self.library: self.bpf.attach_uprobe(name=self.library, sym_re=self.pattern, fn_name="trace_count", pid=self.pid or -1) self.matched = self.bpf.num_open_uprobes() else: self.bpf.attach_kprobe(event_re=self.pattern, fn_name="trace_count") self.matched = self.bpf.num_open_kprobes() elif self.type == "t": self.bpf.attach_tracepoint(tp_re=self.pattern, fn_name="trace_count") self.matched = self.bpf.num_open_tracepoints() elif self.type == "u": pass # Nothing to do -- attach already happened in `load` if self.matched == 0: raise Exception("No functions matched by pattern %s" % self.pattern) def load(self): ctx_name = "ctx" stack_trace = "" if self.user_stack: stack_trace += """ key.user_stack_id = stack_traces.get_stackid( %s, BPF_F_USER_STACK );""" % (ctx_name) else: stack_trace += "key.user_stack_id = -1;" if self.kernel_stack: stack_trace += """ key.kernel_stack_id = stack_traces.get_stackid( %s, 0 );""" % (ctx_name) else: stack_trace += "key.kernel_stack_id = -1;" trace_count_text = """ int trace_count(void *ctx) { FILTER struct key_t key = {}; key.tgid = GET_TGID; STORE_COMM %s counts.increment(key); return 0; } """ trace_count_text = trace_count_text % (stack_trace) bpf_text = """#include <uapi/linux/ptrace.h> #include <linux/sched.h> struct key_t { // no pid (thread ID) so that we do not needlessly split this key u32 tgid; int kernel_stack_id; int user_stack_id; char name[TASK_COMM_LEN]; }; BPF_HASH(counts, struct key_t); BPF_STACK_TRACE(stack_traces, 1024); """ filter_text = [] # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.is_kernel_probe() and self.pid: filter_text.append( 'u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' + 'if (pid != %d) { return 0; }' % self.pid) if self.is_kernel_probe() and self.cpu: filter_text.append( 'struct task_struct *task; task = (struct task_struct*)bpf_get_current_task(); ' + 'if (task->cpu != %d) { return 0; }' % self.cpu) trace_count_text = trace_count_text.replace('FILTER', '\n '.join(filter_text)) # Do per-pid statistics iff -P is provided if self.per_pid: trace_count_text = trace_count_text.replace( 'GET_TGID', 'bpf_get_current_pid_tgid() >> 32') trace_count_text = trace_count_text.replace( 'STORE_COMM', 'bpf_get_current_comm(&key.name, sizeof(key.name));') else: # skip splitting on PID so these aggregate # together, and don't store the process name. trace_count_text = trace_count_text.replace( 'GET_TGID', '0xffffffff') trace_count_text = trace_count_text.replace('STORE_COMM', '') self.usdt = None if self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): # This hack is required because the bpf_usdt_readarg # functions generated need different function names for # each attached probe. If we just stick to trace_count, # we'd get multiple bpf_usdt_readarg helpers with the same # name when enabling more than one USDT probe. new_func = "trace_count_%d" % self.matched bpf_text += trace_count_text.replace( "trace_count", new_func) self.usdt.enable_probe(probe.name, new_func) self.matched += 1 if debug: print(self.usdt.get_text()) else: bpf_text += trace_count_text if debug: print(bpf_text) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else [])
class Probe(object): def __init__(self, pattern, use_regex=False, pid=None): """Init a new probe. Init the probe from the pattern provided by the user. The supported patterns mimic the 'trace' and 'argdist' tools, but are simpler because we don't have to distinguish between probes and retprobes. func -- probe a kernel function lib:func -- probe a user-space function in the library 'lib' /path:func -- probe a user-space function in binary '/path' p::func -- same thing as 'func' p:lib:func -- same thing as 'lib:func' t:cat:event -- probe a kernel tracepoint u:lib:probe -- probe a USDT tracepoint """ parts = pattern.split(':') if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] elif len(parts) == 3: if parts[0] == "t": parts = ["t", "", "%s:%s" % tuple(parts[1:])] if parts[0] not in ["p", "t", "u"]: raise Exception("Type must be 'p', 't', or 'u', but got %s" % parts[0]) else: raise Exception("Too many ':'-separated components in pattern %s" % pattern) (self.type, self.library, self.pattern) = parts if not use_regex: self.pattern = self.pattern.replace('*', '.*') self.pattern = '^' + self.pattern + '$' if (self.type == "p" and self.library) or self.type == "u": libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: raise Exception("unable to find library %s" % self.library) self.library = libpath self.pid = pid self.matched = 0 self.trace_functions = {} # map location number to function name def is_kernel_probe(self): return self.type == "t" or (self.type == "p" and self.library == "") def attach(self): if self.type == "p" and not self.library: for index, function in self.trace_functions.items(): self.bpf.attach_kprobe( event=function, fn_name="trace_count_%d" % index, pid=self.pid or -1) elif self.type == "p" and self.library: for index, function in self.trace_functions.items(): self.bpf.attach_uprobe( name=self.library, sym=function, fn_name="trace_count_%d" % index, pid=self.pid or -1) elif self.type == "t": for index, function in self.trace_functions.items(): self.bpf.attach_tracepoint( tp=function, fn_name="trace_count_%d" % index, pid=self.pid or -1) elif self.type == "u": pass # Nothing to do -- attach already happened in `load` def _add_function(self, template, probe_name): new_func = "trace_count_%d" % self.matched text = template.replace("PROBE_FUNCTION", new_func) text = text.replace("LOCATION", str(self.matched)) self.trace_functions[self.matched] = probe_name self.matched += 1 return text def _generate_functions(self, template): self.usdt = None text = "" if self.type == "p" and not self.library: functions = BPF.get_kprobe_functions(self.pattern) verify_limit(len(functions)) for function in functions: text += self._add_function(template, function) elif self.type == "p" and self.library: # uprobes are tricky because the same function may have multiple # addresses, and the same address may be mapped to multiple # functions. We aren't allowed to create more than one uprobe # per address, so track unique addresses and ignore functions that # map to an address that we've already seen. Also ignore functions # that may repeat multiple times with different addresses. addresses, functions = (set(), set()) functions_and_addresses = BPF.get_user_functions_and_addresses( self.library, self.pattern) verify_limit(len(functions_and_addresses)) for function, address in functions_and_addresses: if address in addresses or function in functions: continue addresses.add(address) functions.add(function) text += self._add_function(template, function) elif self.type == "t": tracepoints = BPF.get_tracepoints(self.pattern) verify_limit(len(tracepoints)) for tracepoint in tracepoints: text += self._add_function(template, tracepoint) elif self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) matches = [] for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): matches.append(probe.name) verify_limit(len(matches)) for match in matches: new_func = "trace_count_%d" % self.matched text += self._add_function(template, match) self.usdt.enable_probe(match, new_func) if debug: print(self.usdt.get_text()) return text def load(self): trace_count_text = """ int PROBE_FUNCTION(void *ctx) { FILTER int loc = LOCATION; u64 *val = counts.lookup(&loc); if (!val) { return 0; // Should never happen, # of locations is known } (*val)++; return 0; } """ bpf_text = """#include <uapi/linux/ptrace.h> BPF_ARRAY(counts, u64, NUMLOCATIONS); """ # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.pid: trace_count_text = trace_count_text.replace('FILTER', """u32 pid = bpf_get_current_pid_tgid() >> 32; if (pid != %d) { return 0; }""" % self.pid) else: trace_count_text = trace_count_text.replace('FILTER', '') bpf_text += self._generate_functions(trace_count_text) bpf_text = bpf_text.replace("NUMLOCATIONS", str(len(self.trace_functions))) if debug: print(bpf_text) if self.matched == 0: raise Exception("No functions matched by pattern %s" % self.pattern) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else []) self.clear() # Initialize all array items to zero def counts(self): return self.bpf["counts"] def clear(self): counts = self.bpf["counts"] for location, _ in list(self.trace_functions.items()): counts[counts.Key(location)] = counts.Leaf()
class Probe(object): def __init__(self, pattern, use_regex=False, pid=None, per_pid=False): """Init a new probe. Init the probe from the pattern provided by the user. The supported patterns mimic the 'trace' and 'argdist' tools, but are simpler because we don't have to distinguish between probes and retprobes. func -- probe a kernel function lib:func -- probe a user-space function in the library 'lib' p::func -- same thing as 'func' p:lib:func -- same thing as 'lib:func' t:cat:event -- probe a kernel tracepoint u:lib:probe -- probe a USDT tracepoint """ parts = pattern.split(':') if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] elif len(parts) == 3: if parts[0] == "t": parts = ["t", "", "%s:%s" % tuple(parts[1:])] if parts[0] not in ["p", "t", "u"]: raise Exception("Type must be 'p', 't', or 'u', but got %s" % parts[0]) else: raise Exception("Too many ':'-separated components in pattern %s" % pattern) (self.type, self.library, self.pattern) = parts if not use_regex: self.pattern = self.pattern.replace('*', '.*') self.pattern = '^' + self.pattern + '$' if (self.type == "p" and self.library) or self.type == "u": libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: raise Exception("unable to find library %s" % self.library) self.library = libpath self.pid = pid self.per_pid = per_pid self.matched = 0 def is_kernel_probe(self): return self.type == "t" or (self.type == "p" and self.library == "") def attach(self): if self.type == "p": if self.library: self.bpf.attach_uprobe(name=self.library, sym_re=self.pattern, fn_name="trace_count", pid=self.pid or -1) self.matched = self.bpf.num_open_uprobes() else: self.bpf.attach_kprobe(event_re=self.pattern, fn_name="trace_count", pid=self.pid or -1) self.matched = self.bpf.num_open_kprobes() elif self.type == "t": self.bpf.attach_tracepoint(tp_re=self.pattern, fn_name="trace_count", pid=self.pid or -1) self.matched = self.bpf.num_open_tracepoints() elif self.type == "u": pass # Nothing to do -- attach already happened in `load` if self.matched == 0: raise Exception("No functions matched by pattern %s" % self.pattern) def load(self): trace_count_text = """ int trace_count(void *ctx) { FILTER struct key_t key = {}; key.pid = GET_PID; key.stackid = stack_traces.get_stackid(ctx, STACK_FLAGS); u64 zero = 0; u64 *val = counts.lookup_or_init(&key, &zero); (*val)++; return 0; } """ bpf_text = """#include <uapi/linux/ptrace.h> struct key_t { u32 pid; int stackid; }; BPF_HASH(counts, struct key_t); BPF_STACK_TRACE(stack_traces, 1024); """ # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.is_kernel_probe() and self.pid: trace_count_text = trace_count_text.replace( 'FILTER', ('u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' + 'if (pid != %d) { return 0; }') % (self.pid)) else: trace_count_text = trace_count_text.replace('FILTER', '') # We need per-pid statistics when tracing a user-space process, because # the meaning of the symbols depends on the pid. We also need them if # per-pid statistics were requested with -P. if self.per_pid or not self.is_kernel_probe(): trace_count_text = trace_count_text.replace( 'GET_PID', 'bpf_get_current_pid_tgid() >> 32') else: trace_count_text = trace_count_text.replace( 'GET_PID', '0xffffffff') stack_flags = 'BPF_F_REUSE_STACKID' if not self.is_kernel_probe(): stack_flags += '| BPF_F_USER_STACK' # can't do both U *and* K trace_count_text = trace_count_text.replace('STACK_FLAGS', stack_flags) self.usdt = None if self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) for probe in self.usdt.enumerate_probes(): if not self.pid and (probe.bin_path != self.library): continue if re.match(self.pattern, probe.name): # This hack is required because the bpf_usdt_readarg # functions generated need different function names for # each attached probe. If we just stick to trace_count, # we'd get multiple bpf_usdt_readarg helpers with the same # name when enabling more than one USDT probe. new_func = "trace_count_%d" % self.matched bpf_text += trace_count_text.replace( "trace_count", new_func) self.usdt.enable_probe(probe.name, new_func) self.matched += 1 if debug: print(self.usdt.get_text()) else: bpf_text += trace_count_text if debug: print(bpf_text) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else [])
class Probe(): "Parse, load and attach BPF probes" def __init__(self, probe_spec, kernel_stack, user_stack, use_regex=False, pid=None, per_pid=False, cpu=None): self.kernel_stack = kernel_stack self.user_stack = user_stack if DEBUG: print(probe_spec) self._parse_spec(probe_spec) if (self.type == "p" and self.library) or self.type == "u": libpath = BPF.find_library(self.library) if libpath is None: # This might be an executable (e.g. 'bash') libpath = BPF.find_exe(self.library) if libpath is None or len(libpath) == 0: raise Exception("unable to find library %s" % self.library) self.library = str(libpath, 'ascii') self.pid = pid # FIXME: don't hardcode this here. self.per_pid = True self.cpu = cpu self.matched = 0 self.bpf = None self.usdt = None def _parse_spec(self, spec): parts = spec.split(":") # Two special cases: 'func' means 'p::func', 'lib:func' means # 'p:lib:func'. Other combinations need to provide an empty # value between delimiters, e.g. 'r::func' for a kretprobe on # the function func. if len(parts) == 1: parts = ["p", "", parts[0]] elif len(parts) == 2: parts = ["p", parts[0], parts[1]] if len(parts[0]) == 0: self.type = "p" elif parts[0] in ["p", "r", "t", "u"]: self.type = parts[0] else: raise Exception("probe type must be '', 'p', 't', 'r', " + "or 'u', but got '%s'" % parts[0]) if self.type == "u": # u:<library>[:<provider>]:<probe> where :<provider> is optional self.library = parts[1] self.pattern = ":".join(parts[2:]) else: self.library = ':'.join(parts[1:-1]) self.pattern = parts[-1] def load(self): ctx_name = "ctx" stack_trace = "" if self.user_stack: stack_trace += """ key.user_stack_id = stack_traces.get_stackid( %s, BPF_F_USER_STACK );""" % (ctx_name) else: stack_trace += "key.user_stack_id = -1;" if self.kernel_stack: stack_trace += """ key.kernel_stack_id = stack_traces.get_stackid( %s, 0 );""" % (ctx_name) else: stack_trace += "key.kernel_stack_id = -1;" trace_count_text = """ int trace_count(void *ctx) { FILTER struct key_t key = {}; key.tgid = GET_TGID; STORE_COMM %s counts.increment(key); return 0; } """ trace_count_text = trace_count_text % (stack_trace) bpf_text = """#include <uapi/linux/ptrace.h> #include <linux/sched.h> struct key_t { // no pid (thread ID) so that we do not needlessly split this key u32 tgid; int kernel_stack_id; int user_stack_id; char name[TASK_COMM_LEN]; }; BPF_HASH(counts, struct key_t); BPF_STACK_TRACE(stack_traces, 1024); """ filter_text = [] trace_count_text = trace_count_text.replace('FILTER', '\n '.join(filter_text)) # Do per-pid statistics iff -P is provided if self.per_pid: trace_count_text = trace_count_text.replace( 'GET_TGID', 'bpf_get_current_pid_tgid() >> 32') trace_count_text = trace_count_text.replace( 'STORE_COMM', 'bpf_get_current_comm(&key.name, sizeof(key.name));') else: # skip splitting on PID so these aggregate # together, and don't store the process name. trace_count_text = trace_count_text.replace( 'GET_TGID', '0xffffffff') trace_count_text = trace_count_text.replace('STORE_COMM', '') if self.type == "u": self.usdt = USDT(path=self.library, pid=self.pid) for probe in self.usdt.enumerate_probes(): if not self.pid and (str(probe.bin_path, 'ascii') != self.library): continue parts = self.pattern.split(":") if len(parts) == 1: provider_name = None usdt_name = parts[0].encode("ascii") else: provider_name = parts[0] usdt_name = parts[1] if (str(probe.name, 'ascii') == usdt_name and str(probe.provider, 'ascii') == provider_name): # This hack is required because the bpf_usdt_readarg # functions generated need different function names for # each attached probe. If we just stick to trace_count, # we'd get multiple bpf_usdt_readarg helpers with the same # name when enabling more than one USDT probe. new_func = "trace_count_%d" % self.matched bpf_text += trace_count_text.replace( "trace_count", new_func) self.usdt.enable_probe(str(probe.name, 'ascii'), new_func) self.matched += 1 if DEBUG: print(self.usdt.get_text()) else: bpf_text += trace_count_text if DEBUG: print(bpf_text) self.bpf = BPF(text=bpf_text, usdt_contexts=[self.usdt] if self.usdt else []) def attach(self): if self.type == "p": if self.library: self.bpf.attach_uprobe(name=self.library, sym_re=self.pattern, fn_name="trace_count", pid=self.pid or -1) self.matched = self.bpf.num_open_uprobes() else: self.bpf.attach_kprobe(event_re=self.pattern, fn_name="trace_count") self.matched = self.bpf.num_open_kprobes() elif self.type == "t": self.bpf.attach_tracepoint(tp_re=self.pattern, fn_name="trace_count") self.matched = self.bpf.num_open_tracepoints() elif self.type == "u": pass # Nothing to do -- attach already happened in `load` if self.matched == 0: raise Exception("No functions matched by pattern %s" % self.pattern)
class Tool(object): examples = """ Probe specifier syntax: {p,r,t,u}:{[library],category}:function(signature)[:type[,type...]:expr[,expr...][:filter]][#label] Where: p,r,t,u -- probe at function entry, function exit, kernel tracepoint, or USDT probe in exit probes: can use $retval, $entry(param), $latency library -- the library that contains the function (leave empty for kernel functions) category -- the category of the kernel tracepoint (e.g. net, sched) function -- the function name to trace (or tracepoint name) signature -- the function's parameters, as in the C header type -- the type of the expression to collect (supports multiple) expr -- the expression to collect (supports multiple) filter -- the filter that is applied to collected values label -- the label for this probe in the resulting output EXAMPLES: argdist -H 'p::__kmalloc(u64 size):u64:size' Print a histogram of allocation sizes passed to kmalloc argdist -p 1005 -C 'p:c:malloc(size_t size):size_t:size:size==16' Print a frequency count of how many times process 1005 called malloc with an allocation size of 16 bytes argdist -C 'r:c:gets():char*:(char*)$retval#snooped strings' Snoop on all strings returned by gets() argdist -H 'r::__kmalloc(size_t size):u64:$latency/$entry(size)#ns per byte' Print a histogram of nanoseconds per byte from kmalloc allocations argdist -C 'p::__kmalloc(size_t size, gfp_t flags):size_t:size:flags&GFP_ATOMIC' Print frequency count of kmalloc allocation sizes that have GFP_ATOMIC argdist -p 1005 -C 'p:c:write(int fd):int:fd' -T 5 Print frequency counts of how many times writes were issued to a particular file descriptor number, in process 1005, but only show the top 5 busiest fds argdist -p 1005 -H 'r:c:read()' Print a histogram of results (sizes) returned by read() in process 1005 argdist -C 'r::__vfs_read():u32:$PID:$latency > 100000' Print frequency of reads by process where the latency was >0.1ms argdist -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000' Print a histogram of read sizes that were longer than 1ms argdist -H \\ 'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1' Print a histogram of buffer sizes passed to write() across all processes, where the file descriptor was 1 (STDOUT) argdist -C 'p:c:fork()#fork calls' Count fork() calls in libc across all processes Can also use funccount.py, which is easier and more flexible argdist -H 't:block:block_rq_complete():u32:tp.nr_sector' Print histogram of number of sectors in completing block I/O requests argdist -C 't:irq:irq_handler_entry():int:tp.irq' Aggregate interrupts by interrupt request (IRQ) argdist -C 'u:pthread:pthread_start():u64:arg2' -p 1337 Print frequency of function addresses used as a pthread start function, relying on the USDT pthread_start probe in process 1337 argdist -H \\ 'p:c:sleep(u32 seconds):u32:seconds' \\ 'p:c:nanosleep(struct timespec *req):long:req->tv_nsec' Print histograms of sleep() and nanosleep() parameter values argdist -p 2780 -z 120 \\ -C 'p:c:write(int fd, char* buf, size_t len):char*:buf:fd==1' Spy on writes to STDOUT performed by process 2780, up to a string size of 120 characters """ def __init__(self): parser = argparse.ArgumentParser( description="Trace a " + "function and display a summary of its parameter values.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=Tool.examples) parser.add_argument("-p", "--pid", type=int, help="id of the process to trace (optional)") parser.add_argument( "-z", "--string-size", default=80, type=int, help="maximum string size to read from char* arguments") parser.add_argument("-i", "--interval", default=1, type=int, help="output interval, in seconds") parser.add_argument("-n", "--number", type=int, dest="count", help="number of outputs") parser.add_argument( "-v", "--verbose", action="store_true", help="print resulting BPF program code before executing") parser.add_argument( "-T", "--top", type=int, help="number of top results to show (not applicable to " + "histograms)") parser.add_argument("-H", "--histogram", nargs="*", dest="histspecifier", metavar="specifier", help="probe specifier to capture histogram of " + "(see examples below)") parser.add_argument("-C", "--count", nargs="*", dest="countspecifier", metavar="specifier", help="probe specifier to capture count of " + "(see examples below)") parser.add_argument( "-I", "--include", nargs="*", metavar="header", help="additional header files to include in the BPF program") self.args = parser.parse_args() self.usdt_ctx = None def _create_probes(self): self.probes = [] for specifier in (self.args.countspecifier or []): self.probes.append(Probe(self, "freq", specifier)) for histspecifier in (self.args.histspecifier or []): self.probes.append(Probe(self, "hist", histspecifier)) if len(self.probes) == 0: print("at least one specifier is required") exit() def enable_usdt_probe(self, probe_name, fn_name): if not self.usdt_ctx: self.usdt_ctx = USDT(pid=self.args.pid) self.usdt_ctx.enable_probe(probe_name, fn_name) def _generate_program(self): bpf_source = """ struct __string_t { char s[%d]; }; #include <uapi/linux/ptrace.h> """ % self.args.string_size for include in (self.args.include or []): bpf_source += "#include <%s>\n" % include bpf_source += BPF.generate_auto_includes( map(lambda p: p.raw_spec, self.probes)) bpf_source += Tracepoint.generate_decl() bpf_source += Tracepoint.generate_entry_probe() for probe in self.probes: bpf_source += probe.generate_text() if self.args.verbose: if self.usdt_ctx: print(self.usdt_ctx.get_text()) print(bpf_source) self.bpf = BPF(text=bpf_source, usdt=self.usdt_ctx) def _attach(self): Tracepoint.attach(self.bpf) for probe in self.probes: probe.attach(self.bpf) if self.args.verbose: print("open uprobes: %s" % self.bpf.open_uprobes) print("open kprobes: %s" % self.bpf.open_kprobes) def _main_loop(self): count_so_far = 0 while True: try: sleep(self.args.interval) except KeyboardInterrupt: exit() print("[%s]" % strftime("%H:%M:%S")) for probe in self.probes: probe.display(self.args.top) count_so_far += 1 if self.args.count is not None and \ count_so_far >= self.args.count: exit() def _close_probes(self): for probe in self.probes: probe.close() if self.args.verbose: print("closed probe: " + str(probe)) def run(self): try: self._create_probes() self._generate_program() self._attach() self._main_loop() except: if self.args.verbose: traceback.print_exc() elif sys.exc_info()[0] is not SystemExit: print(sys.exc_info()[1]) self._close_probes()