class TestKprobeMaxactive(TestCase): def setUp(self): self.b = BPF(text=b""" typedef struct { int idx; } Key; typedef struct { u64 val; } Val; BPF_HASH(stats, Key, Val, 3); int hello(void *ctx) { stats.lookup_or_init(&(Key){1}, &(Val){0})->val++; return 0; } int goodbye(void *ctx) { stats.lookup_or_init(&(Key){2}, &(Val){0})->val++; return 0; } """) self.b.attach_kprobe(event_re=self.b.get_syscall_prefix() + b"bpf", fn_name=b"hello") self.b.attach_kretprobe(event_re=self.b.get_syscall_prefix() + b"bpf", fn_name=b"goodbye", maxactive=128) def test_send1(self): k1 = self.b[b"stats"].Key(1) k2 = self.b[b"stats"].Key(2) self.assertTrue(self.b[b"stats"][k1].val >= 2) self.assertTrue(self.b[b"stats"][k2].val == 1)
class TestKprobeRgx(TestCase): def setUp(self): self.b = BPF(text=""" typedef struct { int idx; } Key; typedef struct { u64 val; } Val; BPF_TABLE("hash", Key, Val, stats, 3); int hello(void *ctx) { stats.lookup_or_init(&(Key){1}, &(Val){0})->val++; return 0; } int goodbye(void *ctx) { stats.lookup_or_init(&(Key){2}, &(Val){0})->val++; return 0; } """) self.b.attach_kprobe(event_re="^SyS_bp.*", fn_name="hello") self.b.attach_kretprobe(event_re="^SyS_bp.*", fn_name="goodbye") def test_send1(self): k1 = self.b["stats"].Key(1) k2 = self.b["stats"].Key(2) self.assertEqual(self.b["stats"][k1].val, self.b["stats"][k2].val + 1)
from bcc.utils import printb def ntoa(addr): ipaddr = b'' for n in range(0, 4): ipaddr = ipaddr + str(addr & 0xff).encode() if (n != 3): ipaddr = ipaddr + b'.' addr = addr >> 8 return ipaddr def print_event(cpu, data, size): event = bpf["events"].event(data) printb(b"%-6d %-3s %-16s %-16s %-16s %-16d" % (event.pid, event.nodename, event.comm, ntoa( event.saddr), ntoa(event.daddr), event.dport)) bpf = BPF(src_file="trace.c") bpf.attach_kprobe(event="tcp_v4_connect", fn_name="tcp_connect") bpf.attach_kretprobe(event="tcp_v4_connect", fn_name="tcp_connect_ret") bpf["events"].open_perf_buffer(print_event) print("%-6s %-3s %-16s %-16s %-16s %-16s" % ("PID", "CONTAINER-ID", "COMMAND", "SOURCE-IPADDR", "DESTINATION-IPADDR", "DPORT")) while 1: bpf.perf_buffer_poll()
bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)') if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # initialize BPF b = BPF(text=bpf_text) # I'd rather trace these via new_sync_read/new_sync_write (which used to be # do_sync_read/do_sync_write), but those became static. So trace these from # the parent functions, at the cost of more overhead, instead. # Ultimately, we should be using [V]FS tracepoints. b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry") b.attach_kretprobe(event="__vfs_read", fn_name="trace_read_return") try: b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry") b.attach_kretprobe(event="__vfs_write", fn_name="trace_write_return") except: # older kernels don't have __vfs_write so try vfs_write instead b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry") b.attach_kretprobe(event="vfs_write", fn_name="trace_write_return") TASK_COMM_LEN = 16 # linux/sched.h DNAME_INLINE_LEN = 32 # linux/dcache.h class Data(ct.Structure): _fields_ = [ ("mode", ct.c_int), ("pid", ct.c_uint),
events.perf_submit(ctx, &data, sizeof(data)); return 0; } """ bpf_text = bpf_text.replace("MAXARG", args.max_args) if args.ebpf: print(bpf_text) exit() # initialize BPF b = BPF(text=bpf_text) execve_fnname = b.get_syscall_fnname("execve") b.attach_kprobe(event=execve_fnname, fn_name="syscall__execve") b.attach_kretprobe(event=execve_fnname, fn_name="do_ret_sys_execve") # header if args.timestamp: print("%-8s" % ("TIME(s)"), end="") print("%-16s %-6s %-6s %3s %s" % ("PCOMM", "PID", "PPID", "RET", "ARGS")) class EventType(object): EVENT_ARG = 0 EVENT_RET = 1 start_ts = time.time() argv = defaultdict(list) # This is best-effort PPID matching. Short-lived processes may exit # before we get a chance to read the PPID.
event.ts_us, event.task, event.pid, type, event.size, event.offset, event.delta_us, event.file)) return print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"), event.task, event.pid, type, event.size, event.offset / 1024, float(event.delta_us) / 1000, event.file)) # initialize BPF b = BPF(text=bpf_text) # common file functions b.attach_kprobe(event="xfs_file_read_iter", fn_name="trace_rw_entry") b.attach_kprobe(event="xfs_file_write_iter", fn_name="trace_rw_entry") b.attach_kprobe(event="xfs_file_open", fn_name="trace_open_entry") b.attach_kprobe(event="xfs_file_fsync", fn_name="trace_fsync_entry") b.attach_kretprobe(event="xfs_file_read_iter", fn_name="trace_read_return") b.attach_kretprobe(event="xfs_file_write_iter", fn_name="trace_write_return") b.attach_kretprobe(event="xfs_file_open", fn_name="trace_open_return") b.attach_kretprobe(event="xfs_file_fsync", fn_name="trace_fsync_return") # header if (csv): print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") else: if min_ms == 0: print("Tracing XFS operations") else: print("Tracing XFS operations slower than %d ms" % min_ms) print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T", "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
b = BPF(text=bpf_text) # common file functions if BPF.get_kprobe_functions('zpl_iter'): b.attach_kprobe(event="zpl_iter_read", fn_name="trace_entry") b.attach_kprobe(event="zpl_iter_write", fn_name="trace_entry") elif BPF.get_kprobe_functions('zpl_aio'): b.attach_kprobe(event="zpl_aio_read", fn_name="trace_entry") b.attach_kprobe(event="zpl_aio_write", fn_name="trace_entry") else: b.attach_kprobe(event="zpl_read", fn_name="trace_entry") b.attach_kprobe(event="zpl_write", fn_name="trace_entry") b.attach_kprobe(event="zpl_open", fn_name="trace_entry") b.attach_kprobe(event="zpl_fsync", fn_name="trace_entry") if BPF.get_kprobe_functions('zpl_iter'): b.attach_kretprobe(event="zpl_iter_read", fn_name="trace_read_return") b.attach_kretprobe(event="zpl_iter_write", fn_name="trace_write_return") elif BPF.get_kprobe_functions('zpl_aio'): b.attach_kretprobe(event="zpl_aio_read", fn_name="trace_read_return") b.attach_kretprobe(event="zpl_aio_write", fn_name="trace_write_return") else: b.attach_kretprobe(event="zpl_read", fn_name="trace_read_return") b.attach_kretprobe(event="zpl_write", fn_name="trace_write_return") b.attach_kretprobe(event="zpl_open", fn_name="trace_open_return") b.attach_kretprobe(event="zpl_fsync", fn_name="trace_fsync_return") print("Tracing ZFS operation latency... Hit Ctrl-C to end.") # output exiting = 0 dist = b.get_table("dist")
interval = 5 count = -1 if len(argv) > 1: try: interval = int(argv[1]) if interval == 0: raise if len(argv) > 2: count = int(argv[2]) except: # also catches -h, --help usage() # load BPF program b = BPF(src_file = "vfsreadlat.c") b.attach_kprobe(event="vfs_read", fn_name="do_entry") b.attach_kretprobe(event="vfs_read", fn_name="do_return") # header print("Tracing... Hit Ctrl-C to end.") # output loop = 0 do_exit = 0 while (1): if count > 0: loop += 1 if loop > count: exit() try: sleep(interval) except KeyboardInterrupt:
} udata.t = udata.t1 - udata.t0; // Print only fsyncs that take 1 second or longer // if (udata.t < 1 * 1000 * 1000 * 1000) { // return; // } events.perf_submit(ctx, &udata, sizeof(udata)); }; """) b.attach_kprobe(event="jbd2_journal_lock_updates", fn_name="trace_jbd2_journal_lock_updates") b.attach_kretprobe(event="jbd2_journal_lock_updates", fn_name="trace_ret_jbd2_journal_lock_updates") class Data(ct.Structure): _fields_ = [ ("tid", ct.c_uint), ("comm", ct.c_char * 16), ("fd", ct.c_uint), ("t0", ct.c_ulonglong), ("t1", ct.c_ulonglong), ("t", ct.c_ulonglong), ("stack_id", ct.c_ulong), ] stacks = b["stacks"]
class EventMonitor: def __init__(self, args): self.start_off = 0 self.cur_off = 0 self.max_off = 0 self.end_off = 0 self.events = list() self.do_trace = True self.bpf = None self.cont_mode = args.container self.json = args.json self.ebpf = args.ebpf self.events_to_trace = args.events_to_trace def init_bpf(self): bpf_text = load_bpf_program().replace("MAXARG", str(MAX_ARGS)) if self.cont_mode: bpf_text = bpf_text.replace("CONTAINER_MODE", "1") else: bpf_text = bpf_text.replace("CONTAINER_MODE", "0") if self.ebpf: log.debug(bpf_text) exit() # initialize BPF self.bpf = BPF(text=bpf_text) # attaching kprobes sk, se = get_kprobes(self.events_to_trace) for syscall in sk: syscall_fnname = self.bpf.get_syscall_fnname(syscall) self.bpf.attach_kprobe(event=syscall_fnname, fn_name="trace_sys_" + syscall) self.bpf.attach_kretprobe(event=syscall_fnname, fn_name="trace_ret_sys_" + syscall) for sysevent in se: self.bpf.attach_kprobe(event=sysevent, fn_name="trace_" + sysevent) if not self.json: log.info("%-14s %-16s %-12s %-12s %-6s %-16s %-16s %-6s %-6s %-6s %-12s %s" % ( "TIME(s)", "UTS_NAME", "MNT_NS", "PID_NS", "UID", "EVENT", "COMM", "PID", "TID", "PPID", "RET", "ARGS")) def get_sockaddr_from_buf(self, buf): # handle buffer wrap # todo: parse all fields if self.cur_off >= self.max_off: self.cur_off = 0 c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_short)).contents self.cur_off = self.cur_off + 2 domain = c_val.value if domain in sock_domain: return sock_domain[domain] else: return str(domain) def get_int_from_buf(self, buf): # handle buffer wrap if self.cur_off >= self.max_off: self.cur_off = 0 c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_int)).contents self.cur_off = self.cur_off + 4 return c_val.value def get_uint_from_buf(self, buf): # handle buffer wrap if self.cur_off >= self.max_off: self.cur_off = 0 c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_uint)).contents self.cur_off = self.cur_off + 4 return c_val.value def get_ulong_from_buf(self, buf): # handle buffer wrap if self.cur_off >= self.max_off: self.cur_off = 0 c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_ulong)).contents self.cur_off = self.cur_off + 8 return c_val.value def get_pointer_from_buf(self, buf): # handle buffer wrap if self.cur_off >= self.max_off: self.cur_off = 0 c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_void_p)).contents self.cur_off = self.cur_off + 8 return hex(0 if c_val.value is None else c_val.value) def get_string_from_buf(self, buf): # handle buffer wrap if self.cur_off >= self.max_off: self.cur_off = 0 str_size = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_uint)).contents.value str_off = self.cur_off + 4 str_buf = buf[str_off:str_off + str_size] self.cur_off = self.cur_off + str_size + 4 try: ret_str = str(array.array('B', str_buf).tostring().decode("utf-8")) return ret_str except: return "" def get_argv_from_buf(self, buf, args): while True: # first, check if there are more args to parse if self.cur_off >= self.end_off: if self.end_off > self.start_off: return # reached args end elif self.cur_off < self.start_off: # wrapped buffer return # reached args end (wrapped buffer) args.append(self.get_string_from_buf(buf)) # process event def print_event(self, cpu, data, size): event = self.bpf["events"].event(data) self.start_off = event.start_off self.cur_off = event.start_off self.max_off = event.max_off self.end_off = event.end_off event_buf = self.bpf["submission_buf"][cpu].buf context = ctypes.cast(ctypes.byref(event_buf, self.cur_off), ctypes.POINTER(context_t)).contents self.cur_off += ctypes.sizeof(context_t) pid = context.pid tid = context.tid ppid = context.ppid args = list() if context.eventid == EventId.SYS_EXECVE: eventname = "execve" event_type = self.get_int_from_buf(event_buf) if event_type == EventType.EVENT_ARG: self.get_argv_from_buf(event_buf, args) # argv # EVENT_RET will happen only when exec failed - print context below elif context.eventid == EventId.SYS_EXECVEAT: eventname = "execveat" event_type = self.get_int_from_buf(event_buf) if event_type == EventType.EVENT_ARG: args.append(str(self.get_int_from_buf(event_buf))) # dirfd self.get_argv_from_buf(event_buf, args) # argv flags = self.get_int_from_buf(event_buf) args.append(execveat_flags_to_str(flags)) # flags # EVENT_RET will happen only when exec failed - print context below elif context.eventid == EventId.SYS_OPEN: eventname = "open" args.append(self.get_string_from_buf(event_buf)) # filename args.append(open_flags_to_str(self.get_int_from_buf(event_buf))) # flags elif context.eventid == EventId.SYS_OPENAT: eventname = "openat" args.append(str(self.get_int_from_buf(event_buf))) # dirfd args.append(self.get_string_from_buf(event_buf)) # filename args.append(open_flags_to_str(self.get_int_from_buf(event_buf))) # flags elif context.eventid == EventId.SYS_CREAT: eventname = "creat" args.append(self.get_string_from_buf(event_buf)) # pathname args.append(str(self.get_uint_from_buf(event_buf))) # mode elif context.eventid == EventId.SYS_MEMFD_CREATE: eventname = "memfd_create" args.append(self.get_string_from_buf(event_buf)) # name args.append(str(self.get_uint_from_buf(event_buf))) # flags elif context.eventid == EventId.CAP_CAPABLE: eventname = "cap_capable" capability = self.get_int_from_buf(event_buf) if capability in capabilities: args.append(capabilities[capability]) else: args.append(str(capability)) elif context.eventid == EventId.SYS_MMAP: eventname = "mmap" args.append(str(self.get_pointer_from_buf(event_buf))) # addr args.append(str(self.get_ulong_from_buf(event_buf))) # length args.append(str(self.get_int_from_buf(event_buf))) # prot args.append(str(self.get_int_from_buf(event_buf))) # flags args.append(str(self.get_int_from_buf(event_buf))) # fd args.append(str(self.get_ulong_from_buf(event_buf))) # offset elif context.eventid == EventId.SYS_MKNOD: eventname = "mknod" args.append(self.get_string_from_buf(event_buf)) # pathname args.append(str(self.get_uint_from_buf(event_buf))) # mode args.append(str(self.get_uint_from_buf(event_buf))) # dev elif context.eventid == EventId.SYS_MKNOD: eventname = "mknodat" args.append(str(self.get_int_from_buf(event_buf))) # dirfd args.append(self.get_string_from_buf(event_buf)) # pathname args.append(str(self.get_uint_from_buf(event_buf))) # mode args.append(str(self.get_uint_from_buf(event_buf))) # dev elif context.eventid == EventId.SYS_DUP: eventname = "dup" args.append(str(self.get_int_from_buf(event_buf))) # oldfd elif context.eventid == EventId.SYS_DUP2: eventname = "dup2" args.append(str(self.get_int_from_buf(event_buf))) # oldfd args.append(str(self.get_int_from_buf(event_buf))) # newfd elif context.eventid == EventId.SYS_DUP3: eventname = "dup3" args.append(str(self.get_int_from_buf(event_buf))) # oldfd args.append(str(self.get_int_from_buf(event_buf))) # newfd args.append(str(self.get_int_from_buf(event_buf))) # flags elif context.eventid == EventId.SYS_CLOSE: eventname = "close" args.append(str(self.get_uint_from_buf(event_buf))) # fd elif context.eventid == EventId.SYS_IOCTL: eventname = "ioctl" args.append(str(self.get_uint_from_buf(event_buf))) # fd args.append(str(self.get_ulong_from_buf(event_buf))) # request elif context.eventid == EventId.SYS_KILL: eventname = "kill" args.append(str(self.get_int_from_buf(event_buf))) # pid args.append(str(self.get_int_from_buf(event_buf))) # sig elif context.eventid == EventId.SYS_LISTEN: eventname = "listen" args.append(str(self.get_int_from_buf(event_buf))) # sockfd args.append(str(self.get_int_from_buf(event_buf))) # backlog elif context.eventid == EventId.SYS_CONNECT: eventname = "connect" args.append(str(self.get_int_from_buf(event_buf))) # sockfd args.append(self.get_sockaddr_from_buf(event_buf)) # sockaddr (partialy parsed to family) elif context.eventid == EventId.SYS_ACCEPT: eventname = "accept" args.append(str(self.get_int_from_buf(event_buf))) # sockfd args.append(self.get_sockaddr_from_buf(event_buf)) # sockaddr (partialy parsed to family) elif context.eventid == EventId.SYS_ACCEPT4: eventname = "accept4" args.append(str(self.get_int_from_buf(event_buf))) # sockfd args.append(self.get_sockaddr_from_buf(event_buf)) # sockaddr (partialy parsed to family) elif context.eventid == EventId.SYS_BIND: eventname = "bind" args.append(str(self.get_int_from_buf(event_buf))) # sockfd args.append(self.get_sockaddr_from_buf(event_buf)) # sockaddr (partialy parsed to family) elif context.eventid == EventId.SYS_GETSOCKNAME: eventname = "getsockname" args.append(str(self.get_int_from_buf(event_buf))) # sockfd args.append(self.get_sockaddr_from_buf(event_buf)) # sockaddr (partialy parsed to family) elif context.eventid == EventId.SYS_ACCESS: eventname = "access" args.append(self.get_string_from_buf(event_buf)) # pathname args.append(str(self.get_int_from_buf(event_buf))) # mode elif context.eventid == EventId.SYS_FACCESSAT: eventname = "faccessat" args.append(str(self.get_int_from_buf(event_buf))) # dirfd args.append(self.get_string_from_buf(event_buf)) # pathname args.append(str(self.get_int_from_buf(event_buf))) # mode args.append(str(self.get_int_from_buf(event_buf))) # flags elif context.eventid == EventId.SYS_SOCKET: eventname = "socket" domain = self.get_int_from_buf(event_buf) # domain if domain in sock_domain: args.append(sock_domain[domain]) else: args.append(str(domain)) _type = self.get_int_from_buf(event_buf) # type type_str = "" s_type = _type & 0xf if s_type in sock_type: type_str = sock_type[s_type] else: type_str = str(s_type) if _type & 0o00004000: type_str += "|SOCK_NONBLOCK" if _type & 0o02000000: type_str += "|SOCK_CLOEXEC" args.append(type_str) args.append(str(self.get_int_from_buf(event_buf))) # protocol elif context.eventid == EventId.SYS_MPROTECT: eventname = "mprotect" args.append(str(self.get_pointer_from_buf(event_buf))) # addr args.append(str(self.get_ulong_from_buf(event_buf))) # length args.append(str(self.get_int_from_buf(event_buf))) # prot elif context.eventid == EventId.SYS_STAT: eventname = "stat" args.append(self.get_string_from_buf(event_buf)) # path elif context.eventid == EventId.SYS_FSTAT: eventname = "fstat" args.append(str(self.get_uint_from_buf(event_buf))) # fd elif context.eventid == EventId.SYS_LSTAT: eventname = "lstat" args.append(self.get_string_from_buf(event_buf)) # path elif context.eventid == EventId.SYS_PRCTL: eventname = "prctl" args.append(str(self.get_uint_from_buf(event_buf))) # option args.append(str(self.get_ulong_from_buf(event_buf))) # arg2 args.append(str(self.get_ulong_from_buf(event_buf))) # arg3 args.append(str(self.get_ulong_from_buf(event_buf))) # arg4 args.append(str(self.get_ulong_from_buf(event_buf))) # arg5 elif context.eventid == EventId.SYS_PTRACE: eventname = "ptrace" args.append(str(self.get_uint_from_buf(event_buf))) # request args.append(str(self.get_int_from_buf(event_buf))) # pid args.append(str(self.get_pointer_from_buf(event_buf))) # addr args.append(str(self.get_pointer_from_buf(event_buf))) # data elif context.eventid == EventId.SYS_PROCESS_VM_WRITEV: eventname = "process_vm_writev" args.append(str(self.get_int_from_buf(event_buf))) # pid args.append(str(self.get_pointer_from_buf(event_buf))) # local_iov args.append(str(self.get_ulong_from_buf(event_buf))) # liovcnt args.append(str(self.get_pointer_from_buf(event_buf))) # remote_iov args.append(str(self.get_ulong_from_buf(event_buf))) # riovcnt args.append(str(self.get_ulong_from_buf(event_buf))) # flags elif context.eventid == EventId.SYS_PROCESS_VM_READV: eventname = "process_vm_readv" args.append(str(self.get_int_from_buf(event_buf))) # pid args.append(str(self.get_pointer_from_buf(event_buf))) # local_iov args.append(str(self.get_ulong_from_buf(event_buf))) # liovcnt args.append(str(self.get_pointer_from_buf(event_buf))) # remote_iov args.append(str(self.get_ulong_from_buf(event_buf))) # riovcnt args.append(str(self.get_ulong_from_buf(event_buf))) # flags elif context.eventid == EventId.SYS_INIT_MODULE: eventname = "init_module" args.append(str(self.get_pointer_from_buf(event_buf))) # module_image args.append(str(self.get_ulong_from_buf(event_buf))) # len args.append(self.get_string_from_buf(event_buf)) # param_values elif context.eventid == EventId.SYS_FINIT_MODULE: eventname = "finit_module" args.append(str(self.get_int_from_buf(event_buf))) # fd args.append(self.get_string_from_buf(event_buf)) # param_values args.append(str(self.get_int_from_buf(event_buf))) # flags elif context.eventid == EventId.SYS_DELETE_MODULE: eventname = "delete_module" args.append(self.get_string_from_buf(event_buf)) # name args.append(str(self.get_int_from_buf(event_buf))) # flags elif context.eventid == EventId.SYS_SYMLINK: eventname = "symlink" args.append(self.get_string_from_buf(event_buf)) # target args.append(self.get_string_from_buf(event_buf)) # linkpath elif context.eventid == EventId.SYS_SYMLINKAT: eventname = "symlinkat" args.append(self.get_string_from_buf(event_buf)) # target args.append(str(self.get_int_from_buf(event_buf))) # newdirfd args.append(self.get_string_from_buf(event_buf)) # linkpath elif context.eventid == EventId.SYS_GETDENTS: eventname = "getdents" args.append(str(self.get_uint_from_buf(event_buf))) # fd elif context.eventid == EventId.SYS_GETDENTS64: eventname = "getdents64" args.append(str(self.get_uint_from_buf(event_buf))) # fd elif context.eventid == EventId.SYS_CLONE: eventname = "clone" elif context.eventid == EventId.SYS_FORK: eventname = "fork" elif context.eventid == EventId.SYS_VFORK: eventname = "vfork" elif context.eventid == EventId.DO_EXIT: eventname = "do_exit" else: return try: comm = context.comm.decode("utf-8") uts_name = context.uts_name.decode("utf-8") except: return if eventname in self.events_to_trace: if not self.json: log.info("%-14f %-16s %-12d %-12d %-6d %-16s %-16s %-6d %-6d %-6d %-12d %s" % ( context.ts / 1000000.0, uts_name, context.mnt_id, context.pid_id, context.uid, eventname, comm, pid, tid, ppid, context.retval, " ".join(args))) else: # prepare data to be consumed by ultrabox data = dict() data["status"] = [0] data["raw"] = "" data["type"] = ["apicall"] data["time"] = context.ts / 1000000.0 data["mnt_ns"] = context.mnt_id data["pid_ns"] = context.pid_id data["uid"] = context.uid data["api"] = eventname data["uts_name"] = uts_name data["process_name"] = comm data["pid"] = pid data["tid"] = tid data["ppid"] = ppid data["return_value"] = context.retval dict_args = dict() args_len = len(args) for i in range(args_len): dict_args["p" + str(i)] = args[i].rstrip('\0') data["arguments"] = dict_args log.info(json.dumps(data)) self.events.append(data) # if eventname == "do_exit" and pid == 1: # log.info(json.dumps(events, indent=4)) # exit() def stop_trace(self): self.do_trace = False def get_events(self): return self.events def monitor_events(self): # loop with callback to print_event self.bpf["events"].open_perf_buffer(self.print_event) while self.do_trace: try: self.bpf.perf_buffer_poll(1000) except KeyboardInterrupt: exit()
bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) else: bpf_text = bpf_text.replace('FILTER_PID', '0') if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # load BPF program b = BPF(text=bpf_text) b.attach_kprobe(event=ext4_read_fn, fn_name=ext4_trace_read_fn) b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_entry") b.attach_kprobe(event="ext4_file_open", fn_name="trace_entry") b.attach_kprobe(event="ext4_sync_file", fn_name="trace_entry") b.attach_kretprobe(event=ext4_read_fn, fn_name='trace_read_return') b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return") b.attach_kretprobe(event="ext4_file_open", fn_name="trace_open_return") b.attach_kretprobe(event="ext4_sync_file", fn_name="trace_fsync_return") print("Tracing ext4 operation latency... Hit Ctrl-C to end.") # output exiting = 0 dist = b.get_table("dist") while (1): try: if args.interval: sleep(int(args.interval)) else: sleep(99999999)
args = parser.parse_args() # load BPF program current_pid = os.getpid() # Generate program prog = prog_header for lock in locks: prog += lock_func.replace("_ID_", str(lock['id'])).replace("_NAME_", lock['name']) prog = prog.replace("CUR_PID", str(current_pid)) b = BPF(text=prog) for lock in locks: b.attach_kprobe(event=lock['lock_func'], fn_name="lock_%s" % lock['name']) b.attach_kretprobe(event=lock['lock_func'], fn_name="release_%s" % lock['name']) events = {} # header # print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "LOCKTIME")) print("Tracing locks for %d seconds" % args.time) # process event start = 0 # loop with callback to print_event for lock in locks: b[lock['name']].open_perf_buffer(print_event, page_cnt=4096) start_time = datetime.datetime.now() try:
if args.tid: # TID trumps PID program = program.replace('FILTER', 'if (tid != %s) { return 0; }' % args.tid) elif args.pid: program = program.replace('FILTER', 'if (pid != %s) { return 0; }' % args.pid) else: program = program.replace('FILTER', '') program = program.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) b = BPF(text=program) b.attach_kprobe(event="mutex_unlock", fn_name="mutex_unlock_enter") b.attach_kretprobe(event="mutex_lock", fn_name="mutex_lock_return") b.attach_kprobe(event="mutex_lock", fn_name="mutex_lock_enter") enabled = b.get_table("enabled"); stack_traces = b.get_table("stack_traces") aq_counts = b.get_table("aq_report_count") aq_maxs = b.get_table("aq_report_max") aq_totals = b.get_table("aq_report_total") hl_counts = b.get_table("hl_report_count") hl_maxs = b.get_table("hl_report_max") hl_totals = b.get_table("hl_report_total") aq_sort = sort_list(aq_maxs, aq_totals, aq_counts) hl_sort = sort_list(hl_maxs, hl_totals, hl_counts)
event.ts_us, event.task, event.pid, type, event.size, event.offset, event.delta_us, event.file)) return print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"), event.task, event.pid, type, event.size, event.offset / 1024, float(event.delta_us) / 1000, event.file)) # initialize BPF b = BPF(text=bpf_text) # Common file functions. See earlier comment about generic_*(). b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry") b.attach_kprobe(event="btrfs_file_write_iter", fn_name="trace_write_entry") b.attach_kprobe(event="generic_file_open", fn_name="trace_open_entry") b.attach_kprobe(event="btrfs_sync_file", fn_name="trace_fsync_entry") b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return") b.attach_kretprobe(event="btrfs_file_write_iter", fn_name="trace_write_return") b.attach_kretprobe(event="generic_file_open", fn_name="trace_open_return") b.attach_kretprobe(event="btrfs_sync_file", fn_name="trace_fsync_return") # header if (csv): print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") else: if min_ms == 0: print("Tracing btrfs operations") else: print("Tracing btrfs operations slower than %d ms" % min_ms) print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T", "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
class EventMonitor: def __init__(self, args): self.cur_off = 0 self.events = list() self.do_trace = True self.bpf = None self.event_bufs = list() self.total_lost = 0 # input arguments self.cont_mode = args.container self.json = args.json self.ebpf = args.ebpf self.list_events = args.list self.events_to_trace = args.events_to_trace self.buf_pages = args.buf_pages def init_bpf(self): bpf_text = load_bpf_program() if self.list_events: log.info("Syscalls:") for e in syscalls: log.info(" %s" % e) log.info("\nOther events:") for e in sysevents: log.info(" %s" % e) exit() if self.ebpf: log.debug(bpf_text) exit() # initialize BPF self.bpf = BPF(text=bpf_text) # set shared config key = ctypes.c_uint32(shared_config.CONFIG_CONT_MODE) self.bpf["config_map"][key] = ctypes.c_uint32(self.cont_mode) # attaching kprobes sk, se = get_kprobes(self.events_to_trace) for syscall in sk: syscall_fnname = self.bpf.get_syscall_fnname(syscall) self.bpf.attach_kprobe(event=syscall_fnname, fn_name="syscall__" + syscall) self.bpf.attach_kretprobe(event=syscall_fnname, fn_name="trace_ret_" + syscall) for sysevent in se: self.bpf.attach_kprobe(event=sysevent, fn_name="trace_" + sysevent) if not self.json: log.info("%-14s %-16s %-12s %-12s %-6s %-16s %-16s %-6s %-6s %-6s %-12s %s" % ( "TIME(s)", "UTS_NAME", "MNT_NS", "PID_NS", "UID", "EVENT", "COMM", "PID", "TID", "PPID", "RET", "ARGS")) def get_sockaddr_from_buf(self, buf): # todo: parse all fields c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_short)).contents self.cur_off = self.cur_off + 2 domain = c_val.value if domain in sock_domain: return sock_domain[domain] else: return str(domain) def get_type_from_buf(self, buf): c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_byte)).contents self.cur_off = self.cur_off + 1 return c_val.value def get_int_from_buf(self, buf): c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_int)).contents self.cur_off = self.cur_off + 4 return c_val.value def get_uint_from_buf(self, buf): c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_uint)).contents self.cur_off = self.cur_off + 4 return c_val.value def get_long_from_buf(self, buf): c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_long)).contents self.cur_off = self.cur_off + 8 return c_val.value def get_ulong_from_buf(self, buf): c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_ulong)).contents self.cur_off = self.cur_off + 8 return c_val.value def get_pointer_from_buf(self, buf): c_val = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_void_p)).contents self.cur_off = self.cur_off + 8 return hex(0 if c_val.value is None else c_val.value) def get_string_from_buf(self, buf): str_size = ctypes.cast(ctypes.byref(buf, self.cur_off), ctypes.POINTER(ctypes.c_uint)).contents.value str_off = self.cur_off + 4 str_buf = buf[str_off:str_off + str_size] self.cur_off = self.cur_off + str_size + 4 try: ret_str = str(array.array('B', str_buf).tostring().decode("utf-8")) return ret_str except: return "" def get_str_arr_from_buf(self, buf, args): str_list = list() while self.cur_off < ctypes.sizeof(buf): argtype = self.get_type_from_buf(buf) if argtype == ArgType.STR_T: str_list.append(self.get_string_from_buf(buf).rstrip('\x00')) else: args.append('[%s]' % ', '.join(map(str, str_list))) return def print_event(self, eventname, context, args): # There are some syscalls which doesn't have the same name as their function eventfunc = "dummy" if context.eventid == 4: eventfunc = "newstat" elif context.eventid == 5: eventfunc = "newfstat" elif context.eventid == 6: eventfunc = "newlstat" try: comm = context.comm.decode("utf-8") uts_name = context.uts_name.decode("utf-8") except: return if eventname in self.events_to_trace or eventfunc in self.events_to_trace: if not self.json: log.info("%-14f %-16s %-12d %-12d %-6d %-16s %-16s %-6d %-6d %-6d %-12d %s" % ( context.ts / 1000000.0, uts_name, context.mnt_id, context.pid_id, context.uid, eventname, comm, context.pid, context.tid, context.ppid, context.retval, " ".join(args))) else: # prepare data to be consumed by ultrabox data = dict() data["status"] = [0] data["raw"] = "" data["type"] = ["apicall"] data["time"] = context.ts / 1000000.0 data["mnt_ns"] = context.mnt_id data["pid_ns"] = context.pid_id data["uid"] = context.uid data["api"] = eventname data["uts_name"] = uts_name data["process_name"] = comm data["pid"] = context.pid data["tid"] = context.tid data["ppid"] = context.ppid data["return_value"] = context.retval dict_args = dict() args_len = len(args) for i in range(args_len): dict_args["p" + str(i)] = args[i].rstrip('\0') data["arguments"] = dict_args log.info(json.dumps(data)) self.events.append(data) def parse_event(self, event_buf): context = ctypes.cast(ctypes.byref(event_buf), ctypes.POINTER(context_t)).contents self.cur_off = ctypes.sizeof(context_t) args = list() if context.eventid in event_id: eventname = event_id[context.eventid] for i in range(context.argnum): argtype = self.get_type_from_buf(event_buf) # sanity check - should never happen if self.cur_off >= ctypes.sizeof(event_buf): return if argtype == ArgType.INT_T: args.append(str(self.get_int_from_buf(event_buf))) elif argtype == ArgType.UINT_T: args.append(str(self.get_uint_from_buf(event_buf))) elif argtype == ArgType.LONG_T: args.append(str(self.get_long_from_buf(event_buf))) elif argtype == ArgType.ULONG_T: args.append(str(self.get_ulong_from_buf(event_buf))) elif argtype == ArgType.OFF_T_T: args.append(str(self.get_ulong_from_buf(event_buf))) elif argtype == ArgType.MODE_T_T: args.append(str(self.get_uint_from_buf(event_buf))) elif argtype == ArgType.DEV_T_T: args.append(str(self.get_uint_from_buf(event_buf))) elif argtype == ArgType.SIZE_T_T: args.append(str(self.get_ulong_from_buf(event_buf))) elif argtype == ArgType.POINTER_T: args.append(str(self.get_pointer_from_buf(event_buf))) elif argtype == ArgType.STR_T: args.append(self.get_string_from_buf(event_buf)) elif argtype == ArgType.STR_ARR_T: self.get_str_arr_from_buf(event_buf, args) elif argtype == ArgType.SOCKADDR_T: # sockaddr (partialy parsed to family) args.append(self.get_sockaddr_from_buf(event_buf)) elif argtype == ArgType.OPENFLAGS_T: args.append(open_flags_to_str(self.get_int_from_buf(event_buf))) elif argtype == ArgType.EXEC_FLAG_T: flags = self.get_int_from_buf(event_buf) args.append(execveat_flags_to_str(flags)) elif argtype == ArgType.SOCK_DOM_T: domain = self.get_int_from_buf(event_buf) if domain in sock_domain: args.append(sock_domain[domain]) else: args.append(str(domain)) elif argtype == ArgType.SOCK_TYPE_T: sock_type_num = self.get_int_from_buf(event_buf) args.append(sock_type_to_str(sock_type_num)) elif argtype == ArgType.CAP_T: capability = self.get_int_from_buf(event_buf) if capability in capabilities: args.append(capabilities[capability]) else: args.append(str(capability)) else: return return self.print_event(eventname, context, args) # process event def handle_event(self, cpu, data, size): buf = ctypes.cast(data, ctypes.POINTER(ctypes.c_char*size)).contents event_buf = (ctypes.c_char * size).from_buffer_copy(buf) self.event_bufs.append(event_buf) def lost_event(self, lost): self.total_lost += lost log.info("Possibly lost %d events (%d in total), consider using a bigger buffer" % (lost, self.total_lost)) def stop_trace(self): self.do_trace = False def get_events(self): return self.events def monitor_events(self): # loop with callback to handle_event self.bpf["events"].open_perf_buffer(self.handle_event, page_cnt=self.buf_pages, lost_cb=self.lost_event) while self.do_trace: try: # It would have been better to parse the events in a "consumer" thread # As python threading is not efficient - parse here for event in self.event_bufs: self.parse_event(event) self.event_bufs = list() self.bpf.perf_buffer_poll(1000) except KeyboardInterrupt: exit()
pid_filter = "" netns_filter = "" if args.pid: pid_filter = 'if (pid >> 32 != %d) { return 0; }' % args.pid if args.netns: netns_filter = 'if (net_ns_inum != %d) { return 0; }' % args.netns bpf_text = bpf_text.replace('##FILTER_PID##', pid_filter) bpf_text = bpf_text.replace('##FILTER_NETNS##', netns_filter) # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_entry") b.attach_kretprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_return") b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_entry") b.attach_kretprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_return") b.attach_kprobe(event="tcp_set_state", fn_name="trace_tcp_set_state_entry") b.attach_kprobe(event="tcp_close", fn_name="trace_close_entry") b.attach_kretprobe(event="inet_csk_accept", fn_name="trace_accept_return") print("Tracing TCP established connections. Ctrl-C to end.") # header if args.verbose: print("%-12s %-6s %-16s %-2s %-16s %-16s %-6s %-7s" % ("TYPE", "PID", "COMM", "IP", "SADDR", "DADDR", "SPORT", "DPORT"), end="") if not args.netns: print("%-8s" % "NETNS", end="") print()
if args.verbose or args.ebpf: print(bpf_text) if args.ebpf: exit() b = BPF(text=bpf_text) for i, function in enumerate(args.functions): if ":" in function: library, func = function.split(":") b.attach_uprobe(name=library, sym=func, fn_name="trace_%d" % i) b.attach_uretprobe(name=library, sym=func, fn_name="trace_return") else: b.attach_kprobe(event=function, fn_name="trace_%d" % i) b.attach_kretprobe(event=function, fn_name="trace_return") TASK_COMM_LEN = 16 # linux/sched.h class Data(ct.Structure): _fields_ = [ ("id", ct.c_ulonglong), ("tgid_pid", ct.c_ulonglong), ("start_ns", ct.c_ulonglong), ("duration_ns", ct.c_ulonglong), ("retval", ct.c_ulonglong), ("comm", ct.c_char * TASK_COMM_LEN) ] + ([("args", ct.c_ulonglong * 6)] if args.arguments else []) + \ ([("user_stack_id", ct.c_int)] if args.user_stack else []) + \ ([("kernel_stack_id", ct.c_int),("kernel_ip", ct.c_ulonglong)] if args.kernel_stack else [])
'if (pid != %s) { return 0; }' % args.pid) else: bpf_text = bpf_text.replace('FILTER', '') if debug: print(bpf_text) # initialize BPF b = BPF(text=bpf_text) # for POSIX compliance, all architectures implement these # system calls but the name of the actual entry point may # be different for which we must check if the entry points # actually exist before attaching the probes if BPF.ksymname("sys_stat") != -1: b.attach_kprobe(event="sys_stat", fn_name="trace_entry") b.attach_kretprobe(event="sys_stat", fn_name="trace_return") if BPF.ksymname("sys_statfs") != -1: b.attach_kprobe(event="sys_statfs", fn_name="trace_entry") b.attach_kretprobe(event="sys_statfs", fn_name="trace_return") if BPF.ksymname("sys_newstat") != -1: b.attach_kprobe(event="sys_newstat", fn_name="trace_entry") b.attach_kretprobe(event="sys_newstat", fn_name="trace_return") TASK_COMM_LEN = 16 # linux/sched.h NAME_MAX = 255 # linux/limits.h class Data(ct.Structure): _fields_ = [ ("pid", ct.c_ulonglong),
prog = prog.replace('FILTER', '') b = BPF(text=prog) ### Probes # NOTE: use "sys_read" on older kernels b.attach_kprobe(event="ksys_read", fn_name="in_sys_read") # the unix socket relevant functions b.attach_kprobe(event="unix_stream_recvmsg", fn_name="in_unix_stream_recvmsg") b.attach_kprobe(event="unix_dgram_recvmsg", fn_name="in_unix_stream_recvmsg") b.attach_kprobe(event="unix_seqpacket_recvmsg", fn_name="in_unix_stream_recvmsg") b.attach_kretprobe(event="ksys_read", fn_name="out_sys_read") ### # define output data structure in Python TASK_COMM_LEN = 16 # linux/sched.h MAX_BUF_SIZE = 356 # Limited by the BPF stack # Max size of the whole struct: 512 bytes class Data(ct.Structure): _fields_ = [("timestamp_ns", ct.c_ulonglong), ("pid", ct.c_uint), ("remote_pid", ct.c_uint), ("comm", ct.c_char * TASK_COMM_LEN), ("v0", ct.c_ubyte * MAX_BUF_SIZE), ("buf_size", ct.c_uint), ("len", ct.c_uint)]
return 0; } """ if args.pid: bpf_text = bpf_text.replace('FILTER', 'if (pid != %s) { return 0; }' % args.pid) else: bpf_text = bpf_text.replace('FILTER', '') if debug: print(bpf_text) # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="sys_open", fn_name="trace_entry") b.attach_kretprobe(event="sys_open", fn_name="trace_return") TASK_COMM_LEN = 16 # linux/sched.h NAME_MAX = 255 # linux/limits.h class Data(ct.Structure): _fields_ = [ ("pid", ct.c_ulonglong), ("ts", ct.c_ulonglong), ("delta", ct.c_ulonglong), ("ret", ct.c_int), ("comm", ct.c_char * TASK_COMM_LEN), ("fname", ct.c_char * NAME_MAX) ] start_ts = 0
from bcc import BPF bpf_source = """ #include <uapi/linux/ptrace.h> int ret_sys_execve(struct pt_regs *ctx) { int return_value; char comm[16]; bpf_get_current_comm(&comm, sizeof(comm)); // PT_REGS_RC是一个宏,它将从这个特定上下文的BPF寄存器中读取返回值。 return_value = PT_REGS_RC(ctx); bpf_trace_printk("program: %s, return: %d\\n", comm, return_value); return 0; } """ bpf = BPF(text=bpf_source) execve_function = bpf.get_syscall_fnname("execve") bpf.attach_kretprobe(event=execve_function, fn_name="ret_sys_execve") bpf.trace_print()
bpf_text = bpf_text.replace('STORE', 'dist.increment(bpf_log2l(delta));') if args.verbose: print(bpf_text) # signal handler def signal_ignore(signal, frame): print() # load BPF program b = BPF(text=bpf_text) # attach probes if not library: b.attach_kprobe(event_re=pattern, fn_name="trace_func_entry") b.attach_kretprobe(event_re=pattern, fn_name="trace_func_return") matched = b.num_open_kprobes() else: b.attach_uprobe(name=library, sym_re=pattern, fn_name="trace_func_entry", pid=args.pid or -1) b.attach_uretprobe(name=library, sym_re=pattern, fn_name="trace_func_return", pid=args.pid or -1) matched = b.num_open_uprobes() if matched == 0: print("0 functions matched by \"%s\". Exiting." % args.pattern) exit() # header print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." % (matched / 2, args.pattern))
event.delay_max, average_delay)) # load BPF program KVER = os.popen('uname -r').read().rstrip() b = BPF(text=bpf_text, cflags=[ "-include", "/usr/src/zfs-" + KVER + "/zfs_config.h", "-I/usr/src/zfs-" + KVER + "/include/", "-I/usr/src/zfs-" + KVER + "/include/spl", "-I/usr/src/zfs-" + KVER + "/include/", "-I/usr/src/zfs-" + KVER + "/include/linux", "-DCC_USING_FENTRY" ]) b.attach_kprobe(event="spa_sync", fn_name="spa_sync_entry") b.attach_kretprobe(event="spa_sync", fn_name="spa_sync_return") b.attach_kprobe(event="dsl_pool_sync", fn_name="dsl_pool_sync_entry") b.attach_kretprobe(event="dsl_pool_sync", fn_name="dsl_pool_sync_return") b.attach_kprobe(event="dmu_tx_delay", fn_name="dmu_tx_delay_entry") b.attach_kprobe(event="trace_zfs_delay__mintime", fn_name="dmu_tx_delay_mintime") b.attach_perf_event(ev_type=PerfType.SOFTWARE, ev_config=PerfSWConfig.CPU_CLOCK, fn_name="get_spa_dirty", sample_freq=10) print_count = 30 # initialize dgrn program object to read zfs_dirty_data_max prog = drgn.program_from_kernel()
event.ip, inet_ntoa(event.saddr), inet_ntoa(event.daddr), event.dport)) def print_ipv6_event(cpu, data, size): event = ct.cast(data, ct.POINTER(Data_ipv6)).contents if args.timestamp: if start_ts == 0: start_ts = event.ts_us print("%-9.3f" % ((event.ts_us - start_ts) / 100000), end="") print("%-6d %-12.12s %-2d ...%-13x ...%-13x %-4d" % (event.pid, event.task, event.ip, event.saddr, event.daddr, event.dport)) # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect_entry") b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect_entry") b.attach_kretprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_return") b.attach_kretprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_return") # header if args.timestamp: print("%-9s" % ("TIME(s)"), end="") print("%-6s %-12s %-2s %-16s %-16s %-4s" % ("PID", "COMM", "IP", "SADDR", "DADDR", "DPORT")) start_ts = 0 def inet_ntoa(addr): dq = '' for i in range(0, 4): dq = dq + str(addr & 0xff) if (i != 3):
data_struct = { "measurement": 'picknext', "time": [], "tags": [ 'glob', 'cpu', 'pid', ], "fields": ['duration'] } b = BPF(text=bpf_text) b.attach_kprobe(event="pick_next_task_fair", fn_name="pick_start") b.attach_kretprobe(event="pick_next_task_fair", fn_name="pick_end") dist = b.get_table("dist") #print("%-6s%-6s%-6s%-6s" % ("CPU", "PID", "TGID", "TIME(ns)")) while (1): try: sleep(1) for k, v in dist.items(): #print("%-6d%-6d%-6d%-6d" % (k.cpu, k.pid, k.tgid, v.value)) #test_data = lmp_data('glob', k.cpu, k.pid, v.value) test_data = lmp_data(datetime.now().isoformat(), 'glob', k.cpu, k.pid, v.value) write2db(data_struct, test_data, client) dist.clear()
def run_bpf_probe(num_iterations, sleep_secs): """Run the extended BPF probe on Linux's compact_zone_order() function.""" # debug_level = 0x3 # 0x3: dump LLVM IR and BPF byte code to stderr debug_level = 0x0 # debug 0x0 = no debug bpf = BPF(src_file="eBPF_c_probe.c", debug=debug_level) # ebpf_code = bpf.dump_func(func_name="prb_eBPF_compact_zone_order_entry") assert len(bpf["global_var_total_accum_nsec"]) == 1, \ "Expected a global variable in BPF that be a scalar, ie., of length 1" # our BPF probe will only work for a kernel point which is not executed # concurrently, if not it will fail. Of course, you can use other # data structures in the BPF probe that can make it work concurrently. synchr_non_concurrent_kpoint = "compact_zone_order" bpf.attach_kprobe(event=synchr_non_concurrent_kpoint, fn_name="prb_eBPF_compact_zone_order_entry") bpf.attach_kretprobe(event=synchr_non_concurrent_kpoint, fn_name="prb_eBPF_compact_zone_order_return") # these are other collateral events we want to know if they happen at the # same time as the main event above, and relatively, how frequently they # happen when the main probed events (above) are happening. collateral_events = [ {'func': 'kmalloc_order_trace', 'probe': 'prb_eBPF_kmalloc_order_trace_return', 'count': 'global_var_cnt_kmalloc_order_trace'}, {'func': '__kmalloc', 'probe': 'prb_eBPF___kmalloc_return', 'count': 'global_var_cnt___kmalloc'}, {'func': '__do_kmalloc_node', 'probe': 'prb_eBPF___do_kmalloc_node_return', 'count': 'global_var_cnt___do_kmalloc_node'}, {'func': 'kmem_cache_alloc', 'probe': 'prb_eBPF_kmem_cache_alloc_return', 'count': 'global_var_cnt_kmem_cache_alloc'}, {'func': 'kmem_cache_alloc_trace', 'probe': 'prb_eBPF_kmem_cache_alloc_trace_return', 'count': 'global_var_cnt_kmem_cache_alloc_trace'}, {'func': 'malloc', 'probe': 'prb_eBPF_malloc_return', 'count': 'global_var_cnt_malloc'}, {'func': 'kfree', 'probe': 'prb_eBPF_kfree_return', 'count': 'global_var_cnt_kfree'}, {'func': 'kmem_cache_reap', 'probe': 'prb_eBPF_kmem_cache_reap_return', 'count': 'global_var_cnt_kmem_cache_reap'}, {'func': 'kmem_cache_free', 'probe': 'prb_eBPF_kmem_cache_free_return', 'count': 'global_var_cnt_kmem_cache_free'}, {'func': 'kmem_cache_destroy', 'probe': 'prb_eBPF_kmem_cache_destroy_return', 'count': 'global_var_cnt_kmem_cache_destroy'}, {'func': 'kmem_cache_shrink', 'probe': 'prb_eBPF_kmem_cache_shrink_return', 'count': 'global_var_cnt_kmem_cache_shrink'} ] for collateral_event in collateral_events: bpf.attach_kretprobe(event=collateral_event['func'], fn_name=collateral_event['probe']) assert len(bpf[collateral_event['count']]) == 1, \ "Var '{}' must be a scalar too.".format(collateral_event['count']) # request time to sleep and iterations as arguments from the command-line, # e.g., by using the 'argparse' module (the timing to wait is important # because there can be no output reported below if there is no activity of # the kprobe we attached to in this period of time) for sample in xrange(1, num_iterations + 1): sleep(sleep_secs) print "---- new sample: {} at {}".format(sample, strftime("%D %T")) bpf["delay_dist"].print_log2_hist("usecs") bpf["delay_dist"].clear() # All the direct iterations on BPF tables return ctypes values (like # c_int, c_ulong, etc), which we unwrap here by the .value property and # divide by 1000 (microseconds) since the histogram in C in the BPF # probe also divided the nanoseconds by 1000, so all will report in the # same unit of time total_accum_nsec = bpf["global_var_total_accum_nsec"].values()[0] print "total_accum_usec = {:.0f}".format(total_accum_nsec.value / 1000) bpf["global_var_total_accum_nsec"].clear() for k, val in bpf["total_accum_nsec_per_order"].items(): print ("total_accum_usec[order = {}] = " "{:.0f}").format(k.value, val.value / 1000) bpf["total_accum_nsec_per_order"].clear() for collateral_event in collateral_events: concur_kmallocs = bpf[collateral_event['count']].values()[0] print "{} while compaction = {}".format(collateral_event['func'], concur_kmallocs.value) bpf[collateral_event['count']].clear() sys.stdout.flush()
bpf_program.attach_uprobe(name="c", sym="malloc", fn_name="alloc_enter", pid=pid) bpf_program.attach_uretprobe(name="c", sym="malloc", fn_name="alloc_exit", pid=pid) bpf_program.attach_uprobe(name="c", sym="free", fn_name="free_enter", pid=pid) else: print("Attaching to kmalloc and kfree, Ctrl+C to quit.") bpf_program.attach_kprobe(event="__kmalloc", fn_name="alloc_enter") bpf_program.attach_kretprobe(event="__kmalloc", fn_name="alloc_exit") bpf_program.attach_kprobe(event="kfree", fn_name="free_enter") decoder = StackDecoder(pid) def print_outstanding(): stacks = {} print("[%s] Top %d stacks with outstanding allocations:" % (datetime.now().strftime("%H:%M:%S"), top_stacks)) allocs = bpf_program.get_table("allocs") for address, info in sorted(allocs.items(), key=lambda a: a[1].size): if BPF.monotonic_time() - min_age_ns < info.timestamp_ns: continue stack = decoder.decode_stack(info, kernel_trace) if stack in stacks:
# lop off the trailing . question = ("%s" % dnspkt.q.qname)[:-1].encode('utf-8') for answer in dnspkt.rr: # skip all but A and AAAA records if answer.rtype == 1 or answer.rtype == 28: dns_cache[str(answer.rdata).encode('utf-8')] = (question, datetime.now()) # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect_entry") b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect_entry") b.attach_kretprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_return") b.attach_kretprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_return") if args.dns: b.attach_kprobe(event="udp_recvmsg", fn_name="trace_udp_recvmsg") b.attach_kretprobe(event="udp_recvmsg", fn_name="trace_udp_ret_recvmsg") print("Tracing connect ... Hit Ctrl-C to end") if args.count: try: while True: sleep(99999999) except KeyboardInterrupt: pass # header print("\n%-25s %-25s %-20s %-10s" %
void count_lookup(struct pt_regs *ctx) { int key = S_SLOW; stats.atomic_increment(key); if (PT_REGS_RC(ctx) == 0) { key = S_MISS; stats.atomic_increment(key); } } """ # load BPF program b = BPF(text=bpf_text) b.attach_kprobe(event_re="^lookup_fast$|^lookup_fast.constprop.*.\d$", fn_name="count_fast") b.attach_kretprobe(event="d_lookup", fn_name="count_lookup") # stat column labels and indexes stats = {"REFS": 1, "SLOW": 2, "MISS": 3} # header print("%-8s " % "TIME", end="") for stype, idx in sorted(stats.items(), key=lambda k_v: (k_v[1], k_v[0])): print(" %8s" % (stype + "/s"), end="") print(" %8s" % "HIT%") # output i = 0 while (1): if count > 0: i += 1
b = BPF(text=bpf_text) # common file functions if BPF.get_kprobe_functions(b'zpl_iter'): b.attach_kprobe(event="zpl_iter_read", fn_name="trace_entry") b.attach_kprobe(event="zpl_iter_write", fn_name="trace_entry") elif BPF.get_kprobe_functions(b'zpl_aio'): b.attach_kprobe(event="zpl_aio_read", fn_name="trace_entry") b.attach_kprobe(event="zpl_aio_write", fn_name="trace_entry") else: b.attach_kprobe(event="zpl_read", fn_name="trace_entry") b.attach_kprobe(event="zpl_write", fn_name="trace_entry") b.attach_kprobe(event="zpl_open", fn_name="trace_entry") b.attach_kprobe(event="zpl_fsync", fn_name="trace_entry") if BPF.get_kprobe_functions(b'zpl_iter'): b.attach_kretprobe(event="zpl_iter_read", fn_name="trace_read_return") b.attach_kretprobe(event="zpl_iter_write", fn_name="trace_write_return") elif BPF.get_kprobe_functions(b'zpl_aio'): b.attach_kretprobe(event="zpl_aio_read", fn_name="trace_read_return") b.attach_kretprobe(event="zpl_aio_write", fn_name="trace_write_return") else: b.attach_kretprobe(event="zpl_read", fn_name="trace_read_return") b.attach_kretprobe(event="zpl_write", fn_name="trace_write_return") b.attach_kretprobe(event="zpl_open", fn_name="trace_open_return") b.attach_kretprobe(event="zpl_fsync", fn_name="trace_fsync_return") print("Tracing ZFS operation latency... Hit Ctrl-C to end.") # output exiting = 0 dist = b.get_table("dist")
bpf_text = bpf_text.replace('FLAGS_FILTER', 'if (!(flags & %d)) { return 0; }' % flag_filter_mask) else: bpf_text = bpf_text.replace('FLAGS_FILTER', '') if not (args.extended_fields or args.flag_filter): bpf_text = '\n'.join(x for x in bpf_text.split('\n') if 'EXTENDED_STRUCT_MEMBER' not in x) if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="do_sys_open", fn_name="trace_entry") b.attach_kretprobe(event="do_sys_open", fn_name="trace_return") TASK_COMM_LEN = 16 # linux/sched.h NAME_MAX = 255 # linux/limits.h class Data(ct.Structure): _fields_ = [ ("id", ct.c_ulonglong), ("ts", ct.c_ulonglong), ("ret", ct.c_int), ("comm", ct.c_char * TASK_COMM_LEN), ("fname", ct.c_char * NAME_MAX), ("flags", ct.c_int), ] initial_ts = 0
events.perf_submit(ctx, &data, sizeof(data)); return 0; } """ bpf_text = bpf_text.replace("MAXARG", args.max_args) if args.ebpf: print(bpf_text) exit() # initialize BPF b = BPF(text=bpf_text) execve_fnname = b.get_syscall_fnname("execve") b.attach_kprobe(event=execve_fnname, fn_name="do_sys_execve") b.attach_kretprobe(event=execve_fnname, fn_name="do_ret_sys_execve") # header if args.timestamp: print("%-8s" % ("TIME(s)"), end="") print("%-16s %-6s %-6s %3s %s" % ("PCOMM", "PID", "PPID", "RET", "ARGS")) TASK_COMM_LEN = 16 # linux/sched.h ARGSIZE = 128 # should match #define in C above class Data(ct.Structure): _fields_ = [ ("pid", ct.c_uint), ("comm", ct.c_char * TASK_COMM_LEN), ("type", ct.c_int), ("argv", ct.c_char * ARGSIZE),
bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)') if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # initialize BPF b = BPF(text=bpf_text) # I'd rather trace these via new_sync_read/new_sync_write (which used to be # do_sync_read/do_sync_write), but those became static. So trace these from # the parent functions, at the cost of more overhead, instead. # Ultimately, we should be using [V]FS tracepoints. b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry") b.attach_kretprobe(event="__vfs_read", fn_name="trace_read_return") try: b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry") b.attach_kretprobe(event="__vfs_write", fn_name="trace_write_return") except Exception: # older kernels don't have __vfs_write so try vfs_write instead b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry") b.attach_kretprobe(event="vfs_write", fn_name="trace_write_return") TASK_COMM_LEN = 16 # linux/sched.h DNAME_INLINE_LEN = 32 # linux/dcache.h class Data(ct.Structure): _fields_ = [ ("mode", ct.c_int), ("pid", ct.c_uint),
'(*vp) += delta;') if debug: print(bpf_text) # load BPF program b = BPF(text=bpf_text) # this should really use irq:softirq_entry/exit tracepoints; for now the # soft irq functions are individually traced (search your kernel for # open_softirq() calls, and adjust the following list as needed). for softirqfunc in ("blk_iopoll_softirq", "blk_done_softirq", "rcu_process_callbacks", "run_rebalance_domains", "tasklet_action", "tasklet_hi_action", "run_timer_softirq", "net_tx_action", "net_rx_action"): b.attach_kprobe(event=softirqfunc, fn_name="trace_start") b.attach_kretprobe(event=softirqfunc, fn_name="trace_completion") print("Tracing soft irq event time... Hit Ctrl-C to end.") # output exiting = 0 if args.interval else 1 dist = b.get_table("dist") while (1): try: sleep(int(args.interval)) except KeyboardInterrupt: exiting = 1 print() if args.timestamp: print("%-8s\n" % strftime("%H:%M:%S"), end="")
if args.pid: pid_filter = 'if (pid >> 32 != %d) { return 0; }' % args.pid if args.netns: netns_filter = 'if (net_ns_inum != %d) { return 0; }' % args.netns bpf_text = bpf_text.replace('##FILTER_PID##', pid_filter) bpf_text = bpf_text.replace('##FILTER_NETNS##', netns_filter) if args.ebpf: print(bpf_text) exit() # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_entry") b.attach_kretprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_return") b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_entry") b.attach_kretprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_return") b.attach_kprobe(event="tcp_set_state", fn_name="trace_tcp_set_state_entry") b.attach_kprobe(event="tcp_close", fn_name="trace_close_entry") b.attach_kretprobe(event="inet_csk_accept", fn_name="trace_accept_return") print("Tracing TCP established connections. Ctrl-C to end.") # header if args.verbose: if args.timestamp: print("%-14s" % ("TIME(ns)"), end="") print("%-12s %-6s %-16s %-2s %-16s %-16s %-6s %-7s" % ("TYPE", "PID", "COMM", "IP", "SADDR", "DADDR", "SPORT", "DPORT"), end="") if not args.netns:
if args.tid: # TID trumps PID bpf_text = bpf_text.replace('FILTER', 'if (tid != %s) { return 0; }' % args.tid) elif args.pid: bpf_text = bpf_text.replace('FILTER', 'if (pid != %s) { return 0; }' % args.pid) else: bpf_text = bpf_text.replace('FILTER', '') # initialize BPF b = BPF(text=bpf_text) syscall_fnname = b.get_syscall_fnname("sendmsg") if BPF.ksymname(syscall_fnname) != -1: b.attach_kprobe(event=syscall_fnname, fn_name="syscall__sendmsg") b.attach_kretprobe(event=syscall_fnname, fn_name="trace_sendmsg_return") syscall_fnname = b.get_syscall_fnname("recvmsg") if BPF.ksymname(syscall_fnname) != -1: b.attach_kprobe(event=syscall_fnname, fn_name="syscall__recvmsg") b.attach_kretprobe(event=syscall_fnname, fn_name="trace_recvmsg_return") b.attach_kprobe(event="__scm_send", fn_name="trace_scm_send_entry") b.attach_kprobe(event="scm_detach_fds", fn_name="trace_scm_detach_fds_entry") b.attach_kretprobe(event="scm_detach_fds", fn_name="trace_scm_detach_fds_return") initial_ts = 0 # header if args.timestamp: print("%-14s" % ("TIME(s)"), end="")
if args.verbose: print(bpf_text) # signal handler def signal_ignore(signal, frame): print() # load BPF program b = BPF(text=bpf_text) # attach probes if not library: b.attach_kprobe(event_re=pattern, fn_name="trace_func_entry") b.attach_kretprobe(event_re=pattern, fn_name="trace_func_return") matched = b.num_open_kprobes() else: b.attach_uprobe(name=library, sym_re=pattern, fn_name="trace_func_entry", pid=args.pid or -1) b.attach_uretprobe(name=library, sym_re=pattern, fn_name="trace_func_return", pid=args.pid or -1) matched = b.num_open_uprobes() if matched == 0: print("0 functions matched by \"%s\". Exiting." % args.pattern) exit()
void count_lookup(struct pt_regs *ctx) { int key = S_SLOW; u64 *leaf = stats.lookup(&key); if (leaf) (*leaf)++; if (PT_REGS_RC(ctx) == 0) { key = S_MISS; leaf = stats.lookup(&key); if (leaf) (*leaf)++; } } """ # load BPF program b = BPF(text=bpf_text) b.attach_kprobe(event="lookup_fast", fn_name="count_fast") b.attach_kretprobe(event="d_lookup", fn_name="count_lookup") # stat column labels and indexes stats = { "REFS": 1, "SLOW": 2, "MISS": 3 } # header print("%-8s " % "TIME", end="") for stype, idx in sorted(stats.items(), key=lambda k_v: (k_v[1], k_v[0])): print(" %8s" % (stype + "/s"), end="") print(" %8s" % "HIT%") # output
if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # initialize BPF b = BPF(text=bpf_text) # for POSIX compliance, all architectures implement these # system calls but the name of the actual entry point may # be different for which we must check if the entry points # actually exist before attaching the probes syscall_fnname = b.get_syscall_fnname("stat") if BPF.ksymname(syscall_fnname) != -1: b.attach_kprobe(event=syscall_fnname, fn_name="syscall__entry") b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return") syscall_fnname = b.get_syscall_fnname("statfs") if BPF.ksymname(syscall_fnname) != -1: b.attach_kprobe(event=syscall_fnname, fn_name="syscall__entry") b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return") syscall_fnname = b.get_syscall_fnname("newstat") if BPF.ksymname(syscall_fnname) != -1: b.attach_kprobe(event=syscall_fnname, fn_name="syscall__entry") b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return") start_ts = 0 prev_ts = 0 delta = 0
""" if args.pid: bpf_text = bpf_text.replace('FILTER', 'if (pid != %s) { return 0; }' % args.pid) else: bpf_text = bpf_text.replace('FILTER', '') if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # initialize BPF b = BPF(text=bpf_text) kill_fnname = b.get_syscall_fnname("kill") b.attach_kprobe(event=kill_fnname, fn_name="syscall__kill") b.attach_kretprobe(event=kill_fnname, fn_name="do_ret_sys_kill") TASK_COMM_LEN = 16 # linux/sched.h class Data(ct.Structure): _fields_ = [ ("pid", ct.c_ulonglong), ("tpid", ct.c_int), ("sig", ct.c_int), ("ret", ct.c_int), ("comm", ct.c_char * TASK_COMM_LEN) ] # header print("%-9s %-6s %-16s %-4s %-6s %s" % (
global start_ts if args.timestamp: if start_ts == 0: start_ts = event.ts_us print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="") print("%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid, event.task.decode(), event.ip, inet_ntop(AF_INET6, event.saddr), inet_ntop( AF_INET6, event.daddr), event.dport)) # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect_entry") b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect_entry") b.attach_kretprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_return") b.attach_kretprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_return") # header if args.timestamp: print("%-9s" % ("TIME(s)"), end="") print("%-6s %-12s %-2s %-16s %-16s %-4s" % ("PID", "COMM", "IP", "SADDR", "DADDR", "DPORT")) start_ts = 0 # read events b["ipv4_events"].open_perf_buffer(print_ipv4_event) b["ipv6_events"].open_perf_buffer(print_ipv6_event) while 1: b.perf_buffer_poll()
else: usdt = None if args.ebpf or args.verbose: if args.verbose and usdt: print(usdt.get_text()) print(program) if args.ebpf: exit() bpf = BPF(text=program, usdt_contexts=[usdt] if usdt else []) if args.syscalls: syscall_regex = "^[Ss]y[Ss]_.*" bpf.attach_kprobe(event_re=syscall_regex, fn_name="syscall_entry") if args.latency: bpf.attach_kretprobe(event_re=syscall_regex, fn_name="syscall_return") print("Attached %d kernel probes for syscall tracing." % bpf.num_open_kprobes()) def get_data(): # Will be empty when no language was specified for tracing if args.latency: data = list(map(lambda kv: (kv[0].clazz.decode('utf-8', 'replace') \ + "." + \ kv[0].method.decode('utf-8', 'replace'), (kv[1].num_calls, kv[1].total_ns)), bpf["times"].items())) else: data = list(map(lambda kv: (kv[0].clazz.decode('utf-8', 'replace') \ + "." + \ kv[0].method.decode('utf-8', 'replace'),
from struct import pack import time as ti import json import ctypes import sys # Initialize BPF b = BPF(src_file="ebpf_probes.c") # Attach trace functions to kernel functions as k(ret)probes b.attach_kprobe(event="tcp_reno_cong_avoid", fn_name="trace_cong_avoid") b.attach_kprobe(event="tcp_init_buffer_space", fn_name="trace_init_cong_control") b.attach_kprobe(event="tcp_mark_skb_lost", fn_name="trace_mark_lost") b.attach_kprobe(event="tcp_write_timer_handler", fn_name="trace_timeout_trigger") b.attach_kprobe(event="tcp_rack_mark_lost", fn_name="trace_rack_timer") b.attach_kretprobe(event="tcp_schedule_loss_probe", fn_name="trace_probe_timer") qlog = { "qlog_version": "draft-01", "traces": [ { "common_fields": { "reference_time": -1 }, "configuration": {"time_units": "ms"}, "event_fields": [ "relative_time", "category", "event_type", "data"
event.offset / 1024, float(event.delta_us) / 1000, event.file.decode())) # Currently specifically works for NFSv4, the other kprobes are generic # so it should work with earlier NFS versions b = BPF(text=bpf_text) b.attach_kprobe(event="nfs_file_read", fn_name="trace_rw_entry") b.attach_kprobe(event="nfs_file_write", fn_name="trace_rw_entry") b.attach_kprobe(event="nfs4_file_open", fn_name="trace_file_open_entry") b.attach_kprobe(event="nfs_file_open", fn_name="trace_file_open_entry") b.attach_kprobe(event="nfs_getattr", fn_name="trace_getattr_entry") b.attach_kretprobe(event="nfs_file_read", fn_name="trace_read_return") b.attach_kretprobe(event="nfs_file_write", fn_name="trace_write_return") b.attach_kretprobe(event="nfs4_file_open", fn_name="trace_file_open_return") b.attach_kretprobe(event="nfs_file_open", fn_name="trace_file_open_return") b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return") if(csv): print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") else: if min_ms == 0: print("Tracing NFS operations... Ctrl-C to quit") else: print("""Tracing NFS operations that are slower than \ %d ms... Ctrl-C to quit""" % min_ms) print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME",
'dist.increment(key);') else: bpf_text = bpf_text.replace( 'STORE', 'irq_key_t key = {.slot = 0 /* ignore */};' + 'bpf_probe_read(&key.name, sizeof(key.name), name);' + 'u64 zero = 0, *vp = dist.lookup_or_init(&key, &zero);' + '(*vp) += delta;') if debug: print(bpf_text) # load BPF program b = BPF(text=bpf_text) # these should really use irq:irq_handler_entry/exit tracepoints: b.attach_kprobe(event="handle_irq_event_percpu", fn_name="trace_start") b.attach_kretprobe(event="handle_irq_event_percpu", fn_name="trace_completion") print("Tracing hard irq event time... Hit Ctrl-C to end.") # output exiting = 0 if args.interval else 1 dist = b.get_table("dist") while (1): try: sleep(int(args.interval)) except KeyboardInterrupt: exiting = 1 print() if args.timestamp: print("%-8s\n" % strftime("%H:%M:%S"), end="")
bpf_source = bpf_source.replace("STACK_FLAGS", stack_flags) bpf_program = BPF(text=bpf_source) if not kernel_trace: print("Attaching to malloc and free in pid %d, Ctrl+C to quit." % pid) bpf_program.attach_uprobe(name="c", sym="malloc", fn_name="alloc_enter", pid=pid) bpf_program.attach_uretprobe(name="c", sym="malloc", fn_name="alloc_exit", pid=pid) bpf_program.attach_uprobe(name="c", sym="free", fn_name="free_enter", pid=pid) else: print("Attaching to kmalloc and kfree, Ctrl+C to quit.") bpf_program.attach_kprobe(event="__kmalloc", fn_name="alloc_enter") bpf_program.attach_kretprobe(event="__kmalloc", fn_name="alloc_exit") bpf_program.attach_kprobe(event="kfree", fn_name="free_enter") decoder = KStackDecoder() if kernel_trace else UStackDecoder(pid) def print_outstanding(): print("[%s] Top %d stacks with outstanding allocations:" % (datetime.now().strftime("%H:%M:%S"), top_stacks)) alloc_info = {} allocs = bpf_program["allocs"] stack_traces = bpf_program["stack_traces"] for address, info in sorted(allocs.items(), key=lambda a: a[1].size): if Time.monotonic_time() - min_age_ns < info.timestamp_ns: continue if info.stack_id < 0: continue
} """ if args.pid: bpf_text = bpf_text.replace('FILTER', 'if (pid != %s) { return 0; }' % args.pid) else: bpf_text = bpf_text.replace('FILTER', '') if debug: print(bpf_text) # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="sys_stat", fn_name="trace_entry") b.attach_kprobe(event="sys_statfs", fn_name="trace_entry") b.attach_kprobe(event="sys_newstat", fn_name="trace_entry") b.attach_kretprobe(event="sys_stat", fn_name="trace_return") b.attach_kretprobe(event="sys_statfs", fn_name="trace_return") b.attach_kretprobe(event="sys_newstat", fn_name="trace_return") TASK_COMM_LEN = 16 # linux/sched.h NAME_MAX = 255 # linux/limits.h class Data(ct.Structure): _fields_ = [("pid", ct.c_ulonglong), ("ts", ct.c_ulonglong), ("delta", ct.c_ulonglong), ("ret", ct.c_int), ("comm", ct.c_char * TASK_COMM_LEN), ("fname", ct.c_char * NAME_MAX)] start_ts = 0
'if (pid != %s) { return 0; }' % args.pid) else: bpf_text = bpf_text.replace('FILTER', '') if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # initialize BPF b = BPF(text=bpf_text) syscall_fnname = b.get_syscall_fnname("shmget") if BPF.ksymname(syscall_fnname) != -1: b.attach_kprobe(event=syscall_fnname, fn_name="syscall__shmget") b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return") syscall_fnname = b.get_syscall_fnname("shmat") if BPF.ksymname(syscall_fnname) != -1: b.attach_kprobe(event=syscall_fnname, fn_name="syscall__shmat") b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return") syscall_fnname = b.get_syscall_fnname("shmdt") if BPF.ksymname(syscall_fnname) != -1: b.attach_kprobe(event=syscall_fnname, fn_name="syscall__shmdt") b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return") syscall_fnname = b.get_syscall_fnname("shmctl") if BPF.ksymname(syscall_fnname) != -1: b.attach_kprobe(event=syscall_fnname, fn_name="syscall__shmctl") b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return")
bpf_text = bpf_text.replace('FILTER_PID', '0') if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # load BPF program b = BPF(text=bpf_text) # common file functions b.attach_kprobe(event="nfs_file_read", fn_name="trace_entry") b.attach_kprobe(event="nfs_file_write", fn_name="trace_entry") b.attach_kprobe(event="nfs_file_open", fn_name="trace_entry") b.attach_kprobe(event="nfs_getattr", fn_name="trace_entry") b.attach_kretprobe(event="nfs_file_read", fn_name="trace_read_return") b.attach_kretprobe(event="nfs_file_write", fn_name="trace_write_return") b.attach_kretprobe(event="nfs_file_open", fn_name="trace_open_return") b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return") if BPF.get_kprobe_functions(b'nfs4_file_open'): b.attach_kprobe(event="nfs4_file_open", fn_name="trace_entry") b.attach_kretprobe(event="nfs4_file_open", fn_name="trace_open_return") else: b.attach_kprobe(event="nfs_file_open", fn_name="trace_entry") b.attach_kretprobe(event="nfs_file_open", fn_name="trace_open_return") print("Tracing NFS operation latency... Hit Ctrl-C to end.") # output exiting = 0
} """ % (i, i) if args.verbose: print(bpf_text) b = BPF(text=bpf_text) for i, function in enumerate(args.functions): if ":" in function: library, func = function.split(":") b.attach_uprobe(name=library, sym=func, fn_name="trace_%d" % i) b.attach_uretprobe(name=library, sym=func, fn_name="trace_return") else: b.attach_kprobe(event=function, fn_name="trace_%d" % i) b.attach_kretprobe(event=function, fn_name="trace_return") TASK_COMM_LEN = 16 # linux/sched.h class Data(ct.Structure): _fields_ = [ ("id", ct.c_ulonglong), ("tgid_pid", ct.c_ulonglong), ("start_ns", ct.c_ulonglong), ("duration_ns", ct.c_ulonglong), ("retval", ct.c_ulonglong), ("comm", ct.c_char * TASK_COMM_LEN) ] + ([("args", ct.c_ulonglong * 6)] if args.arguments else []) time_designator = "us" if args.min_us else "ms" time_value = args.min_us or args.min_ms or 1
else: usdt = None if args.ebpf or args.verbose: if args.verbose and usdt: print(usdt.get_text()) print(program) if args.ebpf: exit() bpf = BPF(text=program, usdt_contexts=[usdt] if usdt else []) if args.syscalls: syscall_regex = "^[Ss]y[Ss]_.*" bpf.attach_kprobe(event_re=syscall_regex, fn_name="syscall_entry") if args.latency: bpf.attach_kretprobe(event_re=syscall_regex, fn_name="syscall_return") print("Attached %d kernel probes for syscall tracing." % bpf.num_open_kprobes()) def get_data(): # Will be empty when no language was specified for tracing if args.latency: data = list(map(lambda kv: (kv[0].clazz.decode('utf-8', 'replace') \ + "." + \ kv[0].method.decode('utf-8', 'replace'), (kv[1].num_calls, kv[1].total_ns)), bpf["times"].items())) else: data = list(map(lambda kv: (kv[0].clazz.decode('utf-8', 'replace') \ + "." + \
'dist.increment(key, delta);') if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # load BPF program b = BPF(text=bpf_text) # these should really use irq:irq_handler_entry/exit tracepoints: if args.count: b.attach_kprobe(event="handle_irq_event_percpu", fn_name="count_only") print("Tracing hard irq events... Hit Ctrl-C to end.") else: b.attach_kprobe(event="handle_irq_event_percpu", fn_name="trace_start") b.attach_kretprobe(event="handle_irq_event_percpu", fn_name="trace_completion") print("Tracing hard irq event time... Hit Ctrl-C to end.") # output exiting = 0 if args.interval else 1 dist = b.get_table("dist") while (1): try: sleep(int(args.interval)) except KeyboardInterrupt: exiting = 1 print() if args.timestamp: print("%-8s\n" % strftime("%H:%M:%S"), end="")
bpf_text = bpf_text.replace('PID_FILTER', '') if args.signal: bpf_text = bpf_text.replace('SIGNAL_FILTER', 'if (sig != %s) { return 0; }' % args.signal) else: bpf_text = bpf_text.replace('SIGNAL_FILTER', '') if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() # initialize BPF b = BPF(text=bpf_text) kill_fnname = b.get_syscall_fnname("kill") b.attach_kprobe(event=kill_fnname, fn_name="syscall__kill") b.attach_kretprobe(event=kill_fnname, fn_name="do_ret_sys_kill") # header print("%-9s %-6s %-16s %-4s %-6s %s" % ("TIME", "PID", "COMM", "SIG", "TPID", "RESULT")) # process event def print_event(cpu, data, size): event = b["events"].event(data) if (args.failed and (event.ret >= 0)): return printb(b"%-9s %-6d %-16s %-4d %-6d %d" % (strftime("%H:%M:%S").encode('ascii'), event.pid, event.comm,
if args.pid: bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) else: bpf_text = bpf_text.replace('FILTER_PID', '0') if debug: print(bpf_text) # load BPF program b = BPF(text=bpf_text) # Common file functions. See earlier comment about generic_file_read_iter(). b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry") b.attach_kprobe(event="btrfs_file_write_iter", fn_name="trace_entry") b.attach_kprobe(event="generic_file_open", fn_name="trace_open_entry") b.attach_kprobe(event="btrfs_sync_file", fn_name="trace_entry") b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return") b.attach_kretprobe(event="btrfs_file_write_iter", fn_name="trace_write_return") b.attach_kretprobe(event="generic_file_open", fn_name="trace_open_return") b.attach_kretprobe(event="btrfs_sync_file", fn_name="trace_fsync_return") print("Tracing btrfs operation latency... Hit Ctrl-C to end.") # output exiting = 0 dist = b.get_table("dist") while (1): try: if args.interval: sleep(int(args.interval)) else: sleep(99999999)
def main(): examples = '''Examples: deadlock 181 # Analyze PID 181 deadlock 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0 # Analyze PID 181 and locks from this binary. # If tracing a process that is running from # a dynamically-linked binary, this argument # is required and should be the path to the # pthread library. deadlock 181 --verbose # Analyze PID 181 and print statistics about # the mutex wait graph. deadlock 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \\ --unlock-symbols my_mutex_unlock1,my_mutex_unlock2 # Analyze PID 181 and trace custom mutex # symbols instead of pthread mutexes. deadlock 181 --dump-graph graph.json # Analyze PID 181 and dump the mutex wait # graph to graph.json. ''' parser = argparse.ArgumentParser( description=( 'Detect potential deadlocks (lock inversions) in a running binary.' '\nMust be run as root.' ), formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples, ) parser.add_argument('pid', type=int, help='Pid to trace') # Binaries with `:` in the path will fail to attach uprobes on kernels # running without this patch: https://lkml.org/lkml/2017/1/13/585. # Symlinks to the binary without `:` in the path can get around this issue. parser.add_argument( '--binary', type=str, default='', help='If set, trace the mutexes from the binary at this path. ' 'For statically-linked binaries, this argument is not required. ' 'For dynamically-linked binaries, this argument is required and ' 'should be the path of the pthread library the binary is using. ' 'Example: /lib/x86_64-linux-gnu/libpthread.so.0', ) parser.add_argument( '--dump-graph', type=str, default='', help='If set, this will dump the mutex graph to the specified file.', ) parser.add_argument( '--verbose', action='store_true', help='Print statistics about the mutex wait graph.', ) parser.add_argument( '--lock-symbols', type=strlist, default=['pthread_mutex_lock'], help='Comma-separated list of lock symbols to trace. Default is ' 'pthread_mutex_lock. These symbols cannot be inlined in the binary.', ) parser.add_argument( '--unlock-symbols', type=strlist, default=['pthread_mutex_unlock'], help='Comma-separated list of unlock symbols to trace. Default is ' 'pthread_mutex_unlock. These symbols cannot be inlined in the binary.', ) args = parser.parse_args() if not args.binary: try: args.binary = os.readlink('/proc/%d/exe' % args.pid) except OSError as e: print('%s. Is the process (pid=%d) running?' % (str(e), args.pid)) sys.exit(1) bpf = BPF(src_file=b'deadlock.c') # Trace where threads are created bpf.attach_kretprobe(event=bpf.get_syscall_fnname('clone'), fn_name='trace_clone') # We must trace unlock first, otherwise in the time we attached the probe # on lock() and have not yet attached the probe on unlock(), a thread can # acquire mutexes and release them, but the release events will not be # traced, resulting in noisy reports. for symbol in args.unlock_symbols: try: bpf.attach_uprobe( name=args.binary, sym=symbol, fn_name='trace_mutex_release', pid=args.pid, ) except Exception as e: print('%s. Failed to attach to symbol: %s' % (str(e), symbol)) sys.exit(1) for symbol in args.lock_symbols: try: bpf.attach_uprobe( name=args.binary, sym=symbol, fn_name='trace_mutex_acquire', pid=args.pid, ) except Exception as e: print('%s. Failed to attach to symbol: %s' % (str(e), symbol)) sys.exit(1) def print_stack_trace(stack_id): '''Closure that prints the symbolized stack trace.''' for addr in bpf.get_table('stack_traces').walk(stack_id): line = bpf.sym(addr, args.pid) # Try to symbolize with objdump if we cannot with bpf. if line == '[unknown]': symbol = symbolize_with_objdump(args.binary, addr) if symbol: line = symbol print('@ %016x %s' % (addr, line)) print('Tracing... Hit Ctrl-C to end.') while True: try: # Map of child thread pid -> parent info thread_info = { child.value: (parent.parent_pid, parent.stack_id, parent.comm) for child, parent in bpf.get_table('thread_to_parent').items() } # Mutex wait directed graph. Nodes are mutexes. Edge (A,B) exists # if there exists some thread T where lock(A) was called and # lock(B) was called before unlock(A) was called. graph = DiGraph() for key, leaf in bpf.get_table('edges').items(): graph.add_edge( key.mutex1, key.mutex2, thread_pid=leaf.thread_pid, thread_comm=leaf.comm.decode('utf-8'), first_mutex_stack_id=leaf.mutex1_stack_id, second_mutex_stack_id=leaf.mutex2_stack_id, ) if args.verbose: print( 'Mutexes: %d, Edges: %d' % (len(graph.nodes()), len(graph.edges())) ) if args.dump_graph: with open(args.dump_graph, 'w') as f: data = graph.node_link_data() f.write(json.dumps(data, indent=2)) cycle = find_cycle(graph) if cycle: print_cycle( args.binary, graph, cycle, thread_info, print_stack_trace ) sys.exit(1) time.sleep(1) except KeyboardInterrupt: break