def get_tentative_arg(pid, funcname, argN):
    """Return a tentative value for argument #argN of <funcname> on the
    stack of <pid>, or None if it cannot be recovered.

    Registers recovered from the disassembled frames are matched against
    the x86_64 argument-register order (module-level __ARG_REG).  If more
    than one frame matches <funcname>, the value from the last matching
    frame wins (each match overwrites the previous one).
    """
    result = None
    # Do not memoize: register recovery should be redone on every call.
    stack = exec_bt("foreach {} bt".format(pid), MEMOIZE=False)[0]
    with DisasmFlavor('att'):
        search_for_registers(stack)
        for frame in stack.frames:
            if funcname not in frame.func:
                continue
            if not frame.lookup_regs:
                continue
            for reg in frame.reg:
                if reg in __ARG_REG and __ARG_REG.index(reg) == argN:
                    # frame.reg[reg] is an (address, confidence) pair;
                    # we only need the address here.
                    result = frame.reg[reg][0]
                    break
    return result
def get_interesting_arguments(pid, re_funcnames, re_ctypes):
    """Yield (funcname, structname, addr) for interesting stack arguments.

    Walks the stack of <pid>; for every frame whose function matches
    re_funcnames and for which argument registers could be recovered,
    each register is mapped to the function prototype.  Arguments whose
    C type matches re_ctypes and parses as a struct pointer (per the
    module-level __re_struct pattern) are yielded as
    (function name, struct name, tentative address).
    """
    # x86_64 SysV integer-argument registers, in argument order.
    __ARG_REG = ('RDI', 'RSI', 'RDX', 'RCX', 'R8', 'R9')
    s = exec_bt("bt {}".format(pid), MEMOIZE=False)[0]
    with DisasmFlavor('att'):
        for f in search_for_registers(s, re_funcnames):
            if (not f.lookup_regs):
                continue
            if (not f.func):
                continue
            argprotos = funcargs(f.func)
            if (not argprotos):
                # Prototype unknown - we cannot map registers to arguments.
                continue
            nargs = len(argprotos)
            for reg in f.reg:
                # f.func was already verified non-empty above, so only the
                # register membership needs checking here.
                if (not reg in __ARG_REG):
                    continue
                index = __ARG_REG.index(reg)
                if (index >= nargs):
                    continue
                ctype = argprotos[index]
                if (not re_ctypes.search(ctype)):
                    continue
                # f.reg[reg] is an (address, confidence) pair.
                addr = f.reg[reg][0]
                m = __re_struct.match(ctype)
                if (not m):
                    continue
                sname = m.group(1)
                yield (f.func, sname, addr)
def do_check():
    """Check for the adclient/abrt-hook-ccpp coredump-pipe interaction.

    Looks for 'adclient' threads that are both in do_coredump() and in
    TASK_UNINTERRUPTIBLE state; if found and core_pattern pipes cores to
    a helper, checks whether abrt-hook-ccpp has a UNIX socket connected
    to the Centrify daemon and logs the module-level __txt message.
    """
    tt = TaskTable()
    has_do_coredump = False
    has_UN = False
    for task in tt.getThreadsByComm('adclient'):
        # NOTE(review): 'pid' is assigned but never used below.
        pid = task.pid
        stack = exec_bt("bt {}".format(task.pid))[0]
        if (stack.hasfunc('do_coredump')):
            has_do_coredump = True
        if (task.ts.state & TASK_STATE.TASK_UNINTERRUPTIBLE):
            has_UN = True
        if (has_do_coredump and has_UN):
            break
    else:
        # for/else: loop completed without break - no thread satisfied
        # both conditions, nothing to report.
        return
    # After some commands issued, GDB returns incorrect type for this -
    # 'char core_pattern[];' instead of ' char core_pattern[CORENAME_MAX_SIZE]'
    # so read the raw memory instead of using readSymbol().
    addr = sym2addr("core_pattern")
    core_pattern = SmartString(readmem(addr, 1), addr, None)
    #core_pattern = readSymbol("core_pattern")
    # A leading '|' means cores are piped to a userspace helper.
    if (not core_pattern.startswith("|")):
        return
    abrt_hook = tt.getByComm('abrt-hook-ccpp')
    if (not abrt_hook):
        return
    __daemon = "/var/centrifydc/daemon"
    # Inspect every socket owned by the abrt hook task; for UNIX sockets
    # check whether either endpoint path belongs to the Centrify daemon.
    for sock in abrt_hook[0].get_task_socks():
        family, sktype, protoname, inet = decodeSock(sock)
        if (protoname == 'UNIX'):
            sock = sock.castTo("struct unix_sock")
            state, ino, s_path = unix_sock(sock)
            p_state, p_ino, p_path = unix_sock(sock.Peer)
            for path in (s_path, p_path):
                if (path.startswith(__daemon)):
                    # __txt is a module-level message - TODO confirm it is
                    # defined elsewhere in this file.
                    pylog.info(__txt)
def run_check_on_multipath():
    """Scan the dump for device-mapper/multipath hang signatures.

    Counts UN (uninterruptible) threads, detects whether multipathd is
    running/blocked, whether scsi/fc work queues are blocked, whether
    multipath targets exist in the dm table, and whether many kworkers
    are stuck flushing IO to mdraid devices; prints a human-readable
    diagnosis for each combination found.
    """
    tt = TaskTable()
    bts = []
    errors = 0
    task_cnt = 0
    multipathd_daemon = 0  # To verify if multipathd daemon is running
    multipath_blocked = 0  # To verify if multipathd daemon or command is blocked
    mpath_present = 0  # To verify if multipath device exists with or without
                       # multipathd daemon running
    wq_blocked = 0  # To verify if scsi_wq or fc_wq is blocked
    kworker_md_blocked = 0  # Counter for hung worker threads which are waiting for
                            # IO requests on mdraid devices
    print("\nChecking for device-mapper issues...\n")
    for t in tt.allThreads():
        # Progress indicator (carriage return keeps it on one line).
        print("Getting a list of processes in UN state..."
              "(Count: {:d})".format(task_cnt), end="\r")
        if ('multipathd' in t.comm):
            multipathd_daemon = 1
        if (t.ts.state & TASK_STATE.TASK_UNINTERRUPTIBLE):
            task_cnt += 1
            # crash can miss some threads when there are pages missing
            # and it will not do 'bt' in that case.
            # NOTE(review): bare 'except' is deliberate best-effort here,
            # but narrowing it to 'except Exception' would be safer.
            try:
                bts.append(exec_bt("bt %d" % t.pid)[0])
            except:
                pass
    print("Getting a list of processes in UN state...\t\t\t[Done]")
    if (task_cnt):
        print("\nProcessing the back trace of hung tasks...\t\t\t", end='')
        for bt in bts:
            # kworkers stuck flushing md IO indicate a possible mdraid
            # thundering-herd problem (threshold checked below).
            if ('kworker' in bt.cmd):
                if (bt.hasfunc('md_flush_request') and bt.hasfunc('dio_aio_complete_work')):
                    kworker_md_blocked += 1
            if ('multipath' in bt.cmd):
                multipath_blocked = 1
            if (('scsi_wq' in bt.cmd) or ('fc_wq' in bt.cmd)):
                wq_blocked = 1
        print("[Done]")
    # Checks for dm devices
    # NOTE(review): 'devlist' is not defined in this function - presumably a
    # module-level list of (md, name) pairs; confirm elsewhere in the file.
    for dev in devlist:
        md, name = dev
        dm_table_map = StructResult("struct dm_table", md.map)
        # Check if there is any multipath device present in device-mapper table
        if (dm_table_map.targets.type.name == "multipath"):
            mpath_present += 1
    # Check if kworker threads are stuck waiting to flush IO on mdraid devices
    if (kworker_md_blocked >= 5):
        print(
            "\n ** {} kworker threads are stuck in UN state waiting to flush the IO"
            "\n requests on mdraid devices. This could be a result of thundering"
            "\n herd problem. See reference: "
            "\n https://marc.info/?l=linux-raid&m=155364683109115&w=2".
            format(kworker_md_blocked))
        print(
            "\n Run 'hanginfo' for more information on processes in UN state."
        )
        errors += 1
    # multipath devices are present but multipathd is not running
    if (mpath_present != 0 and multipathd_daemon == 0):
        print(
            "\n ** multipath device(s) are present, but multipathd service is"
            "\n not running. IO failover/failback may not work.")
        errors += 1
    # scsi or fc work queue and multipathd are blocked
    if (multipath_blocked == 1 and wq_blocked == 1):
        print(
            "\n ** multipathd and scsi/fc work_queue processes are stuck in UN state,"
            "\n this could block IO failover on multipath devices")
        print(
            "\n Run 'hanginfo' for more information on processes in UN state."
        )
        errors += 1
    # only multipathd process is stuck in UN state
    elif (multipath_blocked == 1):
        print("\n ** multipathd processes stuck in UN state,"
              "\n this could block IO failover on multipath devices")
        print(
            "\n Run 'hanginfo' for more information on processes in UN state."
        )
        errors += 1
    # No specific signature matched, but there are UN tasks worth a look.
    if (errors == 0 and task_cnt != 0):
        print("\n No device-mapper, multipath issues detected by utility,"
              "\n but found {} processes in UN state.".format(task_cnt))
        print(
            "\n Run 'hanginfo' for more information on processes in UN state."
        )
    elif (errors == 0 and task_cnt == 0):
        print("No issues detected by utility.")
def printTasks(reverse=False, maxtoprint=-1):
    """Print a table of tasks, optionally filtered/limited/reordered.

    reverse=False: tasks in PID order grouped by thread-group leader;
    reverse=True: all threads sorted by how recently they ran.
    maxtoprint limits the output, keeping the first and last halves and
    inserting a <snip> marker between them (-1 means print everything).
    Relies on module-level globals: debug, taskstates_filter, runcmd,
    verbose.
    """
    tt = TaskTable()
    if (debug):
        print("Uptime:", ms2uptime(tt.basems))
    out = []
    if (not reverse):
        # Natural order (task followed by its threads)
        for mt in tt.allTasks():
            out.append((mt.Ran_ago, mt.pid, mt))
            for t in mt.threads:
                #print (" struct thread_info 0x%x" % long(t))
                out.append((t.Ran_ago, t.pid, t))
        hdr = 'Tasks in PID order, grouped by Thread Group leader'
    else:
        # Most recent first
        for t in tt.allThreads():
            out.append((t.Ran_ago, t.pid, t))
        out.sort()
        hdr = 'Tasks in reverse order, scheduled recently first'
    # Apply the filter: keep only tasks whose 2-letter state code
    # (e.g. 'UN', 'RU') is in taskstates_filter.
    if (taskstates_filter):
        out1 = []
        for *group, t in out:
            sstate = t.state[5:7]
            if (sstate in taskstates_filter):
                out1.append((*group, t))
        out = out1
    nthreads = len(out)
    if (maxtoprint != -1 and maxtoprint < nthreads):
        # Split them 1:1 - show the first half and the last half, with a
        # (None, None, None) sentinel marking the snipped middle.
        nbeg = maxtoprint // 2
        nend = maxtoprint - nbeg
        out = out[:nbeg] + [(None, None, None)] + out[-nend:]
        extra = " ({} tasks skipped)".format(nthreads - maxtoprint)
    else:
        extra = ''
    # Print the header
    print("=== {}{} ===".format(hdr, extra))
    _header = " PID CMD CPU Ran ms ago STATE\n" +\
        "-------- ------------ -- ------------- -----"
    # With runcmd set, the header is re-printed per task instead (below).
    if (not runcmd):
        print(_header)
    for ran_ms_ago, pid, t in out:
        if (pid is None):
            # The snip sentinel inserted above.
            print(" <snip>")
            continue
        sstate = t.state[5:7]
        tgid = t.tgid
        pid_template = " {:6d}"
        if (pid != tgid):
            if (not reverse):
                pid_template = " {:6d}"
            extra = " (tgid=%d)" % tgid
        else:
            extra = ""
        uid = t.Uid
        pid_s = pid_template.format(pid)
        extra = "%13s UID=%d" % (extra, uid)
        # Mark the task currently running on a CPU with a leading '>'.
        if (is_task_active(long(t.ts))):
            pid_s = ">" + pid_s[1:]
        # NOTE(review): redundant re-assignment - uid was already set above.
        uid = t.Uid
        # Thread pointers might be corrupted
        try:
            if (runcmd):
                print(_header)
            print ("%s %14s %3d %14d %s %s" \
                % (pid_s, t.comm, t.cpu, int(ran_ms_ago), sstate, extra))
            if (runcmd):
                _cmdline = "{} {}".format(runcmd, pid)
                print("\ncrash> {}".format(_cmdline))
                # NOTE(review): rebinding 'out' here shadows the list being
                # iterated; iteration is unaffected (iterator already bound)
                # but a different name would be clearer.
                out = exec_crash_command(_cmdline)
                if (": command not found: " in out):
                    sys.exit(1)
                print(out)
            # In versbose mode, print stack as well
            if (verbose):
                bt = exec_bt("bt %d" % pid)
                print(bt[0])
            if (verbose or runcmd):
                print("\n", "-" * 78, "\n", sep='')
        except crash.error:
            pylog.error("corrupted", t)
else: btcmd = "foreach " + args.pid + " " + bts # Make sure we're on an x86_64 vmcore, or this will fail miserably. if (sys_info.machine != "x86_64"): print("Register decoding is supported on x86_64 dumps only.") sys.exit() # Purge the memoize cache if a 'mod' command has been done since # our last invocation, since new symbols may not be available purge_memoize_cache(CU_LOAD) with DisasmFlavor('att'): try: stacklist = exec_bt(btcmd, MEMOIZE=False) except: print("Unable to get stack trace") sys.exit() for s in stacklist: search_for_registers(s, routine) print("\nPID: {} TASK: {:x} CPU: {} COMMAND: {}".format( s.pid, s.addr, s.cpu, s.cmd)) for f in s.frames: # Skip frame if it doesn't match routine name pattern. # If no routine was specified, the frame will print because
def classify_UN(v):
    """Classify all TASK_UNINTERRUPTIBLE threads by what they wait on.

    Runs a sequence of classification checks; each check prints the
    threads it recognizes and removes them from 'tasksrem'.  Whatever
    remains is printed as non-classified (with merged stacks), followed
    by the resource owners recorded by the checks.

    The 'v' parameter is unused here - presumably a verbosity flag kept
    for interface compatibility; TODO confirm against callers.
    """
    # Reset owners
    resource_owners_clear()
    # We get a list of UN tasks
    tasksrem = getUNTasks()
    if (not tasksrem):
        print("There are no UNINTERRUPTIBLE tasks")
        return
    print(" *** UNINTERRUPTIBLE threads, classified ***")
    # Now we do a number of tests trying to classify the threads.
    # Every time we succeed, we remove these threads from tasksrem
    check_stack_and_print('io_schedule', tasksrem)
    check_stack_and_print('btrfs_tree_read_lock', tasksrem)
    check_inode_mutexes(tasksrem)
    check_other_mutexes(tasksrem)
    check_mmap_sem(tasksrem)
    check_congestion_queues(tasksrem)
    check_kthread_create_list(tasksrem)
    check_throttle_direct_reclaim(tasksrem)
    check_console_sem(tasksrem)
    check_stack_and_print('schedule_timeout', tasksrem)
    check_stack_and_print('alloc_pages_slowpath', tasksrem)
    check_stack_and_print('nfs_idmap_id', tasksrem, "NFS idmapper")
    if (tasksrem):
        print("\n\n ******** Non-classified UN Threads ********** {}"
              " in total".format(len(tasksrem)))
        # Print what remains
        btlist = []
        for pid in tasksrem:
            # exec_bt may return an empty list for missing stacks.
            try:
                btlist.append(exec_bt("bt %d" % pid)[0])
            except IndexError:
                pylog.warning("Cannot get stack for PID={}".format(pid))
        #btlist = [exec_bt("bt %d" % pid)[0] for pid in tasksrem]
        bt_mergestacks(btlist, verbose=1)
        #print(un)
    # Print resource owners. We have two kinds: real pids and pseudo-owners,
    # such as "io_schedule"
    __real_owners = {x for x in __resource_owners if isinstance(x, int)}
    __pseudo_owners = __resource_owners - __real_owners
    if (__real_owners):
        print("\n*** Threads that own resources the other threads are"
              " waiting on ***")
        for pid in __real_owners:
            s = exec_bt("bt {}".format(pid))[0]
            print(s)
            print(__resource_owner_extra[pid])
    if (__pseudo_owners):
        print("\n*** System activities other threads are waiting for ***")
        for pid in __pseudo_owners:
            print(" --- Doing {} ---".format(pid))
            print(__resource_owner_extra[pid])
    # Are any of these owners looping in zone allocator?
    #_owners = zvm_pids & rem
    #if (_owners):
    #    print("    Looping in zone allocator:", list(_owners))
    return
    # The following code is intentionally unreachable -
    # it is not ready yet
    for vfsmount, superblk, fstype, devname, mnt in getMount():
        sb = readSU("struct super_block", superblk)
        um = sb.s_umount
        if (um.count):
            print(devname, sb)
        bdi = sb.s_bdi
        if (bdi):
            print_backing_dev_info(bdi)