def update_omsadmin():
    """Load the key=value pairs of the omsadmin conf file into general_info.

    Every line of CONF_PATH is split on its first '='. The text before it is
    used as the key and everything after it (including any further '='
    characters) is stored as the value.

    Returns:
        NO_ERROR once the whole file has been read, ERR_SUDO_PERMS when the
        file cannot be opened because of permissions, or ERR_FILE_MISSING
        when it does not exist. Both error paths first append a tuple naming
        the file to error_info.

    Raises:
        IOError: re-raised for any errno other than EACCES / ENOENT.
    """
    try:
        with open(CONF_PATH, 'r') as conf_file:
            for raw_line in conf_file:
                # partition splits on the first '=' only, so later '='
                # characters stay inside the value
                key, _, value = raw_line.rstrip('\n').partition('=')
                general_info[key] = value
    except IOError as err:
        if err.errno == errno.EACCES:
            error_info.append((CONF_PATH, ))
            return ERR_SUDO_PERMS
        if err.errno == errno.ENOENT:
            error_info.append(('file', CONF_PATH))
            return ERR_FILE_MISSING
        raise
    return NO_ERROR
def no_clconf(interactive):
    """Explain that customlog.conf is missing and decide how to continue.

    Always prints instructions for pulling the configuration manually.

    Args:
        interactive: when falsy (silent mode) the function reports the
            problem immediately; when truthy the user is asked whether a
            manual pull was already attempted.

    Returns:
        ERR_BACKEND_CONFIG in silent mode, or when the user already pulled
        the config and CLCONF_PATH still does not exist (the two config log
        paths are appended to error_info in both cases); NO_ERROR when the
        user pulled the config and the file now exists; ERR_FOUND when the
        user has not tried pulling yet.
    """
    divider = "--------------------------------------------------------------------------------"
    print(divider)
    print(" The troubleshooter cannot find the customlog.conf file. If the custom log \n"
          " configuration was just applied in portal, it takes up to 15 minutes for the \n"
          " agent to pick the new configuration.\n"
          " You can manually pull the config from the OMS backend by running this command:\n"
          "\n $ sudo su omsagent -c 'python /opt/microsoft/omsconfig/Scripts/PerformRequiredConfigurationChecks.py'\n")

    # silent mode: nobody to ask, so report the missing config right away
    if not interactive:
        print(" (NOTE: if you aren't using custom logs, please ignore this message.)")
        error_info.append((OMSCONFLOG_PATH, OMSCONFLOGDET_PATH))
        return ERR_BACKEND_CONFIG

    # interactive mode: find out whether a manual pull was already attempted
    manual_pull = get_input("Have you already tried pulling the config manually? (y/n)",
                            (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                            "Please type either 'y'/'yes' or 'n'/'no' to proceed.")

    if manual_pull.lower() not in ['y', 'yes']:
        # not tried yet: point the user back at the command printed above
        print(" Please try running the above command to pull the config file.")
        return ERR_FOUND

    if os.path.isfile(CLCONF_PATH):
        # the manual pull produced the config file
        print("The config file has been pulled successfully.")
        print("Continuing on with troubleshooter...")
        print(divider)
        return NO_ERROR

    # pulled manually but the config is still missing
    # TODO: check the log files for an error in DSC
    error_info.append((OMSCONFLOG_PATH, OMSCONFLOGDET_PATH))
    return ERR_BACKEND_CONFIG
def check_omi_cpu():
    """Run the OMI CPU script and hand over to check_output_file() on success.

    SCRIPT_FILE is run through bash with a one minute runtime and a CPU
    threshold of 80. The script counts as started when any line of its
    combined stdout/stderr begins with the "Traces will be saved" banner.

    Returns:
        The result of check_output_file() when the banner is found;
        otherwise ERR_OMICPU, after appending the script output to
        error_info. A non-zero exit status is handled the same way, using
        the output captured in the CalledProcessError.
    """
    started_banner = "Traces will be saved to this file: "
    command = ['bash', SCRIPT_FILE, '--runtime-in-min', '1', '--cpu-threshold', '80']
    try:
        script_output = subprocess.check_output(command, universal_newlines=True,
                                                stderr=subprocess.STDOUT)
        if any(out_line.startswith(started_banner)
               for out_line in script_output.split('\n')):
            # the script got going, so inspect what it wrote
            return check_output_file()
        # the script ran but never reported that it started tracing
        error_info.append((script_output,))
    except subprocess.CalledProcessError as err:
        # the process exited with a non-zero status
        error_info.append((err.output,))
    return ERR_OMICPU
def start_omsagent(workspace, enabled=False):
    """Try to enable (if needed) and start omsagent through SC_PATH.

    Args:
        workspace: workspace ID passed on to check_omsagent_running() once
            the agent has been started.
        enabled: when False, `SC_PATH enable` is run before `SC_PATH start`;
            the start is only attempted if enabling succeeded.

    Returns:
        The result of check_omsagent_running(workspace) when the start
        succeeded; ERR_FILE_MISSING (after appending to error_info) when the
        control script does not exist; ERR_OMS_WONT_RUN for any other
        failure, so callers never receive an implicit None.

    Raises:
        OSError: if launching the script fails for a reason other than the
            script being absent.
    """
    import errno  # local import: the module-level import list is not visible here

    print("Agent curently not running. Attempting to start omsagent...")
    try:
        result = 0
        # enable the agent if necessary
        if not enabled:
            result = subprocess.call([SC_PATH, 'enable'])
        # start the agent only if enabling succeeded
        if result == 0:
            result = subprocess.call([SC_PATH, 'start'])
    except OSError as err:
        # without a shell, a missing executable raises instead of yielding
        # the shell's "command not found" status; map it onto that status
        if err.errno != errno.ENOENT:
            raise
        result = 127

    if result == 0:
        return check_omsagent_running(workspace)
    if result == 127:
        # script doesn't exist
        error_info.append(('executable shell script', SC_PATH))
        return ERR_FILE_MISSING
    # enable/start failed for some other reason: the agent is still not running
    return ERR_OMS_WONT_RUN
def check_oms(interactive):
    """Validate the installed OMS version, then refresh the omsadmin info.

    Args:
        interactive: when truthy and a newer release is known, the user is
            asked (via ask_update_old_version) whether to update.

    Returns:
        ERR_OMS_INSTALL when no installed version is found;
        ERR_OLD_OMS_VER when it is older than 1.11;
        ERR_GETTING_OMS_VER when the latest version could not be determined
        although no connection error was reported;
        the code from check_urlopen_errs() when looking up the latest
        version failed for a reason other than ERR_ENDPT;
        USER_EXIT when the user chooses to leave from the update prompt;
        otherwise the result of update_omsadmin().
    """
    cpu_bits = geninfo_lookup('CPU_BITS')

    oms_version = get_oms_version()
    if oms_version is None:
        return ERR_OMS_INSTALL

    # anything older than 1.11 is reported as too old
    if not comp_versions_ge(oms_version, '1.11'):
        error_info.append((oms_version, cpu_bits))
        return ERR_OLD_OMS_VER

    # look up the most recent released version
    (curr_oms_version, e) = get_curr_oms_version(OMSAGENT_URL)

    if curr_oms_version is not None:
        # latest version known: offer an update when running interactively
        outdated = interactive and (not comp_versions_ge(oms_version, curr_oms_version))
        if outdated and (ask_update_old_version(oms_version, curr_oms_version, cpu_bits) == USER_EXIT):
            return USER_EXIT
    elif e is None:
        # no connection error was reported, so the response could not be parsed
        return ERR_GETTING_OMS_VER
    else:
        checked_urlopen = check_urlopen_errs(e)
        if checked_urlopen != ERR_ENDPT:
            # general connectivity / ssl package problem
            error_info.append((OMSAGENT_URL, e))
            return checked_urlopen
        # only this one endpoint is unreachable: warn and skip the check
        print("WARNING: can't connect to {0}: {1}\n Skipping this check...".format(OMSAGENT_URL, e))
        print("--------------------------------------------------------------------------------")

    return update_omsadmin()
def _vm_version_matches(vm_ver_split, supported_ver):
    """Return True when each component of supported_ver equals the VM's."""
    for (idx, supported_ver_num) in enumerate(supported_ver.split('.')):
        try:
            # compare by value: identity ('is') only happens to work for
            # small integers and fails for e.g. year-style versions
            if int(vm_ver_split[idx]) != int(supported_ver_num):
                return False
        except (IndexError, ValueError):
            # VM version has fewer components, or a component isn't numeric
            return False
    return True


def check_vm_supported(vm_dist, vm_ver):
    """Check the VM's distribution and version against supported_dists.

    A distribution matches when the lower-cased vm_dist starts with one of
    the keys of supported_dists. A version matches when every dot-separated
    component of a supported version equals the corresponding component of
    vm_ver (extra trailing components of vm_ver are ignored).

    Args:
        vm_dist: name of the VM's distribution.
        vm_ver: dot-separated version string of the VM.

    Returns:
        NO_ERROR when a supported distribution/version pair matches;
        ERR_OS_VER when the distribution is supported but the version is
        not (error_info receives the distribution, version and the
        alternatives from get_alternate_versions());
        ERR_OS when the distribution is not supported at all.
    """
    vm_supported_dist = None
    vm_dist_lower = vm_dist.lower()
    vm_ver_split = vm_ver.split('.')

    for supported_dist in supported_dists:
        if not vm_dist_lower.startswith(supported_dist):
            continue
        vm_supported_dist = supported_dist
        # any supported version of this distribution is good enough
        if any(_vm_version_matches(vm_ver_split, supported_ver)
               for supported_ver in supported_dists[supported_dist]):
            return NO_ERROR

    # VM distribution is supported, but not current version
    if vm_supported_dist is not None:
        alt_vers = get_alternate_versions(vm_supported_dist)
        error_info.append((vm_dist, vm_ver, alt_vers))
        return ERR_OS_VER

    # VM distribution isn't supported
    error_info.append((vm_dist, ))
    return ERR_OS
def check_syslogdest(sys_bind, sys_pt):
    """Verify the syslog destination file against the OMS syslog settings.

    Two kinds of lines in SYSLOGDEST_PATH are inspected:
      * the "# OMS Syslog collection for workspace <id>" comment, whose
        workspace must equal the configured WORKSPACE_ID, and
      * facility/severity selector lines, whose second whitespace-separated
        field (the forwarding target) is validated with check_port().
    All other lines are ignored.

    Args:
        sys_bind: bind value from the syslog configuration, forwarded to
            check_port().
        sys_pt: protocol type from the syslog configuration, forwarded to
            check_port().

    Returns:
        ERR_INFO_MISSING when no workspace ID is known;
        ERR_SYSLOG_WKSPC when the file names a different workspace;
        the first non-NO_ERROR result of check_port();
        NO_ERROR when every inspected line is fine.
    """
    # get workspace id
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # set up regex lines
    comment_line = r"# OMS Syslog collection for workspace (\S+)"
    spec_line = (r"(\w+).=alert;(\w+).=crit;(\w+).=debug;(\w+).=emerg;(\w+).=err;"
                 r"(\w+).=info;(\w+).=notice;(\w+).=warning")

    with open(SYSLOGDEST_PATH, 'r') as syslogdest_file:
        for line in syslogdest_file:
            line = line.rstrip('\n')
            # skip empty lines
            if line == '':
                continue

            # check if workspace for syslog collection lines up
            match_comment = re.match(comment_line, line)
            if match_comment is not None:
                syslog_wkspc = match_comment.group(1)
                if workspace_id != syslog_wkspc:
                    error_info.append((syslog_wkspc, workspace_id, SYSLOGCONF_PATH))
                    return ERR_SYSLOG_WKSPC
                continue

            # check if port is correct; only selector lines that carry a
            # forwarding target are of interest
            parsed_line = line.split()
            if len(parsed_line) < 2:
                continue
            if re.match(spec_line, parsed_line[0]) is None:
                continue
            # argument order follows check_port(port, sys_bind, sys_pt)
            checked_port = check_port(parsed_line[1], sys_bind, sys_pt)
            if checked_port != NO_ERROR:
                return checked_port

    # all ports set up correctly
    return NO_ERROR
def check_log_rotation():
    """Parse the workspace's logrotate config and validate it.

    The module-level LR_CONFIG_PATH is formatted with the workspace ID (and
    stored back into the global), then the file is read into a mapping of
    "<log path>" -> set of option lines found between "<log path> {" and
    the closing "}". The mapping is handed to check_size_config().

    Returns:
        ERR_INFO_MISSING when no workspace ID is known;
        ERR_FILE_MISSING when the logrotate config file does not exist;
        otherwise the result of check_size_config() (NO_ERROR on success).
    """
    # update logrotate config path with wsid
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    global LR_CONFIG_PATH
    LR_CONFIG_PATH = LR_CONFIG_PATH.format(workspace_id)

    # check logrotate config file exists
    if not os.path.isfile(LR_CONFIG_PATH):
        error_info.append(('logrotate config file', LR_CONFIG_PATH))
        return ERR_FILE_MISSING

    # go through logrotate config file
    logrotate_configs = dict()
    with open(LR_CONFIG_PATH, 'r') as f:
        in_file = None
        for lr_line in f:
            lr_line = lr_line.rstrip('\n')

            # start of log rotation config
            lr_start = re.match(r"^(\S+) \{$", lr_line)
            if lr_start is not None:
                in_file = lr_start.group(1)
                logrotate_configs[in_file] = set()
                continue

            # end of log rotation config; this must be tested before the
            # "inside a section" case, otherwise the section never closes
            # and every later line is attributed to it
            if lr_line.strip() == '}':
                in_file = None
                continue

            # log rotation config info
            if in_file is not None:
                logrotate_configs[in_file].add(lr_line.lstrip())

    # check size rotation working
    checked_size_config = check_size_config(logrotate_configs)
    if checked_size_config != NO_ERROR:
        return checked_size_config
    return NO_ERROR
def scan_top_files(num_files, tto):
    """Find the largest files on the machine and watch them for growth.

    Runs `find / -type f -exec du -S {} + | sort -rh | head -n <num_files>`
    and records, for each listed file that can still be stat'ed, the tuple
    (path, initial size, current size, mtime, changes). check_top_files()
    is then called once per second for tto seconds on that list.

    Args:
        num_files: how many of the largest files to track.
        tto: number of one-second polling rounds.

    Returns:
        WARN_LARGE_FILES when any tracked file ends up larger than it
        started (one error_info entry per such file), else NO_ERROR.
    """
    top_files = []
    with open(os.devnull, 'w') as devnull:
        # arguments go straight to find (no shell), so the -exec placeholder
        # must be a bare '{}' rather than a shell-escaped one
        find_cmd = subprocess.Popen(
            ['find', '/', '-type', 'f', '-exec', 'du', '-S', '{}', '+'],
            stdout=subprocess.PIPE, stderr=devnull)
        sort_cmd = subprocess.Popen(['sort', '-rh'], stdin=find_cmd.stdout,
                                    stdout=subprocess.PIPE)
        find_cmd.stdout.close()
        # text mode so the output can be split on '\n' on Python 2 and 3
        head_cmd = subprocess.Popen(['head', '-n', str(num_files)],
                                    stdin=sort_cmd.stdout, stdout=subprocess.PIPE,
                                    universal_newlines=True)
        sort_cmd.stdout.close()
        files = head_cmd.communicate()[0]
        # reap the upstream processes so they don't linger as zombies
        sort_cmd.wait()
        find_cmd.wait()

    # format file list; each line is "<size><whitespace><path>"
    for entry in files.split('\n'):
        fields = entry.split(None, 1)  # maxsplit keeps spaces inside the path
        if len(fields) < 2:
            # blank line (e.g. after the final newline)
            continue
        fpath = fields[1]
        try:
            fstat = os.stat(fpath)
        except OSError:
            # file disappeared between the scan and now
            continue
        top_files.append((fpath, fstat.st_size, fstat.st_size, fstat.st_mtime, []))
    # top_files : [ (fpath1, finitsize1, fsize1, ftime1, [ (fsizechange1, fsizechangetime1), ... ]), ... ]

    # check every second
    for _ in range(tto):
        check_top_files(top_files)
        time.sleep(1)

    # go over each file's changes
    result = NO_ERROR
    for (fpath, finitsize, fsize, ftime, fchanges) in top_files:
        if fsize > finitsize:
            error_info.append((fpath, len(fchanges), ftime))
            # TODO: add to file with more info or smth
            result = WARN_LARGE_FILES
    return result
def check_log_analytics_endpts():
    """Probe every log analytics endpoint over SSL, with and without certs.

    The endpoint list depends on whether the configured OMS endpoint
    contains '.us'; a '*' in an endpoint is replaced by the workspace ID.
    Each endpoint is first checked with SSL_CMD alone. On failure it is
    retried with the certificate and key when both files exist; when they
    do not, a one-time note is printed and the endpoint counts as failed.

    Returns:
        ERR_INFO_MISSING when the OMS endpoint or workspace ID is unknown;
        ERR_ENDPT when at least one endpoint failed (each failure adds an
        (endpoint, command) tuple to error_info); NO_ERROR otherwise.
    """
    # the OMS endpoint decides which set of endpoints applies
    oms_endpt = geninfo_lookup('OMS_ENDPOINT')
    if oms_endpt is None:
        error_info.append(('OMS endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    if '.us' in oms_endpt:
        log_analytics_endpts = [
            "usge-jobruntimedata-prod-1.usgovtrafficmanager.net",
            "usge-agentservice-prod-1.usgovtrafficmanager.net",
            "*.ods.opinsights.azure.us",
            "*.oms.opinsights.azure.us",
        ]
    else:
        log_analytics_endpts = [
            "*.ods.opinsights.azure.com",
            "*.oms.opinsights.azure.com",
            "ods.systemcenteradvisor.com",
        ]

    success = NO_ERROR
    no_certs_printed = False
    for endpt_template in log_analytics_endpts:
        # '*' stands for the workspace ID (a no-op for fixed endpoints)
        endpt = endpt_template.replace('*', workspace_id)

        # first attempt: plain command without client certs
        if check_endpt_ssl(SSL_CMD, endpt):
            continue

        ssl_command = SSL_CMD
        if os.path.isfile(CERT_PATH) and os.path.isfile(KEY_PATH):
            # second attempt: same command plus cert and key
            ssl_command = "{0} -cert {1} -key {2}".format(SSL_CMD, CERT_PATH, KEY_PATH)
            if check_endpt_ssl(ssl_command, endpt):
                continue
        elif not no_certs_printed:
            # lets user know cert and key aren't there (only once)
            print("NOTE: Certificate and key files don't exist, OMS isn't onboarded.")
            no_certs_printed = True

        # record the endpoint with the last command that was tried
        error_info.append((endpt, ssl_command.format(endpt)))
        success = ERR_ENDPT

    return success
def check_sys_invoke_rc(service, controller):
    """Query a service's state by running `<controller> <service> status`.

    The second whitespace-separated field of the first output line (with a
    trailing comma removed) is taken as the status; only 'start/running'
    counts as healthy.

    Args:
        service: name of the service to query.
        controller: executable used to query it.

    Returns:
        NO_ERROR when the status is 'start/running';
        ERR_SYSLOG when the command exits with return code 100;
        ERR_SERVICE_STATUS otherwise, after appending
        (service, status-or-output, controller) to error_info.
    """
    try:
        sys_status = subprocess.check_output([controller, service, 'status'],
                                             universal_newlines=True,
                                             stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # service not on machine
        if e.returncode == 100:
            return ERR_SYSLOG
        error_info.append((service, e.output, controller))
        return ERR_SERVICE_STATUS

    # first line looks like: [service, status+',', 'process', PID]
    first_line = sys_status.split('\n')[0]
    status = first_line.split()[1].rstrip(',')  # goal+'/'+curr_state
    if status == 'start/running':
        # exists and running correctly
        return NO_ERROR

    # exists but not running correctly
    error_info.append((service, status, controller))
    return ERR_SERVICE_STATUS
def check_log_analytics_endpts():
    """Check that every log analytics endpoint passes check_endpt().

    The endpoint list depends on whether the configured OMS endpoint
    contains '.us'; a '*' in an endpoint is replaced by the workspace ID.

    Returns:
        ERR_INFO_MISSING when the OMS endpoint or workspace ID is unknown;
        ERR_ENDPT when at least one endpoint failed (each failure adds a
        one-element tuple with the endpoint to error_info);
        NO_ERROR otherwise.
    """
    # the OMS endpoint decides which set of endpoints applies
    oms_endpt = geninfo_lookup('OMS_ENDPOINT')
    if oms_endpt is None:
        error_info.append(('OMS endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    if '.us' in oms_endpt:
        log_analytics_endpts = [
            "usge-jobruntimedata-prod-1.usgovtrafficmanager.net",
            "usge-agentservice-prod-1.usgovtrafficmanager.net",
            "*.ods.opinsights.azure.us",
            "*.oms.opinsights.azure.us",
        ]
    else:
        log_analytics_endpts = [
            "*.ods.opinsights.azure.com",
            "*.oms.opinsights.azure.com",
            "ods.systemcenteradvisor.com",
        ]

    success = NO_ERROR
    for endpt_template in log_analytics_endpts:
        # '*' stands for the workspace ID (a no-op for fixed endpoints)
        endpt = endpt_template.replace('*', workspace_id)
        if check_endpt(endpt):
            continue
        # endpoint did not respond
        error_info.append((endpt, ))
        success = ERR_ENDPT
    return success
def check_multihoming(workspace):
    """Detect whether more than one workspace directory exists for omsagent.

    Every top-level directory under /var/opt/microsoft/omsagent whose name
    is at least 32 characters long is treated as a potential workspace.

    Args:
        workspace: the workspace ID the agent is expected to use.

    Returns:
        ERR_MULTIHOMING when two or more potential workspaces exist
        (error_info gets a one-element tuple with the comma-separated IDs);
        ERR_FILE_MISSING when none exists;
        ERR_GUID when the single one found differs from workspace
        (error_info gets (found, expected));
        NO_ERROR when the single one found equals workspace.
    """
    omsagent_dir = "/var/opt/microsoft/omsagent"

    # only the top level of directories is of interest; os.walk yields
    # nothing when the directory itself is missing
    (_, directories, _) = next(os.walk(omsagent_dir), (omsagent_dir, [], []))
    potential_workspaces = [d for d in directories if len(d) >= 32]

    # 2+ potential workspaces
    if len(potential_workspaces) > 1:
        workspace_id_list = ', '.join(potential_workspaces)
        # one-element tuple, like every other error_info entry
        error_info.append((workspace_id_list, ))
        return ERR_MULTIHOMING

    # 0 potential workspaces
    if not potential_workspaces:
        missing_dir = "/var/opt/microsoft/omsagent/{0}".format(workspace)
        error_info.append(('Directory', missing_dir))
        return ERR_FILE_MISSING

    # 1 incorrect workspace
    if potential_workspaces[0] != workspace:
        # list.append takes a single argument, so pass the pair as a tuple
        error_info.append((potential_workspaces[0], workspace))
        return ERR_GUID

    # 1 correct workspace
    return NO_ERROR
def check_omsagent_running_ps(workspace):
    """Look for a running omsagent process in the output of `ps -ef`.

    Only lines starting with 'omsagent' are considered. The command part of
    such a line (field 8 onwards) must match the omsagent start command,
    and the three workspace IDs captured from its pid, log and conf paths
    must all be the same; otherwise the line is skipped.

    Args:
        workspace: the workspace ID the agent is expected to use.

    Returns:
        NO_ERROR when the first matching process uses workspace;
        ERR_GUID when it uses a different one (error_info gets
        (found, expected)); ERR_OMS_WONT_RUN when no process matches.
    """
    regx_cmd = (r"/opt/microsoft/omsagent/ruby/bin/ruby /opt/microsoft/omsagent/bin/omsagent "
                r"-d /var/opt/microsoft/omsagent/(\S+)/run/omsagent.pid "
                r"-o /var/opt/microsoft/omsagent/(\S+)/log/omsagent.log "
                r"-c /etc/opt/microsoft/omsagent/(\S+)/conf/omsagent.conf "
                r"--no-supervisor")

    ps_output = subprocess.check_output(['ps', '-ef'], universal_newlines=True)
    for ps_line in ps_output.split('\n'):
        if not ps_line.startswith('omsagent'):
            continue

        # columns: [ UID, PID, PPID, C, STIME, TTY, TIME, CMD... ]
        command = ' '.join(ps_line.split()[7:])
        matches = re.match(regx_cmd, command)
        if matches is None:
            continue

        # all three paths have to name one and the same workspace
        guids = matches.groups()
        if len(set(guids)) != 1:
            continue
        guid = guids[0]

        if workspace != guid:
            # OMS is running, but against a different workspace
            error_info.append((guid, workspace))
            return ERR_GUID
        # OMS currently running and delivering to the correct workspace
        return NO_ERROR

    # none of the processes running are OMS
    return ERR_OMS_WONT_RUN
def check_sys_systemctl(service, controller):
    """Query a service's state by running `<controller> status <service>`.

    The first output line that starts with 'Active: ' (after stripping
    surrounding whitespace) decides the result.

    Args:
        service: name of the service to query.
        controller: executable used to query it.

    Returns:
        NO_ERROR when the state starts with 'active (running) since ';
        ERR_SYSLOG when the command exits with return code 4;
        ERR_SERVICE_STATUS otherwise, after appending
        (service, state-or-output, controller) to error_info. This includes
        output without any 'Active: ' line, so None is never returned.
    """
    active_prefix = 'Active: '
    try:
        sys_status = subprocess.check_output([controller, 'status', service],
                                             universal_newlines=True,
                                             stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # service not on machine
        if e.returncode == 4:
            return ERR_SYSLOG
        error_info.append((service, e.output, controller))
        return ERR_SERVICE_STATUS

    for line in sys_status.split('\n'):
        line = line.strip()
        if not line.startswith(active_prefix):
            continue
        # slice the prefix off: str.lstrip() removes a *set of characters*
        # and would also eat the start of states such as "inactive"
        state = line[len(active_prefix):].lstrip()
        if state.startswith('active (running) since '):
            # exists and running correctly
            return NO_ERROR
        # exists but not running correctly
        error_info.append((service, state, controller))
        return ERR_SERVICE_STATUS

    # no state line at all: report the raw output instead of returning None
    error_info.append((service, sys_status, controller))
    return ERR_SERVICE_STATUS
def check_sys_service(service, controller):
    """Query a service's state by running `<controller> <service> status`.

    The last whitespace-separated field of the first output line (with
    trailing dots removed) is taken as the status; only 'running' counts
    as healthy.

    Args:
        service: name of the service to query.
        controller: executable used to query it.

    Returns:
        NO_ERROR when the status is 'running';
        ERR_SUDO_PERMS when the command exits with return code 4;
        ERR_SYSLOG when it exits with return code 1 and its output mentions
        'unrecognized service';
        ERR_SERVICE_STATUS otherwise, after appending
        (service, status-or-output, controller) to error_info.
    """
    try:
        sys_status = subprocess.check_output([controller, service, 'status'],
                                             universal_newlines=True,
                                             stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # permissions issue
        if e.returncode == 4:
            return ERR_SUDO_PERMS
        # service not on machine
        if (e.returncode == 1) and ('unrecognized service' in e.output):
            return ERR_SYSLOG
        error_info.append((service, e.output, controller))
        return ERR_SERVICE_STATUS

    # first line looks like: [service, '(pid', pid+')', 'is', status+'...']
    first_line = sys_status.split('\n')[0]
    status = first_line.split()[-1].rstrip('.')
    if status == 'running':
        # exists and running correctly
        return NO_ERROR

    # exists but not running correctly
    error_info.append((service, status, controller))
    return ERR_SERVICE_STATUS
def check_agent_service_endpt():
    """Check SSL connectivity to the DSC (agent service) endpoint.

    The host is the third '/'-separated piece of the configured
    DSC_ENDPOINT. check_endpt_ssl() is unpacked into a
    (connected, verified) pair; the endpoint is first tried with SSL_CMD
    alone and, if that is not fully successful, again with the certificate
    and key when both files exist.

    Returns:
        ERR_INFO_MISSING when DSC_ENDPOINT is unknown;
        NO_ERROR when either attempt both connects and verifies;
        WARN_ENDPT when an attempt connects but does not verify;
        ERR_ENDPT when no attempt connects.
        The WARN/ERR paths append (endpoint, command) to error_info.
    """
    dsc_endpt = geninfo_lookup('DSC_ENDPOINT')
    if dsc_endpt is None:
        error_info.append(('DSC (agent service) endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    agent_endpt = dsc_endpt.split('/')[2]

    # first attempt: no client certs
    (dsc_connected, dsc_verified) = check_endpt_ssl(SSL_CMD, agent_endpt)
    if dsc_connected and dsc_verified:
        return NO_ERROR

    if os.path.isfile(CERT_PATH) and os.path.isfile(KEY_PATH):
        # second attempt: with cert and key
        ssl_command = "{0} -cert {1} -key {2}".format(SSL_CMD, CERT_PATH, KEY_PATH)
        (dsc_cert_connected, dsc_cert_verified) = check_endpt_ssl(ssl_command, agent_endpt)
        if dsc_cert_connected and dsc_cert_verified:
            return NO_ERROR
        if dsc_cert_connected:
            # connected with certs, but verification failed
            error_info.append((agent_endpt, ssl_command.format(agent_endpt)))
            return WARN_ENDPT
    else:
        # lets user know cert and key aren't there
        print("NOTE: Certificate and key files don't exist, OMS isn't onboarded.")

    # certs were no help: fall back on the outcome of the first attempt
    error_info.append((agent_endpt, SSL_CMD.format(agent_endpt)))
    if dsc_connected:
        # connected without certs, but did not verify
        return WARN_ENDPT
    # neither with nor without certs connected
    return ERR_ENDPT
def get_omsagent_logs(LOG_PATH):
    """Read and parse the log entries written after the last "</ROOT>" line.

    The file is scanned from the bottom for a line equal to "</ROOT>\\n"
    (the end of the printed omsagent.conf); only lines after it are
    parsed. If no such line exists the whole file is used. Lines that do
    not look like "<date> <time> <zone> [<type>]: <text>" are skipped.

    Args:
        LOG_PATH: path of the log file to read.

    Returns:
        A pair (entries, error):
          * on success, entries is a list of
            [date, time, zone, '[type]', text, original line] and error is
            None;
          * on IOError, entries is None and error is 100 for EACCES
            (presumably the sudo-permissions error code - TODO confirm),
            ERR_FILE_MISSING for ENOENT, or ERR_FILE_ACCESS for anything
            else. Details are appended to error_info in each case.
    """
    log_template = r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} (\+|-)\d{4} \[\w+\]: .*$"
    parsed_log_lines = []
    try:
        with open(LOG_PATH, 'r') as log_file:
            log_lines = log_file.readlines()

        # walk backwards to the most recent end-of-conf marker and keep
        # only what follows it; default to everything
        last_update_lines = log_lines
        for idx in reversed(range(len(log_lines))):
            if log_lines[idx] == "</ROOT>\n":
                last_update_lines = log_lines[idx + 1:]
                break

        for raw_line in last_update_lines:
            line = raw_line.rstrip('\n')
            # drop empty lines and anything that isn't a log entry
            if (line == '') or (re.match(log_template, line) is None):
                continue
            fields = line.split(' ', 4)
            # [ date, time, zone, [logtype], log, unparsed log ]
            parsed_log_lines.append(
                fields[:3] + [fields[3].rstrip(':')] + fields[4:] + [line])

        return (parsed_log_lines, None)

    except IOError as e:
        if e.errno == errno.EACCES:
            # can't access due to permissions
            error_info.append((LOG_PATH, ))
            return (None, 100)
        if e.errno == errno.ENOENT:
            # file doesn't exist
            error_info.append(("file", LOG_PATH))
            return (None, ERR_FILE_MISSING)
        # some other error
        error_info.append((LOG_PATH, e))
        return (None, ERR_FILE_ACCESS)
def check_port(port, sys_bind, sys_pt):
    """Validate one forwarding target from the syslog destination file.

    The expected target is a protocol marker followed by sys_bind. For OMS
    versions >= 1.12 the marker is '@' for udp and '@@' for tcp; for older
    versions it is '@' for both.

    Args:
        port: the target as written in the destination file.
        sys_bind: bind value from the syslog configuration.
        sys_pt: protocol type from the syslog configuration.

    Returns:
        ERR_OMS_INSTALL when no OMS version is found;
        ERR_INFO_MISSING when sys_pt is neither 'udp' nor 'tcp';
        NO_ERROR when port starts with the expected marker and bind value;
        ERR_PT when the number of leading '@' characters is wrong;
        ERR_PORT_MISMATCH when the bind value differs;
        ERR_PORT_SETUP for anything else.
        Every error path except ERR_OMS_INSTALL appends to error_info.
    """
    oms_version = get_oms_version()
    if oms_version is None:
        return ERR_OMS_INSTALL

    # get number of '@'s in front of port
    corr_pt = None
    if comp_versions_ge(oms_version, '1.12'):
        if sys_pt == 'udp':
            corr_pt = '@'
        elif sys_pt == 'tcp':
            corr_pt = '@@'
    elif sys_pt in ['udp', 'tcp']:
        corr_pt = '@'

    # verify protocol type is valid
    if corr_pt is None:
        error_info.append(("protocol type", SYSLOGCONF_PATH))
        return ERR_INFO_MISSING

    # syslog destination file is sending to right port
    corr_port = corr_pt + sys_bind
    if port.startswith(corr_port):
        return NO_ERROR

    # wrong number of '@'s (only the leading run is the protocol marker)
    pt_count = len(port) - len(port.lstrip('@'))
    if pt_count != len(corr_pt):
        pt = port[:pt_count]
        error_info.append((sys_pt, corr_pt, pt, SYSLOGDEST_PATH))
        return ERR_PT

    # wrong port; the bind value starts right after the marker, so slicing
    # from pt_count (not pt_count + 1) keeps its first character
    curr_bind = port[pt_count:].split(':')[0]
    if curr_bind != sys_bind:
        error_info.append((sys_bind, curr_bind, SYSLOGDEST_PATH))
        return ERR_PORT_MISMATCH

    # some other error?
    error_info.append((SYSLOGCONF_PATH, SYSLOGDEST_PATH))
    return ERR_PORT_SETUP
def check_customlog(log_dict):
    """Cross-check a custom log file against its position (pos) file.

    The first line of the pos file is expected to hold three
    whitespace-separated fields: the log path, a position and a unique
    (inode) number in hexadecimal.
    NOTE(review): the field layout is taken from how the fields are
    indexed here - confirm against the pos file writer.

    Args:
        log_dict: mapping with the keys 'path' (custom log file) and
            'pos_file' (its position file).

    Returns:
        ERR_FILE_MISSING when the log or the pos file does not exist;
        ERR_CL_FILEPATH when the pos file does not reference the log path
        (this also covers an empty or malformed pos file);
        ERR_CL_UNIQUENUM when the recorded unique number differs from the
        log file's inode number;
        NO_ERROR otherwise. Every error path appends to error_info.
    """
    log_path = log_dict['path']
    # check if path exists
    if not os.path.isfile(log_path):
        # try splitting on like './' or something to check both file paths
        # if that doesn't work:
        error_info.append(('file', log_path))
        return ERR_FILE_MISSING

    # check if pos file exists
    log_pos_file = log_dict['pos_file']
    if not os.path.isfile(log_pos_file):
        error_info.append(('file', log_pos_file))
        return ERR_FILE_MISSING

    # check pos file contents; split from the right so that whitespace
    # inside the path does not shift the two trailing fields
    with open(log_pos_file, 'r') as lpf:
        pos_fields = lpf.readline().rstrip('\n').rsplit(None, 2)

    # mismatch in pos file filepath and custom log filepath
    if (len(pos_fields) < 3) or (pos_fields[0] != log_path):
        error_info.append((log_pos_file, log_path, CLCONF_PATH))
        return ERR_CL_FILEPATH

    # TODO: check size of custom log (pos_fields[1] holds the position)

    # check unique number with custom log; compare numerically so that
    # zero-padding in the pos file does not cause a false mismatch
    un_pos = pos_fields[2]
    log_inode = os.stat(log_path).st_ino
    un_log_hex = format(log_inode, 'x')
    try:
        same_file = (int(un_pos, 16) == log_inode)
    except ValueError:
        # recorded value isn't hexadecimal at all
        same_file = False
    if not same_file:
        error_info.append((log_path, un_log_hex, log_pos_file, un_pos,
                           CLCONF_PATH))
        return ERR_CL_UNIQUENUM

    return NO_ERROR
def check_conf_files():
    """Verify the syslog configuration files and then their contents.

    Checks that SYSLOGCONF_PATH exists and is non-empty, points the global
    SYSLOGDEST_PATH at the SYSLOG_DEST value from the general info, checks
    that file the same way, parses the syslog configuration and finally
    delegates to check_syslogdest().

    Returns:
        ERR_FILE_MISSING / ERR_FILE_EMPTY when either file is absent or has
        size 0; ERR_SYSLOG when SYSLOG_DEST is unknown;
        ERR_INFO_MISSING when the parsed configuration is empty or lacks
        'bind' or 'protocol_type'; otherwise the result of
        check_syslogdest(). All file and info errors append to error_info.
    """
    global SYSLOGDEST_PATH

    # syslog.conf has to exist and contain something
    if not os.path.isfile(SYSLOGCONF_PATH):
        error_info.append(('file', SYSLOGCONF_PATH))
        return ERR_FILE_MISSING
    if os.stat(SYSLOGCONF_PATH).st_size == 0:
        error_info.append((SYSLOGCONF_PATH, ))
        return ERR_FILE_EMPTY

    # the destination file location comes from the general info
    syslog_dest = geninfo_lookup('SYSLOG_DEST')
    if syslog_dest is None:
        return ERR_SYSLOG
    SYSLOGDEST_PATH = syslog_dest

    # the destination file has to exist and contain something as well
    if not os.path.isfile(SYSLOGDEST_PATH):
        error_info.append(('file', SYSLOGDEST_PATH))
        return ERR_FILE_MISSING
    if os.stat(SYSLOGDEST_PATH).st_size == 0:
        error_info.append((SYSLOGDEST_PATH, ))
        return ERR_FILE_EMPTY

    # parse syslog.conf
    syslogconf_dict = parse_syslogconf()
    if not syslogconf_dict:
        error_info.append(("syslog configuration info", SYSLOGCONF_PATH))
        return ERR_INFO_MISSING

    # both values are required for checking the destination file
    try:
        sys_bind = syslogconf_dict['bind']
        sys_pt = syslogconf_dict['protocol_type']
    except KeyError:
        error_info.append(("syslog configuration info", SYSLOGCONF_PATH))
        return ERR_INFO_MISSING

    return check_syslogdest(sys_bind, sys_pt)
def check_heartbeat(interactive, prev_success=NO_ERROR):
    """Run the heartbeat / health section of the troubleshooter.

    Steps, in order: installation check, workspace ID lookup, multihoming
    check, "is omsagent running" check (with one attempt to start it), and
    a scan of omsagent.log for heartbeat errors. When a step points at an
    installation or connection problem, control is handed to
    check_installation() / check_connection().

    Args:
        interactive: forwarded to check_installation() and
            check_connection() when those sections are run.
        prev_success: initial value of the overall result.

    Returns:
        The result of check_installation() / check_connection() when one of
        them is run; the value of print_errors() for the first failing
        step; otherwise the print_errors() value of the last step.
    """
    print("CHECKING HEARTBEAT / HEALTH...")

    success = prev_success

    # TODO: run `sh /opt/microsoft/omsagent/bin/omsadmin.sh -l` to check if onboarded and running

    # check if installed correctly
    print("Checking if installed correctly...")
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # get workspace ID
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_CONF_PATH))
        print_errors(ERR_INFO_MISSING)
        print("Running the connection part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check if running multi-homing
    print("Checking if omsagent is trying to run multihoming...")
    checked_multihoming = check_multihoming(workspace_id)
    if is_error(checked_multihoming):
        return print_errors(checked_multihoming)
    success = print_errors(checked_multihoming)

    # TODO: check if other agents are sending heartbeats

    # check if omsagent is running
    print("Checking if omsagent is running...")
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if checked_omsagent_running == ERR_OMS_WONT_RUN:
        # try starting omsagent
        # TODO: find better way of doing this, check to see if agent is stopped / grab results
        checked_omsagent_running = start_omsagent(workspace_id)
    if is_error(checked_omsagent_running):
        return print_errors(checked_omsagent_running)
    success = print_errors(checked_omsagent_running)

    # check if omsagent.log finds any heartbeat errors
    print("Checking for errors in omsagent.log...")
    checked_log_hb = check_log_heartbeat(workspace_id)
    if is_error(checked_log_hb):
        # connection issue
        if checked_log_hb == ERR_HEARTBEAT:
            print_errors(checked_log_hb)
            print("Running the connection part of the troubleshooter in order to find the issue...")
            print("================================================================================")
            # pass interactive along, exactly like the earlier hand-off to
            # check_connection() does
            return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)
        # other issue
        return print_errors(checked_log_hb)
    success = print_errors(checked_log_hb)

    return success
def check_internet_connect():
    """Check general internet connectivity by probing docs.microsoft.com.

    Returns:
        NO_ERROR when check_endpt_ssl() reports success (a truthy value)
        for the host; otherwise ERR_INTERNET, after appending the command
        that was tried to error_info.
    """
    probe_host = "docs.microsoft.com"
    if not check_endpt_ssl(SSL_CMD, probe_host):
        error_info.append((SSL_CMD.format(probe_host), ))
        return ERR_INTERNET
    return NO_ERROR
def check_cert():
    """Validate the OMS certificate by dumping it with openssl.

    Runs `openssl x509 -in <crt> -text -noout` on
    /etc/opt/microsoft/omsagent/certs/oms.crt and expects the output to
    start with "Certificate:".

    Returns:
        NO_ERROR when the certificate is printed as expected;
        ERR_CERT when the output looks different or the command could not
        be run at all;
        ERR_SUDO_PERMS / ERR_FILE_MISSING when openssl reports
        "Permission denied" / "No such file or directory";
        ERR_FILE_ACCESS for any other openssl failure.
        Every error path appends to error_info.
    """
    crt_path = "/etc/opt/microsoft/omsagent/certs/oms.crt"
    try:
        crt_info = subprocess.check_output(
            ['openssl', 'x509', '-in', crt_path, '-text', '-noout'],
            universal_newlines=True, stderr=subprocess.STDOUT)
        if crt_info.startswith("Certificate:\n"):
            return NO_ERROR
        error_info.append((crt_path, ))
        return ERR_CERT

    # error with openssl
    except subprocess.CalledProcessError as e:
        try:
            # the reason is the sixth ':'-separated field of the second
            # line of openssl's error output
            err = e.output.split('\n')[1].split(':')[5]
        # catch-all in case the output has an unexpected shape; Exception
        # (not a bare except) so Ctrl-C / SystemExit still propagate
        except Exception:
            error_info.append((crt_path, e.output))
            return ERR_FILE_ACCESS

        # openssl permissions error
        if err == "Permission denied":
            error_info.append((crt_path, ))
            return ERR_SUDO_PERMS
        # openssl file existence error
        if err == "No such file or directory":
            error_info.append(("file", crt_path))
            return ERR_FILE_MISSING
        # openssl some other error
        error_info.append((crt_path, err))
        return ERR_FILE_ACCESS

    # general error (e.g. openssl itself could not be launched)
    except Exception:
        error_info.append((crt_path, ))
        return ERR_CERT
def check_log_analytics_endpts():
    """Probe every log analytics endpoint over SSL, with and without certs.

    The endpoint list depends on whether the configured OMS endpoint
    contains '.us'; a '*' in an endpoint is replaced by the workspace ID.
    check_endpt_ssl() is unpacked into a (connected, verified) pair. Per
    endpoint the decision mirrors check_agent_service_endpt():
      * connected and verified without certs            -> fine
      * otherwise, if cert and key exist, retry with them:
          - connected and verified                      -> fine
          - connected but not verified                  -> warning
      * connected (but not verified) without certs      -> warning
      * not connected either way                        -> error

    Returns:
        ERR_INFO_MISSING when the OMS endpoint or workspace ID is unknown;
        ERR_ENDPT when any endpoint could not be connected to (error_info
        is extended with the connection failures);
        WARN_ENDPT when there were only verification failures (error_info
        is extended with those);
        NO_ERROR otherwise.
    """
    success = NO_ERROR
    no_certs_printed = False
    connected_err = []
    verified_err = []

    # get OMS endpoint to check if fairfax region
    oms_endpt = geninfo_lookup('OMS_ENDPOINT')
    if oms_endpt is None:
        error_info.append(('OMS endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # get workspace ID
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # get log analytics endpoints
    if '.us' in oms_endpt:
        log_analytics_endpts = [
            "usge-jobruntimedata-prod-1.usgovtrafficmanager.net",
            "usge-agentservice-prod-1.usgovtrafficmanager.net",
            "*.ods.opinsights.azure.us",
            "*.oms.opinsights.azure.us",
        ]
    else:
        log_analytics_endpts = [
            "*.ods.opinsights.azure.com",
            "*.oms.opinsights.azure.com",
            "ods.systemcenteradvisor.com",
        ]

    for endpt in log_analytics_endpts:
        # replace '*' with workspace ID
        if '*' in endpt:
            endpt = endpt.replace('*', workspace_id)

        # check endpoint without certs
        (la_connected, la_verified) = check_endpt_ssl(SSL_CMD, endpt)
        if la_connected and la_verified:
            continue

        # try with certs (if they exist)
        if os.path.isfile(CERT_PATH) and os.path.isfile(KEY_PATH):
            ssl_command = "{0} -cert {1} -key {2}".format(SSL_CMD, CERT_PATH, KEY_PATH)
            (la_cert_connected, la_cert_verified) = check_endpt_ssl(ssl_command, endpt)

            # with certs connected and verified
            if la_cert_connected and la_cert_verified:
                continue

            # with certs connected, but didn't verify
            if la_cert_connected:
                # a connection error found earlier outranks a warning
                if success != ERR_ENDPT:
                    verified_err.append((endpt, ssl_command.format(endpt)))
                    success = WARN_ENDPT
                continue
        elif not no_certs_printed:
            # lets user know cert and key aren't there (only once)
            print("NOTE: Certificate and key files don't exist, OMS isn't onboarded.")
            no_certs_printed = True

        # if certs didn't work at all, check to see if no certs was
        # connected (but not verified)
        if la_connected:
            if success != ERR_ENDPT:
                verified_err.append((endpt, SSL_CMD.format(endpt)))
                success = WARN_ENDPT
            continue

        # neither with nor without certs connected
        connected_err.append((endpt, SSL_CMD.format(endpt)))
        success = ERR_ENDPT

    # if any connection issues found
    if success == ERR_ENDPT:
        error_info.extend(connected_err)
    # if no connection issues found but some verification issues found
    elif success == WARN_ENDPT:
        error_info.extend(verified_err)

    return success
def check_e2e():
    """Walk the user through end-to-end queries in the Azure portal.

    For each of the Heartbeat, Syslog and Perf sources a query filtered on
    this machine's hostname is printed and the user is asked whether it
    returned a result. The user may skip a single query or the whole
    section.

    Returns:
        ERR_QUERIES when at least one query was reported as failed (the
        comma-separated failed sources are appended to error_info);
        NO_ERROR otherwise, including when the section is skipped.
    """
    divider = "--------------------------------------------------------------------------------"
    yes_answers = ['y', 'yes']
    no_answers = ['n', 'no']
    skip_answers = ['s', 'skip']

    # get machine's hostname
    hostname = subprocess.check_output(['hostname'], universal_newlines=True).rstrip('\n')

    sources = ['Heartbeat', 'Syslog', 'Perf']
    successes = []
    failures = []

    print(divider)
    print(" Please go to https://portal.azure.com and navigate to your workspace.\n"
          " Once there, please navigate to the 'Logs' blade, and input the queries that\n"
          " will be printed below. If the query was successful, then you should see one\n"
          " result; if not, then there will be no results.\n")

    # ask if user wants to skip entire query section
    no_skip_all = get_input("Do you want to continue with this section (all queries)? (y/n)",
                            (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                            "Please type either 'y'/'yes' or 'n'/'no' to proceed.")

    if no_skip_all.lower() in yes_answers:
        for source in sources:
            query = "{0} | where Computer == '{1}' | sort by TimeGenerated desc | take 1".format(source, hostname)
            print(divider)
            print(" Please run this query:")
            print("\n {0}\n".format(query))

            # ask if query was successful
            q_result = get_input("Was the query successful? (y/n/skip)",
                                 (lambda x: x.lower() in ['y', 'yes', 'n', 'no', 's', 'skip']),
                                 "Please type either 'y'/'yes' or 'n'/'no' to proceed, or\n"
                                 "'s'/'skip' to skip the {0} query.".format(source))
            answer = q_result.lower()

            # skip current query
            if answer in skip_answers:
                print(" Skipping {0} query...".format(source))
                continue

            # file the source under the outcome the user reported
            if answer in yes_answers:
                outcome = successes
            elif answer in no_answers:
                outcome = failures
            else:
                outcome = None
            if outcome is not None:
                outcome.append(source)
                print(" Continuing to next query...")

        # summarize query section
        success_qs = ', '.join(successes) or 'none'
        failed_qs = ', '.join(failures) or 'none'
        print(divider)
        print(" Successful queries: {0}".format(success_qs))
        print(" Failed queries: {0}".format(failed_qs))

        if failures:
            error_info.append((', '.join(failures),))
            return ERR_QUERIES

    print("Continuing on with troubleshooter...")
    print(divider)
    return NO_ERROR