def collect_logs():
    """Prompt for log collector options and run the OMS log collector script.

    Asks for an existing output directory, an alphanumeric SR number and an
    optional company name, then runs ``omslinux_agentlog.sh`` under ``sudo``
    from ``LOGCOLLECT_PATH`` through the shell. A non-zero exit status is
    reported to the user. Nothing is returned.
    """
    # shlex.quote is the Python 3 name; pipes.quote is the Python 2 equivalent
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # get output directory / SR number / company name
    print("Please input the output directory for where the zip file will be generated, the\n"
          "SR number to collect OMS logs, and (if applicable) the company name for\n"
          "reference. (Leave field empty to skip)")
    output_dir = get_input("Output Directory", (lambda x: os.path.isdir(x)),
                           "Please input an existing, absolute filepath for the resulting zip file")
    sr_num = get_input("SR Number", (lambda x: x.isalnum()),
                       "Please enter the SR number (without any spaces or special \n"
                       "characters) to continue.")
    com_name = get_input("Company Name", (lambda x: True), "")

    # create command to run; user-supplied values are quoted because the
    # command goes through the shell (paths and company names may hold spaces)
    logcollect_cmd = "cd {0}; sudo sh ./omslinux_agentlog.sh -o {1} -s {2}".format(
        LOGCOLLECT_PATH, quote(output_dir), quote(sr_num))
    if (com_name != ''):
        # leading space keeps '-c' from fusing onto the SR number argument
        logcollect_cmd += " -c {0}".format(quote(com_name))

    # run command
    print("Starting up log collector...")
    print("--------------------------------------------------------------------------------")
    log_collection = subprocess.call(logcollect_cmd, shell=True)
    if (log_collection != 0):
        print("--------------------------------------------------------------------------------")
        print("Log collector returned error code {0}. Please look through the above output to\n"
              "find the reason for the error.".format(log_collection))
    return
def check_disk_space():
    """Ask how many files to watch and for how long, then scan them.

    Both prompts accept either a positive integer or an empty answer; an
    empty answer selects the default (20 files, 60 seconds). Returns whatever
    ``scan_top_files`` returns for the chosen values.
    """
    def is_blank_or_positive(text):
        # a non-numeric answer is only acceptable when it is left blank
        try:
            value = int(text)
        except ValueError:
            return (text == '')
        return (value > 0)

    # both prompts share the same retry message
    retry_msg = ("Please either type a positive integer, or just hit enter to go\n"
                 "with the default value.")

    print("--------------------------------------------------------------------------------")
    print(" Please input the number of files you want to check, as well as the length of\n"
          " time you want to observe these files for.")

    file_count_ans = get_input("How many files do you want to check? (Default is top 20 files)",
                               is_blank_or_positive, retry_msg)
    if (file_count_ans == ''):
        file_count = 20
    else:
        file_count = int(file_count_ans)

    duration_ans = get_input("How many seconds do you want to observe the files? (Default is 60sec)",
                             is_blank_or_positive, retry_msg)
    if (duration_ans == ''):
        duration = 60
    else:
        duration = int(duration_ans)

    # gather info for files
    print("Checking top {0} files for the next {1} seconds...".format(file_count, duration))
    return scan_top_files(file_count, duration)
def collect_logs():
    """Prompt for an SR number / company name and run the OMS log collector.

    Both answers are optional. ``omslinux_agentlog.sh`` is run under ``sudo``
    from ``LOGCOLLECT_PATH`` through the shell, with ``-s`` / ``-c`` passed
    only for the fields the user actually filled in. A non-zero exit status
    is reported to the user. Nothing is returned.
    """
    # shlex.quote is the Python 3 name; pipes.quote is the Python 2 equivalent
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # get SR number / company name
    print("Please input the SR number to collect OMS logs and (if applicable) the company\n"
          "name for reference. (Leave field empty to skip)")
    sr_num = get_input("SR Number", (lambda x: (x == "" or x.isalnum())),
                       "Please enter the SR number (without any spaces or special \n"
                       "characters) to continue.")
    com_name = get_input("Company Name", (lambda x: True), "")

    # create command to run
    logcollect_cmd = "cd {0}; sudo sh ./omslinux_agentlog.sh".format(LOGCOLLECT_PATH)
    if (sr_num != ''):
        # a skipped SR number must not leave a dangling '-s' with no argument
        logcollect_cmd += " -s {0}".format(quote(sr_num))
    if (com_name != ''):
        # leading space keeps '-c' separate from the previous argument; the
        # name is free text, so quote it before it reaches the shell
        logcollect_cmd += " -c {0}".format(quote(com_name))

    # run command
    print("Starting up log collector...")
    print("--------------------------------------------------------------------------------")
    log_collection = subprocess.call(logcollect_cmd, shell=True)
    if (log_collection != 0):
        print("--------------------------------------------------------------------------------")
        print("Log collector returned error code {0}. Please look through the above output to\n"
              "find the reason for the error.".format(log_collection))
    return
def ask_update_old_version(oms_version, curr_oms_version, cpu_bits):
    """Tell the user a newer OMS version exists and ask whether to update.

    Args:
        oms_version: version currently installed, shown in the message.
        curr_oms_version: newer version available, shown in the message.
        cpu_bits: text inserted into the download-link name in the upgrade
            instructions.

    Returns:
        USER_EXIT after printing upgrade instructions when the user answers
        yes, otherwise NO_ERROR so the troubleshooter carries on.
    """
    print("--------------------------------------------------------------------------------")
    print("You are currently running OMS Version {0}. There is a newer version\n"
          "available which may fix your issue (version {1}).".format(oms_version, curr_oms_version))
    answer = get_input("Do you want to update? (y/n)",
                       (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                       "Please type either 'y'/'yes' or 'n'/'no' to proceed.")

    # user does want to update
    if (answer.lower() in ['y', 'yes']):
        print("--------------------------------------------------------------------------------")
        print("Please head to the Github link below and click on 'Download Latest OMS Agent\n"
              "for Linux ({0})' in order to update to the newest version:".format(cpu_bits))
        print("\n https://github.com/microsoft/OMS-Agent-for-Linux\n")
        print("And follow the instructions given here:")
        print("\n https://github.com/microsoft/OMS-Agent-for-Linux/blob/master/docs/"
              "OMS-Agent-for-Linux.md#upgrade-from-a-previous-release\n")
        return USER_EXIT

    # user doesn't want to update; handled as the fall-through so the function
    # always returns a status code rather than an implicit None
    print("Continuing on with troubleshooter...")
    print("--------------------------------------------------------------------------------")
    return NO_ERROR
def ask_error_codes(err_type, err_types):
    """Ask the user for an error code and print the matching message.

    Args:
        err_type: kind of error code (e.g. 'Installation'); used in prompts
            and passed to ``get_error_codes``.
        err_types: description of the accepted code formats, inserted into
            the retry message.

    Returns:
        USER_EXIT if the user chooses to stop after seeing the error message,
        otherwise NO_ERROR (including when the error code table could not be
        loaded, or the user has no code).
    """
    # ask if user has error code
    answer = get_input("Do you have an {0} error code? (y/n)".format(err_type.lower()),
                       (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                       "Please type either 'y'/'yes' or 'n'/'no' to proceed.")

    if (answer.lower() in ['y', 'yes']):
        # get dict of all error codes
        (err_codes, tsg_error) = get_error_codes(err_type)
        if (err_codes is None):
            print("WARNING (INTERNAL): {0}\n Skipping this check...".format(tsg_error))
            print("--------------------------------------------------------------------------------")
            return NO_ERROR

        # ask user for error code
        poss_ans = lambda x: x.isdigit() or (x in ['NOT_DEFINED', 'none'])
        retry_msg = ("Please enter an error code ({0})\nto get the error message, or "
                     "type 'none' to continue with the troubleshooter.".format(err_types))
        err_code = get_input("Please input the error code", poss_ans, retry_msg)

        # keep asking until the answer is a known code or 'none'; this also
        # catches 'NOT_DEFINED' for tables that have no such entry, which
        # would otherwise raise KeyError on the lookup below
        while (err_code != 'none' and err_code not in err_codes):
            print("{0} is not a valid {1} error code.".format(err_code, err_type.lower()))
            err_code = get_input("Please input the error code", poss_ans, retry_msg)

        # print out error, ask to exit
        if (err_code != 'none'):
            print("\nError {0}: {1}\n".format(err_code, err_codes[err_code]))
            answer1 = get_input("Would you like to continue with the troubleshooter? (y/n)",
                                (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                                "Please type either 'y'/'yes' or 'n'/'no' to proceed.")
            if (answer1.lower() in ['n', 'no']):
                print("Exiting troubleshooter...")
                print("================================================================================")
                return USER_EXIT

    print("Continuing on with troubleshooter...")
    print("--------------------------------------------------------------------------------")
    return NO_ERROR
def no_clconf(interactive):
    """Handle the case where the customlog.conf file cannot be found.

    Prints how to pull the config manually. In silent mode the config log
    paths are recorded in ``error_info`` and ERR_BACKEND_CONFIG is returned.
    In interactive mode the user is asked whether they already tried pulling:
    if not, ERR_FOUND is returned; if so, the result depends on whether
    ``CLCONF_PATH`` now exists (NO_ERROR) or not (ERR_BACKEND_CONFIG, with the
    log paths recorded in ``error_info``).
    """
    # check if enough time has passed for agent to pull config from OMS backend
    print("--------------------------------------------------------------------------------")
    print(" The troubleshooter cannot find the customlog.conf file. If the custom log \n"
          " configuration was just applied in portal, it takes up to 15 minutes for the \n"
          " agent to pick the new configuration.\n"
          " You can manually pull the config from the OMS backend by running this command:\n"
          "\n $ sudo su omsagent -c 'python /opt/microsoft/omsconfig/Scripts/PerformRequiredConfigurationChecks.py'\n")

    # silent mode: nothing can be asked, so error out right away
    if (not interactive):
        print(" (NOTE: if you aren't using custom logs, please ignore this message.)")
        error_info.append((OMSCONFLOG_PATH, OMSCONFLOGDET_PATH))
        return ERR_BACKEND_CONFIG

    # interactive mode: ask if already tried pulling config from OMS backend
    manual_pull = get_input("Have you already tried pulling the config manually? (y/n)",
                            (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                            "Please type either 'y'/'yes' or 'n'/'no' to proceed.")
    already_tried = manual_pull.lower() in ['y', 'yes']

    # haven't tried pulling yet
    if (not already_tried):
        print(" Please try running the above command to pull the config file.")
        return ERR_FOUND

    # tried pulling, but config still doesn't exist
    if (not os.path.isfile(CLCONF_PATH)):
        # TODO: check the log files for an error in DSC
        error_info.append((OMSCONFLOG_PATH, OMSCONFLOGDET_PATH))
        return ERR_BACKEND_CONFIG

    # tried pulling, and config now exists
    print("The config file has been pulled successfully.")
    print("Continuing on with troubleshooter...")
    print("--------------------------------------------------------------------------------")
    return NO_ERROR
def ask_install_error_codes():
    """Ask whether the user has a shell bundle or extension install code.

    Depending on the answer, explains where that kind of code can be found
    and hands over to ``ask_error_codes``.

    Returns:
        The result of ``ask_error_codes`` for a bundle or extension code, or
        NO_ERROR when the user skips.
    """
    print("--------------------------------------------------------------------------------")
    print("Installation error codes can be found for either shell bundle or extension\n"
          "installation, and can help give a quick idea of what went wrong (separate \n"
          "from the troubleshooter's tests).")
    do_install_tests = get_input("Do you have an installation code from either installing via shell bundle (b) or\n"
                                 "via extension (e)? (Type 's' to skip)",
                                 (lambda x: x.lower() in ['b', 'bundle', 'e', 'extension', 's', 'skip']),
                                 "Please enter 'bundle'/'b' for a shell bundle code, 'extension'/'e' for an\n"
                                 "extension code, or 's'/'skip' to skip.")

    # shell bundle error code
    if (do_install_tests.lower() in ['b', 'bundle']):
        print("--------------------------------------------------------------------------------")
        print("Shell bundle error codes can be found by going through the command output in \n"
              "the terminal after running the `omsagent-*.universal.x64.sh` script to find \n"
              "a line that matches:\n"
              "\n Shell bundle exiting with code <err>\n")
        return ask_error_codes('Installation', "either an integer or 'NOT_DEFINED'")

    # extension error code
    elif (do_install_tests.lower() in ['e', 'extension']):
        print("--------------------------------------------------------------------------------")
        print("Data about the state of extension deployments can be retrieved from the Azure \n"
              "portal, and by using the Azure CLI.")
        return ask_error_codes('Extension', "an integer")

    # requested to skip
    else:
        print("Continuing on with troubleshooter...")
        print("--------------------------------------------------------------------------------")
        # return a status code like the other branches instead of implicit None
        return NO_ERROR
def check_e2e():
    """Walk the user through portal queries to confirm data reaches the workspace.

    Builds one query per data source for this machine's hostname, asks the
    user whether each returned a result, and prints a summary. If any query
    failed, the failed sources are recorded in ``error_info`` and ERR_QUERIES
    is returned; otherwise (or when the user skips the section) NO_ERROR.
    """
    # get machine's hostname
    raw_hostname = subprocess.check_output(['hostname'], universal_newlines=True)
    hostname = raw_hostname.rstrip('\n')

    sources = ['Heartbeat', 'Syslog', 'Perf']
    successes = []
    failures = []

    print("--------------------------------------------------------------------------------")
    print(" Please go to https://portal.azure.com and navigate to your workspace.\n"
          " Once there, please navigate to the 'Logs' blade, and input the queries that\n"
          " will be printed below. If the query was successful, then you should see one\n"
          " result; if not, then there will be no results.\n")

    # ask if user wants to skip entire query section
    no_skip_all = get_input("Do you want to continue with this section (all queries)? (y/n)",
                            (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                            "Please type either 'y'/'yes' or 'n'/'no' to proceed.")

    if (no_skip_all.lower() in ['y', 'yes']):
        for source in sources:
            query = "{0} | where Computer == '{1}' | sort by TimeGenerated desc | take 1".format(
                source, hostname)
            print("--------------------------------------------------------------------------------")
            print(" Please run this query:")
            print("\n {0}\n".format(query))

            # ask if query was successful
            q_result = get_input("Was the query successful? (y/n/skip)",
                                 (lambda x: x.lower() in ['y', 'yes', 'n', 'no', 's', 'skip']),
                                 "Please type either 'y'/'yes' or 'n'/'no' to proceed, or\n"
                                 "'s'/'skip' to skip the {0} query.".format(source))
            reply = q_result.lower()

            # skip current query
            if (reply in ['s', 'skip']):
                print(" Skipping {0} query...".format(source))
                continue

            # record the outcome: success on yes, failure on no
            if (reply in ['y', 'yes']):
                successes.append(source)
            else:
                failures.append(source)
            print(" Continuing to next query...")

        # summarize query section
        success_qs = ', '.join(successes) if successes else 'none'
        failed_qs = ', '.join(failures) if failures else 'none'
        print("--------------------------------------------------------------------------------")
        print(" Successful queries: {0}".format(success_qs))
        print(" Failed queries: {0}".format(failed_qs))

        if (failures):
            # failed_qs is the joined failure list whenever failures is non-empty
            error_info.append((failed_qs,))
            return ERR_QUERIES

    print("Continuing on with troubleshooter...")
    print("--------------------------------------------------------------------------------")
    return NO_ERROR
def run_troubleshooter():
    """Top-level menu loop for the OMS Agent for Linux Troubleshooter.

    After the sudo check, repeatedly shows the scenario menu, runs the chosen
    scenario in silent or interactive mode, and prints the collected error
    summary. Single scenarios (1-6) can be followed by another run; 'A' runs
    once. 'L' runs the log collector and returns, and 'Q' (or a USER_EXIT
    result from a scenario) returns immediately. Returns None in every case.
    """
    # check if running as sudo
    if (not check_sudo()):
        return

    print("Welcome to the OMS Agent for Linux Troubleshooter! What is your issue?\n")

    # menu option -> scenario entry point
    switcher = {
        '1': check_heartbeat,
        '2': check_connection,
        '3': check_syslog,
        '4': check_high_cpu_memory,
        '5': check_installation,
        '6': check_custom_logs,
        'A': check_all
    }
    yes_no = ['y', 'yes', 'n', 'no']

    # loop for as long as the user wants to run another scenario
    keep_running = True
    while (keep_running):
        print("================================================================================\n"
              "1: Agent is unhealthy or heartbeat data missing.\n"
              "2: Agent doesn't start, can't connect to Log Analytic Services.\n"
              "3: Syslog issue.\n"
              "4: Agent consuming high CPU/memory.\n"
              "5: Installation failures.\n"
              "6: Custom logs issue.\n"
              "================================================================================\n"
              "A: Run through all scenarios.\n"
              "L: Collect the logs for OMS Agent.\n"
              "Q: Press 'Q' to quit.\n"
              "================================================================================")
        issue = get_input("Please select an option",
                          (lambda x: x.lower() in ['1', '2', '3', '4', '5', '6', 'q', 'quit', 'a', 'l']),
                          "Please enter an integer corresponding with your issue (1-6) to\n"
                          "continue (or 'A' to run through all scenarios), 'L' to run the log\n"
                          "collector, or 'Q' to quit.")
        choice = issue.lower()

        # quit troubleshooter
        if (choice in ['q', 'quit']):
            print("Exiting the troubleshooter...")
            return

        # collect logs
        if (choice == 'l'):
            print("Running the OMS Log Collector...")
            print("================================================================================")
            collect_logs()
            return

        # silent vs interactive mode
        print("--------------------------------------------------------------------------------")
        print("The troubleshooter can be run in two different modes.\n"
              " - Silent Mode runs through with no input required\n"
              " - Interactive Mode includes extra checks that require input")
        mode = get_input("Do you want to run the troubleshooter in silent (s) or interactive (i) mode?",
                         (lambda x: x.lower() in ['s', 'silent', 'i', 'interactive', 'q', 'quit']),
                         "Please enter 's'/'silent' to run silent mode, 'i'/'interactive' to run \n"
                         "interactive mode, or 'q'/'quit' to quit.")
        mode_choice = mode.lower()
        if (mode_choice in ['q', 'quit']):
            print("Exiting the troubleshooter...")
            return
        elif (mode_choice in ['s', 'silent']):
            print("Running troubleshooter in silent mode...")
            interactive_mode = False
        elif (mode_choice in ['i', 'interactive']):
            print("Running troubleshooter in interactive mode...")
            interactive_mode = True

        # run troubleshooter
        section = switcher.get(issue.upper(), lambda: "Invalid input")
        print("================================================================================")
        success = section(interactive=interactive_mode)
        print("================================================================================")
        print("================================================================================")

        # print out all errors/warnings
        if (err_summary):
            print("ALL ERRORS/WARNINGS ENCOUNTERED:")
            for err in err_summary:
                print(" {0}".format(err))
            print("--------------------------------------------------------------------------------")

        # user requested to exit
        if (success == USER_EXIT):
            return
        # no errors found
        if (success == NO_ERROR):
            print("No errors were found.")
        # error found
        else:
            print("Please review the errors found above.")

        # anything other than a single scenario only runs once
        if (issue not in ['1', '2', '3', '4', '5', '6']):
            keep_running = False
            continue

        # user ran single scenario, so ask if they want to run again
        again = get_input("Do you want to run another scenario? (y/n)",
                          (lambda x: x.lower() in yes_no),
                          "Please type either 'y'/'yes' or 'n'/'no' to proceed.")
        if (again.lower() in ['y', 'yes']):
            print("Please select another scenario below:")
        elif (again.lower() in ['n', 'no']):
            keep_running = False

    # give information to user about next steps
    print("================================================================================")
    print("If you still have an issue, please run the troubleshooter again and collect the\n"
          "logs for OMS.\n"
          "In addition, please include the following information:\n"
          " - Azure Subscription ID where the Log Analytics Workspace is located\n"
          " - Workspace ID the agent has been onboarded to\n"
          " - Workspace Name\n"
          " - Region Workspace is located\n"
          " - Pricing Tier assigned to the Workspace\n"
          " - Linux Distribution on the VM\n"
          " - Log Analytics Agent Version")
    return
def check_custom_logs(interactive, prev_success=NO_ERROR):
    """Run the custom logs scenario of the troubleshooter.

    In interactive mode the user is first asked whether they use custom logs
    at all; if not, ``prev_success`` is returned unchanged. Otherwise the
    function verifies that omsagent is installed, connected and running,
    handing over to the matching scenario as soon as one of those checks
    fails, and finally checks the custom log configuration.

    Args:
        interactive: True to ask the user questions, False for silent mode.
        prev_success: status carried over from earlier checks; returned
            as-is when the user is not using custom logs.

    Returns:
        A troubleshooter status code: ``prev_success``, the result of the
        scenario handed over to, ERR_INFO_MISSING when the workspace ID cannot
        be looked up, or the result of ``print_errors`` for the customlog.conf
        check.
    """
    if (interactive):
        print(" To check if you are using custom logs, please go to https://ms.portal.azure.com\n"
              " and navigate to your workspace. Once there, please navigate to the 'Advanced\n"
              " settings' blade, and then go to 'Data' > 'Custom Logs'. There you should be\n"
              " able to see any custom logs you may have.\n")
        using_cl = get_input("Are you currently using custom logs? (y/n)",
                             (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                             "Please type either 'y'/'yes' or 'n'/'no' to proceed.")
        # not using custom logs (compare case-insensitively: the validator
        # above also accepts answers such as 'N' or 'No')
        if (using_cl.lower() in ['n', 'no']):
            print("Continuing on with the rest of the troubleshooter...")
            print("================================================================================")
            return prev_success
        # using custom logs
        else:
            print("Continuing on with troubleshooter...")
            print("--------------------------------------------------------------------------------")

    print("CHECKING FOR CUSTOM LOG ISSUES...")

    # check if installed / connected / running correctly
    print("Checking if omsagent installed and running...")

    # check installation
    if (get_oms_version() is None):
        print_errors(ERR_OMS_INSTALL)
        print("Running the installation part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_installation(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check connection
    checked_la_endpts = check_log_analytics_endpts()
    if (checked_la_endpts != NO_ERROR):
        print_errors(checked_la_endpts)
        print("Running the connection part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_connection(interactive, err_codes=False, prev_success=ERR_FOUND)

    # check running
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if (workspace_id is None):
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if (checked_omsagent_running != NO_ERROR):
        print_errors(checked_omsagent_running)
        print("Running the general health part of the troubleshooter in order to find the issue...")
        print("================================================================================")
        return check_heartbeat(interactive, prev_success=ERR_FOUND)

    # check customlog.conf
    print("Checking for custom log configuration files...")
    checked_clconf = check_customlog_conf(interactive)
    # error or not, the outcome of this scenario is whatever print_errors
    # reports for the customlog.conf check
    return print_errors(checked_clconf)