예제 #1
0
def collect_logs():
    """Prompt for log-collection details and run the OMS log collector.

    Asks (through get_input) for an existing output directory, an
    alphanumeric SR number and an optional company name, then runs
    omslinux_agentlog.sh from LOGCOLLECT_PATH through the shell. When the
    command returns a non-zero exit code, that code is printed.

    Returns:
        None.
    """
    # quote() lives in shlex on Python 3 and in pipes on Python 2
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # get output directory / SR number / company name
    print("Please input the output directory for where the zip file will be generated, the\n"\
          "SR number to collect OMS logs, and (if applicable) the company name for\n"\
          "reference. (Leave field empty to skip)")
    output_dir = get_input("Output Directory", lambda x : (os.path.isdir(x)), \
                           "Please input an existing, absolute filepath for the resulting zip file")
    sr_num = get_input("SR Number", (lambda x : (x.isalnum())),
                       "Please enter the SR number (without any spaces or special \n"\
                           "characters) to continue.")
    com_name = get_input("Company Name", (lambda x: True), "")

    # The command changes directory before the script runs, so a relative
    # answer must be resolved against the directory it was validated in.
    output_dir = os.path.abspath(output_dir)

    # create command to run; user-supplied values are shell-quoted so spaces
    # or shell metacharacters reach the script as one literal argument
    logcollect_cmd = "cd {0}; sudo sh ./omslinux_agentlog.sh -o {1} -s {2}".format(
        LOGCOLLECT_PATH, quote(output_dir), quote(sr_num))
    if (com_name != ''):
        # the separating space keeps "-c" from fusing onto the SR number
        logcollect_cmd = "{0} -c {1}".format(logcollect_cmd, quote(com_name))

    # run command
    print("Starting up log collector...")
    print(
        "--------------------------------------------------------------------------------"
    )
    log_collection = subprocess.call(logcollect_cmd, shell=True)
    if (log_collection != 0):
        print(
            "--------------------------------------------------------------------------------"
        )
        print("Log collector returned error code {0}. Please look through the above output to\n"\
            "find the reason for the error.".format(log_collection))
    return
예제 #2
0
def check_disk_space():
    """Ask how many files to watch and for how long, then scan them.

    Both prompts accept a positive integer or an empty answer; an empty
    answer selects the default (20 files, 60 seconds).

    Returns:
        Whatever scan_top_files(num_files, tto) returns.
    """
    def positive_int_or_blank(answer):
        # blank means "use the default"; otherwise it must parse to an int > 0
        try:
            parsed = int(answer)
        except ValueError:
            return (answer == '')
        return (parsed > 0)

    retry_hint = ("Please either type a positive integer, or just hit enter to go\n"
                  "with the default value.")

    print(
        "--------------------------------------------------------------------------------"
    )
    print(" Please input the number of files you want to check, as well as the length of\n"
          " time you want to observe these files for.")

    # number of files to look at
    files_answer = get_input(
        "How many files do you want to check? (Default is top 20 files)",
        positive_int_or_blank,
        retry_hint)
    if (files_answer == ''):
        num_files = 20
    else:
        num_files = int(files_answer)

    # observation window in seconds
    seconds_answer = get_input(
        "How many seconds do you want to observe the files? (Default is 60sec)",
        positive_int_or_blank,
        retry_hint)
    if (seconds_answer == ''):
        tto = 60
    else:
        tto = int(seconds_answer)

    # gather info for files
    status_line = "Checking top {0} files for the next {1} seconds...".format(
        num_files, tto)
    print(status_line)
    return scan_top_files(num_files, tto)
예제 #3
0
def collect_logs():
    """Prompt for an SR number / company name and run the OMS log collector.

    Both answers are optional. The script omslinux_agentlog.sh is run from
    LOGCOLLECT_PATH through the shell, with "-s <SR number>" and
    "-c <company name>" added only for the answers that were actually given.
    When the command returns a non-zero exit code, that code is printed.

    Returns:
        None.
    """
    # quote() lives in shlex on Python 3 and in pipes on Python 2
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # get SR number / company name
    print("Please input the SR number to collect OMS logs and (if applicable) the company\n"\
        "name for reference. (Leave field empty to skip)")
    sr_num = get_input("SR Number", (lambda x : (x=="" or x.isalnum())),
                       "Please enter the SR number (without any spaces or special \n"\
                           "characters) to continue.")
    com_name = get_input("Company Name", (lambda x: True), "")

    # create command to run
    logcollect_cmd = "cd {0}; sudo sh ./omslinux_agentlog.sh".format(
        LOGCOLLECT_PATH)
    if (sr_num != ''):
        # a skipped SR number must not leave a dangling "-s" that would
        # swallow the next token as its value
        logcollect_cmd = "{0} -s {1}".format(logcollect_cmd, quote(sr_num))
    if (com_name != ''):
        # free-form text: quote it so spaces or shell metacharacters reach
        # the script as one literal argument
        logcollect_cmd = "{0} -c {1}".format(logcollect_cmd, quote(com_name))

    # run command
    print("Starting up log collector...")
    print(
        "--------------------------------------------------------------------------------"
    )
    log_collection = subprocess.call(logcollect_cmd, shell=True)
    if (log_collection != 0):
        print(
            "--------------------------------------------------------------------------------"
        )
        print("Log collector returned error code {0}. Please look through the above output to\n"\
            "find the reason for the error.".format(log_collection))
    return
예제 #4
0
def ask_update_old_version(oms_version, curr_oms_version, cpu_bits):
    """Tell the user a newer OMS version exists and ask whether to update.

    Args:
        oms_version: version reported as the one currently running.
        curr_oms_version: version reported as the newer one available.
        cpu_bits: text inserted into the name of the download link the user
            is told to click.

    Returns:
        USER_EXIT when the user chooses to update (upgrade instructions are
        printed), otherwise NO_ERROR.
    """
    print(
        "--------------------------------------------------------------------------------"
    )
    print("You are currently running OMS Version {0}. There is a newer version\n"\
          "available which may fix your issue (version {1}).".format(oms_version, curr_oms_version))
    answer = get_input("Do you want to update? (y/n)", (lambda x : x.lower() in ['y','yes','n','no']),\
                       "Please type either 'y'/'yes' or 'n'/'no' to proceed.")
    # user does want to update
    if (answer.lower() in ['y', 'yes']):
        print(
            "--------------------------------------------------------------------------------"
        )
        print("Please head to the Github link below and click on 'Download Latest OMS Agent\n"\
              "for Linux ({0})' in order to update to the newest version:".format(cpu_bits))
        print("\n    https://github.com/microsoft/OMS-Agent-for-Linux\n")
        print("And follow the instructions given here:")
        print("\n    https://github.com/microsoft/OMS-Agent-for-Linux/blob/master/docs/"\
                "OMS-Agent-for-Linux.md#upgrade-from-a-previous-release\n")
        return USER_EXIT

    # user doesn't want to update; unconditional so that the function always
    # hands back a status code instead of falling through to None
    print("Continuing on with troubleshooter...")
    print(
        "--------------------------------------------------------------------------------"
    )
    return NO_ERROR
예제 #5
0
def ask_error_codes(err_type, err_types):
    """Ask the user for an error code and print the message that goes with it.

    Args:
        err_type: name of the error-code family; passed to get_error_codes
            and shown (lower-cased) in the prompts.
        err_types: text describing the accepted code formats, inserted into
            the help message of the error-code prompt.

    Returns:
        USER_EXIT when the user, after seeing the error message, chooses not
        to continue; otherwise NO_ERROR (including when the user has no code,
        types 'none', or the error-code table could not be loaded).
    """
    yes_no = (lambda x : x.lower() in ['y','yes','n','no'])
    yes_no_help = "Please type either 'y'/'yes' or 'n'/'no' to proceed."

    # ask if user has error code
    answer = get_input("Do you have an {0} error code? (y/n)".format(err_type.lower()),
                       yes_no, yes_no_help)
    if (answer.lower() in ['y', 'yes']):
        # get dict of all error codes
        (err_codes, tsg_error) = get_error_codes(err_type)
        if (err_codes is None):
            print("WARNING (INTERNAL): {0}\n Skipping this check...".format(
                tsg_error))
            print(
                "--------------------------------------------------------------------------------"
            )
            return NO_ERROR

        # ask user for error code
        poss_ans = lambda x: x.isdigit() or (x in ['NOT_DEFINED', 'none'])
        code_help = "Please enter an error code ({0})\nto get the error message, or "\
                    "type 'none' to continue with the troubleshooter.".format(err_types)
        err_code = get_input("Please input the error code", poss_ans, code_help)
        # Keep asking until the user opts out or gives a code that is really
        # in the table. This also covers 'NOT_DEFINED' for tables that do not
        # define it, which would otherwise raise KeyError on the lookup below.
        while (err_code != 'none' and err_code not in err_codes):
            print("{0} is not a valid {1} error code.".format(
                err_code, err_type.lower()))
            err_code = get_input("Please input the error code", poss_ans, code_help)

        # print out error, ask to exit
        if (err_code != 'none'):
            print("\nError {0}: {1}\n".format(err_code, err_codes[err_code]))
            answer1 = get_input("Would you like to continue with the troubleshooter? (y/n)",
                                yes_no, yes_no_help)
            if (answer1.lower() in ['n', 'no']):
                print("Exiting troubleshooter...")
                print(
                    "================================================================================"
                )
                return USER_EXIT
    print("Continuing on with troubleshooter...")
    print(
        "--------------------------------------------------------------------------------"
    )
    return NO_ERROR
예제 #6
0
def no_clconf(interactive):
    """Handle a missing customlog.conf file.

    Prints how to pull the configuration manually. In silent mode the
    problem is recorded in error_info and reported right away; in
    interactive mode the user is first asked whether a manual pull was
    already attempted.

    Args:
        interactive: truthy to ask the user questions, falsy for silent mode.

    Returns:
        NO_ERROR if the user already pulled and CLCONF_PATH now exists,
        ERR_FOUND if the user has not tried pulling yet, and
        ERR_BACKEND_CONFIG otherwise.
    """
    divider = "--------------------------------------------------------------------------------"

    # check if enough time has passed for agent to pull config from OMS backend
    print(divider)
    print(" The troubleshooter cannot find the customlog.conf file. If the custom log \n"
          " configuration was just applied in portal, it takes up to 15 minutes for the \n"
          " agent to pick the new configuration.\n"
          " You can manually pull the config from the OMS backend by running this command:\n"
          "\n  $ sudo su omsagent -c 'python /opt/microsoft/omsconfig/Scripts/PerformRequiredConfigurationChecks.py'\n")

    if (interactive):
        # ask if already tried pulling config from OMS backend
        already_pulled = get_input(
            "Have you already tried pulling the config manually? (y/n)",
            (lambda x : x.lower() in ['y','yes','n','no']),
            "Please type either 'y'/'yes' or 'n'/'no' to proceed.")

        # haven't tried pulling yet
        if (already_pulled.lower() not in ['y', 'yes']):
            print(" Please try running the above command to pull the config file.")
            return ERR_FOUND

        # tried pulling and the config now exists
        if (os.path.isfile(CLCONF_PATH)):
            print("The config file has been pulled successfully.")
            print("Continuing on with troubleshooter...")
            print(divider)
            return NO_ERROR

        # config still doesn't exist
        # TODO: check the log files for an error in DSC
    else:
        # errors out here if not using custom logs (for silent mode)
        print(" (NOTE: if you aren't using custom logs, please ignore this message.)")

    error_info.append((OMSCONFLOG_PATH, OMSCONFLOGDET_PATH))
    return ERR_BACKEND_CONFIG
예제 #7
0
def ask_install_error_codes():
    """Ask whether the user has an installation error code and look it up.

    The user picks between a shell bundle code, an extension code, or
    skipping. For the first two, a short explanation is printed and the
    lookup is delegated to ask_error_codes.

    Returns:
        The result of ask_error_codes for a bundle or extension code, or
        NO_ERROR when the user skips.
    """
    print(
        "--------------------------------------------------------------------------------"
    )
    print("Installation error codes can be found for either shell bundle or extension\n"\
          "installation, and can help give a quick idea of what went wrong (separate \n"\
          "from the troubleshooter's tests).")
    do_install_tests = get_input("Do you have an installation code from either installing via shell bundle (b) or\n"\
                                    "via extension (e)? (Type 's' to skip)",\
                                 (lambda x : x.lower() in ['b','bundle','e','extension','s','skip']),\
                                 "Please enter 'bundle'/'b' for a shell bundle code, 'extension'/'e' for an\n"\
                                    "extension code, or 's'/'skip' to skip.")

    # shell bundle error code
    if (do_install_tests.lower() in ['b', 'bundle']):
        print(
            "--------------------------------------------------------------------------------"
        )
        print("Shell bundle error codes can be found by going through the command output in \n"\
              "the terminal after running the `omsagent-*.universal.x64.sh` script to find \n"\
              "a line that matches:\n"\
              "\n    Shell bundle exiting with code <err>\n")
        return ask_error_codes('Installation',
                               "either an integer or 'NOT_DEFINED'")

    # extension error code
    elif (do_install_tests.lower() in ['e', 'extension']):
        print(
            "--------------------------------------------------------------------------------"
        )
        print("Data about the state of extension deployments can be retrieved from the Azure \n"\
              "portal, and by using the Azure CLI.")
        return ask_error_codes('Extension', "an integer")

    # requested to skip
    else:
        print("Continuing on with troubleshooter...")
        print(
            "--------------------------------------------------------------------------------"
        )
        # return a status code like the other branches rather than None
        return NO_ERROR
예제 #8
0
def check_e2e():
    """Walk the user through manual end-to-end queries in the Azure portal.

    For each of the Heartbeat, Syslog and Perf sources a query filtered on
    this machine's hostname is printed, and the user reports whether it
    returned a result (or skips it). A summary is printed afterwards. The
    user may also decline the whole section up front.

    Returns:
        ERR_QUERIES (after appending the failed source names to error_info)
        when at least one query was reported as failed, otherwise NO_ERROR.
    """
    divider = "--------------------------------------------------------------------------------"
    yes_words = ['y', 'yes']
    no_words = ['n', 'no']
    skip_words = ['s', 'skip']

    # get machine's hostname
    raw_hostname = subprocess.check_output(['hostname'], universal_newlines=True)
    hostname = raw_hostname.rstrip('\n')

    passed = []
    failed = []

    print(divider)
    print(" Please go to https://portal.azure.com and navigate to your workspace.\n"
          " Once there, please navigate to the 'Logs' blade, and input the queries that\n"
          " will be printed below. If the query was successful, then you should see one\n"
          " result; if not, then there will be no results.\n")

    # ask if user wants to skip entire query section
    run_section = get_input(
        "Do you want to continue with this section (all queries)? (y/n)",
        (lambda x : x.lower() in yes_words + no_words),
        "Please type either 'y'/'yes' or 'n'/'no' to proceed.")

    if (run_section.lower() not in yes_words):
        print("Continuing on with troubleshooter...")
        print(divider)
        return NO_ERROR

    for source in ['Heartbeat', 'Syslog', 'Perf']:
        query = "{0} | where Computer == '{1}' | sort by TimeGenerated desc | take 1".format(source, hostname)
        print(divider)
        print(" Please run this query:")
        print("\n    {0}\n".format(query))

        # ask if query was successful
        outcome = get_input(
            "Was the query successful? (y/n/skip)",
            (lambda x : x.lower() in yes_words + no_words + skip_words),
            "Please type either 'y'/'yes' or 'n'/'no' to proceed, or\n"
            "'s'/'skip' to skip the {0} query.".format(source)).lower()

        if (outcome in skip_words):
            # skip current query
            print(" Skipping {0} query...".format(source))
        elif (outcome in yes_words):
            # query was successful
            passed.append(source)
            print(" Continuing to next query...")
        elif (outcome in no_words):
            # query wasn't successful
            failed.append(source)
            print(" Continuing to next query...")

    # summarize query section
    passed_text = ', '.join(passed) if passed else 'none'
    failed_text = ', '.join(failed) if failed else 'none'
    print(divider)
    print(" Successful queries: {0}".format(passed_text))
    print(" Failed queries: {0}".format(failed_text))

    if (failed):
        error_info.append((', '.join(failed),))
        return ERR_QUERIES

    print("Continuing on with troubleshooter...")
    print(divider)
    return NO_ERROR
                    
예제 #9
0
def run_troubleshooter():
    """Top-level interactive menu loop of the troubleshooter.

    After the sudo check, the user repeatedly picks a scenario (1-6), 'A' for
    all scenarios, 'L' for the log collector or 'Q' to quit, then picks
    silent or interactive mode. The chosen scenario is run, the contents of
    err_summary are printed, and after a single scenario the user is asked
    whether to run another one. On a normal finish, a list of information to
    include when reporting an issue is printed.

    Returns:
        None.
    """
    thick_rule = "================================================================================"
    thin_rule = "--------------------------------------------------------------------------------"
    yes_no_help = "Please type either 'y'/'yes' or 'n'/'no' to proceed."
    single_scenarios = ['1', '2', '3', '4', '5', '6']

    # check if running as sudo
    if (not check_sudo()):
        return

    # check if want to run again
    run_again = True

    print(
        "Welcome to the OMS Agent for Linux Troubleshooter! What is your issue?\n"
    )
    while (run_again):
        print("================================================================================\n"
              "1: Agent is unhealthy or heartbeat data missing.\n"
              "2: Agent doesn't start, can't connect to Log Analytic Services.\n"
              "3: Syslog issue.\n"
              "4: Agent consuming high CPU/memory.\n"
              "5: Installation failures.\n"
              "6: Custom logs issue.\n"
              "================================================================================\n"
              "A: Run through all scenarios.\n"
              "L: Collect the logs for OMS Agent.\n"
              "Q: Press 'Q' to quit.\n"
              "================================================================================")
        # menu key -> scenario entry point
        scenarios = {
            '1': check_heartbeat,
            '2': check_connection,
            '3': check_syslog,
            '4': check_high_cpu_memory,
            '5': check_installation,
            '6': check_custom_logs,
            'A': check_all
        }
        issue = get_input(
            "Please select an option",
            (lambda x : x.lower() in ['1','2','3','4','5','6','q','quit','a','l']),
            "Please enter an integer corresponding with your issue (1-6) to\n"
            "continue (or 'A' to run through all scenarios), 'L' to run the log\n"
            "collector, or 'Q' to quit.")
        issue_choice = issue.lower()

        # quit troubleshooter
        if (issue_choice in ['q', 'quit']):
            print("Exiting the troubleshooter...")
            return

        # collect logs
        if (issue_choice == 'l'):
            print("Running the OMS Log Collector...")
            print(thick_rule)
            collect_logs()
            return

        # silent vs interactive mode
        print(thin_rule)
        print("The troubleshooter can be run in two different modes.\n"
              "  - Silent Mode runs through with no input required\n"
              "  - Interactive Mode includes extra checks that require input")
        mode = get_input(
            "Do you want to run the troubleshooter in silent (s) or interactive (i) mode?",
            (lambda x : x.lower() in ['s','silent','i','interactive','q','quit']),
            "Please enter 's'/'silent' to run silent mode, 'i'/'interactive' to run \n"
            "interactive mode, or 'q'/'quit' to quit.")
        mode_choice = mode.lower()
        if (mode_choice in ['q', 'quit']):
            print("Exiting the troubleshooter...")
            return
        if (mode_choice in ['s', 'silent']):
            print("Running troubleshooter in silent mode...")
            interactive_mode = False
        elif (mode_choice in ['i', 'interactive']):
            print("Running troubleshooter in interactive mode...")
            interactive_mode = True

        # run troubleshooter
        section = scenarios.get(issue.upper(), lambda: "Invalid input")
        print(thick_rule)
        success = section(interactive=interactive_mode)

        print(thick_rule)
        print(thick_rule)
        # print out all errors/warnings
        if (err_summary):
            print("ALL ERRORS/WARNINGS ENCOUNTERED:")
            for entry in err_summary:
                print("  {0}".format(entry))
                print(thin_rule)

        if (success == NO_ERROR):
            # no errors found
            print("No errors were found.")
        elif (success == USER_EXIT):
            # user requested to exit
            return
        else:
            # error found
            print("Please review the errors found above.")

        # anything other than a single scenario ends the loop
        if (issue not in single_scenarios):
            run_again = False
            continue

        # user ran single scenario, ask if they want to run again
        again_answer = get_input("Do you want to run another scenario? (y/n)",
                                 (lambda x : x.lower() in ['y','yes','n','no']),
                                 yes_no_help)
        again_choice = again_answer.lower()
        if (again_choice in ['n', 'no']):
            run_again = False
        else:
            if (again_choice in ['y', 'yes']):
                print("Please select another scenario below:")
            # the loop keeps going for any non-empty answer that isn't a "no"
            run_again = bool(again_answer)

    # give information to user about next steps
    print(thick_rule)
    print("If you still have an issue, please run the troubleshooter again and collect the\n"
          "logs for OMS.\n"
          "In addition, please include the following information:\n"
          "  - Azure Subscription ID where the Log Analytics Workspace is located\n"
          "  - Workspace ID the agent has been onboarded to\n"
          "  - Workspace Name\n"
          "  - Region Workspace is located\n"
          "  - Pricing Tier assigned to the Workspace\n"
          "  - Linux Distribution on the VM\n"
          "  - Log Analytics Agent Version")
    return
예제 #10
0
def check_custom_logs(interactive, prev_success=NO_ERROR):
    """Run the custom-logs scenario of the troubleshooter.

    In interactive mode the user is first asked whether custom logs are in
    use at all. The scenario then checks, in order, that omsagent is
    installed, that the Log Analytics endpoints check passes, and that the
    agent is running; a failure in one of these hands over to the matching
    scenario (installation, connection, heartbeat). Finally the custom log
    configuration is checked.

    Args:
        interactive: truthy to ask the user questions, falsy for silent mode.
        prev_success: status carried over from earlier checks; returned
            unchanged when the user is not using custom logs.

    Returns:
        A troubleshooter status code: prev_success when custom logs are not
        in use, ERR_INFO_MISSING when the workspace ID cannot be looked up,
        the result of the scenario that was handed over to, or the result of
        print_errors for the custom log configuration check.
    """
    if (interactive):
        print(" To check if you are using custom logs, please go to https://ms.portal.azure.com\n"\
            " and navigate to your workspace. Once there, please navigate to the 'Advanced\n"\
            " settings' blade, and then go to 'Data' > 'Custom Logs'. There you should be\n"\
            " able to see any custom logs you may have.\n")
        using_cl = get_input("Are you currently using custom logs? (y/n)",\
                            (lambda x : x.lower() in ['y','yes','n','no']),\
                            "Please type either 'y'/'yes' or 'n'/'no' to proceed.")
        # not using custom logs; compare case-insensitively, because the
        # validator above accepts answers such as 'N' or 'No' as well
        if (using_cl.lower() in ['n', 'no']):
            print("Continuing on with the rest of the troubleshooter...")
            print(
                "================================================================================"
            )
            return prev_success
        # using custom logs
        else:
            print("Continuing on with troubleshooter...")
            print(
                "--------------------------------------------------------------------------------"
            )

    print("CHECKING FOR CUSTOM LOG ISSUES...")

    success = prev_success

    # check if installed / connected / running correctly
    print("Checking if omsagent installed and running...")
    # check installation
    if (get_oms_version() is None):
        print_errors(ERR_OMS_INSTALL)
        print(
            "Running the installation part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_installation(interactive,
                                  err_codes=False,
                                  prev_success=ERR_FOUND)

    # check connection
    checked_la_endpts = check_log_analytics_endpts()
    if (checked_la_endpts != NO_ERROR):
        print_errors(checked_la_endpts)
        print(
            "Running the connection part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_connection(interactive,
                                err_codes=False,
                                prev_success=ERR_FOUND)

    # check running
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if (workspace_id is None):
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if (checked_omsagent_running != NO_ERROR):
        print_errors(checked_omsagent_running)
        print(
            "Running the general health part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_heartbeat(interactive, prev_success=ERR_FOUND)

    # check customlog.conf
    print("Checking for custom log configuration files...")
    checked_clconf = check_customlog_conf(interactive)
    if (is_error(checked_clconf)):
        return print_errors(checked_clconf)
    else:
        success = print_errors(checked_clconf)

    return success