Exemplo n.º 1
0
def update_omsadmin():
    """Load the KEY=VALUE pairs from the file at CONF_PATH into general_info.

    Every line is split on its first '=': the text before it becomes the key
    and everything after it (further '=' characters included) becomes the
    value.  A line without any '=' is stored whole as a key mapped to ''.

    Returns:
        NO_ERROR once the file has been read.
        ERR_SUDO_PERMS if the file cannot be opened/read due to EACCES.
        ERR_FILE_MISSING if it fails with ENOENT.
        In both failure cases a tuple describing the file is appended to
        error_info first.

    Raises:
        IOError: re-raised unchanged for any other errno.
    """
    try:
        with open(CONF_PATH, 'r') as conf_file:
            for raw_line in conf_file:
                key, _sep, value = raw_line.rstrip('\n').partition('=')
                general_info[key] = value
        return NO_ERROR
    except IOError as err:
        if err.errno == errno.EACCES:
            error_info.append((CONF_PATH, ))
            return ERR_SUDO_PERMS
        if err.errno == errno.ENOENT:
            error_info.append(('file', CONF_PATH))
            return ERR_FILE_MISSING
        # anything else is unexpected: let the caller see it
        raise
Exemplo n.º 2
0
def no_clconf(interactive):
    """Report that customlog.conf is missing and, if interactive, follow up.

    Always prints instructions for pulling the config manually.  In silent
    mode the function records the two config log paths in error_info and
    fails immediately.  In interactive mode the user is asked whether the
    manual pull was already attempted.

    Args:
        interactive: truthy when the user can be prompted for input.

    Returns:
        NO_ERROR if the user already pulled and CLCONF_PATH now exists.
        ERR_BACKEND_CONFIG in silent mode, or if the user pulled and the
        file is still absent.
        ERR_FOUND if the user has not tried pulling yet.
    """
    print(
        "--------------------------------------------------------------------------------"
    )
    print(" The troubleshooter cannot find the customlog.conf file. If the custom log \n"
          " configuration was just applied in portal, it takes up to 15 minutes for the \n"
          " agent to pick the new configuration.\n"
          " You can manually pull the config from the OMS backend by running this command:\n"
          "\n  $ sudo su omsagent -c 'python /opt/microsoft/omsconfig/Scripts/PerformRequiredConfigurationChecks.py'\n")

    # silent mode cannot ask anything, so it errors out here
    if not interactive:
        print(
            " (NOTE: if you aren't using custom logs, please ignore this message.)"
        )
        error_info.append((OMSCONFLOG_PATH, OMSCONFLOGDET_PATH))
        return ERR_BACKEND_CONFIG

    # interactive mode: ask whether the manual pull was already attempted
    answer = get_input("Have you already tried pulling the config manually? (y/n)",
                       (lambda x: x.lower() in ['y', 'yes', 'n', 'no']),
                       "Please type either 'y'/'yes' or 'n'/'no' to proceed.")

    # not tried yet: point the user back at the command printed above
    if answer.lower() not in ['y', 'yes']:
        print(
            " Please try running the above command to pull the config file."
        )
        return ERR_FOUND

    # tried pulling and the config now exists
    if os.path.isfile(CLCONF_PATH):
        print("The config file has been pulled successfully.")
        print("Continuing on with troubleshooter...")
        print(
            "--------------------------------------------------------------------------------"
        )
        return NO_ERROR

    # tried pulling but the config still doesn't exist
    # TODO: check the log files for an error in DSC
    error_info.append((OMSCONFLOG_PATH, OMSCONFLOGDET_PATH))
    return ERR_BACKEND_CONFIG
Exemplo n.º 3
0
def check_omi_cpu():
    """Run SCRIPT_FILE through bash for one minute with an 80% CPU threshold.

    The script is considered started when any line of its combined
    stdout/stderr begins with "Traces will be saved to this file: "; in that
    case the result of check_output_file() is returned.

    Returns:
        Whatever check_output_file() returns when the script started, else
        ERR_OMICPU (with the script output appended to error_info), also
        when the process exits with a non-zero status.
    """
    marker = "Traces will be saved to this file: "
    command = ['bash', SCRIPT_FILE, '--runtime-in-min', '1',
               '--cpu-threshold', '80']
    try:
        output = subprocess.check_output(command,
                                         universal_newlines=True,
                                         stderr=subprocess.STDOUT)
        if any(out_line.startswith(marker) for out_line in output.split('\n')):
            # started running successfully
            return check_output_file()
        # script didn't start running successfully
        error_info.append((output,))
        return ERR_OMICPU
    except subprocess.CalledProcessError as err:
        # process errored out
        error_info.append((err.output,))
        return ERR_OMICPU
Exemplo n.º 4
0
def start_omsagent(workspace, enabled=False):
    """Try to enable (if needed) and start the agent via the SC_PATH script.

    Args:
        workspace: passed through to check_omsagent_running() on success.
        enabled: when truthy, the 'enable' step is skipped.

    Returns:
        The result of check_omsagent_running(workspace) if every command
        exited with 0; ERR_FILE_MISSING (after recording SC_PATH in
        error_info) if the exit code was 127; None for any other exit code.
    """
    print("Agent curently not running. Attempting to start omsagent...")

    # enable the agent if necessary
    exit_code = 0 if enabled else subprocess.call([SC_PATH, 'enable'])

    # only attempt the start when enabling succeeded (or was skipped)
    if exit_code == 0:
        exit_code = subprocess.call([SC_PATH, 'start'])

    if exit_code == 0:
        return check_omsagent_running(workspace)

    if exit_code == 127:
        # script doesn't exist
        error_info.append(('executable shell script', SC_PATH))
        return ERR_FILE_MISSING

    # NOTE(review): every other non-zero exit code yields None without
    # recording anything in error_info -- confirm callers expect that.
    return None
Exemplo n.º 5
0
def check_oms(interactive):
    """Validate the installed OMS version, then refresh the omsadmin info.

    Steps, in order: the installed version must be known and at least 1.11;
    the most recent version is then looked up at OMSAGENT_URL and, in
    interactive mode, the user is offered an update when the installed
    version is older.  If the lookup fails only because that endpoint is
    unreachable (check_urlopen_errs() reports ERR_ENDPT) a warning is
    printed and the check is skipped.

    Args:
        interactive: truthy when the user may be asked about updating.

    Returns:
        ERR_OMS_INSTALL, ERR_OLD_OMS_VER, ERR_GETTING_OMS_VER, the code from
        check_urlopen_errs(), or USER_EXIT on the corresponding failure;
        otherwise the result of update_omsadmin().
    """
    cpu_bits = geninfo_lookup('CPU_BITS')

    installed_version = get_oms_version()
    if installed_version is None:
        return ERR_OMS_INSTALL

    # installed version must be >= 1.11
    if not comp_versions_ge(installed_version, '1.11'):
        error_info.append((installed_version, cpu_bits))
        return ERR_OLD_OMS_VER

    # get most recent version
    latest_version, fetch_err = get_curr_oms_version(OMSAGENT_URL)

    if latest_version is not None:
        # if not most recent version, ask if want to update
        outdated = interactive and (not comp_versions_ge(installed_version,
                                                         latest_version))
        if outdated:
            answer = ask_update_old_version(installed_version, latest_version,
                                            cpu_bits)
            if answer == USER_EXIT:
                return USER_EXIT
    elif fetch_err is None:
        # could connect, just formatting issue
        return ERR_GETTING_OMS_VER
    else:
        # couldn't connect
        url_result = check_urlopen_errs(fetch_err)
        if url_result != ERR_ENDPT:
            # issue with general internet connectivity / ssl package
            error_info.append((OMSAGENT_URL, fetch_err))
            return url_result
        # issue with connecting to Github specifically
        print(
            "WARNING: can't connect to {0}: {1}\n Skipping this check..."
            .format(OMSAGENT_URL, fetch_err))
        print(
            "--------------------------------------------------------------------------------"
        )

    return update_omsadmin()
Exemplo n.º 6
0
def check_vm_supported(vm_dist, vm_ver):
    """Check a VM distribution/version pair against supported_dists.

    A distribution matches when the lower-cased vm_dist starts with one of
    the keys of supported_dists.  A version matches when every dot-separated
    component of a supported version equals the component at the same
    position in vm_ver (so a supported '7' matches a VM version '7.4').

    Args:
        vm_dist: distribution name of the VM.
        vm_ver: dot-separated version string of the VM.

    Returns:
        NO_ERROR when a supported distribution and version match.
        ERR_OS_VER when the distribution matched but no version did.
        ERR_OS when no distribution matched.
        Details are appended to error_info in both error cases.
    """
    vm_ver_split = vm_ver.split('.')

    # find VM distribution in supported list
    vm_supported_dist = None
    for supported_dist in supported_dists:
        if not vm_dist.lower().startswith(supported_dist):
            continue

        vm_supported_dist = supported_dist
        # check if version is supported
        for supported_ver in supported_dists[supported_dist]:
            supported_ver_split = supported_ver.split('.')
            try:
                # Compare by value (==).  The previous identity test
                # (`is not`) only happened to work for small cached ints and
                # rejected equal components such as 2017 vs 2017.
                vm_ver_match = all(
                    int(supported_ver_num) == int(vm_ver_split[idx])
                    for (idx, supported_ver_num)
                    in enumerate(supported_ver_split))
            except (IndexError, ValueError):
                # VM version has fewer components or a non-numeric component
                vm_ver_match = False

            if vm_ver_match:
                return NO_ERROR

    # VM distribution is supported, but not current version
    if vm_supported_dist is not None:
        alt_vers = get_alternate_versions(vm_supported_dist)
        error_info.append((vm_dist, vm_ver, alt_vers))
        return ERR_OS_VER

    # VM distribution isn't supported
    error_info.append((vm_dist, ))
    return ERR_OS
Exemplo n.º 7
0
def check_syslogdest(sys_bind, sys_pt):
    """Validate the syslog destination file at SYSLOGDEST_PATH.

    Two kinds of lines are inspected:
      * "# OMS Syslog collection for workspace <id>" comments, whose id must
        equal the WORKSPACE_ID from the general info;
      * facility spec lines (alert/crit/debug/... selectors followed by a
        destination), whose destination is validated with check_port().

    Previously the destination check could never run: every line either hit
    a `continue` or returned before reaching it, and it referenced an
    undefined name (`sys_port`), tested the wrong match object and passed
    the check_port() arguments in the wrong order.

    Args:
        sys_bind: bind address from the syslog configuration.
        sys_pt: protocol type from the syslog configuration.

    Returns:
        NO_ERROR when every inspected line is fine, ERR_INFO_MISSING when
        the workspace ID is unknown, ERR_SYSLOG_WKSPC on a workspace
        mismatch, or the first non-NO_ERROR code from check_port().
    """
    # get workspace id
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # set up regex lines
    comment_line = r"# OMS Syslog collection for workspace (\S+)"
    spec_line = r"(\w+).=alert;(\w+).=crit;(\w+).=debug;(\w+).=emerg;(\w+).=err;"\
                r"(\w+).=info;(\w+).=notice;(\w+).=warning"

    with open(SYSLOGDEST_PATH, 'r') as syslogdest_file:
        for line in syslogdest_file:
            line = line.rstrip('\n')
            parsed_line = line.split()
            # skip empty (or whitespace-only) lines
            if not parsed_line:
                continue

            # check if workspace for syslog collection lines up
            match_comment = re.match(comment_line, line)
            if match_comment is not None:
                syslog_wkspc = match_comment.group(1)
                if workspace_id != syslog_wkspc:
                    error_info.append(
                        (syslog_wkspc, workspace_id, SYSLOGCONF_PATH))
                    return ERR_SYSLOG_WKSPC
                continue

            # only spec lines that actually carry a destination are checked
            if len(parsed_line) < 2:
                continue
            if re.match(spec_line, parsed_line[0]) is None:
                continue

            # check if port is correct
            checked_port = check_port(parsed_line[1], sys_bind, sys_pt)
            if checked_port != NO_ERROR:
                return checked_port

    # all ports set up correctly
    return NO_ERROR
Exemplo n.º 8
0
def check_log_rotation():
    """Parse the workspace's logrotate config and check its size settings.

    LR_CONFIG_PATH is formatted with the workspace ID and stored back in the
    module-level global.  The file is then read into a dict that maps each
    "<path> {" block header to the set of directive lines inside that block,
    and the dict is handed to check_size_config().

    The closing '}' is now tested before the "inside a block" case.
    Previously that branch was unreachable, so a block was never closed:
    '}' itself and every line up to the next header were recorded as
    directives of the preceding block.

    Returns:
        ERR_INFO_MISSING when the workspace ID is unknown, ERR_FILE_MISSING
        when the config file does not exist, otherwise the result of
        check_size_config().
    """
    global LR_CONFIG_PATH

    # update logrotate config path with wsid
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    LR_CONFIG_PATH = LR_CONFIG_PATH.format(workspace_id)

    # check logrotate config file exists
    if not os.path.isfile(LR_CONFIG_PATH):
        error_info.append(('logrotate config file', LR_CONFIG_PATH))
        return ERR_FILE_MISSING

    # go through logrotate config file
    logrotate_configs = dict()
    in_file = None
    with open(LR_CONFIG_PATH, 'r') as f:
        for lr_line in f:
            lr_line = lr_line.rstrip('\n')

            lr_start = re.match(r"^(\S+) \{$", lr_line)
            if lr_start is not None:
                # start of log rotation config
                in_file = lr_start.group(1)
                logrotate_configs[in_file] = set()
            elif lr_line.strip() == '}':
                # end of log rotation config
                in_file = None
            elif in_file is not None:
                # log rotation config info
                logrotate_configs[in_file].add(lr_line.lstrip())

    # check size rotation working
    return check_size_config(logrotate_configs)
Exemplo n.º 9
0
def scan_top_files(num_files, tto):
    """Find the largest files on the machine and watch them for growth.

    Runs the pipeline `find / -type f -exec du -S {} + | sort -rh | head -n
    <num_files>`, records each listed file's size and mtime, then calls
    check_top_files() once per second for `tto` seconds.

    Fixes over the previous version:
      * the find placeholder is passed as '{}'; no shell is involved, so the
        escaped form '\\{\\}' reached find literally and was not recognised;
      * head's output is read in text mode, so splitting on '\\n' also works
        on Python 3 (it was bytes before);
      * blank output lines (e.g. the trailing one) are skipped instead of
        raising IndexError;
      * paths containing whitespace are kept whole;
      * files that vanish before they can be stat'ed are skipped.

    Args:
        num_files: how many of the largest files to track.
        tto: number of one-second polling iterations.

    Returns:
        WARN_LARGE_FILES if any tracked file's recorded size exceeds its
        initial size (details appended to error_info), else NO_ERROR.
    """
    top_files = []
    print('num_files: {0}, tto: {1}'.format(num_files, tto))
    with open(os.devnull, 'w') as devnull:
        find_cmd = subprocess.Popen(
            ['find', '/', '-type', 'f', '-exec', 'du', '-S', '{}', '+'],
            stdout=subprocess.PIPE, stderr=devnull)
        print('find_cmd: {0}'.format(find_cmd))
        sort_cmd = subprocess.Popen(['sort', '-rh'],
                                    stdin=find_cmd.stdout,
                                    stdout=subprocess.PIPE)
        # drop our copy of the pipe so find sees SIGPIPE if sort exits
        find_cmd.stdout.close()
        print('sort_cmd: {0}'.format(sort_cmd))
        head_cmd = subprocess.Popen(['head', '-n', str(num_files)],
                                    stdin=sort_cmd.stdout,
                                    stdout=subprocess.PIPE,
                                    universal_newlines=True)
        sort_cmd.stdout.close()
        print('head_cmd: {0}'.format(head_cmd))
        files = head_cmd.communicate()[0]
        # reap the upstream processes so they do not linger as zombies
        sort_cmd.wait()
        find_cmd.wait()
        print('files: {0}'.format(files))
        # format file list
        parsed_files = files.split('\n')
        print('parsed_files: {0}'.format(parsed_files))

        for f in parsed_files:
            print('f: {0}'.format(f))
            # du prints "<size><whitespace><path>"; split once so the path
            # keeps any embedded whitespace
            fields = f.split(None, 1)
            if len(fields) < 2:
                continue
            fpath = fields[1]
            try:
                fstat = os.stat(fpath)
            except OSError:
                # file disappeared (or is unreadable) since find listed it
                continue
            top_files.append(
                (fpath, fstat.st_size, fstat.st_size, fstat.st_mtime, []))
        # top_files : [ (fpath1, finitsize1, fsize1, ftime1, [ (fsizechange1, fsizechangetime1), ... ]), ... ]

    # check every second
    for _sec in range(tto):
        check_top_files(top_files)
        time.sleep(1)

    # go over each file's changes
    result = NO_ERROR
    for (fpath, finitsize, fsize, ftime, fchanges) in top_files:
        if fsize > finitsize:
            error_info.append((fpath, len(fchanges), ftime))
            # TODO: add to file with more info or smth
            result = WARN_LARGE_FILES
    return result
Exemplo n.º 10
0
def check_log_analytics_endpts():
    """Check SSL connectivity to every log analytics endpoint.

    The endpoint list depends on whether '.us' appears in the OMS endpoint.
    A '*' in an endpoint is replaced by the workspace ID.  Each endpoint is
    first tried with the plain SSL_CMD; if that fails and both CERT_PATH and
    KEY_PATH exist, it is retried with the certificate and key attached.
    When the cert/key files are absent a note is printed once.

    Returns:
        ERR_INFO_MISSING if the OMS endpoint or workspace ID is unknown,
        ERR_ENDPT if at least one endpoint failed (each failure is appended
        to error_info with the command last used), else NO_ERROR.
    """
    # get OMS endpoint to check if fairfax region
    oms_endpt = geninfo_lookup('OMS_ENDPOINT')
    if oms_endpt is None:
        error_info.append(('OMS endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # get workspace ID
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # get log analytics endpoints
    if '.us' in oms_endpt:
        endpt_patterns = [
            "usge-jobruntimedata-prod-1.usgovtrafficmanager.net",
            "usge-agentservice-prod-1.usgovtrafficmanager.net",
            "*.ods.opinsights.azure.us",
            "*.oms.opinsights.azure.us",
        ]
    else:
        endpt_patterns = [
            "*.ods.opinsights.azure.com",
            "*.oms.opinsights.azure.com",
            "ods.systemcenteradvisor.com",
        ]

    outcome = NO_ERROR
    missing_certs_reported = False

    for pattern in endpt_patterns:
        # replace '*' with workspace ID
        target = (pattern.replace('*', workspace_id)
                  if '*' in pattern else pattern)

        # check endpoint without certs
        if check_endpt_ssl(SSL_CMD, target):
            continue

        if os.path.isfile(CERT_PATH) and os.path.isfile(KEY_PATH):
            # try with certs
            failed_command = "{0} -cert {1} -key {2}".format(
                SSL_CMD, CERT_PATH, KEY_PATH)
            if check_endpt_ssl(failed_command, target):
                continue
        else:
            # lets user know cert and key aren't there (only once)
            if not missing_certs_reported:
                print(
                    "NOTE: Certificate and key files don't exist, OMS isn't onboarded."
                )
                missing_certs_reported = True
            failed_command = SSL_CMD

        error_info.append((target, failed_command.format(target)))
        outcome = ERR_ENDPT

    return outcome
Exemplo n.º 11
0
def check_sys_invoke_rc(service, controller):
    """Query a service's status by running `<controller> <service> status`.

    The second whitespace-separated field of the first output line, minus
    any trailing commas, is taken as the status and must be 'start/running'.

    Args:
        service: name of the service to query.
        controller: executable used to query it.

    Returns:
        NO_ERROR when the status is 'start/running'.
        ERR_SYSLOG when the command exits with code 100.
        ERR_SERVICE_STATUS otherwise (details appended to error_info).
    """
    try:
        output = subprocess.check_output([controller, service, 'status'],
                                         universal_newlines=True,
                                         stderr=subprocess.STDOUT)
        first_line = output.split('\n')[0]
        # fields: [service, status+',', 'process', PID]; status is goal/state
        status = first_line.split()[1].rstrip(',')
        if status != 'start/running':
            # exists but not running correctly
            error_info.append((service, status, controller))
            return ERR_SERVICE_STATUS
        # exists and running correctly
        return NO_ERROR
    except subprocess.CalledProcessError as err:
        if err.returncode != 100:
            error_info.append((service, err.output, controller))
            return ERR_SERVICE_STATUS
        # service not on machine
        return ERR_SYSLOG
Exemplo n.º 12
0
def check_log_analytics_endpts():
    """Check reachability of every log analytics endpoint via check_endpt().

    The endpoint list depends on whether '.us' appears in the OMS endpoint;
    a '*' in an endpoint is replaced by the workspace ID before checking.

    Returns:
        ERR_INFO_MISSING if the OMS endpoint or workspace ID is unknown,
        ERR_ENDPT if at least one endpoint failed (each one is appended to
        error_info), else NO_ERROR.
    """
    # get OMS endpoint to check if fairfax region
    oms_endpt = geninfo_lookup('OMS_ENDPOINT')
    if oms_endpt is None:
        error_info.append(('OMS endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # get workspace ID
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # get log analytics endpoints
    if '.us' in oms_endpt:
        endpt_patterns = [
            "usge-jobruntimedata-prod-1.usgovtrafficmanager.net",
            "usge-agentservice-prod-1.usgovtrafficmanager.net",
            "*.ods.opinsights.azure.us",
            "*.oms.opinsights.azure.us",
        ]
    else:
        endpt_patterns = [
            "*.ods.opinsights.azure.com",
            "*.oms.opinsights.azure.com",
            "ods.systemcenteradvisor.com",
        ]

    outcome = NO_ERROR
    for pattern in endpt_patterns:
        # replace '*' with workspace ID
        target = (pattern.replace('*', workspace_id)
                  if '*' in pattern else pattern)

        # ping endpoint
        if check_endpt(target):
            continue
        error_info.append((target, ))
        outcome = ERR_ENDPT

    return outcome
def check_multihoming(workspace):
    """Detect whether more than one workspace directory exists for the agent.

    Every top-level directory of /var/opt/microsoft/omsagent whose name is
    at least 32 characters long is treated as a potential workspace.

    Fixes: error_info entries are now always tuples.  The multihoming case
    used to append a bare string, and the wrong-workspace case called
    list.append() with two arguments, which raises TypeError.

    Args:
        workspace: the workspace ID the agent is expected to use.

    Returns:
        ERR_MULTIHOMING when two or more potential workspaces exist.
        ERR_FILE_MISSING when none exist.
        ERR_GUID when the single one found differs from `workspace`.
        NO_ERROR otherwise.
    """
    # only the top level of directories is of interest; an unreadable or
    # missing root makes os.walk() yield nothing
    top_level = next(os.walk("/var/opt/microsoft/omsagent"), (None, [], None))
    potential_workspaces = [
        directory for directory in top_level[1] if len(directory) >= 32
    ]

    # 2+ potential workspaces
    if len(potential_workspaces) > 1:
        workspace_id_list = ', '.join(potential_workspaces)
        error_info.append((workspace_id_list, ))
        return ERR_MULTIHOMING

    # 0 potential workspaces
    if not potential_workspaces:
        missing_dir = "/var/opt/microsoft/omsagent/{0}".format(workspace)
        error_info.append(('Directory', missing_dir))
        return ERR_FILE_MISSING

    # 1 incorrect workspace
    if potential_workspaces[0] != workspace:
        error_info.append((potential_workspaces[0], workspace))
        return ERR_GUID

    # 1 correct workspace
    return NO_ERROR
Exemplo n.º 14
0
def check_omsagent_running_ps(workspace):
    """Look through `ps -ef` for an omsagent process and verify its workspace.

    Only lines starting with 'omsagent' are considered.  The command portion
    (fields from the eighth onward) must match the expected omsagent command
    line, and the identifier captured from its three paths must be the same
    in all three places.

    Args:
        workspace: the workspace ID the agent is expected to run with.

    Returns:
        NO_ERROR if the first matching process uses `workspace`.
        ERR_GUID if it uses a different one (both appended to error_info).
        ERR_OMS_WONT_RUN if no line matches.
    """
    # expected omsagent command line; the three groups capture the workspace
    omsagent_cmd = "/opt/microsoft/omsagent/ruby/bin/ruby /opt/microsoft/omsagent/bin/omsagent "\
                   "-d /var/opt/microsoft/omsagent/(\S+)/run/omsagent.pid "\
                   "-o /var/opt/microsoft/omsagent/(\S+)/log/omsagent.log "\
                   "-c /etc/opt/microsoft/omsagent/(\S+)/conf/omsagent.conf "\
                   "--no-supervisor"

    ps_output = subprocess.check_output(['ps', '-ef'],
                                        universal_newlines=True)
    for ps_line in ps_output.split('\n'):
        # check if process is OMS
        if not ps_line.startswith('omsagent'):
            continue

        # [ UID, PID, PPID, C, STIME, TTY, TIME, CMD ]
        fields = ps_line.split()
        command = ' '.join(fields[7:])

        found = re.match(omsagent_cmd, command)
        if found is None:
            continue

        # all three captured identifiers must agree
        captured = found.groups()
        guid = captured[0]
        if any(other != guid for other in captured):
            continue

        # check if OMS is running with a different workspace
        if workspace != guid:
            error_info.append((guid, workspace))
            return ERR_GUID

        # OMS currently running and delivering to the correct workspace
        return NO_ERROR

    # none of the processes running are OMS
    return ERR_OMS_WONT_RUN
Exemplo n.º 15
0
def check_sys_systemctl(service, controller):
    """Query a service's status by running `<controller> status <service>`.

    The first output line that starts with 'Active: ' (after stripping
    surrounding whitespace) decides the result.

    Fixes:
      * the 'Active: ' prefix is removed by slicing.  str.lstrip() takes a
        set of characters, so it also ate leading letters of the status
        itself (e.g. "inactive (dead)" was reported as "nactive (dead)");
      * output without any 'Active:' line is reported as a status error
        instead of silently returning None.

    Args:
        service: name of the service to query.
        controller: executable used to query it.

    Returns:
        NO_ERROR when the status starts with 'active (running) since '.
        ERR_SYSLOG when the command exits with code 4.
        ERR_SERVICE_STATUS otherwise (details appended to error_info).
    """
    prefix = 'Active: '
    try:
        sys_status = subprocess.check_output([controller, 'status', service],
                                             universal_newlines=True,
                                             stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # service not on machine
        if e.returncode == 4:
            return ERR_SYSLOG
        error_info.append((service, e.output, controller))
        return ERR_SERVICE_STATUS

    for line in sys_status.split('\n'):
        line = line.strip()
        if not line.startswith(prefix):
            continue
        stripped_line = line[len(prefix):].lstrip()
        # exists and running correctly
        if stripped_line.startswith('active (running) since '):
            return NO_ERROR
        # exists but not running correctly
        error_info.append((service, stripped_line, controller))
        return ERR_SERVICE_STATUS

    # no 'Active:' line at all: the status could not be determined
    error_info.append((service, sys_status, controller))
    return ERR_SERVICE_STATUS
Exemplo n.º 16
0
def check_sys_service(service, controller):
    """Query a service's status by running `<controller> <service> status`.

    The last whitespace-separated field of the first output line, minus any
    trailing dots, is taken as the status and must be 'running'.

    Args:
        service: name of the service to query.
        controller: executable used to query it.

    Returns:
        NO_ERROR when the status is 'running'.
        ERR_SUDO_PERMS when the command exits with code 4.
        ERR_SYSLOG when it exits with code 1 and its output contains
        'unrecognized service'.
        ERR_SERVICE_STATUS otherwise (details appended to error_info).
    """
    try:
        output = subprocess.check_output([controller, service, 'status'],
                                         universal_newlines=True,
                                         stderr=subprocess.STDOUT)
        first_line = output.split('\n')[0]
        # fields: [service, '(pid', pid+')', 'is', status+'...']
        status = first_line.split()[-1].rstrip('.')
        if status != 'running':
            # exists but not running correctly
            error_info.append((service, status, controller))
            return ERR_SERVICE_STATUS
        # exists and running correctly
        return NO_ERROR
    except subprocess.CalledProcessError as err:
        # permissions issue
        if err.returncode == 4:
            return ERR_SUDO_PERMS
        # service not on machine
        if (err.returncode == 1) and ('unrecognized service' in err.output):
            return ERR_SYSLOG
        error_info.append((service, err.output, controller))
        return ERR_SERVICE_STATUS
Exemplo n.º 17
0
def check_agent_service_endpt():
    """Check SSL connectivity to the DSC (agent service) endpoint.

    The host is the third '/'-separated piece of the DSC_ENDPOINT value.
    It is first checked with the plain SSL_CMD; if that does not both
    connect and verify, and CERT_PATH and KEY_PATH both exist, it is checked
    again with the certificate and key attached.

    Returns:
        ERR_INFO_MISSING if the DSC endpoint is unknown.
        NO_ERROR if either attempt connected and verified.
        WARN_ENDPT if an attempt connected without verifying.
        ERR_ENDPT if nothing connected.
        Warnings and errors append the endpoint and command to error_info.
    """
    # get endpoint
    dsc_endpt = geninfo_lookup('DSC_ENDPOINT')
    if dsc_endpt is None:
        error_info.append(('DSC (agent service) endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING
    agent_endpt = dsc_endpt.split('/')[2]

    # check without certs
    plain_connected, plain_verified = check_endpt_ssl(SSL_CMD, agent_endpt)
    if plain_connected and plain_verified:
        return NO_ERROR

    if os.path.isfile(CERT_PATH) and os.path.isfile(KEY_PATH):
        # try with certs
        cert_command = "{0} -cert {1} -key {2}".format(
            SSL_CMD, CERT_PATH, KEY_PATH)
        cert_connected, cert_verified = check_endpt_ssl(cert_command,
                                                        agent_endpt)
        if cert_connected:
            if cert_verified:
                return NO_ERROR
            # with certs connected, but didn't verify
            error_info.append(
                (agent_endpt, cert_command.format(agent_endpt)))
            return WARN_ENDPT
    else:
        # lets user know cert and key aren't there
        print(
            "NOTE: Certificate and key files don't exist, OMS isn't onboarded."
        )

    # Certs didn't help.  Reaching this point means the plain attempt did
    # not both connect and verify, so "connected" implies "not verified":
    # that case is a warning, while no connection at all is an error.
    error_info.append((agent_endpt, SSL_CMD.format(agent_endpt)))
    if plain_connected:
        return WARN_ENDPT
    return ERR_ENDPT
Exemplo n.º 18
0
def get_omsagent_logs(LOG_PATH):
    """Read and parse the log entries written since the last "</ROOT>" line.

    The file is scanned from the bottom for a line that is exactly
    "</ROOT>"; only the lines after it are parsed.  If there is no such
    line, the whole file is parsed.  Lines that are empty or do not look
    like "<date> <time> <zone> [<type>]: <message>" are skipped.

    Args:
        LOG_PATH: path of the log file to read.

    Returns:
        (entries, None) on success, where each entry is the list
        [date, time, zone, '[type]', message, original line];
        (None, error code) if the file could not be opened, after appending
        details to error_info.
    """
    log_template = r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} (\+|-)\d{4} \[\w+\]: .*$"
    entries = []

    try:
        with open(LOG_PATH, 'r') as log_file:
            all_lines = log_file.readlines()

            # walk backwards to the most recent "</ROOT>" marker and keep
            # only what follows it
            first_recent = 0
            for idx in reversed(range(len(all_lines))):
                if all_lines[idx] == "</ROOT>\n":
                    first_recent = idx + 1
                    break

            for raw_line in all_lines[first_recent:]:
                text = raw_line.rstrip('\n')
                # skip empty lines and anything that is not a log entry
                if text == '' or re.match(log_template, text) is None:
                    continue
                # [ date, time, zone, [logtype], log, unparsed log ]
                fields = text.split(' ', 4)
                fields[3] = fields[3].rstrip(':')
                fields.append(text)
                entries.append(fields)

        return (entries, None)

    except IOError as err:
        if err.errno == errno.EACCES:
            # can't access due to permissions
            # NOTE(review): literal 100 where the other branches use named
            # ERR_* constants -- confirm which constant this stands for.
            error_info.append((LOG_PATH, ))
            return (None, 100)
        if err.errno == errno.ENOENT:
            # file doesn't exist
            error_info.append(("file", LOG_PATH))
            return (None, ERR_FILE_MISSING)
        # some other error
        error_info.append((LOG_PATH, err))
        return (None, ERR_FILE_ACCESS)
Exemplo n.º 19
0
def check_port(port, sys_bind, sys_pt):
    """Validate one syslog destination of the form "<@ or @@><bind>:<port>".

    The expected '@' prefix depends on the installed OMS version and the
    protocol type: from version 1.12 on, 'udp' uses '@' and 'tcp' uses '@@';
    older versions use '@' for both.

    Fix: the bind address currently configured is taken from right after
    the '@' prefix (port[pt_count:]).  The previous slice started one
    character later and so dropped the first character of the address it
    compared and reported.

    Args:
        port: destination field from the syslog destination file.
        sys_bind: bind address expected from the syslog configuration.
        sys_pt: protocol type from the syslog configuration.

    Returns:
        NO_ERROR when `port` starts with the expected prefix and bind;
        otherwise ERR_OMS_INSTALL, ERR_INFO_MISSING, ERR_PT,
        ERR_PORT_MISMATCH or ERR_PORT_SETUP, with details appended to
        error_info where applicable.
    """
    oms_version = get_oms_version()
    if oms_version is None:
        return ERR_OMS_INSTALL

    # get number of '@'s in front of port
    corr_pt = None
    if comp_versions_ge(oms_version, '1.12'):
        if sys_pt == 'udp':
            corr_pt = '@'
        elif sys_pt == 'tcp':
            corr_pt = '@@'
    elif sys_pt in ['udp', 'tcp']:
        corr_pt = '@'
    # verify protocol type is valid
    if corr_pt is None:
        error_info.append(("protocol type", SYSLOGCONF_PATH))
        return ERR_INFO_MISSING

    # syslog destination file is sending to right port
    corr_port = corr_pt + sys_bind
    if port.startswith(corr_port):
        return NO_ERROR
    # wrong number of '@'s
    pt_count = port.count('@')
    corr_pt_count = corr_pt.count('@')
    if pt_count != corr_pt_count:
        pt = port[:pt_count]
        error_info.append((sys_pt, corr_pt, pt, SYSLOGDEST_PATH))
        return ERR_PT
    # wrong port: the bind address sits between the '@' prefix and ':'
    curr_bind = port[pt_count:].split(':')[0]
    if curr_bind != sys_bind:
        error_info.append((sys_bind, curr_bind, SYSLOGDEST_PATH))
        return ERR_PORT_MISMATCH
    # some other error?
    error_info.append((SYSLOGCONF_PATH, SYSLOGDEST_PATH))
    return ERR_PORT_SETUP
Exemplo n.º 20
0
def check_customlog(log_dict):
    """Check one custom log entry against its position file.

    Fixes over the previous version:
      * the dict is indexed with the string keys 'path' and 'pos_file'
        (the bare names `path` / `pos_file` were undefined);
      * the pos file is read with read().split(); list objects returned by
        readlines() have no split();
      * the inode comes from os.stat() instead of parsing `ls -li` output;
      * the inode recorded in the pos file is compared numerically, so a
        zero-padded or upper-case hex value still matches; the old
        lstrip('0x') stripped a character set rather than a prefix;
      * a pos file with missing fields is reported as a mismatch instead of
        raising IndexError.

    Args:
        log_dict: mapping with at least 'path' (the custom log file) and
            'pos_file' (its position file).
            NOTE(review): key names assumed from the variable names used in
            the original code -- confirm against the caller.

    Returns:
        NO_ERROR when both files exist and the pos file's path and unique
        number agree with the log file; otherwise ERR_FILE_MISSING,
        ERR_CL_FILEPATH or ERR_CL_UNIQUENUM, with details appended to
        error_info.
    """
    log_path = log_dict['path']
    # check if path exists
    if not os.path.isfile(log_path):
        # try splitting on like './' or something to check both file paths
        # if that doesn't work:
        error_info.append(('file', log_path))
        return ERR_FILE_MISSING

    # check if pos file exists
    log_pos_file = log_dict['pos_file']
    if not os.path.isfile(log_pos_file):
        error_info.append(('file', log_pos_file))
        return ERR_FILE_MISSING

    # check pos file contents: <path> <position> <unique number>
    with open(log_pos_file, 'r') as lpf:
        parsed_fields = lpf.read().split()

    # mismatch in pos file filepath and custom log filepath
    pos_path = parsed_fields[0] if parsed_fields else ''
    if pos_path != log_path:
        error_info.append((log_pos_file, log_path, CLCONF_PATH))
        return ERR_CL_FILEPATH
    #TODO: check size of custom log (second field of the pos file)

    # check unique number with custom log
    un_pos = parsed_fields[2] if len(parsed_fields) > 2 else ''
    un_log = os.stat(log_path).st_ino
    un_log_hex = '{0:x}'.format(un_log)
    try:
        un_matches = (int(un_pos, 16) == un_log)
    except ValueError:
        # missing or non-hexadecimal value in the pos file
        un_matches = False
    if not un_matches:
        error_info.append((log_path, un_log_hex, log_pos_file, un_pos,
                           CLCONF_PATH))
        return ERR_CL_UNIQUENUM

    return NO_ERROR
Exemplo n.º 21
0
def check_conf_files():
    """Verify the syslog config files exist, then check the destination file.

    SYSLOGCONF_PATH must exist and be non-empty.  The module-level
    SYSLOGDEST_PATH is then replaced by the SYSLOG_DEST value from the
    general info and must also exist and be non-empty.  Finally the 'bind'
    and 'protocol_type' values parsed from the syslog configuration are
    passed to check_syslogdest().

    Returns:
        ERR_FILE_MISSING / ERR_FILE_EMPTY for a missing or empty file,
        ERR_SYSLOG when SYSLOG_DEST is unknown, ERR_INFO_MISSING when the
        parsed configuration is empty or lacks a needed key, otherwise the
        result of check_syslogdest().
    """
    global SYSLOGDEST_PATH

    def file_problem(path):
        # Records and returns the error for a missing or empty file;
        # returns None when the file is present and has content.
        if not os.path.isfile(path):
            error_info.append(('file', path))
            return ERR_FILE_MISSING
        if os.stat(path).st_size == 0:
            error_info.append((path, ))
            return ERR_FILE_EMPTY
        return None

    # verify syslog.conf exists / not empty
    conf_problem = file_problem(SYSLOGCONF_PATH)
    if conf_problem is not None:
        return conf_problem

    # update syslog destination path with correct location
    syslog_dest = geninfo_lookup('SYSLOG_DEST')
    if syslog_dest is None:
        return ERR_SYSLOG
    SYSLOGDEST_PATH = syslog_dest

    # verify syslog destination exists / not empty
    dest_problem = file_problem(SYSLOGDEST_PATH)
    if dest_problem is not None:
        return dest_problem

    # parse syslog.conf
    syslogconf_dict = parse_syslogconf()
    if not syslogconf_dict:
        error_info.append(("syslog configuration info", SYSLOGCONF_PATH))
        return ERR_INFO_MISSING

    # get info for checking syslog destination file
    try:
        bind_addr = syslogconf_dict['bind']
        protocol = syslogconf_dict['protocol_type']
    except KeyError:
        error_info.append(("syslog configuration info", SYSLOGCONF_PATH))
        return ERR_INFO_MISSING

    # check with syslog destination file
    return check_syslogdest(bind_addr, protocol)
Exemplo n.º 22
0
def check_heartbeat(interactive, prev_success=NO_ERROR):
    """Run the heartbeat / health section of the troubleshooter.

    Checks, in order: that the agent is installed, that a workspace ID is
    known, that the agent is not multihoming, that the agent is running (it
    is started if it is not), and that omsagent.log shows no heartbeat
    errors.  When the installation or connection looks broken, the
    corresponding troubleshooter section is run and its result returned.

    Fix: the connection section launched after a heartbeat error now
    receives `interactive`, like the other call to check_connection() in
    this function; that argument was missing before.

    Args:
        interactive: passed on to the installation / connection sections.
        prev_success: result carried over from earlier sections; returned
            unchanged only if no later check replaces it.

    Returns:
        The result of print_errors() for the first failing check, the result
        of the delegated section, or the result of the last check's
        print_errors() when nothing failed.
    """
    print("CHECKING HEARTBEAT / HEALTH...")

    success = prev_success

    # TODO: run `sh /opt/microsoft/omsagent/bin/omsadmin.sh -l` to check if onboarded and running

    # check if installed correctly
    print("Checking if installed correctly...")
    if get_oms_version() is None:
        print_errors(ERR_OMS_INSTALL)
        print(
            "Running the installation part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_installation(interactive,
                                  err_codes=False,
                                  prev_success=ERR_FOUND)

    # get workspace ID
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if workspace_id is None:
        error_info.append(('Workspace ID', OMSADMIN_CONF_PATH))
        print_errors(ERR_INFO_MISSING)
        print(
            "Running the connection part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_connection(interactive,
                                err_codes=False,
                                prev_success=ERR_FOUND)

    # check if running multi-homing
    print("Checking if omsagent is trying to run multihoming...")
    checked_multihoming = check_multihoming(workspace_id)
    if is_error(checked_multihoming):
        return print_errors(checked_multihoming)
    success = print_errors(checked_multihoming)

    # TODO: check if other agents are sending heartbeats

    # check if omsagent is running
    print("Checking if omsagent is running...")
    checked_omsagent_running = check_omsagent_running(workspace_id)
    if checked_omsagent_running == ERR_OMS_WONT_RUN:
        # try starting omsagent
        # TODO: find better way of doing this, check to see if agent is stopped / grab results
        checked_omsagent_running = start_omsagent(workspace_id)
    if is_error(checked_omsagent_running):
        return print_errors(checked_omsagent_running)
    success = print_errors(checked_omsagent_running)

    # check if omsagent.log finds any heartbeat errors
    print("Checking for errors in omsagent.log...")
    checked_log_hb = check_log_heartbeat(workspace_id)
    if is_error(checked_log_hb):
        # other issue
        if checked_log_hb != ERR_HEARTBEAT:
            return print_errors(checked_log_hb)
        # connection issue
        print_errors(checked_log_hb)
        print(
            "Running the connection part of the troubleshooter in order to find the issue..."
        )
        print(
            "================================================================================"
        )
        return check_connection(interactive,
                                err_codes=False,
                                prev_success=ERR_FOUND)
    success = print_errors(checked_log_hb)

    return success
Exemplo n.º 23
0
def check_internet_connect():
    """
    Check general internet connectivity with an SSL probe of docs.microsoft.com.

    Returns NO_ERROR when the probe succeeds. Otherwise the command that was
    used for the probe is appended to error_info and ERR_INTERNET is returned.
    """
    probe_host = "docs.microsoft.com"
    probe_result = check_endpt_ssl(SSL_CMD, probe_host)

    # check_log_analytics_endpts unpacks check_endpt_ssl's result as a
    # (connected, verified) pair. A non-empty tuple is always truthy, so testing
    # it directly would make this check pass unconditionally; reduce it first.
    # The probe counts as failed only when neither flag is set, which is the
    # same failure condition check_log_analytics_endpts uses.
    if (isinstance(probe_result, tuple)):
        reachable = any(probe_result)
    else:
        reachable = bool(probe_result)

    if (reachable):
        return NO_ERROR

    error_info.append((SSL_CMD.format(probe_host), ))
    return ERR_INTERNET
Exemplo n.º 24
0
def check_cert():
    """
    Check that openssl can read and parse the OMS agent certificate.

    Returns:
        NO_ERROR          openssl's output starts with "Certificate:"
        ERR_CERT          openssl exited cleanly but printed something else, or
                          an unexpected exception occurred while running it
        ERR_SUDO_PERMS    openssl failed with "Permission denied"
        ERR_FILE_MISSING  openssl failed with "No such file or directory"
        ERR_FILE_ACCESS   openssl failed for any other reason

    For every result other than NO_ERROR, details are appended to error_info.
    """
    crt_path = "/etc/opt/microsoft/omsagent/certs/oms.crt"
    openssl_cmd = ['openssl', 'x509', '-in', crt_path, '-text', '-noout']

    try:
        crt_info = subprocess.check_output(openssl_cmd, universal_newlines=True,
                                           stderr=subprocess.STDOUT)
    # error with openssl (non-zero exit status)
    except subprocess.CalledProcessError as e:
        try:
            # the reason is taken from the 6th ':'-separated field of the
            # second line of openssl's output
            err = e.output.split('\n')[1].split(':')[5]
        # output didn't have the expected shape; report it verbatim instead
        except (AttributeError, IndexError, TypeError):
            error_info.append((crt_path, e.output))
            return ERR_FILE_ACCESS

        # openssl permissions error
        if (err == "Permission denied"):
            error_info.append((crt_path, ))
            return ERR_SUDO_PERMS
        # openssl file existence error
        if (err == "No such file or directory"):
            error_info.append(("file", crt_path))
            return ERR_FILE_MISSING
        # openssl some other error
        error_info.append((crt_path, err))
        return ERR_FILE_ACCESS
    # general error; Exception (not a bare except) so that KeyboardInterrupt
    # and SystemExit still propagate
    except Exception:
        error_info.append((crt_path, ))
        return ERR_CERT

    if (crt_info.startswith("Certificate:\n")):
        return NO_ERROR

    error_info.append((crt_path, ))
    return ERR_CERT
Exemplo n.º 25
0
def check_log_analytics_endpts():
    """
    Probe each Log Analytics endpoint of this workspace over SSL.

    Every endpoint is first tried without client certificates. If that attempt
    does not verify, the endpoint is retried with the agent certificate and key
    when both files exist; otherwise the result of the first attempt is used.

    Returns:
        ERR_INFO_MISSING  the OMS endpoint or workspace ID lookup returned None
        ERR_ENDPT         at least one endpoint could not be connected to
        WARN_ENDPT        no connection failures, but at least one endpoint
                          connected without verifying
        NO_ERROR          no connection or verification problems were found

    For ERR_ENDPT / WARN_ENDPT the affected (endpoint, command) pairs are
    appended to error_info.
    """
    no_certs_printed = False
    connected_err = []
    verified_err = []

    # get OMS endpoint to check if fairfax region
    oms_endpt = geninfo_lookup('OMS_ENDPOINT')
    if (oms_endpt is None):
        error_info.append(('OMS endpoint', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # get workspace ID
    workspace_id = geninfo_lookup('WORKSPACE_ID')
    if (workspace_id is None):
        error_info.append(('Workspace ID', OMSADMIN_PATH))
        return ERR_INFO_MISSING

    # get log analytics endpoints
    if ('.us' in oms_endpt):
        log_analytics_endpts = ["usge-jobruntimedata-prod-1.usgovtrafficmanager.net",
                                "usge-agentservice-prod-1.usgovtrafficmanager.net",
                                "*.ods.opinsights.azure.us",
                                "*.oms.opinsights.azure.us"]
    else:
        log_analytics_endpts = ["*.ods.opinsights.azure.com",
                                "*.oms.opinsights.azure.com",
                                "ods.systemcenteradvisor.com"]

    # same for every endpoint; the endpoint itself is filled in per probe
    ssl_cert_cmd = "{0} -cert {1} -key {2}".format(SSL_CMD, CERT_PATH, KEY_PATH)

    for endpt in log_analytics_endpts:
        # replace '*' with workspace ID
        endpt = endpt.replace('*', workspace_id)

        # check endpoint without certs
        (la_connected, la_verified) = check_endpt_ssl(SSL_CMD, endpt)
        if (la_verified):
            continue

        # The original gate here was `not (la_connected or la_verified)`, which
        # made the "connected but not verified" warning below unreachable and
        # let such endpoints pass silently. Only a verified probe is skipped now.

        # try with certs (if they exist)
        if (os.path.isfile(CERT_PATH) and os.path.isfile(KEY_PATH)):
            (la_cert_connected,
             la_cert_verified) = check_endpt_ssl(ssl_cert_cmd, endpt)

            # didn't connect or verify with certs
            if (not (la_cert_connected or la_cert_verified)):
                connected_err.append((endpt, ssl_cert_cmd.format(endpt)))
            # connected but didn't verify with certs
            elif (la_cert_connected and not la_cert_verified):
                verified_err.append((endpt, ssl_cert_cmd.format(endpt)))
            continue

        # lets user know cert and key aren't there
        if (not no_certs_printed):
            print(
                "NOTE: Certificate and key files don't exist, OMS isn't onboarded."
            )
            no_certs_printed = True

        # no certs to retry with: fall back on the result without certs
        if (la_connected and not la_verified):
            # connected (but not verified) without certs
            verified_err.append((endpt, SSL_CMD.format(endpt)))
        else:
            # neither with nor without certs connected
            connected_err.append((endpt, SSL_CMD.format(endpt)))

    # if any connection issues found, they take precedence
    if (connected_err):
        error_info.extend(connected_err)
        return ERR_ENDPT
    # if no connection issues found but some verification issues found
    if (verified_err):
        error_info.extend(verified_err)
        return WARN_ENDPT
    return NO_ERROR
Exemplo n.º 26
0
def check_e2e():
    """
    Interactively confirm that data from this machine shows up in the workspace.

    For each data source (Heartbeat, Syslog, Perf) a query filtered on this
    machine's hostname is printed, and the user is asked whether it returned a
    result. The user may skip individual queries or the whole section.

    Returns ERR_QUERIES (after appending the failed sources to error_info) when
    at least one query was reported as failed, otherwise NO_ERROR.
    """
    divider = "--------------------------------------------------------------------------------"
    yes_answers = ['y', 'yes']
    no_answers = ['n', 'no']
    skip_answers = ['s', 'skip']

    # get machine's hostname
    hostname_output = subprocess.check_output(['hostname'], universal_newlines=True)
    machine = hostname_output.rstrip('\n')

    worked = []
    broken = []

    print(divider)
    print(
        " Please go to https://portal.azure.com and navigate to your workspace.\n"
        " Once there, please navigate to the 'Logs' blade, and input the queries that\n"
        " will be printed below. If the query was successful, then you should see one\n"
        " result; if not, then there will be no results.\n"
    )

    # ask if user wants to skip entire query section
    run_section = get_input(
        "Do you want to continue with this section (all queries)? (y/n)",
        (lambda reply: reply.lower() in yes_answers + no_answers),
        "Please type either 'y'/'yes' or 'n'/'no' to proceed.")

    if run_section.lower() in yes_answers:
        for source in ('Heartbeat', 'Syslog', 'Perf'):
            query = "{0} | where Computer == '{1}' | sort by TimeGenerated desc | take 1".format(
                source, machine)
            print(divider)
            print(" Please run this query:")
            print("\n    {0}\n".format(query))

            # ask if query was successful
            retry_hint = ("Please type either 'y'/'yes' or 'n'/'no' to proceed, or\n"
                          "'s'/'skip' to skip the {0} query.").format(source)
            answer = get_input(
                "Was the query successful? (y/n/skip)",
                (lambda reply: reply.lower() in yes_answers + no_answers + skip_answers),
                retry_hint).lower()

            # skip current query
            if answer in skip_answers:
                print(" Skipping {0} query...".format(source))
                continue

            if answer in yes_answers:
                # query was successful
                worked.append(source)
            elif answer in no_answers:
                # query wasn't successful
                broken.append(source)
            else:
                # unrecognised answer: move on without recording anything
                continue
            print(" Continuing to next query...")

        # summarize query section
        print(divider)
        print(" Successful queries: {0}".format(', '.join(worked) or 'none'))
        print(" Failed queries: {0}".format(', '.join(broken) or 'none'))

        if broken:
            error_info.append((', '.join(broken),))
            return ERR_QUERIES

    print("Continuing on with troubleshooter...")
    print(divider)
    return NO_ERROR