Exemplo n.º 1
0
def connect_to_resource(resource):
    """Open an ssh connection to the resource described by *resource*.

    Args:
        resource: resource configuration dictionary; the keys ``'name'`` and
            ``'ssh_private_key_file'`` are read here and the whole dictionary
            is handed to ``akrr.util.ssh.ssh_resource``.

    Returns:
        The object returned by ``akrr.util.ssh.ssh_resource``.

    The process exits with status 1 when the configured private key file does
    not exist or when the connection attempt raises ``AkrrError``.
    """
    log.info("Validating resource accessibility. Connecting to %s.",
             resource['name'])
    key_file = resource['ssh_private_key_file']
    if key_file is not None and not os.path.isfile(key_file):
        log.error("Can not access ssh private key (%s)", key_file)
        exit(1)

    str_io = io.StringIO()
    try:
        # Capture whatever the ssh helper prints so that it can be reported
        # if the connection fails.
        sys.stdout = sys.stderr = str_io
        try:
            rsh = akrr.util.ssh.ssh_resource(resource)
        finally:
            # Undo the redirection no matter which exception was raised;
            # otherwise the process would keep writing into the buffer.
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__

        log.info("Successfully connected to %s\n", resource['name'])
        log.empty_line()

        return rsh
    except AkrrError:
        log.critical("Can not connect to %s\nMessage:\n%s", resource['name'],
                     str_io.getvalue())
        exit(1)
Exemplo n.º 2
0
def setup():
    """Run ``akrr setup`` in a bash session and answer its prompts.

    The answers (database users, super-users, cron e-mail) come from names
    defined outside this function. After the shell prompt returns, the
    captured output is logged and searched for the success message; the
    process exits with status 1 when that message is absent.
    """
    if add_fake_modw:
        _add_fake_modw()

    # open the bash session in which the setup command is executed
    shell = get_bash()
    shell.output = ""
    shell.timeoutMessage = 'Unexpected behavior of prep.sh (premature EOF or TIMEOUT)'

    shell.runcmd('which python3', printOutput=True)
    shell.runcmd('which ' + cfg.which_akrr, printOutput=True)

    # pass --akrr-home only when a non-default home directory is configured
    akrr_home_arg = (" --akrr-home " + cfg.akrr_home_dir
                     if cfg.default_akrr_home_dir != cfg.akrr_home_dir
                     else "")

    # launch akrr setup
    setup_command = cfg.which_akrr + " setup " + dry_run_flag + akrr_home_arg
    shell.startcmd(setup_command)

    # database user for mod_akrr, then the super-user credentials
    akrr_db_prompt = r'Please specify a database user to access mod_akrr database.*\n\[\S+\]:'
    _send_user_password(shell, akrr_db_prompt,
                        akrr_db_user_name, akrr_db_user_password)
    _send_su_user_password(shell, akrr_db_su_user_name,
                           akrr_db_su_user_password)

    # database user for mod_appkernel
    ak_db_prompt = r'Please specify a database user to access mod_appkernel database.*\n\[\S+\]:'
    _send_user_password(shell, ak_db_prompt,
                        ak_db_user_name, ak_db_user_password)
    _send_su_user_password(shell, ak_db_su_user_name, ak_db_su_user_password)

    # database user for XDMoD
    xd_db_prompt = r'Please specify the user that will be connecting to the XDMoD database.*\n\[\S+\]:'
    _send_user_password(shell, xd_db_prompt,
                        ak_db_user_name, ak_db_user_password)
    _send_su_user_password(shell, ak_db_su_user_name, ak_db_su_user_password)

    # cron e-mail: an empty answer is sent when no address is configured
    email_answer = cron_email if cron_email is not None else ""
    shell.expectSendline(
        r'.*INPUT.* Please enter the e-mail where cron will send messages.*\n',
        email_answer)

    # block until the shell prompt shows up again
    shell.justExpect(shell.prompt, timeout=60)

    log.info(shell.output)

    if "AKRR is set up and is running." in shell.output:
        log.info("AKRR is set up and is running.")
    else:
        log.critical("AKRR was not set up")
        exit(1)
Exemplo n.º 3
0
def append_to_bashrc(resource):
    """Write the AKRR environment variables into ``$HOME/.bashrc`` on the resource.

    Connects with ``akrr.util.ssh.ssh_resource``, cuts an existing block
    delimited by the AKRR ``[Start]``/``[End]`` header lines out of
    ``.bashrc`` (after copying the file to ``.bashrc_akrrbak``) and appends a
    new block exporting AKRR_NETWORK_SCRATCH, AKRR_LOCAL_SCRATCH,
    AKRR_APPKER_DIR and AKRR_AKRR_DIR taken from *resource*.

    Does nothing beyond logging when ``akrr.dry_run`` is set.

    Raises:
        Exception: any error raised while talking to the resource is logged
            together with the captured output and re-raised.
    """
    # append environment variables to .bashrc
    log.info("\nAdding AKRR enviroment variables to resource's .bashrc!\n")
    if akrr.dry_run:
        return

    # stdout/stderr are redirected into this buffer while the ssh session is
    # in use, so the captured text can be reported if something fails
    str_io = io.StringIO()
    try:
        sys.stdout = sys.stderr = str_io
        rsh = akrr.util.ssh.ssh_resource(resource)
        akrr_header = 'AKRR Remote Resource Environment Variables'

        # If .bashrc already holds an AKRR block: back the file up, then
        # rebuild it from the lines before the [Start] marker and the lines
        # after the [End] marker.
        out = akrr.util.ssh.ssh_command(
            rsh, '''if [ -e $HOME/.bashrc ]
then
   if [[ `grep "\#''' + akrr_header + ''' \[Start\]" $HOME/.bashrc` == *"''' +
            akrr_header + ''' [Start]"* ]]
   then
       echo "Updating AKRR record in $HOME/.bashrc, backing to $HOME/.bashrc_akrrbak"
       cp $HOME/.bashrc $HOME/.bashrc_akrrbak
       head -n "$(( $(grep -n '\#''' + akrr_header +
            ''' \[Start\]' $HOME/.bashrc_akrrbak | head -n 1 | cut -d ":" -f 1) - 1 ))" $HOME/.bashrc_akrrbak > $HOME/.bashrc
       tail -n "+$(( $(grep -n '\#''' + akrr_header +
            ''' \[End\]' $HOME/.bashrc_akrrbak | head -n 1 | cut -d ":" -f 1) + 1 ))" $HOME/.bashrc_akrrbak  >> $HOME/.bashrc
   fi
fi''')
        log.debug(out)
        # One remote command per appended line; the resource values are
        # inserted into the command text without any shell quoting.
        # NOTE(review): the last command repeats the "Appending ..." message
        # of the first one - possibly meant to report completion; confirm.
        cmds = ('''echo "Appending AKRR records to $HOME/.bashrc"''',
                '''echo "#''' + akrr_header + ''' [Start]" >> $HOME/.bashrc''',
                '''echo "export AKRR_NETWORK_SCRATCH=\\"''' +
                resource['network_scratch'] + '''\\"" >> $HOME/.bashrc''',
                '''echo "export AKRR_LOCAL_SCRATCH=\\"''' +
                resource['local_scratch'] + '''\\"" >> $HOME/.bashrc''',
                '''echo "export AKRR_APPKER_DIR=\\"''' +
                resource['appkernel_dir'] + '''\\"" >> $HOME/.bashrc''',
                '''echo "export AKRR_AKRR_DIR=\\"''' + resource['akrr_data'] +
                '''\\"" >> $HOME/.bashrc''',
                '''echo "#''' + akrr_header + ''' [End]" >> $HOME/.bashrc''',
                '''echo "Appending AKRR records to $HOME/.bashrc"''')
        for cmd in cmds:
            out = akrr.util.ssh.ssh_command(rsh, cmd)
            log.debug(out)
        rsh.close(force=True)
        del rsh

        # give the real streams back once the session is closed
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
    except Exception as e:
        # restore the streams first so that the report below is visible
        # NOTE(review): the ssh session is not closed on this path.
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
        log.critical(
            "Can not connect to %s\nProbably invalid credential, see full error report:\n%s",
            resource['name'], str_io.getvalue())
        raise e
Exemplo n.º 4
0
def _config_setup():
    """Load the ``setup`` section of the configuration into module globals.

    Exits with status 1 when no ``akrr`` executable is configured. When
    ``cfg.dry_run`` is set, ``dry_run_flag`` becomes the ``--dry-run`` option.
    """
    global dry_run_flag

    akrr_missing = cfg.which_akrr is None
    if akrr_missing:
        log.critical("Can not find akrr. It should be in PATH or set in conf.")
        exit(1)

    # every key of the "setup" section becomes a module-level variable
    module_namespace = globals()
    module_namespace.update(cfg.yml["setup"])

    if cfg.dry_run:
        dry_run_flag = " --dry-run "
Exemplo n.º 5
0
def set_default_value_for_unset_vars():
    """Post-process settings: fill in every location that was left unset.

    Resolves the ``akrr`` executable, classifies the installation by the
    directory the executable lives in, and derives the AKRR home,
    configuration and log directories. All results are stored in module
    globals.

    Raises:
        Exception: whatever ``run_cmd_getoutput("which akrr")`` raised when
            the executable could not be located (re-raised after logging).
    """
    import os
    from .util import run_cmd_getoutput
    from akrr.util import log

    global which_akrr
    global akrr_conf
    global akrr_conf_dir
    global akrr_home_dir
    global default_akrr_home_dir
    global akrr_log_dir
    global in_source_install
    global rpm_install
    global dev_install

    if which_akrr is None or which_akrr == "akrr":
        try:
            which_akrr = run_cmd_getoutput("which akrr").strip()
        except Exception:
            log.critical("Can not find akrr executable")
            raise

    # The installation kinds are mutually exclusive, so they are decided in
    # a single if/elif/else chain. With two independent `if` statements an
    # executable in /usr/bin was flagged both as an rpm install and as an
    # in-source install, which made the default home directory "/usr".
    akrr_bin_dir = os.path.dirname(which_akrr)
    if akrr_bin_dir == "/usr/bin":
        rpm_install = True
    elif akrr_bin_dir == "/usr/local/bin":
        dev_install = True
    else:
        in_source_install = True

    # set default_akrr_home_dir
    if in_source_install:
        # two levels above the executable, i.e. the parent of its directory
        default_akrr_home_dir = os.path.abspath(
            os.path.dirname(os.path.dirname(which_akrr)))
    elif rpm_install or dev_install:
        default_akrr_home_dir = os.path.expanduser("~/akrr")

    if akrr_home_dir is None:
        akrr_home_dir = default_akrr_home_dir
    else:
        akrr_home_dir = os.path.expanduser(akrr_home_dir)

    akrr_conf_dir = os.path.join(akrr_home_dir, "etc")
    akrr_conf = os.path.join(akrr_home_dir, "etc", 'akrr.conf')
    akrr_log_dir = os.path.join(akrr_home_dir, "log")

    log.debug("AKRR conf dir and log dir locations:\n"
              "    akrr_home: {}\n"
              "    akrr_conf: {}\n"
              "    akrr_conf_dir: {}\n"
              "    akrr_log_dir: {}\n"
              "".format(akrr_home_dir, akrr_conf, akrr_conf_dir, akrr_log_dir))
Exemplo n.º 6
0
    def start_daemon():
        """
        Start the AKRR daemon by running ``akrr daemon start``.

        Only logs the intent when ``akrr.dry_run`` is set. Exits the process
        with the command's status when that status is non-zero.
        """
        log.info("Starting AKRR daemon")
        if akrr.dry_run:
            return

        akrr_cli = os.path.join(_akrr_bin_dir, 'akrr')
        # Run the executable directly with an argument list: no shell is
        # involved, so a bin directory containing spaces or shell
        # metacharacters is passed through verbatim.
        try:
            status = subprocess.call([akrr_cli, "daemon", "start"])
        except OSError as e:
            # Without a shell a missing or non-executable file raises instead
            # of yielding an exit status; map it onto the failure path below
            # using the status shells report for "command not found".
            log.error("Can not execute %s: %s", akrr_cli, e)
            status = 127

        if status != 0:
            log.critical("AKRR daemon didn't start.")
            exit(status)
Exemplo n.º 7
0
def check_connection_to_rest_api():
    """Check that the AKRR REST API answers a ``/scheduled_tasks`` request.

    A non-200 answer is logged together with the server response and the
    process exits with status 1. Any exception raised along the way is
    logged as critical and re-raised.
    """
    try:
        response = akrrrestclient.get('/scheduled_tasks')
        if response.status_code == 200:
            return

        log.error(
            "Can not get token for AKRR REST API ( %s )\nSee server response below\n%s",
            akrrrestclient.restapi_host,
            json.dumps(response.json(), indent=4))
        exit(1)
    except Exception as e:
        log.critical(
            "Can not connect to AKRR REST API ( %s )\nIs it running?\nSee full error report below",
            akrrrestclient.restapi_host)
        raise e
Exemplo n.º 8
0
def validate_resource_parameter_file(resource_name):
    """Validate a resource parameter file and return the resource configuration.

    The default resource configuration and the resource's own
    ``resource.conf`` are executed as Python code into a scratch namespace to
    catch syntax errors; afterwards the resource is loaded through
    ``cfg.find_resource_by_name``.

    Args:
        resource_name: name of the resource; also the name of its directory
            under ``<cfg.cfg_dir>/resources``.

    Returns:
        Whatever ``cfg.find_resource_by_name(resource_name)`` returns.

    Raises:
        Exception: an error raised while reading or executing either
            configuration file is logged as critical and re-raised.

    The process exits with status 1 when the parameter file does not exist or
    when ``cfg.find_resource_by_name`` fails.
    """
    # @todo reuse  cfg.verify_resource_params
    default_resource_param_filename = os.path.join(cfg.akrr_mod_dir,
                                                   "default_conf",
                                                   "default.resource.conf")
    resource_param_filename = os.path.join(cfg.cfg_dir, "resources",
                                           resource_name, "resource.conf")

    log.info("Validating %s parameters from %s", resource_name,
             resource_param_filename)

    if not os.path.isfile(resource_param_filename):
        log.error("resource parameters file (%s) does not exist!",
                  resource_param_filename)
        exit(1)

    # check syntax
    # NOTE: both files are executed as Python code, so they must come from a
    # trusted location.
    try:
        tmp = {}
        # defaults first, then the resource file on top of them
        for conf_filename in (default_resource_param_filename,
                              resource_param_filename):
            # read through a context manager so the file handle is closed
            # even when compiling or executing the content fails
            with open(conf_filename) as conf_file:
                conf_source = conf_file.read()
            exec(compile(conf_source, conf_filename, 'exec'), tmp)
    except Exception:
        log.critical(
            "Can not load resource from %s.\nProbably invalid syntax.",
            resource_param_filename)
        raise

    resource = None
    try:
        # the syntax check passed, so the resource can now be loaded
        resource = cfg.find_resource_by_name(resource_name)
    except Exception as e:
        log.error("Can not load resource config from %s!\n%s\n%s",
                  resource_param_filename, str(e), traceback.format_exc())
        exit(1)

    log.info(
        "Syntax of %s is correct and all necessary parameters are present.",
        resource_param_filename)
    log.empty_line()
    return resource
Exemplo n.º 9
0
def submit_test_job(resource, app_name="test", nodes=2):
    """Submit a test run of *app_name* on *resource* through the AKRR REST API.

    Args:
        resource: resource configuration dictionary; ``resource['name']`` is
            sent as the target resource.
        app_name: application kernel to run.
        nodes: number of nodes requested (sent as ``nnodes``).

    Returns:
        The task id reported by the server. It is also written to the file
        named by ``get_test_job_lock_filename(resource, app_name)``.

    Raises:
        Exception: any error raised while posting the task or reading the
            answer is logged as critical and re-raised.

    The process exits with status 1 when the server answers with a status
    other than 200.
    """
    # submit test job
    r = None
    try:
        payload = {
            'resource': resource['name'],
            'app': app_name,
            'resource_param': "{'nnodes':%d}" % nodes,
            'task_param': "{'test_run':True}"
        }
        r = akrrrestclient.post('/scheduled_tasks', data=payload)
        if r.status_code != 200:
            log.error(
                "Can not submit task through AKRR REST API ( %s )\nSee server response below\n%s\n",
                akrrrestclient.restapi_host, json.dumps(r.json(), indent=4))
            exit(1)
        task_id = r.json()['data']['data']['task_id']
    except Exception:
        if r is not None:
            # The response body may not be valid JSON (which may be the very
            # reason we are here); decoding it again must not raise and hide
            # the original error, so fall back to the raw text.
            try:
                server_response = r.json()
            except ValueError:
                server_response = r.text
            log.critical(
                "Can not submit task through AKRR REST API ( %s )\n"
                "Is it still running?\nSee full error report below\n%s",
                akrrrestclient.restapi_host, server_response)
        else:
            log.critical(
                "Can not submit task through AKRR REST API ( %s )\n"
                "Is it still running?\n", akrrrestclient.restapi_host)
        raise

    # write file with task_id
    test_job_lock_filename = get_test_job_lock_filename(resource, app_name)
    with open(test_job_lock_filename, "w") as fout:
        print(task_id, file=fout)

    log.info("\nSubmitted test job to AKRR, task_id is %d\n", task_id)
    return task_id
Exemplo n.º 10
0
    def process_common_args(cli_args):
        """Transfer the common command-line options from *cli_args* into ``cfg``.

        Handles the optional ``cfg``, ``verbose``, ``very_verbose``,
        ``dry_run`` and ``which_akrr`` entries and finishes by calling
        ``cfg.set_default_value_for_unset_vars()``.
        """
        from . import cfg

        def _enabled(option):
            # the option is present in cli_args and carries a truthy value
            return option in cli_args and getattr(cli_args, option)

        def _set_log_level(level):
            # configure logging and force the root logger to *level*
            log.basicConfig(level=level)
            log.getLogger().setLevel(level)

        if "cfg" in cli_args:
            cfg.load_cfg(cli_args.cfg)

        if _enabled("verbose"):
            _set_log_level(log.DEBUG)
            cfg.verbose = True

        if _enabled("very_verbose"):
            _set_log_level(1)

        if _enabled("dry_run"):
            cfg.dry_run = cli_args.dry_run

        if "which_akrr" in cli_args and cli_args.which_akrr is not None:
            cfg.which_akrr = cli_args.which_akrr
            # the bare name "akrr" is accepted as is; anything else must
            # exist on disk
            if not (cfg.which_akrr == "akrr"
                    or os.path.exists(cfg.which_akrr)):
                log.critical("Path to akrr is incorrect. Can not find " +
                             cfg.which_akrr)

        cfg.set_default_value_for_unset_vars()
Exemplo n.º 11
0
from akrr.util.sql import set_user_password_host_port_db
from akrr.util.sql import db_exist
from akrr.util.sql import cv
from akrr.util.sql import db_check_priv
from akrr.util.sql import get_db_client_host
from akrr.util.sql import create_user_if_not_exists
import akrr.update
from akrr.util import make_dirs
from akrr.akrrversion import akrrversion

# Since AKRR setup is the first script to execute,
# let's check the Python version, library presence and external commands.

# Python version.
# The version tuple is compared as a whole: testing major and minor
# separately would reject any release whose minor number is below 4
# (for example a future 4.0) even though it is newer than 3.4.
if sys.version_info < (3, 4):
    log.critical("Python should be of version 3.4+. This one is " +
                 sys.version)
    exit(1)

# check openssl presence
try:
    subprocess.check_output("which openssl", shell=True)
except Exception as _e:
    log.error("""openssl program is not available. Install it!
    For example by running
    on CentOS
        sudo yum install openssl openssh-clients
    on Ubuntu:
        sudo apt-get install openssl""")
    raise _e

_akrr_dirs = akrr.get_akrr_dirs()
Exemplo n.º 12
0
def analyse_test_job_results(task_id, resource, app_name="test"):
    """Analyse the output of a finished test job and log every problem found.

    Fetches ``/tasks/<task_id>`` from the AKRR REST API, checks the task
    status, parses the XML report stored in the task's XDMoD instance info
    and verifies the parameters and statistics reported by the test kernel.
    Unless the batch scheduler is "openstack", the compute nodes named in the
    report are pinged from the resource over ssh and the number of reported
    node entries is compared with requested nodes * ``resource['ppn']``.

    Args:
        task_id: id of the task, formatted with ``%d`` into the request path.
        resource: resource configuration dictionary; ``'name'``,
            ``'batch_scheduler'`` and ``'ppn'`` are read here.
        app_name: application kernel name used for the lock file name and the
            results summary.

    Each failed check logs an error and increments ``log.error_count``.
    The process exits with status 1 when the task information can not be
    retrieved, when the task status contains "ERROR" or when the instance
    info status is 0; in the latter two cases the test job lock file is
    removed first.

    Raises:
        Exception: an error raised while pinging the nodes over ssh is logged
            together with the captured output and re-raised.
    """
    log.info("Test job is completed analyzing output\n")
    test_job_lock_filename = get_test_job_lock_filename(resource, app_name)
    r = akrrrestclient.get('/tasks/%d' % task_id)

    if r.status_code != 200:
        log.error(
            "Can not get information about task\nSee full error report below\nAKRR server response:\n%s\n",
            r.text)
        exit(1)

    # pick the pieces of the server answer that are analysed below;
    # akrr_errmsg falls back to the string "None" when it is absent
    completed_tasks = r.json()['data']['data']['completed_tasks']
    akrr_xdmod_instance_info = r.json(
    )['data']['data']['akrr_xdmod_instanceinfo']
    akrr_errmsg = r.json()['data']['data'].get('akrr_errmsg', "None")

    results_summary = make_results_summary(resource['name'], app_name,
                                           completed_tasks,
                                           akrr_xdmod_instance_info,
                                           akrr_errmsg)

    if completed_tasks['status'].count("ERROR") > 0:
        # execution was not successful
        if completed_tasks['status'].count(
                "ERROR Can not created batch job script and submit it to remote queue"
        ) > 0:
            log.error(
                "Can not created batch job script and/or submit it to remote queue\nSee full error report below\n%s",
                results_summary)
        else:
            log.error("Status: %s\nSee full error report below\n%s",
                      completed_tasks['status'], results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    if akrr_xdmod_instance_info['status'] == 0:
        # execution was not successful
        log.error(
            "Task execution was not successful\nSee full error report below\n%s",
            results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    # see what is in report
    elm_perf = xml.etree.ElementTree.fromstring(
        akrr_xdmod_instance_info['body'])
    elm_parameters = elm_perf.find('benchmark').find('parameters')
    elm_statistics = elm_perf.find('benchmark').find('statistics')

    # Defaults used for entries the report does not mention: empty
    # parameters and '0' statistics are treated as failures further down.
    parameters = {'RunEnv:Nodes': '', 'App:ExeBinSignature': ''}
    statistics = {
        'Wall Clock Time': '0.0',
        'Network scratch directory exists': '0',
        'Network scratch directory accessible': '0',
        'App kernel input exists': '0',
        'Task working directory accessible': '0',
        'local scratch directory accessible': '0',
        'local scratch directory exists': '0',
        'App kernel executable exists': '0',
        'Task working directory exists': '0',
        'Shell is BASH': '0'
    }

    # collect the reported parameters (ID / value / units of each element)
    for elm in list(elm_parameters):
        variable = elm.findtext('ID')
        if variable is not None:
            variable = variable.strip()
        value = elm.findtext('value')
        if value is not None:
            value = value.strip()
        units = elm.findtext('units')
        if units is not None:
            units = units.strip()

        # These two values are decoded by piping them through
        # `base64 -d | gzip -d` in a local shell.
        # NOTE(review): the value from the report is interpolated into the
        # shell command unescaped - confirm the report content is trusted.
        if variable == 'App:ExeBinSignature' or variable == 'RunEnv:Nodes':
            value = os.popen('echo "%s"|base64 -d|gzip -d' % (value, )).read()

        log.debug2("parameter: {} = {} {}".format(variable, value, units))
        parameters[variable] = value

    # collect the reported statistics the same way (no decoding here)
    for elm in list(elm_statistics):
        variable = elm.findtext('ID')
        if variable is not None:
            variable = variable.strip()
        value = elm.findtext('value')
        if value is not None:
            value = value.strip()
        units = elm.findtext('units')
        if units is not None:
            units = units.strip()

        statistics[variable] = value
        log.debug2("statistic: {} = {} {}".format(variable, value, units))

    # statistics that must not be '0'; the key text is reused to build the
    # error message by replacing 'exists' / 'accessible'
    files_exists = [
        'Network scratch directory exists', 'App kernel input exists',
        'local scratch directory exists', 'App kernel executable exists',
        'Task working directory exists'
    ]
    dirs_access = [
        'Network scratch directory accessible',
        'Task working directory accessible',
        'local scratch directory accessible'
    ]

    if statistics['Shell is BASH'] == '0':
        log.error(
            "Shell on compute nodes of %s is not BASH, change it to bash and try again.\n",
            resource['name'])
        log.error_count += 1
    for file_exists in files_exists:
        if statistics[file_exists] == '0':
            log.error(file_exists.replace('exists', 'does not exist'))
            log.error_count += 1
    for dirAccess in dirs_access:
        if statistics[dirAccess] == '0':
            log.error(dirAccess.replace('accessible', 'is not accessible'))
            log.error_count += 1

    if parameters['App:ExeBinSignature'] == '':
        log.error(
            "Application signature calculator is not working, you might need to recompile it."
            "see application output for more hints")
        log.error_count += 1

    if resource['batch_scheduler'].lower() != "openstack":
        # test the nodes, log to headnode and ping them
        if parameters['RunEnv:Nodes'] == '':
            log.error(
                "Nodes are not detected, check batch_job_template and setup of AKRR_NODELIST variable"
            )
            log.error_count += 1

        # whitespace-separated node names as decoded from the report
        nodes = parameters['RunEnv:Nodes'].split()

        # NOTE(review): eval() runs a string received from the server;
        # consider ast.literal_eval if it is always a plain dict literal.
        requested_nodes = eval(completed_tasks['resource_param'])['nnodes']

        # stdout/stderr are captured while the ssh session is in use so the
        # text can be reported if the connection fails
        str_io = io.StringIO()
        try:
            sys.stdout = sys.stderr = str_io
            rsh = akrr.util.ssh.ssh_resource(resource)

            # ping every distinct node once and count those whose ping
            # output contains "unknown host"
            number_of_unknown_hosts = 0
            for node in set(nodes):
                log.debug2(node)
                out = akrr.util.ssh.ssh_command(rsh, "ping -c 1 %s" % node)
                if out.count("unknown host") > 0:
                    number_of_unknown_hosts += 1

            rsh.close(force=True)
            del rsh

            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__

            if number_of_unknown_hosts > 0:
                log.error(
                    "ERROR %d: Can not ping compute nodes from head node\n" %
                    (log.error_count + 1) +
                    "Nodes on which test job was executed detected as " +
                    parameters['RunEnv:Nodes'] + "\n" +
                    "If these names does not have sense check batch_job_template and setup of AKRR_NODELIST "
                    "variable in resource configuration file")
                log.error_count += 1
        except Exception as e:
            # restore the streams first so that the report below is visible
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
            log.critical(
                "Can not connect to %s\nProbably invalid credential, see full error report:\n%s",
                resource['name'], str_io.getvalue())
            raise e

        # check ppn count
        # len(nodes) counts every entry of the node list, duplicates included
        if requested_nodes * resource['ppn'] != len(nodes):
            log.error(
                "ERROR {}: Number of requested processes (processes per node * nodes) "
                "do not match actual processes executed"
                "Either\n"
                "    AKRR_NODELIST variable is set incorrectly\n"
                "Or\n"
                "    processes per node (PPN) is wrong\n".format(
                    log.error_count + 1))
            log.error_count += 1
    # NOTE(review): the success message below is logged even when the checks
    # above reported errors - presumably the caller inspects log.error_count;
    # confirm.
    log.info("\nTest kernel execution summary:\n%s", results_summary)
    log.info("\nThe output looks good.\n")
Exemplo n.º 13
0
def copy_exec_sources_and_inputs(rsh, resource):
    """Make sure the remote app. kernel directory holds ``inputs`` and ``execs``.

    Lists ``resource['appkernel_dir']`` over the ssh session *rsh*. For each
    of ``inputs`` and ``execs`` that is not listed, the matching tarball from
    ``cfg.appker_repo_dir`` is copied over and unpacked (both steps are
    skipped when ``akrr.dry_run`` is set). An entry that is already listed is
    left alone and a warning is counted and logged instead. Finally a command
    removing both tarballs is sent over the session.

    Raises:
        Exception: any failure is logged as critical and re-raised, including
            the "files are not copied!" error raised when an unpacked
            directory can not be found afterwards.
    """
    log.info(
        "Preparing to copy application signature calculator,\n"
        "    app. kernel input files and \n"
        "    HPCC, IMB, IOR and Graph500 source code to remote resource\n")

    try:
        run_remote = akrr.util.ssh.ssh_command
        appkernel_dir = resource['appkernel_dir']

        # later relative paths in this session refer to the app. kernel dir
        run_remote(rsh, "cd %s" % appkernel_dir)
        listing = run_remote(rsh, "ls " + appkernel_dir)
        remote_entries = listing.strip().split()

        # ---- app. kernel inputs ----
        if "inputs" in remote_entries or "inputs/" in remote_entries:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel inputs directory %s/inputs is present, assume they are correct.\n",
                log.warning_count, appkernel_dir)
        else:
            log.info("Copying app. kernel input tarball to %s",
                     appkernel_dir)
            if not akrr.dry_run:
                akrr.util.ssh.scp_to_resource(
                    resource, cfg.appker_repo_dir + "/inputs.tar.gz",
                    appkernel_dir)

            log.info("Unpacking app. kernel input files to %s/inputs",
                     appkernel_dir)
            if not akrr.dry_run:
                unpack_output = run_remote(
                    rsh, "tar xvfz %s/inputs.tar.gz" % appkernel_dir)
                log.debug(unpack_output)

                check_output = run_remote(
                    rsh, "du -h %s/inputs" % appkernel_dir)
                log.debug(check_output)

                # the check command complains when the directory is missing
                if "No such file or directory" in check_output:
                    raise Exception("files are not copied!")
                log.info("App. kernel input files are in %s/inputs\n",
                         appkernel_dir)

        # ---- app. kernel executables / sources ----
        if "execs" in remote_entries or "execs/" in remote_entries:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel executables directory %s/execs is present, assume they are correct.",
                log.warning_count, appkernel_dir)
            log.warning(
                "It should contain HPCC,IMB,IOR and Graph500 source code and app.signature calculator\n"
            )
        else:
            log.info(
                ("Copying app. kernel execs tarball to %s\n" % appkernel_dir) +
                "It contains HPCC,IMB,IOR and Graph500 source code and app.signature calculator"
            )
            if not akrr.dry_run:
                akrr.util.ssh.scp_to_resource(
                    resource, cfg.appker_repo_dir + "/execs.tar.gz",
                    appkernel_dir)

            log.info(
                "Unpacking HPCC,IMB,IOR and Graph500 source code and app.signature calculator files to %s/execs",
                appkernel_dir)
            if not akrr.dry_run:
                unpack_output = run_remote(
                    rsh, "tar xvfz %s/execs.tar.gz" % appkernel_dir)
                log.debug(unpack_output)

                check_output = run_remote(
                    rsh, "df -h %s/execs" % appkernel_dir)
                log.debug(check_output)

                if "No such file or directory" in check_output:
                    raise Exception("files are not copied!")
                log.info(
                    "HPCC,IMB,IOR and Graph500 source code and app.signature calculator are in %s/execs\n",
                    appkernel_dir)

        # the tarballs are no longer needed once unpacked
        run_remote(rsh, "rm execs.tar.gz  inputs.tar.gz")
    except Exception as e:
        log.critical("Can not copy files to %s", resource['name'])
        raise e