Exemplo n.º 1
0
def append_to_bashrc(resource):
    """Write the AKRR environment variables section into the remote ``$HOME/.bashrc``.

    When ``dry_run`` is set only the informational message is logged.
    Otherwise two remote commands are executed through ``cfg.sshCommand``:

    1. If ``$HOME/.bashrc`` already contains the AKRR ``[Start]`` marker, the
       file is copied to ``$HOME/.bashrc_akrrbak`` and rebuilt from the lines
       before the ``[Start]`` marker and the lines after the ``[End]`` marker.
    2. A new section delimited by the ``[Start]``/``[End]`` markers is
       appended, exporting ``AKRR_NETWORK_SCRATCH``, ``AKRR_LOCAL_SCRATCH``,
       ``AKRR_APPKER_DIR`` and ``AKRR_AKRR_DIR``.

    Args:
        resource: mapping with at least the keys ``name``, ``networkScratch``,
            ``localScratch``, ``appKerDir`` and ``akrrData``.

    Raises:
        Exception: whatever the connection or remote commands raise; it is
            re-raised after the captured output has been logged.
    """
    # append environment variables to .bashrc
    log.info("\nAdding AKRR enviroment variables to resource's .bashrc!\n")
    if dry_run:
        return

    akrr_header = 'AKRR Remote Resource Environment Variables'

    # Remember the streams that are active right now so they can be put back
    # exactly as they were, even if they were already redirected by the caller.
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    str_io = io.StringIO()
    try:
        # capture everything printed while talking to the resource
        sys.stdout = sys.stderr = str_io
        rsh = cfg.sshResource(resource)

        # Backslashes are doubled so that the shell receives a literal "\#",
        # "\[" and "\]" without relying on invalid Python escape sequences.
        out = cfg.sshCommand(
            rsh, '''if [ -e $HOME/.bashrc ]
then
   if [[ `grep "\\#''' + akrr_header + ''' \\[Start\\]" $HOME/.bashrc` == *"''' +
            akrr_header + ''' [Start]"* ]]
   then
       echo "Updating AKRR record in $HOME/.bashrc, backing to $HOME/.bashrc_akrrbak"
       cp $HOME/.bashrc $HOME/.bashrc_akrrbak
       head -n "$(( $(grep -n '\\#''' + akrr_header +
            ''' \\[Start\\]' $HOME/.bashrc_akrrbak | head -n 1 | cut -d ":" -f 1) - 1 ))" $HOME/.bashrc_akrrbak > $HOME/.bashrc
       tail -n "+$(( $(grep -n '\\#''' + akrr_header +
            ''' \\[End\\]' $HOME/.bashrc_akrrbak | head -n 1 | cut -d ":" -f 1) + 1 ))" $HOME/.bashrc_akrrbak  >> $HOME/.bashrc
   fi
fi''')
        log.debug2(out)
        out = cfg.sshCommand(
            rsh, '''
echo "Appending AKRR records to $HOME/.bashrc"
echo "#''' + akrr_header + ''' [Start]" >> $HOME/.bashrc
echo "export AKRR_NETWORK_SCRATCH=\\"''' + resource['networkScratch'] +
            '''\\"" >> $HOME/.bashrc
echo "export AKRR_LOCAL_SCRATCH=\\"''' + resource['localScratch'] +
            '''\\"" >> $HOME/.bashrc
echo "export AKRR_APPKER_DIR=\\"''' + resource['appKerDir'] +
            '''\\"" >> $HOME/.bashrc
echo "export AKRR_AKRR_DIR=\\"''' + resource['akrrData'] +
            '''\\"" >> $HOME/.bashrc
echo "#''' + akrr_header + ''' [End]" >> $HOME/.bashrc
''')
        log.debug2(out)
        rsh.close(force=True)
        del rsh
    except Exception:
        # restore first so the report below is not swallowed by the capture
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
        log.critical(
            "Can not connect to %s\nProbably invalid credential, see full error report:\n%s",
            resource['name'], str_io.getvalue())
        raise
    finally:
        # covers the success path and non-Exception exits (e.g. Ctrl-C)
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
Exemplo n.º 2
0
def check_shell(rsh, resource):
    """Verify that the shell on the head node of the resource is BASH.

    Runs ``echo $BASH`` through ``cfg.sshCommand`` and looks for "bash" in
    the reply. Logs an error and exits with status 1 when it is missing.

    Args:
        rsh: remote shell handle passed to ``cfg.sshCommand``.
        resource: mapping; only ``resource['name']`` is used, for the message.
    """
    log.info("Checking if shell is BASH\n")
    reply = cfg.sshCommand(rsh, "echo $BASH")

    if "bash" not in reply:
        log.error(
            "Shell on headnode of %s is not BASH, change it to bash and try again.\n",
            resource['name'])
        exit(1)

    log.info("Shell is BASH\n")
Exemplo n.º 3
0
def check_dir_simple(sh, d):
    """
    Check that directory ``d`` exists on the remote host and is accessible.

    The existence is tested with ``[ -d "d" ]``; accessibility is tested by
    writing the word ``test`` to ``d/akrr_test_write``, reading it back and
    removing the file again when the read-back matches.

    Args:
        sh: remote shell handle passed to ``cfg.sshCommand``; its
            ``remotemachine`` attribute is used in the messages.
        d: directory path on the remote host.

    Returns:
        ``(None, message)`` if the directory does not exist,
        ``(True, message)`` if it can be written to and read from,
        ``(False, message)`` if it can not.
    """
    cmd = "if [ -d \"%s\" ]\n then \n echo EXIST\n else echo DOES_NOT_EXIST\n fi" % (
        d, )
    msg = cfg.sshCommand(sh, cmd)
    if msg.find("DOES_NOT_EXIST") >= 0:
        return None, "Directory %s:%s does not exists!" % (sh.remotemachine, d)

    # Quote the probe path the same way the existence test quotes the
    # directory, so that paths containing spaces are handled consistently.
    probe = os.path.join(d, 'akrr_test_write')
    cfg.sshCommand(sh, "echo test > \"%s\"" % (probe, ))
    msg = cfg.sshCommand(sh, "cat \"%s\"" % (probe, ))
    if msg.strip() == "test":
        cfg.sshCommand(sh, "rm \"%s\"" % (probe, ))
        return True, "Directory exist and accessible for read/write"

    return False, "Directory %s:%s is NOT accessible for read/write!" % (
        sh.remotemachine, d)
Exemplo n.º 4
0
def check_dir(sh, d, exit_on_fail=True, try_to_create=True):
    """Check a remote directory, optionally creating it and exiting on failure.

    Args:
        sh: remote shell handle; its ``remotemachine`` attribute is used in
            the messages.
        d: directory path on the remote host.
        exit_on_fail: when exactly ``False`` the status is returned to the
            caller; otherwise a missing or inaccessible directory terminates
            the program with exit status 1.
        try_to_create: when exactly ``True`` and the directory is missing,
            ``mkdir -p`` is attempted (skipped in dry-run mode, where success
            is assumed) and the directory is checked again.

    Returns:
        ``(status, message)`` as produced by ``check_dir_simple``: status is
        ``None`` (missing), ``True`` (read/write works) or ``False``.
    """
    status, msg = check_dir_simple(sh, d)
    if try_to_create is True and status is None:
        log.info("Directory %s:%s does not exists, will try to create it",
                 sh.remotemachine, d)
        if not dry_run:
            cmd = "mkdir -p \"%s\"" % (d, )
            cfg.sshCommand(sh, cmd)
            status, msg = check_dir_simple(sh, d)
        else:
            # nothing is created in a dry run; pretend the directory is fine
            status, msg = (True,
                           "Directory exist and accessible for read/write")
    if exit_on_fail is False:
        return status, msg

    if status is True:
        return True, msg

    if status is None:
        log.error("Directory %s:%s does not exists!", sh.remotemachine, d)
    else:
        log.error("Directory %s:%s is NOT accessible for read/write!",
                  sh.remotemachine, d)
    # A failed check must not report success to the calling process.
    exit(1)
Exemplo n.º 5
0
def check_appsig(rsh, resource):
    """Run the application signature calculator on the head node and check its output.

    Executes ``<appKerDir>/execs/bin/appsigcheck.sh`` on the path of
    ``md5sum`` and expects both ``===ExeBinSignature===`` and ``MD5:`` in the
    output. On failure the problem is only reported in dry-run mode;
    otherwise the full output is logged and the program exits with status 1.

    Args:
        rsh: remote shell handle passed to ``cfg.sshCommand``.
        resource: mapping providing ``appKerDir``.
    """
    log.info("Testing app.signature calculator on headnode\n")

    command = "%s/execs/bin/appsigcheck.sh `which md5sum`" % (
        resource['appKerDir'], )
    out = cfg.sshCommand(rsh, command)

    signature_present = "===ExeBinSignature===" in out and "MD5:" in out
    if signature_present:
        log.info("App.signature calculator is working on headnode\n")
        return

    if dry_run:
        log.dry_run("App.signature calculator is not working\n")
        return

    log.error(
        "App.signature calculator is not working\n" +
        "See full error report below\n%s", out)
    exit(1)
Exemplo n.º 6
0
def _findtext_stripped(elm, tag):
    """Return the text of child ``tag`` of ``elm`` without surrounding whitespace, or None if absent."""
    text = elm.findtext(tag)
    return text if text is None else text.strip()


def analyse_test_job_results(task_id, resource, app_name="test"):
    """Analyse the output of a completed test job and log every problem found.

    The task record is fetched from the AKRR REST API (``/tasks/<task_id>``).
    The function exits with status 1 when the record can not be fetched,
    when the task status contains ``ERROR`` or when the instance info status
    is 0 (the test job lock file is removed in the latter two cases).
    Otherwise the XML report body is parsed and the reported statistics and
    parameters are checked; each failed check is logged and
    ``log.error_count`` is incremented. Finally every distinct node from
    ``RunEnv:Nodes`` is pinged from the head node and the number of node
    entries is compared against ``nnodes * resource['ppn']``.

    Args:
        task_id: integer id of the task to analyse.
        resource: mapping with at least ``name`` and ``ppn``; also passed to
            ``cfg.sshResource`` and ``get_test_job_lock_filename``.
        app_name: application kernel name, used for the lock file name and
            the results summary.

    Raises:
        Exception: whatever connecting to the resource or pinging raises; it
            is re-raised after the captured output has been logged.
    """
    log.info("Test job is completed analyzing output\n")
    test_job_lock_filename = get_test_job_lock_filename(resource, app_name)
    r = akrrrestclient.get('/tasks/%d' % task_id)

    if r.status_code != 200:
        log.error(
            "Can not get information about task\nSee full error report below\nAKRR server response:\n%s\n",
            r.text)
        exit(1)

    # decode the response body once instead of once per field
    task_data = r.json()['data']['data']
    completed_tasks = task_data['completed_tasks']
    akrr_xdmod_instance_info = task_data['akrr_xdmod_instanceinfo']
    akrr_errmsg = task_data['akrr_errmsg']

    results_summary = make_results_summary(resource['name'], app_name,
                                           completed_tasks,
                                           akrr_xdmod_instance_info,
                                           akrr_errmsg)

    if completed_tasks['status'].count("ERROR") > 0:
        # execution was not successful
        if completed_tasks['status'].count(
                "ERROR Can not created batch job script and submit it to remote queue"
        ) > 0:
            log.error(
                "Can not created batch job script and/or submit it to remote queue\nSee full error report below\n%s",
                results_summary)
        else:
            log.error("Status: %s\nSee full error report below\n%s",
                      completed_tasks['status'], results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    if akrr_xdmod_instance_info['status'] == 0:
        # execution was not successful
        log.error(
            "Task execution was not successful\nSee full error report below\n%s",
            results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    # see what is in report
    elm_perf = xml.etree.ElementTree.fromstring(
        akrr_xdmod_instance_info['body'])
    elm_parameters = elm_perf.find('benchmark').find('parameters')
    elm_statistics = elm_perf.find('benchmark').find('statistics')

    # defaults are the "failed" values, so anything missing from the report
    # is treated as a failed check below
    parameters = {'RunEnv:Nodes': '', 'App:ExeBinSignature': ''}
    statistics = {
        'Wall Clock Time': '0.0',
        'Network scratch directory exists': '0',
        'Network scratch directory accessible': '0',
        'App kernel input exists': '0',
        'Task working directory accessible': '0',
        'local scratch directory accessible': '0',
        'local scratch directory exists': '0',
        'App kernel executable exists': '0',
        'Task working directory exists': '0',
        'Shell is BASH': '0'
    }

    for elm in elm_parameters:
        variable = _findtext_stripped(elm, 'ID')
        value = _findtext_stripped(elm, 'value')
        units = _findtext_stripped(elm, 'units')

        if variable in ('App:ExeBinSignature', 'RunEnv:Nodes'):
            # These two values are decoded with base64 and gunzip.
            # NOTE(review): the value from the report is interpolated into a
            # local shell command; consider decoding with the base64/gzip
            # modules instead if the report can not be fully trusted.
            value = os.popen('echo "%s"|base64 -d|gzip -d' % (value, )).read()

        log.debug2("parameter: {} = {} {}".format(variable, value, units))
        parameters[variable] = value

    for elm in elm_statistics:
        variable = _findtext_stripped(elm, 'ID')
        value = _findtext_stripped(elm, 'value')
        units = _findtext_stripped(elm, 'units')

        statistics[variable] = value
        log.debug2("statistic: {} = {} {}".format(variable, value, units))

    files_exists = [
        'Network scratch directory exists', 'App kernel input exists',
        'local scratch directory exists', 'App kernel executable exists',
        'Task working directory exists'
    ]
    dirs_access = [
        'Network scratch directory accessible',
        'Task working directory accessible',
        'local scratch directory accessible'
    ]

    if statistics['Shell is BASH'] == '0':
        log.error(
            "Shell on compute nodes of %s is not BASH, change it to bash and try again.\n",
            resource['name'])
        log.error_count += 1
    for file_exists in files_exists:
        if statistics[file_exists] == '0':
            log.error(file_exists.replace('exists', 'does not exist'))
            log.error_count += 1
    for dir_access in dirs_access:
        if statistics[dir_access] == '0':
            log.error(dir_access.replace('accessible', 'is not accessible'))
            log.error_count += 1

    if parameters['App:ExeBinSignature'] == '':
        log.error(
            "Application signature calculator is not working, you might need to recompile it."
            "see application output for more hints")
        log.error_count += 1

    # test the nodes, log to headnode and ping them
    if parameters['RunEnv:Nodes'] == '':
        log.error(
            "Nodes are not detected, check batchJobTemplate and setup of AKRR_NODELIST variable"
        )
        log.error_count += 1

    nodes = parameters['RunEnv:Nodes'].split()

    # NOTE(review): eval() executes whatever expression the server returned in
    # 'resource_param'. If that field is always a plain literal,
    # ast.literal_eval would be a safer drop-in; confirm before changing.
    requested_nodes = eval(completed_tasks['resource_param'])['nnodes']

    # Remember the streams that are active right now so they can be put back
    # exactly as they were, even if they were already redirected by the caller.
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    str_io = io.StringIO()
    number_of_unknown_hosts = 0
    try:
        # capture everything printed while talking to the resource
        sys.stdout = sys.stderr = str_io
        rsh = cfg.sshResource(resource)

        for node in set(nodes):
            log.debug2(node)
            out = cfg.sshCommand(rsh, "ping -c 1 %s" % node)
            if out.count("unknown host") > 0:
                number_of_unknown_hosts += 1

        rsh.close(force=True)
        del rsh
    except Exception:
        # restore first so the report below is not swallowed by the capture
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
        log.critical(
            "Can not connect to %s\nProbably invalid credential, see full error report:\n%s",
            resource['name'], str_io.getvalue())
        raise
    finally:
        # covers the success path and non-Exception exits (e.g. Ctrl-C)
        sys.stdout, sys.stderr = saved_stdout, saved_stderr

    if number_of_unknown_hosts > 0:
        log.error(
            "ERROR %d: Can not ping compute nodes from head node\n" %
            (log.error_count + 1) +
            "Nodes on which test job was executed detected as " +
            parameters['RunEnv:Nodes'] + "\n" +
            "If these names does not have sense check batchJobTemplate and setup of AKRR_NODELIST "
            "variable in resource configuration file")
        log.error_count += 1

    # check ppn count
    if requested_nodes * resource['ppn'] != len(nodes):
        log.error(
            "ERROR {}: Number of requested processes (processes per node * nodes) "
            "do not match actual processes executed\n"
            "Either\n"
            "    AKRR_NODELIST variable is set incorrectly\n"
            "Or\n"
            "    processes per node (PPN) is wrong\n".format(log.error_count +
                                                             1))
        log.error_count += 1
    log.info("\nTest kernel execution summary:\n%s", results_summary)
    log.info("\nThe output looks good.\n")
Exemplo n.º 7
0
def copy_exec_sources_and_inputs(rsh, resource):
    """Copy exec sources and inputs to remote resource.

    The remote shell is first moved into ``resource['appKerDir']``. For each
    of ``inputs`` and ``execs``: when the directory is not listed there, the
    matching tarball from ``cfg.appker_repo_dir`` is copied over, unpacked
    and its presence verified; when it is already listed, a warning is
    logged and the existing content is left alone. In dry-run mode the
    actions are only logged and nothing is copied, unpacked or removed.

    Args:
        rsh: remote shell handle passed to ``cfg.sshCommand``.
        resource: mapping with at least ``name`` and ``appKerDir``; also
            passed to ``cfg.scpToResource``.

    Raises:
        Exception: if the unpacked directory is missing afterwards, or
            whatever the copy/remote commands raise; re-raised after a
            critical log entry.
    """
    log.info(
        "Preparing to copy application signature calculator,\n"
        "    app. kernel input files and \n"
        "    HPCC, IMB, IOR and Graph500 source code to remote resource\n")

    try:
        # the relative tar/rm commands below rely on this working directory
        cfg.sshCommand(rsh, "cd %s" % resource['appKerDir'])
        out = cfg.sshCommand(rsh, "ls " + resource['appKerDir'])
        files_in_appker_dir = out.strip().split()

        if not ("inputs" in files_in_appker_dir
                or "inputs/" in files_in_appker_dir):
            log.info("Copying app. kernel input tarball to %s",
                     resource['appKerDir'])
            if not dry_run:
                cfg.scpToResource(resource,
                                  cfg.appker_repo_dir + "/inputs.tar.gz",
                                  resource['appKerDir'])

            log.info("Unpacking app. kernel input files to %s/inputs",
                     resource['appKerDir'])
            if not dry_run:
                out = cfg.sshCommand(
                    rsh, "tar xvfz %s/inputs.tar.gz" % resource['appKerDir'])
                log.debug(out)

                out = cfg.sshCommand(rsh,
                                     "du -h %s/inputs" % resource['appKerDir'])
                log.debug(out)

                if out.count("No such file or directory") == 0:
                    log.info("App. kernel input files are in %s/inputs\n",
                             resource['appKerDir'])
                else:
                    raise Exception("files are not copied!")
        else:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel inputs directory %s/inputs is present, assume they are correct.\n",
                log.warning_count, resource['appKerDir'])

        if not ("execs" in files_in_appker_dir
                or "execs/" in files_in_appker_dir):
            log.info(
                "Copying app. kernel execs tarball to %s\n" %
                (resource['appKerDir']) +
                "It contains HPCC,IMB,IOR and Graph500 source code and app.signature calculator"
            )
            if not dry_run:
                cfg.scpToResource(resource,
                                  cfg.appker_repo_dir + "/execs.tar.gz",
                                  resource['appKerDir'])
            log.info(
                "Unpacking HPCC,IMB,IOR and Graph500 source code and app.signature calculator files to %s/execs",
                resource['appKerDir'])
            if not dry_run:
                out = cfg.sshCommand(
                    rsh, "tar xvfz %s/execs.tar.gz" % resource['appKerDir'])
                log.debug(out)

                out = cfg.sshCommand(rsh,
                                     "df -h %s/execs" % resource['appKerDir'])
                log.debug(out)

                if out.count("No such file or directory") == 0:
                    log.info(
                        "HPCC,IMB,IOR and Graph500 source code and app.signature calculator are in %s/execs\n",
                        resource['appKerDir'])
                else:
                    raise Exception("files are not copied!")
        else:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel executables directory %s/execs is present, assume they are correct.",
                log.warning_count, resource['appKerDir'])
            log.warning(
                "It should contain HPCC,IMB,IOR and Graph500 source code and app.signature calculator\n"
            )

        # A dry run copies nothing, so it must not delete anything either.
        if not dry_run:
            cfg.sshCommand(rsh, "rm execs.tar.gz  inputs.tar.gz")
    except Exception:
        log.critical("Can not copy files to %s", resource['name'])
        raise
Exemplo n.º 8
0
def _ask_for_remote_dir(prompt, default):
    """Prompt until the user names a remote directory that passes ``check_dir``; return it expanded by the remote shell."""
    while True:
        log.log_input(prompt)
        answer = input("[%s]" % default)
        if answer.strip() == "":
            answer = default
        status, msg = resource_deploy.check_dir(rsh,
                                                answer,
                                                exit_on_fail=False,
                                                try_to_create=True)
        if status:
            log.info(msg)
            log.empty_line()
            # let the remote shell expand variables in the path
            return cfg.sshCommand(rsh, "echo %s" % (answer, )).strip()
        log.error(msg)


def get_file_system_access_points():
    """Interactively ask for the file system locations used on the resource.

    The answers are stored in the module globals ``localScratch``,
    ``networkScratch``, ``appKerDir`` and ``akrrData``. Every answer is
    passed through ``echo`` on the remote shell (module global ``rsh``)
    before it is stored.

    * local scratch: default ``/tmp``; a failed check only produces warnings.
    * network scratch: default is the remote ``$SCRATCH`` when set; an empty
      answer is rejected and asked again, a failed check only produces a
      warning.
    * app kernel directory and working directory: asked repeatedly until the
      directory passes ``resource_deploy.check_dir``.
    """
    global resource_name
    global networkScratch
    global localScratch
    global akrrData
    global appKerDir

    home_dir = cfg.sshCommand(rsh, "echo $HOME").strip()
    scratch_network_dir = cfg.sshCommand(rsh, "echo $SCRATCH").strip()

    # localScratch
    local_scratch_default = "/tmp"
    log.log_input(
        "Enter location of local scratch (visible only to single node):")
    localScratch = input("[%s]" % local_scratch_default)
    if localScratch.strip() == "":
        localScratch = local_scratch_default
    status, msg = resource_deploy.check_dir_simple(rsh, localScratch)
    if status:
        log.info(msg)
    else:
        # asked only once: a failed check on the head node is just a warning
        log.warning(msg)
        log.warning(
            'local scratch might be have a different location on head node, so if it is by design it is ok'
        )
    log.empty_line()
    localScratch = cfg.sshCommand(rsh, "echo %s" % (localScratch, )).strip()

    # networkScratch
    network_scratch_default = scratch_network_dir
    network_scratch_visible = False
    while True:
        log.log_input(
            "Enter location of network scratch (visible only to all nodes),"
            "used for temporary storage of app kernel input/output:")
        if network_scratch_default != "":
            networkScratch = input("[%s]" % network_scratch_default)
            if networkScratch.strip() == "":
                networkScratch = network_scratch_default
        else:
            networkScratch = input("")

        # Whitespace-only answers are rejected as well; otherwise they would
        # be handed to check_dir, which tries to create the directory.
        if networkScratch.strip() == "":
            log.error("Incorrect value for networkScratch, try again")
            continue

        status, msg = resource_deploy.check_dir(rsh,
                                                networkScratch,
                                                exit_on_fail=False,
                                                try_to_create=True)
        if status:
            log.info(msg)
            network_scratch_visible = True
            log.empty_line()
        else:
            log.warning(msg)
        break
    networkScratch = cfg.sshCommand(rsh,
                                    "echo %s" % (networkScratch, )).strip()

    # appKerDir
    appKerDir = _ask_for_remote_dir(
        "Enter future location of app kernels input and executable files:",
        os.path.join(home_dir, "appker", resource_name))

    # akrrData: prefer the network scratch when it passed the check above
    if network_scratch_visible:
        akrr_data_default = os.path.join(networkScratch, "akrr_data",
                                         resource_name)
    else:
        akrr_data_default = os.path.join(home_dir, "akrr_data", resource_name)
    akrrData = _ask_for_remote_dir(
        "Enter future locations for app kernels working directories (can or even should be on scratch space):",
        akrr_data_default)