示例#1
0
    def delete(self):
        """Handle a delete request by stopping the job of a server.

        Reads ``uuidcode``, ``Intern-Authorization``, ``servername`` and
        ``system`` from the request headers, checks the internal token with
        ``validate_auth`` and then delegates the actual work to ``stop_job``.

        Returns:
            On success a tuple ``("", 200, headers)`` where ``headers``
            carries the ``accesstoken``, ``expire`` and
            ``X-UNICORE-SecuritySession`` values returned by ``stop_job``.
            If anything raises, the error is logged and ``None`` is returned
            implicitly (unchanged from the previous behaviour).
        """
        try:
            # Track actions through different webservices.
            uuidcode = request.headers.get('uuidcode', '<no uuidcode>')
            app.log.info("uuidcode={} - Delete Server".format(uuidcode))
            app.log.trace("uuidcode={} - Headers: {}".format(
                uuidcode, request.headers))

            # Check for the J4J intern token
            validate_auth(app.log, uuidcode,
                          request.headers.get('Intern-Authorization', None))

            accesstoken, expire, security_session = stop_job(
                app.log, uuidcode, request.headers.get('servername'),
                request.headers.get('system'), request.headers, app.urls,
                False)
            app.log.trace("uuidcode={} - Return: {};{};{}".format(
                uuidcode, accesstoken, expire, security_session))

            return "", 200, {
                'accesstoken': accesstoken,
                'expire': str(expire),
                'X-UNICORE-SecuritySession': security_session
            }
        except Exception:
            # Catch Exception rather than using a bare except so that
            # SystemExit and KeyboardInterrupt are not swallowed here.
            app.log.exception("Jobs.delete failed. Bugfix required")
def get(app_logger, uuidcode, request_headers, unicore_header, app_urls, cert):
    """Poll a UNICORE/X job and report its state to JupyterHub.

    The function loops until it can report a final state or has to give up.
    Each polling round:

    1. sleeps 3 seconds and renews the access token via ``renew_token``;
       the ``Authorization`` entry of ``unicore_header`` is overwritten,
    2. fetches the job properties from ``request_headers['kernelurl']``
       (up to 3 attempts, 2 seconds apart),
    3. if the job status is ``SUCCESSFUL``, ``ERROR``, ``FAILED`` or
       ``NOT_SUCCESSFUL``: calls ``stop_job`` with a user facing error
       message and returns,
    4. lists ``request_headers['filedir']`` and looks for marker entries:
       ``.end`` means stopped, ``.host`` means running (a tunnel is created,
       after a quota check if ``.quota_check.out`` exists); otherwise the
       hub is sent ``waitforhostname`` and the next round starts.

    Args:
        app_logger: Logger offering ``trace``/``debug``/``info``/``warning``/
            ``error``/``exception``.
        uuidcode: Identifier used as prefix in every log message.
        request_headers: Mapping with (at least) ``servername``,
            ``kernelurl``, ``filedir``, token and hub related entries.
        unicore_header: Header dict sent to UNICORE/X. It is modified in
            place (``Authorization`` and ``X-UNICORE-SecuritySession``).
        app_urls: Nested mapping with the ``hub``, ``orchestrator`` and
            ``tunnel`` URLs.
        cert: Certificate setting passed through to the UNICORE/X requests.

    Returns:
        ``("", 539)`` when the properties or file listing could not be
        retrieved, ``("", 530)`` when the job already finished or failed,
        and ``None`` in every other case (status reported, quota exceeded,
        tunnel creation failed, or an unexpected exception that was logged).
    """
    # Number of attempts per polling round for fetching the job properties.
    # The give-up branches below must fire on the last attempt; they used to
    # compare against a fixed "i < 4" and were therefore unreachable with
    # three attempts.
    attempts = 3
    last_attempt = attempts - 1
    try:
        servername = request_headers.get('servername')
        # Only the part after the first ':' is used as short server name;
        # without a ':' it is the empty string.
        if ':' in servername:
            servername = servername.split(':')[1]
        else:
            servername = ''
        counter = 0  # consecutive-failure counter for the file listing
        children = []
        status = ''
        accesstoken = request_headers.get('accesstoken')
        expire = request_headers.get('expire')
        while True:
            # start with sleep, this function is only called, if .host was not in children
            time.sleep(3)
            # renew token. This may be run for a long time, so the accesstoken can expire
            accesstoken, expire = renew_token(
                app_logger, uuidcode, request_headers.get("tokenurl"),
                request_headers.get("authorizeurl"),
                request_headers.get("refreshtoken"), accesstoken, expire,
                request_headers.get('jhubtoken'),
                app_urls.get('hub', {}).get('url_proxy_route'),
                app_urls.get('hub', {}).get('url_token'),
                request_headers.get('escapedusername'),
                request_headers.get('servername'))
            unicore_header['Authorization'] = 'Bearer {}'.format(accesstoken)

            # --- Step 1: job properties (with retries) ---
            for i in range(attempts):
                properties_json = {}
                try:
                    method = "GET"
                    method_args = {
                        "url": request_headers.get('kernelurl'),
                        "headers": unicore_header,
                        "certificate": cert
                    }
                    app_logger.info(
                        "uuidcode={} - Get Properties of UNICORE/X Job {}".
                        format(uuidcode, request_headers.get('kernelurl')))
                    text, status_code, response_header = unicore_communication.request(
                        app_logger, uuidcode, method, method_args)
                    if status_code == 200:
                        unicore_header[
                            'X-UNICORE-SecuritySession'] = response_header[
                                'X-UNICORE-SecuritySession']
                        properties_json = json.loads(text)
                        if properties_json.get(
                                'status') == 'UNDEFINED' and i < last_attempt:
                            app_logger.debug(
                                "uuidcode={} - Received status UNDEFINED. Try again in 2 seconds"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            break
                    elif status_code == 404:
                        if i < last_attempt:
                            app_logger.debug(
                                "uuidcode={} - Could not get properties. 404 Not found. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            orchestrator_communication.set_skip(
                                app_logger, uuidcode,
                                app_urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request_headers.get('servername'), 'False')
                            app_logger.error(
                                "uuidcode={} - Could not get properties. 404 Not found. Do nothing and return. {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            return "", 539
                    elif status_code == 500:
                        if i < last_attempt:
                            app_logger.debug(
                                "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            app_logger.error(
                                "uuidcode={} - Could not get properties. system: {}"
                                .format(
                                    uuidcode,
                                    request_headers.get(
                                        'system', '<system_unknown>')))
                            app_logger.warning(
                                "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            app_logger.warning(
                                "uuidcode={} - Do not send update to JupyterHub."
                                .format(uuidcode))
                            # If JupyterHub does not receive an update for a long time it can stop the job itself.
                            orchestrator_communication.set_skip(
                                app_logger, uuidcode,
                                app_urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request_headers.get('servername'), 'False')
                            return "", 539
                    else:
                        app_logger.error(
                            "uuidcode={} - Unknown status_code. Add case for this"
                            .format(uuidcode))
                        if i < last_attempt:
                            app_logger.debug(
                                "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            time.sleep(2)
                        else:
                            app_logger.warning(
                                "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            # Handled by the except clause right below,
                            # which stops the job and returns 539.
                            raise Exception(
                                "{} - Could not get properties. Throw exception because of wrong status_code: {}"
                                .format(uuidcode, status_code))
                except Exception:
                    orchestrator_communication.set_skip(
                        app_logger, uuidcode,
                        app_urls.get('orchestrator', {}).get('url_skip'),
                        request_headers.get('servername'), 'False')
                    app_logger.exception(
                        "uuidcode={} - Could not get properties. Try to stop it {} {}"
                        .format(uuidcode, method, remove_secret(method_args)))
                    app_logger.trace(
                        "uuidcode={} - Call stop_job".format(uuidcode))
                    try:
                        stop_job(
                            app_logger, uuidcode, servername,
                            request_headers.get('system'), request_headers,
                            app_urls, True,
                            "Jupyter@JSC backend error. An administrator is informed. Please try again in a few minutes."
                        )
                    except Exception:
                        app_logger.exception(
                            "uuidcode={} - Could not stop Job. It may still run"
                            .format(uuidcode))
                    return "", 539

            # --- Step 2: job already finished on the UNICORE side? ---
            if properties_json.get('status') in [
                    'SUCCESSFUL', 'ERROR', 'FAILED', 'NOT_SUCCESSFUL'
            ]:
                # Job is Finished for UNICORE, so it should be for JupyterHub
                orchestrator_communication.set_skip(
                    app_logger, uuidcode,
                    app_urls.get('orchestrator', {}).get('url_skip'),
                    request_headers.get('servername'), 'False')
                # A user-initiated abort is expected and not logged as error.
                if properties_json.get(
                        'statusMessage') != 'Job was aborted by the user.':
                    app_logger.error(
                        'uuidcode={} - Get: Job is finished or failed - JobStatus: {}. Send Information to JHub.\n{}'
                        .format(uuidcode, properties_json.get('status'),
                                properties_json))
                app_logger.trace(
                    "uuidcode={} - Call stop_job".format(uuidcode))
                error_msg = ""
                try:
                    # Translate known UNICORE status messages into a
                    # user facing error message.
                    mem = utils_file_loads.map_error_messages()
                    if properties_json.get('status') in [
                            'FAILED'
                    ] and properties_json.get('statusMessage') in mem.keys():
                        error_msg = mem.get(
                            properties_json.get('statusMessage', ''),
                            "Could not start your Job. Please check your configuration. An administrator is informed."
                        )
                    else:
                        app_logger.error(
                            "uuidcode={} - StatusMessage from Failed UNICORE Job not found in /etc/j4j/j4j_mount/j4j_unicore/map_error_messages.json. Please update to have a better user experience"
                            .format(uuidcode))
                        error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
                except Exception:
                    error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
                try:
                    stop_job(app_logger, uuidcode, servername,
                             request_headers.get('system'), request_headers,
                             app_urls, True, error_msg)
                except Exception:
                    app_logger.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                return "", 530

            # --- Step 3: list the job's file directory ---
            try:
                method = "GET"
                method_args = {
                    "url": request_headers.get('filedir'),
                    "headers": unicore_header,
                    "certificate": cert
                }
                text, status_code, response_header = unicore_communication.request(
                    app_logger, uuidcode, method, method_args)
                if status_code == 200:
                    unicore_header[
                        'X-UNICORE-SecuritySession'] = response_header[
                            'X-UNICORE-SecuritySession']
                    # in UNICORE 8 the answer is a bit different
                    # Parse once and reuse the result for both layouts.
                    children_json = json.loads(text)
                    if 'children' in children_json.keys():
                        children = children_json.get('children', [])
                    elif 'content' in children_json.keys():
                        children = list(
                            children_json.get('content', {}).keys())
                    else:
                        app_logger.warning(
                            "uuidcode={} - Could not find any childrens in {}".
                            format(uuidcode, text))
                        children = []
                elif status_code == 404:
                    orchestrator_communication.set_skip(
                        app_logger, uuidcode,
                        app_urls.get('orchestrator', {}).get('url_skip'),
                        request_headers.get('servername'), 'False')
                    app_logger.warning(
                        "uuidcode={} - Could not get properties. 404 Not found. Do nothing and return. {} {} {}"
                        .format(uuidcode, text, status_code,
                                remove_secret(response_header)))
                    return "", 539
                else:
                    app_logger.warning(
                        "uuidcode={} - Could not get information about filedirectory. UNICORE/X Response: {} {} {}"
                        .format(uuidcode, text, status_code,
                                remove_secret(response_header)))
                    raise Exception(
                        "{} - Could not get information about filedirectory. Throw Exception because of wrong status_code: {}"
                        .format(uuidcode, status_code))
            except Exception:
                counter += 1
                if counter > 10:
                    app_logger.error(
                        "uuidcode={} - Get filelist ({}) failed 10 times over 30 seconds. {} {}"
                        .format(uuidcode, request_headers.get('filedir'),
                                method, remove_secret(method_args)))
                    app_logger.trace(
                        "uuidcode={} - Call stop_job".format(uuidcode))
                    # NOTE(review): the loop is not left after stop_job; the
                    # function keeps polling and relies on a later round to
                    # terminate. Confirm that this is intended.
                    try:
                        stop_job(app_logger, uuidcode, servername,
                                 request_headers.get('system'),
                                 request_headers, app_urls)
                    except Exception:
                        app_logger.exception(
                            "uuidcode={} - Could not stop Job. It may still run"
                            .format(uuidcode))
                app_logger.info(
                    "uuidcode={} - Get filelist ({}) failed {} time(s)".format(
                        uuidcode, request_headers.get('filedir'), counter))
                hub_communication.status(
                    app_logger, uuidcode,
                    app_urls.get('hub', {}).get('url_proxy_route'),
                    app_urls.get('hub', {}).get('url_status'),
                    request_headers.get('jhubtoken'), 'waitforhostname',
                    request_headers.get('escapedusername'), servername)
                continue

            # --- Step 4: derive the status from the marker files ---
            if '.end' in children or '/.end' in children:
                # It's not running anymore
                status = 'stopped'
            elif '.host' in children or '/.host' in children:
                # running, build up tunnel if quota_check was successful
                if '.quota_check.out' in children or '/.quota_check.out' in children:
                    quota_result = jobs_utils.quota_check(
                        app_logger, uuidcode, app_urls, request_headers,
                        unicore_header, cert, servername)
                    if not quota_result:
                        # User Quota in $HOME is exceeded. The job was stopped.
                        return
                try:
                    tunnel_utils.create(
                        app_logger, uuidcode,
                        app_urls.get('hub', {}).get('url_proxy_route'),
                        app_urls.get('tunnel', {}).get('url_tunnel'),
                        app_urls.get('hub', {}).get('url_cancel'),
                        request_headers.get('kernelurl'),
                        request_headers.get('filedir'), unicore_header,
                        request_headers.get('servername'),
                        request_headers.get('system'),
                        request_headers.get('port'), cert,
                        request_headers.get('jhubtoken'),
                        request_headers.get('escapedusername'), servername,
                        app_urls.get('orchestrator', {}).get('url_hostname'))
                except Exception:
                    orchestrator_communication.set_skip(
                        app_logger, uuidcode,
                        app_urls.get('orchestrator', {}).get('url_skip'),
                        request_headers.get('servername'), 'False')
                    app_logger.exception(
                        "uuidcode={} - Could not create tunnel".format(
                            uuidcode))
                    app_logger.trace(
                        "uuidcode={} - Call stop_job".format(uuidcode))
                    try:
                        stop_job(app_logger, uuidcode, servername,
                                 request_headers.get('system'),
                                 request_headers, app_urls)
                    except Exception:
                        app_logger.exception(
                            "uuidcode={} - Could not stop Job. It may still run"
                            .format(uuidcode))
                    return
                status = "running"
            else:
                # Neither marker is present yet: report and poll again.
                app_logger.info(
                    "uuidcode={} - Update JupyterHub status ({})".format(
                        uuidcode, "waitforhostname"))
                hub_communication.status(
                    app_logger, uuidcode,
                    app_urls.get('hub', {}).get('url_proxy_route'),
                    app_urls.get('hub', {}).get('url_status'),
                    request_headers.get('jhubtoken'), "waitforhostname",
                    request_headers.get('escapedusername'), servername)
                continue

            # --- Step 5: report the final status and finish ---
            app_logger.info(
                "uuidcode={} - Update JupyterHub status ({})".format(
                    uuidcode, status))
            hub_communication.status(
                app_logger, uuidcode,
                app_urls.get('hub', {}).get('url_proxy_route'),
                app_urls.get('hub', {}).get('url_status'),
                request_headers.get('jhubtoken'), status,
                request_headers.get('escapedusername'), servername)
            if status in ['running', 'stopped'] and request_headers.get(
                    'spawning',
                    'true').lower() == 'true':  # spawning is finished
                app_logger.trace(
                    'uuidcode={} - Tell J4J_Orchestrator that the spawning is done'
                    .format(uuidcode))
                try:
                    orchestrator_communication.set_spawning(
                        app_logger, uuidcode,
                        app_urls.get('orchestrator', {}).get('url_spawning'),
                        request_headers.get('servername'), 'False')
                except Exception:
                    app_logger.exception(
                        "uuidcode={} - Could not set spawning to false in J4J_Orchestrator database for {}"
                        .format(uuidcode, request_headers.get('servername')))
            orchestrator_communication.set_skip(
                app_logger, uuidcode,
                app_urls.get('orchestrator', {}).get('url_skip'),
                request_headers.get('servername'), 'False')
            return
    except Exception:
        # Last line of defence: log anything unexpected and return None.
        app_logger.exception("uuidcode={} - Bugfix required".format(uuidcode))
示例#3
0
    def post(self):
        try:
            # Track actions through different webservices.
            uuidcode = request.headers.get('uuidcode', '<no uuidcode>')
            app.log.info("uuidcode={} - Spawn Server".format(uuidcode))
            app.log.trace("uuidcode={} - Headers: {}".format(
                uuidcode, request.headers))
            app.log.trace("uuidcode={} - Json: {}".format(
                uuidcode, request.json))

            # Check for J4J intern token
            validate_auth(app.log, uuidcode,
                          request.headers.get('Intern-Authorization'))

            servername = request.headers.get('servername')
            # Create header for unicore job
            try:
                unicore_header, accesstoken, expire = unicore_utils.create_header(
                    app.log,  # @UnusedVariable
                    uuidcode,
                    request.headers,
                    app.urls.get('hub', {}).get('url_proxy_route'),
                    app.urls.get('hub', {}).get('url_token'),
                    request.headers.get('escapedusername'),
                    servername)
            except (SpawnException, Exception) as e:
                if type(e).__name__ == "SpawnException":
                    err_msg = str(e)
                else:
                    err_msg = "Unknown Error. An administrator is informed. Please try again in a few minutes"
                app.log.exception(
                    "uuidcode={} - Could not create header for UNICORE/X Job. {} {}"
                    .format(uuidcode, remove_secret(request.json),
                            app.urls.get('tunnel', {}).get('url_remote')))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                try:
                    stop_job(app.log, uuidcode, servername,
                             request.json.get('system'), request.headers,
                             app.urls, True, err_msg, False)
                except:
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                # Return positive status: Administrator is informed and there is nothing we can do here otherwise.
                return "", 200

            # Create input files for the job. A working J4J_tunnel webservice is required
            try:
                unicore_input = unicore_utils.create_inputs(
                    app.log, uuidcode, request.json,
                    request.headers.get('project'),
                    app.urls.get('tunnel', {}).get('url_remote'),
                    request.headers.get('account'))
            except (SpawnException, Exception) as e:
                if type(e).__name__ == "SpawnException":
                    err_msg = str(e)
                else:
                    err_msg = "Unknown Error. An administrator is informed. Please try again in a few minutes."
                app.log.exception(
                    "uuidcode={} - Could not create input files for UNICORE/X Job. {} {}"
                    .format(uuidcode, remove_secret(request.json),
                            app.urls.get('tunnel', {}).get('url_remote')))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                try:
                    stop_job(app.log, uuidcode, servername,
                             request.json.get('system'), request.headers,
                             app.urls, True, err_msg, False)
                except:
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                return "", 534

            # Create Job description
            unicore_file = utils_file_loads.get_unicorex()
            if unicore_file.get(request.json.get('system').upper(),
                                {}).get("UNICORE8", False):
                unicore_json = unicore_utils.create_unicore8_job(
                    app.log, uuidcode, request.json,
                    request.headers.get('Project'), unicore_input,
                    request.headers.get('escapedusername'))
            else:
                unicore_json = unicore_utils.create_job(
                    app.log, uuidcode, request.json,
                    request.headers.get('Project'), unicore_input)

            # Get URL and certificate to communicate with UNICORE/X
            app.log.trace(
                "uuidcode={} - FileLoad: UNICORE/X url".format(uuidcode))
            unicorex = utils_file_loads.get_unicorex()
            url = unicorex.get(request.json.get('system', ''), {}).get(
                'link',
                '<no_url_found_for_{}>'.format(request.json.get('system')))
            app.log.trace(
                "uuidcode={} - FileLoad: UNICORE/X url Result: {}".format(
                    uuidcode, url))
            cert = unicorex.get(request.json.get('system', ''),
                                {}).get('certificate', False)
            app.log.trace(
                "uuidcode={} - FileLoad: UNICORE/X certificate path Result: {}"
                .format(uuidcode, cert))

            # Submit Job. It will not be started, because of unicore_json['haveClientStageIn']='true'
            kernelurl = ""
            try:
                hub_communication.status(
                    app.log, uuidcode,
                    app.urls.get('hub', {}).get('url_proxy_route'),
                    app.urls.get('hub', {}).get('url_status'),
                    request.headers.get('jhubtoken'), 'submitunicorejob',
                    request.headers.get('escapedusername'), servername)
                method = "POST"
                method_args = {
                    "url": url + "/jobs",
                    "headers": unicore_header,
                    "data": json.dumps(unicore_json),
                    "certificate": cert
                }
                app.log.info("uuidcode={} - Submit UNICORE/X Job to {}".format(
                    uuidcode, url + "/jobs"))
                text, status_code, response_header = unicore_communication.request(
                    app.log, uuidcode, method, method_args)
                if status_code != 201:
                    app.log.warning(
                        "uuidcode={} - Could not submit Job. Response from UNICORE/X: {} {} {}."
                        .format(uuidcode, text, status_code,
                                remove_secret(response_header)))
                    if status_code == 500:
                        app.log.error(
                            "uuidcode={} - UNICORE RESTART REQUIRED!! {}".
                            format(
                                uuidcode,
                                request.json.get('system',
                                                 '<system_unknown>')))
                    elif status_code == 403 or status_code == 432:
                        raise SpawnException(
                            "Invalid token. Please logout and login again.")
                    else:
                        app.log.error(
                            "uuidcode={} - Unexpected status_code. Add case for this status_code."
                            .format(uuidcode))
                    raise SpawnException(
                        "A backend service has to be restarted. An administrator is informed. Please try again in a few minutes."
                    )
                else:
                    unicore_header[
                        'X-UNICORE-SecuritySession'] = response_header[
                            'X-UNICORE-SecuritySession']
                    kernelurl = response_header['Location']
            except (SpawnException, Exception) as e:
                if type(e).__name__ == "SpawnException":
                    err_msg = str(e)
                else:
                    err_msg = "Unknown Error. An administrator is informed. Please try again in a few minutes"
                    app.log.exception(
                        "uuidcode={} - User message: {} - Could not submit Job. {} {}"
                        .format(uuidcode, err_msg, method,
                                remove_secret(method_args)))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                try:
                    stop_job(app.log, uuidcode, servername,
                             request.json.get('system'), request.headers,
                             app.urls, True, err_msg, False)
                except:
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                return "", 539

            # get properties of job
            for i in range(5):  # @UnusedVariable
                properties_json = {}
                try:
                    method = "GET"
                    method_args = {
                        "url": kernelurl,
                        "headers": unicore_header,
                        "certificate": cert
                    }
                    app.log.info(
                        "uuidcode={} - Get Properties of UNICORE/X Job {}".
                        format(uuidcode, kernelurl))
                    text, status_code, response_header = unicore_communication.request(
                        app.log, uuidcode, method, method_args)
                    if status_code != 200:
                        if status_code == 500:
                            app.log.error(
                                "uuidcode={} - UNICORE RESTART REQUIRED!! {}".
                                format(
                                    uuidcode,
                                    request.json.get('system',
                                                     '<system_unknown>')))
                            raise SpawnException(
                                "A backend service has to be restarted. An administrator is informed. Please try again in a few minutes."
                            )
                        else:
                            app.log.error(
                                "uuidcode={} - Unexpected status_code. Add case for this status_code."
                                .format(uuidcode))
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get properties of Job. Try again in 2 seconds"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.warning(
                                "uuidcode={} - Could not get properties of Job. Response from UNICORE/X: {} {} {}."
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            raise Exception(
                                "{} - Could not get properties of Job. Throw exception because of wrong status_code: {}"
                                .format(uuidcode, status_code))
                    else:
                        unicore_header[
                            'X-UNICORE-SecuritySession'] = response_header[
                                'X-UNICORE-SecuritySession']
                        properties_json = json.loads(text)
                        if properties_json.get(
                                'status') == 'UNDEFINED' and i < 4:
                            app.log.debug(
                                "uuidcode={} - Received status UNDEFINED. Try again in 2 seconds"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            break
                except (SpawnException, Exception) as e:
                    if type(e).__name__ == "SpawnException":
                        err_msg = str(e)
                    else:
                        err_msg = "Unknown Error. An administrator is informed. Please try again in a few minutes"
                        app.log.exception(
                            "uuidcode={} - Could not get properties of Job. {} {}"
                            .format(uuidcode, method,
                                    remove_secret(method_args)))
                    app.log.trace(
                        "uuidcode={} - Call stop_job".format(uuidcode))
                    try:
                        stop_job(app.log, uuidcode, servername,
                                 request.json.get('system'), request.headers,
                                 app.urls, True, err_msg)
                    except:
                        app.log.exception(
                            "uuidcode={} - Could not stop Job. It may still run"
                            .format(uuidcode))
                    return "", 539

            # get file directory
            # this will be used in get. Ask it here once and send it to get() afterwards
            filedirectory = ""
            try:
                method = "GET"
                method_args = {
                    "url":
                    properties_json['_links']['workingDirectory']['href'],
                    "headers": unicore_header,
                    "certificate": cert
                }
                app.log.info(
                    "uuidcode={} - Get path of file directory of UNICORE/X Job"
                    .format(uuidcode))
                text, status_code, response_header = unicore_communication.request(
                    app.log, uuidcode, method, method_args)
                if status_code != 200:
                    app.log.error(
                        "uuidcode={} - Unknown status_code. Please add case for this status_code"
                        .format(uuidcode))
                    app.log.warning(
                        "uuidcode={} - Could not get filedirectory. UNICORE/X Response: {} {} {}"
                        .format(uuidcode, text, status_code,
                                remove_secret(response_header)))
                    raise Exception(
                        "{} - Could not get filedirectory. Throw exception because of wrong status_code: {}"
                        .format(uuidcode, status_code))
                else:
                    unicore_header[
                        'X-UNICORE-SecuritySession'] = response_header[
                            'X-UNICORE-SecuritySession']
                    filedirectory = json.loads(text)['_links']['files']['href']
            except (SpawnException, Exception) as e:
                if type(e).__name__ == "SpawnException":
                    err_msg = str(e)
                else:
                    err_msg = "Unknown Error. An administrator is informed. Please try again in a few minutes"
                app.log.exception(
                    "uuidcode={} - Could not get filedirectory. {} {}".format(
                        uuidcode, method, remove_secret(method_args)))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                try:
                    stop_job(app.log, uuidcode, servername,
                             request.json.get('system'), request.headers,
                             app.urls, True, err_msg)
                except:
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                return "", 539

            return "", 201, {
                'kernelurl':
                kernelurl,
                'filedir':
                filedirectory,
                'X-UNICORE-SecuritySession':
                unicore_header.get('X-UNICORE-SecuritySession')
            }
        except:
            app.log.exception("Jobs.post failed. Bugfix required")
示例#4
0
    def get(self):
        """Check the state of a UNICORE/X job and report it to JupyterHub.

        Everything is driven by the incoming request headers. The headers read
        here are: ``uuidcode``, ``intern-authorization``, ``servername``,
        ``escapedusername``, ``system``, ``kernelurl``, ``filedir``,
        ``pollspawner``, ``port``, ``jhubtoken`` and ``spawning``.

        Steps:

        1. Validate the internal token and build the UNICORE header.
        2. Fetch the job properties from ``kernelurl`` (up to 5 attempts,
           2 seconds apart).
        3. If UNICORE reports a finished/failed job, map the status message to
           a user-facing error, stop the job and return.
        4. Otherwise list the files in ``filedir`` (up to 5 attempts) and
           derive the status from the marker files ``.end``, ``.tunnel`` and
           ``.host``: create a tunnel, start a background thread that waits
           for ``.host``, or simply report ``running``/``stopped``.
        5. Send the status to JupyterHub and update the orchestrator flags.

        Returns:
            ``("", 200)`` if the UNICORE header could not be created or the
            UNICORE "no exit code file" hotfix applies, ``("", 530)`` if the
            job is finished or failed, ``("", 539)`` if UNICORE/X could not be
            queried or the tunnel could not be created. On the regular path,
            and when an unexpected exception reaches the outer handler, the
            method falls through without an explicit return value.
        """
        try:
            # Track actions through different webservices.
            uuidcode = request.headers.get('uuidcode', '<no uuidcode>')
            app.log.info("uuidcode={} - Get Server Status".format(uuidcode))
            app.log.trace("uuidcode={} - Headers: {}".format(
                uuidcode, request.headers))

            # Check for J4J intern token
            validate_auth(app.log, uuidcode,
                          request.headers.get('intern-authorization'))
            servername = request.headers.get('servername')

            # Create UNICORE header and get certificate
            try:
                unicore_header, accesstoken, expire = unicore_utils.create_header(
                    app.log,  # @UnusedVariable
                    uuidcode,
                    request.headers,
                    app.urls.get('hub', {}).get('url_proxy_route'),
                    app.urls.get('hub', {}).get('url_token'),
                    request.headers.get('escapedusername'),
                    servername)
            except (SpawnException, Exception):
                app.log.exception(
                    "uuidcode={} - Could not Create Header. Token from user {} might be revoked. Do nothing and return."
                    .format(uuidcode, request.headers.get('escapedusername')))
                # Return positive status: Administrator is informed and there is nothing we can do here otherwise.
                return "", 200
            app.log.trace(
                "uuidcode={} - FileLoad: UNICORE/X certificate path".format(
                    uuidcode))
            unicorex = utils_file_loads.get_unicorex()
            cert = unicorex.get(request.headers.get('system', ''),
                                {}).get('certificate', False)
            app.log.trace(
                "uuidcode={} - FileLoad: UNICORE/X certificate path Result: {}"
                .format(uuidcode, cert))

            # Get Properties of kernelurl
            # Up to 5 attempts; on the last one (i == 4) every branch either
            # breaks, returns or raises, so the loop never ends silently.
            kernelurl = request.headers.get('kernelurl')
            for i in range(5):  # @UnusedVariable
                properties_json = {}
                try:
                    method = "GET"
                    method_args = {
                        "url": kernelurl,
                        "headers": unicore_header,
                        "certificate": cert
                    }
                    app.log.info(
                        "uuidcode={} - Get Properties of UNICORE/X Job {}".
                        format(uuidcode, kernelurl))
                    text, status_code, response_header = unicore_communication.request(
                        app.log, uuidcode, method, method_args)
                    if status_code == 200:
                        # Carry the security session over to the next request
                        unicore_header[
                            'X-UNICORE-SecuritySession'] = response_header[
                                'X-UNICORE-SecuritySession']
                        properties_json = json.loads(text)
                        if properties_json.get(
                                'status') == 'UNDEFINED' and i < 4:
                            app.log.debug(
                                "uuidcode={} - Received status UNDEFINED. Try again in 2 seconds"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            break
                    elif status_code == 404:
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get properties. 404 Not found. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.warning(
                                "uuidcode={} - Could not get properties. 404 Not found. Stop Job and return. {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            try:
                                stop_job(app.log, uuidcode, servername,
                                         request.headers.get('system'),
                                         request.headers, app.urls, True, '',
                                         False)
                            except:
                                app.log.exception(
                                    "uuidcode={} - Could not stop Job. It may still run"
                                    .format(uuidcode))
                            return "", 539
                    elif status_code == 500:
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.error(
                                "uuidcode={} - UNICORE RESTART REQUIRED!!. system: {}"
                                .format(
                                    uuidcode,
                                    request.headers.get(
                                        'system', '<system_unknown>')))
                            app.log.warning(
                                "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            app.log.warning(
                                "uuidcode={} - Do not send update to JupyterHub."
                                .format(uuidcode))
                            # If JupyterHub does not receive an update for a long time it can stop the job itself.
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            return "", 539
                    else:
                        app.log.error(
                            "uuidcode={} - Unknown status_code received. Add case for this: {} {}"
                            .format(uuidcode, status_code, text))
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get properties. Sleep for 2 seconds and try again"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.warning(
                                "uuidcode={} - Could not get properties. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            raise Exception(
                                "{} - Could not get properties. Throw exception because of wrong status_code: {}"
                                .format(uuidcode, status_code))
                except:
                    app.log.exception(
                        "uuidcode={} - Could not get properties. JupyterLab will be still running. {} {}"
                        .format(uuidcode, method, remove_secret(method_args)))
                    app.log.warning(
                        "uuidcode={} - Do not send update to JupyterHub.".
                        format(uuidcode))
                    # If JupyterHub does not receive an update for a long time it can stop the job itself.
                    orchestrator_communication.set_skip(
                        app.log, uuidcode,
                        app.urls.get('orchestrator', {}).get('url_skip'),
                        request.headers.get('servername'), 'False')
                    return "", 539

            if properties_json.get('status') in [
                    'SUCCESSFUL', 'ERROR', 'FAILED', 'NOT_SUCCESSFUL'
            ]:
                # Job is Finished for UNICORE, so it should be for JupyterHub
                if request.headers.get('pollspawner',
                                       'false').lower() == 'true':
                    app.log.error(
                        'uuidcode={} - Get (poll spawner): Job is finished or failed - JobStatus: {}. Send Information to JHub. {}'
                        .format(uuidcode, properties_json.get('status'),
                                properties_json))
                    if properties_json.get(
                            'statusMessage', ''
                    ) == "Failed: Execution was not completed (no exit code file found), please check standard error file <stderr>":
                        app.log.error(
                            "uuidcode={} - UNICORE hotfix: do nothing because that's most likely a bug."
                            .format(uuidcode))
                        return "", 200
                else:
                    # A user-initiated abort is expected and not logged as an error
                    if not properties_json.get(
                            'statusMessage') == 'Job was aborted by the user.':
                        app.log.error(
                            'uuidcode={} - At starting process: Job is finished or failed - JobStatus: {}. Send Information to JHub. {}'
                            .format(uuidcode, properties_json.get('status'),
                                    properties_json))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                orchestrator_communication.set_skip(
                    app.log, uuidcode,
                    app.urls.get('orchestrator', {}).get('url_skip'),
                    request.headers.get('servername'), 'False')
                # Translate the UNICORE status message into a user-facing message:
                # exact match for FAILED jobs first, prefix match otherwise,
                # generic fallback if nothing matches or the mapping cannot be loaded.
                error_msg = ""
                try:
                    mem = utils_file_loads.map_error_messages()
                    if properties_json.get('status') in [
                            'FAILED'
                    ] and properties_json.get('statusMessage') in mem.keys():
                        error_msg = mem.get(
                            properties_json.get('statusMessage', ''),
                            "Could not start your Job. Please check your configuration. An administrator is informed."
                        )
                    else:
                        for key, value in mem.items():
                            if properties_json.get('statusMessage',
                                                   '').startswith(key):
                                error_msg = value
                        if error_msg == "":
                            if request.headers.get('pollspawner',
                                                   'false').lower() == 'true':
                                app.log.error(
                                    "uuidcode={} - StatusMessage from Failed UNICORE Job not found in /etc/j4j/j4j_mount/j4j_unicore/map_error_messages.json. Please update to have a better user experience"
                                    .format(uuidcode))
                            error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
                except:
                    error_msg = "Could not start your Job. Please check your configuration. An administrator is informed."
                try:
                    stop_job(app.log, uuidcode, servername,
                             request.headers.get('system'), request.headers,
                             app.urls, True, error_msg)
                except:
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
                return "", 530

            # The Job is not finished yet (good)
            # Get Files in the filedir
            children = []
            for i in range(5):  # @UnusedVariable
                try:
                    method = "GET"
                    method_args = {
                        "url": request.headers.get('filedir'),
                        "headers": unicore_header,
                        "certificate": cert
                    }
                    app.log.info(
                        "uuidcode={} - Get list of files of UNICORE/X Job {}".
                        format(uuidcode, kernelurl))
                    text, status_code, response_header = unicore_communication.request(
                        app.log, uuidcode, method, method_args)
                    if status_code == 200:
                        unicore_header[
                            'X-UNICORE-SecuritySession'] = response_header[
                                'X-UNICORE-SecuritySession']
                        # in UNICORE 8 the answer is a bit different
                        # Parse the response once and reuse the result below
                        children_json = json.loads(text)
                        if 'children' in children_json.keys():
                            children = children_json.get('children', [])
                        elif 'content' in children_json.keys():
                            children = list(
                                children_json.get('content', {}).keys())
                        else:
                            app.log.warning(
                                "uuidcode={} - Could not find any childrens in {}"
                                .format(uuidcode, text))
                            children = []
                        if len(children) == 0 and i < 4:
                            app.log.debug(
                                "uuidcode={} - Received empty children list. Try again in 2 seconds"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            break
                    elif status_code == 404:
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get children list. 404 Not found. Try again in 2 seconds."
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.error(
                                "uuidcode={} - Could not get children list. 404 Not found. Do nothing and return. {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            return "", 539
                    elif status_code == 500:
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get children list. Status Code 500. Try again in 2 seconds."
                                .format(uuidcode))
                            sleep(2)
                        else:
                            app.log.error(
                                "uuidcode={} - UNICORE/X RESTART REQUIRED".
                                format(uuidcode))
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            return "", 539
                    else:
                        if i < 4:
                            app.log.debug(
                                "uuidcode={} - Could not get children list. Try again in 2 seconds"
                                .format(uuidcode))
                            sleep(2)
                        else:
                            # uuidcode must be the first argument: the format
                            # string has three fields, and passing only two
                            # raised IndexError, which the handler below then
                            # reported as a UNICORE/X failure.
                            app.log.error(
                                "uuidcode={} - Unknown status code. Add case for this: {} {}"
                                .format(uuidcode, status_code, text))
                            app.log.error(
                                "uuidcode={} - Could not get children list. Do nothing and return. UNICORE/X Response: {} {} {}"
                                .format(uuidcode, text, status_code,
                                        remove_secret(response_header)))
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            return "", 539
                except:
                    app.log.error(
                        "uuidcode={} - UNICORE/X RESTART REQUIRED".format(
                            uuidcode))
                    app.log.exception(
                        "uuidcode={} - Could not get children list. {} {}".
                        format(uuidcode, method, remove_secret(method_args)))
                    orchestrator_communication.set_skip(
                        app.log, uuidcode,
                        app.urls.get('orchestrator', {}).get('url_skip'),
                        request.headers.get('servername'), 'False')
                    return "", 539

            # get the 'real' status of the job from the files in the working_directory
            # 'real' means: We don't care about Queued, ready, running or something. We just want to know: Is it bad (failed or cancelled) or good (running or spawning)
            status = ''
            if properties_json.get('status') in [
                    'QUEUED', 'READY', 'RUNNING', 'STAGINGIN'
            ]:
                if '.end' in children or '/.end' in children:
                    # It's not running anymore
                    status = 'stopped'
                elif '.tunnel' in children or '/.tunnel' in children:
                    # It's running and tunnel is up
                    status = 'running'
                elif '.host' in children or '/.host' in children:
                    if request.headers.get('pollspawner',
                                           'false').lower() == 'true':
                        # If there's an error when collecting the children list it may happen, that we would try to create a tunnel for a server that's already running for a long time
                        app.log.error(
                            'uuidcode={} - Poll Spawner wants to create tunnel. Stop it. Children list: {}'
                            .format(uuidcode, children))
                        status = 'running'
                    else:
                        # build up tunnel
                        try:
                            tunnel_utils.create(
                                app.log, uuidcode,
                                app.urls.get('hub', {}).get('url_proxy_route'),
                                app.urls.get('tunnel', {}).get('url_tunnel'),
                                app.urls.get('hub',
                                             {}).get('url_cancel'), kernelurl,
                                request.headers.get('filedir'), unicore_header,
                                request.headers.get('servername'),
                                request.headers.get('system'),
                                request.headers.get('port'), cert,
                                request.headers.get('jhubtoken'),
                                request.headers.get('escapedusername'),
                                servername)
                        except:
                            app.log.error(
                                "uuidcode={} - Could not create Tunnel. Used Parameters: {} {} {} {} {} {} {} {} {} {}"
                                .format(
                                    uuidcode,
                                    app.urls.get('tunnel',
                                                 {}).get('url_tunnel'),
                                    app.urls.get('hub', {}).get('url_cancel'),
                                    kernelurl, request.headers.get('filedir'),
                                    remove_secret(unicore_header),
                                    request.headers.get('servername'),
                                    request.headers.get('system'),
                                    request.headers.get('port'), cert,
                                    '<secret>'))
                            app.log.trace(
                                "uuidcode={} - Call stop_job".format(uuidcode))
                            orchestrator_communication.set_skip(
                                app.log, uuidcode,
                                app.urls.get('orchestrator',
                                             {}).get('url_skip'),
                                request.headers.get('servername'), 'False')
                            try:
                                stop_job(
                                    app.log, uuidcode, servername,
                                    request.headers.get('system'),
                                    request.headers, app.urls, True,
                                    "Jupyter@JSC internal error. An administrator is informed. Please try again in a few minutes."
                                )
                            except:
                                app.log.exception(
                                    "uuidcode={} - Could not stop Job. It may still run"
                                    .format(uuidcode))
                            return "", 539
                    status = 'running'
                else:
                    if request.headers.get('pollspawner',
                                           'false').lower() == 'true':
                        # If there's an error when collecting the children list it may happen, that we would create a thread to get better information. We just send running and hope for the next run
                        app.log.error(
                            'uuidcode={} - Poll Spawner wants to create get_status thread. Prevent it. Children list: {}'
                            .format(uuidcode, children))
                        status = 'running'
                    else:
                        # Copy the headers into a plain dict with lower-case
                        # keys for the thread; keys containing 'Token' also
                        # get '-' replaced by '_'.
                        request_headers = {}
                        for key, value in request.headers.items():
                            if 'Token' in key:
                                key = key.replace('-', '_')
                            request_headers[key.lower()] = value
                        app.log.trace(
                            "uuidcode={} - New Header for Thread: {}".format(
                                uuidcode, request_headers))
                        # no .host in children, let's start a thread which looks for it every second
                        t = Thread(target=jobs_threads.get,
                                   args=(app.log, uuidcode, request_headers,
                                         unicore_header, app.urls, cert))
                        t.start()
                        status = 'waitforhostname'
                app.log.info(
                    "uuidcode={} - Update JupyterHub status ({})".format(
                        uuidcode, status))
                hub_communication.status(
                    app.log, uuidcode,
                    app.urls.get('hub', {}).get('url_proxy_route'),
                    app.urls.get('hub', {}).get('url_status'),
                    request.headers.get('jhubtoken'), status,
                    request.headers.get('escapedusername'), servername)
                if status in ['running', 'stopped'] and request.headers.get(
                        'spawning',
                        'true').lower() == 'true':  # spawning is finished
                    app.log.trace(
                        'uuidcode={} - Tell J4J_Orchestrator that the spawning is done'
                        .format(uuidcode))
                    try:
                        orchestrator_communication.set_spawning(
                            app.log, uuidcode,
                            app.urls.get('orchestrator',
                                         {}).get('url_spawning'),
                            request.headers.get('servername'), 'False')
                    except:
                        # Read from request.headers: the local request_headers
                        # dict only exists in the 'waitforhostname' branch,
                        # which never reaches this handler.
                        app.log.exception(
                            "uuidcode={} - Could not set spawning to false in J4J_Orchestrator database for {}"
                            .format(uuidcode,
                                    request.headers.get('servername')))

            else:
                app.log.error('uuidcode={} - Unknown JobStatus: {}'.format(
                    uuidcode, properties_json.get('status')))
                app.log.trace("uuidcode={} - Call stop_job".format(uuidcode))
                try:
                    stop_job(
                        app.log, uuidcode, servername,
                        request.headers.get('system'), request.headers,
                        app.urls, True,
                        "A backend Service had a problem. An administrator is informed. Please try it again in a few minutes."
                    )
                except:
                    app.log.exception(
                        "uuidcode={} - Could not stop Job. It may still run".
                        format(uuidcode))
            if status != 'waitforhostname':  # no thread was started, so the check is finished
                orchestrator_communication.set_skip(
                    app.log, uuidcode,
                    app.urls.get('orchestrator', {}).get('url_skip'),
                    request.headers.get('servername'), 'False')
            # NOTE(review): no explicit return on this path (nor in the handler
            # below); the HTTP response then depends on how the web framework
            # treats a None result - confirm this is intended.
        except:
            app.log.exception("Jobs.get failed. Bugfix required")