def __init__(self, task_infos, parent_task_id, to_translate):
    """Build a translation sub-task chained after *parent_task_id*.

    Args:
        task_infos: task description holder; its ``content`` dict is
            mutated in place (priority, cpu/gpu counts, docker command).
        parent_task_id: id of the model-producing parent task; also
            substituted for the ``<MODEL>`` placeholder in output paths.
        to_translate: sequence of pairs where item[0] is the input file
            and item[1] the output file pattern.
    """
    self._task_suffix = "trans"
    self._task_type = "trans"
    self._parent_task_id = parent_task_id

    content = task_infos.content
    # Run after the parent task: bump the inherited priority by one.
    content["priority"] = content.get("priority", 0) + 1
    # Translation of a released model is CPU-only.
    content["ngpus"] = 0
    content["ncpus"] = get_cpu_count(
        task_infos.routes_configuration.service_config,
        content["ngpus"], "trans")

    command = ["trans", "--as_release", "-i"]
    command.extend(entry[0] for entry in to_translate)
    change_parent_task(command, parent_task_id)
    command.append("-o")
    command.extend(entry[1].replace('<MODEL>', parent_task_id)
                   for entry in to_translate)
    content["docker"]["command"] = command

    TaskBase.__init__(self, task_infos)
def __init__(self, task_infos):
    """Build a standalone preprocessing task from a training request.

    Keeps every docker option preceding the ``train``/``preprocess``
    action verb, then replaces the action with a preprocess invocation
    that builds a pseudo model without pushing it to the catalog.
    ``task_infos.content`` is mutated in place.
    """
    self._task_suffix = "prepr"
    self._task_type = "prepr"
    self._parent_task_id = None

    # Preprocessing runs on CPUs only.
    task_infos.content["ngpus"] = 0
    task_infos.content["ncpus"] = get_cpu_count(
        task_infos.routes_configuration.service_config,
        task_infos.content["ngpus"], "preprocess")

    full_command = task_infos.content["docker"]["command"]
    # Copy the option prefix up to (excluding) the action verb.
    preprocess_command = []
    pos = 0
    while full_command[pos] not in ('train', 'preprocess'):
        preprocess_command.append(full_command[pos])
        pos += 1
    # Create the preprocess command: don't push the model on the
    # catalog, and generate a pseudo model.
    preprocess_command += ["--no_push", "preprocess", "--build_model"]
    task_infos.content["docker"]["command"] = preprocess_command

    TaskBase.__init__(self, task_infos, must_patch_config_name=True)
def __init__(self, task_infos, parent_task_id):
    """Build a training task chained after *parent_task_id*.

    Derives the gpu/cpu allocation from the service configuration and
    stores it into ``task_infos.content`` before delegating to
    ``TaskBase.__init__``.
    """
    self._task_suffix = "train"
    self._task_type = "train"
    self._parent_task_id = parent_task_id

    service_config = task_infos.routes_configuration.service_config
    gpu_count = get_gpu_count(service_config, "train")
    task_infos.content["ngpus"] = gpu_count
    task_infos.content["ncpus"] = get_cpu_count(
        service_config, gpu_count, "train")

    TaskBase.__init__(self, task_infos, must_patch_config_name=True)
def launch(service):
    """Launch one or several docker tasks on *service*.

    Validates the credentials and the JSON ``content`` of the request,
    then builds the requested task chain (optional preprocess, the main
    task, plus optional translate / score / tuminer follow-up tasks),
    registers every task through ``task.create`` and returns the created
    task id(s) as a JSON response.

    Aborts with a 4xx response on any validation error.

    Fixes over the previous revision:
      * ``file_per_gpu`` now uses floor division (``//``); true division
        produces a float in Python 3 which breaks list slicing.
      * the invalid-semver fallback now also defines
        ``can_trans_as_release`` and ``content["support_statistics"]``,
        which were previously left unset, causing a NameError later.
      * the tuminer score file no longer has 3 extra characters stripped
        after the ``.gz`` suffix was already removed.
    """
    pool_entity = service[0:2].upper()
    if not has_ability(flask.g, "train", pool_entity):
        abort(make_response(jsonify(message="insufficient credentials for train "
                                            "(entity %s)" % pool_entity), 403))

    current_configuration_name = redis.hget("admin:service:%s" % service, "current_configuration")
    configurations = json.loads(redis.hget("admin:service:%s" % service, "configurations"))
    current_configuration = json.loads(configurations[current_configuration_name][1])

    content = flask.request.form.get('content')
    if content is not None:
        content = json.loads(content)
    else:
        abort(flask.make_response(flask.jsonify(message="missing content in request"), 400))

    files = {}
    for k in flask.request.files:
        files[k] = flask.request.files[k].read()

    service_module = get_service(service)
    content["service"] = service

    exec_mode = content.get('exec_mode', False)

    # Infer the task type from the docker command (or 'exec' in exec mode).
    if not exec_mode:
        task_type = '????'
        if "train" in content["docker"]["command"]:
            task_type = "train"
        elif "trans" in content["docker"]["command"]:
            task_type = "trans"
        elif "preprocess" in content["docker"]["command"]:
            task_type = "prepr"
        elif "release" in content["docker"]["command"]:
            task_type = "relea"
        elif "buildvocab" in content["docker"]["command"]:
            task_type = "vocab"
    else:
        task_type = 'exec'

    if task_type == '????':
        abort(flask.make_response(flask.jsonify(message="incorrect task definition"), 400))
    elif task_type != "exec":
        task_suffix = task_type
    else:
        task_suffix = get_docker_action(content["docker"]["command"])
        if task_suffix is None:
            task_suffix = task_type

    # Sanity check on content.
    if 'options' not in content or not isinstance(content['options'], dict):
        abort(flask.make_response(flask.jsonify(message="invalid options field"), 400))
    if 'docker' not in content:
        abort(flask.make_response(flask.jsonify(message="missing docker field"), 400))
    if ('image' not in content['docker'] or 'registry' not in content['docker'] or
            'tag' not in content['docker'] or 'command' not in content['docker']):
        abort(flask.make_response(flask.jsonify(message="incomplete docker field"), 400))

    if content['docker']['registry'] == 'auto':
        content['docker']['registry'] = _get_registry(service_module, content['docker']['image'])
    elif content['docker']['registry'] not in service_module._config['docker']['registries']:
        abort(flask.make_response(flask.jsonify(message="unknown docker registry"), 400))

    resource = service_module.get_resource_from_options(content["options"])

    iterations = 1
    if "iterations" in content:
        iterations = content["iterations"]
        if exec_mode:
            abort(flask.make_response(flask.jsonify(message="chain mode unavailable in exec mode"), 400))
        if (task_type != "train" and iterations != 1) or iterations < 1:
            abort(flask.make_response(flask.jsonify(message="invalid value for iterations"), 400))

    ngpus = 1
    if "ngpus" in content:
        ngpus = content["ngpus"]
    ncpus = content.get("ncpus")

    # check that we have a resource able to run such a request
    if not _find_compatible_resource(service_module, ngpus, ncpus, resource):
        abort(flask.make_response(
            flask.jsonify(message="no resource available on %s for %d gpus (%s cpus)" % (
                service, ngpus, str(ncpus) if ncpus else "-")), 400))

    if "totranslate" in content:
        if exec_mode:
            abort(flask.make_response(flask.jsonify(message="translate mode unavailable for exec cmd"), 400))
        totranslate = content["totranslate"]
        del content["totranslate"]
    else:
        totranslate = None
    if "toscore" in content:
        if exec_mode:
            abort(flask.make_response(flask.jsonify(message="score mode unavailable for exec cmd"), 400))
        toscore = content["toscore"]
        del content["toscore"]
    else:
        toscore = None
    if "totuminer" in content:
        if exec_mode:
            abort(flask.make_response(flask.jsonify(message="tuminer chain mode unavailable for exec cmd"), 400))
        totuminer = content["totuminer"]
        del content["totuminer"]
    else:
        totuminer = None

    docker_version = content['docker']['tag']
    if docker_version.startswith('v'):
        docker_version = docker_version[1:]
    try:
        chain_prepr_train = (not exec_mode and not content.get("nochainprepr", False) and
                             task_type == "train" and
                             semver.match(docker_version, ">=1.4.0"))
        can_trans_as_release = semver.match(docker_version, ">=1.8.0")
        trans_as_release = (not exec_mode and not content.get("notransasrelease", False) and
                            semver.match(docker_version, ">=1.8.0"))
        content["support_statistics"] = semver.match(docker_version, ">=1.17.0")
    except ValueError:
        # could not match docker_version - not valid semver: disable all
        # version-gated features (can_trans_as_release and
        # support_statistics were previously left unset here, triggering a
        # NameError further down).
        chain_prepr_train = False
        can_trans_as_release = False
        trans_as_release = False
        content["support_statistics"] = False

    priority = content.get("priority", 0)

    (xxyy, parent_task_id) = shallow_command_analysis(content["docker"]["command"])
    parent_struct = None
    parent_task_type = None
    if not exec_mode and parent_task_id:
        (parent_struct, parent_task_type) = model_name_analysis(parent_task_id)

    # check that parent model type matches current command
    if parent_task_type:
        if (parent_task_type == "trans" or parent_task_type == "relea" or
                (task_type == "prepr" and parent_task_type != "train" and
                 parent_task_type != "vocab")):
            abort(flask.make_response(flask.jsonify(message="invalid parent task type: %s" %
                                                    (parent_task_type)), 400))

    task_ids = []
    task_create = []

    while iterations > 0:
        if (chain_prepr_train and parent_task_type != "prepr") or task_type == "prepr":
            prepr_task_id, explicitname = build_task_id(content, xxyy, "prepr", parent_task_id)
            if explicitname:
                patch_config_explicitname(content, explicitname)

            idx = 0
            prepr_command = []
            train_command = content["docker"]["command"]
            # Keep every option that precedes the action verb.
            while train_command[idx] != 'train' and train_command[idx] != 'preprocess':
                prepr_command.append(train_command[idx])
                idx += 1

            # create preprocess command, don't push the model on the catalog,
            # and generate a pseudo model
            prepr_command.append("--no_push")
            prepr_command.append("preprocess")
            prepr_command.append("--build_model")
            content["docker"]["command"] = prepr_command
            content["ncpus"] = ncpus or \
                get_cpu_count(current_configuration, 0, "preprocess")
            content["ngpus"] = 0

            preprocess_resource = service_module.select_resource_from_capacity(
                resource, Capacity(content["ngpus"], content["ncpus"]))

            # launch preprocess task on cpus only
            task_create.append(
                (redis, taskfile_dir,
                 prepr_task_id, "prepr", parent_task_id, preprocess_resource, service,
                 _duplicate_adapt(service_module, content),
                 files, priority, 0, content["ncpus"], {}))
            task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % (
                "prepr", prepr_task_id, 0, content["ncpus"]))
            remove_config_option(train_command)
            change_parent_task(train_command, prepr_task_id)
            parent_task_id = prepr_task_id
            content["docker"]["command"] = train_command

        if task_type != "prepr":
            task_id, explicitname = build_task_id(content, xxyy, task_suffix, parent_task_id)
            if explicitname:
                patch_config_explicitname(content, explicitname)

            # Map output files of a translation task to the task producing them.
            file_to_transtaskid = {}
            if task_type == "trans":
                try:
                    idx = content["docker"]["command"].index("trans")
                    output_files = get_params(("-o", "--output"),
                                              content["docker"]["command"][idx + 1:])
                    for ofile in output_files:
                        file_to_transtaskid[ofile] = task_id
                except Exception:
                    # best-effort: a malformed command simply yields no mapping
                    pass

            content["ncpus"] = ncpus or \
                get_cpu_count(current_configuration, ngpus, task_type)
            content["ngpus"] = ngpus

            if task_type == "trans" and can_trans_as_release:
                if "--as_release" not in content["docker"]["command"] and trans_as_release:
                    # translating with a released model can run on CPU only
                    content["docker"]["command"].append("--as_release")
                    content["ngpus"] = ngpus = 0

            task_resource = service_module.select_resource_from_capacity(
                resource, Capacity(content["ngpus"], content["ncpus"]))

            task_create.append(
                (redis, taskfile_dir,
                 task_id, task_type, parent_task_id, task_resource, service,
                 _duplicate_adapt(service_module, content),
                 files, priority, content["ngpus"], content["ncpus"], {}))
            task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % (
                task_type, task_id, content["ngpus"], content["ncpus"]))
            parent_task_type = task_type[:5]
            remove_config_option(content["docker"]["command"])

            if totranslate:
                content_translate = deepcopy(content)
                content_translate["priority"] = priority + 1
                if trans_as_release:
                    content_translate["ngpus"] = 0
                else:
                    content_translate["ngpus"] = min(ngpus, 1)

                content_translate["ncpus"] = ncpus or \
                    get_cpu_count(current_configuration,
                                  content_translate["ngpus"], "trans")

                translate_resource = service_module.select_resource_from_capacity(
                    resource, Capacity(content_translate["ngpus"],
                                       content_translate["ncpus"]))

                if ngpus == 0 or trans_as_release:
                    file_per_gpu = len(totranslate)
                else:
                    # ceil-divide; floor division keeps the result an int
                    # (true division breaks list slicing on Python 3)
                    file_per_gpu = (len(totranslate) + ngpus - 1) // ngpus
                subset_idx = 0
                while subset_idx * file_per_gpu < len(totranslate):
                    content_translate["docker"]["command"] = ["trans"]
                    if trans_as_release:
                        content_translate["docker"]["command"].append("--as_release")
                    content_translate["docker"]["command"].append('-i')
                    subset_totranslate = totranslate[subset_idx * file_per_gpu:
                                                     (subset_idx + 1) * file_per_gpu]
                    for f in subset_totranslate:
                        content_translate["docker"]["command"].append(f[0])

                    change_parent_task(content_translate["docker"]["command"], task_id)
                    trans_task_id, explicitname = build_task_id(
                        content_translate, xxyy, "trans", task_id)

                    content_translate["docker"]["command"].append('-o')
                    for f in subset_totranslate:
                        ofile = f[1].replace('<MODEL>', task_id)
                        file_to_transtaskid[ofile] = trans_task_id
                        content_translate["docker"]["command"].append(ofile)

                    task_create.append(
                        (redis, taskfile_dir,
                         trans_task_id, "trans", task_id, translate_resource, service,
                         _duplicate_adapt(service_module, content_translate),
                         (), content_translate["priority"],
                         content_translate["ngpus"], content_translate["ncpus"], {}))
                    task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % (
                        "trans", trans_task_id,
                        content_translate["ngpus"], content_translate["ncpus"]))
                    subset_idx += 1

            if toscore:
                toscore_parent = {}
                for (ofile, rfile) in toscore:
                    ofile = ofile.replace('<MODEL>', task_id)
                    parent_task_id = file_to_transtaskid.get(ofile)
                    if parent_task_id:
                        if parent_task_id not in toscore_parent:
                            toscore_parent[parent_task_id] = {"output": [], "ref": []}
                        ofile_split = ofile.split(':')
                        if len(ofile_split) == 2 and ofile_split[0] == 'launcher':
                            ofile = 'launcher:../' + parent_task_id + "/" + ofile_split[1]
                        toscore_parent[parent_task_id]["output"].append(ofile)
                        toscore_parent[parent_task_id]["ref"].append(rfile)
                for parent_task_id, oref in six.iteritems(toscore_parent):
                    content_score = deepcopy(content)
                    content_score["priority"] = priority + 1
                    content_score["ngpus"] = 0
                    content_score["ncpus"] = 1
                    score_resource = service_module.select_resource_from_capacity(
                        resource, Capacity(0, 1))
                    image_score = "nmtwizard/score"
                    option_lang = []
                    if parent_struct is not None:
                        option_lang.append('-l')
                        option_lang.append(parent_struct['xxyy'][-2:])
                    content_score["docker"] = {
                        "image": image_score,
                        "registry": _get_registry(service_module, image_score),
                        "tag": "latest",
                        "command": ["score", "-o"] + oref["output"] +
                                   ["-r"] + oref["ref"] + option_lang +
                                   ['-f', "launcher:scores"]
                    }
                    score_task_id, explicitname = build_task_id(
                        content_score, xxyy, "score", parent_task_id)
                    task_create.append(
                        (redis, taskfile_dir,
                         score_task_id, "exec", parent_task_id, score_resource, service,
                         content_score,
                         (), priority + 2, 0, 1, {}))
                    task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % (
                        "score", score_task_id, 0, 1))

            if totuminer:
                # tuminer can run in CPU only mode, but it will be very slow for large data
                ngpus_recommend = ngpus
                ncpus_recommend = ncpus
                totuminer_parent = {}
                for (ifile, ofile) in totuminer:
                    #ofile = ofile.replace('<MODEL>', task_id)
                    parent_task_id = file_to_transtaskid.get(ofile)
                    if parent_task_id:
                        if parent_task_id not in totuminer_parent:
                            totuminer_parent[parent_task_id] = {
                                "infile": [], "outfile": [], "scorefile": []}
                        ofile_split = ofile.split(':')
                        if len(ofile_split) == 2 and ofile_split[0] == 'launcher':
                            ofile = 'launcher:../' + parent_task_id + "/" + ofile_split[1]
                        totuminer_parent[parent_task_id]["infile"].append(ifile)
                        totuminer_parent[parent_task_id]["outfile"].append(ofile)
                        scorefile = ofile
                        if scorefile.endswith(".gz"):
                            scorefile = scorefile[:-3]
                        # the .gz suffix was already stripped above; appending
                        # scorefile[:-3] here chopped 3 more characters
                        totuminer_parent[parent_task_id]["scorefile"].append(scorefile)
                for parent_task_id, in_out in six.iteritems(totuminer_parent):
                    content_tuminer = deepcopy(content)
                    content_tuminer["priority"] = priority + 1
                    content_tuminer["ngpus"] = ngpus_recommend
                    content_tuminer["ncpus"] = ncpus_recommend
                    tuminer_resource = service_module.select_resource_from_capacity(
                        resource, Capacity(ngpus_recommend, ncpus_recommend))
                    image_score = "nmtwizard/tuminer"
                    content_tuminer["docker"] = {
                        "image": image_score,
                        "registry": _get_registry(service_module, image_score),
                        "tag": "latest",
                        "command": ["tuminer", "--tumode", "score",
                                    "--srcfile"] + in_out["infile"] +
                                   ["--tgtfile"] + in_out["outfile"] +
                                   ["--output"] + in_out["scorefile"]
                    }
                    tuminer_task_id, explicitname = build_task_id(
                        content_tuminer, xxyy, "tuminer", parent_task_id)
                    task_create.append(
                        (redis, taskfile_dir,
                         tuminer_task_id, "exec", parent_task_id, tuminer_resource, service,
                         content_tuminer,
                         (), priority + 2, ngpus_recommend, ncpus_recommend, {}))
                    task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % (
                        "tuminer", tuminer_task_id, ngpus_recommend, ncpus_recommend))

        iterations -= 1
        if iterations > 0:
            # chain the next iteration on the task just created
            parent_task_id = task_id
            change_parent_task(content["docker"]["command"], parent_task_id)

    (task_ids, task_create) = post_function('POST/task/launch', task_ids, task_create)

    for tc in task_create:
        task.create(*tc)

    if len(task_ids) == 1:
        task_ids = task_ids[0]
    return flask.jsonify(task_ids)