def _apply_binding_to_task(task, binding, allow_object=False): """ Helper method to apply bindings to a given task. This function modifies the given task object and returns it modified. When specifying bindings, there is no qualification for a binding being an input, output, or param, so when trying to apply the binding we need to try each. A TypeError is thrown whenever the name for an input is already applied to another type of input (ex. set_input('foo') called, then later set_param('foo','bar') will throw a TypeError. So it is expected that the name for each binding throw a TypeError for the other types of slot names (so if param named 'foo', then TypeError should be thrown for set_input('foo') and set_output('foo') """ for (param_name, param_value) in binding.iteritems(): bound = False if param_name in task.get_inputs(): try: # special-case when task.run() is specifying input binding with object # this way when _realize_input is called it will find the object dependency if allow_object is True and _Task._is_valid_data_structure(param_value): if not hasattr(task, "_local_binding") or task._local_binding is None: task._local_binding = {} task._local_binding[param_name] = param_value else: task.set_inputs({param_name: param_value}) bound = True __LOGGER__.debug("Applied binding named: '%s' as input, with value: '%s'" % (param_name, param_value)) except TypeError: pass if param_name in task.get_outputs(): try: task.set_outputs({param_name: param_value}) bound = True __LOGGER__.debug("Applied binding named: '%s' as output, with value: '%s'" % (param_name, param_value)) except TypeError: pass if param_name in task.get_params(): try: task.set_params({param_name: param_value}) bound = True __LOGGER__.debug("Applied binding named: '%s' as param, with value: '%s'" % (param_name, param_value)) except TypeError: pass if bound is False: __LOGGER__.warning( "Binding not applied since not found in input, output, or params. 
Name: '%s', value: '%s'" % (param_name, param_value) )
def _resolve_job_environment(environment):
    """Turn create()'s ``environment`` argument into an Environment object."""
    if environment is None:
        # Fall back to the saved 'async' environment, creating it on first use.
        resolved = _gl.deploy.environments["async"]
        if resolved is None:
            resolved = _environment.LocalAsync("async")
            resolved.save()
        return resolved

    if isinstance(environment, str):
        __LOGGER__.debug("Loading environment: %s" % environment)
        resolved = _gl.deploy.environments[environment]
        if resolved is None:
            # Report the name that was asked for, not the failed lookup result.
            raise TypeError(
                "Environment cannot be loaded correctly with name '%s', please "
                "confirm this environment exists by calling "
                "graphlab.deploy.environments." % environment
            )
        return resolved

    if not isinstance(environment, _environment.Environment):
        raise Exception("Unknown type of environment")
    return environment


def _build_function_tasks(function, function_arguments, name):
    """Build one wrapper Task per argument set plus the combiner Task."""
    bindings = function_arguments
    if bindings is None:
        bindings = [{}]
    elif not isinstance(bindings, list) and not isinstance(bindings, _gl.SFrame):
        bindings = [bindings]

    combiner = _Task(name + "-combiner")
    combiner.set_code(_combiner_task)

    tasks = []
    for idx, binding in enumerate(bindings):
        task = _Task("%s-%s-%d" % (name, function.__name__, idx))
        task.set_code(_wrapper_task)

        # validate that no GL data structures being passed in function_arguments
        for key, value in binding.items():
            if _Task._is_valid_data_structure(value):
                raise RuntimeError(
                    "Validation Failed: Unsupported type for function_arguments. "
                    "Function arguments must be basic types that can be "
                    "serialized into JSON. "
                    "Invalid function_argument: '%s', type: '%s'" % (key, type(value))
                )

        task.set_params({"params": binding, "function": function})
        task.set_outputs(["output"])

        # create dependency for output from task to combiner task
        combiner.set_inputs({"in-%d" % idx: (task, "output")})

        tasks.append(task)
        _gl.deploy.tasks.delete(task)

    combiner.set_params({"num": len(bindings)})
    tasks.append(combiner)
    _gl.deploy.tasks.delete(combiner)
    return tasks, combiner


def _first_task_name(tasks):
    """Return the name of the first task, looking inside step lists and tuples."""
    first = tasks[0]
    if isinstance(first, list):
        # A step may be a list of tasks; name the job after its first member.
        first = first[0]
    if isinstance(first, tuple):
        # (task, bindings) pair
        first = first[0]
    if isinstance(first, str):
        return first
    return first.name


def _clone_job_artifact(session, step):
    """Resolve a task/pipeline (or its name), clone it and apply any bindings."""
    binding = None
    if isinstance(step, tuple):
        (artifact, binding) = step
    else:
        artifact = step

    if not isinstance(artifact, _Task) and not isinstance(artifact, _Pipeline):
        artifact = session._open(artifact, {}, check_cache=True, typename="Task")

    if artifact is None:
        raise TypeError("Unable to find Task to try to run")

    clone = artifact._clone(artifact.name, session_aware=False)

    # apply bindings if paired with task
    if binding is not None:
        _apply_binding_to_task(clone, binding)
    return clone


def _select_execution_backend(environment, name, num_tasks, tracker):
    """Track the job creation and return the executor matching ``environment``."""
    if isinstance(environment, _environment.Local):
        tracker.track("deploy.job.create.local", value=1,
                      properties={"num_tasks": num_tasks})
        return _env.LocalExecutionEnvironment()

    if isinstance(environment, _environment.LocalAsync):
        tracker.track("deploy.job.create.localasync", value=1,
                      properties={"num_tasks": num_tasks})
        return _env.LocalAsynchronousEnvironment()

    if isinstance(environment, _environment.EC2):
        tracker.track("deploy.job.create.ec2", value=1,
                      properties={"num_tasks": num_tasks})
        # name the ec2 instance the job name
        if not environment.tags:
            environment.tags = {}
        if "Name" not in environment.tags:
            environment.tags["Name"] = name
        environment.tags["Job"] = name
        return _env.Ec2ExecutionEnvironment()

    if isinstance(environment, _environment.Hadoop):
        tracker.track("deploy.job.create.hadoop", value=1,
                      properties={"num_tasks": num_tasks})
        return _env.HadoopExecutionEnvironment()

    raise Exception("Validation Failed: Unknown execution environment.")


def create(tasks=None, name=None, environment=None, function=None,
           function_arguments=None, required_packages=None):
    """
    Creates a Job and begins executing it.

    The Job can be defined by either specifying a list of tasks, with optional
    bindings, or with a function, with arguments defined. Each Job also needs
    to know where to run, and that is specified by the environment.

    By default, this method will kick off asynchronous work, and return a Job
    object to monitor/manage that work.

    Parameters
    ----------
    tasks : list [Task | str | tuple [ str, dict ]] | str
        List of Tasks to run. An entry may also be a list of such items, or a
        Pipeline.

    name : str
        Name for this execution (names the returned Job). When omitted or
        empty, a name of the form
        ``job-<function or first task name>-<environment name>-<timestamp>``
        is generated.

    environment : :class:`~graphlab.deploy.environment.EC2` |
                  :class:`~graphlab.deploy.environment.Hadoop` |
                  :class:`~graphlab.deploy.environment.LocalAsync`
        Optional environment for execution, or the name of a saved
        environment. This would commonly hold access keys, launch locations
        etc. Also included in the environment object is a dictionary for
        associated metadata to pass to the execution. Default is 'LocalAsync',
        which will have the execution occur in the background locally. The
        environment is always cloned, so the one passed in is not mutated.

    function : function
        Function to be executed in this Job, with arguments to pass to this
        function specified by function_arguments. If a function is specified,
        then tasks cannot be specified. Specifying a function makes it easy to
        get code running in a remote environment. If the function returns a
        dict then it will be collated into the results. If the function returns
        something other than a dict, it will be cast to an str and that will be
        collated into the results. See the examples below for more information.

    function_arguments : dict | list [ dict ] | :class:`~graphlab.SFrame`
        Arguments to pass to the specified function as kwargs. To run multiple
        invocations of the function, simply specify a list of arguments or an
        SFrame. Each element in the list will result in invoking the function
        once. Each row of the SFrame will be used to invoke the function.

    required_packages : list [ str ] | set [ str ]
        List of package requirements (same as distutils.requires) format for
        packages required for running this Job. This is most useful to specify
        any non-standard Python packages required to run the function
        specified.

    Returns
    -------
    job : :py:class:`~graphlab.deploy._job.Job`
        Used for monitoring and managing the execution of the Job. ``None`` is
        returned when a LicenseValidationException is raised while launching;
        the exception is logged instead of propagated.

    Raises
    ------
    TypeError
        If neither or both of ``tasks`` and ``function`` are given, ``name`` is
        not a string, ``function`` is not a Python function, a named
        environment cannot be loaded, or a task cannot be found.
    RuntimeError
        If a function argument is a GraphLab data structure, a Job with the
        same name already exists, or output validation for the environment
        fails.
    Exception
        If the environment is of an unknown type.

    Notes
    -----
    - When this method is invoked, each Task specified is cloned and a snapshot
      of it is used for execution. This snapshot can then be queried by
      inspecting the Job object returned.

    Examples
    --------
    Creating a Job using a function instead of Tasks is easy. Just define a
    function and then use it when calling job.create.

    Using a list of dicts to specify arguments:

    >>> def sum_four(one, two, three, four):
    >>>     return {'sum': one + two + three + four}
    >>>
    >>> job = graphlab.deploy.job.create(
    >>>     function=sum_four,
    >>>     function_arguments=[{'one':1, 'two':2,
    >>>                          'three':3, 'four':4}])
    >>>
    >>> results = job.get_results() # SFrame with aggregated results

    Using an SFrame to specify multiple sets of arguments:

    >>> def mult_three(one, two, three):
    >>>     return {'product': one * two * three}
    >>>
    >>> sf = graphlab.SFrame(data={'one':[1,5], 'two':[2,6], 'three':[3,7]})
    >>> job = graphlab.deploy.job.create(function=mult_three,
    >>>                                  function_arguments=sf)
    >>>
    >>> +----+--------------------------------+------------------+-----------+
    >>> | id |             input              |      result      |   status  |
    >>> +----+--------------------------------+------------------+-----------+
    >>> | 0  | {'three': 3, 'two': 2, 'on ... |  {'product': 6}  | Completed |
    >>> | 1  | {'three': 7, 'two': 6, 'on ... | {'product': 210} | Completed |
    >>> +----+--------------------------------+------------------+-----------+
    >>> +---------------------------+---------------------------+---------+
    >>> |         start_time        |          end_time         | message |
    >>> +---------------------------+---------------------------+---------+
    >>> | 2014-11-17 11:06:38+00:00 | 2014-11-17 11:06:38+00:00 |         |
    >>> | 2014-11-17 11:06:40+00:00 | 2014-11-17 11:06:40+00:00 |         |
    >>> +---------------------------+---------------------------+---------+
    >>> [2 rows x 7 columns]

    Each entry in the tasks list could be a pair with a dictionary of bindings
    for that entry. For example:

    >>> tasks = [('task1', {'input':'s3://big-file'}),
    >>>          ('task2', {'input':'/localfile'})]
    >>> graphlab.deploy.job.create(tasks, name='with-bindings')
    """
    # Validate the plain arguments first so that an invalid call fails before
    # any session, tracker or environment state is touched.
    if tasks is None and function is None:
        raise TypeError("tasks or function needs to be defined")
    if tasks is not None and function is not None:
        raise TypeError("Cannot specify BOTH tasks and function")
    if name is not None and not isinstance(name, str):
        raise TypeError("The name you gave for this job is not a string.")

    tracker = _mt._get_metric_tracker()
    _session = _gl.deploy._default_session

    # always clone the environment, so not mutating existing
    environment = _resolve_job_environment(environment).clone()
    __LOGGER__.info("Preparing using environment: %s" % environment.name)

    __LOGGER__.info("Beginning Job Validation.")

    # special handling for function= parameter
    combiner = None
    if function is not None:
        if not _inspect.isfunction(function):
            raise TypeError("Invalid function, must be a Python function.")

        # if no name specified make sure the Task names are prefixed with Job name to ensure uniqueness
        if name is None or name == "":
            name = "job-%s-%s-%s" % (function.__name__, environment.name, _time.time())

        # the generated tasks replace whatever was passed as ``tasks``
        tasks, combiner = _build_function_tasks(function, function_arguments, name)
        tracker.track("deploy.job.create.fn", value=1)

    # now make the artifacts a list of objects
    if not isinstance(tasks, list):
        tasks = [tasks]

    # if Environment object missing num_hosts attribute, set to 1
    if not hasattr(environment, "num_hosts"):
        environment.num_hosts = 1

    # If environment.num_hosts > 1 and not using model_parameter_search or parallel_for_each then
    # reset num_host = 1, since multiple hosts will not be used.
    if environment.num_hosts != 1 and all(isinstance(item, _Task) for item in tasks):
        __LOGGER__.warning(
            "Ignoring Environment.num_hosts value since execution will occur "
            "only on one host. Using num_hosts=1 for this execution."
        )
        environment.num_hosts = 1

    # add required packages to first task in execution
    # ensures the packages will be present on execution
    # NOTE(review): this assumes tasks[0] is a Task object (not a name, a
    # (task, bindings) tuple or a nested list) - confirm against callers.
    if required_packages is not None:
        packages = tasks[0].get_required_packages()
        tasks[0].set_required_packages(packages | set(required_packages))

    if name is None or name == "":
        name = "job-%s-%s-%s" % (_first_task_name(tasks), environment.name, _time.time())

    # if using fn= parameter, we need to wait until name has been determined to
    # set the results_path, so now that the name is settled, set results_path
    if combiner is not None:
        results_path = _get_results_path(environment, name)
        __LOGGER__.info("Job Results SFrame stored: %s" % results_path)
        combiner.set_outputs({"results": results_path})

    validation_msgs = []

    # verify job name is unique
    if _gl.deploy.jobs[name] is not None:
        # found another job same name, fail
        raise RuntimeError(
            "Validation Error: Job already exists with the name '%s', please "
            "rename or delete the existing job." % name
        )

    # Create artifact from their names, if necessary. Clone all artifacts. Add any bindings.
    cloned_artifacts = []
    for steps in tasks:
        # handle pipeline differently than task: it is cloned as a single
        # artifact and its outputs are not validated against the environment
        if isinstance(steps, _Pipeline):
            cloned_artifacts.append(_clone_job_artifact(_session, steps))
            continue

        if not isinstance(steps, list):
            steps = [steps]

        cloned_step = []
        for step in steps:
            clone = _clone_job_artifact(_session, step)

            # if environment is not local then write out any outputs not bound to a location to an
            # intermediate location, so any subsequent steps can find the output
            _validate_output_to_environment(clone, environment, validation_msgs)
            cloned_step.append(clone)

        cloned_artifacts.append(cloned_step)

    env = _select_execution_backend(environment, name, len(cloned_artifacts), tracker)

    if validation_msgs:
        for msg in validation_msgs:
            __LOGGER__.error(msg)
        raise RuntimeError(
            "Validation Failed: output(s) not set to appropriate location for "
            "execution environment. See logs for more details."
        )

    try:
        __LOGGER__.info("Validation complete. Job: '%s' ready for execution" % name)
        job = env.run(_session, cloned_artifacts, name, environment)
        _session.register(job)
        job.save()  # save the job once prior to returning.

        # add a .get_results() method to this job.
        if function is not None:
            job.get_results = _types.MethodType(_get_results, job)
        return job
    except LicenseValidationException as e:
        # catch exception and print license check hint message here instead of raise
        __LOGGER__.info(e)
        return None