def _create_map_job(function, parameter_set, name=None, environment=None, combiner_function=None, _job_type="PIPELINE"): _raise_error_if_not_function(function) # Name the job now = _datetime.now().strftime("%b-%d-%Y-%H-%M-%S") function_name = _sub("[<>]", "", function.__name__) name = "%s-%s" % (function_name, now) if not name else name # Validate args function, name, environment = _job._validate_job_create_args(function, name, environment) _session = _gl.deploy._default_session while _session.exists(name, __job.Job._typename): rand = str(_uuid.uuid4())[:5] old_name = name name = "%s-%s" % (name, rand) __LOGGER__.info("A job with name '%s' already exists. " "Renaming the job to '%s'." % (old_name, name)) # Convert SFrame to a dict if not parameter_set: raise RuntimeError("An empty parameter_set was given. Nothing to do.") # If parameter set is a generator/SFrame, make sure it gets expanded out. parameter_set_copy = [] for i in parameter_set: if not isinstance(i, dict): raise TypeError( "'parameter_set' has to be an iterable of dictionary." " For void functions, use an empty dictionary as inputs." ) parameter_set_copy.append(i) # Create the task. task_prototype = _task.Task(function, function_name) for_each_iterations = _generate_mapjob_tasks(task_prototype, parameter_set_copy) # List of outputs for the final step. if not combiner_function: list_of_tasks = for_each_iterations[0] else: combiner = _task.Task(combiner_function) # The input to this task is all other tasks task_name_to_task = {} for stage in for_each_iterations: for t in stage: task_name_to_task[t.name] = t combiner.set_inputs_from_task(task_name_to_task) for_each_iterations.append([combiner]) list_of_tasks = combiner # Create the job job = __job.Job( name, stages=for_each_iterations, environment=environment, final_stage=list_of_tasks, _job_type=_job_type ) return job
def __init__(self, func, name=None, description=None): """ Create a new Task specifying its name and optionally a description. """ # Must be a function _raise_error_if_not_function(func, "func") # Set the name name = func.__name__ if not name else name _raise_error_if_not_of_type(name, str, "name") self.name = name self._data = dict() self._data['code'] = None self._data['codestr'] = None self._data['inputs'] = dict() self._data['output'] = None self._data['packages'] = set() self._data['description'] = '' self._modified_since_last_saved = None if description is not None: self.set_description(description) # Inspect the function. specs = _inspect.getargspec(func) varargs = specs.varargs defaults = _copy.copy(specs.defaults) args = _copy.copy(specs.args) # Set the code to function arguments + *args + **kwargs self.set_code(func) # Set the inputs all_args = _copy.copy(args) if varargs: all_args.append(varargs) self.set_inputs(all_args) # Bind default values if defaults: for index, arg in enumerate(args[-len(defaults):]): self.set_inputs({arg : defaults[index]}) # Set required packages if _sys.version_info.major == 3: func_dict = func.__dict__ else: func_dict = func.func_dict
def __init__(self, func, name=None, description=None):
    """
    Create a new Task specifying its name and optionally a description.

    Captures the wrapped function's code/source, argument names and
    default values so the task can later be serialized and executed.

    Parameters
    ----------
    func : function
        The function this Task wraps.
    name : str, optional
        Task name; defaults to ``func.__name__``.
    description : str, optional
        Human-readable description, stored via ``set_description``.
    """
    # Must be a function
    _raise_error_if_not_function(func, "func")

    # Set the name; fall back to the function's own __name__.
    name = func.__name__ if not name else name
    _raise_error_if_not_of_type(name, str, "name")
    self.name = name

    # Internal state bag: callable, its source text, bound inputs,
    # output placeholder, required packages and description.
    self._data = dict()
    self._data['code'] = None
    self._data['codestr'] = None
    self._data['inputs'] = dict()
    self._data['output'] = None
    self._data['packages'] = set()
    self._data['description'] = ''
    self._modified_since_last_saved = None

    if description is not None:
        self.set_description(description)

    # Inspect the function. Prefer getfullargspec: getargspec was
    # deprecated in Python 3 and removed in 3.11; getfullargspec has the
    # same .args/.varargs/.defaults fields. Fall back for Python 2.
    _argspec = getattr(_inspect, 'getfullargspec', _inspect.getargspec)
    specs = _argspec(func)
    varargs = specs.varargs
    defaults = _copy.copy(specs.defaults)
    args = _copy.copy(specs.args)

    # Set the code to function arguments + *args + **kwargs
    self.set_code(func)

    # Set the inputs: every named argument, plus the *varargs name if any.
    all_args = _copy.copy(args)
    if varargs:
        all_args.append(varargs)
    self.set_inputs(all_args)

    # Bind default values (defaults align with the trailing args).
    if defaults:
        for index, arg in enumerate(args[-len(defaults):]):
            self.set_inputs({arg: defaults[index]})

    # Set required packages. BUG FIX: func.func_dict is the Python 2-only
    # alias for func.__dict__ and raises AttributeError on Python 3;
    # func.__dict__ works on both (matching the sibling variant's intent).
    func_dict = func.__dict__
    if 'required_packages' in func_dict:
        self.set_required_packages(func_dict['required_packages'])
def set_code(self, code):
    """
    Set the code block to run when this Task is executed.

    The callable must accept a single ``task`` argument. At execution
    time ``task.inputs`` holds the instantiated data sources by name,
    and results must be assigned into ``task.output`` by name.

    Parameters
    ----------
    code : function
        Function to be called when this Task is executed.

    Returns
    -------
    self : Task

    Examples
    --------
    Using a defined function:

    >>> def func(task):
    >>>     input = task.inputs['input']
    >>>     task.output['output'] = input.apply(lambda x : x * 2)

    >>> t1 = graphlab.deploy._task.Task("set_code_ex1")
    >>> t1.set_code(func)
    """
    # Reject anything that is not a plain function.
    _raise_error_if_not_function(code)

    # Bound instance methods are not supported here.
    if _inspect.ismethod(code):
        raise TypeError("Function cannot be an instance method, please"
                        " use a function.")

    # Keep both the callable itself and its source text; the source is
    # what gets serialized with the task.
    source_text = _inspect.getsource(code)
    self._data['code'] = code
    self._data['codestr'] = source_text
    self._set_dirty_bit()
def set_code(self, code):
    """
    Set the code block to run when this Task is executed.

    ``code`` must be a function taking one argument (the task). When it
    runs, the task's ``inputs`` dictionary contains the instantiated
    data sources by name, and the ``output`` dictionary must be assigned
    by name with the results to save.

    Parameters
    ----------
    code : function
        Function to be called when this Task is executed.

    Returns
    -------
    self : Task

    Examples
    --------
    Using a defined function:

    >>> def func(task):
    >>>     input = task.inputs['input']
    >>>     task.output['output'] = input.apply(lambda x : x * 2)

    >>> t1 = graphlab.deploy._task.Task("set_code_ex1")
    >>> t1.set_code(func)
    """
    # Validate: must be a function, and not a bound instance method.
    _raise_error_if_not_function(code)
    is_instance_method = _inspect.ismethod(code)
    if is_instance_method:
        raise TypeError("Function cannot be an instance method, please"
                        " use a function.")

    # code is callable; store it as-is along with its source text, then
    # mark the task as modified since its last save.
    self._data.update(code=code, codestr=_inspect.getsource(code))
    self._set_dirty_bit()
def create(function, name=None, environment=None, **kwargs):
    """
    Execute arbitrary functions in a remote environment.

    The job is specified as a function. All functions that are called from
    within the function are automatically captured. By default, this method
    will kick off asynchronous work, and return a Job object to
    monitor/manage that work.

    Parameters
    ----------
    function : function
        Function to be executed in this Job, with arguments to pass to this
        function specified by `kwargs`.

    name : str, optional
        Name for this execution (names the returned Job). If set to None,
        then the name of the job is set to the name of the function with a
        time-stamp. Valid characters in job name include: digits,
        characters, '-' and '_'.

    environment : :class:`~graphlab.deploy.hadoop_cluster.HadoopCluster` | :class:`~graphlab.deploy.ec2_cluster.Ec2Cluster` | :class:`~graphlab.deploy.LocalAsync`, optional
        Optional environment for execution. If set to None, then a
        `LocalAsync` by the name `async` is created and used. This will
        execute the code in the background on your local machine.

    kwargs:
        Function kwargs that are passed to the function for execution.

    Returns
    -------
    job : :py:class:`~graphlab.deploy.Job`
        Used for monitoring and managing the execution of the Job.

    See Also
    --------
    graphlab.deploy.map_job.create, graphlab.deploy.Job

    Examples
    --------
    Let us start out with a simple example to execute a function that can
    add two numbers.

    .. sourcecode:: python

        # Define a function
        def add(x, y):
            return x + y

        # Create a job.
        job = graphlab.deploy.job.create(add, x=1, y=1)

        # Get results from the execution when ready. This call waits for the
        # job to complete before retrieving the results.
        >>> print job.get_results()
        2

    Exceptions within the function calls can be captured as follows:

    .. sourcecode:: python

        def add(x, y):
            if x and y:
                return x + y
            else:
                raise ValueError('x or y cannot be None')

        # Job execution capture the exception raised by the function.
        job = graphlab.deploy.job.create(add, x=1, y=None)

        # Get results from the execution when ready. This call waits for the
        # job to complete before retrieving the results.
        >>> print job.get_results()
        None

        # Get the exceptions raised from this execution by calling
        # job.get_metrics()
        >>> print job.get_metrics()
        +-----------+--------+------------+----------+-----------------------+
        | task_name | status | start_time | run_time |   exception_message   |
        +-----------+--------+------------+----------+-----------------------+
        |    add    | Failed | 1427928898 |   None   | x or y cannot be None |
        +-----------+--------+------------+----------+-----------------------+
        +-------------------------------+
        |      exception_traceback      |
        +-------------------------------+
        | Traceback (most recent cal... |
        +-------------------------------+
        [1 rows x 6 columns]

    If a function requires a package to be installed, the function can be
    annotated with a decorator.

    .. sourcecode:: python

        @graphlab.deploy.required_packages(['names == 0.3.0'])
        def my_function(number = 10):
            import names
            people = [names.get_full_name() for i in range(number)]
            sf = graphlab.SFrame({'names':people})
            return sf

        job = graphlab.deploy.job.create(my_function)

        >>> print job.get_results()

        Columns:
                names   str

        Data:
        +-------------------+
        |       names       |
        +-------------------+
        |   Annette Logan   |
        |   Nancy Anthony   |
        |  Tiffany Zupancic |
        |    Andre Coppin   |
        |     Robert Coe    |
        |    Donald Dean    |
        |    Lynne Bunton   |
        |   John Sartwell   |
        |   Peter Nicholas  |
        | Chester Rodriguez |
        +-------------------+
        [10 rows x 1 columns]

    Complex functions that require SFrames, GraphLab models etc. can be
    deployed with ease. All additional state required by the function are
    automatically captured.

    .. sourcecode:: python

        GLOBAL_CONSTANT = 10

        def foo(x):
            return x + 1

        def bar(x):
            return x + 2

        def my_function(x, y):
            foo_x = foo(x)
            bar_y = bar(y)
            return foo_x + bar_y + GLOBAL_CONSTANT

        # Automatically captures all state needed by the deployed function.
        job = graphlab.deploy.job.create(my_function, x = 1, y = 1)

        >>> print job.get_results()
        15

    You can execute the same job remotely by passing a different
    environment.

    .. sourcecode:: python

        # Define a function
        def add(x, y):
            return x + y

        # Define an EC2 environment
        ec2 = graphlab.deploy.Ec2Config()

        # Create an EC2 cluster object
        c = graphlab.deploy.ec2_cluster.create('my_cluster',
                                               's3://bucket/path', ec2)

        # Create a job.
        job = graphlab.deploy.job.create(add, environment=c, x=1, y=1)

        >>> print job.get_results()
        2

    Notes
    -----
    - When an exception is raised within the deployed function,
      :func:`~graphlab.deploy.Job.get_results` returns None.
    - For asynchronous jobs, :func:`~graphlab.deploy.Job.get_results` is a
      blocking call which will wait for the job execution to complete
      before returning the results.
    """
    _session = _gl.deploy._default_session
    _raise_error_if_not_function(function)

    _get_metric_tracker().track('jobs.job')

    # Name the job: "<function name>-<timestamp>" unless the caller named
    # it. Angle brackets are stripped so names like "<lambda>" stay legal.
    now = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S')
    function_name = _sub('[<>]','',function.__name__)
    name = '%s-%s' % (function_name, now) if not name else name

    # Validate args
    function, name, environment = _validate_job_create_args(function, name, environment)

    # Keep appending a short random suffix until the name is unique in
    # the current session.
    while _session.exists(name, _job.Job._typename):
        rand = str(_uuid.uuid4())[:5]
        old_name = name
        name = "%s-%s" % (name, rand)
        __LOGGER__.info("A job with name '%s' already exists. "
                        "Renaming the job to '%s'." % (old_name, name))

    # Setup the task & job: a single task wrapping the function with its
    # kwargs bound as inputs, as the job's only (and final) stage.
    task = _task.Task(function,function_name)
    task.set_inputs(kwargs)
    job = _job.Job(name, stages=[[task]], environment=environment, final_stage=task)

    # Setup the env and kick off execution.
    __LOGGER__.info("Validation complete. Job: '%s' ready for execution." % name)
    exec_env = _env._get_execution_env(environment)
    job = exec_env.run_job(job)

    # Save the job and return to user. Remote environments are merely
    # scheduled here; a Local environment has already finished running.
    if not isinstance(environment, _environment.Local):
        __LOGGER__.info("Job: '%s' scheduled." % name)
    else:
        __LOGGER__.info("Job: '%s' finished." % name)
    _session.register(job)
    _session.save(job)
    return job
def create(function, name=None, environment=None, **kwargs):
    """
    Execute arbitrary functions in a remote environment.

    The job is specified as a function. All functions that are called from
    within the function are automatically captured. By default, this method
    will kick off asynchronous work, and return a Job object to
    monitor/manage that work.

    Parameters
    ----------
    function : function
        Function to be executed in this Job, with arguments to pass to this
        function specified by `kwargs`.

    name : str, optional
        Name for this execution (names the returned Job). If set to None,
        then the name of the job is set to the name of the function with a
        time-stamp. Valid characters in job name include: digits,
        characters, '-' and '_'.

    environment : :class:`~graphlab.deploy.hadoop_cluster.HadoopCluster` | :class:`~graphlab.deploy.ec2_cluster.Ec2Cluster` | :class:`~graphlab.deploy.LocalAsync`, optional
        Optional environment for execution. If set to None, then a
        `LocalAsync` by the name `async` is created and used. This will
        execute the code in the background on your local machine.

    kwargs:
        Function kwargs that are passed to the function for execution.

    Returns
    -------
    job : :py:class:`~graphlab.deploy.Job`
        Used for monitoring and managing the execution of the Job.

    See Also
    --------
    graphlab.deploy.map_job.create, graphlab.deploy.Job

    Examples
    --------
    Let us start out with a simple example to execute a function that can
    add two numbers.

    .. sourcecode:: python

        # Define a function
        def add(x, y):
            return x + y

        # Create a job.
        job = graphlab.deploy.job.create(add, x=1, y=1)

        # Get results from the execution when ready. This call waits for the
        # job to complete before retrieving the results.
        >>> print job.get_results()
        2

    Exceptions within the function calls can be captured as follows:

    .. sourcecode:: python

        def add(x, y):
            if x and y:
                return x + y
            else:
                raise ValueError('x or y cannot be None')

        # Job execution capture the exception raised by the function.
        job = graphlab.deploy.job.create(add, x=1, y=None)

        # Get results from the execution when ready. This call waits for the
        # job to complete before retrieving the results.
        >>> print job.get_results()
        None

        # Get the exceptions raised from this execution by calling
        # job.get_metrics()
        >>> print job.get_metrics()
        +-----------+--------+------------+----------+-----------------------+
        | task_name | status | start_time | run_time |   exception_message   |
        +-----------+--------+------------+----------+-----------------------+
        |    add    | Failed | 1427928898 |   None   | x or y cannot be None |
        +-----------+--------+------------+----------+-----------------------+
        +-------------------------------+
        |      exception_traceback      |
        +-------------------------------+
        | Traceback (most recent cal... |
        +-------------------------------+
        [1 rows x 6 columns]

    If a function requires a package to be installed, the function can be
    annotated with a decorator.

    .. sourcecode:: python

        def my_function(number = 10):
            import names
            people = [names.get_full_name() for i in range(number)]
            sf = graphlab.SFrame({'names':people})
            return sf

        job = graphlab.deploy.job.create(my_function)

        >>> print job.get_results()

        Columns:
                names   str

        Data:
        +-------------------+
        |       names       |
        +-------------------+
        |   Annette Logan   |
        |   Nancy Anthony   |
        |  Tiffany Zupancic |
        |    Andre Coppin   |
        |     Robert Coe    |
        |    Donald Dean    |
        |    Lynne Bunton   |
        |   John Sartwell   |
        |   Peter Nicholas  |
        | Chester Rodriguez |
        +-------------------+
        [10 rows x 1 columns]

    Complex functions that require SFrames, GraphLab models etc. can be
    deployed with ease. All additional state required by the function are
    automatically captured.

    .. sourcecode:: python

        GLOBAL_CONSTANT = 10

        def foo(x):
            return x + 1

        def bar(x):
            return x + 2

        def my_function(x, y):
            foo_x = foo(x)
            bar_y = bar(y)
            return foo_x + bar_y + GLOBAL_CONSTANT

        # Automatically captures all state needed by the deployed function.
        job = graphlab.deploy.job.create(my_function, x = 1, y = 1)

        >>> print job.get_results()
        15

    You can execute the same job remotely by passing a different
    environment.

    .. sourcecode:: python

        # Define a function
        def add(x, y):
            return x + y

        # Define an EC2 environment
        ec2 = graphlab.deploy.Ec2Config()

        # Create an EC2 cluster object
        c = graphlab.deploy.ec2_cluster.create('my_cluster',
                                               's3://bucket/path', ec2)

        # Create a job.
        job = graphlab.deploy.job.create(add, environment=c, x=1, y=1)

        >>> print job.get_results()
        2

    Notes
    -----
    - When an exception is raised within the deployed function,
      :func:`~graphlab.deploy.Job.get_results` returns None.
    - For asynchronous jobs, :func:`~graphlab.deploy.Job.get_results` is a
      blocking call which will wait for the job execution to complete
      before returning the results.
    """
    _session = _gl.deploy._default_session
    _raise_error_if_not_function(function)

    _get_metric_tracker().track('jobs.job')

    # Name the job: "<function name>-<timestamp>" unless the caller named
    # it. Angle brackets are stripped so names like "<lambda>" stay legal.
    now = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S')
    function_name = _sub('[<>]', '', function.__name__)
    name = '%s-%s' % (function_name, now) if not name else name

    # Validate args
    function, name, environment = _validate_job_create_args(
        function, name, environment)

    # Keep appending a short random suffix until the name is unique in
    # the current session.
    while _session.exists(name, _job.Job._typename):
        rand = str(_uuid.uuid4())[:5]
        old_name = name
        name = "%s-%s" % (name, rand)
        __LOGGER__.info("A job with name '%s' already exists. "
                        "Renaming the job to '%s'." % (old_name, name))

    # Setup the task & job: a single task wrapping the function with its
    # kwargs bound as inputs, as the job's only (and final) stage.
    task = _task.Task(function, function_name)
    task.set_inputs(kwargs)
    job = _job.Job(name, stages=[[task]], environment=environment,
                   final_stage=task)

    # Setup the env and kick off execution.
    __LOGGER__.info("Validation complete. Job: '%s' ready for execution." % name)
    exec_env = _env._get_execution_env(environment)
    job = exec_env.run_job(job)

    # Save the job and return to user. Remote environments are merely
    # scheduled here; a Local environment has already finished running.
    if not isinstance(environment, _environment.Local):
        __LOGGER__.info("Job: '%s' scheduled." % name)
    else:
        __LOGGER__.info("Job: '%s' finished." % name)
    _session.register(job)
    _session.save(job)
    return job