Python _Task示例

编程语言: Python

命名空间/包名称: graphlab.deploy

方法/功能: _Task

hotexamples.com的示例: 3

Python _Task - 已找到3个示例。以下是从开源项目中提取的、评价最高的 graphlab.deploy._Task 真实 Python 代码示例。您可以为示例评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： _parallel.py 项目： eb777ez/Yelp-Recommendation-System

def parallel_for_each(task, parameter_set, name = None, environment = None):
    '''
    Run a task once for each entry in `parameter_set` and combine the outputs.

    The output will be the concatenation of the outputs from each iteration. Each row will also
    contain the parameter values used to generate that row.

    Parameters
    ----------
    task : The task to run for each parallel-for-each iteration. The final outputs (of all
        iterations) will be saved at the outputs specified for this task.

    parameter_set : list of dicts | SFrame
        If parameter_set is a list of dicts, each element of the list corresponds to an iteration
        of the given task. The keys of each dictionary should be the parameter names, and the value
        should be the value of that parameter.
        If parameter_set is an SFrame, each row corresponds to an iteration of the given task. The
        column names of the SFrame should correspond to the parameter names of the task.
        An empty parameter_set only logs a warning; the job is still created.

    name : str, optional
        The name to use for the job that will be returned. If not specified (or empty), a name of
        the form 'parallel_for_each-<task name>-<time>' is generated.

    environment : This must be an environment created from the :py:class:`~graphlab.deploy` module.
        This is the environment where the parallel-for-each will be executed. The environment could
        be a set of EC2 hosts or a Hadoop cluster. It could be an environment that executes locally
        in the background. Or it could be run as a blocking call on your local computer.
        When given, its `get_max_degree_of_parallelism()` decides how many tasks are scheduled
        per step; otherwise one task is scheduled per step.

    Returns
    -------
    job : :py:class:`~graphlab.deploy._job.Job`
        The job for the parallel-for-each, which was run using the `environment` parameter. This
        object can be used to track the progress of parallel-for-each work.

    Examples
    --------
    .. sourcecode:: python

    >>> my_task = graphlab.deploy.Task('My Task Name')

    >>> def add(task):
            total = task.params['x'] + task.params['y']
            task.outputs['sum'] = graphlab.SFrame({'sum': [total]})
    >>> my_task.set_code(add)
    >>> my_task.set_params(['x', 'y'])

    >>> my_task.set_outputs({'sum': '/tmp/my_sums'})
    >>> params = [{'x': 1, 'y': 2}, {'x': 0, 'y': 9}, {'x': 23, 'y': 3}]
    >>> env = graphlab.deploy.environment.LocalAsync('Background Runner')

    >>> my_job = graphlab.deploy.parallel_for_each(my_task, params, environment=env)
    >>> # The above call will return immediately.
    >>> # Once my_job.get_status() returns 'Completed' your results are ready.

    >>> results = graphlab.SFrame('/tmp/my_sums')
    >>> results
    +-----+-------------------+
    | sum |     parameters    |
    +-----+-------------------+
    |  3  |  {'y': 2, 'x': 1} |
    |  9  |  {'y': 9, 'x': 0} |
    |  26 | {'y': 3, 'x': 23} |
    +-----+-------------------+
    [3 rows x 2 columns]
    '''

    _get_metric_tracker().track('jobs.parallel_for_each')

    if not name:
        name = "parallel_for_each-%s-%s" % (task.name, _time())

    if isinstance(parameter_set, _SFrame):
        # Materialize the SFrame's rows into a plain list so both accepted input forms are
        # handled identically from here on.
        parameter_set = list(parameter_set)

    if not parameter_set:
        _log.warn('An empty parameter_set was given. No iterations will be run.')

    # Location handed to the step-assignment helper for the per-iteration outputs.
    temp_dir = _job._get_results_path(environment, name, 'temp')

    # Without an environment, schedule one task per step.
    task_per_step = 1
    if environment:
        task_per_step = environment.get_max_degree_of_parallelism()

    for_each_iterations, params_to_outputs = _generate_step_assignments(task, parameter_set,
                                                                        task_per_step, temp_dir)

    # Create the combiner task: it writes to the outputs declared on the original task and
    # receives the parameters-to-outputs mapping produced above.
    combiner = _Task('-'.join([name, 'combine']))
    combiner.set_outputs(task.get_outputs())
    combiner.set_params({_COMBINE_PARAMETER_NAME: params_to_outputs})
    combiner.set_code(_combine)
    # NOTE(review): presumably removes the helper task from the session's task registry so it
    # does not linger after being embedded in the pipeline -- confirm.
    _gl.deploy.tasks.delete(combiner)

    # The combiner runs as the final step, after all for-each iteration steps.
    pipeline = _Pipeline(name + "-Pipeline")
    pipeline.set_tasks(for_each_iterations + [[combiner]])
    return _job.create([pipeline], name=name, environment=environment)

示例#2

显示文件

文件： job.py 项目： eb777ez/Yelp-Recommendation-System

def create(tasks=None, name=None, environment=None, function=None, function_arguments=None, required_packages=None):
    """
    Creates a Job and begins executing it. The Job can be defined by either
    specifying a list of tasks, with optional bindings, or with a function,
    with arguments defined. Each Job also needs to know where to run, and that
    is specified by the environment.

    By default, this method will kick off asynchronous work, and return a Job
    object to monitor/manage that work.

    Parameters
    ----------
    tasks : list [Task | str | tuple [ str, dict ]] | str
        List of Tasks to run. Exactly one of `tasks` and `function` must be
        given. A value that is not a list is wrapped into a one-element list.

    name : str
        Name for this execution (names the returned Job). Default is
        'job-<first task name or function name>-<environment name>-<time>'.

    environment : :class:`~graphlab.deploy.environment.EC2` |
                  :class:`~graphlab.deploy.environment.Hadoop` |
                  :class:`~graphlab.deploy.environment.LocalAsync` |
                  str
        Optional environment for execution. This would commonly hold access
        keys, launch locations etc.  Also included in the environment object is
        a dictionary for associated metadata to pass to the execution. A str
        is treated as the name of a saved environment to load. Default is the
        saved environment named 'async'; if none exists a 'LocalAsync'
        environment with that name is created and saved, which will have the
        execution occur in the background locally. The environment is always
        cloned, so the caller's object is not mutated.

    function : function
        Function to be executed in this Job, with arguments to pass to this
        function specified by function_arguments. If a function is specified,
        then tasks cannot be specified. Specifying a function makes it easy to
        get code running in a remote environment.

        If the function returns a dict then it will be collated into the
        results. If the function returns something other than a dict, it will
        be cast to an str and that will be collated into the results. See the
        examples below for more information.

    function_arguments : dict | list [ dict ] | :class:`~graphlab.SFrame`
        Arguments to pass to the specified function as kwargs. To run multiple
        invocations of the function, simply specify a list of arguments or an
        SFrame. Each element in the list will result in invoking the function
        once. Each row of the SFrame will be used to invoke the function.

    required_packages : list [ str ] | set [ str ]
        List of package requirements (same as distutils.requires) format for
        packages required for running this Job. This is most useful to specify
        any non-standard Python packages required to run the function
        specified. The packages are added to the first task of the execution.

    Returns
    -------
    job : :py:class:`~graphlab.deploy._job.Job`
        Used for monitoring and managing the execution of the Job. When the
        job was created from `function`, a `get_results()` method is attached
        to it. If a LicenseValidationException is raised while starting the
        job, it is logged and None is returned.

    Raises
    ------
    TypeError
        If neither or both of `tasks` and `function` are given, if a named
        environment cannot be loaded, if `name` is not a str, if `function`
        is not a Python function, or if a referenced Task cannot be found.
    RuntimeError
        If a function argument has an unsupported type, if a job with the
        same name already exists, or if output validation for the environment
        fails.
    Exception
        If `environment` is neither a str nor an Environment, or is of an
        unknown execution environment type.

    Notes
    -----
    - When this method is invoked, each Task specified is cloned and a snapshot
      of it is used for execution. This snapshot can then be queried by
      inspecting the Job object returned.

    Examples
    --------
    Creating a Job using a function instead of Tasks is easy. Just define a
    function and then use it when calling job.create.

    Using a list of dicts to specify arguments:
        >>> def sum_four(one, two, three, four):
        ...     return {'sum': one + two + three + four}
        >>>
        >>> job = graphlab.deploy.job.create(
        ...             function=sum_four,
        ...             function_arguments=[{'one':1, 'two':2,
        ...                 'three':3, 'four':4}])
        >>>
        >>> results = job.get_results() # SFrame with aggregated results

    Using an SFrame to specify multiple sets of arguments:
        >>> def mult_three(one, two, three):
        ...     return {'product': one * two * three}
        >>>
        >>> sf = graphlab.SFrame(data={'one':[1,5], 'two':[2,6], 'three':[3,7]})
        >>> job = graphlab.deploy.job.create(function=mult_three,
        ...                                  function_arguments=sf)
        >>> job.get_results()
        +----+--------------------------------+------------------+-----------+
        | id |             input              |      result      |   status  |
        +----+--------------------------------+------------------+-----------+
        | 0  | {'three': 3, 'two': 2, 'on ... |  {'product': 6}  | Completed |
        | 1  | {'three': 7, 'two': 6, 'on ... | {'product': 210} | Completed |
        +----+--------------------------------+------------------+-----------+
        +---------------------------+---------------------------+---------+
        |         start_time        |          end_time         | message |
        +---------------------------+---------------------------+---------+
        | 2014-11-17 11:06:38+00:00 | 2014-11-17 11:06:38+00:00 |         |
        | 2014-11-17 11:06:40+00:00 | 2014-11-17 11:06:40+00:00 |         |
        +---------------------------+---------------------------+---------+
        [2 rows x 7 columns]

    Each entry in the tasks list could be a pair with a dictionary of bindings
    for that entry. For example:

        >>> tasks = [('task1', {'input':'s3://big-file'}),
        ...          ('task2', {'input':'/localfile'})]
        >>> graphlab.deploy.job.create(tasks, name='with-bindings')

    """
    # Reject invalid argument combinations before touching any global state.
    if tasks is None and function is None:
        raise TypeError("tasks or function needs to be defined")
    if tasks is not None and function is not None:
        raise TypeError("Cannot specify BOTH tasks and function")

    tracker = _mt._get_metric_tracker()
    _session = _gl.deploy._default_session

    if environment is None:
        # Fall back to the saved "async" environment, creating it on first use.
        environment = _gl.deploy.environments["async"]
        if environment is None:
            environment = _environment.LocalAsync("async")
            environment.save()
    elif isinstance(environment, str):
        # Keep the requested name: the lookup result replaces `environment`,
        # and the error message must still report what the caller asked for.
        environment_name = environment
        __LOGGER__.debug("Loading environment: %s" % environment_name)
        environment = _gl.deploy.environments[environment_name]
        if environment is None:
            raise TypeError(
                "Environment cannot be loaded correctly with name '%s', please confirm this environment exists by calling graphlab.deploy.environments."
                % environment_name
            )
    elif not isinstance(environment, _environment.Environment):
        raise Exception("Unknown type of environment")

    # always clone the environment, so not mutating existing
    environment = environment.clone()
    __LOGGER__.info("Preparing using environment: %s" % environment.name)

    if name is not None and not isinstance(name, str):
        raise TypeError("The name you gave for this job is not a string.")

    __LOGGER__.info("Beginning Job Validation.")

    # special handling for function= parameter
    combiner = None
    if function is not None:
        # clobber tasks specified and create a Task for the execution of the function
        tasks = []
        if not _inspect.isfunction(function):
            raise TypeError("Invalid function, must be a Python function.")

        # Normalize function_arguments to an iterable of per-invocation bindings.
        bindings = function_arguments
        if bindings is None:
            bindings = [{}]
        elif not isinstance(bindings, list) and not isinstance(bindings, _gl.SFrame):
            bindings = [bindings]

        # if no name specified make sure the Task names are prefixed with Job name to ensure uniqueness
        if name is None or name == "":
            name = "job-%s-%s-%s" % (function.__name__, environment.name, _time.time())

        combiner = _Task(name + "-combiner")
        combiner.set_code(_combiner_task)

        # One wrapper task per binding; each feeds its output into the combiner.
        for idx, binding in enumerate(bindings):
            task = _Task("%s-%s-%d" % (name, function.__name__, idx))
            task.set_code(_wrapper_task)

            # validate that no GL data structures being passed in function_arguments
            for key, value in binding.items():
                if _Task._is_valid_data_structure(value):
                    raise RuntimeError(
                        "Validation Failed: Unsupported type for function_arguments. Function arguments must be basic types that can be serialized into JSON. Invalid function_argument: '%s', type: '%s'"
                        % (key, type(value))
                    )

            task.set_params({"params": binding, "function": function})
            task.set_outputs(["output"])

            # create dependency for output from task to combiner task
            combiner.set_inputs({"in-%d" % idx: (task, "output")})
            tasks.append(task)
            _gl.deploy.tasks.delete(task)

        combiner.set_params({"num": len(bindings)})
        tasks.append(combiner)
        _gl.deploy.tasks.delete(combiner)
        tracker.track("deploy.job.create.fn", value=1)

    # now make the artifacts a list of objects
    if not isinstance(tasks, list):
        # not a list, let's turn it into a list
        tasks = [tasks]

    # if Environment object missing num_hosts attribute, set to 1
    if not hasattr(environment, "num_hosts"):
        environment.num_hosts = 1

    # If environment.num_hosts > 1 and not using model_parameter_search or parallel_for_each then
    # reset num_host = 1, since multiple hosts will not be used.
    if environment.num_hosts != 1 and all(isinstance(entry, _Task) for entry in tasks):
        __LOGGER__.warn(
            "Ignoring Environment.num_hosts value since execution will occur only on one host. Using num_hosts=1 for this execution."
        )
        environment.num_hosts = 1

    # add required packages to first task in execution
    # ensures the packages will be present on execution
    if required_packages is not None:
        packages = tasks[0].get_required_packages()
        tasks[0].set_required_packages(packages | set(required_packages))

    if name is None or name == "":
        # Derive the job name from the first entry, which may be a
        # (task, bindings) pair, a task name, or a Task object.
        task = tasks[0]
        if isinstance(task, tuple):
            task = task[0]
        if isinstance(task, str):
            names = task
        else:
            names = task.name
        name = "job-%s-%s-%s" % (names, environment.name, _time.time())

    # if using fn= parameter, we need to wait until name has been determined to
    # set the results_path, so now that the name is settled, set results_path
    if combiner is not None:
        results_path = _get_results_path(environment, name)
        __LOGGER__.info("Job Results SFrame stored: %s" % results_path)
        combiner.set_outputs({"results": results_path})

    validation_msgs = []

    # verify job name is unique
    if _gl.deploy.jobs[name] is not None:
        # found another job same name, fail
        raise RuntimeError(
            "Validation Error: Job already exists with the name '%s', please rename or delete the existing job." % name
        )

    # Create artifact from their names, if necessary. Clone all artifacts. Add any bindings.
    cloned_artifacts = []
    for steps in tasks:

        # A Pipeline is cloned as a single unit: it carries no bindings here and
        # its outputs are not validated against the environment.
        if isinstance(steps, _Pipeline):
            cloned_artifacts.append(steps._clone(steps.name, session_aware=False))
            continue

        if not isinstance(steps, list):
            steps = [steps]

        cloned_step = []
        for step in steps:
            binding = None
            if isinstance(step, tuple):
                (cur_artifact, binding) = step
            else:
                cur_artifact = step
            # Anything that is not already a Task/Pipeline is looked up in the session.
            if not isinstance(cur_artifact, _Task) and not isinstance(cur_artifact, _Pipeline):
                cur_artifact = _session._open(cur_artifact, {}, check_cache=True, typename="Task")

            if cur_artifact is None:
                raise TypeError("Unable to find Task to try to run")

            clone = cur_artifact._clone(cur_artifact.name, session_aware=False)

            # apply bindings if paired with task
            if binding is not None:
                _apply_binding_to_task(clone, binding)

            # if environment is not local then write out any outputs not bound to a location to an
            # intermediate location, so any subsequent steps can find the output
            _validate_output_to_environment(clone, environment, validation_msgs)

            cloned_step.append(clone)

        cloned_artifacts.append(cloned_step)

    # Pick the execution backend matching the environment type and record the metric.
    num_tasks = len(cloned_artifacts)
    if isinstance(environment, _environment.Local):
        tracker.track("deploy.job.create.local", value=1, properties={"num_tasks": num_tasks})
        env = _env.LocalExecutionEnvironment()
    elif isinstance(environment, _environment.LocalAsync):
        tracker.track("deploy.job.create.localasync", value=1, properties={"num_tasks": num_tasks})
        env = _env.LocalAsynchronousEnvironment()
    elif isinstance(environment, _environment.EC2):
        tracker.track("deploy.job.create.ec2", value=1, properties={"num_tasks": num_tasks})
        # name the ec2 instance the job name
        if not environment.tags:
            environment.tags = {}
        if "Name" not in environment.tags:
            environment.tags["Name"] = name
        environment.tags["Job"] = name
        env = _env.Ec2ExecutionEnvironment()
    elif isinstance(environment, _environment.Hadoop):
        tracker.track("deploy.job.create.hadoop", value=1, properties={"num_tasks": num_tasks})
        env = _env.HadoopExecutionEnvironment()
    else:
        raise Exception("Validation Failed: Unknown execution environment.")

    if len(validation_msgs) > 0:
        for msg in validation_msgs:
            __LOGGER__.error(msg)
        raise RuntimeError(
            "Validation Failed: output(s) not set to appropriate location for execution environment. See logs for more details."
        )

    try:
        __LOGGER__.info("Validation complete. Job: '%s' ready for execution" % name)
        job = env.run(_session, cloned_artifacts, name, environment)
        _session.register(job)
        job.save()  # save the job once prior to returning.

        # add a .get_results() method to this job.
        if function is not None:
            job.get_results = _types.MethodType(_get_results, job)

        return job
    except LicenseValidationException as e:
        # catch exception and print license check hint message here instead of raise;
        # the function then falls through and returns None.
        __LOGGER__.info(e)

示例#3

显示文件

文件： model_parameter_search.py 项目： eb777ez/Yelp-Recommendation-System

def model_parameter_search(model_factory, train_set_path, save_path, test_set_path=None,
                           standard_model_params=None, hyper_params=None, max_num_models='all',
                           name=None, environment = None):
    '''
    Searches for optimal model parameters. Automatically creates models using different parameters.
    Optionally, evaluates these models using a test set.

    You can specify the values of parameters you want to search over, using the `hyper_params`
    parameter. By default if you specify this, all possible combinations of all parameter values
    will be tried. If you do not specify `hyper_params`, a set of default parameter combinations,
    based on the model type, will be tried.

    This function can also be used for random search, rather than grid search. Using the
    `max_num_models` you can specify max number of models to try; that many combinations of
    `hyper_params`, will be randomly picked from all possible combinations.

    model_parameter_search is supported for:
    :py:class:`~graphlab.linear_regression.LinearRegression`,
    :py:class:`~graphlab.boosted_trees_regression.BoostedTreesRegression`,
    :py:class:`~graphlab.logistic_classifier.LogisticClassifier`,
    :py:class:`~graphlab.svm_classifier.SVMClassifier`,
    :py:class:`~graphlab.boosted_trees_classifier.BoostedTreesClassifier`,
    :py:class:`~graphlab.neuralnet_classifier.NeuralNetClassifier`,
    :py:class:`~graphlab.recommender.factorization_recommender.FactorizationRecommender`, and
    :py:class:`~graphlab.kmeans.KmeansModel`

    Parameters
    ----------
    model_factory : function
        This is the function (always ending in "create") that you would normally use to create the
        model. For example, if you wanted to use `model_parameter_search` for
        :py:class:`~graphlab.linear_regression.LinearRegression`, you would set this parameter
        to `graphlab.linear_regression.create`.

    train_set_path : str
        Path to a saved SFrame containing the train set. If running locally, this must be an absolute
        path. If running in EC2, this must be an S3 path. If running in Hadoop, this must be an HDFS
        path.

    save_path : str
        Path where the result of the model parameter search will be saved. Results will be saved as
        an SFrame with one row for each model that was created. If running locally, this path must be
        an absolute path. If running in EC2, this must be an S3 path. If running in Hadoop, this must
        be an HDFS path.

    test_set_path : str, optional
        Path to a saved SFrame containing the test set. This SFrame must be in the same format as the
        train set. If running locally, this must be an absolute path. If running in EC2, this must
        be an S3 path. If running in Hadoop, this must be an HDFS path.

    standard_model_params : dict, optional
        A set of arguments passed to every call to `model_factory`: these parameters will be passed
        to every model that gets created as part of the search. Defaults to no extra arguments.

    hyper_params : dict, optional
        The keys in the dictionary must be the names of the `model_factory` parameters that you want
        to search over. The value for a key will depend on the type that `model_factory` expects for
        that parameter. If the type of the parameter is not a dictionary, use a list of all the
        values you want to search over. For example if we were using
        :class:`~graphlab.recommender.FactorizationRecommender` and want to search over all
        `num_factors` from 10 to 15 and the `regularization` values of 1 and 5, you would use:
        `{ 'num_factors': [10, 11, 12, 13, 14, 15], 'regularization': [1, 5] }`. If `model_factory`
        expects a dictionary for a parameter, the same rules apply recursively (see boosted trees
        example below).

    max_num_models : int | 'all', optional
        The max number of models to create. If `max_num_models` is less than the number of possible
        combinations of `hyper_params`, `max_num_models` of the possible `hyper_params` combinations
        will be randomly picked. The default, 'all', tries every combination.

    name : str, optional
        The name for the :class:`Job <graphlab.deploy>` that will be created. If not specified then
        the name will be 'Model-Parameter-Search-(timestamp)'.

    environment : Environment
        This must be an environment created from the :py:class:`~graphlab.deploy` module. This is
        the environment where the parameter search will actually be done. The environment could be a
        set of EC2 hosts or a Hadoop cluster. It could be an environment that executes locally on
        your computer either as a blocking call or as an asynchronous background job.

    Returns
    -------
    out : :class:`job <graphlab.deploy>` object
        The job for the parameter search, which was run using the `environment` parameter. This
        object can be used to track the progress of the parameter search.

    Examples
    --------

    The easiest option to use for an environment is just to have it execute synchronously (i.e. as a
    blocking call) on your local computer. For that you'll need to create a
    :class:`Local Environment <graphlab.deploy>`. Since you're running it locally and synchronously,
    you don't need to worry about the return value of `model_parameter_search`; the function will not
    return until the parameter search is completed.

    .. sourcecode:: python

        # SFrames need to be saved. Make sure all file paths are absolute.
        >>> train_file_path = '/data/train_set.gl'
        >>> test_file_path = '/data/test_set.gl'
        >>> test_set.save(test_file_path)
        >>> train_set.save(train_file_path)

        # Setup a python environment
        >>> env = graphlab.deploy.environment.Local('local env')

        # For Kmeans using the default range of hyper-parameters for that model.
        >>> model_parameter_search(graphlab.kmeans.create, train_file_path, '/data/results.gl',
                                   environment = env)

        # For Matrix Factorization with a target of 'rating', using the default range of hyper-params.
        >>> model_parameter_search(graphlab.factorization_recommender.create, train_file_path,
                                   '/data/results.gl', test_set_path = test_file_path,
                                   standard_model_params = {'target': 'rating'},
                                   environment = env
                                  )


        # For boosted trees with a target of 'results', trying only a max_depth 5 and 10.
        # This time the parameter search will be done asynchronously on an EC2 host.
        >>> train_file_path = 's3://my-bucket-name/train_file_path'
        >>> test_file_path = 's3://my-bucket-name/test_file_path'
        >>> test_set.save(test_file_path)
        >>> train_set.save(train_file_path)

        # Never paste real credentials into source code or documentation.
        >>> graphlab.aws.set_credentials('<AWS_ACCESS_KEY_ID>', '<AWS_SECRET_ACCESS_KEY>')
        >>> ec2_env = graphlab.deploy.environment.EC2('ec2 env', 's3://my-bucket-name/log-dir')
        >>> job = model_parameter_search(graphlab.boosted_trees_classifier.create, train_file_path,
                                         's3://my-bucket-name/save_file_path',
                                         test_set_path = test_file_path,
                                         standard_model_params = {'target': 'result'},
                                         hyper_params = {'max_depth': [5, 10]},
                                         environment = ec2_env
                                        )

        # Get the status of the jobs
        >>> job.get_status()

    Upon completion, an SFrame will be saved at the path specified by the
    save_path parameter. This SFrame will contain the following columns:

        model_name: the type of model
        model_details: additional model details (all the data available from
          a `Model`'s `list_fields` method)
        parameters: the parameters used to generate the model associated with
          that row

    And if the test_set_path parameter was specified, there will be one
    additional column:

        test_metrics: additional model statistics generated on the test set
    '''

    _get_metric_tracker().track('jobs.model_parameter_search')

    # Use a fresh dict per call instead of a shared mutable default argument.
    if standard_model_params is None:
        standard_model_params = {}

    if name is None:
        name = "Model-Parameter-Search-%s" % _time()

    # Determine search space
    if hyper_params is None:
        hyper_params = _get_default_parameter_range(model_factory, standard_model_params)
    search_space = _get_all_parameters_combinations(hyper_params)
    # Random search: sample only when an explicit cap is below the full grid size.
    if max_num_models != 'all' and max_num_models < len(search_space):
        search_space = _random_sample(search_space, max_num_models)
    # Every parameter combination also receives the standard (fixed) model parameters.
    for combination in search_space:
        _recursive_dict_update(combination, standard_model_params)

    # A single train/test task is run once per parameter combination; its results go to save_path.
    train_test_task = _Task("Model Train Test %s" % _time())
    train_test_task.set_code(_train_test_model)
    train_test_task.set_outputs({'results': save_path})
    train_test_task.set_params({'train_set': train_set_path, 'test_set': test_set_path,
                                'model_factory': model_factory
                                })
    # NOTE(review): presumably removes the helper task from the session's task registry so it
    # does not linger once handed to the parallel-for-each job -- confirm.
    _gl.deploy.tasks.delete(train_test_task)

    return _parallel_for_each(train_test_task, search_space, name = name, environment = environment)