Example #1
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        >>> model.save('my_model_file')
        >>> loaded_model = gl.load_model('my_model_file')

        """
        # Save to a temporary pickle file.
        temp_file = tempfile.mktemp()
        self._save_to_pickle(temp_file)

        # Write the pickle file to an OARC
        if not self.__proxy__:
            self.__proxy__ = _gl.extensions._PythonModel()

        # The proxy contains the file.
        self.__proxy__.temp_file = temp_file
        wrapper = self._get_wrapper()
        return glconnect.get_unity().save_model(self.__proxy__,
                          _make_internal_url(location), wrapper)
Example #2
def _get_commander_args(function_name,
                        data,
                        working_dir,
                        num_workers,
                        shared_lib='./libdml_toolkits.so',
                        cluster_type='standalone_passive',
                        output_name='out',
                        **kwargs):
    """
    Get a list of arguments for dml_commander_startup
    """
    args = dict()
    # from arguments
    args['function'] = function_name
    args['args'] = data
    args['num_nodes'] = num_workers
    args['working_dir'] = _make_internal_url(working_dir)

    # from optional arguments
    args['shared_lib'] = shared_lib
    args['cluster_type'] = cluster_type
    args['output_name'] = output_name

    # from kwargs, could overwrite existing args
    accepted_args = list(args.keys()) + [
        'check_hdfs', 'startup_timeout', 'metric_server_address_file',
        'metric_server_port'
    ]
    for key in accepted_args:
        if key in kwargs:
            args[key] = kwargs[key]

    # return a formatted list
    return ['--%s=%s' % (k, v) for k, v in args.items()]
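A minimal usage sketch for the helper above; the function name, paths, worker count, and timeout are placeholders rather than values from the original project:

# Hypothetical call to _get_commander_args as defined above. The result is a
# list of '--key=value' flags for dml_commander_startup; all values here are
# illustrative.
flags = _get_commander_args(function_name='distributed_graph_ingress',
                            data='<serialized arguments>',
                            working_dir='hdfs://namenode:8020/tmp/dml_job_0001',
                            num_workers=4,
                            startup_timeout=300)   # picked up via **kwargs
print(flags)
# e.g. ['--function=distributed_graph_ingress', '--args=<serialized arguments>',
#       '--num_nodes=4', ...]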
Example #3
def _dml_serialize_args(data, working_dir, args):
    logger.info('Serializing arguments to %s' % working_dir)
    data_copy = copy.copy(data)
    internal_working_dir = _make_internal_url(working_dir)
    data_copy['__base_path__'] = internal_working_dir
    args.from_dict(data_copy, internal_working_dir)
    logger.debug('Serialized arguments: %s' % args.to_str())
Example #4
File: _model.py  Project: zhmz90/Dato-Core
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        >>> model.save('my_model_file')
        >>> loaded_model = gl.load_model('my_model_file')

        """
        # Save to a temporary pickle file.
        temp_file = tempfile.mktemp()
        self._save_to_pickle(temp_file)

        # Write the pickle file to an OARC
        if not self.__proxy__:
            self.__proxy__ = _gl.extensions._PythonModel()

        # The proxy contains the file.
        self.__proxy__.temp_file = temp_file
        wrapper = self._get_wrapper()
        return glconnect.get_unity().save_model(self.__proxy__,
                                                _make_internal_url(location),
                                                wrapper)
Example #5
    def save(self, location):
        """
        Save the transformer into a GraphLab archive. The object is saved as a
        directory which can then be loaded using the
        :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote
            URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        .. sourcecode:: python

            >>> model.save('my_model_file')
            >>> loaded_model = gl.load_model('my_model_file')

        """
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.save')
        return glconnect.get_unity().save_model(self.__proxy__,
                                                _make_internal_url(location),
                                                self._get_wrapper())
Example #6
    def save(self, location):
        """
        Save the transformer into a GraphLab archive. The object is saved as a
        directory which can then be loaded using the
        :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        .. sourcecode:: python

            >>> model.save('my_model_file')
            >>> loaded_model = gl.load_model('my_model_file')

        """
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.save')
        return glconnect.get_unity().save_model(self.__proxy__,
                             _make_internal_url(location), self._get_wrapper())
Example #7
def _get_commander_args(function_name, data, env,
                        cluster_type='standalone_passive',
                        output_name='out',
                        **kwargs):
    args = dict()
    # from arguments
    args['function'] = function_name
    args['args'] = data
    args['num_nodes'] = env.num_workers
    args['working_dir'] = _make_internal_url(env.working_dir)

    # from optional arguments
    args['shared_lib'] = env.LIB_UNITY_DISTRIBUTED_PATH
    args['cluster_type'] = cluster_type
    args['output_name'] = output_name

    # from kwargs, could overwrite existing args
    accepted_args = args.keys() + ['check_hdfs', 'startup_timeout',
                                   'metric_server_address_file',
                                   'metric_server_port']
    for key in accepted_args:
        if key in kwargs:
            args[key] = kwargs[key]

    # return a formatted list
    return ['--%s=%s' % (k, v) for k, v in args.iteritems()]
Example #8
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    # Check if the location is a dir_archive, if not, use glunpickler to load
    # as pure python model

    # We need to fix this sometime, but here is the explanation of the stupid
    # check below:
    #
    # If the location is a http location, skip the check, and directly proceed
    # to load model as dir_archive. This is because
    # 1) exists() does not work with http protocol, and
    # 2) GLUnpickler does not support http
    if (not file_util.get_protocol(location) in ['http', 'https']) and \
            (not file_util.exists(location +  '/dir_archive.ini')):
        # Not a dir_archive, so try unpickling as a pure Python model.
        unpickler = gl_pickle.GLUnpickler(location)

        # Get the version
        version = unpickler.load()

        # Load the class name.
        cls_name = unpickler.load()
        cls = _get_class_from_name(cls_name)

        # Load the object with the right version.
        model = cls._load_version(unpickler, version)

        unpickler.close()

        # Return the model
        return model
    else:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)
Example #9
def load_images(url, format='auto', with_path=True, recursive=True, ignore_failure=True, random_order=True):
    """
    Loads images from a directory. JPEG and PNG images are supported.

    Parameters
    ----------
    url : str
        The string of the path where all the images are stored.

    format : {'PNG' | 'JPG' | 'auto'}, optional
        The format of the images in the directory. The default 'auto' parameter
        value tries to infer the image type from the file extension. If a
        format is specified, all images must be of that format.

    with_path : bool, optional
        Indicates whether a path column is added to the SFrame. If 'with_path'
        is set to True,  the returned SFrame contains a 'path' column, which
        holds a path string for each Image object.

    recursive : bool, optional
        Indicates whether 'load_images' should do recursive directory traversal,
        or a flat directory traversal.

    ignore_failure : bool, optional
        If True, prints a warning for each failed image and keeps loading the
        rest of the images.

    random_order : bool, optional
        Load images in random order, which is useful for stochastic gradient
        descent-like algorithms.

    Returns
    -------
    out : SFrame
        Returns an SFrame with either an 'image' column or both an 'image' and
        a 'path' column. The 'image' column is a column of Image objects. If
        with_path is True, there is also a 'path' column which contains the image
        path for each corresponding Image object.

    Examples
    --------

    >>> url ='s3://gl-testdata/images/nested'
    >>> image_sarray = graphlab.image_analysis.load_images(url, "auto", with_path=False,
    ...                                                    recursive=True)
    """
    _mt._get_metric_tracker().track('image_analysis.load_images')
    import graphlab.extensions as _extensions
    return _extensions.load_images(_make_internal_url(url), format, with_path,
                                     recursive, ignore_failure, random_order)
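As a quick, illustrative follow-up to the options documented above (the S3 location is the one from the docstring; everything else is a sketch, not part of the original module):

# Sketch: load images recursively together with their paths, then inspect the
# result. The URL reuses the docstring's example location; the column names
# ('image', 'path') are the ones documented in the Returns section.
sf = graphlab.image_analysis.load_images('s3://gl-testdata/images/nested',
                                         format='auto', with_path=True,
                                         recursive=True)
print(sf.column_names())      # should include 'image' and 'path'
first_image = sf['image'][0]  # an Image object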
Example #10
def _get_worker_args(worker_id, working_dir, **kwargs):
    """
    Get a list of arguments for dml_worker_startup
    """
    args = dict()
    args['worker_id'] = worker_id
    args['working_dir'] = _make_internal_url(working_dir)

    accepted_args = ['check_hdfs', 'startup_timeout', 'consensus_address'
                     ] + list(args.keys())
    for key in accepted_args:
        if key in kwargs:
            args[key] = kwargs[key]
    return ['--%s=%s' % (k, v) for k, v in args.items()]
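The companion worker helper can be sketched the same way; the rank may be a literal integer or a shell placeholder such as '${MY_RANK}' (as used by run() further down this page), and the directory is again illustrative:

# Hypothetical call to _get_worker_args; all values are illustrative only.
worker_flags = _get_worker_args('${MY_RANK}',
                                'hdfs://namenode:8020/tmp/dml_job_0001',
                                startup_timeout=300)
# e.g. ['--worker_id=${MY_RANK}', '--working_dir=...', '--startup_timeout=300']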
Example #11
File: image.py  Project: zhmz90/Dato-Core
    def __init__(self, path=None, format='auto', **__internal_kw_args):
        self._image_data = bytearray()
        self._height = 0
        self._width = 0
        self._channels = 0
        self._image_data_size = 0
        self._version = _CURRENT_VERSION
        self._format_enum = _format[_UNDEFINED]

        if (path is not None):
            from graphlab.util import _make_internal_url
            import graphlab.extensions as _extensions
            img = _extensions.load_image(_make_internal_url(path), format)
            for key, value in img.__dict__.iteritems():
                setattr(self, key, value)
        else:
            for key, value in __internal_kw_args.items():
                setattr(self, key, value)
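Assuming the class above is exposed as the public graphlab.Image type with the usual width/height/channels accessors, a minimal construction sketch (the file path is a placeholder):

# Sketch: build an Image from a local file and read its metadata.
# 'photo.jpg' is a placeholder path; width/height/channels are assumed to be
# the public accessors for the private attributes set in __init__ above.
import graphlab
img = graphlab.Image(path='photo.jpg', format='auto')
print(img.width, img.height, img.channels)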
Example #12
    def __init__(self, path=None, format='auto', **__internal_kw_args):
        self._image_data = bytearray()
        self._height = 0
        self._width = 0
        self._channels = 0
        self._image_data_size = 0
        self._version = _CURRENT_VERSION
        self._format_enum = _format[_UNDEFINED]

        if (path is not None):
            from graphlab.util import make_internal_url as _make_internal_url
            import graphlab.extensions as _extensions
            img = _extensions.load_image(_make_internal_url(path), format)
            for key, value in img.__dict__.iteritems():
                setattr(self, key, value)
        else:
            for key, value in __internal_kw_args.items():
                setattr(self, key, value)
Example #13
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    try:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)
    except Exception as e:
        if isinstance(e, ToolkitError):
            raise
        else:
            # Not a ToolkitError so try unpickling the model.
            unpickler = gl_pickle.GLUnpickler(location)

            # Get the version
            version = unpickler.load()

            # Load the class name.
            cls_name = unpickler.load()
            cls = _get_class_from_name(cls_name)

            # Load the object with the right version.
            model = cls._load_version(unpickler, version)

            unpickler.close()

            # Return the model
            return model
Example #14
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Note that the diverse_sampler stores the data internally, so you can
        save the model, then load it later and sample from the loaded model
        immediately.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        .. sourcecode:: python

            >>> ground_set = graphlab.SFrame({'id': [0, 1, 2],
                                              'feature_1': [3, 1, 2],
                                              'feature_2': [[0, 1], [0, 1], [1, 0]]})
            >>> sampler = graphlab.diversity.diverse_sampler.create(data=ground_set,
                                                                    item_id='id',
                                                                    quality_feature='feature_1',
                                                                    similarity_features=['feature_2'])
            >>> sampler.save('my_sampler')
            >>> loaded_sampler = graphlab.load_model('my_sampler')
            >>> loaded_sampler.sample(k=2)
            +-----------+------------+----+
            | feature_1 | feature_2  | id |
            +-----------+------------+----+
            |     2     | [0.0, 1.0] | 1  |
            |     1     | [1.0, 0.0] | 2  |
            +-----------+------------+----+

        """
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.save')
        return glconnect.get_unity().save_model(self.__proxy__,
                                                _make_internal_url(location),
                                                self._get_wrapper())
Example #15
  def save(self, location):
    """
    Save the model. The model is saved as a directory which can then be
    loaded using the :py:func:`~graphlab.load_model` method.

    Note that the diverse_sampler stores the data internally, so you can
    save the model, then load it later and sample from the loaded model
    immediately.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    See Also
    ----------
    graphlab.load_model

    Examples
    ----------
    .. sourcecode:: python

        >>> ground_set = graphlab.SFrame({'id': [0, 1, 2],
                                          'feature_1': [3, 1, 2],
                                          'feature_2': [[0, 1], [0, 1], [1, 0]]})
        >>> sampler = graphlab.diversity.diverse_sampler.create(data=ground_set,
                                                                item_id='id',
                                                                quality_feature='feature_1',
                                                                similarity_features=['feature_2'])
        >>> sampler.save('my_sampler')
        >>> loaded_sampler = graphlab.load_model('my_sampler')
        >>> loaded_sampler.sample(k=2)
        +-----------+------------+----+
        | feature_1 | feature_2  | id |
        +-----------+------------+----+
        |     2     | [0.0, 1.0] | 1  |
        |     1     | [1.0, 0.0] | 2  |
        +-----------+------------+----+

    """
    _mt._get_metric_tracker().track(self.__class__.__module__ + '.save')
    return glconnect.get_unity().save_model(self.__proxy__,
                           _make_internal_url(location), self._get_wrapper())
Example #16
    def save(self, url):
        """
        Save the neuralnet to url.

        Parameters
        ----------
        url : str
            The URL to save the network.

        Examples
        --------
        >>> import graphlab as gl
        >>> net = gl.deeplearning.get_builtin_neuralnet('mnist')
        >>> net.save('mnist.conf')

        See Also
        --------
        graphlab.deeplearning.load
        """
        _gl_connect.get_unity().__write__(_make_internal_url(url), self.__config_str__())
Example #17
    def save(self, location):
        """

        Parameters
        ----------
        location: str
            Filename.

        Returns
        -------
        out: None

        Examples
        --------

        """

        return glconnect.get_unity().save_model(self.__proxy__,
                                                _make_internal_url(location),
                                                self._get_wrapper())
Example #18
File: _model.py  Project: zhmz90/Dato-Core
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    return glconnect.get_unity().load_model(_make_internal_url(location))
Example #19
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    return glconnect.get_unity().load_model(_make_internal_url(location))
Example #20
    def save(self, url):
        """
        Save the neuralnet to url.

        Parameters
        ----------
        url : str
            The URL to save the network.

        Examples
        --------
        >>> import graphlab as gl
        >>> net = gl.deeplearning.get_builtin_neuralnet('mnist')
        >>> net.save('mnist.conf')

        See Also
        --------
        graphlab.deeplearning.load
        """
        _gl_connect.get_unity().__write__(_make_internal_url(url),
                                          self.__config_str__())
Example #21
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        >>> model.save('my_model_file')
        >>> loaded_model = graphlab.load_model('my_model_file')

        """
        _mt._get_metric_tracker().track('toolkit.model.save')
        return glconnect.get_unity().save_model(self, _make_internal_url(location))
Example #22
def _get_dml_exec_args(function_name, data, env, **kwargs):
    """
    Return a list of map job arguments for distributed exec
    """
    internal_working_dir = _make_internal_url(env.working_dir)
    startup_timeout = INIT_TIMEOUT_PER_WORKER * env.num_workers
    dml_environment_variables = _get_dml_environment_variables()
    map_job_args = [{'exe': env.COMMANDER_PATH,
                     'args': _get_commander_args(function_name, data, env, startup_timeout=startup_timeout, **kwargs),
                     'setup': _get_commander_setup(env.working_dir),
                     'teardown': _get_commander_teardown(env.working_dir),
                     'out_log_prefix': os.path.join(env.working_dir, COMMANDER_LOG_FILE),
                     'environment_variables': dml_environment_variables
                     }]

    for i in range(env.num_workers):
        worker_args = {'exe': env.WORKER_PATH,
                       'setup': _get_worker_setup(env.working_dir, i),
                       'args': _get_worker_args(i, internal_working_dir, startup_timeout=startup_timeout, **kwargs),
                       'out_log_prefix': os.path.join(env.working_dir, WORKER_LOG_FILE(i)),
                       'environment_variables': dml_environment_variables}
        map_job_args.append(worker_args)
    return map_job_args
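The returned list holds one descriptor for the commander followed by one per worker; a small illustrative helper that only inspects that structure (the dict keys are the ones assembled above):

# Illustrative only: summarize the process descriptors produced by
# _get_dml_exec_args. 'exe' and 'out_log_prefix' are keys set in the function
# above.
def describe_map_jobs(map_job_args):
    for i, job in enumerate(map_job_args):
        role = 'commander' if i == 0 else 'worker %d' % (i - 1)
        print('%s -> exe=%s, log=%s' % (role, job['exe'], job['out_log_prefix']))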
Example #23
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location: str
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        --------
        >>> model.save('my_model_file')
        >>> loaded_model = gl.load_model('my_model_file')
        """

        return glconnect.get_unity().save_model(self.__proxy__,
                                                _make_internal_url(location),
                                                self._get_wrapper())
Example #24
File: _model.py  Project: zhmz90/Dato-Core
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~graphlab.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        ----------
        graphlab.load_model

        Examples
        ----------
        >>> model.save('my_model_file')
        >>> loaded_model = graphlab.load_model('my_model_file')

        """
        _mt._get_metric_tracker().track('toolkit.model.save')
        return glconnect.get_unity().save_model(self,
                                                _make_internal_url(location))
Example #25
    def __init__(self, url=None, conf_str=None):
        """
        This constructor should not be called directly.
        Instead, please use :py:func:`~graphlab.deeplearning.load` or
        :py:func:`~graphlab.deeplearning.loads`.

        Constructs a NeuralNet from a URL or a configuration string.
        If neither parameter is provided, an empty NeuralNet is created by
        default.

        Parameters
        ----------
        url : str, optional
          The URL to a configuration file of the NeuralNet.
        conf_str : str, optional
          The configuration string of the NeuralNet.
        """
        self._layers = _LayerList()
        self._learning_params = {"learning_rate": 0.001, "momentum": 0.9}
        if url is not None:
            self._load(_make_internal_url(url))
        if conf_str is not None:
            self._loads(conf_str)
            self.verify()
Example #26
    def __init__(self, url=None, conf_str=None):
        """
        This constructor should not be called directly.
        Instead, please use :py:func:`~graphlab.deeplearning.load` or
        :py:func:`~graphlab.deeplearning.loads`.

        Constructs a NeuralNet from a URL or a configuration string.
        If neither parameter is provided, an empty NeuralNet is created by
        default.

        Parameters
        ----------
        url : str, optional
          The URL to a configuration file of the NeuralNet.
        conf_str : str, optional
          The configuration string of the NeuralNet.
        """
        self._layers = _LayerList()
        self._learning_params = {'learning_rate': 0.001, 'momentum': 0.9}
        if url is not None:
            self._load(_make_internal_url(url))
        if conf_str is not None:
            self._loads(conf_str)
            self.verify()
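As the docstring says, this constructor is normally reached through graphlab.deeplearning.load / loads; a round-trip sketch using the built-in MNIST network shown elsewhere on this page (the filename is a placeholder):

# Sketch: save a built-in network's configuration and reload it, which goes
# through __init__(url=...) above. 'mnist.conf' is a placeholder filename.
import graphlab as gl
net = gl.deeplearning.get_builtin_neuralnet('mnist')
net.save('mnist.conf')
same_net = gl.deeplearning.load('mnist.conf')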
Example #27
def make_sgraph(vertex_sframe,
                edge_sframe,
                output_path,
                vid_field,
                src_field,
                dst_field,
                num_partitions=8,
                _distributed='auto'):
    """
    Make an SGraph from the input vertex and edge SFrames,
    save the graph to output_path, and return the graph.

    Parameters
    ----------
    vertex_sframe : SFrame
        SFrame of vertex data

    edge_sframe : SFrame
        SFrame of edge data

    output_path : str
        Path where the final graph is saved to.

    vid_field : str
        Column name of vertex id in the vertex sframe.

    src_field : str
        Column name of source vertex id in the edge sframe.

    dst_field : str
        Column name of target vertex id in the edge sframe.

    num_partitions : int
        Number of partitions for the final sgraph.

    Returns
    -------
    out : SGraph
        The constructed graph, reloaded from ``output_path``.
    """
    if type(vid_field) is not str:
        raise TypeError('vid_field must be str')
    if type(src_field) is not str:
        raise TypeError('src_field must be str')
    if type(dst_field) is not str:
        raise TypeError('dst_field must be str')

    # Infer the vid type
    vid_type = None
    if (vertex_sframe is not None and len(vertex_sframe) > 0):
        vid_type = vertex_sframe[vid_field].dtype()
    elif (edge_sframe is not None and len(edge_sframe) > 0):
        vid_type = edge_sframe[src_field].dtype()
    else:
        vid_type = int

    # Create empty edge sframe if input is dummy
    if (edge_sframe is None or len(edge_sframe) == 0):
        edge_sframe = gl.SFrame()
        edge_sframe['__src_id'] = gl.SArray([], vid_type)
        edge_sframe['__dst_id'] = gl.SArray([], vid_type)
        src_field = '__src_id'
        dst_field = '__dst_id'

    # Create empty vertex sframe if input is dummy
    if (vertex_sframe is None or len(vertex_sframe) == 0):
        vertex_sframe = gl.SFrame()
        vertex_sframe['__id'] = gl.SArray([], vid_type)
        vid_field = '__id'

    _raise_error_if_not_sframe(vertex_sframe, "vertex_data")
    _raise_error_if_not_sframe(edge_sframe, "edge_data")

    if vid_field not in vertex_sframe.column_names():
        raise ValueError('Column %s not found in vertex_data' % vid_field)
    if src_field not in edge_sframe.column_names():
        raise ValueError('Column %s not found in edge_data' % src_field)
    if dst_field not in edge_sframe.column_names():
        raise ValueError('Column %s not found in edge_data' % dst_field)

    output_path = _make_internal_url(output_path)

    opts = {'vertex_data': vertex_sframe,
            'edge_data': edge_sframe,
            'output_path': output_path,
            'vid_field': vid_field,
            'src_field': src_field,
            'dst_field': dst_field,
            'num_partitions': num_partitions}

    run('distributed_graph_ingress', opts, env=_distributed)
    from graphlab.data_structures.sgraph import load_sgraph
    return load_sgraph(output_path)
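A toy invocation of make_sgraph; the SFrames, column names, and output path are made up for illustration, and the call assumes a working distributed setup:

# Sketch: build a tiny two-edge SGraph and persist it to output_path.
# All data and paths are illustrative.
import graphlab as gl
verts = gl.SFrame({'vid': [1, 2, 3]})
edges = gl.SFrame({'src': [1, 2], 'dst': [2, 3]})
g = make_sgraph(verts, edges, '/tmp/toy_sgraph',
                vid_field='vid', src_field='src', dst_field='dst',
                num_partitions=2)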
Example #28
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set='auto',
           max_depth=6,
           step_size=0.3,
           min_loss_reduction=0.0,
           min_child_weight=0.1,
           row_subsample=1.0,
           column_subsample=1.0,
           verbose=True,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a :class:`~graphlab.boosted_trees_regression.BoostedTreesRegression` to predict
    a scalar target variable using one or more features. In addition to standard
    numeric and categorical types, features can also be extracted automatically
    from list- or dictionary-type SFrame columns.


    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.
        Only a numerically typed (int, float) target column is allowed.

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, using all columns.

    max_iterations : int, optional
        The number of iterations for boosting. It is also the number of trees
        in the model.

    validation_set : SFrame, optional
        The validation set that is used to watch the validation result as
        boosting progresses.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1], optional
        Step size (shrinkage) used in each update to prevent overfitting.  It
        shrinks the prediction of each weak learner to make the boosting
        process more conservative.  The smaller the step size, the more
        conservative the algorithm will be. A smaller step_size is usually used
        together with a larger max_iterations.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, [0,1], optional
        Subsample the ratio of the training set in each iteration of tree
        construction.  This is called the bagging trick and usually can help
        prevent overfitting.  Setting it to 0.5 means that the model randomly
        samples half of the examples (rows) to grow each tree.

    column_subsample : float, [0,1], optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this also usually can help
        prevent overfitting.  Setting it to 0.5 means that the model randomly
        samples half of the columns to grow each tree.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'rmse', 'max_error'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve after <early_stopping_rounds>,
            stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If model_checkpoint_path is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must take
            exactly the same training data as the checkpointed model.

    Returns
    -------
      out : BoostedTreesRegression
          A trained gradient boosted trees model

    References
    ----------
    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesRegression, graphlab.linear_regression.LinearRegression, graphlab.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = graphlab.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> model = graphlab.boosted_trees_regression.create(train, target='label')

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)

    """
    _mt._get_metric_tracker().track(
        'toolkit.regression.boosted_trees_regression.create')

    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if 'model_checkpoint_path' in kwargs:
        kwargs['model_checkpoint_path'] = _make_internal_url(
            kwargs['model_checkpoint_path'])
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(
            kwargs['resume_from_checkpoint'])

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='boosted_trees_regression',
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       max_depth=max_depth,
                       step_size=step_size,
                       min_loss_reduction=min_loss_reduction,
                       min_child_weight=min_child_weight,
                       row_subsample=row_subsample,
                       column_subsample=column_subsample,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return BoostedTreesRegression(model.__proxy__)
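To illustrate the checkpoint-related kwargs documented above (the paths, iteration counts, and checkpoint name are placeholders following the naming pattern from the docstring):

# Sketch: checkpoint every 5 iterations while training, then resume from one
# of the saved checkpoints. 'train' and 'label' follow the docstring example;
# the paths are illustrative.
model = graphlab.boosted_trees_regression.create(
    train, target='label', max_iterations=50,
    model_checkpoint_path='/tmp/model_tmp',
    model_checkpoint_interval=5)
resumed = graphlab.boosted_trees_regression.create(
    train, target='label', max_iterations=50,
    resume_from_checkpoint='/tmp/model_tmp/model_checkpoint_45')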
Example #29
def run(function_name,
        model_name,
        options,
        deploy_environment,
        working_dir=None):
    from graphlab.extensions import init_dml_class_registry
    from graphlab.extensions import dml_function_invocation

    if type(deploy_environment) is not HadoopCluster and \
            type(deploy_environment) is not LocalAsync:
        raise TypeError('Deployment environment %s is not supported' %
                        str(type(deploy_environment)))

    # Working directory and num workers
    jobname = 'dml_job_%s' % str(uuid.uuid4())
    if not working_dir:
        working_dir = _dml_create_working_dir(jobname, deploy_environment)
    logger.info('Working directory: %s' % working_dir)

    num_workers = deploy_environment.get_num_workers()
    logger.info('Running with %d workers' % num_workers)

    # Substitute working_dir specific default options
    if 'model_checkpoint_path' in options and options[
            'model_checkpoint_path'] == 'auto':
        options['model_checkpoint_path'] = _make_internal_url(
            os.path.join(working_dir, 'checkpoints'))

    # Serialize arguments
    init_dml_class_registry()
    args = dml_function_invocation()
    _dml_serialize_args(options, working_dir, args)

    # Make argument list for dml_commander and dml_worker
    proc_arg_list = []
    proc_arg_list.append(
        _get_commander_args(function_name, args.to_str(), working_dir,
                            num_workers))
    proc_arg_list.append(_get_worker_args("${MY_RANK}", working_dir))

    # Distributed shell exec
    dshell = None
    job_handle = None
    if type(deploy_environment) is HadoopCluster:
        hadoop_cluster = deploy_environment
        dshell = _distributed_shell.HadoopDistributedShell(hadoop_cluster)
        shell_env = _get_environment_variables(hadoop_cluster)
        shell_script_file = _generate_hadoop_shell_script(
            proc_arg_list, shell_env, working_dir,
            hadoop_cluster.turi_dist_path)
        num_containers = num_workers + 1
        job_handle = dshell.run(function_name, shell_script_file,
                                num_containers)
    elif type(deploy_environment) is LocalAsync:
        raise NotImplementedError()
    else:
        raise ValueError('Unsupported deploy environment')

    from ._dml_job_status import DMLJobStatus
    if 'model_checkpoint_path' in options:
        return DMLJobStatus(model_name,
                            job_handle,
                            working_dir,
                            checkpoint_path=options['model_checkpoint_path'])
    return DMLJobStatus(model_name, job_handle, working_dir)
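A hedged sketch of driving run() with a Hadoop cluster handle; the cluster URI, toolkit function name, and options are placeholders, not values taken from the original project:

# Illustrative only: submit a distributed job through run() as defined above.
# 'distributed_train' is a placeholder function name; a real call would pass
# a registered toolkit function and its serialized options.
hdp_env = graphlab.deploy.hadoop_cluster.create(
    'my-first-hadoop-cluster',
    'hdfs://path-to-turi-distributed-installation')
status = run(function_name='distributed_train',
             model_name='boosted_trees_classifier',
             options={'model_checkpoint_path': 'auto'},
             deploy_environment=hdp_env)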
Example #30
def submit_training_job(env,
                        dataset,
                        target,
                        features=None,
                        max_iterations=10,
                        validation_set=None,
                        class_weights=None,
                        max_depth=6,
                        step_size=0.3,
                        min_loss_reduction=0.0,
                        min_child_weight=0.1,
                        row_subsample=1.0,
                        column_subsample=1.0,
                        random_seed=None,
                        metric='auto',
                        model_checkpoint_path='auto',
                        **kwargs):
    """
    Submit a job to create a (binary or multi-class) classifier model of type
    :class:`~graphlab.boosted_trees_classifier.BoostedTreesClassifier` using
    gradient boosted trees (sometimes known as GBMs).

    Parameters
    ----------
    env : graphlab.deploy.hadoop_cluster.HadoopCluster
        Hadoop cluster to which the training job is submitted.

    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.  String target variables are
        automatically mapped to integers in alphabetical order of the variable values.
        For example, a target variable with 'cat', 'dog', and 'foosa' as possible
        values is mapped to 0, 1, and, 2 respectively.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        except the target column.

    max_iterations : int, optional
        The maximum number of iterations for boosting. Each iteration results
        in the creation of an extra tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data is indicative of overfitting.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If provided, the dictionary must contain a key for each class
        label. The value can be any positive number greater than 1e-20. Weights
        are interpreted as relative to each other. So setting the weights to be
        2.0 for the positive class and 1.0 for the negative class has the same
        effect as setting them to be 20.0 and 10.0, respectively. If set to
        `None`, all classes are taken to have weight 1.0. The `auto` mode sets
        the class weight to be inversely proportional to the number of examples
        in the training data with the given class.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1], optional
        Step size (shrinkage) used in each update to prevent overfitting.  It
        shrinks the prediction of each weak learner to make the boosting
        process more conservative.  The smaller the step size, the more
        conservative the algorithm will be. Smaller step_size values work well when
        `max_iterations` is large.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, [0,1], optional
        Subsample the ratio of the training set in each iteration of tree
        construction.  This is called the bagging trick and can usually help
        prevent overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each tree.

    column_subsample : float, [0,1], optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this can also help prevent
        model overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the columns to grow each tree.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'accuracy', 'auc', 'log_loss'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve after <early_stopping_rounds>,
            stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If model_checkpoint_path is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must take
            exactly the same training data as the checkpointed model.

    Returns
    -------
      out : :class:`~graphlab.distributed._dml_job_status.DMLJobStatus`
          An object that tracks the execution of the distributed training job.

    References
    ----------

    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesClassifier, graphlab.logistic_classifier.LogisticClassifier, graphlab.svm_classifier.SVMClassifier, graphlab.neuralnet_classifier.NeuralNetClassifier

    Examples
    --------

    .. sourcecode:: python

      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = graphlab.SFrame.read_csv(url)
      >>> hdp_env = graphlab.deploy.hadoop_cluster.create('my-first-hadoop-cluster',
      ...    'hdfs://path-to-turi-distributed-installation')

      >>> train, test = data.random_split(0.8)
      >>> distr_job = graphlab.distributed.boosted_trees_classifier.submit_training_job(hdp_env, train, target='label')
      >>> model = distr_job.get_results()

      >>> predictions = model.classify(test)
      >>> results = model.evaluate(test)
    """
    _mt._get_metric_tracker().track(
        'distributed.toolkit.classifier.boosted_trees_classifier.submit_training_job'
    )

    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if model_checkpoint_path != 'auto':
        model_checkpoint_path = _make_internal_url(model_checkpoint_path)
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(
            kwargs['resume_from_checkpoint'])

    dml_obj = _sl.create(dataset=dataset,
                         target=target,
                         features=features,
                         model_name='boosted_trees_classifier',
                         env=env,
                         max_iterations=max_iterations,
                         validation_set=validation_set,
                         class_weights=class_weights,
                         max_depth=max_depth,
                         step_size=step_size,
                         min_loss_reduction=min_loss_reduction,
                         min_child_weight=min_child_weight,
                         row_subsample=row_subsample,
                         column_subsample=column_subsample,
                         metric=metric,
                         model_checkpoint_path=model_checkpoint_path,
                         **kwargs)
    return dml_obj
Example #31
def ext_import(soname, module_subpath=""):
    """
    Loads a graphlab toolkit module (a shared library) into the
    gl.extensions namespace.

    Toolkit module created via SDK can either be directly imported,
    e.g. ``import example`` or via this function, e.g. ``graphlab.ext_import("example.so")``.
    Use ``ext_import`` when you need more namespace control, or when
    the shared library is not local, e.g. on HTTP, S3, or HDFS.

    Parameters
    ----------
    soname : string
        The filename of the shared library to load.
        This can be a URL or an HDFS location. For instance, if soname is
        somewhere/outthere/toolkit.so, the functions in toolkit.so will appear
        in gl.extensions.toolkit.*.

    module_subpath : string, optional
        Any additional module paths to prepend to the toolkit module after
        it is imported. For instance if soname is
        somewhere/outthere/toolkit.so, by default
        the functions in toolkit.so will appear in gl.extensions.toolkit.*.
        However, if module_subpath="somewhere.outthere" is given, the functions
        in toolkit.so will appear in gl.extensions.somewhere.outthere.toolkit.*

    Returns
    -------
    out : a list of functions and classes loaded.

    Examples
    --------
    For instance, given a module which implements the function "square_root",

    .. code-block:: c++

        #include <cmath>
        #include <graphlab/sdk/toolkit_function_macros.hpp>
        double square_root(double a) {
          return sqrt(a);
        }

        BEGIN_FUNCTION_REGISTRATION
        REGISTER_FUNCTION(square_root, "a");
        END_FUNCTION_REGISTRATION

    compiled into example.so

    >>> graphlab.ext_import('example1.so')
    ['example1.square_root']

    >>> graphlab.extensions.example1.square_root(9)
    3.0

    We can customize the import location with module_subpath which can be
    used to avoid namespace conflicts when you have multiple toolkits with the
    same filename.

    >>> graphlab.ext_import('example1.so', 'math')
    ['math.example1.square_root']
    >>> graphlab.extensions.math.example1.square_root(9)
    3.0

    The module can also be imported directly, but graphlab *must* be imported
    first. graphlab will intercept the module loading process to load the
    toolkit.

    >>> import graphlab
    >>> import example1 #searches for example1.so in all the python paths
    >>> example1.square_root(9)
    3.0
    """
    unity = _gl.connect.main.get_unity()
    import os
    if os.path.exists(soname):
        soname = os.path.abspath(soname)
    else:
        soname = _make_internal_url(soname)
    ret = unity.load_toolkit(soname, module_subpath)
    if len(ret) > 0:
        raise RuntimeError(ret)
    _publish()
    # push the functions into the corresponding module namespace
    filename = os.path.basename(soname)
    modulename = filename.split('.')[0]
    return unity.list_toolkit_functions_in_dynamic_module(soname) + unity.list_toolkit_classes_in_dynamic_module(soname)
Example #32
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    # Check if the location is a dir_archive, if not, use glunpickler to load
    # as pure python model

    # We need to fix this sometime, but here is the explanation of the stupid
    # check below:
    #
    # If the location is a http location, skip the check, and directly proceed
    # to load model as dir_archive. This is because
    # 1) exists() does not work with http protocol, and
    # 2) GLUnpickler does not support http
    protocol = file_util.get_protocol(location)
    dir_archive_exists = False
    if protocol == '':
        model_path = file_util.expand_full_path(location)
        dir_archive_exists = file_util.exists(
            os.path.join(model_path, 'dir_archive.ini'))
    else:
        model_path = location
        if protocol in ['http', 'https']:
            dir_archive_exists = True
        else:
            import posixpath
            dir_archive_exists = file_util.exists(
                posixpath.join(model_path, 'dir_archive.ini'))

    if not dir_archive_exists:
        # Not a dir_archive, so try unpickling as a pure Python model.
        unpickler = gl_pickle.GLUnpickler(location)

        # Get the version
        version = unpickler.load()

        # Load the class name.
        cls_name = unpickler.load()
        cls = _get_class_from_name(cls_name)

        # Load the object with the right version.
        model = cls._load_version(unpickler, version)

        unpickler.close()

        # Return the model
        return model
    else:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)
Example #33
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set='auto',
           verbose=True,
           class_weights=None,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a (binary or multi-class) classifier model of type
    :class:`~graphlab.random_forest_classifier.RandomForestClassifier` using
    an ensemble of decision trees trained on subsets of the data.

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.  String target variables are
        automatically mapped to integers in alphabetical order of the variable values.
        For example, a target variable with 'cat', 'dog', and 'foosa' as possible
        values is mapped to 0, 1, and, 2 respectively.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        except the target column.

    max_iterations : int, optional
        The maximum number of iterations to perform. For multi-class
        classification with K classes, each iteration will create K-1 trees.

    max_depth : float, optional
        Maximum depth of a tree.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If set to `None`, all classes are taken to have weight one. The
        `auto` mode sets the class weight to be inversely proportional to the number of
        examples in the training data with the given class.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition on a
        leaf node of the tree. The larger it is, the more conservative the
        algorithm will be. Must be non-negative.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, optional
        Subsample the ratio of the training set in each iteration of tree
        construction.  This is called the bagging trick and can usually help
        prevent overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this can also help prevent
        model overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data are indicative of overfitting. The default value is 'auto'.

    verbose : boolean, optional
        Print progress information during training (if set to true).

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, so that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'accuracy', 'auc', 'log_loss'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be trained
            on exactly the same training data as the checkpointed model.


    Returns
    -------
    out : RandomForestClassifier
        A trained random forest model for classification tasks.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestClassifier, graphlab.logistic_classifier.LogisticClassifier, graphlab.svm_classifier.SVMClassifier, graphlab.neuralnet_classifier.NeuralNetClassifier


    Examples
    --------

    .. sourcecode:: python

      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = graphlab.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = graphlab.random_forest_classifier.create(train, target='label')

      >>> predictions = model.classify(test)
      >>> results = model.evaluate(test)
    """

    _mt._get_metric_tracker().track(
        'toolkit.classifier.random_forest_classifier.create')

    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if 'model_checkpoint_path' in kwargs:
        kwargs['model_checkpoint_path'] = _make_internal_url(
            kwargs['model_checkpoint_path'])
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(
            kwargs['resume_from_checkpoint'])
    if 'num_trees' in kwargs:
        logger = _logging.getLogger(__name__)
        logger.warning(
            "The `num_trees` keyword argument is deprecated. Please "
            "use the `max_iterations` argument instead. Any value provided "
            "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs['num_trees']
        del kwargs['num_trees']

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='random_forest_classifier',
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       class_weights=class_weights,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return RandomForestClassifier(model.__proxy__)
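
The checkpointing keywords documented above can be exercised as in the
following minimal sketch. The checkpoint directory, iteration counts, and the
resumed checkpoint name are placeholder assumptions; only the mushroom dataset
URL and the 'label' target come from the docstring example.

import graphlab as gl

data = gl.SFrame.read_csv('https://static.turi.com/datasets/xgboost/mushroom.csv')
train, test = data.random_split(0.8)

# Write a checkpoint every 5 iterations into a placeholder directory.
model = gl.random_forest_classifier.create(
    train, target='label',
    max_iterations=20,
    model_checkpoint_path='/tmp/rf_checkpoints',
    model_checkpoint_interval=5)

# Resume from an intermediate checkpoint; the training data must be exactly
# the same as the data used to produce the checkpoint.
resumed = gl.random_forest_classifier.create(
    train, target='label',
    max_iterations=20,
    resume_from_checkpoint='/tmp/rf_checkpoints/model_checkpoint_10')
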
Example #34
0
def submit_training_job(env, dataset, target,
    features=None,
    max_iterations=10,
    validation_set=None,
    random_seed = None,
    metric = 'auto',
    model_checkpoint_path='auto',
    **kwargs):
    """
    Submit a job to create a :class:`~graphlab.random_forest_regression.RandomForestRegression` to predict
    a scalar target variable using one or more features. In addition to standard
    numeric and categorical types, features can also be extracted automatically
    from list- or dictionary-type SFrame columns.


    Parameters
    ----------
    env : graphlab.deploy.hadoop_cluster.HadoopCluster
        Hadoop cluster to which the training job is submitted.

    dataset : SFrame
        A training dataset containing feature columns and a target column.
        Only a numerically typed (int, float) target column is allowed.

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, using all columns.

    max_iterations : int, optional
        The number of iterations to perform.

    max_depth : float, optional
        Maximum depth of a tree.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, optional
        The ratio of the training set to subsample in each iteration of tree
        construction.  This is called the bagging trick and can usually help
        prevent overfitting.  Setting it to 0.5 means that the model randomly
        samples half of the examples (rows) to grow each tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this can also help
        prevent overfitting.  Setting it to 0.5 means that the model randomly
        samples half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        This is computed once per full iteration. Large differences in model
        accuracy between the training data and validation data are indicative
        of overfitting. By default this argument is set to None, in which case
        no additional metrics are computed.


    random_seed : int, optional
        Seeds random operations such as column and row subsampling, so that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'rmse', 'max_error'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``model_checkpoint_path`` : str, default 'auto'
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be trained
            on exactly the same training data as the checkpointed model.

    Returns
    -------
    out : :class:`~graphlab.distributed._dml_job_status.DMLJobStatus`
        An object that tracks the execution of the distributed training job.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestRegression, graphlab.linear_regression.LinearRegression, graphlab.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = graphlab.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'
    >>> hdp_env = graphlab.deploy.hadoop_cluster.create('my-first-hadoop-cluster',
    ...    'hdfs://path-to-turi-distributed-installation')

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> distr_job = graphlab.random_forest_regression.submit_training_job(hdp_env, train, target='label')
    >>> model = distr_job.get_results()

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)

    """
    logger = _logging.getLogger(__name__)

    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if model_checkpoint_path != 'auto':
        model_checkpoint_path = _make_internal_url(model_checkpoint_path)
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(kwargs['resume_from_checkpoint'])

    if 'num_trees' in kwargs:
        logger.warning("The `num_trees` keyword argument is deprecated. Please "
              "use the `max_iterations` argument instead. Any value provided "
              "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs['num_trees']
        del kwargs['num_trees']

    _mt._get_metric_tracker().track('distributed.toolkit.regression.random_forest_regression.submit_training_job')
    dml_obj = _sl.create(dataset = dataset,
                        target = target,
                        features = features,
                        model_name = 'random_forest_regression',
                        env = env,
                        max_iterations = max_iterations,
                        validation_set = validation_set,
                        metric = metric,
                        model_checkpoint_path=model_checkpoint_path,
                        **kwargs)
    return dml_obj
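
Checkpointing works the same way on the distributed path. A minimal sketch
(the cluster name, HDFS paths, and iteration counts are placeholder
assumptions; the dataset and 'label' target mirror the docstring example):

import graphlab

# Placeholder cluster and data setup, mirroring the docstring example above.
hdp_env = graphlab.deploy.hadoop_cluster.create(
    'my-first-hadoop-cluster',
    'hdfs://path-to-turi-distributed-installation')
data = graphlab.SFrame.read_csv('https://static.turi.com/datasets/xgboost/mushroom.csv')
data['label'] = data['label'] == 'p'
train, _ = data.random_split(0.8)

# Write checkpoints every 5 iterations to a placeholder HDFS directory.
job = graphlab.random_forest_regression.submit_training_job(
    hdp_env, train, target='label',
    max_iterations=20,
    model_checkpoint_path='hdfs://path-to-checkpoints/rf_regression',
    model_checkpoint_interval=5)
model = job.get_results()

# A later job can resume from one of the saved checkpoints, trained on exactly
# the same data.
job2 = graphlab.random_forest_regression.submit_training_job(
    hdp_env, train, target='label',
    max_iterations=20,
    resume_from_checkpoint='hdfs://path-to-checkpoints/rf_regression/model_checkpoint_10')
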
Example #35
0
def ext_import(soname, module_subpath=""):
    """
    Loads a graphlab toolkit module (a shared library) into the
    gl.extensions namespace.

    Toolkit module created via SDK can either be directly imported,
    e.g. ``import example`` or via this function, e.g. ``graphlab.ext_import("example.so")``.
    Use ``ext_import`` when you need more namespace control, or when
    the shared library is not local, e.g. in http, s3 or hdfs.

    Parameters
    ----------
    soname : string
        The filename of the shared library to load.
        This can be a URL or an HDFS location. For instance, if soname is
        somewhere/outthere/toolkit.so, the functions in toolkit.so will
        appear in gl.extensions.toolkit.*.

    module_subpath : string, optional
        Any additional module paths to prepend to the toolkit module after
        it is imported. For instance if soname is
        somewhere/outthere/toolkit.so, by default
        the functions in toolkit.so will appear in gl.extensions.toolkit.*.
        However, if module_subpath="somewhere.outthere", the functions
        in toolkit.so will appear in gl.extensions.somewhere.outthere.toolkit.*

    Returns
    -------
    out : a list of functions and classes loaded.

    Examples
    --------
    For instance, given a module which implements the function "square_root",

    .. code-block:: c++

        #include <cmath>
        #include <graphlab/sdk/toolkit_function_macros.hpp>
        double square_root(double a) {
          return sqrt(a);
        }

        BEGIN_FUNCTION_REGISTRATION
        REGISTER_FUNCTION(square_root, "a");
        END_FUNCTION_REGISTRATION

    compiled into example.so

    >>> graphlab.ext_import('example1.so')
    ['example1.square_root']

    >>> graphlab.extensions.example1.square_root(9)
    3.0

    We can customize the import location with module_subpath which can be
    used to avoid namespace conflicts when you have multiple toolkits with the
    same filename.

    >>> graphlab.ext_import('example1.so', 'math')
    ['math.example1.square_root']
    >>> graphlab.extensions.math.example1.square_root(9)
    3.0

    The module can also be imported directly, but graphlab *must* be imported
    first. graphlab will intercept the module loading process to load the
    toolkit.

    >>> import graphlab
    >>> import example1 #searches for example1.so in all the python paths
    >>> example1.square_root(9)
    3.0
    """
    unity = _gl.connect.main.get_unity()
    import os
    if os.path.exists(soname):
        soname = os.path.abspath(soname)
    else:
        soname = _make_internal_url(soname)
    ret = unity.load_toolkit(soname, module_subpath)
    if len(ret) > 0:
        raise RuntimeError(ret)
    _publish()
    # push the functions into the corresponding module namespace
    filename = os.path.basename(soname)
    modulename = filename.split('.')[0]
    return unity.list_toolkit_functions_in_dynamic_module(
        soname) + unity.list_toolkit_classes_in_dynamic_module(soname)
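
Because ``soname`` may also be a remote URL, the same import works against S3
or HDFS locations. A minimal sketch with placeholder paths (the bucket and the
'remote' subpath are assumptions; example1.square_root is the toolkit from the
docstring above):

import graphlab

# Load a compiled SDK toolkit directly from remote storage. Its functions land
# under graphlab.extensions.remote.example1.* because of the module_subpath.
graphlab.ext_import('s3://my-bucket/toolkits/example1.so', 'remote')
print(graphlab.extensions.remote.example1.square_root(16))  # -> 4.0
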
Example #36
0
def dml_exec(function_name, data, env='auto', verbose=True, **kwargs):
    """
    Executes a distributed ml function

    Parameters
    ----------
    function_name : str
        Name of the distributed function to be executed. The function symbol
        must exist in the unity distributed shared library.

    data : dict
        Key-value arguments to the function, stored in a dictionary.

    env : DMLEnvironment
        Contains job environment parameters and a job submit function.

    **kwargs : dict
        Additional options. See _get_worker_args and _get_commander_args.

        - check_hdfs : {0, 1} Perform a sanity check for HDFS read and write.
        - startup_timeout : int Timeout in seconds for cluster setup.

    Returns
    -------
    (success, message, result_path) : (bool, str, str)
    """
    from graphlab.extensions import dml_function_invocation, init_dml_class_registry
    init_dml_class_registry()

    if env == 'auto':
        env = DMLRemoteEnvironment()

    if not file_util.exists(env.working_dir):
        _log.debug('Creating working directory: %s' % env.working_dir)
        file_util.mkdir(env.working_dir)
    else:
        _log.debug('Using existing working directory: %s' % env.working_dir)

    _log.info('Running distributed execution with %d workers. Working directory: %s' % (env.num_workers, env.working_dir))

    success = False
    message = ""
    result_path = None

    # Job function arguments
    try:
        _log.info('Serializing arguments to %s' % env.working_dir)
        args = dml_function_invocation()
        data_copy = copy(data)
        internal_working_dir = _make_internal_url(env.working_dir)
        data_copy['__base_path__'] = internal_working_dir
        args.from_dict(data_copy, internal_working_dir)
        json_data = args.to_str()

        # Sanitize the base path URL before logging.
        sanitized_json_data = json_data
        if file_util.is_s3_path(json_data):
            sanitized_json_data = _sanitize_internal_s3_url(json_data)

        _log.info('Serialized arguments: %s' % sanitized_json_data)
    except Exception as e:
        success = False
        message = 'Error serializing arguments. %s' % str(e)
        return (success, message, None)

    # Submit job
    try:
        job = dml_submit(function_name, json_data, env,
                         metric_server_address_file=COMMANDER_LOG_SERVER_ADDRESS_FILE,
                         logprogress_file=PROGRESS_LOG_FILE,
                         **kwargs)
    except KeyboardInterrupt:
        message = 'Canceled by user'
        return (success, message, None)

    _log.info('Waiting for workers to start ... ')
    logprinter = None
    if verbose:
        log_server_address_path = os.path.join(env.working_dir,
                                               COMMANDER_LOG_SERVER_ADDRESS_FILE)
        log_server_address = get_log_metric_server_address(log_server_address_path,
                                                           timeout=INIT_TIMEOUT_PER_WORKER * env.num_workers)
        if len(log_server_address) > 0:
            tmp_log_dir = tempfile.mkdtemp(prefix='graphlab_dml_log_')
            fd_list = []
            logprinter = LogPrinter()
            # Attach log progress stream
            logprinter.add_stream(LogStream(log_server_address + '/progress',
                                            os.path.join(env.working_dir, PROGRESS_LOG_FILE),
                                            sys.stdout))
            # Attach commander log stream
            local_commander_log = open(os.path.join(tmp_log_dir, COMMANDER_LOG_FILE), 'w')
            fd_list.append(local_commander_log)
            logprinter.add_stream(LogStream(log_server_address + '/commander',
                                            os.path.join(env.working_dir, COMMANDER_LOG_FILE),
                                            local_commander_log))
            # Attach worker log streams
            for i in range(env.num_workers):
                local_worker_log = open(os.path.join(tmp_log_dir, WORKER_LOG_FILE(i)), 'w')
                fd_list.append(local_worker_log)
                logprinter.add_stream(LogStream(log_server_address + '/worker%d' % i,
                                                os.path.join(env.working_dir, WORKER_LOG_FILE(i)),
                                                local_worker_log))
            logprinter.start()
            _log.info('Success. Worker logs are available at %s ' % tmp_log_dir)

    _log.debug('Wait for job to finish')
    (success, message) = _wait_and_parse_job_result(job)

    if logprinter:
        logprinter.stop()
        for fd in fd_list:
            fd.close()

    if success:
        try:
            result_path = os.path.join(env.working_dir, env.output_name)
            ret_str = file_util.read(result_path)
            sanitized_ret_str = _sanitize_internal_s3_url(ret_str)
            _log.debug('Deserializing results: %s' % sanitized_ret_str)

            args.from_str(ret_str)
            response = args.to_dict()

            # Check toolkit response for "result" key or "exception" key.
            if 'result' in response:
                return (success, message, response['result'])
            elif 'exception' in response:
                return (False, response['exception'], None)
            else:
                raise ValueError('Invalid toolkit response. Must have '
                                 '"result" or "exception" as key')
        except Exception as e:
            success = False
            message = 'Error deserializing results. %s' % str(e)
            return (success, message, None)
    else:
        return (success, message, None)
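
A minimal sketch of calling ``dml_exec`` (defined above) and unpacking its
three-tuple result. The function name, argument dictionary, and HDFS path are
placeholders, not symbols guaranteed to exist in the distributed shared
library:

# Placeholder distributed function and arguments; the real names depend on
# what the unity distributed shared library exports.
success, message, result = dml_exec(
    'distributed_random_forest_train',
    {'data': 'hdfs://path-to-training-data', 'target': 'label'},
    env='auto',           # lazily constructs a DMLRemoteEnvironment
    check_hdfs=1,         # optional sanity check for HDFS read/write
    startup_timeout=300)  # seconds to wait for the cluster to come up

if success:
    print('Distributed job result: %s' % result)
else:
    print('Distributed job failed: %s' % message)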