def save(self, location):
    """
    Persist this model to ``location``.

    The model is pickled to a temporary path, that path is attached to the
    model's proxy object, and the proxy is handed to the unity server's
    ``save_model``. The result is a directory that can be loaded again with
    :py:func:`~graphlab.load_model`.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    Returns
    -------
    out
        Whatever the unity server's ``save_model`` call returns.

    See Also
    ----------
    graphlab.load_model

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    # Step 1: pickle the model to a temporary path.
    # NOTE(review): tempfile.mktemp() only picks a name and is documented as
    # race-prone; confirm what _save_to_pickle expects at that path before
    # switching to mkstemp()/mkdtemp().
    pickle_path = tempfile.mktemp()
    self._save_to_pickle(pickle_path)

    # Step 2: make sure a proxy exists; it carries the pickle path.
    if not self.__proxy__:
        self.__proxy__ = _gl.extensions._PythonModel()
    self.__proxy__.temp_file = pickle_path

    # Step 3: hand the proxy to the unity server.
    model_wrapper = self._get_wrapper()
    unity = glconnect.get_unity()
    return unity.save_model(self.__proxy__, _make_internal_url(location),
                            model_wrapper)
def _get_commander_args(function_name, data, working_dir, num_workers,
                        shared_lib='./libdml_toolkits.so',
                        cluster_type='standalone_passive',
                        output_name='out', **kwargs):
    """
    Build the ``--key=value`` argument list for dml_commander_startup.

    Parameters
    ----------
    function_name : value for ``--function``
    data : value for ``--args``
    working_dir : path; converted with ``_make_internal_url``
    num_workers : value for ``--num_nodes``
    shared_lib, cluster_type, output_name : optional values for the
        options of the same name
    **kwargs
        May override any option above (by option key) and may additionally
        supply ``check_hdfs``, ``startup_timeout``,
        ``metric_server_address_file`` and ``metric_server_port``. All other
        keys are ignored.

    Returns
    -------
    out : list of str
        One ``'--<key>=<value>'`` string per option.
    """
    options = {
        # required arguments
        'function': function_name,
        'args': data,
        'num_nodes': num_workers,
        'working_dir': _make_internal_url(working_dir),
        # optional arguments
        'shared_lib': shared_lib,
        'cluster_type': cluster_type,
        'output_name': output_name,
    }

    # Keys that callers are allowed to set or override through kwargs.
    overridable = list(options.keys())
    overridable.extend(['check_hdfs', 'startup_timeout',
                        'metric_server_address_file', 'metric_server_port'])
    options.update((name, kwargs[name]) for name in overridable
                   if name in kwargs)

    return ['--%s=%s' % pair for pair in options.items()]
def _dml_serialize_args(data, working_dir, args):
    """
    Fill ``args`` from a shallow copy of ``data`` rooted at ``working_dir``.

    The caller's ``data`` is not modified: a shallow copy receives the extra
    ``'__base_path__'`` entry (the internal URL of ``working_dir``) and is
    then passed to ``args.from_dict``.

    Parameters
    ----------
    data : dict-like
        Arguments to serialize.
    working_dir : str
        Working directory; converted with ``_make_internal_url``.
    args : object exposing ``from_dict`` and ``to_str``
        Receives the serialized arguments in place.
    """
    logger.info('Serializing arguments to %s' % working_dir)

    payload = copy.copy(data)
    base_path = _make_internal_url(working_dir)
    payload['__base_path__'] = base_path

    args.from_dict(payload, base_path)
    logger.debug('Serialized arguments: %s' % args.to_str())
def save(self, location):
    """
    Write this transformer out as a GraphLab archive.

    The archive is a directory which can be loaded again with
    :py:func:`~graphlab.load_model`.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    Returns
    -------
    out
        Whatever the unity server's ``save_model`` call returns.

    See Also
    ----------
    graphlab.load_model

    Examples
    ----------
    .. sourcecode:: python

        >>> model.save('my_model_file')
        >>> loaded_model = gl.load_model('my_model_file')
    """
    # Record the call under '<module of the concrete class>.save'.
    tracker = _mt._get_metric_tracker()
    tracker.track(self.__class__.__module__ + '.save')

    save_model = glconnect.get_unity().save_model
    proxy = self.__proxy__
    destination = _make_internal_url(location)
    wrapper = self._get_wrapper()
    return save_model(proxy, destination, wrapper)
def _get_commander_args(function_name, data, env,
                        cluster_type='standalone_passive',
                        output_name='out', **kwargs):
    """
    Get a list of arguments for dml_commander_startup.

    Parameters
    ----------
    function_name : value for ``--function``
    data : value for ``--args``
    env : object
        Supplies ``num_workers`` (``--num_nodes``), ``working_dir``
        (converted with ``_make_internal_url``) and
        ``LIB_UNITY_DISTRIBUTED_PATH`` (``--shared_lib``).
    cluster_type, output_name : optional values for the options of the
        same name
    **kwargs
        May override any option above (by option key) and may additionally
        supply ``check_hdfs``, ``startup_timeout``,
        ``metric_server_address_file`` and ``metric_server_port``. All other
        keys are ignored.

    Returns
    -------
    out : list of str
        One ``'--<key>=<value>'`` string per option.
    """
    args = dict()

    # from arguments
    args['function'] = function_name
    args['args'] = data
    args['num_nodes'] = env.num_workers
    args['working_dir'] = _make_internal_url(env.working_dir)

    # from optional arguments
    args['shared_lib'] = env.LIB_UNITY_DISTRIBUTED_PATH
    args['cluster_type'] = cluster_type
    args['output_name'] = output_name

    # from kwargs, could overwrite existing args.
    # list(...) is required: on Python 3 dict.keys() is a view and cannot be
    # concatenated with a list.
    accepted_args = list(args.keys()) + ['check_hdfs', 'startup_timeout',
                                         'metric_server_address_file',
                                         'metric_server_port']
    for key in accepted_args:
        if key in kwargs:
            args[key] = kwargs[key]

    # return a formatted list; .items() works on both Python 2 and 3,
    # whereas .iteritems() exists only on Python 2.
    return ['--%s=%s' % (k, v) for k, v in args.items()]
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Returns
    -------
    out
        The loaded model: either the result of the unity server's
        ``load_model`` (dir_archive case) or the object rebuilt by the model
        class's ``_load_version`` (pickle case).

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    # Decide between the dir_archive loader and the pure-python unpickler.
    #
    # http(s) locations skip the existence check and go straight to the
    # dir_archive loader because
    #   1) exists() does not work with the http protocol, and
    #   2) GLUnpickler does not support http.
    is_http = file_util.get_protocol(location) in ['http', 'https']
    if is_http or file_util.exists(location + '/dir_archive.ini'):
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)

    # Not a dir_archive, so try unpickling the model.
    # NOTE(review): GLUnpickler looks pickle-based; unpickling can execute
    # arbitrary code, so only load models from trusted locations.
    unpickler = gl_pickle.GLUnpickler(location)
    try:
        # The stream holds the version first, then the class name.
        version = unpickler.load()
        cls_name = unpickler.load()
        cls = _get_class_from_name(cls_name)

        # Load the object with the right version.
        return cls._load_version(unpickler, version)
    finally:
        # Always release the unpickler, including when loading fails.
        unpickler.close()
def load_images(url, format='auto', with_path=True, recursive=True,
                ignore_failure=True, random_order=True):
    """
    Load the images found under a directory into an SFrame.

    JPEG and PNG images are supported.

    Parameters
    ----------
    url : str
        Path of the directory where the images are stored.

    format : {'PNG' | 'JPG' | 'auto'}, optional
        Format of the images in the directory. The default, 'auto', tries to
        infer the image type from each file extension. If a format is
        specified, all images must be of that format.

    with_path : bool, optional
        If True, the returned SFrame contains a 'path' column holding the
        path string of each Image object.

    recursive : bool, optional
        Indicates whether the directory is traversed recursively or flat.

    ignore_failure : bool, optional
        If True, print a warning for each failed image and keep loading the
        rest of the images.

    random_order : bool, optional
        Load images in random order, useful for Stochastic Gradient Descent
        like algorithms.

    Returns
    -------
    out : SFrame
        An SFrame with an 'image' column of Image objects and, when
        ``with_path`` is True, a 'path' column with the corresponding image
        paths.

    Examples
    --------

    >>> url ='s3://gl-testdata/images/nested'
    >>> image_sarray = graphlab.image_analysis.load_images(url, "auto", with_path=False,
    ...                                                    recursive=True)
    """
    _mt._get_metric_tracker().track('image_analysis.load_images')

    # Imported here, at call time, rather than at module import.
    import graphlab.extensions as _extensions

    loader = _extensions.load_images
    return loader(_make_internal_url(url), format, with_path, recursive,
                  ignore_failure, random_order)
def _get_worker_args(worker_id, working_dir, **kwargs):
    """
    Build the ``--key=value`` argument list for dml_worker_startup.

    Parameters
    ----------
    worker_id : value for ``--worker_id``
    working_dir : path; converted with ``_make_internal_url``
    **kwargs
        May supply ``check_hdfs``, ``startup_timeout`` and
        ``consensus_address``. All other keys are ignored.

    Returns
    -------
    out : list of str
        One ``'--<key>=<value>'`` string per option.
    """
    options = {
        'worker_id': worker_id,
        'working_dir': _make_internal_url(working_dir),
    }

    # Extra keys come first, then the keys already present.
    allowed = ['check_hdfs', 'startup_timeout', 'consensus_address']
    allowed += list(options.keys())
    options.update((name, kwargs[name]) for name in allowed if name in kwargs)

    return ['--%s=%s' % pair for pair in options.items()]
def __init__(self, path=None, format='auto', **__internal_kw_args):
    """
    Create an image, either empty, loaded from ``path``, or from raw fields.

    Parameters
    ----------
    path : str, optional
        Location of an image to load. When given, the image is loaded through
        ``graphlab.extensions.load_image`` and every attribute of the loaded
        object is copied onto this instance.
    format : str, optional
        Format passed to ``load_image`` (default 'auto').
    **__internal_kw_args
        Used only when ``path`` is None: each keyword is set as an attribute
        on the instance, overriding the empty defaults.
    """
    # Defaults describing an empty, undefined-format image.
    self._image_data = bytearray()
    self._height = 0
    self._width = 0
    self._channels = 0
    self._image_data_size = 0
    self._version = _CURRENT_VERSION
    self._format_enum = _format[_UNDEFINED]

    if path is not None:
        from graphlab.util import _make_internal_url
        import graphlab.extensions as _extensions
        img = _extensions.load_image(_make_internal_url(path), format)
        attributes = img.__dict__
    else:
        attributes = __internal_kw_args

    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    for key, value in attributes.items():
        setattr(self, key, value)
def __init__(self, path=None, format='auto', **__internal_kw_args):
    """
    Create an image, either empty, loaded from ``path``, or from raw fields.

    Parameters
    ----------
    path : str, optional
        Location of an image to load. When given, the image is loaded through
        ``graphlab.extensions.load_image`` and every attribute of the loaded
        object is copied onto this instance.
    format : str, optional
        Format passed to ``load_image`` (default 'auto').
    **__internal_kw_args
        Used only when ``path`` is None: each keyword is set as an attribute
        on the instance, overriding the empty defaults.
    """
    # Defaults describing an empty, undefined-format image.
    self._image_data = bytearray()
    self._height = 0
    self._width = 0
    self._channels = 0
    self._image_data_size = 0
    self._version = _CURRENT_VERSION
    self._format_enum = _format[_UNDEFINED]

    if path is not None:
        from graphlab.util import make_internal_url as _make_internal_url
        import graphlab.extensions as _extensions
        img = _extensions.load_image(_make_internal_url(path), format)
        attributes = img.__dict__
    else:
        attributes = __internal_kw_args

    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    for key, value in attributes.items():
        setattr(self, key, value)
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    The unity server's ``load_model`` is tried first. A ``ToolkitError`` from
    that attempt is re-raised; any other exception triggers a fallback that
    unpickles the model with ``GLUnpickler``.

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Returns
    -------
    out
        The loaded model.

    Raises
    ------
    ToolkitError
        Propagated unchanged from the unity server's ``load_model``.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')
    try:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)
    except ToolkitError:
        raise
    except Exception:
        # Not a ToolkitError so try unpickling the model.
        # NOTE(review): GLUnpickler looks pickle-based; unpickling can
        # execute arbitrary code, so only load models from trusted locations.
        unpickler = gl_pickle.GLUnpickler(location)
        try:
            # The stream holds the version first, then the class name.
            version = unpickler.load()
            cls_name = unpickler.load()
            cls = _get_class_from_name(cls_name)

            # Load the object with the right version.
            return cls._load_version(unpickler, version)
        finally:
            # Always release the unpickler, including when loading fails.
            unpickler.close()
def save(self, location):
    """
    Persist this model to ``location``.

    The model is written as a directory which can be loaded again with
    :py:func:`~graphlab.load_model`. The diverse_sampler stores its data
    internally, so a model that was saved and later loaded can be sampled
    from immediately.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    Returns
    -------
    out
        Whatever the unity server's ``save_model`` call returns.

    See Also
    ----------
    graphlab.load_model

    Examples
    ----------
    .. sourcecode:: python

        >>> ground_set = graphlab.SFrame({'id': [0, 1, 2],
                                          'feature_1': [3, 1, 2],
                                          'feature_2': [[0, 1], [0, 1], [1, 0]]})
        >>> sampler = graphlab.diversity.diverse_sampler.create(data=ground_set,
                                                                item_id='id',
                                                                quality_feature='feature_1',
                                                                similarity_features=['feature_2'])
        >>> sampler.save('my_sampler')
        >>> loaded_sampler = graphlab.load_model('my_sampler')
        >>> loaded_sampler.sample(k=2)
        +-----------+------------+----+
        | feature_1 | feature_2  | id |
        +-----------+------------+----+
        |     2     | [0.0, 1.0] | 1  |
        |     1     | [1.0, 0.0] | 2  |
        +-----------+------------+----+
    """
    # Record the call under '<module of the concrete class>.save'.
    tracker = _mt._get_metric_tracker()
    tracker.track(self.__class__.__module__ + '.save')

    save_model = glconnect.get_unity().save_model
    proxy = self.__proxy__
    destination = _make_internal_url(location)
    wrapper = self._get_wrapper()
    return save_model(proxy, destination, wrapper)
def save(self, url):
    """
    Write this neuralnet's configuration string to ``url``.

    Parameters
    ----------
    url : str
        The URL to save the network to.

    Examples
    --------
    >>> import graphlab as gl
    >>> net = gl.deeplearning.get_builtin_neuralnet('mnist')
    >>> net.save('mnist.conf')

    See Also
    --------
    graphlab.deeplearning.load
    """
    write = _gl_connect.get_unity().__write__
    destination = _make_internal_url(url)
    config_text = self.__config_str__()
    write(destination, config_text)
def save(self, location):
    """
    Save this object through the unity server's ``save_model``.

    Parameters
    ----------
    location : str
        Filename; converted with ``_make_internal_url`` before saving.

    Returns
    -------
    out
        Whatever ``save_model`` returns (previously documented as None).
    """
    save_model = glconnect.get_unity().save_model
    proxy = self.__proxy__
    destination = _make_internal_url(location)
    wrapper = self._get_wrapper()
    return save_model(proxy, destination, wrapper)
def load_model(location):
    """
    Load a previously saved GraphLab Create model of any kind.

    The model is expected to have been written in GraphLab Create model
    format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Returns
    -------
    out
        Whatever the unity server's ``load_model`` call returns.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.model.load_model')

    loader = glconnect.get_unity().load_model
    return loader(_make_internal_url(location))
def save(self, location):
    """
    Persist this model to ``location``.

    The model is written as a directory which can be loaded again with
    :py:func:`~graphlab.load_model`. This variant passes the model object
    itself (not a proxy) to the unity server.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    Returns
    -------
    out
        Whatever the unity server's ``save_model`` call returns.

    See Also
    ----------
    graphlab.load_model

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = graphlab.load_model('my_model_file')
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.model.save')

    save_model = glconnect.get_unity().save_model
    return save_model(self, _make_internal_url(location))
def _get_dml_exec_args(function_name, data, env, **kwargs):
    """
    Build the list of map-job argument dicts for a distributed exec.

    The first entry describes the commander process; it is followed by one
    entry per worker (``env.num_workers`` in total). Every entry shares the
    same environment-variable object and the same startup timeout, which is
    ``INIT_TIMEOUT_PER_WORKER * env.num_workers``.

    Parameters
    ----------
    function_name, data : forwarded to ``_get_commander_args``
    env : object
        Supplies ``working_dir``, ``num_workers``, ``COMMANDER_PATH`` and
        ``WORKER_PATH``.
    **kwargs
        Forwarded to both ``_get_commander_args`` and ``_get_worker_args``.

    Returns
    -------
    out : list of dict
    """
    worker_dir_url = _make_internal_url(env.working_dir)
    timeout = INIT_TIMEOUT_PER_WORKER * env.num_workers
    env_vars = _get_dml_environment_variables()

    def _worker_job(rank):
        # Job description for the worker with the given rank.
        return {
            'exe': env.WORKER_PATH,
            'setup': _get_worker_setup(env.working_dir, rank),
            'args': _get_worker_args(rank, worker_dir_url,
                                     startup_timeout=timeout, **kwargs),
            'out_log_prefix': os.path.join(env.working_dir,
                                           WORKER_LOG_FILE(rank)),
            'environment_variables': env_vars,
        }

    commander_job = {
        'exe': env.COMMANDER_PATH,
        'args': _get_commander_args(function_name, data, env,
                                    startup_timeout=timeout, **kwargs),
        'setup': _get_commander_setup(env.working_dir),
        'teardown': _get_commander_teardown(env.working_dir),
        'out_log_prefix': os.path.join(env.working_dir, COMMANDER_LOG_FILE),
        'environment_variables': env_vars,
    }

    jobs = [commander_job]
    jobs.extend(_worker_job(rank) for rank in range(env.num_workers))
    return jobs
def save(self, location):
    """
    Persist this model to ``location``.

    The model is written as a directory which can be loaded again with
    :py:func:`~graphlab.load_model`.

    Parameters
    ----------
    location: str
        Target destination for the model. Can be a local path or remote URL.

    Returns
    -------
    out
        Whatever the unity server's ``save_model`` call returns.

    See Also
    ----------
    graphlab.load_model

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    save_model = glconnect.get_unity().save_model
    proxy = self.__proxy__
    destination = _make_internal_url(location)
    wrapper = self._get_wrapper()
    return save_model(proxy, destination, wrapper)
def __init__(self, url=None, conf_str=None):
    """
    Build a NeuralNet from a URL and/or a configuration string.

    This constructor should not be called directly. Instead, please use
    :py:func:`~graphlab.deeplearning.load` or
    :py:func:`~graphlab.deeplearning.loads`.

    With neither argument the net starts with an empty layer list and the
    default learning parameters. If both are given, the URL is loaded first
    and the configuration string second. ``verify()`` runs in every case.

    Parameters
    ----------
    url : str, optional
        The URL to a configuration file of the NeuralNet.

    conf_str : str, optional
        The configuration string of the NeuralNet.
    """
    self._layers = _LayerList()
    self._learning_params = dict(learning_rate=0.001, momentum=0.9)

    if url is not None:
        internal_url = _make_internal_url(url)
        self._load(internal_url)

    if conf_str is not None:
        self._loads(conf_str)

    self.verify()
def __init__(self, url=None, conf_str=None):
    """
    Build a NeuralNet from a URL and/or a configuration string.

    This constructor should not be called directly. Instead, please use
    :py:func:`~graphlab.deeplearning.load` or
    :py:func:`~graphlab.deeplearning.loads`.

    With neither argument the net starts with an empty layer list and the
    default learning parameters. If both are given, the URL is loaded first
    and the configuration string second. ``verify()`` runs in every case.

    Parameters
    ----------
    url : str, optional
        The URL to a configuration file of the NeuralNet.

    conf_str : str, optional
        The configuration string of the NeuralNet.
    """
    self._layers = _LayerList()
    self._learning_params = dict(learning_rate=0.001, momentum=0.9)

    if url is not None:
        internal_url = _make_internal_url(url)
        self._load(internal_url)

    if conf_str is not None:
        self._loads(conf_str)

    self.verify()
def make_sgraph(vertex_sframe, edge_sframe, output_path,
                vid_field, src_field, dst_field,
                num_partitions=8, _distributed='auto'):
    """
    Make an SGraph with input vertex and edge sframes, save the graph to
    output_path, and return the graph.

    Parameters
    ----------
    vertex_sframe : SFrame
        SFrame of vertex data. May be None or empty, in which case an empty
        vertex SFrame with a single '__id' column is used.

    edge_sframe : SFrame
        SFrame of edge data. May be None or empty, in which case an empty
        edge SFrame with '__src_id' and '__dst_id' columns is used.

    output_path : str
        Path where the final graph is saved to.

    vid_field : str
        Column name of vertex id in the vertex sframe.

    src_field : str
        Column name of source vertex id in the edge sframe.

    dst_field : str
        Column name of target vertex id in the edge sframe.

    num_partitions : int
        Number of partitions for the final sgraph.

    _distributed : optional
        Passed as ``env`` to ``run`` (default 'auto').

    Returns
    -------
    out : SGraph
        The graph loaded back from ``output_path``.

    Raises
    ------
    TypeError
        If ``vid_field``, ``src_field`` or ``dst_field`` is not a str.
    ValueError
        If one of the field names is not a column of its SFrame.
    """
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are accepted as field names too.
    for arg_name, arg_value in (('vid_field', vid_field),
                                ('src_field', src_field),
                                ('dst_field', dst_field)):
        if not isinstance(arg_value, str):
            raise TypeError('%s must be str' % arg_name)

    has_vertices = vertex_sframe is not None and len(vertex_sframe) > 0
    has_edges = edge_sframe is not None and len(edge_sframe) > 0

    # Infer the vid type: prefer the vertex id column, then the edge source
    # column, and fall back to int when both inputs are empty.
    if has_vertices:
        vid_type = vertex_sframe[vid_field].dtype()
    elif has_edges:
        vid_type = edge_sframe[src_field].dtype()
    else:
        vid_type = int

    # Create empty edge sframe if input is dummy
    if not has_edges:
        edge_sframe = gl.SFrame()
        edge_sframe['__src_id'] = gl.SArray([], vid_type)
        edge_sframe['__dst_id'] = gl.SArray([], vid_type)
        src_field = '__src_id'
        dst_field = '__dst_id'

    # Create empty vertex sframe if input is dummy
    if not has_vertices:
        vertex_sframe = gl.SFrame()
        vertex_sframe['__id'] = gl.SArray([], vid_type)
        vid_field = '__id'

    _raise_error_if_not_sframe(vertex_sframe, "vertex_data")
    _raise_error_if_not_sframe(edge_sframe, "edge_data")

    if vid_field not in vertex_sframe.column_names():
        raise ValueError('Column %s not found in vertex_data' % vid_field)
    if src_field not in edge_sframe.column_names():
        raise ValueError('Column %s not found in edge_data' % src_field)
    if dst_field not in edge_sframe.column_names():
        raise ValueError('Column %s not found in edge_data' % dst_field)

    output_path = _make_internal_url(output_path)
    opts = {'vertex_data': vertex_sframe,
            'edge_data': edge_sframe,
            'output_path': output_path,
            'vid_field': vid_field,
            'src_field': src_field,
            'dst_field': dst_field,
            'num_partitions': num_partitions}
    run('distributed_graph_ingress', opts, env=_distributed)

    from graphlab.data_structures.sgraph import load_sgraph
    return load_sgraph(output_path)
def create(dataset, target,
           features=None, max_iterations=10,
           validation_set='auto',
           max_depth=6, step_size=0.3,
           min_loss_reduction=0.0, min_child_weight=0.1,
           row_subsample=1.0, column_subsample=1.0,
           verbose=True,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Train a :class:`~graphlab.boosted_trees_regression.BoostedTreesRegression`
    that predicts a scalar target variable from one or more features.

    Besides standard numeric and categorical types, features can also be
    extracted automatically from list- or dictionary-type SFrame columns.

    Parameters
    ----------
    dataset : SFrame
        Training data containing feature columns and a target column. Only a
        numerical (int, float) target column is allowed.

    target : str
        Name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        Names of the feature columns used for training. Defaults to None,
        which uses all columns.

    max_iterations : int, optional
        Number of boosting iterations, which is also the number of trees in
        the model.

    validation_set : SFrame, optional
        Validation set used to watch the validation result as boosting
        progresses.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1], optional
        Step size (shrinkage) used in the update to prevent overfitting. It
        shrinks the prediction of each weak learner to make boosting more
        conservative; smaller values are more conservative and are usually
        paired with a larger ``max_iterations``.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to further split a node during tree
        learning. Larger values can help prevent overfitting by avoiding
        splits that do not sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Minimum weight of each leaf node; formally the minimum sum of
        instance weights (hessians) in each node. If tree learning would
        produce a leaf whose sum of instance weights is below
        `min_child_weight`, tree building terminates. Larger values give more
        conservative trees and help prevent overfitting.

    row_subsample : float, [0,1], optional
        Fraction of the training rows sampled in each iteration of tree
        construction (the bagging trick); usually helps prevent overfitting.
        0.5 means each tree is grown on a random half of the rows.

    column_subsample : float, [0,1], optional
        Fraction of the columns sampled in each iteration of tree
        construction; like row_subsample, usually helps prevent overfitting.
        0.5 means each tree is grown on a random half of the columns.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed: int, optional
        Seeds random operations such as column and row subsampling, so that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) tracked during training. When specified, the
        progress table displays them on the training and validation sets.
        Supported metrics are: {'rmse', 'max_error'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve after
            <early_stopping_rounds>, stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path
            every n iterations, where n is ``model_checkpoint_interval``.
            For instance, with `model_checkpoint_interval` 5 and
            `model_checkpoint_path` ``/tmp/model_tmp``, checkpoints are saved
            into ``/tmp/model_tmp/model_checkpoint_5``,
            ``/tmp/model_tmp/model_checkpoint_10``, ... etc. Training can be
            resumed by setting ``resume_from_checkpoint`` to one of these.

        - ``model_checkpoint_interval`` : int, default 5
            If model_check_point_path is specified, save the model to the
            given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must take
            exactly the same training data as the checkpointed model.

    Returns
    -------
      out : BoostedTreesRegression
          A trained gradient boosted trees model

    References
    ----------
    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesRegression, graphlab.linear_regression.LinearRegression, graphlab.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = graphlab.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> model = graphlab.boosted_trees_regression.create(train, target='label')

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)
    """
    tracker = _mt._get_metric_tracker()
    tracker.track('toolkit.regression.boosted_trees_regression.create')

    if random_seed is not None:
        kwargs['random_seed'] = random_seed

    # Checkpoint locations supplied by the caller are turned into internal
    # URLs before being forwarded.
    for path_option in ('model_checkpoint_path', 'resume_from_checkpoint'):
        if path_option in kwargs:
            kwargs[path_option] = _make_internal_url(kwargs[path_option])

    trained = _sl.create(dataset=dataset,
                         target=target,
                         features=features,
                         model_name='boosted_trees_regression',
                         max_iterations=max_iterations,
                         validation_set=validation_set,
                         max_depth=max_depth,
                         step_size=step_size,
                         min_loss_reduction=min_loss_reduction,
                         min_child_weight=min_child_weight,
                         row_subsample=row_subsample,
                         column_subsample=column_subsample,
                         verbose=verbose,
                         metric=metric,
                         **kwargs)
    return BoostedTreesRegression(trained.__proxy__)
def run(function_name, model_name, options, deploy_environment, working_dir=None):
    """
    Launch a distributed job and return a status object tracking it.

    Parameters
    ----------
    function_name : str
        Name passed to the commander arguments and to the distributed shell.
    model_name : str
        Passed through to ``DMLJobStatus``.
    options : dict
        Job options. If ``options['model_checkpoint_path']`` is ``'auto'`` it
        is replaced in place by the internal URL of
        ``<working_dir>/checkpoints``.
    deploy_environment : HadoopCluster or LocalAsync
        Where to run. Only ``HadoopCluster`` is implemented.
    working_dir : str, optional
        Working directory; when falsy a new one is created with
        ``_dml_create_working_dir``.

    Returns
    -------
    out : DMLJobStatus

    Raises
    ------
    TypeError
        If ``deploy_environment`` is neither a HadoopCluster nor a
        LocalAsync (exact type match).
    NotImplementedError
        If ``deploy_environment`` is a LocalAsync. This is raised only after
        the working directory and arguments have been prepared.
    """
    from graphlab.extensions import init_dml_class_registry
    from graphlab.extensions import dml_function_invocation

    env_type = type(deploy_environment)
    if env_type is not HadoopCluster and env_type is not LocalAsync:
        raise TypeError('Deployment environment %s is not supported' %
                        str(env_type))

    # Working directory and num workers
    jobname = 'dml_job_%s' % str(uuid.uuid4())
    if not working_dir:
        working_dir = _dml_create_working_dir(jobname, deploy_environment)
    logger.info('Working directory: %s' % working_dir)
    num_workers = deploy_environment.get_num_workers()
    logger.info('Running with %d workers' % num_workers)

    # Substitute working_dir specific default options
    checkpoint_key = 'model_checkpoint_path'
    if checkpoint_key in options and options[checkpoint_key] == 'auto':
        options[checkpoint_key] = _make_internal_url(
            os.path.join(working_dir, 'checkpoints'))

    # Serialize arguments
    init_dml_class_registry()
    invocation = dml_function_invocation()
    _dml_serialize_args(options, working_dir, invocation)

    # Argument lists for dml_commander and dml_worker, in that order.
    commander_args = _get_commander_args(function_name, invocation.to_str(),
                                         working_dir, num_workers)
    worker_args = _get_worker_args("${MY_RANK}", working_dir)
    proc_arg_list = [commander_args, worker_args]

    # Distributed shell exec: only Hadoop is implemented.
    if env_type is not HadoopCluster:
        if env_type is LocalAsync:
            raise NotImplementedError()
        raise ValueError('Unsupported deploy environment')

    hadoop_cluster = deploy_environment
    dshell = _distributed_shell.HadoopDistributedShell(hadoop_cluster)
    shell_env = _get_environment_variables(hadoop_cluster)
    shell_script_file = _generate_hadoop_shell_script(
        proc_arg_list, shell_env, working_dir, hadoop_cluster.turi_dist_path)
    # One container per worker plus one for the commander.
    num_containers = num_workers + 1
    job_handle = dshell.run(function_name, shell_script_file, num_containers)

    from ._dml_job_status import DMLJobStatus
    status_kwargs = {}
    if checkpoint_key in options:
        status_kwargs['checkpoint_path'] = options[checkpoint_key]
    return DMLJobStatus(model_name, job_handle, working_dir, **status_kwargs)
def submit_training_job(env, dataset, target,
                        features=None, max_iterations=10,
                        validation_set=None,
                        class_weights=None,
                        max_depth=6, step_size=0.3,
                        min_loss_reduction=0.0, min_child_weight=0.1,
                        row_subsample=1.0, column_subsample=1.0,
                        random_seed=None,
                        metric='auto',
                        model_checkpoint_path='auto',
                        **kwargs):
    """
    Submit a job that trains a (binary or multi-class) classifier of type
    :class:`~graphlab.boosted_trees_classifier.BoostedTreesClassifier` using
    gradient boosted trees (sometimes known as GBMs).

    Parameters
    ----------
    env : graphlab.deploy.hadoop_cluster.HadoopCluster
        Hadoop cluster to submit the training job to.

    dataset : SFrame
        Training data containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. Its values must be
        of string or integer type. String targets are mapped to integers in
        alphabetical order of the values; for example 'cat', 'dog' and
        'foosa' map to 0, 1 and 2 respectively.

    features : list[str], optional
        Names of the feature columns used for training. Defaults to None,
        which uses all columns of ``dataset`` except the target column.

    max_iterations : int, optional
        Maximum number of boosting iterations. Each iteration creates an
        extra tree.

    validation_set : SFrame, optional
        Dataset for monitoring the model's generalization performance; its
        format must match the training set. The default in this signature is
        None, in which case no additional metrics are computed. Large
        differences in accuracy between training and validation data
        indicate overfitting.

    class_weights : {dict, `auto`}, optional
        Weights the training examples by class. A dictionary must contain a
        key for each class label with a positive value greater than 1e-20.
        Weights are relative: 2.0/1.0 has the same effect as 20.0/10.0. If
        `None`, all classes have weight 1.0. `auto` makes each class weight
        inversely proportional to its number of training examples.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1], optional
        Step size (shrinkage) used in the update to prevent overfitting. It
        shrinks the prediction of each weak learner; smaller values are more
        conservative and work well when `max_iterations` is large.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to further split a node during tree
        learning. Larger values can help prevent overfitting by avoiding
        splits that do not sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Minimum weight of each leaf node; formally the minimum sum of
        instance weights (hessians) in each node. If tree learning would
        produce a leaf whose sum of instance weights is below
        `min_child_weight`, tree building terminates.

    row_subsample : float, [0,1], optional
        Fraction of the training rows sampled in each iteration of tree
        construction (the bagging trick). 0.5 means each tree is grown on a
        random half of the rows.

    column_subsample : float, [0,1], optional
        Fraction of the columns sampled in each iteration of tree
        construction. 0.5 means each tree is grown on a random half of the
        columns.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, so that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) tracked during training. When specified, the
        progress table displays them on the training and validation sets.
        Supported metrics are: {'accuracy', 'auc', 'log_loss'}

    model_checkpoint_path : str, optional
        Where to checkpoint the model every ``model_checkpoint_interval``
        iterations. The default ``'auto'`` is forwarded unchanged; any other
        value is converted with ``_make_internal_url``. For instance, with
        `model_checkpoint_interval` 5 and a path of ``/tmp/model_tmp``,
        checkpoints are saved into ``/tmp/model_tmp/model_checkpoint_5``,
        ``/tmp/model_tmp/model_checkpoint_10``, ... etc. Training can be
        resumed by setting ``resume_from_checkpoint`` to one of these.

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve after
            <early_stopping_rounds>, stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_interval`` : int, default 5
            If model_check_point_path is specified, save the model to the
            given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must take
            exactly the same training data as the checkpointed model.

    Returns
    -------
      out : :class:`~graphlab.distributed._dml_job_status.DMLJobStatus`
          An object that tracks the execution of the distributed training job.

    References
    ----------
    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesClassifier, graphlab.logistic_classifier.LogisticClassifier, graphlab.svm_classifier.SVMClassifier, graphlab.neuralnet_classifier.NeuralNetClassifier

    Examples
    --------

    .. sourcecode:: python

        >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
        >>> data = graphlab.SFrame.read_csv(url)
        >>> hdp_env = graphlab.deploy.hadoop_cluster.create('my-first-hadoop-cluster',
        ...    'hdfs://path-to-turi-distributed-installation')

        >>> train, test = data.random_split(0.8)
        >>> distr_job = graphlab.distributed.boosted_trees_classifier.submit_training_job(hdp_env, train, target='label')
        >>> model = distr_job.get_results()
        >>> predicitons = model.classify(test)
        >>> results = model.evaluate(test)
    """
    tracker = _mt._get_metric_tracker()
    tracker.track(
        'distributed.toolkit.classifier.boosted_trees_classifier.submit_training_job'
    )

    if random_seed is not None:
        kwargs['random_seed'] = random_seed

    # 'auto' is forwarded as-is; an explicit path becomes an internal URL.
    if model_checkpoint_path != 'auto':
        model_checkpoint_path = _make_internal_url(model_checkpoint_path)

    resume_key = 'resume_from_checkpoint'
    if resume_key in kwargs:
        kwargs[resume_key] = _make_internal_url(kwargs[resume_key])

    return _sl.create(dataset=dataset,
                      target=target,
                      features=features,
                      model_name='boosted_trees_classifier',
                      env=env,
                      max_iterations=max_iterations,
                      validation_set=validation_set,
                      class_weights=class_weights,
                      max_depth=max_depth,
                      step_size=step_size,
                      min_loss_reduction=min_loss_reduction,
                      min_child_weight=min_child_weight,
                      row_subsample=row_subsample,
                      column_subsample=column_subsample,
                      metric=metric,
                      model_checkpoint_path=model_checkpoint_path,
                      **kwargs)
def ext_import(soname, module_subpath=""):
    """
    Loads a graphlab toolkit module (a shared library) into the
    gl.extensions namespace.

    Toolkit module created via SDK can either be directly imported,
    e.g. ``import example`` or via this function, e.g.
    ``graphlab.ext_import("example.so")``.
    Use ``ext_import`` when you need more namespace control, or when
    the shared library is not local, e.g. in http, s3 or hdfs.

    Parameters
    ----------
    soname : string
        The filename of the shared library to load.
        This can be a URL, or a HDFS location. For instance if soname is
        somewhere/outthere/toolkit.so
        The functions in toolkit.so will appear in gl.extensions.toolkit.*

    module_subpath : string, optional
        Any additional module paths to prepend to the toolkit module after
        it is imported. For instance if soname is
        somewhere/outthere/toolkit.so, by default
        the functions in toolkit.so will appear in gl.extensions.toolkit.*.
        However, if module_subpath="somewhere.outthere", the functions
        in toolkit.so will appear in
        gl.extensions.somewhere.outthere.toolkit.*

    Returns
    -------
    out : a list of functions and classes loaded.

    Raises
    ------
    RuntimeError
        If ``load_toolkit`` returns a non-empty value; that value is used
        as the exception argument.

    Examples
    --------
    For instance, given a module which implements the function "square_root",

    .. code-block:: c++

        #include <cmath>
        #include <graphlab/sdk/toolkit_function_macros.hpp>
        double square_root(double a) {
          return sqrt(a);
        }

        BEGIN_FUNCTION_REGISTRATION
        REGISTER_FUNCTION(square_root, "a");
        END_FUNCTION_REGISTRATION

    compiled into example.so

    >>> graphlab.ext_import('example1.so')
    ['example1.square_root']
    >>> graphlab.extensions.example1.square_root(9)
    3.0

    We can customize the import location with module_subpath which can be
    used to avoid namespace conflicts when you have multiple toolkits with the
    same filename.

    >>> graphlab.ext_import('example1.so', 'math')
    ['math.example1.square_root']
    >>> graphlab.extensions.math.example1.square_root(9)
    3.0

    The module can also be imported directly, but graphlab *must* be imported
    first. graphlab will intercept the module loading process to load the
    toolkit.

    >>> import graphlab
    >>> import example1 #searches for example1.so in all the python paths
    >>> example1.square_root(9)
    3.0
    """
    unity = _gl.connect.main.get_unity()
    import os

    # A path that exists on the local filesystem is made absolute; anything
    # else (e.g. a remote URL) is converted with _make_internal_url.
    if os.path.exists(soname):
        soname = os.path.abspath(soname)
    else:
        soname = _make_internal_url(soname)

    # load_toolkit reports failure through a non-empty return value.
    ret = unity.load_toolkit(soname, module_subpath)
    if len(ret) > 0:
        raise RuntimeError(ret)

    _publish()  # push the functions into the corresponding module namespace
    return (unity.list_toolkit_functions_in_dynamic_module(soname) +
            unity.list_toolkit_classes_in_dynamic_module(soname))
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Returns
    -------
    out
        The model returned by unity's ``load_model`` when ``location`` is
        treated as a dir_archive, otherwise the object produced by the
        pickled class's ``_load_version``.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    # Check if the location is a dir_archive, if not, use GLUnpickler to load
    # it as a pure python model.
    #
    # If the location is a http(s) location, skip the check and directly
    # proceed to load the model as a dir_archive. This is because
    # 1) exists() does not work with http protocol, and
    # 2) GLUnpickler does not support http
    protocol = file_util.get_protocol(location)
    dir_archive_exists = False
    if protocol == '':
        # Local path: expand it and join with the OS-specific separator.
        model_path = file_util.expand_full_path(location)
        dir_archive_exists = file_util.exists(
            os.path.join(model_path, 'dir_archive.ini'))
    else:
        model_path = location
        if protocol in ['http', 'https']:
            dir_archive_exists = True
        else:
            # Remote (non-http) locations always use '/' as the separator.
            import posixpath
            dir_archive_exists = file_util.exists(
                posixpath.join(model_path, 'dir_archive.ini'))

    if not dir_archive_exists:
        # Not a dir_archive, so try unpickling the model.
        # NOTE(review): GLUnpickler presumably deserializes pickled data, so
        # only locations from trusted sources should be loaded - confirm.
        unpickler = gl_pickle.GLUnpickler(location)
        try:
            # The stream stores the version first, then the class name.
            version = unpickler.load()
            cls_name = unpickler.load()
            cls = _get_class_from_name(cls_name)

            # Load the object with the right version.
            model = cls._load_version(unpickler, version)
        finally:
            # Always release the unpickler, even when loading fails.
            unpickler.close()

        return model
    else:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)
def create(dataset, target,
           features=None,
           max_iterations=10,
           validation_set='auto',
           verbose=True,
           class_weights=None,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a (binary or multi-class) classifier model of type
    :class:`~graphlab.random_forest_classifier.RandomForestClassifier` using
    an ensemble of decision trees trained on subsets of the data.

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in alphabetical order of the variable
        values. For example, a target variable with 'cat', 'dog', and 'foosa'
        as possible values is mapped to 0, 1, and, 2 respectively.

    features : list[str], optional
        A list of columns names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        except the target column.

    max_iterations : int, optional
        The maximum number of iterations to perform. For multi-class
        classification with K classes, each iteration will create K-1 trees.

    max_depth : float, optional
        Maximum depth of a tree.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If set to `None`, all classes are supposed to have weight one.
        The `auto` mode set the class weight to be inversely proportional to
        number of examples in the training data with the given class.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition on a
        leaf node of the tree. The larger it is, the more conservative the
        algorithm will be. Must be non-negative.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, optional
        Subsample the ratio of the training set in each iteration of tree
        construction. This is called the bagging trick and can usually help
        prevent overfitting. Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction. Like row_subsample, this can also help prevent
        model overfitting. Setting this to a value of 0.5 results in the
        model randomly sampling half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data is indicative of overfitting. The default value is 'auto'.

    verbose : boolean, optional
        Print progress information during training (if set to true).

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'accuracy', 'auc', 'log_loss'}

    kwargs : dict, optional
        Additional arguments for training the model. Parameters documented
        above that are not named in the signature are supplied this way.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path
            every n iterations, where n is specified by
            ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and
            `model_checkpoint_path` is set to ``/tmp/model_tmp``, the
            checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``,
            ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to
            one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If model_checkpoint_path is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must take
            exact the same training data as the checkpointed model.

        - ``num_trees`` : deprecated
            If given, a deprecation warning is logged and the value replaces
            ``max_iterations``.

    Returns
    -------
    out : RandomForestClassifier
        A trained random forest model for classification tasks.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestClassifier, graphlab.logistic_classifier.LogisticClassifier, graphlab.svm_classifier.SVMClassifier, graphlab.neuralnet_classifier.NeuralNetClassifier

    Examples
    --------

    .. sourcecode:: python

        >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
        >>> data = graphlab.SFrame.read_csv(url)

        >>> train, test = data.random_split(0.8)
        >>> model = graphlab.random_forest_classifier.create(train, target='label')

        >>> predictions = model.classify(test)
        >>> results = model.evaluate(test)
    """
    _mt._get_metric_tracker().track(
        'toolkit.classifier.random_forest_classifier.create')

    if random_seed is not None:
        kwargs['random_seed'] = random_seed

    # Checkpoint locations supplied by the caller are converted with
    # _make_internal_url before being forwarded.
    for path_option in ('model_checkpoint_path', 'resume_from_checkpoint'):
        if path_option in kwargs:
            kwargs[path_option] = _make_internal_url(kwargs[path_option])

    # Deprecated alias: `num_trees` takes precedence over `max_iterations`
    # and is removed from kwargs so it is not forwarded.
    if 'num_trees' in kwargs:
        _logging.getLogger(__name__).warning(
            "The `num_trees` keyword argument is deprecated. Please "
            "use the `max_iterations` argument instead. Any value provided "
            "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs.pop('num_trees')

    trained = _sl.create(dataset=dataset,
                         target=target,
                         features=features,
                         model_name='random_forest_classifier',
                         max_iterations=max_iterations,
                         validation_set=validation_set,
                         class_weights=class_weights,
                         verbose=verbose,
                         metric=metric,
                         **kwargs)
    return RandomForestClassifier(trained.__proxy__)
def submit_training_job(env, dataset, target,
                        features=None,
                        max_iterations=10,
                        validation_set=None,
                        random_seed = None,
                        metric = 'auto',
                        model_checkpoint_path='auto',
                        **kwargs):
    """
    Submit a job to create a
    :class:`~graphlab.random_forest_regression.RandomForestRegression` to
    predict a scalar target variable using one or more features. In addition
    to standard numeric and categorical types, features can also be extracted
    automatically from list- or dictionary-type SFrame columns.

    Parameters
    ----------
    env : graphlab.deploy.hadoop_cluster.HadoopCluster
        Hadoop cluster to submit the training job

    dataset : SFrame
        A training dataset containing feature columns and a target column.
        Only numerical typed (int, float) target column is allowed.

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of columns names of features used for training the model.
        Defaults to None, using all columns.

    max_iterations : int, optional
        The number of iterations to perform.

    max_depth : float, optional
        Maximum depth of a tree.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, optional
        Subsample the ratio of the training set in each iteration of tree
        construction. This is called the bagging trick and usually can help
        prevent overfitting. Setting it to 0.5 means that model randomly
        collected half of the examples (rows) to grow each tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction. Like row_subsample, this also usually can help
        prevent overfitting. Setting it to 0.5 means that model randomly
        collected half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        If validation_set is set to None (the default in this signature),
        then no additional metrics are computed. This is computed once per
        full iteration. Large differences in model accuracy between the
        training data and validation data is indicative of overfitting.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'rmse', 'max_error'}

    model_checkpoint_path : str, default 'auto'
        If specified, checkpoint the model training to the given path every n
        iterations, where n is specified by ``model_checkpoint_interval``.
        For instance, if `model_checkpoint_interval` is 5, and
        `model_checkpoint_path` is set to ``/tmp/model_tmp``, the checkpoints
        will be saved into ``/tmp/model_tmp/model_checkpoint_5``,
        ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
        Training can be resumed by setting ``resume_from_checkpoint`` to one
        of these checkpoints. Any value other than 'auto' is converted with
        ``_make_internal_url`` before being forwarded.

    kwargs : dict, optional
        Additional arguments for training the model. Parameters documented
        above that are not named in the signature are supplied this way.

        - ``model_checkpoint_interval`` : int, default 5
            If model_checkpoint_path is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must take
            exact the same training data as the checkpointed model.

        - ``num_trees`` : deprecated
            If given, a deprecation warning is logged and the value replaces
            ``max_iterations``.

    Returns
    -------
    out : :class:`~graphlab.distributed._dml_job_status.DMLJobStatus`
        An object that tracks the execution of the distributed training job.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestRegression, graphlab.linear_regression.LinearRegression, graphlab.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = graphlab.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'
    >>> hdp_env = graphlab.deploy.hadoop_cluster.create('my-first-hadoop-cluster',
    ...    'hdfs://path-to-turi-distributed-installation')

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> distr_job = graphlab.random_forest_regression.submit_training_job(hdp_env, train, target='label')
    >>> model = distr_job.get_results()

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)
    """
    log = _logging.getLogger(__name__)

    if random_seed is not None:
        kwargs['random_seed'] = random_seed

    # 'auto' is passed through untouched; explicit paths are converted.
    if model_checkpoint_path != 'auto':
        model_checkpoint_path = _make_internal_url(model_checkpoint_path)

    if 'resume_from_checkpoint' in kwargs:
        resume_location = kwargs['resume_from_checkpoint']
        kwargs['resume_from_checkpoint'] = _make_internal_url(resume_location)

    # Deprecated alias: `num_trees` takes precedence over `max_iterations`
    # and is removed from kwargs so it is not forwarded.
    if 'num_trees' in kwargs:
        log.warning("The `num_trees` keyword argument is deprecated. Please "
                    "use the `max_iterations` argument instead. Any value provided "
                    "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs.pop('num_trees')

    _mt._get_metric_tracker().track(
        'distributed.toolkit.regression.random_forest_regression.submit_training_job')

    return _sl.create(dataset=dataset,
                      target=target,
                      features=features,
                      model_name='random_forest_regression',
                      env=env,
                      max_iterations=max_iterations,
                      validation_set=validation_set,
                      metric=metric,
                      model_checkpoint_path=model_checkpoint_path,
                      **kwargs)
def ext_import(soname, module_subpath=""):
    """
    Loads a graphlab toolkit module (a shared library) into the
    gl.extensions namespace.

    Toolkit module created via SDK can either be directly imported,
    e.g. ``import example`` or via this function, e.g.
    ``graphlab.ext_import("example.so")``.
    Use ``ext_import`` when you need more namespace control, or when
    the shared library is not local, e.g. in http, s3 or hdfs.

    Parameters
    ----------
    soname : string
        The filename of the shared library to load.
        This can be a URL, or a HDFS location. For instance if soname is
        somewhere/outthere/toolkit.so
        The functions in toolkit.so will appear in gl.extensions.toolkit.*

    module_subpath : string, optional
        Any additional module paths to prepend to the toolkit module after
        it is imported. For instance if soname is
        somewhere/outthere/toolkit.so, by default
        the functions in toolkit.so will appear in gl.extensions.toolkit.*.
        However, if module_subpath="somewhere.outthere", the functions
        in toolkit.so will appear in
        gl.extensions.somewhere.outthere.toolkit.*

    Returns
    -------
    out : a list of functions and classes loaded.

    Raises
    ------
    RuntimeError
        If ``load_toolkit`` returns a non-empty value; that value is used
        as the exception argument.

    Examples
    --------
    For instance, given a module which implements the function "square_root",

    .. code-block:: c++

        #include <cmath>
        #include <graphlab/sdk/toolkit_function_macros.hpp>
        double square_root(double a) {
          return sqrt(a);
        }

        BEGIN_FUNCTION_REGISTRATION
        REGISTER_FUNCTION(square_root, "a");
        END_FUNCTION_REGISTRATION

    compiled into example.so

    >>> graphlab.ext_import('example1.so')
    ['example1.square_root']
    >>> graphlab.extensions.example1.square_root(9)
    3.0

    We can customize the import location with module_subpath which can be
    used to avoid namespace conflicts when you have multiple toolkits with the
    same filename.

    >>> graphlab.ext_import('example1.so', 'math')
    ['math.example1.square_root']
    >>> graphlab.extensions.math.example1.square_root(9)
    3.0

    The module can also be imported directly, but graphlab *must* be imported
    first. graphlab will intercept the module loading process to load the
    toolkit.

    >>> import graphlab
    >>> import example1 #searches for example1.so in all the python paths
    >>> example1.square_root(9)
    3.0
    """
    unity = _gl.connect.main.get_unity()
    import os

    # A path that exists on the local filesystem is made absolute; anything
    # else (e.g. a remote URL) is converted with _make_internal_url.
    if os.path.exists(soname):
        soname = os.path.abspath(soname)
    else:
        soname = _make_internal_url(soname)

    # load_toolkit reports failure through a non-empty return value.
    ret = unity.load_toolkit(soname, module_subpath)
    if len(ret) > 0:
        raise RuntimeError(ret)

    _publish()  # push the functions into the corresponding module namespace
    return (unity.list_toolkit_functions_in_dynamic_module(soname) +
            unity.list_toolkit_classes_in_dynamic_module(soname))
def dml_exec(function_name, data, env='auto', verbose=True, **kwargs):
    """
    Executes a distributed ml function

    Parameters
    ----------
    function_name : str
        Name of the distributed function to be executed.
        The function symbol must exists in the unity distributed shared library.

    data : dict
        Key value arguments to the function stored in a dictionary

    env : 'auto' or job environment object
        Contains job environment parameters and a job submit function.
        When 'auto', a ``DMLRemoteEnvironment()`` is created. This function
        reads ``working_dir``, ``num_workers`` and ``output_name`` from it.

    verbose : bool
        When true, look up the log server address and, if one is found,
        stream the progress log to stdout and the commander/worker logs to
        files in a temporary directory while the job runs.

    **kwargs : dict
        Additional options, forwarded to ``dml_submit``.
        See _get_worker_args and _get_commander_args.

        - check_hdfs : {0, 1} Perform sanity check for hdfs read and write
        - startup_timeout : int Timeout in seconds for cluster setup

    Return
    ------
    (success, message, result) : bool, str, object
        ``result`` is the value stored under the "result" key of the toolkit
        response when the job succeeds; it is None on any failure. When the
        response carries an "exception" key instead, ``success`` is False
        and ``message`` is that exception value.
    """
    from graphlab.extensions import dml_function_invocation, init_dml_class_registry
    init_dml_class_registry()

    if env == 'auto':
        env = DMLRemoteEnvironment()

    if not file_util.exists(env.working_dir):
        _log.debug('Creating working directory: %s' % env.working_dir)
        file_util.mkdir(env.working_dir)
    else:
        _log.debug('Using existing working directory: %s' % env.working_dir)

    _log.info('Running distributed execution with %d workers. Working directory: %s'
              % (env.num_workers, env.working_dir))

    success = False
    message = ""

    # Job function arguments
    try:
        _log.info('Serializing arguments to %s' % env.working_dir)
        args = dml_function_invocation()
        # Copy so that adding '__base_path__' does not touch the caller's dict.
        data_copy = copy(data)
        internal_working_dir = _make_internal_url(env.working_dir)
        data_copy['__base_path__'] = internal_working_dir
        args.from_dict(data_copy, internal_working_dir)
        json_data = args.to_str()

        # sanitize the base path url
        # NOTE(review): is_s3_path is applied to the whole serialized string,
        # not to a path - confirm this actually triggers for s3 working dirs.
        sanitized_json_data = json_data
        if file_util.is_s3_path(json_data):
            sanitized_json_data = _sanitize_internal_s3_url(json_data)

        _log.info('Serialized arguments: %s' % sanitized_json_data)
    except Exception as e:
        success = False
        message = 'Error serializing arguments. %s' % str(e)
        return (success, message, None)

    # Submit job
    try:
        job = dml_submit(function_name, json_data, env,
                         metric_server_address_file=COMMANDER_LOG_SERVER_ADDRESS_FILE,
                         logprogress_file=PROGRESS_LOG_FILE,
                         **kwargs)
    except KeyboardInterrupt:
        message = 'Canceled by user'
        return (success, message, None)

    _log.info('Waiting for workers to start ... ')
    logprinter = None
    fd_list = []
    try:
        if verbose:
            log_server_address_path = os.path.join(env.working_dir,
                                                   COMMANDER_LOG_SERVER_ADDRESS_FILE)
            log_server_address = get_log_metric_server_address(
                log_server_address_path,
                timeout=INIT_TIMEOUT_PER_WORKER * env.num_workers)
            if len(log_server_address) > 0:
                tmp_log_dir = tempfile.mkdtemp(prefix='graphlab_dml_log_')
                printer = LogPrinter()

                # Attach log progress stream
                printer.add_stream(LogStream(log_server_address + '/progress',
                                             os.path.join(env.working_dir, PROGRESS_LOG_FILE),
                                             sys.stdout))

                # Attach commander log stream
                local_commander_log = open(os.path.join(tmp_log_dir, COMMANDER_LOG_FILE), 'w')
                fd_list.append(local_commander_log)
                printer.add_stream(LogStream(log_server_address + '/commander',
                                             os.path.join(env.working_dir, COMMANDER_LOG_FILE),
                                             local_commander_log))

                # Attach worker log streams
                for i in range(env.num_workers):
                    local_worker_log = open(os.path.join(tmp_log_dir, WORKER_LOG_FILE(i)), 'w')
                    fd_list.append(local_worker_log)
                    printer.add_stream(LogStream(log_server_address + '/worker%d' % i,
                                                 os.path.join(env.working_dir, WORKER_LOG_FILE(i)),
                                                 local_worker_log))

                printer.start()
                # Only a started printer is recorded, so cleanup never calls
                # stop() on one that was not started.
                logprinter = printer
                _log.info('Success. Worker logs are avaiable at %s ' % tmp_log_dir)

        _log.debug('Wait for job to finish')
        (success, message) = _wait_and_parse_job_result(job)
    finally:
        # Runs on normal completion and on any exception (including Ctrl-C),
        # so the log printer and the local log files are always released.
        if logprinter:
            logprinter.stop()
        for fd in fd_list:
            fd.close()

    if not success:
        return (success, message, None)

    try:
        result_path = os.path.join(env.working_dir, env.output_name)
        ret_str = file_util.read(result_path)
        sanitized_ret_str = _sanitize_internal_s3_url(ret_str)
        _log.debug('Deserializing results: %s' % sanitized_ret_str)

        args.from_str(ret_str)
        response = args.to_dict()

        # Check toolkit response for "result" key or "exception" key.
        if 'result' in response:
            return (success, message, response['result'])
        elif 'exception' in response:
            return (False, response['exception'], None)
        else:
            # Caught by the handler below and reported via `message`.
            raise ValueError('Invalid toolkit response. Must have "result" or '
                             '"exception" as key')
    except Exception as e:
        success = False
        message = 'Error deserializing results. %s' % str(e)
        return (success, message, None)