Example No. 1

    def _serialize(self, file_path):
        """
        Serializes the Job to the provided file_path.

        Sequential job definitions (jobs where all the tasks are executed sequentially and
        there are no parallel executions) are special-cased.

        In those cases, instead of serializing the tasks into the appropriate folder, the entire job
        is pickled and written instead.

        This way, when executing, run_job will be called if a Job is found, rather than run_task.
        """
        aggregate_file = _zipfile.ZipFile(file_path, 'w')
        base_path = _tempfile.mkdtemp()

        # TODO: serialize job attributes (like .name), or else they won't be present in the deserialized object

        # identify if only one task per step, and special-case that
        if all([len(step) == 1 for step in self._sequence]):
            # one task per step, so handle with serial execution
            __LOGGER__.debug("Special casing sequential execution")
            step_idx = 0
            job_idx = 0
            step_dir = _os.path.join(base_path, str(step_idx))
            _os.mkdir(step_dir)

            job_file_path = _os.path.join(step_dir, str(job_idx))
            with open(job_file_path, 'wb') as job_file:
                _cloudpickle.dump(self, job_file)

            relative_path = _os.path.join('steps', str(step_idx), str(job_idx))
            aggregate_file.write(job_file_path, relative_path)

        else:

            for step_idx, cur_step in enumerate(self._sequence):
                step_dir = _os.path.join(base_path, str(step_idx))
                _os.mkdir(step_dir)

                for task_idx, cur_task in enumerate(cur_step):
                    task_file_path = _os.path.join(step_dir, str(task_idx))
                    with open(task_file_path, 'wb') as task_file:
                        _cloudpickle.dump(cur_task, task_file)

                    relative_path = _os.path.join('steps', str(step_idx), str(task_idx))
                    aggregate_file.write(task_file_path, relative_path)

        aggregate_file.close()

        # delete the tempdir created
        _shutil.rmtree(base_path)
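
The archive written above stores each pickled payload under a 'steps/<step_idx>/<task_idx>' entry (a single 'steps/0/0' entry in the sequential special case). The sketch below shows the reverse direction under those assumptions; the function name and the 'job.zip' path are illustrative, not part of the original API, and plain pickle is used because cloudpickle output is readable with the standard pickle module.

import zipfile
import pickle  # data written with cloudpickle.dump loads with the standard pickle module

def load_serialized_job(file_path):
    """Read back every payload written under 'steps/<step_idx>/<task_idx>'."""
    payloads = {}
    with zipfile.ZipFile(file_path, 'r') as archive:
        for name in archive.namelist():
            # A single 'steps/0/0' entry holds the whole pickled Job (sequential case);
            # otherwise each entry holds one pickled task.
            payloads[name] = pickle.loads(archive.read(name))
    return payloads

# Illustrative usage:
# payloads = load_serialized_job('job.zip')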

Example No. 2

    def save(self, obj, typename=None):
        """
        Save the item to this session.

        Parameters
        ----------
        obj : object
            Object to save to this session
        typename : str, optional
            Specify the type of this object (Task, Environment, Job)
        """
        if isinstance(obj, str):
            if obj in self._objects:
                obj = self._objects[obj]
            else:
                raise Exception("Unable to find artifact to save.")

        if not Session._is_known_type(obj):
            raise Exception("Trying to save an unknown type")

        savedir = self.location
        if typename is None:
            if isinstance(obj, _job.Job):
                typename = 'Job'
            elif isinstance(obj, _environment.Environment):
                typename = 'Environment'
            elif isinstance(obj, _artifact.Task):
                typename = 'Task'
            elif isinstance(obj, _predictive_service.PredictiveService):
                typename = 'PredictiveService'
            else:
                __LOGGER__.error("Trying to save an unrecognized item of type: %s, saving failed." % type(obj))
                return

        # replace obj with a PredictiveServiceEndpoint so that it can be pickled and saved
        if isinstance(obj, _predictive_service.PredictiveService) and typename == 'PredictiveService':
            obj = _predictive_service_endpoint.PredictiveServiceEndpoint(obj.name, obj._s3_state_path, obj.aws_credentials)

        filename = self._get_filename_from_name(obj.name) + "." + typename

        try:
            with __builtin__.open(str(_os.path.join(savedir, filename)), "wb") as f:
                _cloudpickle.dump(obj, f)

            if hasattr(obj, '_modified_since_last_saved'):
                obj._modified_since_last_saved = False

        except Exception as e:
            __LOGGER__.warning("Error saving %s: '%s'" % (typename, e))
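
Reading a saved artifact back is the mirror image of the write above. A minimal sketch, assuming the '<name>.<typename>' naming used in save() and glossing over the sanitization done by _get_filename_from_name; load_saved_artifact and its arguments are illustrative, not part of the Session API.

import os
import pickle  # files written with cloudpickle.dump load with plain pickle

def load_saved_artifact(savedir, name, typename):
    """Load an object previously written by save() as '<name>.<typename>'."""
    filename = "%s.%s" % (name, typename)
    with open(os.path.join(savedir, filename), "rb") as f:
        return pickle.load(f)

# Illustrative usage:
# job = load_saved_artifact(session.location, 'my_job', 'Job')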

Example No. 3

  def _save_imp(self, po_path, dependency_path, aws_credentials):
    '''Save the predictive object to a directory

      The files for a predictive object are laid out in the following way:

        po_path/definition/meta -- serialized json file about the predictive
          object, including: description, dependencies, etc.
        po_path/definition/definition -- cloudpickle-serialized PredictiveObject
        dependency_path -- all dependent GraphLab objects, each in its
          own directory:
          dependency_path/uri1/ -- serialized GraphLab object with uri1
          dependency_path/uri2/ -- serialized GraphLab object with uri2
    '''
    fu.create_directory(po_path)

    describe = {
      'description': self.description,
      'dependencies': {},
      'schema_version': self.schema_version
    }

    for (uri, gl_obj) in self.dependencies.iteritems():

      # If it isn't already saved, save it.
      temp_path = None
      try:
        if not fu.is_path(gl_obj):
          obj_type = self._get_graphlab_object_type(gl_obj)
          temp_path = tempfile.mkdtemp()

          __logger__.info("Saving dependent GraphLab %s (%s) locally to '%s' " % (obj_type, uri, temp_path))
          gl_obj.save(temp_path)
          gl_obj = temp_path
        else:
          # gl_obj already refers to a saved path, so no temporary save is needed.
          obj_type = get_graphlab_object_type(gl_obj)

        # Copy the saved object without loading it.
        save_path = os.path.join(dependency_path, uri)

        __logger__.info("Copying dependent GraphLab %s (%s) from '%s' to '%s'" % (obj_type, uri, gl_obj, save_path))

        if fu.is_s3_path(gl_obj) and fu.is_s3_path(save_path):
          fu.intra_s3_copy_model(gl_obj, save_path, aws_credentials)
        elif fu.is_local_path(gl_obj) and fu.is_s3_path(save_path):
          fu.s3_copy_model(gl_obj, save_path, aws_credentials)
        elif fu.is_local_path(gl_obj) and fu.is_local_path(save_path):
          # Useful for unit tests
          shutil.copytree(gl_obj, save_path)
        else:
          raise RuntimeError("Copying a GraphLab object from S3 to a local path is not supported. GraphLab object path: %s, save path: %s" % (gl_obj, save_path))
      finally:
        if temp_path:
          shutil.rmtree(temp_path)

      # add to the global describe dictionary
      describe['dependencies'][uri] = {
        'path': save_path,
        'type': obj_type
      }

    # persist the global description
    describe_path = self._get_describe_path(po_path)
    self._save_object(describe_path, describe)

    # persist the definition of myself
    definition_path = self._get_definition_path(po_path)
    try:
      with open(definition_path, 'wb') as f:
        _cloudpickle.dump(self, f)
    except Exception as e:
      __logger__.error('Unable to save object: %s' % e)
      raise
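
To illustrate the layout described in the docstring, the saved predictive object can be read back from disk. This is a minimal sketch assuming the meta file under po_path/definition/ is JSON (as the docstring states) and that the pickled definition sits next to it; the helper below and its paths are assumptions, not the class's own loader.

import json
import os
import pickle  # the cloudpickle-serialized definition loads with plain pickle

def inspect_predictive_object(po_path):
    """Read back the metadata and pickled definition written under po_path/definition/."""
    with open(os.path.join(po_path, 'definition', 'meta')) as f:
        describe = json.load(f)

    with open(os.path.join(po_path, 'definition', 'definition'), 'rb') as f:
        definition = pickle.load(f)

    # describe['dependencies'] maps each uri to {'path': ..., 'type': ...}
    return describe, definition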