def _fitInParallel(self, dataset, paramMaps):
    """
    Fits len(paramMaps) models in parallel, one in each Spark task.

    :param dataset: input dataset; it is handed to `_getNumpyFeaturesAndLabels`, whose
                    outputs (features and optional labels) are broadcast to the tasks.
    :param paramMaps: non-empty list or tuple of ParamMaps (dict values)
    :return: list of fitted models, matching the order of paramMaps
    """
    sc = JVMAPI._curr_sc()
    # One slice per param map, so each model is fitted in its own task.
    paramMapsRDD = sc.parallelize(paramMaps, numSlices=len(paramMaps))

    # Extract image URI from provided dataset and create features as numpy arrays
    localFeatures, localLabels = self._getNumpyFeaturesAndLabels(dataset)
    localFeaturesBc = sc.broadcast(localFeatures)
    localLabelsBc = None if localLabels is None else sc.broadcast(localLabels)

    # Broadcast Keras model (HDF5) file content as bytes
    modelBytes = self._loadModelAsBytes()
    modelBytesBc = sc.broadcast(modelBytes)

    # Obtain params for this estimator instance, keyed by param name
    baseParamMap = self.extractParamMap()
    baseParamDict = {param.name: val for param, val in baseParamMap.items()}
    baseParamDictBc = sc.broadcast(baseParamDict)

    def _local_fit(override_param_map):
        """
        Fit locally a model with a combination of this estimator's param,
        with overriding parameters provided by the input.

        :param override_param_map: dict, key type is MLlib Param.
                                   They are meant to override the base estimator's params.
        :return: serialized Keras HDF5 file bytes
        """
        # Work on a private copy: the broadcast value is shared by every task that
        # reads it in the same Python worker, so updating it in place would leak
        # one param map's overrides into the next model fitted by that worker.
        params = dict(baseParamDictBc.value)
        params.update({param.name: val for param, val in override_param_map.items()})

        # Create Keras model
        model = kmutil.bytes_to_model(modelBytesBc.value)
        model.compile(optimizer=params['kerasOptimizer'], loss=params['kerasLoss'])

        # Retrieve features and labels and fit Keras model
        features = localFeaturesBc.value
        labels = None if localLabelsBc is None else localLabelsBc.value
        _fit_params = params['kerasFitParams']
        model.fit(x=features, y=labels, **_fit_params)

        return kmutil.model_to_bytes(model)

    # Keep each param map next to the serialized model it produced.
    kerasModelBytesRDD = paramMapsRDD.map(lambda paramMap: (paramMap, _local_fit(paramMap)))
    return self._collectModels(kerasModelBytesRDD)
def fitMultiple(self, dataset, paramMaps):
    """
    Fits len(paramMaps) models in parallel, one in each Spark task.

    :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame`.
                    The column `inputCol` should be of type `sparkdl.image.imageIO.imgSchema`.
    :param paramMaps: non-empty list or tuple of ParamMaps (dict values)
    :return: an iterable which contains one model for each param map. Each call to
             `next(modelIterator)` will return `(index, model)` where model was fit using
             `paramMaps[index]`. `index` values may not be sequential.

    .. warning:: This serializes each model into an HDF5 byte file to the driver. If the model
                 file is large, the driver might go out-of-memory. As we cannot assume the
                 existence of a sufficiently large (and writable) file system, users are
                 advised to not train too many models in a single Spark job.
    """
    # Validate every param map up front, before any Spark work is started.
    for pm in paramMaps:
        self._validateParams(pm)

    def _get_tunable_name_value_map(param_map, tunable):
        """takes a dictionary {`Param` -> value} and a list [`Param`], select keys that are
        present in both and returns a map of {Param.name -> value}"""
        return {param.name: val for param, val in param_map.items() if param in tunable}

    sc = JVMAPI._curr_sc()
    # Pair every param map with its position so results can be matched back to paramMaps.
    param_name_maps = [(i, _get_tunable_name_value_map(pm, self._tunable_params))
                       for (i, pm) in enumerate(paramMaps)]
    num_models = len(param_name_maps)
    # One slice per param map, so each model is fitted in its own task.
    paramNameMapsRDD = sc.parallelize(param_name_maps, numSlices=num_models)

    # Extract image URI from provided dataset and create features as numpy arrays
    localFeatures, localLabels = self._getNumpyFeaturesAndLabels(dataset)
    localFeaturesBc = sc.broadcast(localFeatures)
    localLabelsBc = None if localLabels is None else sc.broadcast(localLabels)

    # Broadcast Keras model (HDF5) file content as bytes
    modelBytes = self._loadModelAsBytes()
    modelBytesBc = sc.broadcast(modelBytes)

    # Obtain params for this estimator instance
    base_params = _get_tunable_name_value_map(self.extractParamMap(), self._tunable_params)
    baseParamsBc = sc.broadcast(base_params)

    def _local_fit(row):
        """
        Fit locally a model with a combination of this estimator's param,
        with overriding parameters provided by the input.

        :param row: a list or tuple containing index and override_param_map. Index is an int
                    representing the index of parameter map and override_param_map is a dict
                    whose key is a string representing an MLlib Param Name. These are meant
                    to override the base estimator's params.
        :return: tuple of index, override_param_map and serialized Keras HDF5 file bytes
        """
        index, override_param_map = row

        # Work on a private copy: the broadcast value is shared by every task that
        # reads it in the same Python worker, so updating it in place would leak
        # one param map's overrides into the next model fitted by that worker.
        params = dict(baseParamsBc.value)
        params.update(override_param_map)

        # Create Keras model
        model = kmutil.bytes_to_model(modelBytesBc.value)
        model.compile(optimizer=params['kerasOptimizer'], loss=params['kerasLoss'])

        # Retrieve features and labels and fit Keras model
        features = localFeaturesBc.value
        labels = None if localLabelsBc is None else localLabelsBc.value
        _fit_params = params['kerasFitParams']
        model.fit(x=features, y=labels, **_fit_params)

        return index, override_param_map, kmutil.model_to_bytes(model)

    kerasModelBytesRDD = paramNameMapsRDD.map(_local_fit)
    models = self._collectModels(kerasModelBytesRDD)

    return _ThreadSafeIterator(models)
def fitMultiple(self, dataset, paramMaps):
    """
    Fits len(paramMaps) models in parallel, one in each Spark task.

    :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame`.
                    The column `inputCol` should be of type `sparkdl.image.imageIO.imgSchema`.
    :param paramMaps: non-empty list or tuple of ParamMaps (dict values)
    :return: an iterable which contains one model for each param map. Each call to
             `next(modelIterator)` will return `(index, model)` where model was fit using
             `paramMaps[index]`. `index` values may not be sequential.

    .. warning:: This serializes each model into an HDF5 byte file to the driver. If the model
                 file is large, the driver might go out-of-memory. As we cannot assume the
                 existence of a sufficiently large (and writable) file system, users are
                 advised to not train too many models in a single Spark job.
    """
    # Validate every param map up front, before any Spark work is started.
    for pm in paramMaps:
        self._validateParams(pm)

    def _name_value_map(paramMap):
        """takes a dictionary {param -> value} and returns a map of {param.name -> value}"""
        return {param.name: val for param, val in paramMap.items()}

    sc = JVMAPI._curr_sc()
    # Pair every param map with its position so results can be matched back to paramMaps.
    paramNameMaps = list(enumerate(map(_name_value_map, paramMaps)))
    num_models = len(paramNameMaps)
    # One slice per param map, so each model is fitted in its own task.
    paramNameMapsRDD = sc.parallelize(paramNameMaps, numSlices=num_models)

    # Extract image URI from provided dataset and create features as numpy arrays
    localFeatures, localLabels = self._getNumpyFeaturesAndLabels(dataset)
    localFeaturesBc = sc.broadcast(localFeatures)
    localLabelsBc = None if localLabels is None else sc.broadcast(localLabels)

    # Broadcast Keras model (HDF5) file content as bytes
    modelBytes = self._loadModelAsBytes()
    modelBytesBc = sc.broadcast(modelBytes)

    # Obtain params for this estimator instance
    baseParams = _name_value_map(self.extractParamMap())
    baseParamsBc = sc.broadcast(baseParams)

    def _local_fit(row):
        """
        Fit locally a model with a combination of this estimator's param,
        with overriding parameters provided by the input.

        :param row: a list or tuple containing index and override_param_map. Index is an int
                    representing the index of parameter map and override_param_map is a dict
                    whose key is a string representing an MLlib Param Name. These are meant
                    to override the base estimator's params.
        :return: tuple of index, override_param_map and serialized Keras HDF5 file bytes
        """
        index, override_param_map = row

        # Work on a private copy: the broadcast value is shared by every task that
        # reads it in the same Python worker, so updating it in place would leak
        # one param map's overrides into the next model fitted by that worker.
        params = dict(baseParamsBc.value)
        params.update(override_param_map)

        # Create Keras model
        model = kmutil.bytes_to_model(modelBytesBc.value)
        model.compile(optimizer=params['kerasOptimizer'], loss=params['kerasLoss'])

        # Retrieve features and labels and fit Keras model
        features = localFeaturesBc.value
        labels = None if localLabelsBc is None else localLabelsBc.value
        _fit_params = params['kerasFitParams']
        model.fit(x=features, y=labels, **_fit_params)

        return index, override_param_map, kmutil.model_to_bytes(model)

    kerasModelBytesRDD = paramNameMapsRDD.map(_local_fit)
    models = self._collectModels(kerasModelBytesRDD)

    return _ThreadSafeIterator(models)