def _evaluate(self, rdd: RDD, **kwargs):
    """
    Evaluate the master network on an RDD of (features, label) pairs.

    The model architecture (as YAML), optimizer, loss, metrics and custom
    objects are shipped to the workers and the master weights are broadcast.
    Every non-empty partition is evaluated independently with
    ``model.evaluate``; the per-partition results are then combined on the
    driver as an average weighted by the number of samples per partition, so
    unevenly sized partitions do not bias the result.

    :param rdd: RDD whose rows are (features, label) pairs
    :param kwargs: forwarded unchanged to ``model.evaluate`` on every worker
    :return: the loss as a scalar when no metrics are configured, otherwise
        ``[loss, metric_1, ..., metric_n]`` to match the Keras API
    :raises ValueError: if the RDD contains no rows at all
    """
    yaml_model = self.master_network.to_yaml()
    optimizer = deserialize_optimizer(self.master_optimizer)
    loss = self.master_loss
    weights = rdd.context.broadcast(self.master_network.get_weights())
    custom_objects = self.custom_objects
    metrics = self.master_metrics

    def _evaluate_partition(model, optimizer, loss, custom_objects, metrics,
                            kwargs, data_iterator):
        # Materialize once: both the features and the labels are needed, and
        # an empty partition must be detected before building the model.
        rows = list(data_iterator)
        if not rows:
            return []
        model = model_from_yaml(model, custom_objects)
        model.compile(optimizer, loss, metrics)
        model.set_weights(weights.value)
        x_test = np.asarray([x for x, _ in rows])
        y_test = np.asarray([y for _, y in rows])
        scores = model.evaluate(x_test, y_test, **kwargs)
        # Without metrics Keras returns a bare scalar; normalize to a list so
        # the driver can treat every partition result uniformly.
        if not isinstance(scores, (list, tuple)):
            scores = [scores]
        return [(list(scores), len(rows))]

    if self.num_workers:
        rdd = rdd.repartition(self.num_workers)

    # Collect once (one small entry per non-empty partition) instead of
    # re-running the whole evaluation pipeline for every aggregated value.
    partition_results = rdd.mapPartitions(
        partial(_evaluate_partition, yaml_model, optimizer, loss,
                custom_objects, metrics, kwargs)).collect()
    if not partition_results:
        raise ValueError("Cannot evaluate the model on an empty RDD")

    total_samples = sum(count for _, count in partition_results)
    weighted_scores = [[score * count for score in scores]
                       for scores, count in partition_results]
    averaged = [sum(column) / total_samples
                for column in zip(*weighted_scores)]

    if not metrics:
        # if no metrics, we can just return the scalar corresponding to the loss value
        return averaged[0]
    # with metrics, return [loss value, metric values...] - to match the keras API
    return averaged
def _predict(self, rdd: RDD):
    """
    Run distributed inference with the master network.

    Each row is tagged with its original position before the data is
    repartitioned, because ``repartition`` shuffles rows and the collected
    predictions would otherwise no longer line up with the input rows. The
    predictions are sorted back into input order on the driver.

    :param rdd: RDD whose rows are individual feature samples
    :return: list with one prediction per input row, in input order
    """
    # Index first, shuffle second: the index records the pre-shuffle order.
    indexed_rdd = rdd.zipWithIndex()
    if self.num_workers:
        indexed_rdd = indexed_rdd.repartition(self.num_workers)

    yaml_model = self.master_network.to_yaml()
    weights = rdd.context.broadcast(self.master_network.get_weights())
    custom_objects = self.custom_objects

    def _predict_partition(model, custom_objects, data):
        rows = list(data)
        # Repartitioning can leave partitions empty; there is nothing to
        # predict for them, so skip building the model entirely.
        if not rows:
            return []
        features, positions = zip(*rows)
        model = model_from_yaml(model, custom_objects)
        model.set_weights(weights.value)
        outputs = model.predict(np.array(features))
        return zip(positions, outputs)

    indexed_predictions = indexed_rdd.mapPartitions(
        partial(_predict_partition, yaml_model, custom_objects)).collect()
    indexed_predictions.sort(key=lambda pair: pair[0])
    return [prediction for _, prediction in indexed_predictions]
def repar_rdd(rdd: RDD, rdd_count: int, example_per_par=100000, coalesce_only=True):
    """
    Resize the partitioning of ``rdd`` according to its number of examples.

    The target is ``int(rdd_count / example_per_par)`` partitions, but never
    fewer than one. If the RDD currently has more partitions than the target
    it is coalesced down to the target. If it has fewer, it is repartitioned
    up to the target only when ``coalesce_only`` is ``False``; with the
    default ``coalesce_only=True`` the RDD is returned unchanged in that case.

    :param rdd: the RDD to resize
    :param rdd_count: number of examples in ``rdd``
    :param example_per_par: desired number of examples per partition
    :param coalesce_only: when ``True``, only ever reduce the partition count
    :return: the resized RDD, or ``rdd`` itself if nothing had to change
    """
    current = rdd.getNumPartitions()
    target = max(1, int(rdd_count / example_per_par))

    if target < current:
        return rdd.coalesce(target)
    if target > current and coalesce_only is False:
        return rdd.repartition(target)
    return rdd
def partition_per_row(rdd: RDD) -> RDD:
    """Give every row of ``rdd`` a partition of its own.

    This only makes sense when each row stands for something large to be
    computed over, e.g. an external resource such as a multi-GB training
    dataset. The Spark side of the data is expected to be tiny and to fit
    comfortably in a single partition.
    """
    row_count = rdd.count()

    # The cast is only there to help mypy. A plain lambda is used on purpose:
    # an `identity` helper was found to fail serialization.
    partition_fn = cast(Callable[[int], int], lambda x: x)

    # Pull all rows into one partition so that a single enumerate() pass hands
    # out the ids 0..row_count-1, producing (partition_id, row) pairs.
    gathered = rdd.repartition(1)
    keyed = gathered.mapPartitions(lambda rows: enumerate(rows))

    # Route each pair to the partition named by its id.
    spread = keyed.partitionBy(row_count, partition_fn)

    # Strip the id again so the rows have their original shape.
    return spread.map(lambda pair: pair[1])
def fit(self, rdd: RDD, **kwargs):
    """
    Train an elephas model on an RDD.

    The RDD is repartitioned to ``self.num_workers`` partitions when that
    value is set, the configured mode is checked, and training is delegated
    to ``self._fit``. Per the original documentation, the Keras model
    configuration is sent to the Spark workers and each worker trains on its
    own data partition.

    Keyword arguments are forwarded unchanged to ``self._fit``; the ones
    documented for this method are:

    :param rdd: RDD with features and labels
    :param epochs: number of epochs used for training
    :param batch_size: batch size used for training
    :param verbose: logging verbosity level (0, 1 or 2)
    :param validation_split: percentage of data set aside for validation
    :raises ValueError: if ``self.mode`` is not one of the supported modes
    """
    print('>>> Fit model')

    workers = self.num_workers
    if workers:
        rdd = rdd.repartition(workers)

    if self.mode not in ('asynchronous', 'synchronous', 'hogwild'):
        raise ValueError(
            "Choose from one of the modes: asynchronous, synchronous or hogwild")

    self._fit(rdd, **kwargs)
def __call__(self, rdd: RDD, **kwargs: Any) -> RDD:
    """
    Run one step of the algorithm over ``rdd``.

    All operations are executed in sequence: the data is repartitioned to
    the expected number of partitions if necessary, grouped, one value is
    emitted per group and collected on the driver, the collected values are
    turned into a broadcast, and finally the step function is applied to
    every group with that broadcast available.

    Extra keyword arguments given to this call are passed on to every
    life-cycle function of the step:

    - `group`
    - `emit_by_group`
    - `broadcast`
    - `step`

    **DO NOT OVERRIDE WHEN DEFINING CUSTOM STEPS.**
    """
    if rdd.getNumPartitions() != self._n_partitions:
        rdd = rdd.repartition(self._n_partitions)

    step_cls: Type[Step] = self.__class__

    # Cached because the grouped data is consumed twice: once by the emit
    # pass and once by the step pass.
    grouped = step_cls.group(rdd, **kwargs).cache()

    def emit_for_group(kv: Tuple[Any, Iterable[Any]]) -> Optional[Tuple[Any, Any]]:
        key, values = kv
        return step_cls.emit_by_group(key, values, **kwargs)

    emitted = list(
        grouped.map(emit_for_group, preservesPartitioning=True).collect()
    )
    broadcast: Broadcast = self._sc.broadcast(
        step_cls.broadcast(emitted, **kwargs)
    )

    def run_step(kv: Tuple[Any, Iterable[Any]]) -> Iterable[Any]:
        key, values = kv
        yield from step_cls.step(key, values, broadcast, **kwargs)

    return grouped.flatMap(run_step, preservesPartitioning=True)