def evaluate(truth: RDD, prediction: RDD) -> float:
    """
    Compute the root-mean-square error of predicted ratings against true ratings.

    Both inputs are re-keyed by ``(bucket, item)`` and joined on that key, so
    only keys that appear in both RDDs contribute to the error.

    :param truth: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
    :param prediction: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
    :return: float = RMSE
    """

    def _key_by_bucket_item(record):
        # (bucket, item, rating) -> ((bucket, item), rating)
        return (record[0], record[1]), record[2]

    def _squared_error(entry):
        # entry is (key, (true_rating, predicted_rating)) after the join.
        ratings = entry[1]
        return (ratings[0] - ratings[1]) ** 2

    keyed_truth = truth.map(_key_by_bucket_item)
    keyed_prediction = prediction.map(_key_by_bucket_item)
    matched = keyed_truth.join(keyed_prediction)
    mean_squared_error = matched.map(_squared_error).mean()
    return mean_squared_error ** 0.5
def _join(rdd: RDD, other: RDD, func=None):
    """
    Join two keyed RDDs, optionally combining each pair of joined values.

    The join is performed with a partition count equal to the larger of the
    two inputs' partition counts.

    :param rdd: left-hand RDD of the join
    :param other: right-hand RDD of the join
    :param func: optional callable invoked as ``func(left, right)``; when
        given, it is applied to the joined RDD through ``_map_value``
    :return: the joined RDD, or the result of ``_map_value`` when ``func``
        is supplied
    """
    partition_count = max(rdd.getNumPartitions(), other.getNumPartitions())
    joined = rdd.join(other, numPartitions=partition_count)
    if func is None:
        return joined

    def _combine(pair):
        # pair is whatever _map_value hands to its callback -- presumably the
        # (left, right) value tuple produced by the join; confirm against
        # _map_value.
        return func(pair[0], pair[1])

    return _map_value(joined, _combine)