def aggregate_target_metadata(self, rdd: RDD):
    """Write per-partition target-metadata TSV files and summarize them.

    The RDD is sorted by key, then every partition dumps its records into
    ``blizzard2012-target-metadata-<partition index, 3 digits>.tsv`` under
    ``self.out_dir`` (on the filesystem of whichever process runs that
    partition's task).

    Args:
        rdd: Key/value RDD. Each value is passed to
            ``target_metadata_to_tsv`` and must expose ``n_frames``.

    Returns:
        A ``(count, max_len)`` tuple: the total number of records over all
        partitions and the largest ``n_frames`` seen (``(0, 0)`` for an
        empty RDD).
    """

    def map_fn(split_index, iterator):
        # Collect the rows and join once at the end: concatenating onto a
        # growing string per record re-copies it every time (quadratic).
        rows = []
        max_len = 0
        for _key, metadata in iterator:
            rows.append(target_metadata_to_tsv(metadata))
            max_len = max(max_len, metadata.n_frames)

        # The leading "" element keeps the established file layout: every
        # row is preceded by a newline, and an empty partition yields an
        # empty file.
        tsv = "\n".join([""] + rows)

        filename = f"blizzard2012-target-metadata-{split_index:03d}.tsv"
        filepath = os.path.join(self.out_dir, filename)
        # Opened only after the partition was consumed successfully, so a
        # failure while converting records leaves no partial file behind.
        with open(filepath, mode="w") as f:
            f.write(tsv)

        # One summary element per partition.
        yield len(rows), max_len

    def merge(acc, part):
        # Combine two (count, max_len) summaries.
        return acc[0] + part[0], max(acc[1], part[1])

    summaries = rdd.sortByKey().mapPartitionsWithIndex(
        map_fn, preservesPartitioning=True)
    return summaries.fold((0, 0), merge)