def test_cache():
    """Exercise a mapped, cached RDD through ``first()`` and ``collect()``.

    Builds a two-partition RDD over ``[1, 2, 3, 4]``, squares every element,
    calls ``cache()`` on the result, and checks that collecting it yields
    four elements, one of which is 16.

    The printed progress messages describe the evaluation order the author
    expects (lazy map, per-partition execution, no re-execution once
    cached); none of that is asserted by this test.
    """
    source = Context().parallelize([1, 2, 3, 4], 2)
    squares = source.map(lambda value: value * value).cache()
    print('no exec until here')

    head = squares.first()
    print(head)
    print('executed map on first partition only')

    first_pass = squares.collect()
    print(first_pass)
    print('now map() was executed on all partitions and should '
          'not be executed again')

    second_pass = squares.collect()
    print(second_pass)

    # Each check performs its own collect(); the membership check is only
    # reached when the length check has passed.
    assert len(squares.collect()) == 4
    assert 16 in squares.collect()
def test_first_partitions():
    """Check ``first()`` on an RDD split across three partitions.

    The RDD is built from ``[1, 2, 2, 4, 1, 3, 5, 9]`` and ``first()`` is
    expected to return the leading element of that list, ``1``.
    """
    numbers = Context().parallelize([1, 2, 2, 4, 1, 3, 5, 9], 3)
    print(numbers.first())

    expected_head = 1
    assert numbers.first() == expected_head