# Side length of the square pixel grid; every (row, col) pair is one task.
num_pixels = 2000

# All (row, col) pairs: the cartesian product of two ranges, each of which
# is created with 10 slices.
rows = sc.range(num_pixels, numSlices=10)
cols = sc.range(num_pixels, numSlices=10)
indices = rows.cartesian(cols)


def mandelbrot_wrapper(row, col):
    """Evaluate ``P2.mandelbrot`` at the plane point of one pixel.

    The pixel indices are rescaled linearly so that index 0 maps to -2 and
    index ``num_pixels`` would map to +2 on both axes.

    Returns ``((row, col), value)`` where ``value`` is whatever
    ``P2.mandelbrot(x, y)`` returns (presumably an iteration count, judging
    by the plot labels below -- confirm against P2).
    """
    pixels_per_unit = num_pixels / 4.
    x = col / pixels_per_unit - 2.
    y = row / pixels_per_unit - 2.
    return ((row, col), P2.mandelbrot(x, y))


# ---- Different from part A: load balancing! ----
# repartition(100) redistributes the index pairs over 100 partitions
# (author's note: jobs are thrown randomly between partitions).
new_indices = indices.repartition(100)
mandelbrot_load_balanced = new_indices.map(
    lambda pair: mandelbrot_wrapper(*pair))

# One summed value per partition, gathered on the driver.
summed_rdd = P2.sum_values_for_partitions(mandelbrot_load_balanced)
summed_result = summed_rdd.collect()

# Histogram over 20 log-spaced bin edges between 1e3 and 1e8, a red rug
# plot of the individual totals, and a logarithmic x axis.
bin_edges = np.logspace(3, 8, 20)
plt.hist(summed_result, bins=bin_edges)
sns.rugplot(summed_result, color='red')
ax = plt.gca()
ax.set_xscale('log')
plt.title('Number of Iterations on each Partition')
plt.xlabel('Total Number of Iterations on Partition')
plt.ylabel('Partition Count')
plt.savefig('P2b_alternative_hist.png', dpi=200, bbox_inches='tight')
return ((row, col), P2.mandelbrot(x, y)) mandelbrot_rdd = indices.map(lambda a: mandelbrot_wrapper(*a)) # Now collect the data & plot mandelbrot_result = mandelbrot_rdd.collect() plt.grid(False) # I slightly redefined the draw image function as the original # implementation annoyed me...I did not want to collect in a draw function! P2.draw_image(data=mandelbrot_result) plt.savefig('P2a_mandelbrot.png', dpi=200, bbox_inches='tight') plt.clf() # Now create the histogram...I recognize that mandelbrot is computed twice # but it is for my sanity summed_rdd = P2.sum_values_for_partitions(mandelbrot_rdd) summed_result = summed_rdd.collect() plt.hist(summed_result, bins=np.logspace(3, 8, 20)) sns.rugplot(summed_result, color='red') plt.gca().set_xscale('log') plt.xlabel('Total Number of Iterations on Partition') plt.ylabel('Partition Count') plt.title('Number of Iterations on each Partition') plt.savefig('P2a_hist.png', dpi=200, bbox_inches='tight')
y = row/(num_pixels/4.) - 2. return ((row, col), P2.mandelbrot(x, y)) mandelbrot_rdd = indices.map(lambda a: mandelbrot_wrapper(*a)) # Now collect the data & plot mandelbrot_result = mandelbrot_rdd.collect() plt.grid(False) # I slightly redefined the draw image function as the original # implementation annoyed me...I did not want to collect in a draw function! P2.draw_image(data=mandelbrot_result) plt.savefig('P2a_mandelbrot.png', dpi=200, bbox_inches='tight') plt.clf() # Now create the histogram...I recognize that mandelbrot is computed twice # but it is for my sanity summed_rdd = P2.sum_values_for_partitions(mandelbrot_rdd) summed_result = summed_rdd.collect() plt.hist(summed_result, bins=np.logspace(3, 8, 20)) sns.rugplot(summed_result, color='red') plt.gca().set_xscale('log') plt.xlabel('Total Number of Iterations on Partition') plt.ylabel('Partition Count') plt.title('Number of Iterations on each Partition') plt.savefig('P2a_hist.png', dpi=200, bbox_inches='tight')
# Key every expensive task by the partition it is designated to: its label
# modulo num_partitions.  (x[1] is presumably a zipWithIndex index, as for
# the cheap tasks below -- labeled_expensive_tasks is built above this chunk.)
partition_vs_expensive_task = labeled_expensive_tasks.map(
    lambda x: (x[1] % num_partitions, x[0]))

# Get cheap tasks ready to process: keep the entries flagged 0, strip the
# flag, number them, and key them round-robin exactly like the expensive ones.
cheap_tasks = indices_vs_expensive.filter(lambda x: x[1] == 0)
cheap_tasks = cheap_tasks.map(lambda x: x[0])
labeled_cheap_tasks = cheap_tasks.zipWithIndex()
partition_vs_cheap_task = labeled_cheap_tasks.map(
    lambda x: (x[1] % num_partitions, x[0]))

# Combine cheap & expensive tasks, now designated to an appropriate partition
partition_vs_ij = partition_vs_expensive_task.union(partition_vs_cheap_task)

# Move every record into its designated partition.
# The previous sortByKey(numPartitions=100) range-partitioned the data using
# boundaries estimated from a random sample of the keys, so two keys could
# share a partition while another partition stayed empty, and it ran extra
# count/sample jobs over the whole lineage.  partitionBy with an identity
# partition function places key k in partition k % 100 deterministically.
# The variable name is kept so later code still finds it, although the
# records are no longer sorted within a partition (only sums are used below).
# NOTE(review): 100 presumably equals num_partitions -- confirm.
sorted_by_partition = partition_vs_ij.partitionBy(
    100, partitionFunc=lambda key: key)

# Each record is (partition_key, (row, col)); evaluate the pixel.
mandelbrot_load_balanced = sorted_by_partition.map(
    lambda a: mandelbrot_wrapper(*a[1]))

# One summed value per partition, gathered on the driver.
summed_rdd = P2.sum_values_for_partitions(mandelbrot_load_balanced)
summed_result = summed_rdd.collect()

# Now plot: histogram over 20 log-spaced bin edges between 1e3 and 1e8, a red
# rug plot of the individual totals, and a logarithmic x axis.
plt.hist(summed_result, bins=np.logspace(3, 8, 20))
sns.rugplot(summed_result, color='red')
plt.gca().set_xscale('log')
plt.xlabel('Total Number of Iterations on Partition')
plt.ylabel('Partition Count')
plt.title('Number of Iterations on each Partition')
plt.savefig('P2b_hist.png', dpi=200, bbox_inches='tight')