def benchmark_linear_regression(ctx, timer):
    """Time 25 linear-regression gradient steps over random data.

    ``x`` is ``N_EXAMPLES x 10`` and ``y`` is ``N_EXAMPLES x 1`` (both cast
    to float32), where ``N_EXAMPLES`` is ten million times
    ``ctx.num_workers``.  Both are tiled in row blocks of
    ``N_EXAMPLES / ctx.num_workers`` and passed through ``expr.eager``
    before the timed loop.  The updated weights are discarded after every
    step, so each timed step starts from the same ``w``.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
        timer: Object whose ``time_op(name, fn)`` is called once per step
            with the name ``'linear-regression'``.
    """
    N_DIM = 10
    N_EXAMPLES = 10 * 1000 * 1000 * ctx.num_workers
    rows_per_tile = N_EXAMPLES / ctx.num_workers

    x = expr.rand(N_EXAMPLES, N_DIM, tile_hint=(rows_per_tile, N_DIM))
    x = x.astype(np.float32)
    y = expr.rand(N_EXAMPLES, 1, tile_hint=(rows_per_tile, 1))
    y = y.astype(np.float32)
    w = np.random.rand(N_DIM, 1).astype(np.float32)

    x, y = expr.eager(x), expr.eager(y)

    def _step():
        predicted = expr.dot(x, w)
        Assert.all_eq(predicted.shape, y.shape)

        weighted_error = x * (predicted - y)
        column_sums = expr.sum(weighted_error, axis=0)
        grad = column_sums.glom().reshape((N_DIM, 1))

        wprime = w - grad * 1e-6
        # NOTE(review): `wprime` is built from `w` (a NumPy array) and the
        # glom()'d gradient, so it may be a plain array without an
        # `evaluate` method -- confirm this call is valid.
        wprime.evaluate()

    for _ in range(25):
        timer.time_op('linear-regression', _step)
def benchmark_linear_regression(ctx, timer):
    """Time 25 linear-regression gradient steps over random data.

    The data set has ten million rows per worker and ten feature columns;
    the targets are a single column.  Both are float32, tiled in row blocks
    of ``N_EXAMPLES / ctx.num_workers`` and passed through ``expr.eager``
    ahead of the timed loop.  Each step computes an updated weight vector
    and forces it with ``expr.force`` but never stores it, so every step
    uses the same initial ``w``.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
        timer: Object whose ``time_op(name, fn)`` is called once per step
            with the name ``'linear-regression'``.
    """
    N_DIM = 10
    N_EXAMPLES = 10 * 1000 * 1000 * ctx.num_workers
    rows_per_tile = N_EXAMPLES / ctx.num_workers

    x = expr.rand(N_EXAMPLES, N_DIM, tile_hint=(rows_per_tile, N_DIM))
    x = x.astype(np.float32)
    y = expr.rand(N_EXAMPLES, 1, tile_hint=(rows_per_tile, 1))
    y = y.astype(np.float32)
    w = np.random.rand(N_DIM, 1).astype(np.float32)

    x, y = expr.eager(x), expr.eager(y)

    def _step():
        predicted = expr.dot(x, w)
        Assert.all_eq(predicted.shape, y.shape)

        weighted_error = x * (predicted - y)
        column_sums = expr.sum(weighted_error, axis=0)
        grad = column_sums.glom().reshape((N_DIM, 1))

        wprime = w - grad * 1e-6
        expr.force(wprime)

    for _ in range(25):
        timer.time_op('linear-regression', _step)
def benchmark_optimization(ctx, timer):
    """Time ``bs_step`` three times under each optimizer configuration.

    Configurations, in order:

    * ``'opt-none'``: ``FLAGS.optimization = 0``.
    * ``'opt-fusion'``: ``FLAGS.optimization = 1``,
      ``FLAGS.opt_parakeet_gen = 0``, ``FLAGS.opt_map_fusion = 1``.
    * ``'opt-parakeet'``: as above with ``FLAGS.opt_parakeet_gen = 1``.

    The flags are not restored; they keep their final values on return.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
        timer: Object whose ``time_op(name, fn)`` records each run.
    """
    FLAGS.optimization = 0

    DATA_SIZE = 5 * 1000 * 1000
    total_shape = (DATA_SIZE * ctx.num_workers,)

    current = eager(zeros(total_shape, dtype=np.float32,
                          tile_hint=(DATA_SIZE,)))
    strike = eager(ones(total_shape, dtype=np.float32,
                        tile_hint=(DATA_SIZE,)))
    maturity = eager(strike * 12)
    rate = eager(strike * 0.05)
    volatility = eager(strike * 0.01)

    def _time_three_runs(label):
        # Three independent measurements are recorded under the same label.
        for _ in range(3):
            timer.time_op(
                label,
                lambda: bs_step(current, strike, maturity, rate, volatility))

    _time_three_runs('opt-none')

    FLAGS.optimization = 1
    FLAGS.opt_parakeet_gen = 0
    FLAGS.opt_map_fusion = 1
    _time_three_runs('opt-fusion')

    FLAGS.opt_parakeet_gen = 1
    _time_three_runs('opt-parakeet')
def benchmark_convnet(ctx, timer):
    """Time two forward passes of a three-layer stencil/maxpool network.

    A minibatch of 64 all-ones images of side ``BASE_IMG_SIZE`` is tiled
    along the batch axis, and three all-ones filter banks are created with
    ``tile_hint=ONE_TILE``.  One untimed pass is run first, then two passes
    are recorded under the name ``'convnet'``.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
        timer: Object whose ``time_op(name, fn)`` records each pass.
    """
    image_size = BASE_IMG_SIZE
    minibatch = 64
    # Disabled alternative in the original: minibatch = ctx.num_workers

    # NOTE(review): `hint` is computed but never used below.
    hint = util.divup(image_size, sqrt(ctx.num_workers))

    per_image = (N_COLORS, image_size, image_size)
    tile_hint = (util.divup(minibatch, ctx.num_workers),) + per_image
    util.log_info('Hint: %s', tile_hint)

    images = expr.eager(expr.ones((minibatch,) + per_image,
                                  tile_hint=tile_hint))

    # Input-channel count of each layer; every layer emits N_FILTERS maps.
    filters = [
        expr.eager(expr.ones((N_FILTERS, in_channels) + FILTER_SIZE,
                             tile_hint=ONE_TILE))
        for in_channels in (N_COLORS, N_FILTERS, N_FILTERS)
    ]

    def _forward():
        activations = images
        for weights in filters:
            convolved = stencil.stencil(activations, weights, 2)
            activations = stencil.maxpool(convolved)
        expr.force(activations)

    # Force parakeet functions to compile before timing.
    _forward()

    for _ in range(2):
        timer.time_op('convnet', _forward)
def test_convnet(ctx):
    """Build a three-layer stencil/maxpool pipeline and log its output shape.

    The input is an all-ones array of shape ``(N_IMGS,) + IMG_SIZE``; each
    layer applies ``stencil.stencil`` with an all-ones filter bank followed
    by ``stencil.maxpool``.  Only the shape of the final expression is
    logged.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
    """
    hint = util.divup(64, sqrt(ctx.num_workers))

    layer = expr.eager(
        expr.ones((N_IMGS,) + IMG_SIZE,
                  tile_hint=(N_IMGS, N_COLORS, hint, hint)))

    # Each filter bank is created right before the layer that consumes it,
    # matching the original statement order.
    for in_channels in (N_COLORS, N_FILTERS, N_FILTERS):
        weights = expr.eager(
            expr.ones((N_FILTERS, in_channels) + FILTER_SIZE,
                      tile_hint=ONE_TILE))
        convolved = stencil.stencil(layer, weights, 2)
        layer = stencil.maxpool(convolved)

    util.log_info(layer.shape)
def benchmark_convnet(ctx, timer):
    """Time two forward passes of a three-layer stencil/maxpool network.

    A minibatch of 64 all-ones images of side ``BASE_IMG_SIZE`` is tiled
    along the batch axis, and three all-ones filter banks are created with
    ``tile_hint=ONE_TILE``.  One untimed pass is run first, then two passes
    are recorded under the name ``'convnet'``.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
        timer: Object whose ``time_op(name, fn)`` records each pass.
    """
    image_size = BASE_IMG_SIZE
    minibatch = 64
    # Disabled alternative in the original: minibatch = ctx.num_workers

    # NOTE(review): `hint` is computed but never used below.
    hint = util.divup(image_size, sqrt(ctx.num_workers))

    per_image = (N_COLORS, image_size, image_size)
    tile_hint = (util.divup(minibatch, ctx.num_workers),) + per_image
    util.log_info('Hint: %s', tile_hint)

    images = expr.eager(expr.ones((minibatch,) + per_image,
                                  tile_hint=tile_hint))

    # Input-channel count of each layer; every layer emits N_FILTERS maps.
    filters = [
        expr.eager(expr.ones((N_FILTERS, in_channels) + FILTER_SIZE,
                             tile_hint=ONE_TILE))
        for in_channels in (N_COLORS, N_FILTERS, N_FILTERS)
    ]

    def _forward():
        activations = images
        for weights in filters:
            convolved = stencil.stencil(activations, weights, 2)
            activations = stencil.maxpool(convolved)
        activations.evaluate()

    # Force parakeet functions to compile before timing.
    _forward()

    for _ in range(2):
        timer.time_op('convnet', _forward)
def train(self):
    """Run ``self.iterations`` gradient updates and return the weights.

    ``self.x`` and ``self.y`` are replaced by their ``expr.eager`` form.
    On every iteration the result of ``self.update()`` is summed along
    axis 0, fetched with ``glom()``, reshaped to ``(self.N_DIM, 1)`` and
    subtracted from ``self.w`` after scaling by ``self.alpha``.

    Returns:
        The value of ``self.w`` after the last iteration.
    """
    self.x = expr.eager(self.x)
    self.y = expr.eager(self.y)

    for _ in range(self.iterations):
        diff = self.update()
        column_sums = expr.sum(diff, axis=0, tile_hint=[self.N_DIM])
        grad = column_sums.glom().reshape((self.N_DIM, 1))
        # Rebind rather than update in place, as the original does.
        self.w = self.w - grad * self.alpha

    return self.w
def benchmark_optimization(ctx, timer):
    """Time ``bs_step`` three times under each optimizer configuration.

    Configurations, in order:

    * ``'opt-none'``: ``FLAGS.optimization = 0``.
    * ``'opt-fusion'``: ``FLAGS.optimization = 1``,
      ``FLAGS.opt_parakeet_gen = 0``, ``FLAGS.opt_map_fusion = 1``.
    * ``'opt-parakeet'``: as above with ``FLAGS.opt_parakeet_gen = 1``.
    * ``'opt-tiling = 0'``: ``FLAGS.opt_parakeet_gen = 0`` and
      ``FLAGS.opt_auto_tiling = 0``.
    * ``'opt-tiling'``: as above with ``FLAGS.opt_auto_tiling = 1``.

    The flags are not restored; they keep their final values on return.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
        timer: Object whose ``time_op(name, fn)`` records each run.
    """
    FLAGS.optimization = 0

    DATA_SIZE = 5 * 1000 * 1000
    total_shape = (DATA_SIZE * ctx.num_workers,)

    current = eager(zeros(total_shape, dtype=np.float32,
                          tile_hint=(DATA_SIZE,)))
    strike = eager(ones(total_shape, dtype=np.float32,
                        tile_hint=(DATA_SIZE,)))
    maturity = eager(strike * 12)
    rate = eager(strike * 0.05)
    volatility = eager(strike * 0.01)

    def _time_three_runs(label):
        # Three independent measurements are recorded under the same label.
        for _ in range(3):
            timer.time_op(
                label,
                lambda: bs_step(current, strike, maturity, rate, volatility))

    _time_three_runs('opt-none')

    FLAGS.optimization = 1
    FLAGS.opt_parakeet_gen = 0
    FLAGS.opt_map_fusion = 1
    _time_three_runs('opt-fusion')

    FLAGS.opt_parakeet_gen = 1
    _time_three_runs('opt-parakeet')

    FLAGS.opt_parakeet_gen = 0
    FLAGS.opt_auto_tiling = 0
    _time_three_runs('opt-tiling = 0')

    FLAGS.opt_auto_tiling = 1
    _time_three_runs('opt-tiling')
def benchmark_logreg(ctx, timer): print "#worker:", ctx.num_workers #N_EXAMPLES = 40000000 * ctx.num_workers N_EXAMPLES = 5000000 * 64 x = expr.eager(expr.rand(N_EXAMPLES, N_DIM, tile_hint=(N_EXAMPLES / ctx.num_workers, N_DIM))) y = expr.eager(expr.rand(N_EXAMPLES, 1, tile_hint=(N_EXAMPLES / ctx.num_workers, 1))) start = time.time() logistic_regression.logistic_regression(x, y, ITERATION) total = time.time() - start util.log_warn("time cost : %s s" % (total*1.0/ITERATION,))
def benchmark_cg(ctx, timer): print "#worker:", ctx.num_workers l = int(math.sqrt(ctx.num_workers)) n = 2000 * 16 #n = 4000 * l la = 20 niter = 5 tile_hint = (n, n/ctx.num_workers) #nonzer = 7 #nz = n * (nonzer + 1) * (nonzer + 1) + n * (nonzer + 2) #density = 0.5 * nz/(n*n) A = expr.rand(n, n, tile_hint=tile_hint) A = (A + expr.transpose(A))*0.5 I = expr.sparse_diagonal((n,n), tile_hint=tile_hint) * la I.force() A = expr.eager(A - I) #x1 = numpy_cg(A.glom(), niter) util.log_warn('begin cg!') t1 = datetime.now() x2 = conj_gradient(A, niter).force() t2 = datetime.now() cost_time = millis(t1,t2) print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/niter)
def benchmark_naive_bayes(ctx, timer): print "#worker:", ctx.num_workers N = 100000 * ctx.num_workers D = 128 # create data data = expr.randint(N, D, low=0, high=D, tile_hint=(N/ctx.num_workers, D)) labels = expr.eager(expr.shuffle(data, _init_label_mapper)) #util.log_warn('data:%s, label:%s', data.glom(), labels.glom()) util.log_warn('begin train') t1 = datetime.now() model = fit(data, labels, D) t2 = datetime.now() util.log_warn('train time:%s ms', millis(t1,t2)) correct = 0 for i in range(10): new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D)) new_label = predict(model, new_data) #print 'point %s, predict %s' % (new_data.glom(), new_label) new_data = new_data.glom() if np.isclose(new_data[0, new_label], np.max(new_data)): correct += 1 print 'predict precision:', correct * 1.0 / 10
def benchmark_svm(ctx, timer): print "#worker:", ctx.num_workers max_iter = 5 #N = 200000 * ctx.num_workers N = 100000 * 64 D = 2 # create data data = expr.randn(N, D, dtype=np.float64, tile_hint=[N/ctx.num_workers, D]) labels = expr.eager(expr.shuffle(data, _init_label_mapper)) t1 = datetime.now() w = fit(data, labels, ctx.num_workers, T=max_iter).force() t2 = datetime.now() util.log_warn('train time per iteration:%s ms, final w:%s', millis(t1,t2)/max_iter, w.glom().T) correct = 0 for i in range(10): new_data = expr.randn(1, D, dtype=np.float64, tile_hint=[1, D]) new_label = predict(w, new_data) #print 'point %s, predict %s' % (new_data.glom(), new_label) new_data = new_data.glom() if new_data[0,0] >= new_data[0,1] and new_label == 1.0 or new_data[0,0] < new_data[0,1] and new_label == -1.0: correct += 1 print 'predict precision:', correct * 1.0 / 10
def test_matrix_mult(self):
    """Time five 2000x2000 float32 matrix products and verify the cost.

    ``x`` is tiled in column strips and ``y`` in row strips, each strip
    ``N_POINTS / self.ctx.num_workers`` wide.  The elapsed wall-clock time
    of the five ``expr.dot(...).evaluate()`` calls is passed to
    ``self._verify_cost`` under the name ``"matrix_mult"``.
    """
    _skip_if_travis()

    N_POINTS = 2000
    strip = N_POINTS / self.ctx.num_workers
    column_strips = (N_POINTS, strip)
    row_strips = (strip, N_POINTS)

    x = expr.rand(N_POINTS, N_POINTS, tile_hint=column_strips)
    x = x.astype(np.float32)
    y = expr.rand(N_POINTS, N_POINTS, tile_hint=row_strips)
    y = y.astype(np.float32)

    x, y = expr.eager(x), expr.eager(y)

    started = time.time()
    for _ in range(5):
        product = expr.dot(x, y, tile_hint=column_strips)
        product.evaluate()
    cost = time.time() - started

    self._verify_cost("matrix_mult", cost)
def test_matrix_mult(self):
    """Time five 2000x2000 float32 matrix products and verify the cost.

    ``x`` is tiled in column strips and ``y`` in row strips, each strip
    ``N_POINTS / self.ctx.num_workers`` wide.  The elapsed wall-clock time
    of the five ``expr.dot(...).force()`` calls is passed to
    ``self._verify_cost`` under the name ``"matrix_mult"``.
    """
    _skip_if_travis()

    N_POINTS = 2000
    strip = N_POINTS / self.ctx.num_workers
    column_strips = (N_POINTS, strip)
    row_strips = (strip, N_POINTS)

    x = expr.rand(N_POINTS, N_POINTS, tile_hint=column_strips)
    x = x.astype(np.float32)
    y = expr.rand(N_POINTS, N_POINTS, tile_hint=row_strips)
    y = y.astype(np.float32)

    x, y = expr.eager(x), expr.eager(y)

    started = time.time()
    for _ in range(5):
        product = expr.dot(x, y, tile_hint=column_strips)
        product.force()
    cost = time.time() - started

    self._verify_cost("matrix_mult", cost)
def benchmark_slice(ctx, timer):
    """Time sums over row, column and box slices of a zero matrix.

    The matrix has ``1000 * ctx.num_workers`` rows and 10000 columns.  Five
    rounds are run; each round records one measurement per slice under the
    names ``'slice-rows'``, ``'slice-cols'`` and ``'slice-box'``.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
        timer: Object whose ``time_op(name, fn)`` records each measurement.
    """
    TEST_SIZE = 1000 * ctx.num_workers

    # Pass the zeros array through expr.eager before any timed slicing.
    x = expr.eager(expr.zeros((TEST_SIZE, 10000)))

    band = slice(200, 300)
    everything = slice(None)

    def _time_region_sum(label, region):
        # x[(a, b)] is the same indexing operation as x[a, b].
        timer.time_op(label, lambda: expr.evaluate(x[region].sum()))

    for _ in range(5):
        _time_region_sum('slice-rows', (band, everything))
        _time_region_sum('slice-cols', (everything, band))
        _time_region_sum('slice-box', (band, band))
def benchmark_slice(ctx, timer):
    """Time sums over row, column and box slices of a zero matrix.

    The matrix has ``1000 * ctx.num_workers`` rows and 10000 columns.  Five
    rounds are run; each round records one measurement per slice under the
    names ``'slice-rows'``, ``'slice-cols'`` and ``'slice-box'``.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
        timer: Object whose ``time_op(name, fn)`` records each measurement.
    """
    TEST_SIZE = 1000 * ctx.num_workers

    # Pass the zeros array through expr.eager before any timed slicing.
    x = expr.eager(expr.zeros((TEST_SIZE, 10000)))

    band = slice(200, 300)
    everything = slice(None)

    def _time_region_sum(label, region):
        # x[(a, b)] is the same indexing operation as x[a, b].
        timer.time_op(label, lambda: expr.evaluate(x[region].sum()))

    for _ in range(5):
        _time_region_sum('slice-rows', (band, everything))
        _time_region_sum('slice-cols', (everything, band))
        _time_region_sum('slice-box', (band, band))
def benchmark_matmul(ctx, timer):
    """Time one product of two square all-ones matrices.

    The side length is ``int(1000 * num_workers ** (1/3))`` and both
    operands are tiled in square tiles of side
    ``util.divup(N, sqrt(num_workers))``.  A single measurement is recorded
    under the name ``"matmul"``.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
        timer: Object whose ``time_op(name, fn)`` records the measurement.
    """
    workers = ctx.num_workers

    N = int(1000 * math.pow(workers, 1.0 / 3.0))
    # Disabled alternative in the original: N = 4000

    # NOTE(review): `M` is only referenced by a disabled (N, M) column-strip
    # tiling; it is otherwise unused.
    M = util.divup(N, workers)
    T = util.divup(N, math.sqrt(workers))

    util.log_info("Testing with %d workers, N = %d, tile_size=%s",
                  workers, N, T)

    square_tiles = (T, T)
    x = expr.eager(expr.ones((N, N), dtype=np.double,
                             tile_hint=square_tiles))
    y = expr.eager(expr.ones((N, N), dtype=np.double,
                             tile_hint=square_tiles))

    def _step():
        product = expr.dot(x, y)
        expr.evaluate(product)

    timer.time_op("matmul", _step)
def benchmark_linear_regression(ctx, timer):
    """Force the elementwise square of a random column 25 times.

    Despite its name, this variant only builds a ``65536 x 1`` float32
    ``expr.rand`` array (tiled in ``N_EXAMPLES / N_TILES`` row blocks) and
    repeatedly evaluates ``x * x`` with ``expr.force``.

    Args:
        ctx: Not used by this benchmark.
        timer: Not used by this benchmark; nothing is recorded.
    """
    N_EXAMPLES = 65536
    N_DIM = 1

    rows_per_tile = N_EXAMPLES / N_TILES
    x = expr.rand(N_EXAMPLES, N_DIM, tile_hint=(rows_per_tile, N_DIM))
    x = expr.eager(x.astype(np.float32))

    def _step():
        # The result is bound but not used.
        y = expr.force(x * x)

    for _ in range(25):
        _step()
def benchmark_pr(ctx, timer): num_pages = 1000 * 1000 * 3 * ctx.num_workers num_outlinks = 10 density = num_outlinks * 1.0 / num_pages same_site_prob = 0.9 print "#worker:", ctx.num_workers col_step = util.divup(num_pages, ctx.num_workers) wts_tile_hint = [num_pages, col_step] p_tile_hint = [col_step, 1] #wts = expr.sparse_diagonal((num_pages, num_pages), dtype=np.float32, tile_hint=wts_tile_hint) #wts = expr.eager( # expr.sparse_rand((num_pages, num_pages), # density=density, # format='csr', # dtype=np.float32, # tile_hint=wts_tile_hint)) wts = expr.eager(pagerank_sparse(num_pages, num_outlinks, same_site_prob, wts_tile_hint)) #res = wts.glom().todense() #for i in range(res.shape[0]): # l = [] # for j in range(res.shape[1]): # l.append(round(res[i,j],1)) # print l #p = expr.sparse_empty((num_pages,1), dtype=np.float32, tile_hint=p_tile_hint).force() #for i in range(num_pages): # p[i,0] = 1 #p = expr.sparse_rand((num_pages, 1), density=1.0, format='csc', dtype=np.float32, tile_hint=p_tile_hint) p = expr.eager(expr.rand(num_pages, 1, tile_hint=p_tile_hint).astype(np.float32)) #q = expr.zeros((num_pages, 1), dtype=np.float32, tile_hint=p_tile_hint).force() #q[:] = p.glom().todense() #q = expr.lazify(q) #r = expr.dot(wts, p) #print r.glom() avg_time = sparse_multiply(wts, p, p_tile_hint) print 'baseline:', baseline[ctx.num_workers], 'current benchmark:', avg_time / 1000
def benchmark_matmul(ctx, timer):
    """Time one product of two square all-ones matrices.

    The side length is ``int(1000 * num_workers ** (1/3))`` and both
    operands are tiled in square tiles of side
    ``util.divup(N, sqrt(num_workers))``.  A single measurement is recorded
    under the name ``'matmul'``.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
        timer: Object whose ``time_op(name, fn)`` records the measurement.
    """
    workers = ctx.num_workers

    N = int(1000 * math.pow(workers, 1.0 / 3.0))
    # Disabled alternative in the original: N = 4000

    # NOTE(review): `M` is only referenced by a disabled (N, M) column-strip
    # tiling; it is otherwise unused.
    M = util.divup(N, workers)
    T = util.divup(N, math.sqrt(workers))

    util.log_info('Testing with %d workers, N = %d, tile_size=%s',
                  workers, N, T)

    square_tiles = (T, T)
    x = expr.eager(expr.ones((N, N), dtype=np.double,
                             tile_hint=square_tiles))
    y = expr.eager(expr.ones((N, N), dtype=np.double,
                             tile_hint=square_tiles))

    def _step():
        product = expr.dot(x, y)
        expr.evaluate(product)

    timer.time_op('matmul', _step)
def test_linear_reg(self):
    """Time five linear-regression gradient steps and verify the cost.

    The data set has ten million rows per worker and ten float32 feature
    columns, tiled in row blocks of ``N_EXAMPLES / num_workers``.  Unlike a
    pure timing loop, ``w`` is updated after each step.  The elapsed
    wall-clock time is passed to ``self._verify_cost`` under the name
    ``"linear_reg"``.
    """
    _skip_if_travis()

    N_DIM = 10
    N_EXAMPLES = 10 * 1000 * 1000 * self.ctx.num_workers
    rows_per_tile = N_EXAMPLES / self.ctx.num_workers

    x = expr.rand(N_EXAMPLES, N_DIM, tile_hint=(rows_per_tile, N_DIM))
    x = x.astype(np.float32)
    y = expr.rand(N_EXAMPLES, 1, tile_hint=(rows_per_tile, 1))
    y = y.astype(np.float32)
    w = np.random.rand(N_DIM, 1).astype(np.float32)

    x, y = expr.eager(x), expr.eager(y)

    started = time.time()
    for _ in range(5):
        predicted = expr.dot(x, w)
        weighted_error = x * (predicted - y)
        column_sums = expr.sum(weighted_error, axis=0, tile_hint=[N_DIM])
        grad = column_sums.glom().reshape((N_DIM, 1))
        w = w - grad * 1e-6
    cost = time.time() - started

    self._verify_cost("linear_reg", cost)
def test_convnet(ctx):
    """Build a three-layer stencil/maxpool pipeline and log its output shape.

    The input is an all-ones array of shape ``(N_IMGS,) + IMG_SIZE``; each
    layer applies ``stencil.stencil`` with an all-ones filter bank followed
    by ``stencil.maxpool``.  Only the shape of the final expression is
    logged.

    Args:
        ctx: Context object; only ``ctx.num_workers`` is read.
    """
    hint = util.divup(64, sqrt(ctx.num_workers))

    layer = expr.eager(
        expr.ones((N_IMGS,) + IMG_SIZE,
                  tile_hint=(N_IMGS, N_COLORS, hint, hint)))

    # Each filter bank is created right before the layer that consumes it,
    # matching the original statement order.
    for in_channels in (N_COLORS, N_FILTERS, N_FILTERS):
        weights = expr.eager(
            expr.ones((N_FILTERS, in_channels) + FILTER_SIZE,
                      tile_hint=ONE_TILE))
        convolved = stencil.stencil(layer, weights, 2)
        layer = stencil.maxpool(convolved)

    util.log_info(layer.shape)
def benchmark_als(ctx, timer): print "#worker:", ctx.num_workers #USER_SIZE = 400 * ctx.num_workers USER_SIZE = 200 * 64 MOVIE_SIZE = 12800 num_features = 20 num_iter = 5 A = expr.eager(expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5, tile_hint=(USER_SIZE/ctx.num_workers, MOVIE_SIZE))) util.log_warn('begin als!') t1 = datetime.now() U, M = als(A, implicit_feedback=True, num_features=num_features, num_iter=num_iter) U.force() M.force() t2 = datetime.now() cost_time = millis(t1,t2) print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/num_iter)
def test_pagerank(self):
    """Time one weights-times-ranks product and verify the cost.

    The weight matrix is ``expr.shuffle`` applied with ``make_weights`` to
    a square float32 ``expr.ndarray`` with one million pages per worker.
    The timed section creates the all-ones rank column and evaluates
    ``expr.dot(wts, p)``; the elapsed wall-clock time is passed to
    ``self._verify_cost`` under the name ``"pagerank"``.
    """
    _skip_if_travis()

    # NOTE(review): OUTLINKS_PER_PAGE is assigned but not used in this
    # method.
    OUTLINKS_PER_PAGE = 10
    PAGES_PER_WORKER = 1000000
    num_pages = PAGES_PER_WORKER * self.ctx.num_workers
    tile_extent = PAGES_PER_WORKER / 8

    weight_storage = expr.ndarray((num_pages, num_pages),
                                  dtype=np.float32,
                                  tile_hint=(num_pages, tile_extent))
    wts = expr.shuffle(weight_storage, make_weights)

    started = time.time()
    p = expr.eager(expr.ones((num_pages, 1),
                             tile_hint=(tile_extent, 1),
                             dtype=np.float32))
    expr.dot(wts, p, tile_hint=(tile_extent, 1)).evaluate()
    cost = time.time() - started

    self._verify_cost("pagerank", cost)
def setUp(self):
    """Create ``self.current`` and ``self.strike`` once per instance.

    Nothing happens when ``self`` already has a ``current`` attribute.
    Otherwise both attributes are set to eagerly wrapped absolute values
    of ten ``expr.randn`` samples offset by 10 and 20 respectively.
    """
    if hasattr(self, 'current'):
        return

    self.current = eager(expr.abs(10 + expr.randn(10)))
    self.strike = eager(expr.abs(20 + expr.randn(10)))
def run(N_EXAMPLES, N_DIM, iterations):
    """Run ``linear_regression`` on freshly generated random data.

    Args:
        N_EXAMPLES: Number of rows in ``x`` and ``y``.
        N_DIM: Number of columns in ``x``.
        iterations: Passed through to ``linear_regression``.
    """
    rows_per_tile = N_EXAMPLES / 10

    # NOTE(review): the column extent of x's tile hint is the literal 10,
    # not N_DIM -- confirm this is intended when N_DIM != 10.
    features = expr.rand(N_EXAMPLES, N_DIM, tile_hint=(rows_per_tile, 10))
    x = expr.eager(features)

    targets = expr.rand(N_EXAMPLES, 1, tile_hint=(rows_per_tile, 1))
    y = expr.eager(targets)

    linear_regression(x, y, iterations)
def find_change(arr, threshold=0.5):
    """Return the adjacent-element differences that exceed ``threshold``.

    Args:
        arr: Sequence-like object supporting slicing and subtraction.
        threshold: Minimum absolute difference (exclusive) to keep.

    Returns:
        The absolute differences ``|arr[i + 1] - arr[i]|`` that are greater
        than ``threshold``, selected by boolean-mask indexing.
    """
    following, preceding = arr[1:], arr[:-1]
    step_sizes = eager(abs(following - preceding))
    exceeds = step_sizes > threshold
    return step_sizes[exceeds]