# Assumed imports for this excerpt (the original file header is not shown).
# fit/predict, learn_topics, als, _init_label_mapper and millis are defined
# in or imported by the surrounding benchmark module.
import numpy as np
from datetime import datetime

from spartan import expr, util


def benchmark_naive_bayes(ctx, timer):
  print "#worker:", ctx.num_workers
  N = 100000 * ctx.num_workers
  D = 128

  # Create data: N samples of D integer features, row-partitioned so each
  # worker holds a contiguous block of samples.
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N / ctx.num_workers, D))
  labels = expr.eager(expr.shuffle(data, _init_label_mapper))
  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())

  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1, t2))

  # A sample's label is the index of its largest feature, so a prediction
  # counts as correct when the predicted column holds the row maximum.
  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
# Column-partitioned variant of benchmark_naive_bayes: the same experiment,
# but the data is split across workers by feature columns instead of sample
# rows. In a single module this later definition shadows the one above.
def benchmark_naive_bayes(ctx, timer):
  print "#worker:", ctx.num_workers
  #N = 100000 * ctx.num_workers
  N = 10000 * 64
  D = 128

  # create data
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N, D / ctx.num_workers))
  # Labels are built by shuffling an empty (N, 1) array through the mapper;
  # shape_hint and cost_hint guide the expression optimizer.
  labels = expr.shuffle(expr.ndarray((data.shape[0], 1), dtype=np.int),
                        _init_label_mapper,
                        kw={'data': data},
                        shape_hint=(data.shape[0], 1),
                        cost_hint={hash(data): {'00': 0, '10': np.prod(data.shape)}})
  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())

  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1, t2))

  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
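# The two benchmarks above rely on helpers defined elsewhere in the original
# file. What follows are hedged sketches, not the originals: millis is
# assumed to be a plain datetime-difference helper, and the labeling rule is
# inferred from the precision check above, which implies each sample's label
# is the argmax of its features.
def millis(t1, t2):
  # elapsed wall-clock time between two datetime instants, in milliseconds
  return (t2 - t1).total_seconds() * 1000.0


def _label_rows(data):
  # local NumPy equivalent of the labeling rule _init_label_mapper applies
  # tile-by-tile: data is (n_samples, D); returns (n_samples, 1) int labels
  return np.argmax(data, axis=1).reshape(-1, 1)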
def benchmark_lda(ctx, timer):
  print "#worker:", ctx.num_workers
  NUM_TERMS = 160
  NUM_DOCS = 200 * ctx.num_workers
  #NUM_DOCS = 10 * 64

  # Create data: a random dense terms x docs count matrix. The commented-out
  # block loads a real corpus from disk instead.
  #NUM_TERMS = 41807
  #NUM_DOCS = 21578
  #terms_docs_matrix = from_file("/scratch/cq/numpy_dense_matrix", sparse=False,
  #                              tile_hint=(NUM_TERMS, int((NUM_DOCS + ctx.num_workers - 1) / ctx.num_workers))).evaluate()
  terms_docs_matrix = expr.randint(NUM_TERMS, NUM_DOCS, low=0, high=100)

  max_iter = 3
  k_topics = 16
  t1 = datetime.now()
  doc_topics, topic_term_count = learn_topics(terms_docs_matrix, k_topics,
                                              max_iter=max_iter)
  doc_topics.optimized().evaluate()
  topic_term_count.optimized().evaluate()
  t2 = datetime.now()

  time_cost = millis(t1, t2)
  util.log_warn('total_time:%s ms, train time per iteration:%s ms',
                time_cost, time_cost / max_iter)
def benchmark_als(ctx, timer):
  print "#worker:", ctx.num_workers
  #USER_SIZE = 400 * ctx.num_workers
  USER_SIZE = 200 * 64
  MOVIE_SIZE = 12800
  num_features = 20
  num_iter = 5

  # Random user x movie ratings, row-partitioned across workers and
  # materialized eagerly so data creation is not counted in the timing.
  A = expr.eager(expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5,
                              tile_hint=(USER_SIZE / ctx.num_workers, MOVIE_SIZE)))

  util.log_warn('begin als!')
  t1 = datetime.now()
  U, M = als(A, implicit_feedback=True, num_features=num_features,
             num_iter=num_iter)
  U.force()
  M.force()
  t2 = datetime.now()
  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time,
                                                             cost_time / num_iter)
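# Hedged sketch (assuming the spartan expr API used above): expr.randint only
# builds a lazy expression node, which is why A is wrapped in expr.eager; the
# ratings matrix is materialized before t1 is taken, so data generation is
# not charged to the ALS training time. A hypothetical helper making that
# pattern explicit:
def _eager_ratings(rows, cols, workers):
  # build the expression lazily, then force materialization up front
  lazy = expr.randint(rows, cols, low=0, high=5,
                      tile_hint=(rows / workers, cols))
  return expr.eager(lazy)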
# Column-partitioned variant of benchmark_als: fewer users, and the ratings
# matrix is split across workers by movie columns instead of user rows. In a
# single module this later definition shadows the one above.
def benchmark_als(ctx, timer):
  print "#worker:", ctx.num_workers
  #USER_SIZE = 100 * ctx.num_workers
  USER_SIZE = 320
  #USER_SIZE = 200 * 64
  MOVIE_SIZE = 12800
  num_features = 20
  num_iter = 2

  A = expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5,
                   tile_hint=(USER_SIZE, util.divup(MOVIE_SIZE, ctx.num_workers)))
  #A = expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5)

  util.log_warn('begin als!')
  t1 = datetime.now()
  U, M = als(A, implicit_feedback=True, num_features=num_features,
             num_iter=num_iter)
  U.force()
  M.force()
  t2 = datetime.now()
  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time,
                                                             cost_time / num_iter)
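# util.divup above sizes column tiles by ceiling division, so every worker
# gets a tile even when MOVIE_SIZE does not divide evenly. A hedged sketch of
# its presumed implementation (the same (a + b - 1) / b pattern appears in
# the commented-out from_file call in benchmark_lda):
def divup(a, b):
  # smallest integer q such that q * b >= a
  return (a + b - 1) // b

# e.g. divup(12800, 64) == 200, so 64 workers each hold one 320 x 200 tile of A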
# Variant of benchmark_lda that forces evaluation with .force() instead of
# .evaluate(); otherwise identical to the definition above, which it shadows
# in a single module.
def benchmark_lda(ctx, timer):
  print "#worker:", ctx.num_workers
  NUM_TERMS = 160
  NUM_DOCS = 200 * ctx.num_workers
  #NUM_DOCS = 10 * 64

  # create data
  #NUM_TERMS = 41807
  #NUM_DOCS = 21578
  #terms_docs_matrix = from_file("/scratch/cq/numpy_dense_matrix", sparse=False,
  #                              tile_hint=(NUM_TERMS, int((NUM_DOCS + ctx.num_workers - 1) / ctx.num_workers))).force()
  terms_docs_matrix = expr.randint(NUM_TERMS, NUM_DOCS, low=0, high=100)

  max_iter = 3
  k_topics = 16
  t1 = datetime.now()
  doc_topics, topic_term_count = learn_topics(terms_docs_matrix, k_topics,
                                              max_iter=max_iter)
  doc_topics.optimized().force()
  topic_term_count.optimized().force()
  t2 = datetime.now()

  time_cost = millis(t1, t2)
  util.log_warn('total_time:%s ms, train time per iteration:%s ms',
                time_cost, time_cost / max_iter)
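# Both benchmark_lda variants repeat the same optimize-then-execute step:
# learn_topics returns lazy expressions, .optimized() runs the expression
# optimizer over them, and .evaluate() / .force() trigger the actual
# computation. A hypothetical helper factoring out that timing pattern
# (a sketch assuming the spartan expr API and the millis sketch above):
def _timed_evaluate(lazy_exprs):
  # optimize and execute each expression; return elapsed milliseconds
  t1 = datetime.now()
  for e in lazy_exprs:
    e.optimized().evaluate()
  return millis(t1, datetime.now())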