# PageRank job definition (runs inside the PageRankClient, hence self.filename and the driver handle)
RDD._config = {'num_partition_RBK': 2,
               'num_partition_GBK': 2,
               'split_size': 128, }

t = rdd.TextFile(self.filename)
m = rdd.Map(t, (lambda urls: parseNeighbors(urls)))
links = rdd.GroupByKey(m)
ranks = rdd.Map(links, lambda url_neighbors: (url_neighbors[0], 1.0))

for iteration in range(5):
    joins = rdd.Join([links, ranks])
    contribs = rdd.FlatMap(
        joins,
        lambda url_urls_rank: computeContribs(
            url_urls_rank[1][0], url_urls_rank[1][1]))
    rbk = rdd.ReduceByKey(contribs, lambda a, b: a + b)
    ranks = rdd.MapValue(rbk, lambda rank: rank * 0.85 + 0.15)

ranks.collect(driver)


if __name__ == '__main__':
    master_address = sys.argv[1]
    self_address = sys.argv[2]
    filepath = sys.argv[3]
    page_rank_client = PageRankClient(filepath)
    # page_rank_client = PageRankClient(sys.argv[1])
    client = get_client(master_address)
    execute_command(client, client.get_job,
                    pickle_object(page_rank_client), self_address)
    print "[Client]Job Submited...."
    page_rank_client.start_server(self_address)
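The listing calls two helpers, parseNeighbors and computeContribs, that are not shown above. A minimal sketch of what they need to do, assuming the same input format and semantics as the reference Spark PageRank example (each input line is "url neighbor_url", and a page splits its rank evenly over its out-links):

def parseNeighbors(urls):
    # "url neighbor_url" line -> (url, neighbor) key/value pair
    parts = urls.split()
    return parts[0], parts[1]


def computeContribs(urls, rank):
    # Distribute a page's current rank evenly across its out-links.
    num_urls = len(urls)
    for url in urls:
        yield (url, rank / num_urls)

With the contributions summed by ReduceByKey, the MapValue step then applies the usual damping-factor update, rank = 0.85 * summed_contribs + 0.15, once per iteration.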
# Streaming word count job definition (runs inside the StreamingWordCountClient)
               'split_size': 128, "driver_addr": ""}   # tail of the RDD._config dict; the opening keys are truncated in the source listing
RDD._streaming = 20

lines = rdd.Streaming(driver.num_partition)
f = rdd.FlatMap(lines, lambda x: parse_lines(x))
m = rdd.Map(f, lambda x: (x, 1))
counts = rdd.ReduceByKey(m, lambda a, b: a + b)
counts.collect(driver)


if __name__ == '__main__':
    name, master_address, self_address, interval = sys.argv
    # word count streaming client
    word_count_client = StreamingWordCountClient(master_address, int(interval))
    obj = pickle_object(word_count_client)
    # assign job
    client = get_client(master_address)
    job_id = execute_command(client, client.get_job, obj, self_address)
    debug_print_by_name('wentao', str(job_id))
    # feed input data to the submitted job in a separate greenlet
    send_data_thread = gevent.spawn(send_word, job_id, master_address)
    print "[Client]Job Submited...."
    client_thread = gevent.spawn(word_count_client.start_server, self_address)
    gevent.joinall([send_data_thread, client_thread])
    # word_count_client.start_server(self_address)
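As with the PageRank listing, the helper parse_lines is assumed rather than shown. A minimal sketch, assuming each streamed record is a block of text to be tokenized on whitespace:

def parse_lines(x):
    # Split an incoming text block into individual words.
    return x.split()

send_word (also not shown here) presumably pushes input records for the returned job_id to the master, while start_server keeps the client alive so it can receive the collected counts; both run as gevent greenlets so submission, data feeding, and result collection can proceed concurrently.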