def start_server(args):
    """Start kvstore service.

    Builds a ``KVServer`` from the IP config file, registers the
    ``'entity_embed'`` tensor on it, prints the server info and starts it.

    Only the master server (``get_id() % get_group_count() == 0``) hands real
    tensors to the kvstore; every other server registers the name alone.  The
    tensors are therefore built only on the master, instead of being allocated
    and thrown away on the other servers.

    Parameters
    ----------
    args : object
        Parsed options. The attributes read here are ``ip_config``,
        ``server_id``, ``num_client``, ``dim_size`` and ``num_servers``.

    Notes
    -----
    Relies on the module-level ``num_entries`` (rows per machine).
    """
    server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)
    my_server = KVServer(server_id=args.server_id,
                         server_namebook=server_namebook,
                         num_client=args.num_client)

    name = 'entity_embed'
    is_master = my_server.get_id() % my_server.get_group_count() == 0

    if is_master:
        # Embedding rows held by this machine.
        data = F.zeros((num_entries, args.dim_size), F.float32, F.cpu())

        # Global-to-local ID map: only this machine's slice of the global ID
        # space is filled in (with 0..num_entries-1); the rest stays zero.
        machine_id = my_server.get_machine_id()
        start = num_entries * machine_id
        end = num_entries * (machine_id + 1)
        g2l = F.zeros(num_entries * args.num_servers, F.int64, F.cpu())
        g2l[start:end] = F.arange(0, num_entries)

        # Partition book: [0]*num_entries + [1]*num_entries + ...
        partition = F.tensor(np.repeat(np.arange(args.num_servers), num_entries))

        my_server.set_global2local(name=name, global2local=g2l)
        my_server.init_data(name=name, data_tensor=data)
        my_server.set_partition_book(name=name, partition_book=partition)
    else:
        # Non-master servers register the name without supplying tensors.
        my_server.set_global2local(name=name)
        my_server.init_data(name=name)
        my_server.set_partition_book(name=name)

    my_server.print()
    my_server.start()
def _init_data(self, name, shape, init_type, low, high):
    """Create a float32 CPU tensor and store it in the kvstore under ``name``.

    Parameters
    ----------
    name : str
        data name
    shape : list of int
        The tensor shape
    init_type : str
        initialize method, either 'zero' or 'uniform'
    low : float
        min threshold (only used by 'uniform')
    high : float
        max threshold (only used by 'uniform')

    Raises
    ------
    RuntimeError
        If ``init_type`` is neither 'zero' nor 'uniform'.
    """
    if init_type == 'zero':
        tensor = F.zeros(shape=shape, dtype=F.float32, ctx=F.cpu())
    elif init_type == 'uniform':
        tensor = F.uniform(shape=shape,
                           dtype=F.float32,
                           ctx=F.cpu(),
                           low=low,
                           high=high)
    else:
        raise RuntimeError('Unknown initial method')

    self._data_store[name] = tensor
def _bench_sample_ids(args, shifted_machine_id):
    """Draw ``args.data_size`` random IDs in ``[0, args.graph_size)``.

    On the machine whose ``args.machine_id`` equals ``shifted_machine_id`` the
    IDs are shifted up by ``args.graph_size``.
    """
    ids = np.random.randint(args.graph_size, size=args.data_size)
    if args.machine_id == shifted_machine_id:
        ids = ids + args.graph_size
    return F.tensor(ids)


def _bench_report(label, args, elapsed, rounds):
    """Print the throughput line for one benchmark stage."""
    total_bytes = (args.data_size * (args.dim + 2) * 4) * rounds * args.num_client / 2
    print("%s Throughput (MB): %f" % (label, total_bytes / elapsed / 1024.0 / 1024.0))


def _bench_pull(kvclient, args, label, shifted_machine_id, rounds=100):
    """Time ``rounds`` pulls of one random ID batch and report throughput."""
    id_tensor = _bench_sample_ids(args, shifted_machine_id)
    start = time.perf_counter()
    for _ in range(rounds):
        kvclient.pull(name='data', id_tensor=id_tensor)
    elapsed = time.perf_counter() - start
    _bench_report(label, args, elapsed, rounds)


def _bench_push(kvclient, args, label, shifted_machine_id, data_tensor, rounds=100):
    """Time ``rounds`` pushes of one random ID batch and report throughput.

    The timed region is bracketed by barriers, so it includes the wait at the
    closing barrier.
    """
    id_tensor = _bench_sample_ids(args, shifted_machine_id)
    kvclient.barrier()
    start = time.perf_counter()
    for _ in range(rounds):
        kvclient.push(name='data', id_tensor=id_tensor, data_tensor=data_tensor)
    kvclient.barrier()
    elapsed = time.perf_counter() - start
    _bench_report(label, args, elapsed, rounds)


def start_client(args):
    """Run the kvstore pull/push throughput benchmark from a client.

    Connects to the servers listed in ``args.ip_config``, then runs six timed
    stages in order: local and remote fast-pull, local and remote pull (after
    registering ``udf_pull`` as the pull handler for ``'data'``), and local and
    remote push.  Each stage prints one throughput line.  Finally the servers
    are shut down and the client is finalized.

    "Local" stages shift the sampled IDs by ``graph_size`` on machine 1,
    "remote" stages shift them on machine 0.
    NOTE(review): this presumably assumes exactly two machines, with machine
    ``m`` owning IDs ``[m * graph_size, (m + 1) * graph_size)`` -- confirm
    against the partition policy.

    Parameters
    ----------
    args : object
        Parsed options. The attributes read here (directly or through the
        helpers) are ``range``, ``ip_config``, ``machine_id``, ``graph_size``,
        ``data_size``, ``dim`` and ``num_client``.
    """
    # Only the partition book is needed by the client; the policy is unused.
    if args.range == -1:
        _, gpb = create_partition_policy(args)
    else:
        _, gpb = create_range_partition_policy(args)

    print("create data...")
    # The result is not used by the client; the call is kept so the work done
    # between the two messages is unchanged.
    create_data(args)
    print("Create data done.")

    dgl.distributed.connect_to_server(ip_config=args.ip_config)
    kvclient = dgl.distributed.KVClient(ip_config=args.ip_config)
    kvclient.barrier()
    kvclient.map_shared_data(partition_book=gpb)

    # Fast-pull: no user-defined pull handler is registered yet.
    _bench_pull(kvclient, args, "Local fast-pull", shifted_machine_id=1)
    _bench_pull(kvclient, args, "Remote fast-pull", shifted_machine_id=0)

    # Pull through the user-defined handler.
    kvclient.register_pull_handler('data', udf_pull)
    kvclient.barrier()
    _bench_pull(kvclient, args, "Local pull", shifted_machine_id=1)
    _bench_pull(kvclient, args, "Remote pull", shifted_machine_id=0)

    # Push: the same zero payload is reused by both push stages.
    data_tensor = F.zeros((args.data_size, args.dim), F.float32, F.cpu())
    _bench_push(kvclient, args, "Local push", 1, data_tensor)
    _bench_push(kvclient, args, "Remote push", 0, data_tensor)

    dgl.distributed.shutdown_servers()
    dgl.distributed.finalize_client()
def create_data(args):
    """Create the data held by server nodes.

    Returns a zero-filled float32 CPU tensor of shape
    ``(args.graph_size, args.dim)``.
    """
    shape = (args.graph_size, args.dim)
    return F.zeros(shape, F.float32, F.cpu())
def zeros_init(shape, dtype):
    """Return a zero-filled CPU tensor with the given shape and dtype."""
    cpu_ctx = F.cpu()
    return F.zeros(shape, dtype=dtype, ctx=cpu_ctx)
def start_client(args):
    """Start client and benchmark local pull, tensor slicing and remote pull.

    Connects a ``KVClient`` to the servers in ``args.ip_config`` and runs three
    timed stages, each printing elapsed time and bytes moved:

    1. pull 10000 random batches whose IDs fall in this machine's range
       ``[local_start, local_end)``;
    2. slice the same batches out of a plain ``(num_entries, ndim)`` tensor;
    3. pull 1000 random batches whose IDs fall outside this machine's range.

    The plain tensor in stage 2 has only ``num_entries`` rows, so it is
    indexed with the batch IDs rebased to ``[0, num_entries)``.  Indexing it
    with the global IDs would be out of range on every machine except
    machine 0.

    After the final barrier, the client with id 0 shuts the servers down.

    Parameters
    ----------
    args : object
        Parsed options. The attributes read here are ``ip_config``,
        ``batch_size`` and ``num_servers``.

    Notes
    -----
    Relies on the module-level ``num_entries`` (rows per machine).  Byte counts
    assume 4 bytes per element.
    """
    server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)
    my_client = KVClient(server_namebook=server_namebook)
    my_client.connect()
    my_client.print()
    my_client.barrier()

    machine_id = my_client.get_machine_id()
    local_start = num_entries * machine_id
    local_end = num_entries * (machine_id + 1)
    total_entries = num_entries * args.num_servers

    def report(started, moved_bytes):
        print("Total time: %.3f, #bytes: %.3f GB"
              % (time.perf_counter() - started, moved_bytes / 1000 / 1000 / 1000))

    # Keep the numpy batches so the same samples can be rebased for stage 2.
    local_range = np.arange(local_start, local_end)
    local_batches = [np.random.choice(local_range, args.batch_size)
                     for _ in range(10000)]
    id_list = [F.tensor(batch) for batch in local_batches]

    print("Pull from local...")
    num_bytes = 0
    start = time.perf_counter()
    for ids in id_list:
        tmp = my_client.pull(name='entity_embed', id_tensor=ids)
        num_bytes += np.prod(tmp.shape) * 4
    report(start, num_bytes)
    ndim = tmp.shape[1]  # embedding width, taken from the last pulled batch
    my_client.barrier()

    # Same access pattern against an in-process tensor, for comparison.
    arr = F.zeros((num_entries, ndim), F.float32, F.cpu())
    row_list = [F.tensor(batch - local_start) for batch in local_batches]
    print('Slice from a tensor...')
    num_bytes = 0
    start = time.perf_counter()
    for rows in row_list:
        tmp = arr[rows]
        num_bytes += np.prod(tmp.shape) * 4
    report(start, num_bytes)

    print("Pull from remote...")
    # Every global ID outside [local_start, local_end); either side may be
    # empty when this machine owns the first or last range.
    remote_range = np.concatenate((np.arange(0, local_start),
                                   np.arange(local_end, total_entries)))
    id_list = [F.tensor(np.random.choice(remote_range, args.batch_size))
               for _ in range(1000)]
    num_bytes = 0
    start = time.perf_counter()
    for ids in id_list:
        tmp = my_client.pull(name='entity_embed', id_tensor=ids)
        num_bytes += np.prod(tmp.shape) * 4
    report(start, num_bytes)
    my_client.barrier()

    if my_client.get_id() == 0:
        my_client.shut_down()