import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.unique_name as nameGen
from paddle.fluid import core
import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveAllgatherAPI(TestCollectiveAPIRunnerBase):
    """Runner exercising paddle.distributed.all_gather in static-graph mode."""

    def __init__(self):
        # Ring id 0 is the default global communication ring.
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program, rank):
        """Build a program that all-gathers a [10, 1000] float32 input."""
        with fluid.program_guard(main_prog, startup_program):
            gathered = []
            indata = layers.data(
                name="tindata", shape=[10, 1000], dtype='float32')
            # all_gather fills `gathered` with one tensor per rank.
            paddle.distributed.all_gather(gathered, indata)
            return gathered


if __name__ == "__main__":
    runtime_main(TestCollectiveAllgatherAPI, "allgather")
        # NOTE(review): fragment — this chunk starts mid-method; the enclosing
        # class/def header is outside this view.
        # Seed numpy per-process so each rank draws different random data.
        seed = os.getpid()
        np.random.seed(seed)
        in_feat = 2
        n_expert = 2
        world_size = 2
        tot_expert = n_expert * world_size
        # Per-local-expert token counts drawn from [1, 4).
        local_expert_count = np.random.randint(
            1, 4, size=tot_expert).astype("int")
        fwd_expert_count = sum(local_expert_count)
        local_input_buf = np.random.rand(fwd_expert_count,
                                         in_feat).astype("float32")
        local_expert_count = paddle.to_tensor(local_expert_count)
        local_input_buf = paddle.to_tensor(local_input_buf)
        global_expert_count = []
        # Exchange per-expert counts across ranks; alltoall fills
        # `global_expert_count` in place.
        paddle.distributed.alltoall(
            paddle.split(
                local_expert_count, 2, axis=0),
            global_expert_count)
        global_expert_count = paddle.concat(global_expert_count, axis=0)
        local_input_buf.stop_gradient = False
        # Scatter token buffers to the ranks that own the target experts.
        output = paddle.distributed.utils.global_scatter(
            local_input_buf, local_expert_count, global_expert_count)
        output.stop_gradient = False
        # Square the output and backprop so the input gradient is populated.
        c = output * output
        c.backward()
        return [output.numpy(), local_input_buf.grad.numpy()]


if __name__ == "__main__":
    runtime_main(TestCollectiveGlobalScatterAPI, "global_scatter")
# NOTE(review): `import paddle` added — `paddle` is referenced below, but
# `import paddle.fluid as fluid` binds only the name `fluid`, not `paddle`.
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.unique_name as nameGen
from paddle.fluid import core
import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveAllToAllAPI(TestCollectiveAPIRunnerBase):
    """Runner exercising paddle.distributed.alltoall in static-graph mode."""

    def __init__(self):
        # Ring id 0 is the default global communication ring.
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program, rank):
        """All-to-all exchange of two [5, 1000] slices of the input."""
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(
                name="tindata", shape=[10, 1000], dtype='float32')
            # One slice per rank; alltoall fills `tout_data` in place.
            tindata = paddle.split(tindata, 2, axis=0)
            tout_data = []
            paddle.distributed.alltoall(tindata, tout_data)
            return tout_data


if __name__ == "__main__":
    runtime_main(TestCollectiveAllToAllAPI, "alltoall")
# NOTE(review): `import paddle` / `import paddle.fluid as fluid` added — both
# names are used below but neither was imported in this chunk.
import paddle
import paddle.fluid as fluid
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveScatterAPI(TestCollectiveAPIRunnerBase):
    """Runner exercising paddle.distributed.scatter in static-graph mode."""

    def __init__(self):
        # Ring id 0 is the default global communication ring.
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program, rank):
        """Scatter two [5, 1000] slices of the input from rank 1."""
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(
                name="tindata",
                shape=[10, 1000],
                dtype='float64',
                append_batch_size=False)
            toutdata = layers.fill_constant(
                shape=[5, 1000], dtype='float64', value=1.0)
            # Only the source rank supplies the tensor list to scatter.
            tensor_list = None
            if rank == 1:
                tensor_list = paddle.split(tindata, 2, axis=0)
            paddle.distributed.scatter(toutdata, tensor_list, src=1)
            return [toutdata]


if __name__ == "__main__":
    runtime_main(TestCollectiveScatterAPI, "scatter")
        # NOTE(review): fragment — starts mid-method; the enclosing class/def
        # header and the `np_array` setup are outside this view.
        data = paddle.static.data(
            name='tindata', shape=[10, 1000], dtype="float32")
        paddle.distributed.broadcast(data, src=0)
        # Each rank keeps its own column slice of the broadcast input.
        data = paddle.split(data, 2, axis=1)[rank]
        # Each rank initializes its own 500-row shard of the weight.
        if rank == 0:
            param_attr = paddle.fluid.ParamAttr(
                initializer=paddle.fluid.initializer.NumpyArrayInitializer(
                    np_array[0:500, :]), )
        else:
            param_attr = paddle.fluid.ParamAttr(
                initializer=paddle.fluid.initializer.NumpyArrayInitializer(
                    np_array[500:1000, :]), )
        # Row-parallel linear: (1000, 16) weight partitioned along axis 0
        # across 2 ranks.
        linear_out = paddle.distributed.split(
            data,
            size=(1000, 16),
            operation='linear',
            axis=0,
            num_partitions=2,
            weight_attr=param_attr,
            bias_attr=True, )
        return [linear_out]


if __name__ == "__main__":
    runtime_main(TestRowParallelLinearAPI, "row_parallel_linear")
        # NOTE(review): fragment — starts mid-method; the enclosing class/def
        # header is outside this view.
        fleet.init(is_collective=True)
        # Fixed seed so every rank builds the same reference weight matrix.
        np.random.seed(2020)
        np_array = np.random.rand(1000, 16)
        data = paddle.static.data(
            name='tindata', shape=[10, 1000], dtype="float32")
        paddle.distributed.broadcast(data, src=0)
        # Each rank initializes its own 8-column shard of the weight.
        if rank == 0:
            param_attr = paddle.fluid.ParamAttr(
                initializer=paddle.fluid.initializer.NumpyArrayInitializer(
                    np_array[:, 0:8]), )
        else:
            param_attr = paddle.fluid.ParamAttr(
                initializer=paddle.fluid.initializer.NumpyArrayInitializer(
                    np_array[:, 8:16]), )
        # Column-parallel linear: (1000, 16) weight partitioned along axis 1
        # across 2 ranks.
        linear_out = paddle.distributed.split(
            data,
            size=(1000, 16),
            operation='linear',
            axis=1,
            num_partitions=2,
            weight_attr=param_attr,
            bias_attr=True, )
        return [linear_out]


if __name__ == "__main__":
    runtime_main(TestColumnParallelLinearAPI, "column_parallel_linear")
# NOTE(review): `import paddle` / `import paddle.fluid as fluid` added — both
# names are used below but neither was imported in this chunk.
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveSendRecvAPI(TestCollectiveAPIRunnerBase):
    """Runner exercising paddle.distributed.send/recv point-to-point ops."""

    def __init__(self):
        # Ring id 0 is the default global communication ring.
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program, rank):
        """Rank 0 sends the input tensor to rank 1; other ranks receive."""
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(
                name="tindata",
                shape=[10, 1000],
                dtype='float32',
                append_batch_size=False)
            if rank == 0:
                paddle.distributed.send(tindata, dst=1)
            else:
                paddle.distributed.recv(tindata, src=0)
            return [tindata]


if __name__ == "__main__":
    runtime_main(TestCollectiveSendRecvAPI, "sendrecv")
# (num_embeddings, embedding_dim) = (12, 8) size = (12, 8) np_array = np.random.rand(size[0], size[1]) paddle.seed(2020) data_in = paddle.randint(0, size[0], shape=(10, 4)) data = paddle.static.data( name='tindata', shape=[10, 1000], dtype="float32") per_part_size = size[0] // 2 if rank == 0: param_attr = paddle.fluid.ParamAttr( initializer=paddle.fluid.initializer.NumpyArrayInitializer( np_array[0:per_part_size, :]), ) else: param_attr = paddle.fluid.ParamAttr( initializer=paddle.fluid.initializer.NumpyArrayInitializer( np_array[per_part_size:size[0], :]), ) emb_out = paddle.distributed.split( data_in, size, operation="embedding", num_partitions=2, weight_attr=param_attr) return [data_in, emb_out] if __name__ == "__main__": runtime_main(TestParallelEmbeddingAPI, "parallel_embedding")
    def get_model(self, main_prog, startup_program, rank):
        """Build a parallel-embedding model whose weight shards (rows 0:5 and
        5:9 of a 9x8 array) are unequal across the 2 partitions.

        NOTE(review): the enclosing class header is outside this view.
        """
        with fluid.program_guard(main_prog, startup_program):
            fleet.init(is_collective=True)
            # Fixed seeds so both ranks draw identical weights and ids.
            np.random.seed(2020)
            np_array = np.random.rand(9, 8)
            paddle.seed(2020)
            data_in = paddle.randint(0, 7, shape=(10, 4))
            # NOTE(review): `data` is declared but never used below —
            # presumably kept for parity with sibling tests; confirm before
            # removing.
            data = paddle.static.data(
                name='tindata', shape=[10, 1000], dtype="float32")
            # Each rank initializes its own (uneven) shard of the table.
            if rank == 0:
                param_attr = paddle.fluid.ParamAttr(
                    initializer=paddle.fluid.initializer.NumpyArrayInitializer(
                        np_array[0:5, :]), )
            else:
                param_attr = paddle.fluid.ParamAttr(
                    initializer=paddle.fluid.initializer.NumpyArrayInitializer(
                        np_array[5:9, :]), )
            emb_out = paddle.distributed.split(
                data_in, (7, 8),
                operation="embedding",
                num_partitions=2,
                weight_attr=param_attr)
            return [data_in, emb_out]


if __name__ == "__main__":
    runtime_main(TestParallelEmbeddingAPINoneDivisible, "parallel_embedding")
import socket
from contextlib import closing
from six import string_types
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.unique_name as nameGen
from paddle.fluid import core
import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveBarrierAPI(TestCollectiveAPIRunnerBase):
    """Runner exercising paddle.distributed.barrier in static-graph mode."""

    def __init__(self):
        # Ring id 0 is the default global communication ring.
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program, rank):
        """Insert a barrier op; there is no tensor output to fetch."""
        with fluid.program_guard(main_prog, startup_program):
            paddle.distributed.barrier()
        return []


if __name__ == "__main__":
    runtime_main(TestCollectiveBarrierAPI, "barrier")
# NOTE(review): `import paddle` / `import paddle.fluid as fluid` added — both
# names are used below but neither was imported in this chunk.
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.unique_name as nameGen
from paddle.fluid import core
import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveAllreduceNewGroupAPI(TestCollectiveAPIRunnerBase):
    """Runner exercising all_reduce over an explicitly created new group."""

    def __init__(self):
        # Ring id 0 is the default global communication ring.
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program, rank):
        """All-reduce a [10, 1000] input over a new group of ranks 0 and 1."""
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(
                name="tindata", shape=[10, 1000], dtype='float32')
            gp = paddle.distributed.new_group([0, 1])
            paddle.distributed.all_reduce(
                tindata, group=gp, use_calc_stream=False)
            return [tindata]


if __name__ == "__main__":
    runtime_main(TestCollectiveAllreduceNewGroupAPI, "allreduce")
from six import string_types
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.unique_name as nameGen
from paddle.fluid import core
import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase):
    """Runner exercising paddle.distributed.all_reduce in static-graph mode."""

    def __init__(self):
        # Ring id 0 is the default global communication ring.
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program, rank):
        """All-reduce a [10, 1000] float32 input in place across ranks."""
        with fluid.program_guard(main_prog, startup_program):
            in_tensor = layers.data(
                name="tindata", shape=[10, 1000], dtype='float32')
            paddle.distributed.all_reduce(in_tensor)
            return [in_tensor]


if __name__ == "__main__":
    runtime_main(TestCollectiveAllreduceAPI, "allreduce")
import socket
from contextlib import closing
from six import string_types
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.unique_name as nameGen
from paddle.fluid import core
import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main

# NOTE(review): added — the sibling collective tests all enable static mode
# before building programs with fluid.layers; this file omitted the call.
paddle.enable_static()


class TestCollectiveReduceAPI(TestCollectiveAPIRunnerBase):
    """Runner exercising paddle.distributed.reduce in static-graph mode."""

    def __init__(self):
        # Ring id 0 is the default global communication ring.
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program, rank):
        """Reduce a [10, 1000] float32 input onto rank 0."""
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(
                name="tindata", shape=[10, 1000], dtype='float32')
            paddle.distributed.reduce(tindata, dst=0)
            return [tindata]


if __name__ == "__main__":
    runtime_main(TestCollectiveReduceAPI, "reduce")
        # NOTE(review): fragment — starts mid-method; `seed` is defined
        # outside this view.
        np.random.seed(seed)
        in_feat = 2
        n_expert = 2
        world_size = 2
        tot_expert = n_expert * world_size
        # Per-local-expert token counts drawn from [1, 4).
        local_expert_count = np.random.randint(
            1, 4, size=tot_expert).astype("int")
        local_expert_count = paddle.to_tensor(local_expert_count)
        global_expert_count = []
        # Exchange counts across ranks; alltoall fills `global_expert_count`
        # in place.
        paddle.distributed.alltoall(
            paddle.split(local_expert_count, 2, axis=0), global_expert_count)
        global_expert_count = paddle.concat(global_expert_count, axis=0)
        fwd_expert_count = sum(global_expert_count)
        # Re-seed so the input buffer is reproducible given `seed`.
        np.random.seed(seed)
        local_input_buf = np.random.rand(fwd_expert_count,
                                         in_feat).astype("float32")
        local_input_buf = paddle.to_tensor(local_input_buf)
        local_input_buf.stop_gradient = False
        # Gather token buffers back from the ranks owning each expert.
        output = paddle.distributed.utils.global_gather(
            local_input_buf, local_expert_count, global_expert_count)
        output.stop_gradient = False
        # Square the output and backprop so the input gradient is populated.
        c = output * output
        c.stop_gradient = False
        c.backward()
        return [output.numpy(), local_input_buf.grad.numpy()]


if __name__ == "__main__":
    runtime_main(TestCollectiveGlobalGatherAPI, "global_gather")
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.unique_name as nameGen
from paddle.fluid import core
import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveBroadcastAPI(TestCollectiveAPIRunnerBase):
    """Runner exercising paddle.distributed.broadcast in static-graph mode."""

    def __init__(self):
        # Ring id 0 is the default global communication ring.
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program, rank):
        """Broadcast a [10, 1000] float32 input from rank 1 to all ranks."""
        with fluid.program_guard(main_prog, startup_program):
            bcast_data = layers.data(
                name="tindata", shape=[10, 1000], dtype='float32')
            paddle.distributed.broadcast(bcast_data, src=1)
            return [bcast_data]


if __name__ == "__main__":
    runtime_main(TestCollectiveBroadcastAPI, "broadcast")