def FlowJob(x: flow.typing.Numpy.Placeholder((4, 3, 2, 3), dtype=flow.float)):
    """Add variable ``v1`` to ``x`` and reshape the sum like variable ``v2``.

    Everything up to the reshape runs under the ``("gpu", "0:0-3", (2, 2))``
    placement with the two-axis distribution ``["S(0)", "S(2)"]``. The
    reshaped blob is then cast to the single-axis distribution ``["S(0)"]``
    and returned.
    """
    with flow.scope.placement("gpu", "0:0-3", (2, 2)):
        # Trainable, zero-initialised variable with the same shape as the input.
        addend = flow.get_variable(
            "v1",
            shape=(4, 3, 2, 3),
            dtype=flow.float,
            initializer=flow.constant_initializer(0),
            trainable=True,
            parallel_distribution=["S(0)", "S(2)"],
        )
        # Only used as the shape reference for reshape_like below.
        shape_ref = flow.get_variable(
            "v2",
            shape=(4, 3, 6),
            dtype=flow.float,
            initializer=flow.constant_initializer(0),
            trainable=True,
            parallel_distribution=["S(0)", "S(2)"],
        )
        shifted = flow.hierarchical_parallel_cast(
            x, parallel_distribution=["S(0)", "S(2)"]
        )
        shifted += addend
        reshaped = flow.reshape_like(shifted, shape_ref)
    # Single-axis cast, issued after leaving the (2, 2) placement.
    return flow.hierarchical_parallel_cast(
        reshaped, parallel_distribution=["S(0)"]
    )
def gpt_loader_fn() -> flow.typing.Numpy:
    """Load a batch of tokens with the Megatron GPT mmap loader and broadcast it.

    All loader settings (``data_file_prefix``, ``seq_length``, ``num_samples``,
    ``batch_size``, ``dtype``, ``shuffle``, ``random_seed``, ``split_sizes``,
    ``split_index``, ``parallel_distribution``, ``start_from_saved_progress``)
    as well as ``device_strs`` and ``parallel_hierachy`` are read from the
    enclosing scope.

    Returns:
        The loaded tokens after a final cast to ``["B"]``.
    """
    # A list with more than one entry gets an extra two-axis broadcast cast
    # inside the placement.
    needs_two_axis_cast = (
        isinstance(parallel_distribution, list)
        and len(parallel_distribution) > 1
    )

    with flow.scope.placement("cpu", device_strs, parallel_hierachy):
        batch = flow.data.megatron_gpt_mmap_data_loader(
            data_file_prefix=data_file_prefix,
            seq_length=seq_length,
            num_samples=num_samples,
            batch_size=batch_size,
            dtype=dtype,
            shuffle=shuffle,
            random_seed=random_seed,
            split_sizes=split_sizes,
            split_index=split_index,
            parallel_distribution=parallel_distribution,
            start_from_saved_progress=start_from_saved_progress,
            name="GPTDataLoader",
        )
        if needs_two_axis_cast:
            batch = flow.hierarchical_parallel_cast(
                batch, parallel_distribution=["B", "B"]
            )

    return flow.hierarchical_parallel_cast(batch, parallel_distribution=["B"])
def FlowJob(x: flow.typing.Numpy.Placeholder((4, 6), dtype=flow.float)):
    """Add a trainable variable to ``x``, reshape to ``(4, 2, 3)`` and minimise it.

    The addition and reshape run under the ``("gpu", "0:0-3", (2, 2))``
    placement with distribution ``["S(0)", "S(1)"]``. The result is then cast
    to ``["S(0)"]``, handed to an SGD optimizer (constant learning rate 1e-4,
    momentum 0) and returned.
    """
    with flow.scope.placement("gpu", "0:0-3", (2, 2)):
        # The variable is registered under the name "x".
        weight = flow.get_variable(
            "x",
            shape=(4, 6),
            dtype=flow.float,
            initializer=flow.constant_initializer(0),
            trainable=True,
            parallel_distribution=["S(0)", "S(1)"],
        )
        summed = flow.hierarchical_parallel_cast(
            x, parallel_distribution=["S(0)", "S(1)"]
        )
        summed += weight
        reshaped = flow.reshape(summed, (4, 2, 3))

    # Single-axis cast, issued after leaving the (2, 2) placement.
    loss = flow.hierarchical_parallel_cast(
        reshaped, parallel_distribution=["S(0)"]
    )
    schedule = flow.optimizer.PiecewiseConstantScheduler([], [1e-4])
    flow.optimizer.SGD(schedule, momentum=0).minimize(loss)
    return loss
def test_fn(
    a: flow.typing.Numpy.Placeholder(a_shape),
    b: flow.typing.Numpy.Placeholder(b_shape),
    c: flow.typing.Numpy.Placeholder(c_shape),
) -> flow.typing.Numpy:
    """Compute ``bias_add(matmul(var_a * a, b), c)`` with explicit parallel casts.

    ``a_shape``, ``b_shape`` and ``c_shape`` are read from the enclosing scope.
    The result is minimised with SGD (constant learning rate 0.001, momentum 0)
    and returned.
    """
    # Ones-initialised variable, split along axis 1.
    scale = flow.get_variable(
        name="var_a",
        shape=a_shape,
        dtype=flow.float32,
        initializer=flow.ones_initializer(),
        distribute=flow.distribute.split(1),
    )

    # Cast ``a`` to S(1) (S0 -> S1) before the element-wise product.
    a_split = flow.hierarchical_parallel_cast(a, parallel_distribution=["S(1)"])
    scaled = scale * a_split
    product = flow.matmul(scaled, b)

    # Cast the matmul output to broadcast (P -> B).
    product = flow.hierarchical_parallel_cast(
        product, parallel_distribution=["B"]
    )

    # Cast the bias to broadcast (S0 -> B) before adding it.
    bias = flow.hierarchical_parallel_cast(c, parallel_distribution=["B"])
    biased = flow.nn.bias_add(product, bias)

    schedule = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
    flow.optimizer.SGD(schedule, momentum=0).minimize(biased)
    return biased
def test_fn(
    x: flow.typing.Numpy.Placeholder((1024, 4)),
    indices: flow.typing.Numpy.Placeholder(shape=(12,), dtype=flow.int32),
) -> flow.typing.Numpy:
    """Gather rows of ``x + v`` by ``indices`` on a (2, 2) GPU hierarchy.

    The gather output is cast to ``["B", "S(0)"]`` with a manual gradient
    distribution, passed through ReLU, broadcast on both axes and finally cast
    to ``["B"]``. The result is minimised with SGD (constant learning rate
    1e-3, momentum 0) and returned.
    """
    with flow.scope.placement("gpu", "0:0-3", (2, 2)):
        table = flow.hierarchical_parallel_cast(
            x, parallel_distribution=["S(0)", "S(0)"]
        )
        ids = flow.hierarchical_parallel_cast(
            indices, parallel_distribution=["B", "B"]
        )
        table = flow.hierarchical_parallel_cast(
            table, parallel_distribution=["S(0)", "B"]
        )

        # Zero-initialised variable laid out like the casted input.
        offset = flow.get_variable(
            name="v",
            shape=(1024, 4),
            parallel_distribution=["S(0)", "B"],
            initializer=flow.zeros_initializer(),
        )
        table = table + offset

        ids = flow.hierarchical_parallel_cast(
            ids, parallel_distribution=["B", "S(0)"]
        )
        picked = flow.gather(table, ids)

        # The backward distribution is given explicitly via grad_mode="manual".
        picked = flow.hierarchical_parallel_cast(
            picked,
            parallel_distribution=["B", "S(0)"],
            grad_mode="manual",
            grad_parallel_distribution=["B", "S(0)"],
        )
        activated = flow.math.relu(picked)
        activated = flow.hierarchical_parallel_cast(
            activated, parallel_distribution=["B", "B"]
        )

    # Single-axis cast, issued after leaving the (2, 2) placement.
    result = flow.hierarchical_parallel_cast(
        activated, parallel_distribution=["B"]
    )
    schedule = flow.optimizer.PiecewiseConstantScheduler([], [1e-3])
    flow.optimizer.SGD(schedule, momentum=0).minimize(result)
    return result
def test_fn(
    x: flow.typing.Numpy.Placeholder((1024, 1024)),
    indices: flow.typing.Numpy.Placeholder(shape=(64,), dtype=flow.int32),
) -> flow.typing.Numpy:
    """Gather ``x`` by ``indices`` on a (2, 2) GPU hierarchy and cast to ``dst``.

    ``src`` and ``dst`` are not parameters; they are read from the enclosing
    scope and indexed as sequences of distribution strings.

    * ``src[0]`` selects a first pair of casts for ``x`` and ``indices``;
      ``src[1]`` selects an optional second pair. An unrecognised ``src[1]``
      applies no second cast and raises nothing.
    * The gather output is cast to ``dst`` (op name ``"gather_cast"``), then to
      a two-axis distribution chosen by ``dst[0]``, and finally to ``["B"]``.

    Returns:
        The gathered blob after the final ``["B"]`` cast.

    Raises:
        NotImplementedError: if ``src[0]`` is not one of ``"S(0)"``, ``"P"``,
            ``"B"``, or ``dst[0]`` is not one of ``"S(0)"``, ``"B"``,
            ``"S(1)"``.
    """
    with flow.scope.placement("gpu", "0:0-3", (2, 2)):
        if src[0] == "S(0)":
            x = flow.hierarchical_parallel_cast(
                x, parallel_distribution=["B", "B"]
            )
            indices = flow.hierarchical_parallel_cast(
                indices, parallel_distribution=["S(0)", "S(0)"]
            )
            if src[1] == "S(0)":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["B", "B"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["S(0)", "S(0)"],
                )
            elif src[1] == "S(1)":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["B", "S(1)"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["S(0)", "B"],
                )
            elif src[1] == "P":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["B", "S(0)"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["S(0)", "B"],
                )
            elif src[1] == "B":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["B", "B"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["S(0)", "B"],
                )
        elif src[0] == "P":
            x = flow.hierarchical_parallel_cast(
                x, parallel_distribution=["S(0)", "S(0)"]
            )
            indices = flow.hierarchical_parallel_cast(
                indices, parallel_distribution=["B", "B"]
            )
            if src[1] == "S(0)":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["S(0)", "B"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["B", "S(0)"],
                )
            elif src[1] == "S(1)":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["S(0)", "S(1)"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["B", "B"]
                )
            elif src[1] == "P":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["S(0)", "S(0)"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["B", "B"]
                )
            elif src[1] == "B":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["S(0)", "B"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["B", "B"]
                )
        elif src[0] == "B":
            x = flow.hierarchical_parallel_cast(
                x, parallel_distribution=["B", "B"]
            )
            indices = flow.hierarchical_parallel_cast(
                indices, parallel_distribution=["B", "B"]
            )
            # The second axis is selected by src[1], as in the branches above.
            # Comparing the whole ``src`` sequence against a string would never
            # match and would leave these cases without their second cast.
            if src[1] == "S(0)":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["B", "B"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["B", "S(0)"],
                )
            elif src[1] == "S(1)":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["B", "S(1)"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["B", "B"]
                )
            elif src[1] == "P":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["B", "S(0)"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["B", "B"]
                )
            elif src[1] == "B":
                x = flow.hierarchical_parallel_cast(
                    x, parallel_distribution=["B", "B"]
                )
                indices = flow.hierarchical_parallel_cast(
                    indices, parallel_distribution=["B", "B"]
                )
        else:
            raise NotImplementedError

        x = flow.gather(x, indices)
        # Cast the gather output to the distribution under test.
        x = flow.hierarchical_parallel_cast(
            x, parallel_distribution=dst, name="gather_cast",
        )

        # Follow-up two-axis cast chosen by the first axis of ``dst``.
        if dst[0] == "S(0)":
            x = flow.hierarchical_parallel_cast(
                x, parallel_distribution=["S(0)", "S(0)"],
            )
        elif dst[0] == "B":
            x = flow.hierarchical_parallel_cast(
                x, parallel_distribution=["B", "B"],
            )
        elif dst[0] == "S(1)":
            x = flow.hierarchical_parallel_cast(
                x, parallel_distribution=["S(1)", "S(1)"],
            )
        else:
            raise NotImplementedError

    # Single-axis cast, issued after leaving the (2, 2) placement.
    x = flow.hierarchical_parallel_cast(x, parallel_distribution=["B"])
    return x