Example #1
def update(devices):
    device = devices[0].split("/")[1]
    torch.random.manual_seed(3)
    criterion = DistributedLoss(torch.nn.MSELoss)
    x = torch.randn(8, 4).to(device)
    model = [
        RemoteModuleParams(nn.Linear, (4, 4), {}),
        RemoteModuleParams(nn.ReLU, (), {})
    ]
    pipe = create_sequence_pipeline(model,
                                    balance=[1, 1],
                                    chunks=4,
                                    devices=devices[:2])
    opt = DistributedOptimizer(
        torch.optim.SGD,
        pipe.parameter_rrefs(),
        lr=0.05,
    )
    losses = []
    for i in range(2):
        with dist_autograd.context() as context_id:
            y = pipe(x)
            loss = criterion(y, rpc.RRef(x))
            losses.append(loss)
            loss.backward(context_id)
            opt.step(context_id)
    losses = [l.to_here() for l in losses]
    assert losses[0] > losses[1], f"{losses[0]} !> {losses[1]}"
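These pipeline snippets assume the RPC framework is already initialized on every process before update() is called. A minimal launcher sketch, with assumed worker names, device strings, and a hypothetical run_worker helper (not part of the original example), might look like this:

# Hypothetical launcher for the pipeline examples on this page (assumption, not source code).
import os

import torch.distributed.rpc as rpc
import torch.multiprocessing as mp


def run_worker(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    # Every process joins the same RPC group under a unique name.
    rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    if rank == 0:
        # Only rank 0 drives training; the other ranks host remote modules
        # and serve RPC requests until shutdown.
        update([f"worker{r}/cpu" for r in range(world_size)])
    rpc.shutdown()


if __name__ == "__main__":
    mp.spawn(run_worker, args=(2,), nprocs=2, join=True)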
Example #2
    def _test_dist_optim_none_grads(self, optim_cls, *args, **kwargs):
        # local version
        module1 = MyModule()
        module2 = MyModule(requires_grad=False)
        params = [module1.get_w(), module2.get_w()]
        local_optim = optim_cls(params, *args, **kwargs)

        old_w1 = module1.w.clone().detach()
        old_w2 = module2.w.clone().detach()

        g_cpu = torch.Generator()
        g_cpu.manual_seed(0)
        t1 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
        t2 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
        output1 = module1.forward(t2)
        output2 = module2.forward(output1)
        loss = torch.add(output2, t1).sum()

        loss.backward()
        local_optim.step()

        # distributed version
        owner1 = "worker%d" % ((self.rank + 1) % self.world_size)
        owner2 = "worker%d" % ((self.rank + 2) % self.world_size)

        remote_module1 = rpc.remote(owner1, MyModule)
        remote_module2 = rpc.remote(owner2, MyModule, args=(False, ))
        remote_param1 = remote_module1.remote().get_w()
        remote_param2 = remote_module2.remote().get_w()

        # sanity check: local and remote initial weights should match
        self.assertEqual(old_w1, remote_param1.to_here())
        self.assertEqual(old_w2, remote_param2.to_here())

        dist_optim = DistributedOptimizer(optim_cls,
                                          [remote_param1, remote_param2],
                                          *args, **kwargs)

        with dist_autograd.context() as context_id:
            g_cpu.manual_seed(0)
            t1 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
            t2 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
            output1 = remote_module1.rpc_async().forward(t2)
            output2 = remote_module2.rpc_async().forward(output1.wait())
            loss = torch.add(output2.wait(), t1)

            dist_autograd.backward(context_id, [loss.sum()])
            dist_optim.step(context_id)

            new_w1 = remote_module1.rpc_async().get_w().wait()
            new_w2 = remote_module2.rpc_async().get_w().wait()

            # ensure optimizer changed weights for w1
            self.assertNotEqual(old_w1, new_w1)

            # ensure optimizer not changed weights for w2
            self.assertEqual(old_w2, new_w2)
            # ensure local equals remote
            self.assertEqual(new_w1, module1.get_w())
            self.assertEqual(new_w2, module2.get_w())
Example #3
def auto_graph_extract(devices):
    from fairscale.experimental.nn.distributed_pipeline.trace import make_graph

    device = devices[0].split("/")[1]
    torch.random.manual_seed(3)
    criterion = DistributedLoss(torch.nn.MSELoss)
    x = torch.randn(8, 4).to(device)

    # create model
    model = nn.Sequential(
        RemoteModule(devices[0], nn.Linear, (4, 4), {}),
        ShardedLinearLayer(devices[0], devices, devices[1]),
        RemoteModule(devices[0], nn.Linear, (4, 4), {}),
    )
    graph = make_graph(model)
    pipe = DistributedPipeline(graph, chunks=4)
    partitions = extract_partitions(graph, pipe)
    assert [[0, 1], [2], [3], [4], [5]] == partitions, f"partitions={partitions}"
    parameter_rrefs = pipe.parameter_rrefs()
    assert len(parameter_rrefs) == 8
    opt = DistributedOptimizer(
        torch.optim.SGD,
        parameter_rrefs,
        lr=0.05,
    )
    losses = []
    for i in range(2):
        with dist_autograd.context() as context_id:
            y = pipe(x)
            loss = criterion(y, rpc.RRef(x))
            losses.append(loss)
            loss.backward(context_id)
            opt.step(context_id)
    losses = [l.to_here() for l in losses]
    assert losses[0] > losses[1], f"{losses[0]} !> {losses[1]}"
Example #4
def run_master(split_size):

    # put the two model parts on worker1 and worker2 respectively
    model = DistResNet50(split_size, ["worker1", "worker2"])
    loss_fn = nn.MSELoss()
    opt = DistributedOptimizer(
        optim.SGD,
        model.parameter_rrefs(),
        lr=0.05,
    )

    one_hot_indices = torch.LongTensor(batch_size) \
                           .random_(0, num_classes) \
                           .view(batch_size, 1)

    for i in range(num_batches):
        print(f"Processing batch {i}")
        # generate random inputs and labels
        inputs = torch.randn(batch_size, 3, image_w, image_h)
        labels = torch.zeros(batch_size, num_classes) \
                      .scatter_(1, one_hot_indices, 1)

        with dist_autograd.context() as context_id:
            outputs = model(inputs)
            dist_autograd.backward(context_id, [loss_fn(outputs, labels)])
            opt.step(context_id)
Example #5
def run_training_loop(rank, num_gpus, train_loader, test_loader):
    # Runs the typical neural network forward + backward + optimizer step, but
    # in a distributed fashion.
    net = TrainerNet(num_gpus=num_gpus)
    # Build DistributedOptimizer.
    param_rrefs = net.get_global_param_rrefs()
    opt = DistributedOptimizer(optim.SGD, param_rrefs, lr=0.03)
    for i, (data, target) in enumerate(train_loader):
        with dist_autograd.context() as cid:
            model_output = net(data)
            target = target.to(model_output.device)
            loss = F.nll_loss(model_output, target)
            if i % 5 == 0:
                print(f"Rank {rank} training batch {i} loss {loss.item()}")
            dist_autograd.backward(cid, [loss])
            # Ensure that dist autograd ran successfully and gradients were
            # returned.
            assert remote_method(
                ParameterServer.get_dist_gradients,
                net.param_server_rref,
                cid) != {}
            opt.step(cid)

    print("Training complete!")
    print("Getting accuracy....")
    get_accuracy(test_loader, net)
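run_training_loop above depends on TrainerNet.get_global_param_rrefs and a ParameterServer class defined elsewhere. A plausible sketch of the two methods involved (assumed here, following the parameter-server pattern these examples use) is:

# Sketch only: assumed methods on the classes referenced above.
def get_param_rrefs(self):
    # On the parameter server: wrap each parameter in an RRef so a remote
    # DistributedOptimizer can update it.
    return [rpc.RRef(param) for param in self.model.parameters()]


def get_global_param_rrefs(self):
    # On the trainer: ask the parameter server (via its RRef) for the
    # parameter RRefs and hand them straight to DistributedOptimizer.
    return remote_method(
        ParameterServer.get_param_rrefs, self.param_server_rref)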
Example #6
def run_master():

    # shard the model across worker1 through worker4
    model = DistResNet(["worker1", "worker2", "worker3", "worker4"])
    loss_fn = nn.MSELoss()
    opt = DistributedOptimizer(
        optim.SGD,
        model.parameter_rrefs(),
        lr=0.05,
    )

    one_hot_indices = torch.LongTensor(batch_size) \
                           .random_(0, num_classes) \
                           .view(batch_size, 1)

    for i in range(num_batches):
        print(f"Processing batch {i}")
        # generate random inputs and labels
        inputs = torch.randn(batch_size, 3, image_w, image_h)
        labels = torch.zeros(batch_size, num_classes) \
                      .scatter_(1, one_hot_indices, 1)

        # The distributed autograd context is the dedicated scope for the
        # distributed backward pass to store gradients, which can later be
        # retrieved using the context_id by the distributed optimizer.
        with dist_autograd.context() as context_id:
            outputs = model(inputs)
            dist_autograd.backward(context_id, [loss_fn(outputs, labels)])
            opt.step(context_id)
Example #7
    def test_dist_optim(self):
        # local version
        module1 = MyModule()
        module2 = MyModule()
        params = [module1.get_w(), module2.get_w()]
        local_optim = optim.SGD(params, lr=0.05)

        old_w1 = module1.w.clone().detach()
        old_w2 = module2.w.clone().detach()

        torch.manual_seed(0)
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)
        output1 = module1.forward(t2)
        output2 = module2.forward(output1)
        loss = torch.add(output2, t1).sum()

        loss.backward()
        local_optim.step()

        # distributed version
        owner1 = "worker%d" % ((self.rank + 1) % self.world_size)
        owner2 = "worker%d" % ((self.rank + 2) % self.world_size)

        remote_module1 = rpc.remote(owner1, MyModule)
        remote_module2 = rpc.remote(owner2, MyModule)
        remote_param1 = remote_method(MyModule.get_w, remote_module1)
        remote_param2 = remote_method(MyModule.get_w, remote_module2)

        old_w1_remote = remote_param1.to_here()

        # sanity check: local and remote initial weights should match
        self.assertEqual(old_w1, remote_param1.to_here())
        self.assertEqual(old_w2, remote_param2.to_here())

        dist_optim = DistributedOptimizer(optim.SGD,
                                          [remote_param1, remote_param2],
                                          lr=0.05)

        with dist_autograd.context() as context_id:
            torch.manual_seed(0)
            t1 = torch.rand((3, 3), requires_grad=True)
            t2 = torch.rand((3, 3), requires_grad=True)
            output1 = rpc_async_method(MyModule.forward, remote_module1, t2)
            output2 = rpc_async_method(MyModule.forward, remote_module2,
                                       output1.wait())
            loss = torch.add(output2.wait(), t1)

            dist_autograd.backward(context_id, [loss.sum()])
            dist_optim.step(context_id)

            new_w1 = rpc_async_method(MyModule.get_w, remote_module1).wait()
            new_w2 = rpc_async_method(MyModule.get_w, remote_module2).wait()

            # ensure optimizer changed weights
            self.assertNotEqual(old_w1, new_w1)
            self.assertNotEqual(old_w2, new_w2)
            # ensure local equals remote
            self.assertEqual(new_w1, module1.get_w())
            self.assertEqual(new_w2, module2.get_w())
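The test-style examples here (this one, #2, #8, and #21) reference a MyModule class and remote_method/rpc_async_method helpers defined elsewhere. A minimal sketch consistent with how they are used, assuming a 3x3 weight parameter and helpers that dispatch instance methods through an RRef, is:

# Sketch of the assumed test fixtures (not the original definitions).
import torch
import torch.distributed.rpc as rpc


class MyModule:
    def __init__(self, requires_grad=True):
        # Seed so local and remote instances start with identical weights,
        # which is what the sanity checks above rely on.
        torch.manual_seed(0)
        self.w = torch.rand((3, 3), requires_grad=requires_grad)

    def forward(self, t1):
        return torch.mm(self.w, t1)

    def get_w(self):
        return self.w


def _call_method(method, obj_rref, *args, **kwargs):
    # Runs an instance method on the object the RRef points to (on its owner).
    return method(obj_rref.local_value(), *args, **kwargs)


def remote_method(method, obj_rref, *args, **kwargs):
    # rpc.remote returns an RRef to the result, which is why the examples can
    # call .to_here() on it and pass it to DistributedOptimizer.
    return rpc.remote(obj_rref.owner(), _call_method,
                      args=[method, obj_rref] + list(args), kwargs=kwargs)


def rpc_async_method(method, obj_rref, *args, **kwargs):
    # Asynchronous variant returning a Future (hence the .wait() calls above).
    return rpc.rpc_async(obj_rref.owner(), _call_method,
                         args=[method, obj_rref] + list(args), kwargs=kwargs)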
Example #8
    def test_dist_optim_exception(self):
        # distributed version
        owner1 = "worker%d" % ((self.rank + 1) % self.world_size)
        owner2 = "worker%d" % ((self.rank + 2) % self.world_size)

        remote_module1 = rpc.remote(owner1, MyModule)
        remote_module2 = rpc.remote(owner2, MyModule)
        remote_param1 = remote_method(MyModule.get_w, remote_module1)
        remote_param2 = remote_method(MyModule.get_w, remote_module2)

        dist_optim = DistributedOptimizer(FailingOptimizer,
                                          [remote_param1, remote_param2])

        with dist_autograd.context() as context_id:
            g_cpu = torch.Generator()
            g_cpu.manual_seed(0)
            t1 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
            t2 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
            output1 = rpc_async_method(MyModule.forward, remote_module1, t2)
            output2 = rpc_async_method(MyModule.forward, remote_module2,
                                       output1.wait())
            loss = torch.add(output2.wait(), t1).sum()

            dist_autograd.backward(context_id, [loss])
            with self.assertRaisesRegex(Exception, "Error running optimizer"):
                dist_optim.step(context_id)
Example #9
File: test_rpc.py  Project: zazyzaya/TGCN
def training_loop():
    model = RNN('ps', 3, 10, 1)
    X_tr, y_tr = gen_toy_data()
    X_te, y_te = gen_toy_data()

    opt = DistributedOptimizer(Adam, model.parameter_rrefs(), lr=0.01)

    loss_fn = nn.MSELoss()

    for e in range(100):
        with dist_autograd.context() as context_id:
            y_hat = model(X_tr)
            loss = loss_fn(y_hat, y_tr)

            dist_autograd.backward(context_id, [loss])
            opt.step(context_id)
            # No need to zero grad because it's blown
            # away every step by the dist API

        print("[%d] Loss: %0.4f" % (e, loss.item()))

    y_hat = model(X_te)
    y_hat[y_hat < 0.5] = 0
    y_hat[y_hat >= 0.5] = 1

    correct = float((y_hat == y_te).sum().item())
    total = float(y_hat.size(1))
    print("Final accuracy: %d/%d = %0.4f" % (correct, total, correct / total))
Example #10
def update(devices):
    torch.random.manual_seed(3)
    criterion = DistributedLoss(torch.nn.MSELoss)
    x = torch.randn(8, 4)
    model = [("linear1", nn.Linear, (4, 4), {}), ("relu", nn.ReLU, (), {})]
    pipe = MultiProcessPipe(model,
                            balance=[1, 1],
                            chunks=4,
                            devices=devices[:2])
    params = pipe.parameter_rrefs()
    opt = DistributedOptimizer(
        torch.optim.SGD,
        params,
        lr=0.05,
    )
    losses = []
    for i in range(2):
        with dist_autograd.context() as context_id:
            y = pipe(x)
            loss = criterion(y, rpc.RRef(x))
            losses.append(loss)
            loss.backward(context_id)
            opt.step(context_id)
    losses = [l.to_here() for l in losses]
    assert losses[0] > losses[1], f"{losses[0]} !> {losses[1]}"
Example #11
def train(rrefs, kwargs):
    model = TGCN(rrefs,
                 kwargs['h_size'],
                 kwargs['z_size'],
                 gru_hidden_units=kwargs['n_gru'])

    opt = DistributedOptimizer(Adam, model.parameter_rrefs(), lr=kwargs['lr'])

    times = []
    best = (None, 0)
    no_progress = 0
    for e in range(kwargs['epochs']):
        # Get loss and send backward
        model.train()
        with dist_autograd.context() as context_id:
            st = time.time()
            zs = model.forward(ld.LANL_Data.TRAIN)
            loss = model.loss_fn(zs,
                                 ld.LANL_Data.TRAIN,
                                 nratio=kwargs['nratio'])

            print("backward")
            dist_autograd.backward(context_id, [loss])

            print("step")
            opt.step(context_id)

            elapsed = time.time() - st
            times.append(elapsed)
            print('[%d] Loss %0.4f  %0.2fs' % (e, loss.item(), elapsed))

        # Get validation info to prevent overfitting
        model.eval()
        with torch.no_grad():
            zs = model.forward(ld.LANL_Data.TRAIN, no_grad=True)
            v_loss = model.loss_fn(zs, ld.LANL_Data.VAL).item()

            print("\t Val loss: %0.4f" % v_loss)

            if v_loss > best[1]:
                best = (model.save_states(), v_loss)
            else:
                if e >= kwargs['min']:
                    no_progress += 1

            if no_progress == kwargs['patience']:
                print("Early stopping!")
                break

    model.load_states(best[0][0], best[0][1])
    zs, h0 = model(ld.LANL_Data.TEST, include_h=True)

    states = {'gcn': best[0][0], 'rnn': best[0][1]}
    f = open('model_save.pkl', 'wb+')
    pickle.dump(states, f, protocol=pickle.HIGHEST_PROTOCOL)

    print("Exiting train loop")
    print("Avg TPE: %0.4fs" % (sum(times) / len(times)))

    return model, zs[-1], h0
Example #12
def _run_trainer():
    r"""
    The trainer creates a distributed RNNModel and a DistributedOptimizer. Then,
    it performs training using random input data.
    """
    batch = 5
    ntoken = 7
    ninp = 2

    nhid = 3
    nindices = 6
    nlayers = 4
    hidden = (
        torch.randn(nlayers, nindices, nhid),
        torch.randn(nlayers, nindices, nhid),
    )

    model = rnn.RNNModel("ps", ntoken, ninp, nhid, nlayers)

    # setup distributed optimizer
    opt = DistributedOptimizer(
        optim.SGD,
        model.parameter_rrefs(),
        lr=0.05,
    )

    criterion = torch.nn.CrossEntropyLoss()

    def get_next_batch():
        for _ in range(5):
            data = torch.LongTensor(batch, nindices) % ntoken
            target = torch.LongTensor(batch, ntoken) % nindices
            yield data, target

    # train for 10 iterations
    for epoch in range(10):
        # create distributed autograd context
        for data, target in get_next_batch():
            with dist_autograd.context() as context_id:
                hidden[0].detach_()
                hidden[1].detach_()
                output, hidden = model(data, hidden)
                loss = criterion(output, target)
                # run distributed backward pass
                dist_autograd.backward(context_id, [loss])
                # run distributed optimizer
                opt.step(context_id)
                # not necessary to zero grads as each iteration creates a different
                # distributed autograd context which hosts different grads
        print("Training epoch {}".format(epoch))
Example #13
    def run_one_mini_batch(self):
        with dist_autograd.context() as context_id:
            # Forward pass (create references on remote nodes).
            rref1 = rpc.remote(dst_name, random_tensor)
            rref2 = rpc.remote(dst_name, random_tensor)
            loss = rref1.to_here() + rref2.to_here()

            # Backward pass (run distributed autograd).
            dist_autograd.backward(context_id, [loss.sum()])

            # Build DistributedOptimizer.
            dist_optim = DistributedOptimizer(optim.SGD, [rref1, rref2],
                                              lr=0.05)

            # Run the distributed optimizer step.
            dist_optim.step(context_id)
Example #14
def multi_input_multi_output_layers(devices):
    device = devices[0].split("/")[1]
    torch.random.manual_seed(3)
    criterion = DistributedLoss(torch.nn.MSELoss)
    x = torch.randn(8, 4).to(device)

    #                                / ->linear_layer_2_1
    # input -> linear_layer1 -> split                     ->concatenate
    #                                \ ->linear_layer_2_2

    linear_layer_1 = RemoteModule(devices[0], nn.Linear, (4, 4), {})
    split = RemoteModule(devices[0], SplitTensors, (), {})
    linear_layers_2 = [
        RemoteModule(devices[0], nn.Linear, (2, 2), {}),
        RemoteModule(devices[1], nn.Linear, (2, 2), {}),
    ]
    concatenate = RemoteModule(devices[1], ConcatenateTensors, ())

    graph = PipelineModulesGraph()
    graph.add_sequence([linear_layer_1, split], [0], 2)
    for i, l in enumerate(linear_layers_2):
        graph.add_layer(l, [(split, i)])
    graph.add_layer(concatenate, linear_layers_2)

    pipe = DistributedPipeline(graph, chunks=4)
    assert [[0, 1], [2], [3], [4]] == extract_partitions(graph, pipe)
    parameter_rrefs = pipe.parameter_rrefs()
    assert len(parameter_rrefs) == 6
    opt = DistributedOptimizer(
        torch.optim.SGD,
        parameter_rrefs,
        lr=0.05,
    )
    losses = []
    for i in range(2):
        with dist_autograd.context() as context_id:
            y = pipe(x)
            loss = criterion(y, rpc.RRef(x))
            losses.append(loss)
            loss.backward(context_id)
            opt.step(context_id)
    losses = [l.to_here() for l in losses]
    assert losses[0] > losses[1], f"{losses[0]} !> {losses[1]}"
Example #15
def _run_trainer(emb_rref, rank):
    r"""
    Each trainer runs a forward pass which involves an embedding lookup on the
    parameter server and running nn.Linear locally. During the backward pass,
    DDP is responsible for aggregating the gradients for the dense part
    (nn.Linear) and distributed autograd ensures gradient updates are
    propagated to the parameter server.
    """

    # Setup the model.
    model = HybridModel(emb_rref, rank)

    # Retrieve all model parameters as rrefs for DistributedOptimizer.

    # Retrieve parameters for embedding table.
    model_parameter_rrefs = rpc.rpc_sync("ps",
                                         _retrieve_embedding_parameters,
                                         args=(emb_rref, ))

    # model.parameters() only includes local parameters.
    for param in model.parameters():
        model_parameter_rrefs.append(RRef(param))

    # Setup distributed optimizer
    opt = DistributedOptimizer(
        optim.SGD,
        model_parameter_rrefs,
        lr=0.05,
    )

    criterion = torch.nn.CrossEntropyLoss()

    def get_next_batch(rank):
        for _ in range(10):
            num_indices = random.randint(20, 50)
            indices = torch.LongTensor(num_indices).random_(0, NUM_EMBEDDINGS)

            # Generate offsets.
            offsets = []
            start = 0
            batch_size = 0
            while start < num_indices:
                offsets.append(start)
                start += random.randint(1, 10)
                batch_size += 1

            offsets_tensor = torch.LongTensor(offsets)
            target = torch.LongTensor(batch_size).random_(8).cuda(rank)
            yield indices, offsets_tensor, target

    # Train for 100 epochs
    for epoch in range(100):
        # create distributed autograd context
        for indices, offsets, target in get_next_batch(rank):
            with dist_autograd.context() as context_id:
                output = model(indices, offsets)
                loss = criterion(output, target)

                # Run distributed backward pass
                dist_autograd.backward(context_id, [loss])

                # Run distributed optimizer
                opt.step(context_id)

                # Not necessary to zero grads as each iteration creates a different
                # distributed autograd context which hosts different grads
        print("Training done for epoch {}".format(epoch))
Example #16
def train(rrefs, kwargs, rnn_constructor, rnn_args, static):
    rnn = rnn_constructor(*rnn_args)
    model = StaticRecurrent(rnn, rrefs) if static \
        else DynamicRecurrent(rnn, rrefs)

    opt = DistributedOptimizer(
        Adam, model.parameter_rrefs(), lr=kwargs['lr']
    )

    times = []
    best = (None, 0)
    no_progress = 0
    for e in range(kwargs['epochs']):
        # Get loss and send backward
        model.train()
        with dist_autograd.context() as context_id:
            st = time.time()
            zs = model.forward(TData.TRAIN)
            loss = model.loss_fn(zs, TData.TRAIN, nratio=kwargs['nratio'])

            print("backward")
            dist_autograd.backward(context_id, loss)
            
            print("step")
            opt.step(context_id)

            elapsed = time.time()-st 
            times.append(elapsed)
            l = torch.stack(loss).sum()
            print('[%d] Loss %0.4f  %0.2fs' % (e, l.item(), elapsed))

        # Get validation info to prevent overfitting
        model.eval()
        with torch.no_grad():
            zs = model.forward(TData.TRAIN, no_grad=True)
            p,n = model.score_edges(zs, TData.VAL)
            auc,ap = get_score(p,n)

            print("\tValidation: AP: %0.4f  AUC: %0.4f" 
                % (ap, auc), end='')
            tot = ap+auc

            if tot > best[1]:
                print('*\n')
                best = (model.save_states(), tot)
                no_progress = 0
            else:
                print('\n')
                if e >= kwargs['min']:
                    no_progress += 1 

            if no_progress == kwargs['patience']:
                print("Early stopping!")
                break 

    model.load_states(best[0][0], best[0][1])
    zs, h0 = model(TData.TEST, include_h=True)

    states = {'gcn': best[0][0], 'rnn': best[0][1], 'h0': h0}
    f = open('model_save.pkl', 'wb+')
    pickle.dump(states, f, protocol=pickle.HIGHEST_PROTOCOL)

    tpe = sum(times)/len(times)
    print("Exiting train loop")
    print("Avg TPE: %0.4fs" % tpe)
    
    return model, h0, tpe
Example #17
def run_trainer(args, emb_rref_list):
    """
    Trainer function to be run from each machine. This function:
        1. Performs some basic initialization steps.
        2. Prepares random data for training.
        3. Sanity checks cmd-line args such as embedding sizes and MLP layers
        4. Sets up the model, loss, and Distributed Optimizer
        5. Runs the Training Loop
    """

    ######## BASIC INITIALIZATION ########
    set_rand_seed()
    set_print_options(args.print_precision)

    args.use_gpu = args.use_gpu and torch.cuda.is_available()
    init_gpu(args.use_gpu)
    #print(args)

    ######## PREPARE TRAINING DATA ########
    ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
    # input and target at random
    ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
    m_den = ln_bot[0]
    train_data, train_loader = dp.make_random_data_and_loader(
        args, ln_emb, m_den)
    nbatches = args.num_batches if args.num_batches > 0 else len(train_loader)

    ######## PARSE CMD LINE ARGS ########
    m_spa = args.arch_sparse_feature_size
    num_fea = ln_emb.size + 1  # num sparse + num dense features
    m_den_out = ln_bot[ln_bot.size - 1]
    if args.arch_interaction_op == "dot":
        # approach 1: all
        # num_int = num_fea * num_fea + m_den_out
        # approach 2: unique
        if args.arch_interaction_itself:
            num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
        else:
            num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
    elif args.arch_interaction_op == "cat":
        num_int = num_fea * m_den_out
    else:
        sys.exit("ERROR: --arch-interaction-op=" + args.arch_interaction_op +
                 " is not supported")
    arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top
    ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-")

    ######## SANITY CHECKS ########
    # Ensure feature sizes and MLP dimensions match
    if m_den != ln_bot[0]:
        sys.exit("ERROR: arch-dense-feature-size " + str(m_den) +
                 " does not match first dim of bottom mlp " + str(ln_bot[0]))
    if m_spa != m_den_out:
        sys.exit("ERROR: arch-sparse-feature-size " + str(m_spa) +
                 " does not match last dim of bottom mlp " + str(m_den_out))
    if num_int != ln_top[0]:
        sys.exit("ERROR: # of feature interactions " + str(num_int) +
                 " does not match first dimension of top mlp " +
                 str(ln_top[0]))

    # test prints (model arch)
    if args.debug_mode:
        print("model arch:")
        print("mlp top arch " + str(ln_top.size - 1) +
              " layers, with input to output dimensions:")
        print(ln_top)
        print("# of interactions")
        print(num_int)
        print("mlp bot arch " + str(ln_bot.size - 1) +
              " layers, with input to output dimensions:")
        print(ln_bot)
        print("# of features (sparse and dense)")
        print(num_fea)
        print("dense feature size")
        print(m_den)
        print("sparse feature size")
        print(m_spa)
        print("# of embeddings (= # of sparse features) " + str(ln_emb.size) +
              ", with dimensions " + str(m_spa) + "x:")
        print(ln_emb)

        print("data (inputs and targets):")
        for j, (X, offsets, indices, T) in enumerate(train_loader):
            # early exit if nbatches was set by the user and has been exceeded
            if nbatches > 0 and j >= nbatches:
                break

            print("mini-batch: %d" % j)
            print(X.detach().cpu().numpy())
            # transform offsets to lengths when printing
            print([
                np.diff(S_o.detach().cpu().tolist() +
                        list(indices[i].shape)).tolist()
                for i, S_o in enumerate(offsets)
            ])
            print([S_i.detach().cpu().tolist() for S_i in indices])
            print(T.detach().cpu().numpy())

    ######## TRAINING SETUP ########

    # Initialize the model (note we are passing the list of RRefs that point to
    # the remote embeddings).
    dlrm = model.DLRM_RPC(
        emb_rref_list,
        args.distributed_rank,
        args.use_gpu,
        ln_emb,
        ln_bot,
        ln_top,
        arch_interaction_op=args.arch_interaction_op,
        arch_interaction_itself=args.arch_interaction_itself,
        sigmoid_bot=-1,
        sigmoid_top=ln_top.size - 2,
    )

    # Specify the loss function
    loss_fn = torch.nn.MSELoss(reduction="mean")

    model_parameter_rrefs = []
    # RRefs for embeddings from PS
    for ind, emb_rref in enumerate(emb_rref_list):
        ps_name = "ps{}".format(ind)
        model_parameter_rrefs.extend(
            rpc.rpc_sync(ps_name,
                         _retrieve_embedding_parameters,
                         args=(emb_rref, )))
    # RRefs local to the model (MLP)
    for param in dlrm.parameters():
        model_parameter_rrefs.append(RRef(param))

    # Build DistributedOptimizer.
    opt = DistributedOptimizer(
        optim.SGD,
        model_parameter_rrefs,
        lr=args.learning_rate,
    )

    def time_wrap(use_gpu):
        if use_gpu:
            torch.cuda.synchronize()
        return time.time()

    # TODO: uncomment for comp/comms DDP benchmark
    #if args.distributed_rank == 0:
    #    state_dict_top = {}
    #    state_dict_bot = {}
    #    dlrm.top_mlp_ddp.register_comm_hook(state_dict_top, profile_hook)
    #    dlrm.bot_mlp_ddp.register_comm_hook(state_dict_bot, profile_hook)

    # training or inference
    best_gA_test = 0
    best_auc_test = 0
    total_time = 0
    total_loss = 0
    total_accu = 0
    total_iter = 0
    total_samp = 0

    # Lists to track forward and backward times per iteration
    fwd_times = []
    bwd_times = []

    rpc_fwd_times = []
    embedding_lookup_times = []

    ######## RUN TRAINING LOOP ########
    with torch.autograd.profiler.profile(enabled=args.enable_profiling,
                                         use_cuda=args.use_gpu) as prof:
        for epoch in range(args.nepochs):

            accum_time_begin = time_wrap(args.use_gpu)

            if args.mlperf_logging:
                previous_iteration_time = None

            for j, (X, offsets, indices, T) in enumerate(train_loader):

                if args.mlperf_logging:
                    current_time = time_wrap(args.use_gpu)
                    if previous_iteration_time:
                        iteration_time = current_time - previous_iteration_time
                    else:
                        iteration_time = 0
                    previous_iteration_time = current_time
                else:
                    t1 = time_wrap(args.use_gpu)

                # early exit if nbatches was set by the user and has been exceeded
                if nbatches > 0 and j >= nbatches:
                    break

                # create distributed autograd context
                with dist_autograd.context() as context_id:
                    # Run forward pass
                    fwd_start = time_wrap(args.use_gpu)
                    Z, rpc_delays, embed_lookup_delay, rpc_total = dlrm.forward(
                        X, offsets, indices)
                    fwd_end = time_wrap(args.use_gpu)

                    # Compute Loss
                    E = loss_fn(Z, T)

                    # Run distributed backward pass
                    bwd_start = time_wrap(args.use_gpu)
                    dist_autograd.backward(context_id, [E])
                    bwd_end = time_wrap(args.use_gpu)

                    # Run distributed optimizer
                    opt.step(context_id)

                    if epoch >= args.warmup_epochs:
                        fwd_times.append(fwd_end - fwd_start)
                        bwd_times.append(bwd_end - bwd_start)
                        rpc_fwd_times.extend(rpc_delays)
                        embedding_lookup_times.append(embed_lookup_delay)

                # compute loss and accuracy
                L = E.detach().cpu().numpy()  # numpy array
                S = Z.detach().cpu().numpy()  # numpy array
                T = T.detach().cpu().numpy()  # numpy array
                mbs = T.shape[0]  # = args.mini_batch_size except maybe for last
                A = np.sum((np.round(S, 0) == T).astype(np.uint8))

                if args.mlperf_logging:
                    total_time += iteration_time
                else:
                    t2 = time_wrap(args.use_gpu)
                    total_time += t2 - t1
                total_accu += A
                total_loss += L * mbs
                total_iter += 1
                total_samp += mbs

                should_print = ((j + 1) % args.print_freq
                                == 0) or (j + 1 == nbatches)
                should_test = ((args.test_freq > 0)
                               and (args.data_generation == "dataset")
                               and (((j + 1) % args.test_freq == 0) or
                                    (j + 1 == nbatches)))

                # print time, loss and accuracy
                if should_print or should_test:
                    gT = 1000.0 * total_time / total_iter if args.print_time else -1
                    total_time = 0

                    gA = total_accu / total_samp
                    total_accu = 0

                    gL = total_loss / total_samp
                    total_loss = 0

                    str_run_type = "inference" if args.inference_only else "training"
                    print(
                        "Finished {} it {}/{} of epoch {}, {:.2f} ms/it, ".
                        format(str_run_type, j + 1, nbatches, epoch, gT) +
                        "loss {:.6f}, accuracy {:3.3f} %".format(gL, gA * 100))

                    log_iter = nbatches * epoch + j + 1
                    # Uncomment the line below to print out the total time with overhead
                    # print("Accumulated time so far: {}" \
                    # .format(time_wrap(args.use_gpu) - accum_time_begin))
                    total_iter = 0
                    total_samp = 0

        # END TRAIN LOOP
        # TODO: uncomment for comp/comms DDP benchmark
        # TODO: for bottom also
        #torch.cuda.synchronize(args.distributed_rank)
        #if args.distributed_rank == 0:
        #    for bucket_index in range(len(state_dict_top)):
        #        e_bfr = state_dict[bucket_index]["e_bfr"]
        #        e_aft = state_dict[bucket_index]["e_aft"]
        #        print(f"bucket {bucket_index} comm time: {e_bfr.elapsed_time(e_aft)}")

        mean_fwd = 1000.0 * np.mean(fwd_times)
        mean_bwd = 1000.0 * np.mean(bwd_times)
        std_fwd = 1000.0 * np.std(fwd_times)
        std_bwd = 1000.0 * np.std(bwd_times)
        rpc_fwd_mean = 1000.0 * np.mean(rpc_fwd_times)
        rpc_fwd_std = 1000.0 * np.std(rpc_fwd_times)
        embedding_fwd_mean = 1000.0 * np.mean(embedding_lookup_times)
        embedding_fwd_std = 1000.0 * np.std(embedding_lookup_times)

        print("[Trainer {}] Average FWD Time (ms): {}".format(
            args.distributed_rank, mean_fwd))
        print("[Trainer {}] STD DEV FWD Time (ms): {}".format(
            args.distributed_rank, std_fwd))
        print("[Trainer {}] Average BWD Time (ms): {}".format(
            args.distributed_rank, mean_bwd))
        print("[Trainer {}] STD DEV BWD Time (ms): {}".format(
            args.distributed_rank, std_bwd))
        print("[Trainer {}] Average RPC FWD Time (ms): {}".format(
            args.distributed_rank, rpc_fwd_mean))
        print("[Trainer {}] STD DEV RPC FWD Time (ms): {}".format(
            args.distributed_rank, rpc_fwd_std))
        print("[Trainer {}] Average Embedding Lookup Time (ms): {}".format(
            args.distributed_rank, embedding_fwd_mean))
        print("[Trainer {}] STD DEV Embedding Lookup Time (ms): {}".format(
            args.distributed_rank, embedding_fwd_std))

    # profiling
    if args.enable_profiling:
        with open("dlrm_s_pytorch.prof", "w") as prof_f:
            prof_f.write(prof.key_averages().table(sort_by="cpu_time_total"))
            prof.export_chrome_trace("./dlrm_s_pytorch.json")
Example #18
def run_master():

    # shard the model across worker1 through worker4
    model = DistResNet(["worker1", "worker2", "worker3", "worker4"])
    loss_fn = nn.MSELoss()
    opt = DistributedOptimizer(
        optim.SGD,
        model.parameter_rrefs(),
        lr=0.05,
    )

    one_hot_indices = torch.LongTensor(batch_size) \
                           .random_(0, num_classes) \
                           .view(batch_size, 1)

    # input_list stores the intermediate tensors that are fed back into the model
    input_list = []
    offset = 0  # Tracks which earlier stages no longer execute; also used to shift input_list
    # batch_no = 0

    # Implements the inter-batch parallelism used in PipeDream.
    # Runs the loop num_batches + 3 times so the last in-flight batches can finish their remaining forward stages.
    for i in range(num_batches + 3):

        input_col = []
        inputs = ''

        if i < num_batches:
            print(f"Processing batch {i}")
            # generate random inputs and labels
            inputs = torch.randn(batch_size, 3, image_w, image_h)
            labels = torch.zeros(batch_size, num_classes) \
                        .scatter_(1, one_hot_indices, 1)
            input_list = [inputs] + input_list
            offset = 0
        else:
            offset += 1

        # Append the offset, index, and length data to send them to the model
        input_col.append(input_list)
        input_col.append(offset)
        input_col.append(len(input_list))

        results = model(input_col) # Runs the Stages with the given inputs

        # The distributed autograd context is the dedicated scope for the
        # distributed backward pass to store gradients, which can later be
        # retrieved using the context_id by the distributed optimizer.
        with dist_autograd.context() as context_id:
            if offset + len(results) >= 4:
                # print("backward for batch ", batch_no )
                dist_autograd.backward(context_id, [loss_fn(results[-1], labels)])
                opt.step(context_id)
                # batch_no += 1

        input_list = []     # Reset the input_list

        for j in range(0, len(results), 1):
            if j + offset + 1 == 4:     # Drop the last element of results, which has already been backpropagated
                break
            input_list.append(results[j])   # Re-insert the results
Example #19
def run_master(num_split):
    print("run master")
    # put the two model parts on worker1 and worker2 respectively
    model = DistResNet50(
        num_split, ["worker1", "worker2"])
    opt = DistributedOptimizer(
        optim.SGD,
        model.parameter_rrefs(),
        lr=lr, momentum=0.9
    )

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    train_dataset = datasets.CIFAR100(root='./data', train=True,
                                      download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size)
    val_dataset = datasets.CIFAR100(root='./data', train=False,
                                    download=True, transform=transform)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=128)

    for e in range(epoch):
        model.train()
        train_loss = Metric("train_loss")
        train_accuracy = Metric("train_accuracy")
        with tqdm(
            total=len(train_loader),
            desc="Train Epoch #{}".format(e + 1),
        ) as t:
            for idx, (data, target) in enumerate(train_loader):
                with dist_autograd.context() as context_id:
                    outputs = model(data)
                    loss = F.cross_entropy(outputs, target)
                    dist_autograd.backward(context_id, [loss])
                    opt.step(context_id)
                    train_loss.update(loss)
                    train_accuracy.update(accuracy(outputs, target))
                    t.set_postfix(
                        {
                            "loss": train_loss.avg.item(),
                            "accuracy": 100.0 * train_accuracy.avg.item(),
                        }
                    )
                    t.update(1)

        model.eval()
        with tqdm(
            total=len(val_loader),
            desc="Valid Epoch #{}".format(e + 1),
        ) as t:
            with torch.no_grad():
                val_loss = Metric("val_loss")
                val_accuracy = Metric("val_accuracy")
                for data, target in val_loader:
                    output = model(data)
                    val_loss.update(F.cross_entropy(output, target))
                    val_accuracy.update(accuracy(output, target))
                    t.set_postfix(
                        {
                            "loss": val_loss.avg.item(),
                            "accuracy": 100.0 * val_accuracy.avg.item(),
                        }
                    )
                    t.update(1)
Example #20
def _run_trainer(emb_rref_list, rank):
    r"""
   Each trainer runs a forward pass which involves an embedding lookup on the
   8 parameter servers and running nn.Linear locally. During the backward pass,
   DDP is responsible for aggregating the gradients for the dense part
   (nn.Linear) and distributed autograd ensures gradients updates are
   propagated to the parameter servers.
   """

    # Setup the model.
    model = HybridModel(emb_rref_list, rank)

    # Retrieve all model parameters as rrefs for DistributedOptimizer.

    # Retrieve parameters from all embedding tables for the current trainer.
    model_parameter_rrefs = []
    for ind, emb_rref in enumerate(emb_rref_list):
        ps_name = "ps{}".format(ind)
        model_parameter_rrefs.extend(
            rpc.rpc_sync(ps_name,
                         _retrieve_embedding_parameters,
                         args=(emb_rref, )))

    # model.parameters() only includes local parameters.
    for param in model.parameters():
        model_parameter_rrefs.append(RRef(param))

    # Setup distributed optimizer
    opt = DistributedOptimizer(optim.SGD, model_parameter_rrefs, lr=0.05)

    criterion = torch.nn.CrossEntropyLoss()

    def get_next_batch(rank):
        for _ in range(10):
            num_indices = random.randint(20, 50)
            indices = torch.LongTensor(num_indices).random_(0, NUM_EMBEDDINGS)

            # Generate offsets.
            offsets = []
            start = 0
            batch_size = 0

            while start < num_indices:
                offsets.append(start)
                start += random.randint(1, 10)
                batch_size += 1

            offsets_tensor = torch.LongTensor(offsets)
            target = torch.LongTensor(batch_size).random_(8).cuda(rank)

            yield indices, offsets_tensor, target

    measurements = []
    # Include warm-up cycles during training
    for epoch in range(100 + WARMUP_CYCLES):
        start = time.time()
        batch_size = 0

        # create distributed autograd context
        for indices, offsets, target in get_next_batch(rank):
            batch_size += len(target)

            with dist_autograd.context() as context_id:
                output = model(indices, offsets)
                loss = criterion(output, target)

                # Run distributed backward pass
                dist_autograd.backward(context_id, [loss])

                # Run distributed optimizer. Gradients propagated all the way to the parameter servers
                opt.step(context_id)

                # Not necessary to zero grads as each iteration creates a different
                # distributed autograd context which hosts different grads

        measurements.append(time.time() - start)
        # print("Training done for epoch {}".format(epoch))

    # Throw away warm-up measurements
    measurements = measurements[WARMUP_CYCLES:]
    return rank, measurements, batch_size
Example #21
def study():
    """
    Async multiplication using two remote modules
    """
    # Start with a local version
    module1 = MyModule()
    module2 = MyModule()
    params = [module1.get_w(), module2.get_w()]
    local_optim = optim.SGD(params, lr=0.05)

    # Keep a copy of the old weights to make sure they change
    old_w1 = module1.w.clone().detach()
    old_w2 = module2.w.clone().detach()

    torch.manual_seed(0)
    t1 = torch.rand((3, 3), requires_grad=True)
    t2 = torch.rand((3, 3), requires_grad=True)

    output1 = module1.forward(t2)
    output2 = module2.forward(output1)
    loss = torch.add(output2, t1).sum()

    loss.backward()
    local_optim.step()

    # distributed version
    owner1 = "worker%d" % ((Env.rank + 1) % Env.world_size)
    owner2 = "worker%d" % ((Env.rank + 2) % Env.world_size)

    remote_module1 = rpc.remote(owner1, MyModule)
    remote_module2 = rpc.remote(owner2, MyModule)
    remote_param1 = remote_method(MyModule.get_w, remote_module1)
    remote_param2 = remote_method(MyModule.get_w, remote_module2)

    old_w1_remote = remote_param1.to_here()

    dist_optim = DistributedOptimizer(
        optim.SGD, [remote_param1, remote_param2], lr=0.05
    )

    with dist_autograd.context() as context_id:
        torch.manual_seed(0)
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)

        output1 = remote_async(MyModule.forward, remote_module1, t2)
        output2 = remote_async(MyModule.forward, remote_module2, output1.wait())
        loss = torch.add(output2.wait(), t1)

        dist_autograd.backward(context_id, [loss.sum()])
        dist_optim.step(context_id)

        new_w1 = remote_async(MyModule.get_w, remote_module1).wait()
        new_w2 = remote_async(MyModule.get_w, remote_module2).wait()

        # Make sure the weights have been updated
        print(f'w1 updated: {not torch.equal(old_w1, new_w1)}')
        print(f'w2 updated: {not torch.equal(old_w2, new_w2)}')

        # Make sure the weights on the remote module and the local copy are the same
        w1_consistent = (new_w1 == module1.get_w()).all()
        w2_consistent = (new_w2 == module2.get_w()).all()

        print(f'w1 consist: {w1_consistent}')
        print(f'w2 consist: {w2_consistent}')