Example #1
    def testDistributedSampler(self, *_):
        # Endpoints of the local test cluster started by the harness.
        service_ep = 'http://127.0.0.1:' + self.web_port
        scheduler_ep = '127.0.0.1:' + self.scheduler_port
        with new_session(service_ep) as sess:
            # Store two tensors on the cluster under the names 'data1'
            # and 'data2' so the dataset can fetch them by name later.
            raw1 = np.random.rand(100, 200)
            data1 = mt.tensor(raw1, chunk_size=40)
            data1.execute(name='data1', session=sess)

            raw2 = np.random.rand(100,)
            data2 = mt.tensor(raw2, chunk_size=60)
            data2.execute(name='data2', session=sess)

            with DistributedContext(scheduler_address=scheduler_ep, session_id=sess.session_id):
                # The dataset is built from the stored tensor names; its
                # length is the shared first dimension of the tensors.
                dataset = MarsDataset('data1', 'data2')
                self.assertEqual(len(dataset), 100)

                # With a single replica the sampler must cover every index;
                # fetching by index list and iterating the sampler should
                # yield identical data for both tensors.
                sampler = MarsDistributedSampler(dataset, num_replicas=1, rank=0)
                indices = sampler.generate_indices()
                r1 = np.array(dataset._get_data(indices)[0])
                r2 = np.array([dataset[ind][0] for ind in sampler])
                np.testing.assert_array_equal(r1, r2)

                r1 = np.array(dataset._get_data(indices)[1])
                r2 = np.array([dataset[ind][1] for ind in sampler])
                np.testing.assert_array_equal(r1, r2)

                self.assertEqual(len(sampler), 100)

                # set_epoch mirrors torch's DistributedSampler and records
                # the epoch used to reseed per-epoch shuffling.
                sampler.set_epoch(1)
                self.assertEqual(sampler.epoch, 1)
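
This method is a fragment of a larger test case, so several names come from outside the snippet. The imports it assumes would look roughly like the following (module paths as in Mars 0.x are an assumption here; only the mars.learn.contrib.pytorch path is confirmed by Example #2 below):

import numpy as np
import mars.tensor as mt
from mars.session import new_session          # assumed location of new_session
from mars.context import DistributedContext   # assumed location of DistributedContext
from mars.learn.contrib.pytorch import MarsDataset, MarsDistributedSampler

self.web_port and self.scheduler_port are expected to be set by the test harness, for example by a setUp that starts a local Mars scheduler, worker, and web service and records their ports.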
Example #2
def main():
    import torch
    import torch.nn as nn
    import torch.distributed as dist
    import torch.optim as optim
    import torch.utils.data
    import mars.tensor as mt
    from mars.learn.contrib.pytorch import MarsDataset, MarsDistributedSampler

    # One process per replica; the launcher is expected to provide the
    # usual rank/world-size environment that init_process_group reads.
    dist.init_process_group(backend='gloo')
    torch.manual_seed(42)

    # Reference tensors previously stored on the Mars cluster by name.
    data = mt.named_tensor(name='data')
    labels = mt.named_tensor(name='labels')
    train_dataset = MarsDataset(data, labels)
    # The distributed sampler shards the dataset across replicas, so
    # DataLoader shuffling must stay off.
    train_sampler = MarsDistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=32,
                                               shuffle=False,
                                               sampler=train_sampler)

    model = nn.parallel.DistributedDataParallel(get_model())
    optimizer = optim.SGD(model.parameters(),
                          lr=0.01, momentum=0.5)
    criterion = nn.BCELoss()

    for _ in range(2):
        # 2 epochs
        for batch_data, batch_labels in train_loader:
            outputs = model(batch_data)
            loss = criterion(outputs.squeeze(), batch_labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
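
The script never defines get_model() and has no entry point; both are elided in the original. A minimal sketch that makes the fragment runnable, assuming a binary classifier whose sigmoid output matches the nn.BCELoss criterion (the 32-feature input width is an illustrative guess, not from the original):

def get_model():
    import torch.nn as nn
    # Hypothetical architecture: Linear(32, ...) assumes 32 input
    # features; the Sigmoid head yields probabilities for nn.BCELoss.
    return nn.Sequential(
        nn.Linear(32, 64),
        nn.ReLU(),
        nn.Linear(64, 1),
        nn.Sigmoid(),
    )


if __name__ == '__main__':
    main()

A script like this is typically submitted to the cluster with mars.learn.contrib.pytorch.run_pytorch_script, which spawns the worker processes and supplies the environment that dist.init_process_group(backend='gloo') expects.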