from math import ceil

import torch
import torch.distributed as dist
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import transforms

import partition


def run(device: torch.device, epochs: int = 1, batch_size: int = 4096) -> None:
    rank = dist.get_rank()
    model = torchvision.models.resnet50().to(device)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([.5, .5, .5], [.5, .5, .5])
    ])
    dataset = torchvision.datasets.CIFAR10('/data/private/datasets', train=True,
                                           transform=transform, download=True)
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    # partition.partition_dataset is the same as the one in the PyTorch tutorial:
    # https://pytorch.org/tutorials/intermediate/dist_tuto.html#distributed-training
    train_set, bsz = partition.partition_dataset(dataset, batch_size)
    num_batches = ceil(len(train_set.dataset) / float(bsz))
    for epoch in range(epochs):
        epoch_loss = .0
        for data, target in train_set:
            optimizer.zero_grad()
            data = data.to(device)  # all processes hang here on the second iteration
            target = target.to(device)
            outputs = model(data)
            loss = F.cross_entropy(outputs, target)
            epoch_loss += loss.item()
            loss.backward()
            average_gradients(model)
            optimizer.step()
        print(f'Rank {rank}, epoch {epoch}: {epoch_loss / num_batches}')
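
# `average_gradients` is not defined in this snippet. A minimal sketch of it,
# following the all-reduce gradient averaging shown in the PyTorch distributed
# tutorial linked above (assumes the process group is already initialized):
def average_gradients(model):
    """Average gradients across all processes after backward()."""
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        # Sum this parameter's gradient across all ranks, then divide by the
        # world size so every rank holds the same averaged gradient.
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= world_size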
import models
import partition


def train_teacher(nb_teachers, teacher_id):
    """
    Trains the single teacher model corresponding to teacher_id among an
    ensemble of nb_teachers models for the specified dataset, and saves the
    model to disk.
    :param nb_teachers: total number of teachers in the ensemble
    :param teacher_id: id of the teacher being trained
    :return: True if everything went well
    """
    # Load the dataset
    X_train, X_test, y_train, y_test = models.get_dataset()
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)

    # Retrieve the subset of data assigned to this teacher
    data, labels = partition.partition_dataset(X_train, y_train, nb_teachers, teacher_id)
    print("Length of training data: " + str(len(labels)))

    # Define teacher checkpoint filenames
    filename = str(nb_teachers) + '_teachers_' + str(teacher_id) + '.hdf5'
    filename2 = str(nb_teachers) + '_teachers_' + str(teacher_id) + '.h5'

    # Create and train the teacher model (TODO: still needs modification)
    model, opt = models.create_two_layer_mlp(46)  # 46 = number of input columns
    # Note: the optimizer returned by create_two_layer_mlp is unused; "Adam" is passed instead.
    model.compile(loss='binary_crossentropy', optimizer="Adam", metrics=['accuracy'])
    model, hist = models.training(model, data, X_test, labels, y_test, filename)

    # Serialize the architecture to JSON and the weights to HDF5
    model_json = model.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
    model.save_weights(filename2)
    print("Saved model to disk")

    return True
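
# `partition.partition_dataset` is not shown in this snippet. A minimal sketch
# under the assumption that it hands each teacher a disjoint, contiguous slice
# of the training set (the usual PATE-style split); the slicing logic here is
# illustrative, not the project's actual implementation:
def partition_dataset(X, y, nb_teachers, teacher_id):
    """Return the (features, labels) slice assigned to one teacher."""
    assert 0 <= teacher_id < nb_teachers
    batch_len = len(X) // nb_teachers       # examples per teacher
    start = teacher_id * batch_len
    end = (teacher_id + 1) * batch_len
    return X[start:end], y[start:end]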
# `partition_dataset` and `train_model` are defined elsewhere in this project.
def run(rank, size, partition_sizes, custom_partition=False, params=None):
    torch.manual_seed(1234)
    train_set, _ = partition_dataset(partition_sizes, train=True, custom=custom_partition)
    val_set, _ = partition_dataset(partition_sizes, train=False)
    train_model(train_set, val_set, **(params or {}))  # guard against params=None
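
# A hypothetical launcher for the `run` above, following the process-spawning
# pattern from the PyTorch distributed tutorial. The 'gloo' backend, the
# address/port, and the equal-sized `partition_sizes` are assumptions, not part
# of the original snippet:
import os

import torch.distributed as dist
import torch.multiprocessing as mp


def init_process(rank, size, fn, backend='gloo'):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size, [1.0 / size] * size, params={})


if __name__ == '__main__':
    size = 2
    mp.set_start_method('spawn')
    processes = []
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()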
DATASETS_PATH = "./datasets/" if __name__ == '__main__': #file = "chainlink3D.arff" file = "banana.arff" M = 2 partitioning_method = 0 #L = 0.4 #MIN_PTS = 4 L = 0.03 MIN_PTS = 4 arf = prt.partition_dataset(file, M, partitioning_method) dimensions = len(arf[0][0]) - 1 # for i in range(M): # points, labels = arff.loadpartitionNDArray(i) # plt.plotCluster(points, labels, message=f'Partition {i}') contribution_map = {} for i in range(M): local_update = lcl.compute_local_update(i, L) for key, value in local_update.items(): if key in contribution_map: contribution_map[key] += value else: contribution_map[key] = value