def crunch(surf_file, net, w, s, d, dataloader: list, loss_key, acc_key, comm, rank, args):
    """
    Calculate the loss values and accuracies of modified models in parallel
    using MPI reduce.
    """
    f = h5py.File(surf_file, 'r+' if rank == 0 else 'r')
    losses, accuracies = [], []
    xcoordinates = f['xcoordinates'][:]
    ycoordinates = f['ycoordinates'][:] if 'ycoordinates' in f.keys() else None

    if loss_key not in f.keys():
        shape = xcoordinates.shape if ycoordinates is None else (len(xcoordinates), len(ycoordinates))
        losses = -np.ones(shape=shape)
        accuracies = -np.ones(shape=shape)
        if rank == 0:
            f[loss_key] = losses
            f[acc_key] = accuracies
    else:
        losses = f[loss_key][:]
        accuracies = f[acc_key][:]

    # Generate a list of indices of 'losses' that need to be filled in.
    # The coordinates of each unfilled index (with respect to the direction vectors
    # stored in 'd') are stored in 'coords'.
    inds, coords, inds_nums = scheduler.get_job_indices(losses, xcoordinates, ycoordinates, comm)

    print('Computing %d values for rank %d' % (len(inds), rank))
    start_time = time.time()
    total_sync = 0.0

    criterion = nn.CrossEntropyLoss()
    if args.loss_name == 'mse':
        criterion = nn.MSELoss()
    elif args.loss_name == 'bce':
        criterion = nn.BCELoss()

    # Loop over all uncalculated loss values
    for count, ind in enumerate(inds):
        # Get the coordinates of the loss value being calculated
        coord = coords[count]

        # Load the weights corresponding to those coordinates into the net
        if args.dir_type == 'weights':
            net_plotter.set_weights(net.module if args.ngpu > 1 else net, w, d, coord)
        elif args.dir_type == 'states':
            net_plotter.set_states(net.module if args.ngpu > 1 else net, s, d, coord)

        # Record the time to compute the loss value
        loss_start = time.time()
        loss, acc = evaluation.eval_loss(net, criterion, dataloader, args.cuda)
        loss_compute_time = time.time() - loss_start

        # Record the result in the local array
        losses.ravel()[ind] = loss
        accuracies.ravel()[ind] = acc

        # Send updated plot data to the master node
        syc_start = time.time()
        losses = mpi.reduce_max(comm, losses)
        accuracies = mpi.reduce_max(comm, accuracies)
        syc_time = time.time() - syc_start
        total_sync += syc_time

        # Only the master node writes to the file - this avoids write conflicts
        if rank == 0:
            f[loss_key][:] = losses
            f[acc_key][:] = accuracies
            f.flush()

        print('Evaluating rank %d %d/%d (%.1f%%) coord=%s \t%s= %.3f \t%s=%.2f \ttime=%.2f \tsync=%.2f' % (
            rank, count, len(inds), 100.0 * count / len(inds), str(coord), loss_key, loss,
            acc_key, acc, loss_compute_time, syc_time))

    # This is only needed to make MPI run smoothly. If this process has less work than
    # the rank0 process, then we need to keep calling reduce so the rank0 process doesn't block
    for i in range(max(inds_nums) - len(inds)):
        losses = mpi.reduce_max(comm, losses)
        accuracies = mpi.reduce_max(comm, accuracies)

    total_time = time.time() - start_time
    print('Rank %d done! Total time: %.2f Sync: %.2f' % (rank, total_time, total_sync))
    f.close()
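The per-iteration merge above relies on every entry of losses and accuracies starting at -1 and each rank only filling the entries it owns, so an element-wise maximum across ranks combines the partial results without write conflicts. As a minimal sketch (not the actual mpi / mpi4pytorch implementation), a reduce_max helper with that behaviour could be built on mpi4py's Allreduce with MPI.MAX:

import numpy as np
from mpi4py import MPI

def reduce_max(comm, array):
    """Element-wise max of `array` across all MPI ranks.

    Entries still at -1 on this rank are overwritten by ranks that have
    already computed them; filled entries survive because loss and
    accuracy values are non-negative and therefore larger than -1.
    """
    if comm is None:  # single-process fallback
        return array
    sendbuf = np.ascontiguousarray(array, dtype=np.float64)
    recvbuf = np.empty_like(sendbuf)
    comm.Allreduce(sendbuf, recvbuf, op=MPI.MAX)
    return recvbuf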
# TensorFlow/Keras variant of the same evaluation loop (fragment).
inds, coords, inds_nums = scheduler.get_job_indices(losses, xcoordinates, ycoordinates, comm)
print('Computing %d values for rank %d' % (len(inds), rank))
sys.stdout.flush()
start_time = time.time()
total_sync = 0.0

# Sum-reduced categorical cross-entropy used as the evaluation criterion
cce = tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.SUM)

for count, ind in enumerate(inds):
    # Coordinates of the loss value being calculated
    coord = coords[count]

    # Perturb the model weights along the direction(s) d by the step sizes in coord
    evaluation.set_weights(model, w, d, coord)

    print('Rank:%d computing' % (rank))
    loss_start = time.time()
    loss, acc = evaluation.eval_loss(model, cce, x_train, y_train, batch_size)
    loss_compute_time = time.time() - loss_start

    # Record the result in the local array
    losses.ravel()[ind] = loss
    accuracies.ravel()[ind] = acc

    # Merge the partial results across ranks
    syc_start = time.time()
    losses = mpi.reduce_max(comm, losses)
    accuracies = mpi.reduce_max(comm, accuracies)
    syc_time = time.time() - syc_start
    total_sync += syc_time

    # Only the master node writes to the file - this avoids write conflicts
    if rank == 0:
        f[loss_key][:] = losses
        f[acc_key][:] = accuracies
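In the Keras fragment above, the weight-perturbation step has to rebuild a full set of weights from the centre point w, the direction(s) d, and the coordinate before pushing them into the model. A hedged sketch of such a set_weights helper, assuming w and each direction in d are lists of numpy arrays matching model.get_weights() (the helper name and argument layout mirror the call above, but the body is an assumption, not the actual evaluation module):

import numpy as np

def set_weights(model, w, d, coord):
    """Set model weights to w + coord[0]*dx (+ coord[1]*dy for 2D plots).

    w     : list of numpy arrays, the unperturbed weights (model.get_weights()).
    d     : [dx] for a 1D plot or [dx, dy] for a 2D plot, each a list of
            numpy arrays with the same shapes as w.
    coord : scalar (1D) or pair (2D) of step sizes along the directions.
    """
    steps = np.atleast_1d(coord)
    new_weights = []
    for i, w_i in enumerate(w):
        perturbed = np.array(w_i, copy=True)
        for step, direction in zip(steps, d):
            perturbed += step * direction[i]
        new_weights.append(perturbed)
    model.set_weights(new_weights)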
def crunch(surf_file, net, w, s, d, dataloader, loss_key, acc_key, comm, rank, args):
    """
    Calculate the loss values and accuracies of modified models in parallel
    using MPI reduce.
    """
    f = h5py.File(surf_file, 'r+' if rank == 0 else 'r')
    losses, accuracies = [], []
    xcoordinates = f['xcoordinates'][:]
    ycoordinates = f['ycoordinates'][:] if 'ycoordinates' in f.keys() else None

    if loss_key not in f.keys():
        shape = xcoordinates.shape if ycoordinates is None else (len(xcoordinates), len(ycoordinates))
        losses = -np.ones(shape=shape)
        accuracies = -np.ones(shape=shape)
        if rank == 0:
            f[loss_key] = losses
            f[acc_key] = accuracies
    else:
        losses = f[loss_key][:]
        accuracies = f[acc_key][:]

    # Generate a list of indices of 'losses' that need to be filled in.
    # The coordinates of each unfilled index (with respect to the direction vectors
    # stored in 'd') are stored in 'coords'.
    inds, coords, inds_nums = scheduler.get_job_indices(losses, xcoordinates, ycoordinates, comm)

    print('Computing %d values for rank %d' % (len(inds), rank))
    start_time = time.time()
    total_sync = 0.0

    criterion = nn.CrossEntropyLoss()
    if args.loss_name == 'mse':
        criterion = nn.MSELoss()

    # Loop over all uncalculated loss values
    for count, ind in enumerate(inds):
        # Get the coordinates of the loss value being calculated
        coord = coords[count]

        # Load the weights corresponding to those coordinates into the net
        if args.dir_type == 'weights':
            net_plotter.set_weights(net.module if args.ngpu > 1 else net, w, d, coord)
        elif args.dir_type == 'states':
            net_plotter.set_states(net.module if args.ngpu > 1 else net, s, d, coord)

        # Record the time to compute the loss value
        loss_start = time.time()
        loss, acc = evaluation.eval_loss(net, criterion, dataloader, args.cuda)
        loss_compute_time = time.time() - loss_start

        # Record the result in the local array
        losses.ravel()[ind] = loss
        accuracies.ravel()[ind] = acc

        # Send updated plot data to the master node
        syc_start = time.time()
        losses = mpi4pytorch.reduce_max(comm, losses)
        accuracies = mpi4pytorch.reduce_max(comm, accuracies)
        syc_time = time.time() - syc_start
        total_sync += syc_time

        # Only the master node writes to the file - this avoids write conflicts
        if rank == 0:
            f[loss_key][:] = losses
            f[acc_key][:] = accuracies

        print('Evaluating rank %d %d/%d (%.1f%%) coord=%s \t%s= %.3f \t%s=%.2f \ttime=%.2f \tsync=%.2f' % (
            rank, count, len(inds), 100.0 * count / len(inds), str(coord), loss_key, loss,
            acc_key, acc, loss_compute_time, syc_time))

    # This is only needed to make MPI run smoothly. If this process has less work than
    # the rank0 process, then we need to keep calling reduce so the rank0 process doesn't block
    for i in range(max(inds_nums) - len(inds)):
        losses = mpi4pytorch.reduce_max(comm, losses)
        accuracies = mpi4pytorch.reduce_max(comm, accuracies)

    total_time = time.time() - start_time
    print('Rank %d done! Total time: %.2f Sync: %.2f' % (rank, total_time, total_sync))
    f.close()
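The work split itself happens in scheduler.get_job_indices: every entry still at -1 is an open job, and the open jobs are partitioned across the MPI ranks so each process evaluates a disjoint slice of the grid. A rough sketch of that logic, assuming comm exposes the usual mpi4py Get_rank/Get_size interface (the real scheduler may split the work differently):

import numpy as np

def get_job_indices(losses, xcoordinates, ycoordinates, comm):
    """Return (inds, coords, inds_nums) for this rank.

    inds      : flat indices into `losses` that this rank should fill in.
    coords    : the (x,) or (x, y) coordinate of each of those indices.
    inds_nums : number of jobs assigned to every rank (used to keep the
                trailing reduce calls balanced).
    """
    rank = comm.Get_rank() if comm is not None else 0
    nproc = comm.Get_size() if comm is not None else 1

    # Unfilled entries were initialised to -1.
    all_inds = np.argwhere(losses.ravel() == -1).ravel()

    # Coordinates of every grid point, in the same flat order as `losses`.
    if ycoordinates is None:
        all_coords = np.array(xcoordinates)
    else:
        xx, yy = np.meshgrid(xcoordinates, ycoordinates, indexing='ij')
        all_coords = np.column_stack((xx.ravel(), yy.ravel()))

    # Round-robin split of the open jobs across ranks.
    chunks = [all_inds[i::nproc] for i in range(nproc)]
    inds = chunks[rank]
    coords = all_coords[inds]
    inds_nums = [len(c) for c in chunks]
    return inds, coords, inds_nums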
def crunch(surf_file, net, w, s, d, dataloader, loss_key, acc_key, comm, rank, args):
    """
    Calculate the loss values and accuracies of modified models in parallel
    using MPI reduce.

    Input (major): net, data, rank (index of the machine in the distributed system)
    Output: the loss values, saved in surf_file
    Dependency: scheduler.get_job_indices, net_plotter.set_weights, evaluation.eval_loss
    """
    print('----------STILL ALIVE # 01')
    print('-----------------------surf_file path', surf_file)
    # check https://docs.h5py.org/en/stable/quick.html for h5py files
    f = h5py.File(surf_file, 'r+' if rank == 0 else 'r')
    print('----------STILL ALIVE # 02')
    print('-----------------------------------------------------------------')

    losses, accuracies = [], []
    xcoordinates = f['xcoordinates'][:]
    ycoordinates = f['ycoordinates'][:] if 'ycoordinates' in f.keys() else None

    if loss_key not in f.keys():
        shape = xcoordinates.shape if ycoordinates is None else (len(xcoordinates), len(ycoordinates))
        losses = -np.ones(shape=shape)
        accuracies = -np.ones(shape=shape)
        if rank == 0:
            f[loss_key] = losses
            f[acc_key] = accuracies
    else:
        losses = f[loss_key][:]
        accuracies = f[acc_key][:]

    # Generate a list of indices of 'losses' that need to be filled in.
    # The coordinates of each unfilled index (with respect to the direction vectors
    # stored in 'd') are stored in 'coords'.
    # RS: This is where distributed computation comes in, since inds is split across the machines in comm.
    # Key idea: the main job is to evaluate the losses of many models, which can be done in parallel.
    inds, coords, inds_nums = scheduler.get_job_indices(losses, xcoordinates, ycoordinates, comm)

    print('Computing %d values for rank %d' % (len(inds), rank))
    start_time = time.time()
    total_sync = 0.0

    criterion = nn.CrossEntropyLoss()
    if args.loss_name == 'mse':
        criterion = nn.MSELoss()

    # Loop over all uncalculated loss values. inds is defined a few lines above.
    # RS: I suspect the loop is not really sequential but parallel: the enumeration over inds
    # is automatically split across multiple GPUs.
    for count, ind in enumerate(inds):
        # Get the coordinates of the loss value being calculated
        coord = coords[count]

        # Load the weights corresponding to those coordinates into the net
        # RS: e.g. if coord = 0.5 in the 1D case, then we obtain net = 0.5 * (model_1 + model_2).
        if args.dir_type == 'weights':
            net_plotter.set_weights(net.module if args.ngpu > 1 else net, w, d, coord)
        elif args.dir_type == 'states':
            net_plotter.set_states(net.module if args.ngpu > 1 else net, s, d, coord)

        # Compute the loss value, and record the time
        loss_start = time.time()
        loss, acc = evaluation.eval_loss(net, criterion, dataloader, args.cuda)
        loss_compute_time = time.time() - loss_start

        # Record the result in the local array
        losses.ravel()[ind] = loss
        accuracies.ravel()[ind] = acc

        # Send updated plot data to the master node
        # RS: I'm confused: why sync inside the for loop? I thought the process was that every
        # GPU computes its own data points (e.g. 10 on GPU-1 + 8 on GPU-0) and then they combine.
        # But this code syncs at every iteration; why?
        # For 1 GPU, this part probably does not matter (not sure whether 8 threads play a role
        # in distributed computation).
        syc_start = time.time()
        losses = mpi.reduce_max(comm, losses)
        accuracies = mpi.reduce_max(comm, accuracies)
        syc_time = time.time() - syc_start
        # print('----sync time of this part:', syc_time)  # print the time to sync
        total_sync += syc_time

        # Only the master node writes to the file - this avoids write conflicts
        if rank == 0:
            f[loss_key][:] = losses
            f[acc_key][:] = accuracies
            f.flush()

        # print syc_time to check
        print('Evaluating rank %d %d/%d (%.1f%%) coord=%s \t%s= %.3f \t%s=%.2f \ttime=%.2f \tsync=%.2f' % (
            rank, count, len(inds), 100.0 * count / len(inds), str(coord), loss_key, loss,
            acc_key, acc, loss_compute_time, syc_time))

    # This is only needed to make MPI run smoothly. If this process has less work than
    # the rank0 process, then we need to keep calling reduce so the rank0 process doesn't block
    for i in range(max(inds_nums) - len(inds)):
        losses = mpi.reduce_max(comm, losses)
        accuracies = mpi.reduce_max(comm, accuracies)

    total_time = time.time() - start_time
    print('Rank %d done! Total time: %.2f Sync: %.2f' % (rank, total_time, total_sync))
    f.close()
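On the RS question about syncing inside the loop: the per-iteration reduce is what lets rank 0 write a usable partial surface after every point. Each rank holds mostly -1 entries plus its own results, and the element-wise max merges them, so an interrupted run can resume from whatever has already been written to the HDF5 file. A tiny self-contained numpy illustration of that merge (no MPI required; the values are made up, it just shows the same max-based combination):

import numpy as np

# Two ranks, a 2x3 loss grid, everything initialised to -1.
losses_rank0 = -np.ones((2, 3))
losses_rank1 = -np.ones((2, 3))

# Each rank has filled only the entries it owns so far.
losses_rank0.ravel()[[0, 2]] = [2.31, 1.87]   # rank 0's finished points
losses_rank1.ravel()[[3, 5]] = [0.95, 1.42]   # rank 1's finished points

# reduce_max across ranks is an element-wise maximum of the local arrays:
merged = np.maximum(losses_rank0, losses_rank1)
print(merged)   # finished points keep their values, the rest stay -1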