def partition_agaricus(batch_size, train_file, test_file):
    train_dataset = SparseDatasetWithLines(train_file, 127)
    test_dataset = SparseDatasetWithLines(test_file, 127)
    size = dist.get_world_size()
    bsz = 1 if batch_size == 1 else int(batch_size / float(size))
    train_partition_sizes = [1.0 / size for _ in range(size)]
    train_partition = DataPartitioner(train_dataset, train_partition_sizes)
    train_partition = train_partition.use(dist.get_rank())
    train_loader = DataLoader(train_partition, batch_size=bsz, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    return train_partition, train_loader, bsz, test_loader
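# Usage sketch (added for illustration, not part of the original source): it assumes
# torch.distributed has already been initialized by the launcher and that the agaricus
# LIBSVM files exist at the paths below; the batch size is a placeholder value.
def _example_partition_agaricus():
    train_part, train_loader, bsz, test_loader = partition_agaricus(
        batch_size=32,
        train_file="../dataset/agaricus_127d_train.libsvm",
        test_file="../dataset/agaricus_127d_test.libsvm")
    # each worker iterates only over its own 1/world_size shard of the training data
    for batch in train_loader:
        pass
    return bsz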
def run(args):
    device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    torch.manual_seed(1234)
    read_start = time.time()
    avg_error = np.iinfo(np.int16).max
    logging.info(f"{args.rank}-th worker starts.")

    file_name = "{}/{}_{}".format(args.root, args.rank, args.world_size)
    train_file = open(file_name, 'r').readlines()
    train_set = SparseDatasetWithLines(train_file, args.features)
    train_set = [t[0] for t in train_set]
    logging.info(f"Loading dataset costs {time.time() - read_start}s")

    # initialize centroids
    init_cent_start = time.time()
    if args.rank == 0:
        c_dense_list = [t.to_dense() for t in train_set[:args.num_clusters]]
        centroids = torch.stack(c_dense_list).reshape(args.num_clusters, args.features)
    else:
        centroids = torch.empty(args.num_clusters, args.features)
    if dist_is_initialized():
        dist.broadcast(centroids, 0)
    logging.info(f"Receiving initial centroids costs {time.time() - init_cent_start}s")

    training_start = time.time()
    for epoch in range(args.epochs):
        if avg_error >= args.threshold:
            start_compute = time.time()
            model = SparseKmeans(train_set, centroids, args.features, args.num_clusters)
            model.find_nearest_cluster()
            error = torch.tensor(model.error)
            end_compute = time.time()
            logging.info(f"{args.rank}-th worker computing centroids takes {end_compute - start_compute}s")
            sync_start = time.time()
            if dist_is_initialized():
                centroids, avg_error = broadcast_average(args, model.get_centroids("dense_tensor"), error)
            logging.info(
                f"{args.rank}-th worker finished {epoch} epoch. "
                f"Computing takes {end_compute - start_compute}s. "
                f"Communicating takes {time.time() - sync_start}s. "
                # f"Centroids: {model.get_centroids('dense_tensor')}. "
                f"Loss: {model.error}")
        else:
            logging.info(
                f"{args.rank}-th worker finished training. Error = {avg_error}, centroids = {centroids}")
            logging.info(f"Whole process time : {time.time() - training_start}")
            return
def partition_sparse(file, num_feature):
    train_dataset = SparseDatasetWithLines(file, num_feature)
    size = 1
    rank = 0
    if dist_is_initialized():
        size = dist.get_world_size()
        rank = dist.get_rank()
    train_partition_sizes = [1.0 / size for _ in range(size)]
    train_partition = DataPartitioner(train_dataset, train_partition_sizes)
    train_partition = train_partition.use(rank)
    return train_partition
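# Usage sketch (illustrative, not part of the original source): partition a LIBSVM
# file across the workers of the current process group; without an initialized
# process group it degenerates to a single partition holding the whole dataset.
# The path and feature count below are placeholders.
def _example_partition_sparse():
    train_partition = partition_sparse("../dataset/agaricus_127d_train.libsvm", 127)
    return train_partition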
        return

    def get_centroids(self, centroids_type):
        if centroids_type == "sparse_tensor":
            return self.centroids
        if centroids_type == "numpy":
            cent_lst = [self.centroids[i].to_dense().numpy()
                        for i in range(self.nr_cluster)]
            centroid = np.array(cent_lst).reshape(self.nr_cluster, self.nr_feature)
            return centroid
        if centroids_type == "dense_tensor":
            cent_tensor_lst = [self.centroids[i].to_dense()
                               for i in range(self.nr_cluster)]
            return torch.stack(cent_tensor_lst)


if __name__ == "__main__":
    train_file = "../dataset/agaricus_127d_train.libsvm"
    test_file = "../dataset/agaricus_127d_test.libsvm"
    dim = 127
    train_data = SparseDatasetWithLines(train_file, dim)
    test_data = SparseDatasetWithLines(test_file, dim)
    nr_cluster = 10
    centroids = train_data.ins_list[:nr_cluster]
    kmeans_model = SparseKmeans(train_data, centroids, dim, nr_cluster)
    kmeans_model.find_nearest_cluster()
def handler(event, context):
    avg_error = np.iinfo(np.int16).max

    num_features = event['num_features']
    num_clusters = event['num_clusters']
    worker_cent_bucket = event["worker_cent_bucket"]
    avg_cent_bucket = event["avg_cent_bucket"]
    num_epochs = event["num_epochs"]
    threshold = event["threshold"]
    dataset_type = event["dataset_type"]
    elastic_location = event["elasticache"]
    elastic_endpoint = memcached_init(elastic_location)
    print(elastic_endpoint)

    # Reading data from S3
    bucket_name = event['bucket_name']
    key = urllib.parse.unquote_plus(event['key'], encoding='utf-8')
    logger.info(f"Reading training data from bucket = {bucket_name}, key = {key}")
    key_splits = key.split("_")
    num_worker = int(key_splits[-1])
    worker_index = int(key_splits[0])

    event_start = time.time()
    file = get_object(bucket_name, key).read().decode('utf-8').split("\n")
    s3_end = time.time()
    logger.info(f"Getting object from s3 takes {s3_end - event_start}s")

    if dataset_type == "dense":
        # dataset is stored as numpy array
        dataset = DenseDatasetWithLines(file, num_features).ins_np
        dt = dataset.dtype
        centroid_shape = (num_clusters, dataset.shape[1])
    else:
        # dataset is sparse, stored as sparse tensor
        dataset = SparseDatasetWithLines(file, num_features)
        first_entry = dataset.ins_list[0].to_dense().numpy()
        dt = first_entry.dtype
        centroid_shape = (num_clusters, first_entry.shape[1])
    parse_end = time.time()
    logger.info(f"Parsing dataset takes {parse_end - s3_end}s")
    logger.info(
        f"worker index: {worker_index}, dataset: {dataset_type}, dtype: {dt}, "
        f"centroids shape: {centroid_shape}, num_features: {num_features}")

    if worker_index == 0:
        if dataset_type == "dense":
            centroids = dataset[0:num_clusters].reshape(-1)
            hset_object(elastic_endpoint, avg_cent_bucket, "initial", centroids.tobytes())
            centroids = centroids.reshape(centroid_shape)
        else:
            centroids = store_centroid_as_numpy(dataset.ins_list[0:num_clusters], num_clusters)
            hset_object(elastic_endpoint, avg_cent_bucket, "initial", centroids.tobytes())
    else:
        cent = hget_object_or_wait(elastic_endpoint, avg_cent_bucket, "initial", 0.00001)
        centroids = process_centroid(cent, num_clusters, dt)
        # centroids = np.frombuffer(cent, dtype=dt)
        if centroid_shape != centroids.shape:
            logger.error("The shape of centroids does not match.")
    logger.info(f"Waiting for initial centroids takes {time.time() - parse_end} s")

    training_start = time.time()
    sync_time = 0
    for epoch in range(num_epochs):
        logger.info(f"{worker_index}-th worker in {epoch}-th epoch")
        epoch_start = time.time()
        if epoch != 0:
            last_epoch = epoch - 1
            cent_with_error = hget_object_or_wait(elastic_endpoint, avg_cent_bucket,
                                                  f"avg-{last_epoch}", 0.00001)
            wait_end = time.time()
            if worker_index != 0:
                logger.info(f"Wait for centroid for {epoch}-th epoch. Takes {wait_end - epoch_start}")
                sync_time += wait_end - epoch_start
            avg_error, centroids = process_centroid(cent_with_error, num_clusters, dt, True)

        if avg_error >= threshold:
            print("get new centroids")
            res = get_new_centroids(dataset, dataset_type, centroids, epoch, num_features, num_clusters)
            # dt = res.dtype
            sync_start = time.time()
            success = hset_object(elastic_endpoint, worker_cent_bucket,
                                  f"{worker_index}_{epoch}", res.tobytes())

            if worker_index == 0 and success:
                compute_average_centroids(elastic_endpoint, avg_cent_bucket, worker_cent_bucket,
                                          num_worker, centroid_shape, epoch, dt)
                logger.info(f"Waiting for all workers takes {time.time() - sync_start} s")
                if epoch != 0:
                    sync_time += time.time() - sync_start
        else:
            print("sync time = {}".format(sync_time))
            logger.info(
                f"{worker_index}-th worker finished training. Error = {avg_error}, centroids = {centroids}")
            logger.info(f"Whole process time : {time.time() - training_start}")
            return

    print("sync time = {}".format(sync_time))
    put_object("kmeans-time", "time_{}".format(worker_index), np.asarray(sync_time).tobytes())
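# Illustrative invocation event for this handler (all values are placeholders; the key
# layout matches how the handler parses it: key.split("_")[0] is the worker index and
# key.split("_")[-1] is the number of workers).
example_event = {
    "bucket_name": "kmeans-dataset",
    "key": "0_4",                       # worker 0 out of 4 workers
    "num_features": 127,
    "num_clusters": 10,
    "num_epochs": 10,
    "threshold": 0.02,
    "dataset_type": "sparse",           # or "dense"
    "worker_cent_bucket": "worker-centroids",
    "avg_cent_bucket": "avg-centroids",
    "elasticache": "<memcached-endpoint>",
}
# handler(example_event, None)  # in practice the event is delivered by AWS Lambda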
def run(args):
    device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    logging.info(f"{args.rank}-th worker starts.")

    read_start = time.time()
    torch.manual_seed(1234)
    train_file = open(args.train_file, 'r').readlines()
    dataset = SparseDatasetWithLines(train_file, args.features)
    logging.info(f"Loading dataset costs {time.time() - read_start}s")

    preprocess_start = time.time()
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(0.2 * dataset_size))
    if args.shuffle:
        np.random.seed(42)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]
    train_set = [dataset[i] for i in train_indices]
    val_set = [dataset[i] for i in val_indices]
    logging.info("preprocess data cost {} s".format(time.time() - preprocess_start))

    lr = LogisticRegression(train_set, val_set, args.features, args.epochs,
                            args.learning_rate, args.batch_size)

    training_start = time.time()
    for epoch in range(args.epochs):
        epoch_start = time.time()
        num_batches = math.floor(len(train_set) / args.batch_size)
        for batch_idx in range(num_batches):
            batch_start = time.time()
            batch_ins, batch_label = lr.next_batch(batch_idx)
            batch_grad = torch.zeros(lr.n_input, 1, requires_grad=False)
            batch_bias = 0.0
            train_loss = Loss()
            train_acc = Accuracy()
            for i in range(len(batch_ins)):
                z = lr.forward(batch_ins[i])
                h = lr.sigmoid(z)
                loss = lr.loss(h, batch_label[i])
                # print("z= {}, h= {}, loss = {}".format(z, h, loss))
                train_loss.update(loss, 1)
                train_acc.update(h, batch_label[i])
                g = lr.backward(batch_ins[i], h.item(), batch_label[i])
                batch_grad.add_(g)
                batch_bias += np.sum(h.item() - batch_label[i])
            batch_grad = batch_grad.div(len(batch_ins))
            batch_bias = batch_bias / len(batch_ins)
            batch_grad.mul_(-1.0 * args.learning_rate)
            lr.grad.add_(batch_grad)
            lr.bias = lr.bias - batch_bias * args.learning_rate
            end_compute = time.time()
            logging.info(f"Train loss: {train_loss}, train accuracy: {train_acc}")
            logging.info(
                f"{args.rank}-th worker finishes computing one batch. Takes {time.time() - batch_start}")

            weights = np.append(lr.grad.numpy().flatten(), lr.bias)
            weights_merged = broadcast_average(args, torch.tensor(weights))
            lr.grad, lr.bias = weights_merged[:-1].reshape(args.features, 1), float(weights_merged[-1])
            logging.info(
                f"{args.rank}-th worker finishes synchronizing. Takes {time.time() - end_compute}")

        val_loss, val_acc = lr.evaluate()
        logging.info(f"Validation loss: {val_loss}, validation accuracy: {val_acc}")
        logging.info(f"Epoch takes {time.time() - epoch_start}s")

    logging.info(f"Finishes training. {args.epochs} epochs take {time.time() - training_start}s.")
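# Launcher sketch (illustrative, not part of the original source): it only lists the
# argparse fields that run(args) actually reads; the defaults and option names are
# assumptions, and the distributed process-group setup is left to the caller.
def _example_parse_args():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--rank", type=int, default=0)
    parser.add_argument("--train-file", type=str, default="../dataset/agaricus_127d_train.libsvm")
    parser.add_argument("--features", type=int, default=127)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--learning-rate", type=float, default=0.01)
    parser.add_argument("--batch-size", type=int, default=100)
    parser.add_argument("--shuffle", action="store_true")
    parser.add_argument("--no-cuda", action="store_true")
    return parser.parse_args()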
def handler(event, context):
    try:
        start_time = time.time()

        num_features = event['num_features']
        learning_rate = event["learning_rate"]
        batch_size = event["batch_size"]
        num_epochs = event["num_epochs"]
        validation_ratio = event["validation_ratio"]

        # Reading data from S3
        bucket_name = event['bucket_name']
        key = urllib.parse.unquote_plus(event['key'], encoding='utf-8')
        print(f"Reading training data from bucket = {bucket_name}, key = {key}")
        key_splits = key.split("_")
        worker_index = int(key_splits[0])
        num_worker = int(key_splits[1])

        # read file from s3
        file = get_object(bucket_name, key).read().decode('utf-8').split("\n")
        print("read data cost {} s".format(time.time() - start_time))

        parse_start = time.time()
        dataset = SparseDatasetWithLines(file, num_features)
        print("parse data cost {} s".format(time.time() - parse_start))

        preprocess_start = time.time()
        dataset_size = len(dataset)
        indices = list(range(dataset_size))
        split = int(np.floor(validation_ratio * dataset_size))
        if shuffle_dataset:
            np.random.seed(random_seed)
            np.random.shuffle(indices)
        train_indices, val_indices = indices[split:], indices[:split]
        train_set = [dataset[i] for i in train_indices]
        val_set = [dataset[i] for i in val_indices]
        print("preprocess data cost {} s".format(time.time() - preprocess_start))

        svm = SparseSVM(train_set, val_set, num_features, num_epochs, learning_rate, batch_size)

        # Training the Model
        for epoch in range(num_epochs):
            epoch_start = time.time()
            num_batches = math.floor(len(train_set) / batch_size)
            print(f"worker {worker_index} epoch {epoch}")
            for batch_idx in range(num_batches):
                batch_start = time.time()
                batch_ins, batch_label = svm.next_batch(batch_idx)
                acc = svm.one_epoch(batch_idx, epoch)
                np_grad = svm.weights.numpy().flatten()
                print(f"computation takes {time.time() - batch_start}s")

                sync_start = time.time()
                put_object(grad_bucket, w_grad_prefix + str(worker_index), np_grad.tobytes())
                file_postfix = "{}_{}".format(epoch, batch_idx)
                if worker_index == 0:
                    w_grad_merge = merge_weights(grad_bucket, num_worker, np_grad.dtype, np_grad.shape)
                    put_object(model_bucket, w_grad_prefix + file_postfix, w_grad_merge.tobytes())
                    # delete_expired_w_b(model_bucket, epoch, batch_idx, w_grad_prefix)
                    svm.weights = torch.from_numpy(w_grad_merge).reshape(num_features, 1)
                else:
                    w_data = get_object_or_wait(model_bucket, w_grad_prefix + file_postfix, 0.1).read()
                    w_grad_merge = np.frombuffer(w_data, dtype=np_grad.dtype).reshape(np_grad.shape)
                    svm.weights = torch.from_numpy(w_grad_merge).reshape(num_features, 1)
                print(f"synchronization cost {time.time() - sync_start}s")
                print(f"batch takes {time.time() - batch_start}s")

                if (batch_idx + 1) % 10 == 0:
                    print(f"Epoch: {epoch + 1}/{num_epochs}, "
                          f"Step: {batch_idx + 1}/{len(train_indices) / batch_size}, "
                          f"train acc: {acc}")

            val_acc = svm.evaluate()
            print(f"validation accuracy: {val_acc}")
            print(f"Epoch takes {time.time() - epoch_start}s")

        if worker_index == 0:
            clear_bucket(model_bucket)
            clear_bucket(grad_bucket)
        print("elapsed time = {} s".format(time.time() - start_time))

    except Exception as e:
        print("Error {}".format(e))
def handler(event, context):
    try:
        start_time = time.time()

        bucket_name = event['bucket_name']
        worker_index = event['rank']
        num_workers = event['num_workers']
        key = event['file']
        merged_bucket = event['merged_bucket']
        num_features = event['num_features']
        learning_rate = event["learning_rate"]
        batch_size = event["batch_size"]
        num_epochs = event["num_epochs"]
        validation_ratio = event["validation_ratio"]
        elasti_location = event['elasticache']
        endpoint = memcached_init(elasti_location)

        # Reading data from S3
        print(f"Reading training data from bucket = {bucket_name}, key = {key}")
        file = get_object(bucket_name, key).read().decode('utf-8').split("\n")
        print("read data cost {} s".format(time.time() - start_time))

        parse_start = time.time()
        dataset = SparseDatasetWithLines(file, num_features)
        print("parse data cost {} s".format(time.time() - parse_start))

        preprocess_start = time.time()
        dataset_size = len(dataset)
        indices = list(range(dataset_size))
        split = int(np.floor(validation_ratio * dataset_size))
        if shuffle_dataset:
            np.random.seed(random_seed)
            np.random.shuffle(indices)
        train_indices, val_indices = indices[split:], indices[:split]
        train_set = [dataset[i] for i in train_indices]
        val_set = [dataset[i] for i in val_indices]
        print("preprocess data cost {} s".format(time.time() - preprocess_start))

        svm = SparseSVM(train_set, val_set, num_features, num_epochs, learning_rate, batch_size)

        # Training the Model
        train_start = time.time()
        for epoch in range(num_epochs):
            epoch_start = time.time()
            num_batches = math.floor(len(train_set) / batch_size)
            print("worker {} epoch {}".format(worker_index, epoch))
            for batch_idx in range(num_batches):
                batch_start = time.time()
                batch_ins, batch_label = svm.next_batch(batch_idx)
                acc = svm.one_epoch(batch_idx, epoch)
                if (batch_idx + 1) % 10 == 0:
                    print("Epoch: {}/{}, Step: {}/{}, train acc: {}"
                          .format(epoch + 1, num_epochs, batch_idx + 1, num_batches, acc))
            cal_time = time.time() - epoch_start

            sync_start = time.time()
            np_w = svm.weights.numpy().flatten()
            postfix = str(epoch)
            w_merge = reduce_epoch(endpoint, np_w, merged_bucket, num_workers, worker_index, postfix)
            svm.weights = torch.from_numpy(w_merge).reshape(num_features, 1)
            sync_time = time.time() - sync_start

            test_start = time.time()
            val_acc = svm.evaluate()
            test_time = time.time() - test_start

            print('Epoch: [%d/%d], Step: [%d/%d], Time: %.4f, epoch cost %.4f, '
                  'cal cost %.4f s, sync cost %.4f s, test cost %.4f s, test accuracy: %s %%'
                  % (epoch + 1, num_epochs, batch_idx + 1, num_batches,
                     time.time() - train_start, time.time() - epoch_start,
                     cal_time, sync_time, test_time, val_acc))

        if worker_index == 0:
            clear_bucket(endpoint)
        print("elapsed time = {} s".format(time.time() - start_time))

    except Exception as e:
        print("Error {}".format(e))
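# Illustrative invocation event for this handler (all values are placeholders;
# the field names are taken from the event keys read above).
example_event = {
    "bucket_name": "svm-dataset",
    "file": "part_0",
    "rank": 0,
    "num_workers": 4,
    "merged_bucket": "merged-weights",
    "num_features": 127,
    "learning_rate": 0.01,
    "batch_size": 100,
    "num_epochs": 10,
    "validation_ratio": 0.2,
    "elasticache": "<memcached-endpoint>",
}
# handler(example_event, None)  # in practice the event is delivered by AWS Lambda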
def handler(event, context):
    start_time = time.time()

    bucket = event['bucket_name']
    worker_index = event['rank']
    num_workers = event['num_workers']
    key = event['file']
    tmp_bucket = event['tmp_bucket']
    merged_bucket = event['merged_bucket']
    num_epochs = event['num_epochs']
    learning_rate = event['learning_rate']
    batch_size = event['batch_size']
    # the training loop below stages per-worker gradients in grad_bucket and reads
    # merged gradients from model_bucket
    grad_bucket = tmp_bucket
    model_bucket = merged_bucket

    print('bucket = {}'.format(bucket))
    print("file = {}".format(key))
    print('tmp bucket = {}'.format(tmp_bucket))
    print('merged bucket = {}'.format(merged_bucket))
    print('number of workers = {}'.format(num_workers))
    print('worker index = {}'.format(worker_index))
    print('num epochs = {}'.format(num_epochs))
    print('learning rate = {}'.format(learning_rate))
    print("batch size = {}".format(batch_size))

    # read file from s3
    file = get_object(bucket, key).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - start_time))

    parse_start = time.time()
    dataset = SparseDatasetWithLines(file, num_features)
    print("parse data cost {} s".format(time.time() - parse_start))

    preprocess_start = time.time()
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_ratio * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
    print("preprocess data cost {} s".format(time.time() - preprocess_start))

    model = LogisticRegression(num_features, num_classes)

    # Loss and Optimizer
    # Softmax is internally computed.
    # Set parameters to be updated.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Training the Model
    for epoch in range(num_epochs):
        for batch_index, (items, labels) in enumerate(train_loader):
            print("------worker {} epoch {} batch {}------".format(worker_index, epoch, batch_index))
            batch_start = time.time()
            items = Variable(items.view(-1, num_features))
            labels = Variable(labels)

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = model(items)
            loss = criterion(outputs, labels)
            loss.backward()
            print("forward and backward cost {} s".format(time.time() - batch_start))

            w_grad = model.linear.weight.grad.data.numpy()
            b_grad = model.linear.bias.grad.data.numpy()
            # print("dtype of grad = {}".format(w_grad.dtype))
            # print("w_grad before merge = {}".format(w_grad[0][0:5]))
            # print("b_grad before merge = {}".format(b_grad))

            sync_start = time.time()
            put_object(grad_bucket, w_grad_prefix + str(worker_index), w_grad.tobytes())
            put_object(grad_bucket, b_grad_prefix + str(worker_index), b_grad.tobytes())

            file_postfix = "{}_{}".format(epoch, batch_index)
            if worker_index == 0:
                w_grad_merge, b_grad_merge = \
                    merge_w_b_grads(grad_bucket, num_workers, w_grad.dtype,
                                    w_grad.shape, b_grad.shape,
                                    w_grad_prefix, b_grad_prefix)
                put_merged_w_b_grad(model_bucket, w_grad_merge, b_grad_merge,
                                    file_postfix, w_grad_prefix, b_grad_prefix)
                delete_expired_w_b(model_bucket, epoch, batch_index, w_grad_prefix, b_grad_prefix)
                model.linear.weight.grad = Variable(torch.from_numpy(w_grad_merge))
                model.linear.bias.grad = Variable(torch.from_numpy(b_grad_merge))
            else:
                w_grad_merge, b_grad_merge = get_merged_w_b_grad(
                    model_bucket, file_postfix, w_grad.dtype, w_grad.shape, b_grad.shape,
                    w_grad_prefix, b_grad_prefix)
                model.linear.weight.grad = Variable(torch.from_numpy(w_grad_merge))
                model.linear.bias.grad = Variable(torch.from_numpy(b_grad_merge))
            # print("w_grad after merge = {}".format(model.linear.weight.grad.data.numpy()[0][:5]))
            # print("b_grad after merge = {}".format(model.linear.bias.grad.data.numpy()))
            print("synchronization cost {} s".format(time.time() - sync_start))

            optimizer.step()
            print("batch cost {} s".format(time.time() - batch_start))

            if (batch_index + 1) % 10 == 0:
                print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f'
                      % (epoch + 1, num_epochs, batch_index + 1,
                         len(train_indices) / batch_size, loss.data))

    if worker_index == 0:
        clear_bucket(model_bucket)
        clear_bucket(grad_bucket)

    # Test the Model
    correct = 0
    total = 0
    for items, labels in validation_loader:
        items = Variable(items.view(-1, num_features))
        # items = Variable(items)
        outputs = model(items)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Accuracy of the model on the %d test samples: %d %%'
          % (len(val_indices), 100 * correct / total))

    print("elapsed time = {} s".format(time.time() - start_time))
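# Illustrative invocation event for this handler (placeholder values only; num_features,
# num_classes, validation_ratio, shuffle_dataset, random_seed and the grad/weight prefixes
# are module-level constants in this file rather than event fields).
example_event = {
    "bucket_name": "lr-dataset",
    "file": "part_0",
    "rank": 0,
    "num_workers": 4,
    "tmp_bucket": "tmp-grads",
    "merged_bucket": "merged-grads",
    "num_epochs": 10,
    "learning_rate": 0.01,
    "batch_size": 100,
}
# handler(example_event, None)  # in practice the event is delivered by AWS Lambda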