def handler(event, context):
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    assert dataset_type == "sparse_libsvm"
    n_features = event['n_features']

    # ps setting
    host = event['host']
    port = event['port']

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    assert sync_mode.lower() == Synchronization.Reduce

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('host = {}'.format(host))
    print('port = {}'.format(port))

    # Set up the thrift connection
    # Make socket
    transport = TSocket.TSocket(host, port)
    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)
    # Wrap in a protocol
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    # Create a client to use the protocol encoder
    t_client = ParameterServer.Client(protocol)
    # Connect!
    transport.open()

    # test the thrift connection
    ps_client.ping(t_client)
    print("create and ping thrift server >>> HOST = {}, PORT = {}".format(host, port))

    # Read data from S3
    read_start = time.time()
    storage = S3Storage()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    train_set = dataset.ins_list
    np_dtype = train_set[0].to_dense().numpy().dtype
    centroid_shape = (n_clusters, n_features)
    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, data type: {}, centroids shape: {}"
          .format(dataset_type, np_dtype, centroid_shape))

    # register model
    model_name = Prefix.KMeans_Cent
    model_length = centroid_shape[0] * centroid_shape[1] + 1
    ps_client.register_model(t_client, worker_index, model_name, model_length, n_workers)
    ps_client.exist_model(t_client, model_name)
    print("register and check model >>> name = {}, length = {}".format(model_name, model_length))

    # initialize centroids: worker 0 pushes the initial centroids (plus a large
    # initial error) to the parameter server, the others push a zero update
    init_centroids_start = time.time()
    ps_client.can_pull(t_client, model_name, 0, worker_index)
    ps_model = ps_client.pull_model(t_client, model_name, 0, worker_index)
    if worker_index == 0:
        centroids_np = sparse_centroid_to_numpy(train_set[0:n_clusters], n_clusters)
        ps_client.can_push(t_client, model_name, 0, worker_index)
        ps_client.push_grad(t_client, model_name,
                            np.append(centroids_np.flatten(), 1000.).astype(np.double)
                            - np.asarray(ps_model).astype(np.double),
                            1., 0, worker_index)
    else:
        centroids_np = np.zeros(centroid_shape)
        ps_client.can_push(t_client, model_name, 0, worker_index)
        ps_client.push_grad(t_client, model_name,
                            np.append(centroids_np.flatten(), 0).astype(np.double),
                            0, 0, worker_index)
    ps_client.can_pull(t_client, model_name, 1, worker_index)
    ps_model = ps_client.pull_model(t_client, model_name, 1, worker_index)
    cur_centroids = np.array(ps_model[0:-1]).astype(np.float32).reshape(centroid_shape)
    cur_error = float(ps_model[-1])
    print("initial centroids cost {} s".format(time.time() - init_centroids_start))

    model = cluster_models.get_model(train_set, torch.from_numpy(cur_centroids),
                                     dataset_type, n_features, n_clusters)

    train_start = time.time()
    for epoch in range(1, n_epochs + 1):
        epoch_start = time.time()

        # local computation
        model.find_nearest_cluster()
        local_cent = model.get_centroids("numpy").reshape(-1)
        local_cent_error = np.concatenate((local_cent.astype(np.double).flatten(),
                                           np.array([model.error], dtype=np.double)))
        epoch_cal_time = time.time() - epoch_start

        # push updates
        epoch_comm_start = time.time()
        last_cent_error = np.concatenate((cur_centroids.astype(np.double).flatten(),
                                          np.array([cur_error], dtype=np.double)))
        ps_model_inc = local_cent_error - last_cent_error
        ps_client.can_push(t_client, model_name, epoch, worker_index)
        ps_client.push_grad(t_client, model_name, ps_model_inc, 1. / n_workers, epoch, worker_index)

        # pull new model
        ps_client.can_pull(t_client, model_name, epoch + 1, worker_index)   # sync all workers
        ps_model = ps_client.pull_model(t_client, model_name, epoch + 1, worker_index)
        model.centroids = [torch.from_numpy(c).reshape(1, n_features).to_sparse()
                           for c in np.array(ps_model[0:-1]).astype(np.float32).reshape(centroid_shape)]
        model.error = float(ps_model[-1])
        cur_centroids = model.get_centroids("numpy")
        cur_error = model.error

        epoch_comm_time = time.time() - epoch_comm_start
        print("Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
              .format(epoch, worker_index, model.error,
                      time.time() - epoch_start, epoch_cal_time, epoch_comm_time))

        if model.error < threshold:
            break

    print("Worker[{}] finishes training: Error = {}, cost {} s"
          .format(worker_index, model.error, time.time() - train_start))
    return
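
# A hypothetical invocation sketch for the parameter-server handler above (not part
# of the original code): the bucket, object key, and Thrift host/port are placeholder
# values that depend on how the Lambda workers and the parameter server are deployed.
#
# example_event = {
#     "data_bucket": "kmeans-dataset-bucket",   # assumed S3 bucket name
#     "file": "part_0.libsvm",                  # assumed object key of this worker's shard
#     "dataset_type": "sparse_libsvm",
#     "n_features": 123,
#     "host": "10.0.0.1",                       # assumed parameter-server host
#     "port": 27000,                            # assumed parameter-server port
#     "n_clusters": 10,
#     "n_epochs": 20,
#     "threshold": 0.02,
#     "sync_mode": "reduce",
#     "n_workers": 8,
#     "worker_index": 0,
# }
# handler(example_event, None)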
def handler(event, context):
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    n_features = event['n_features']
    tmp_bucket = event["tmp_bucket"]
    merged_bucket = event["merged_bucket"]

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    assert sync_mode.lower() in [Synchronization.Reduce, Synchronization.Reduce_Scatter]

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('sync mode = {}'.format(sync_mode))

    storage = S3Storage()
    communicator = S3Communicator(storage, tmp_bucket, merged_bucket, n_workers, worker_index)

    # Read data from S3
    read_start = time.time()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    if dataset_type == "dense_libsvm":
        dataset = dataset.ins_np
        data_type = dataset.dtype
        centroid_shape = (n_clusters, dataset.shape[1])
    elif dataset_type == "sparse_libsvm":
        dataset = dataset.ins_list
        first_entry = dataset[0].to_dense().numpy()
        data_type = first_entry.dtype
        centroid_shape = (n_clusters, first_entry.shape[1])
    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, dtype: {}, centroids shape: {}, num_features: {}"
          .format(dataset_type, data_type, centroid_shape, n_features))

    # initialize centroids: worker 0 writes them to the merged bucket, the others wait
    init_centroids_start = time.time()
    if worker_index == 0:
        if dataset_type == "dense_libsvm":
            centroids = dataset[0:n_clusters]
        elif dataset_type == "sparse_libsvm":
            centroids = sparse_centroid_to_numpy(dataset[0:n_clusters], n_clusters)
        storage.save(centroids.tobytes(), Prefix.KMeans_Init_Cent + "-1", merged_bucket)
        print("generate initial centroids takes {} s".format(time.time() - init_centroids_start))
    else:
        centroid_bytes = storage.load_or_wait(Prefix.KMeans_Init_Cent + "-1", merged_bucket).read()
        centroids = centroid_bytes2np(centroid_bytes, n_clusters, data_type)
        if centroid_shape != centroids.shape:
            raise Exception("The shape of centroids does not match.")
        print("waiting for initial centroids takes {} s".format(time.time() - init_centroids_start))

    model = cluster_models.get_model(dataset, centroids, dataset_type, n_features, n_clusters)

    train_start = time.time()
    for epoch in range(n_epochs):
        epoch_start = time.time()

        # rearrange data points
        model.find_nearest_cluster()
        local_cent = model.get_centroids("numpy").reshape(-1)
        local_cent_error = np.concatenate((local_cent.flatten(), np.array([model.error])))
        epoch_cal_time = time.time() - epoch_start

        # sync local centroids and error
        epoch_sync_start = time.time()
        postfix = str(epoch)
        if sync_mode == "reduce":
            cent_error_merge = communicator.reduce_epoch(local_cent_error, postfix)
        elif sync_mode == "reduce_scatter":
            cent_error_merge = communicator.reduce_scatter_epoch(local_cent_error, postfix)

        cent_merge = cent_error_merge[:-1].reshape(centroid_shape) / float(n_workers)
        error_merge = cent_error_merge[-1] / float(n_workers)
        model.centroids = cent_merge
        model.error = error_merge
        epoch_sync_time = time.time() - epoch_sync_start

        print("Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
              .format(epoch, worker_index, model.error,
                      time.time() - epoch_start, epoch_cal_time, epoch_sync_time))

        if model.error < threshold:
            break

    if worker_index == 0:
        storage.clear(tmp_bucket)
        storage.clear(merged_bucket)

    print("Worker[{}] finishes training: Error = {}, cost {} s"
          .format(worker_index, model.error, time.time() - train_start))
    return
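
# A hypothetical example event for the S3-based handler above (not part of the
# original code): all bucket and key names are placeholders; tmp_bucket and
# merged_bucket are the scratch buckets used by the S3Communicator.
#
# example_event = {
#     "data_bucket": "kmeans-dataset-bucket",   # assumed S3 bucket name
#     "file": "part_0.libsvm",                  # assumed object key of this worker's shard
#     "dataset_type": "dense_libsvm",           # or "sparse_libsvm"
#     "n_features": 123,
#     "tmp_bucket": "kmeans-tmp-bucket",        # assumed scratch bucket
#     "merged_bucket": "kmeans-merged-bucket",  # assumed scratch bucket
#     "n_clusters": 10,
#     "n_epochs": 20,
#     "threshold": 0.02,
#     "sync_mode": "reduce",                    # or "reduce_scatter"
#     "n_workers": 8,
#     "worker_index": 0,
# }
# handler(example_event, None)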
def handler(event, context):
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    assert dataset_type == "dense_libsvm"
    n_features = event['n_features']
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    tmp_table_name = event['tmp_table_name']
    merged_table_name = event['merged_table_name']
    key_col = event['key_col']

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    assert sync_mode.lower() in [Synchronization.Reduce, Synchronization.Reduce_Scatter]

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('sync mode = {}'.format(sync_mode))

    s3_storage = S3Storage()
    dynamo_client = dynamo_operator.get_client()
    tmp_table = DynamoTable(dynamo_client, tmp_table_name)
    merged_table = DynamoTable(dynamo_client, merged_table_name)
    communicator = DynamoCommunicator(dynamo_client, tmp_table, merged_table,
                                      key_col, n_workers, worker_index)

    # Read data from S3
    read_start = time.time()
    lines = s3_storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type).ins_np
    data_type = dataset.dtype
    centroid_shape = (n_clusters, dataset.shape[1])
    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, dtype: {}, centroids shape: {}, num_features: {}"
          .format(dataset_type, data_type, centroid_shape, n_features))

    # initialize centroids: worker 0 writes them to the merged table, the others wait
    init_centroids_start = time.time()
    if worker_index == 0:
        centroids = dataset[0:n_clusters]
        merged_table.save(centroids.tobytes(), Prefix.KMeans_Init_Cent + "-1", key_col)
    else:
        centroid_bytes = (merged_table.load_or_wait(Prefix.KMeans_Init_Cent + "-1",
                                                    key_col, 0.1))['value'].value
        centroids = centroid_bytes2np(centroid_bytes, n_clusters, data_type)
        if centroid_shape != centroids.shape:
            raise Exception("The shape of centroids does not match.")
    print("initialize centroids takes {} s".format(time.time() - init_centroids_start))

    model = cluster_models.get_model(dataset, centroids, dataset_type, n_features, n_clusters)

    train_start = time.time()
    for epoch in range(n_epochs):
        epoch_start = time.time()

        # rearrange data points
        model.find_nearest_cluster()
        local_cent = model.get_centroids("numpy").reshape(-1)
        local_cent_error = np.concatenate((local_cent.flatten(),
                                           np.array([model.error], dtype=np.float32)))
        epoch_cal_time = time.time() - epoch_start

        # sync local centroids and error
        epoch_comm_start = time.time()
        if sync_mode == "reduce":
            cent_error_merge = communicator.reduce_epoch(local_cent_error, epoch)
        elif sync_mode == "reduce_scatter":
            cent_error_merge = communicator.reduce_scatter_epoch(local_cent_error, epoch)

        cent_merge = cent_error_merge[:-1].reshape(centroid_shape) / float(n_workers)
        error_merge = cent_error_merge[-1] / float(n_workers)
        model.centroids = cent_merge
        model.error = error_merge
        epoch_comm_time = time.time() - epoch_comm_start
        print("one {} round cost {} s".format(sync_mode, epoch_comm_time))

        print("Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
              .format(epoch, worker_index, model.error,
                      time.time() - epoch_start, epoch_cal_time, epoch_comm_time))

        if model.error < threshold:
            break

    if worker_index == 0:
        tmp_table.clear(key_col)
        merged_table.clear(key_col)

    print("Worker[{}] finishes training: Error = {}, cost {} s"
          .format(worker_index, model.error, time.time() - train_start))
    return
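
# A hypothetical example event for the DynamoDB-based handler above (not part of
# the original code): bucket, table, and key-column names are placeholders; the
# two DynamoDB tables are assumed to exist and share the same key column.
#
# example_event = {
#     "data_bucket": "kmeans-dataset-bucket",   # assumed S3 bucket name
#     "file": "part_0.libsvm",                  # assumed object key of this worker's shard
#     "dataset_type": "dense_libsvm",
#     "n_features": 123,
#     "n_workers": 8,
#     "worker_index": 0,
#     "tmp_table_name": "kmeans-tmp",           # assumed DynamoDB table
#     "merged_table_name": "kmeans-merged",     # assumed DynamoDB table
#     "key_col": "key",                         # assumed key column name
#     "n_clusters": 10,
#     "n_epochs": 20,
#     "threshold": 0.02,
#     "sync_mode": "reduce_scatter",            # or "reduce"
# }
# handler(example_event, None)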
def handler(event, context):
    # dataset
    data_bucket = event['data_bucket']
    file = event['file']
    dataset_type = event["dataset_type"]
    n_features = event['n_features']
    tmp_bucket = event["tmp_bucket"]
    merged_bucket = event["merged_bucket"]
    assert dataset_type == "sparse_libsvm"

    # hyper-parameter
    n_clusters = event['n_clusters']
    n_epochs = event["n_epochs"]
    threshold = event["threshold"]
    sync_mode = event["sync_mode"]
    n_workers = event["n_workers"]
    worker_index = event['worker_index']
    assert sync_mode.lower() in [Synchronization.Reduce, Synchronization.Reduce_Scatter]

    print('data bucket = {}'.format(data_bucket))
    print("file = {}".format(file))
    print('number of workers = {}'.format(n_workers))
    print('worker index = {}'.format(worker_index))
    print('num clusters = {}'.format(n_clusters))
    print('sync mode = {}'.format(sync_mode))

    storage = S3Storage()
    communicator = S3Communicator(storage, tmp_bucket, merged_bucket, n_workers, worker_index)

    # Read data from S3
    read_start = time.time()
    lines = storage.load(file, data_bucket).read().decode('utf-8').split("\n")
    print("read data cost {} s".format(time.time() - read_start))

    parse_start = time.time()
    dataset = libsvm_dataset.from_lines(lines, n_features, dataset_type)
    train_set = dataset.ins_list
    np_dtype = train_set[0].to_dense().numpy().dtype
    centroid_shape = (n_clusters, n_features)
    print("parse data cost {} s".format(time.time() - parse_start))
    print("dataset type: {}, data type: {}, centroids shape: {}"
          .format(dataset_type, np_dtype, centroid_shape))

    # initialize centroids: worker 0 writes them to the merged bucket, the others wait
    init_centroids_start = time.time()
    if worker_index == 0:
        centroids_np = sparse_centroid_to_numpy(train_set[0:n_clusters], n_clusters)
        storage.save(centroids_np.tobytes(), Prefix.KMeans_Init_Cent + "-1", merged_bucket)
    else:
        centroid_bytes = storage.load_or_wait(Prefix.KMeans_Init_Cent + "-1", merged_bucket).read()
        centroids_np = np.frombuffer(centroid_bytes, dtype=np_dtype).reshape(centroid_shape)
    centroids = torch.from_numpy(centroids_np)
    print("initial centroids cost {} s".format(time.time() - init_centroids_start))

    model = cluster_models.get_model(train_set, centroids, dataset_type, n_features, n_clusters)
    assert isinstance(model, SparseKMeans)

    train_start = time.time()
    for epoch in range(n_epochs):
        epoch_start = time.time()

        # rearrange data points
        model.find_nearest_cluster()
        local_cent = model.get_centroids("numpy").astype(np.float32).reshape(-1)
        local_cent_error = np.concatenate((local_cent.flatten(),
                                           np.array([model.error], dtype=np.float32)))
        epoch_cal_time = time.time() - epoch_start

        # sync local centroids and error
        epoch_sync_start = time.time()
        postfix = str(epoch)
        if sync_mode == "reduce":
            cent_error_merge = communicator.reduce_epoch(local_cent_error, postfix)
        elif sync_mode == "reduce_scatter":
            cent_error_merge = communicator.reduce_scatter_epoch(local_cent_error, postfix)
        print("one {} round cost {} s".format(sync_mode, time.time() - epoch_sync_start))

        cent_merge = cent_error_merge[:-1].reshape(centroid_shape) / float(n_workers)
        print("merged centroids shape = {}".format(cent_merge.shape))
        error_merge = cent_error_merge[-1] / float(n_workers)
        model.centroids = [torch.from_numpy(c).reshape(1, n_features).to_sparse()
                           for c in cent_merge]
        model.error = error_merge
        epoch_sync_time = time.time() - epoch_sync_start

        print("Epoch[{}] Worker[{}], error = {}, cost {} s, cal cost {} s, sync cost {} s"
              .format(epoch, worker_index, model.error,
                      time.time() - epoch_start, epoch_cal_time, epoch_sync_time))

        if model.error < threshold:
            break

    if worker_index == 0:
        storage.clear(tmp_bucket)
        storage.clear(merged_bucket)

    print("Worker[{}] finishes training: Error = {}, cost {} s"
          .format(worker_index, model.error, time.time() - train_start))
    return