def main():
    """Entry point of assignment4.py.

    Sets up logging, loads the configuration file given on the command
    line and calls ``run`` once for every key found in that configuration.

    Example:
        python assignment4/assignment4.py -c ../confs/assignment4.yml -l ../logs/assignment4.log
    """
    # Command line arguments and logging
    cli = get_args()
    ColorizedLogger.setup_logger(cli.log, cli.debug, True)
    main_logger.info("Starting Assignment 4")

    # One run() call per top-level key of the configuration
    settings = Configuration(config_src=cli.config_file)
    for problem_name in settings.config_keys:
        problem_conf = settings.get_config(config_name=problem_name)
        run(run_type=problem_name,
            config=problem_conf,
            tag=settings.tag,
            log_name=cli.log,
            local=cli.local)

    main_logger.info("Assignment 4 Finished")
def __init__(self):
    """Register the available K-Means implementations and reset cached data.

    ``funcs`` maps a variant name to the matching ``KMeansRunner.run_<name>``
    callable; the cached feature matrices start out as ``None``.
    """
    variant_names = ('simple', 'vectorized', 'vectorized_jacob')
    self.funcs = {
        variant: getattr(KMeansRunner, f'run_{variant}')
        for variant in variant_names
    }
    # Nothing is loaded yet
    self.features_iris = None
    self.features_tcga = None
    self.logger = ColorizedLogger('KMeans', 'green')
def __init__(self, mpi):
    """Set up logging and, when requested, the MPI communicator.

    Args:
        mpi: Truthy to run with MPI. In that case ``comm``, ``rank`` and
            ``size`` are taken from ``MPI.COMM_WORLD`` and the logger is
            named and colored after the rank. Otherwise only a serial
            logger is created and those three attributes are not set.
    """
    self._kmeans_log_setup()
    self.mpi_enabled = mpi

    # Serial execution: a single logger using the first color
    if not self.mpi_enabled:
        self.logger = ColorizedLogger('Kmeans Serial', self.colors[0])
        return

    # MPI execution: one logger (and color) per rank
    self.comm = MPI.COMM_WORLD
    self.rank = self.comm.rank
    self.size = self.comm.size
    logger_name = 'Kmeans %s' % self.rank
    logger_color = self.colors[self.rank]
    self.logger = ColorizedLogger(logger_name, logger_color)
def __init__(self, dataset: Dict, epochs: int, batch_size_train: int,
             batch_size_test: int, learning_rate: float,
             test_before_train: bool, momentum: float = 0, seed: int = 1,
             data_parallel: bool = False, log_path: str = None):
    """Store the hyper-parameters and build the model, loss and device.

    Args:
        dataset: Dataset configuration, stored unchanged on the instance.
        epochs: Stored as ``self.epochs``.
        batch_size_train: Stored as ``self.batch_size_train``.
        batch_size_test: Stored as ``self.batch_size_test``.
        learning_rate: Stored as ``self.learning_rate``.
        test_before_train: Stored as ``self.test_before_train``.
        momentum: Stored as ``self.momentum``.
        seed: Passed to ``torch.manual_seed``.
        data_parallel: When true the rank is read from ``dist.get_rank()``;
            otherwise the rank is ``None``.
        log_path: When truthy (and this is the reporting process) it is
            forwarded to the logger setup with ``clear_log=True``.
    """
    self.data_parallel = data_parallel
    self.rank = dist.get_rank() if self.data_parallel else None

    # Only the serial process (rank None) or rank 0 logs and owns a
    # results folder.
    is_reporting_process = self.rank in (None, 0)
    if is_reporting_process:
        if log_path:
            self.__log_setup(log_path=log_path, clear_log=True)
        self.logger = ColorizedLogger('CnnRunner', 'green')

    # Hyper-parameters
    self.dataset = dataset
    self.epochs, self.learning_rate, self.momentum = \
        epochs, learning_rate, momentum
    self.batch_size_train, self.batch_size_test = \
        batch_size_train, batch_size_test
    self.test_before_train = test_before_train

    # Global torch configuration; the seed is set before the model is built
    backends.cudnn.enabled = False
    torch.manual_seed(seed)

    # Training modules
    self.my_model = LeNet5(num_classes=10)
    self.loss_function = nn.CrossEntropyLoss()
    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    # NOTE(review): the device is only stored and logged here; nothing in
    # this method moves the model onto it.
    self.device = torch.device(device_name)

    if is_reporting_process:
        self.logger.info("Model parameters are configured.")
        self.logger.info(f"Device: {self.device}")
        self.logger.info(f"Model Architecture:\n{self.my_model}")
        # Folder where the results are going to be saved
        self.results_path = self.create_results_folder()
def main():
    """Entry point of main.py.

    Sets up logging, loads the configuration and, for each of the known
    sections (bench, mpi, kmeans, cprofile, numba - in that order), runs
    every selected sub-config with the matching ``run_*`` function.

    Example:
        python playground/main.py -m run_mode_1 -c confs/template_conf.yml -l logs/output.log
    """
    # Initializing
    args = get_args()
    log_path = os.path.abspath(args.log)
    ColorizedLogger.setup_logger(log_path, args.debug, clear_log=True)

    # Load the configuration
    conf = Configuration(config_src=args.config_file)

    def selected(section):
        """Lazily yield the sub-configs of `section` that must be executed.

        A sub-config is selected when it is enabled and either its type is
        'required' or the configuration tag is not 'required_only'.
        """
        if section not in conf.config_keys:
            return
        for sub_config in conf.get_config(config_name=section):
            conf_type = sub_config['type']
            conf_enabled = sub_config['enabled']
            tag = conf.tag
            if (conf_type == 'required' or tag != 'required_only') and conf_enabled:
                yield sub_config

    # Start
    for sub_config in selected('bench'):
        run_bench(sub_config)
    for sub_config in selected('mpi'):
        run_mpi(sub_config)
    for sub_config in selected('kmeans'):
        run_kmeans(sub_config)
    for sub_config in selected('cprofile'):
        run_cprofile(sub_config, log_path=log_path)
    for sub_config in selected('numba'):
        run_numba(sub_config)
def main():
    """Entry point of assignment.py (Assignment 1).

    Sets up logging, loads the configuration and runs the selected entries
    of problem1, problem2, problem3 and then extra_challenges.

    Example:
        python assignment1/assignment.py -c ../confs/assignment1.yml -l ../logs/assignment.log
    """
    # Initialize
    args = get_args()
    ColorizedLogger.setup_logger(args.log, args.debug)
    main_logger.info("Starting Assignment 1")

    # Load the configuration
    conf = Configuration(config_src=args.config_file)

    def should_run(bench_conf):
        """Tell whether a single problem entry has to be executed."""
        conf_type = bench_conf['type']
        tag = conf.tag
        if conf_type == 'disabled':
            return False
        return conf_type == 'required' or tag != 'required_only'

    def runnable(problem_key):
        """Lazily iterate over the entries of `problem_key` that should run."""
        if problem_key not in conf.config_keys:
            return iter(())
        return filter(should_run, conf.get_config(config_name=problem_key))

    # Required problems
    main_logger.info(f"{' Required Problems ':-^{100}}")
    for bench_conf in runnable('problem1'):
        problem1(bench_conf)
    for bench_conf in runnable('problem2'):
        problem2(bench_conf)
    for bench_conf in runnable('problem3'):
        problem3(bench_conf)

    # Extra challenges; the same selection rule applies to them
    main_logger.info(f"{' Optional Problems ':-^{100}}")
    for bench_conf in runnable('extra_challenges'):
        extra_challenges(bench_conf)

    main_logger.info("Assignment 1 Finished")
def main():
    """Entry point of the Assignment 2 script.

    Sets up logging, loads the configuration and walks over every config
    key. Selected entries of keys containing 'distributed' are handed to
    ``run_distributed``; selected entries of all other keys go to
    ``run_serial``.
    """
    # Initialize
    args = get_args()
    ColorizedLogger.setup_logger(args.log, args.debug, True)
    main_logger.info("Starting Assignment 2")

    # Load the configuration
    conf = Configuration(config_src=args.config_file)

    # For each problem present in the config file, call the appropriate function
    for config_key in conf.config_keys:
        is_distributed = 'distributed' in config_key
        for bench_conf in conf.get_config(config_name=config_key):
            conf_type = bench_conf['type']
            conf_enabled = bench_conf['enabled']
            tag = conf.tag
            # Optional entries are skipped when only required ones are wanted
            if conf_type != 'required' and tag == 'required_only':
                continue
            if not conf_enabled:
                continue
            if is_distributed:
                run_distributed(name=config_key, conf=bench_conf,
                                log_name=args.log, local=args.local)
            else:
                run_serial(name=config_key, conf=bench_conf,
                           log_name=args.log)

    main_logger.info("Assignment 2 Finished")
class CnnRunner:
    """Train and evaluate a LeNet5 model on MNIST, serially or data-parallel.

    When ``data_parallel`` is set, the rank is read from ``torch.distributed``
    and only rank 0 logs and stores results; in serial mode the rank is
    ``None`` and the single process does both.
    """
    logger: ColorizedLogger
    outputs_file: IO
    dataset: Dict
    epochs: int
    learning_rate: float
    momentum: float
    batch_size_train: int
    batch_size_test: int
    test_before_train: bool
    results_path: str
    data_parallel: bool
    rank: Union[int, None]

    def __init__(self, dataset: Dict, epochs: int, batch_size_train: int,
                 batch_size_test: int, learning_rate: float,
                 test_before_train: bool, momentum: float = 0, seed: int = 1,
                 data_parallel: bool = False, log_path: str = None):
        """Store the hyper-parameters and build the model, loss and device.

        Args:
            dataset: Dataset configuration; ``dataset_loader`` reads its
                'name' and 'save_path' entries.
            epochs: Number of training epochs.
            batch_size_train: Mini-batch size of the training loader.
            batch_size_test: Mini-batch size of the test loader.
            learning_rate: SGD learning rate (multiplied by the world size
                in data-parallel training).
            test_before_train: Whether to evaluate the untrained model too.
            momentum: SGD momentum used by the non-parallel training.
            seed: Passed to ``torch.manual_seed``.
            data_parallel: When true the rank is read from
                ``dist.get_rank()``; otherwise the rank is ``None``.
            log_path: When truthy (and this is the reporting process) it is
                forwarded to the logger setup with ``clear_log=True``.
        """
        self.data_parallel = data_parallel
        self.rank = dist.get_rank() if self.data_parallel else None

        # Only the serial process (rank None) or rank 0 logs and owns a
        # results folder.
        is_reporting_process = self.rank in (None, 0)
        if is_reporting_process:
            if log_path:
                self.__log_setup(log_path=log_path, clear_log=True)
            self.logger = ColorizedLogger('CnnRunner', 'green')

        # Hyper-parameters
        self.dataset = dataset
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.batch_size_train = batch_size_train
        self.batch_size_test = batch_size_test
        self.test_before_train = test_before_train

        # Global torch configuration; the seed is set before the model is built
        backends.cudnn.enabled = False
        torch.manual_seed(seed)

        # Training modules
        self.my_model = LeNet5(num_classes=10)
        self.loss_function = nn.CrossEntropyLoss()
        # NOTE(review): the device is only stored and logged; neither the
        # model nor the batches are moved onto it anywhere in this class.
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        if is_reporting_process:
            self.logger.info("Model parameters are configured.")
            self.logger.info(f"Device: {self.device}")
            self.logger.info(f"Model Architecture:\n{self.my_model}")
            # Folder where the results are going to be saved
            self.results_path = self.create_results_folder()

    @staticmethod
    def __log_setup(log_path: str, clear_log: bool = False):
        """Point the logger at ``<this file's dir>/../logs/<log_path>``."""
        sys_path = os.path.dirname(os.path.realpath(__file__))
        log_path = os.path.join(sys_path, '..', 'logs', log_path)
        ColorizedLogger.setup_logger(log_path=log_path, clear_log=clear_log)

    @staticmethod
    def create_results_folder():
        """Create and return the next ``run<N>`` folder for this run.

        Existing ``run<N>`` directories under
        ``<this file's dir>/../outputs/assignment4`` are scanned and the new
        folder gets the highest number plus one (``run0`` when none exist).

        Returns:
            Path of the (now existing) run-specific folder.
        """
        sys_path = os.path.dirname(os.path.realpath(__file__))
        output_base_path = os.path.join(sys_path, '..', 'outputs',
                                        'assignment4')

        # Collect the numbers of the previous runs
        run_numbers = []
        for candidate in glob(os.path.join(output_base_path, "run*")):
            if not os.path.isdir(candidate):
                continue
            try:
                run_numbers.append(int(os.path.basename(candidate)[3:]))
            except ValueError:
                # e.g. "run_backup": matches the glob but is not numbered
                continue
        next_run_num = max(run_numbers) + 1 if run_numbers else 0

        # Create the outputs folder for this run
        run_specific_path = os.path.join(output_base_path,
                                         f"run{next_run_num}")
        os.makedirs(run_specific_path, exist_ok=True)
        return run_specific_path

    def store_results(self, data: Union[Tuple, Dict], num_processes: int,
                      train: bool) -> None:
        """Save run metadata plus either training or test results.

        Args:
            data: For ``train=True`` a sequence whose items 0, 1 and 2 are
                the per-epoch accuracies, losses and times. Otherwise a
                dict mapping a label to a
                ``(test_loss, correct, total, percent_correct)`` sequence.
            num_processes: Recorded in the metadata.
            train: Selects which of the two layouts ``data`` has.
        """
        os.makedirs(self.results_path, exist_ok=True)

        # Run specific metadata, as numpy dict and as human-readable csv
        metadata = {
            "num_processes": num_processes,
            "epochs": self.epochs,
            "learning_rate": self.learning_rate,
            "momentum": self.momentum,
            "batch_size_train": self.batch_size_train,
            "batch_size_test": self.batch_size_test,
            "data_parallel": self.data_parallel
        }
        np.save(file=os.path.join(self.results_path, "metadata.npy"),
                arr=np.array(metadata))
        metadata_csv = np.array(
            [tuple(metadata.keys()), tuple(metadata.values())], dtype=str)
        np.savetxt(os.path.join(self.results_path, "metadata.csv"),
                   metadata_csv, fmt="%s", delimiter=",")

        if train:
            train_files = ("train_epoch_accuracies.npy",
                           "train_epoch_losses.npy",
                           "train_epoch_times.npy")
            for position, file_name in enumerate(train_files):
                np.save(file=os.path.join(self.results_path, file_name),
                        arr=np.array(data[position]))
            return

        for conf_key, subset in data.items():
            dict_to_save = {
                "test_loss": subset[0],
                "correct": subset[1],
                "total": subset[2],
                "percent_correct": subset[3]
            }
            np.save(file=os.path.join(self.results_path,
                                      f"test_results_{conf_key}.npy"),
                    arr=np.array(dict_to_save))

    def dataset_loader(self) -> Tuple[datasets.MNIST, datasets.MNIST]:
        """Download (if needed) and return the train and test datasets.

        Returns:
            ``(train_dataset, test_dataset)``.

        Raises:
            NotImplementedError: If the configured dataset name is not
                'mnist' (case-insensitive).
        """
        if self.dataset['name'].lower() != 'mnist':
            # NotImplemented is a constant, not an exception: calling it
            # would raise TypeError instead of the intended error.
            raise NotImplementedError("Dataset not yet supported!")

        transformation = transforms.Compose([
            transforms.Resize((32, 32)),
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
        mnist_train = datasets.MNIST(self.dataset['save_path'], train=True,
                                     download=True, transform=transformation)
        mnist_test = datasets.MNIST(self.dataset['save_path'], train=False,
                                    download=True, transform=transformation)

        if self.rank in (None, 0):
            self.logger.info(
                f"{self.dataset['name'].capitalize()} dataset loaded successfully."
            )
        return mnist_train, mnist_test

    def print_test_results(self, test_loss: float, correct: int, total: int,
                           percent_correct: float) -> None:
        """Log the loss, correct/total counts and accuracy of a test pass."""
        self.logger.info(f"Test Loss: {test_loss}", color="blue")
        self.logger.info(f"Correct/Total : {correct}/{total}", color="blue")
        self.logger.info(f"Accuracy: {100 * percent_correct:.2f}%",
                         color="blue")

    def _run_training_epoch(self, model, optimizer,
                            train_loader: DataLoader
                            ) -> Tuple[float, int, float]:
        """Run one pass over `train_loader`, updating `model` via `optimizer`.

        Returns:
            ``(mean loss over the mini-batches, number of correct
            predictions, elapsed time as reported by the timer)``.
        """
        timer = timeit(internal_only=True)
        loss_sum = 0.0
        correct = 0
        num_batches = 0
        with timer:
            for X, Y in train_loader:
                optimizer.zero_grad()
                pred = model(X)
                # Predicted class = index of the largest output per sample
                pred_val = pred.detach().argmax(dim=1)
                correct += (pred_val == Y).sum().item()
                loss = self.loss_function(pred, Y)
                loss_sum += loss.item()
                loss.backward()
                optimizer.step()
                num_batches += 1
        # An empty loader keeps the loss at 0.0 instead of dividing by zero
        return loss_sum / max(num_batches, 1), correct, timer.total

    def train_non_parallel(
            self, train_loader: DataLoader) -> Tuple[List, List, List]:
        """Train the model in a single process.

        Returns:
            ``(epoch_accuracies, epoch_losses, epoch_times)``, one entry per
            epoch in each list.
        """
        size_train_dataset = len(train_loader.dataset)
        epoch_losses = []
        epoch_accuracies = []
        epoch_times = []
        optimizer = optim.SGD(self.my_model.parameters(),
                              lr=self.learning_rate,
                              momentum=self.momentum)
        self.my_model.train()

        iter_epochs = tqdm(range(self.epochs), desc='Training Epochs')
        for _ in iter_epochs:
            epoch_loss, correct, epoch_time = self._run_training_epoch(
                self.my_model, optimizer, train_loader)
            epoch_accuracy = correct / size_train_dataset
            epoch_losses.append(epoch_loss)
            epoch_accuracies.append(epoch_accuracy)
            epoch_times.append(epoch_time)
            iter_epochs.set_postfix(epoch_accuracy=epoch_accuracy,
                                    epoch_loss=epoch_loss,
                                    epoch_time=epoch_time)

        return epoch_accuracies, epoch_losses, epoch_times

    def train_data_parallel(
            self, train_loader: DataLoader) -> Tuple[List, List, List]:
        """Train the model with DistributedDataParallel.

        The learning rate is multiplied by the world size. Accuracy is
        computed against this process' share of the dataset
        (dataset size / world size).

        Returns:
            ``(epoch_accuracies, epoch_losses, epoch_times)`` of this
            process, one entry per epoch in each list.
        """
        ddp_model = nn.parallel.DistributedDataParallel(self.my_model)
        world_size = dist.get_world_size()
        learning_rate = self.learning_rate * world_size
        # NOTE(review): unlike train_non_parallel, momentum is not passed to
        # the optimizer here although it is recorded in the metadata -
        # confirm whether that is intended.
        optimizer = optim.SGD(ddp_model.parameters(), lr=learning_rate)
        size_train_dataset = len(train_loader.dataset)
        epoch_losses = []
        epoch_accuracies = []
        epoch_times = []
        ddp_model.train()

        if self.rank == 0:
            iter_epochs = tqdm(range(self.epochs), desc='Training Epochs')
        else:
            iter_epochs = range(self.epochs)

        sampler = train_loader.sampler
        for epoch in iter_epochs:
            # Without set_epoch a DistributedSampler yields the same
            # ordering in every epoch.
            if hasattr(sampler, 'set_epoch'):
                sampler.set_epoch(epoch)
            # The forward pass has to go through the DDP wrapper, otherwise
            # gradients are not synchronized between the processes.
            epoch_loss, correct, epoch_time = self._run_training_epoch(
                ddp_model, optimizer, train_loader)
            epoch_accuracy = correct / (size_train_dataset / world_size)
            epoch_losses.append(epoch_loss)
            epoch_accuracies.append(epoch_accuracy)
            epoch_times.append(epoch_time)
            if self.rank == 0:
                iter_epochs.set_postfix(epoch_accuracy=epoch_accuracy,
                                        epoch_loss=epoch_loss,
                                        epoch_time=epoch_time)

        return epoch_accuracies, epoch_losses, epoch_times

    def test(self, test_loader: DataLoader) -> Tuple[float, int, int, float]:
        """Evaluate the model on `test_loader` without gradient tracking.

        Returns:
            ``(mean loss over the mini-batches, number of correct
            predictions, dataset size, accuracy)``.
        """
        self.my_model.eval()
        test_loss = 0.0
        correct = 0
        num_batches = 0
        with torch.no_grad():
            iter_mini_batches = tqdm(test_loader, desc='Testing', leave=False)
            for X, Y in iter_mini_batches:
                pred = self.my_model(X)
                test_loss += self.loss_function(pred, Y).item()
                pred_val = pred.argmax(dim=1)
                correct += (pred_val == Y).sum().item()
                num_batches += 1
                iter_mini_batches.set_postfix(test_loss_accum=test_loss)

        # Each accumulated term is already a per-batch mean (the default
        # CrossEntropyLoss reduction), so average over the batches - the
        # same normalization the training loop uses.
        test_loss /= max(num_batches, 1)
        size_test_dataset = len(test_loader.dataset)
        accuracy = correct / size_test_dataset

        return test_loss, correct, size_test_dataset, accuracy

    def run_non_parallel(self, train_loader: DataLoader, test_loader: DataLoader) \
            -> Tuple[Tuple, Dict]:
        """Optionally test, then train and test again in a single process.

        Returns:
            ``(train_results, test_results)`` where ``test_results`` has an
            "after" entry and, if ``test_before_train`` is set, a "before"
            entry.
        """
        test_results = {}
        # Test with randomly initialized parameters
        if self.test_before_train:
            test_results["before"] = self.test(test_loader)
            self.logger.info("Randomly Initialized params testing:",
                             color="blue")
            self.print_test_results(*test_results["before"])
        # Training
        train_results = self.train_non_parallel(train_loader)
        self.logger.info("Training Finished! Results:", color="magenta")
        # Testing
        test_results["after"] = self.test(test_loader)
        self.logger.info("Testing Finished! Storing results..", color="blue")
        self.print_test_results(*test_results["after"])

        return train_results, test_results

    def run_data_parallel(self, train_loader: DataLoader, test_loader: DataLoader) \
            -> Tuple[Tuple, Dict]:
        """Data-parallel counterpart of ``run_non_parallel``.

        Every process trains and tests; only rank 0 logs.

        Returns:
            ``(train_results, test_results)`` of this process.
        """
        is_root = self.rank == 0
        if is_root:
            self.logger.info(f"World size: {dist.get_world_size()}")
        test_results = {}
        # Test with randomly initialized parameters
        if self.test_before_train:
            test_results["before"] = self.test(test_loader)
            if is_root:
                self.logger.info("Randomly Initialized params testing:",
                                 color="blue")
                self.print_test_results(*test_results["before"])
        # Training
        train_results = self.train_data_parallel(train_loader)
        if is_root:
            self.logger.info("Training Finished! Results:", color="magenta")
        # Testing
        test_results["after"] = self.test(test_loader)
        if is_root:
            self.logger.info("Testing Finished! Storing results..",
                             color="blue")
            self.print_test_results(*test_results["after"])

        return train_results, test_results

    def run(self, num_processes: int) -> None:
        """Load the data, train and test the model and store the results.

        Args:
            num_processes: Number of processes requested for this run; it is
                logged and written to the stored metadata.
        """
        # Load the Dataset
        mnist_train, mnist_test = self.dataset_loader()

        # Create Train and Test loaders. With a sampler the loader itself
        # must not shuffle.
        if self.data_parallel:
            mode = "Data Parallel"
            train_sampler = DistributedSampler(mnist_train)
            shuffle = False
        else:
            mode = "Non-parallel"
            train_sampler = None
            shuffle = True
        if self.rank in (None, 0):
            self.logger.info(
                f"{mode} mode with {num_processes} proc(s) requested..")
        train_loader = torch.utils.data.DataLoader(
            mnist_train,
            batch_size=self.batch_size_train,
            shuffle=shuffle,
            sampler=train_sampler)
        test_loader = torch.utils.data.DataLoader(
            mnist_test, batch_size=self.batch_size_test, shuffle=True)

        # Train and Test
        if self.data_parallel:
            train_results, test_results = self.run_data_parallel(
                train_loader, test_loader)
        else:
            train_results, test_results = self.run_non_parallel(
                train_loader, test_loader)

        # Save Results (reporting process only)
        if self.rank in (None, 0):
            self.store_results(data=train_results,
                               num_processes=num_processes,
                               train=True)
            self.store_results(data=test_results,
                               num_processes=num_processes,
                               train=False)
from contextlib import ContextDecorator from typing import Callable, IO, Union from functools import wraps from time import time from playground import ColorizedLogger time_logger = ColorizedLogger('Timeit', 'white') class timeit(ContextDecorator): custom_print: str skip: bool total: Union[float, None] internal_only: bool file: IO def __init__(self, **kwargs): """Decorator/ContextManager for counting the execution times of functions and code blocks Args: custom_print: Custom print string Use {duration} to reference the running time. When used as decorator it can also be formatted using `func_name`, `args`, and {0}, {1}, .. to reference the function's first, second, ... argument. skip: If True, don't time this time. Suitable when inside loops file: Write the timing output to a file too """ self.total = None self.skip = False
class ProfilingPlay:
    """Collection of small workloads meant to be run under a profiler."""
    logger: ColorizedLogger

    def __init__(self, log_name: str):
        """Attach the logger to `log_name` (without clearing it) and announce start-up."""
        ColorizedLogger.setup_logger(log_path=log_name, clear_log=False)
        self.logger = ColorizedLogger('ProfilePlay', 'blue')
        self.logger.info("Initialized ProfilingPlay..")

    def hello(self):
        """Log a greeting."""
        self.logger.info("Hello World!")

    @staticmethod
    def load_boston_data():
        """Return the ``(data, target)`` pair of ``datasets.load_boston()``."""
        dataset = datasets.load_boston()
        return dataset.data, dataset.target

    @staticmethod
    def build_model():
        """Return an unfitted GradientBoostingRegressor with fixed hyper-parameters."""
        return GradientBoostingRegressor(n_estimators=500,
                                         max_depth=5,
                                         min_samples_split=5,
                                         learning_rate=0.01,
                                         loss='ls')

    @classmethod
    def load_data(cls):
        """Split the Boston data into train and validation parts.

        Returns:
            ``(x_train, x_valid, y_train, y_valid)`` using a 0.33 test size
            and a fixed random state of 42.
        """
        features, labels = cls.load_boston_data()
        return tuple(
            train_test_split(features, labels, test_size=0.33,
                             random_state=42))

    def boston_gbpm(self):
        """Fit the model on the train split and log the validation MSE."""
        x_train, x_valid, y_train, y_valid = self.load_data()
        regressor = self.build_model()
        regressor.fit(x_train, y_train)
        mse = mean_squared_error(y_valid, regressor.predict(x_valid))
        self.logger.info(f"The mean squared error (MSE) on test set: {mse:.4f}")

    @staticmethod
    def build_list():
        """Return the list of integers from 0 up to 999,999."""
        return list(range(1_000_000))

    @staticmethod
    def exponentiate(arry, power):
        """Return a list with every element of `arry` raised to `power`."""
        return [value ** power for value in arry]

    def run_exponentiate(self):
        """Build the big list and square it; the result is discarded."""
        self.exponentiate(self.build_list(), 2)

    @timeit(custom_print="Running run() for {args[1]!r} took {duration:2.5f} sec(s)")
    def run(self, func_to_run: str):
        """Run the workload selected by name.

        Args:
            func_to_run: One of 'hello_world', 'boston_gbpm' or
                'exponentiate'.

        Raises:
            NotImplementedError: For any other name.
        """
        self.logger.info(f"Starting function {func_to_run}")
        workloads = {
            'hello_world': self.hello,
            'boston_gbpm': self.boston_gbpm,
            'exponentiate': self.run_exponentiate,
        }
        if func_to_run not in workloads:
            raise NotImplementedError(f"Function {func_to_run} not yet implemented.")
        workloads[func_to_run]()
def __log_setup(log_path: str, clear_log: bool = False):
    """Configure the logger to write under the sibling ``logs`` folder.

    Args:
        log_path: Path joined onto ``<this file's dir>/../logs``.
        clear_log: Forwarded unchanged to ``ColorizedLogger.setup_logger``.
    """
    this_file = os.path.realpath(__file__)
    module_dir = os.path.dirname(this_file)
    resolved_log_path = os.path.join(module_dir, '..', 'logs', log_path)
    ColorizedLogger.setup_logger(log_path=resolved_log_path,
                                 clear_log=clear_log)
class NumbaPlay:
    """Small timing experiments around Numba-compiled functions.

    Every ``*_test`` method reads its inputs from ``self.conf``, logs them
    and times one call of the corresponding compiled function.
    """
    logger: ColorizedLogger
    conf: Dict

    def __init__(self, conf):
        """Create the logger and keep the configuration dict.

        Args:
            conf: Mapping read by the ``*_test`` methods (keys 'x', 'y',
                'nsamples', 'A', 'x1', 'x2', 'w', 'iterations').
        """
        self.logger = ColorizedLogger('NumbaPlay', 'blue')
        self.logger.info("Initialized NumbaPlay..")
        self.conf = conf

    @staticmethod
    @jit(nopython=True)
    def pythagorean_theorem(x: int, y: int):
        """Return sqrt(x**2 + y**2) (jit-compiled in nopython mode)."""
        return math.sqrt(x**2 + y**2)

    @staticmethod
    def pythagorus(x, y):
        """Return sqrt(x**2 + y**2) in plain Python, for comparison."""
        return math.sqrt(x**2 + y**2)

    def pythagorean_test(self):
        """Time the compiled and the plain hypotenuse computation once each."""
        x, y = self.conf['x'], self.conf['y']
        # The message used to print conf['x'] for both values
        self.logger.info(
            f"Starting pythagorean tests for x={x}, y={y}.."
        )
        # NOTE(review): a single timed call of a jitted function presumably
        # includes its compilation on first use - confirm this is intended.
        with timeit(
                custom_print='Numba pythagorean took {duration:.5f} sec(s)'):
            self.pythagorean_theorem(x, y)
        with timeit(
                custom_print='No Numba pythagorean took {duration:.5f} sec(s)'
        ):
            self.pythagorus(x, y)

    @staticmethod
    @njit
    def monte_carlo_pi(nsamples):
        """Estimate pi from `nsamples` random points in the unit square."""
        acc = 0
        for i in range(nsamples):
            x = random.random()
            y = random.random()
            # Count the points falling inside the quarter unit circle
            if (x**2 + y**2) < 1.0:
                acc += 1
        return 4.0 * acc / nsamples

    def monte_carlo_pi_test(self):
        """Time one call of the compiled pi estimation."""
        self.logger.info(
            f"Starting monte_carlo_pi tests for nsamples={self.conf['nsamples']}.."
        )
        with timeit(
                custom_print='Numba monte_carlo_pi took {duration:.5f} sec(s)'
        ):
            self.monte_carlo_pi(self.conf['nsamples'])

    @staticmethod
    @njit(parallel=True)
    def prange(A):
        """Return the sum of the elements of `A` using a prange loop."""
        s = 0
        # Without "parallel=True" in the jit-decorator
        # the prange statement is equivalent to range.
        # The name below does not refer to this method: class attributes
        # are not visible from a function body, so it resolves to the
        # module-level `prange` (presumably numba's - verify the imports).
        for i in prange(A.shape[0]):
            s += A[i]
        return s

    def prange_test(self):
        """Time one call of the prange sum over ``np.arange(conf['A'])``."""
        self.logger.info(f"Starting prange tests for A={self.conf['A']}..")
        with timeit(custom_print='Numba prange took {duration:.5f} sec(s)'):
            self.prange(np.arange(self.conf['A']))

    @staticmethod
    @jit(nopython=True, parallel=True)
    def logistic_regression(X, Y, w, iterations):
        """Apply `iterations` in-place update steps to `w` and return it."""
        for i in range(iterations):
            w -= np.dot(((1.0 / (1.0 + np.exp(-Y * np.dot(X, w))) - 1.0) * Y),
                        X)
        return w

    def logistic_regression_test(self):
        """Time one call of the compiled logistic regression.

        The inputs are a random ``(x1, x2)`` matrix, a random vector of
        length ``x1`` and a zero vector of length ``x2``.
        """
        self.logger.info(
            f"Starting logistic_regression tests for "
            f"X={self.conf['x1']}, Y={self.conf['x2']}, "
            f"w={self.conf['w']}, iterations={self.conf['iterations']}..")
        with timeit(custom_print=
                    'Numba logistic_regression took {duration:.5f} sec(s)'):
            self.logistic_regression(
                np.random.rand(self.conf['x1'], self.conf['x2']),
                np.random.rand(self.conf['x1']),
                np.zeros([self.conf['x2']]), self.conf['iterations'])
def __init__(self, log_name: str):
    """Attach the logger to `log_name` (without clearing it) and announce start-up.

    Args:
        log_name: Passed as ``log_path`` to ``ColorizedLogger.setup_logger``.
    """
    ColorizedLogger.setup_logger(log_path=log_name, clear_log=False)
    play_logger = ColorizedLogger('ProfilePlay', 'blue')
    self.logger = play_logger
    play_logger.info("Initialized ProfilingPlay..")
class KMeansRunner: logger: ColorizedLogger funcs: Dict outputs_file: IO features_iris: Union[np.ndarray, None] features_tcga: Union[np.ndarray, None] def __init__(self): self.funcs = { 'simple': KMeansRunner.run_simple, 'vectorized': KMeansRunner.run_vectorized, 'vectorized_jacob': KMeansRunner.run_vectorized_jacob } self.features_iris = None self.features_tcga = None self.logger = ColorizedLogger(f'KMeans', 'green') @staticmethod def _compute_distances_simple(num_points: int, num_features: int, num_clusters: int, centroids: np.ndarray, features: np.ndarray): # all pair-wise _squared_ distances centroid_distances = np.zeros((num_points, num_clusters)) for i in range(num_points): xi = features[i, :] for c in range(num_clusters): cc = centroids[c, :] dist = 0 for j in range(num_features): dist += (xi[j] - cc[j])**2 centroid_distances[i, c] = dist return centroid_distances @staticmethod def _expectation_step_simple(num_points: int, num_clusters: int, centroid_distances: np.ndarray, cluster_assignments: np.ndarray): num_changed_assignments = 0 for i in range(num_points): # pick closest cluster min_cluster = 0 min_distance = np.inf for c in range(num_clusters): if centroid_distances[i, c] < min_distance: min_cluster = c min_distance = centroid_distances[i, c] if cluster_assignments[i] != min_cluster: num_changed_assignments += 1 cluster_assignments[i] = min_cluster return cluster_assignments, num_changed_assignments @staticmethod def _maximization_step_simple(num_clusters: int, num_points: int, cluster_assignments: np.ndarray, features: np.ndarray, centroids: np.ndarray): for c in range(num_clusters): new_centroid = 0 cluster_size = 0 for i in range(num_points): if cluster_assignments[i] == c: new_centroid = new_centroid + features[i, :] cluster_size += 1 new_centroid = new_centroid / cluster_size centroids[c, :] = new_centroid return centroids @staticmethod def _loop_simple(num_clusters: int, num_points: int, num_features: int, cluster_assignments: np.ndarray, 
features: np.ndarray, centroids: np.ndarray): while True: # Compute distances from sample points to centroids centroid_distances = KMeansRunner._compute_distances_simple( num_points, num_features, num_clusters, centroids, features) # Expectation step: assign clusters cluster_assignments, \ num_changed_assignments = KMeansRunner._expectation_step_simple(num_points, num_clusters, centroid_distances, cluster_assignments) # Maximization step: Update centroid for each cluster centroids = KMeansRunner._maximization_step_simple( num_clusters, num_points, cluster_assignments, features, centroids) if num_changed_assignments == 0: break # return cluster centroids and assignments return centroids, cluster_assignments @staticmethod def run_simple(features: np.ndarray, num_clusters: int): """Run Simple K-Means algorithm to convergence. Args: features: numpy.ndarray: An N-by-d array describing N data points each of dimension d num_clusters: int: The number of clusters desired """ num_points = features.shape[0] # num sample points num_features = features.shape[1] # num features # INITIALIZATION PHASE # initialize centroids randomly as distinct elements of xs np.random.seed(0) centroid_ids = np.random.choice(num_points, (num_clusters, ), replace=False) centroids = features[centroid_ids, :] cluster_assignments = np.zeros(num_points, dtype=np.uint8) # loop until convergence centroids, cluster_assignments = \ KMeansRunner._loop_simple(num_clusters, num_points, num_features, cluster_assignments, features, centroids) # return cluster centroids and assignments return centroids, cluster_assignments @staticmethod def _compute_distances_vectorized_jacob(num_points: int, num_clusters: int, centroids: np.ndarray, features: np.ndarray): # all pair-wise _squared_ distances centroid_distances = np.zeros((num_points, num_clusters)) for i in range(num_points): xi = features[i, :] for c in range(num_clusters): cc = centroids[c, :] dist = np.sum((xi - cc)**2) centroid_distances[i, c] = dist return 
centroid_distances @staticmethod def _expectation_step_vectorized_jacob(num_points: int, num_clusters: int, centroid_distances: np.ndarray, cluster_assignments: np.ndarray): num_changed_assignments = 0 # claim: we can just do the following: # assignments = np.argmin(centroid_distances, axis=1) for i in range(num_points): # pick closest cluster cmin = 0 mindist = np.inf for c in range(num_clusters): if centroid_distances[i, c] < mindist: cmin = c mindist = centroid_distances[i, c] if cluster_assignments[i] != cmin: num_changed_assignments += 1 cluster_assignments[i] = cmin return centroid_distances, cluster_assignments, num_changed_assignments @staticmethod def _maximization_step_vectorized_jacob(num_clusters: int, num_points: int, cluster_assignments: np.ndarray, features: np.ndarray, centroids: np.ndarray): for c in range(num_clusters): new_centroid = 0 cluster_size = 0 for i in range(num_points): if cluster_assignments[i] == c: new_centroid = new_centroid + features[i, :] cluster_size += 1 new_centroid = new_centroid / cluster_size centroids[c, :] = new_centroid return centroids @staticmethod def _loop_vectorized_jacob(num_clusters: int, num_points: int, cluster_assignments: np.ndarray, features: np.ndarray, centroids: np.ndarray): loop_cnt = 0 while True: loop_cnt += 1 # Compute distances from sample points to centroids centroid_distances = KMeansRunner._compute_distances_vectorized_jacob( num_points, num_clusters, centroids, features) # Expectation step: assign clusters centroid_distances, cluster_assignments, num_changed_assignments = \ KMeansRunner._expectation_step_vectorized_jacob(num_points, num_clusters, centroid_distances, cluster_assignments) # Maximization step: Update centroid for each cluster centroids = KMeansRunner._maximization_step_vectorized_jacob( num_clusters, num_points, cluster_assignments, features, centroids) if num_changed_assignments == 0: break # return cluster centroids and assignments return centroids, cluster_assignments 
@staticmethod
def run_vectorized_jacob(features: np.ndarray, num_clusters: int):
    """Run k-means algorithm to convergence.

    Args:
        features: numpy.ndarray: An num_points-by-d array describing
            num_points data points each of dimension d
        num_clusters: int: The number of clusters desired

    Returns:
        The ``(centroids, cluster_assignments)`` pair produced by
        ``KMeansRunner._loop_vectorized_jacob``.
    """
    num_points = features.shape[0]  # num sample points

    # INITIALIZATION PHASE
    # initialize centroids randomly as distinct elements of features.
    # The global NumPy seed is fixed so every run picks the same start points.
    np.random.seed(0)
    centroids_ids = np.random.choice(num_points, (num_clusters,), replace=False)
    centroids = features[centroids_ids, :]  # fancy indexing -> a copy
    # np.intp instead of np.uint8: cluster ids above 255 must not overflow.
    cluster_assignments = np.zeros(num_points, dtype=np.intp)

    # loop until convergence
    centroids, cluster_assignments = KMeansRunner._loop_vectorized_jacob(
        num_clusters, num_points, cluster_assignments, features, centroids)

    # return cluster centroids and assignments
    return centroids, cluster_assignments

@staticmethod
def _compute_distances_vectorized(centroids: np.ndarray,
                                  features: np.ndarray) -> np.ndarray:
    """Return the (num_points, num_clusters) matrix of squared distances."""
    from scipy.spatial.distance import cdist
    # all pair-wise _squared_ distances; 'sqeuclidean' yields them directly
    # instead of taking a square root and squaring it again.
    return cdist(features, centroids, 'sqeuclidean')

@staticmethod
def _expectation_step_vectorized(
        centroid_distances: np.ndarray,
        cluster_assignments: np.ndarray) -> tuple:
    """Assign every point to its closest centroid.

    Returns:
        ``(new_assignments, previous_assignments)``; the second item is the
        ``cluster_assignments`` argument handed back untouched so the caller
        can compare both.
    """
    return np.argmin(centroid_distances, axis=1), cluster_assignments

@staticmethod
def _maximization_step_vectorized(num_clusters: int,
                                  cluster_assignments: np.ndarray,
                                  features: np.ndarray,
                                  centroids: np.ndarray) -> np.ndarray:
    """Move each centroid to the mean of the points assigned to it.

    ``centroids`` is updated in place and returned. A cluster that currently
    owns no points keeps its previous centroid: the mean of an empty slice is
    NaN, and ``np.argmin`` would then pick the NaN column for every point.
    """
    for cluster_ind in range(num_clusters):
        features_of_curr_cluster = features[cluster_assignments == cluster_ind]
        if features_of_curr_cluster.shape[0] == 0:
            continue
        centroids[cluster_ind, :] = np.mean(features_of_curr_cluster, axis=0)
    # TODO: use pandas to group by cluster -> mean ???
    return centroids

@staticmethod
def _break_condition_vectorized(cluster_assignments: np.ndarray,
                                previous_assignments: np.ndarray):
    """Return True when no point changed cluster since the previous pass."""
    return (cluster_assignments == previous_assignments).all()

@staticmethod
def _loop_vectorized(num_clusters: int, cluster_assignments: np.ndarray,
                     features: np.ndarray, centroids: np.ndarray):
    """Alternate expectation/maximization steps until assignments are stable.

    Returns:
        ``(centroids, cluster_assignments)`` at convergence.
    """
    while True:
        # Compute distances from sample points to centroids
        # all pair-wise _squared_ distances
        centroid_distances = KMeansRunner._compute_distances_vectorized(
            centroids, features)
        # Expectation step: assign clusters
        cluster_assignments, previous_assignments = \
            KMeansRunner._expectation_step_vectorized(centroid_distances,
                                                      cluster_assignments)
        # Maximization step: Update centroid for each cluster
        centroids = KMeansRunner._maximization_step_vectorized(
            num_clusters, cluster_assignments, features, centroids)
        # Break Condition
        if KMeansRunner._break_condition_vectorized(cluster_assignments,
                                                    previous_assignments):
            break
    # return cluster centroids and cluster_assignments
    return centroids, cluster_assignments

@staticmethod
def run_vectorized(features: np.ndarray, num_clusters: int):
    """Run k-means algorithm to convergence.

    This is the Lloyd's algorithm [2] which consists of alternating
    expectation and maximization steps.

    Args:
        features: numpy.ndarray: An num_points-by-d array describing
            num_points data points each of dimension d.
        num_clusters: int: The number of clusters desired.
    Returns:
        centroids: numpy.ndarray: A num_clusters-by-d array of cluster
            centroid positions.
        cluster_assignments: numpy.ndarray: An num_points-length vector of
            integers whose values from 0 to num_clusters-1 indicate which
            cluster each data element belongs to.

    [1] https://en.wikipedia.org/wiki/K-means_clustering
    [2] https://en.wikipedia.org/wiki/Lloyd%27s_algorithm
    """
    # INITIALIZATION PHASE
    # initialize centroids randomly as distinct elements of features
    num_points = features.shape[0]  # num sample points
    np.random.seed(0)  # fixed seed -> identical start points on every run
    centroid_ids = np.random.choice(num_points, (num_clusters,), replace=False)
    centroids = features[centroid_ids, :]  # fancy indexing -> a copy
    # Same dtype np.argmin returns, so the array never changes type mid-run.
    cluster_assignments = np.zeros(num_points, dtype=np.intp)

    # Loop until convergence
    centroids, cluster_assignments = KMeansRunner._loop_vectorized(
        num_clusters, cluster_assignments, features, centroids)

    # return cluster centroids and cluster_assignments
    return centroids, cluster_assignments

def _load_dataset(self, dataset_name: str, dataset: str):
    """Return the feature matrix for ``dataset``, loading it on first use.

    Args:
        dataset_name: Label used only in the log message.
        dataset: ``'iris'`` for the scikit-learn iris data; any other value
            is treated as the path of a CSV file.

    Returns:
        The cached feature array (``self.features_iris`` or
        ``self.features_tcga``).
    """
    if dataset == 'iris':
        if self.features_iris is None:
            from sklearn.datasets import load_iris
            self.features_iris, _ = load_iris(return_X_y=True)
            self.logger.info(
                f"Dataset {dataset_name} loaded. Shape: {self.features_iris.shape}."
            )
        return self.features_iris

    # NOTE(review): the CSV is cached under a single attribute, so a second
    # call with a different path still returns the first file's data.
    if self.features_tcga is None:
        import pandas as pd
        features_pd = pd.read_csv(dataset)
        # Drop the index column a pandas round-trip leaves behind; tolerate
        # files that were written without it.
        features_pd = features_pd.drop(columns='Unnamed: 0', errors='ignore')
        self.features_tcga = features_pd.to_numpy()
        self.logger.info(
            f"Dataset {dataset_name} loaded. Shape: {self.features_tcga.shape}."
        )
    return self.features_tcga

def run(self, run_type: str, num_clusters: int, dataset: str):
    """Profile one K-Means variant on a dataset and write the results to disk.

    Args:
        run_type: Key into ``self.funcs`` selecting the implementation.
        num_clusters: The number of clusters to find
        dataset: The name or path of the dataset

    Returns:
        None. The profile and the results go to files in the ``outputs``
        folder next to this package.

    Raises:
        KeyError: if ``run_type`` is not a key of ``self.funcs``.

    Info:
        features shape: (# points, # features)
        centroids shape: (# clusters, # features)
        centroid_distances shape: (# points, # clusters)
    """
    # Setup func to run and dataset to use
    run_func = self.funcs[run_type]
    dataset_name = 'tcga' if dataset != 'iris' else dataset

    # Prepare output folders and names
    sys_path = os.path.dirname(os.path.realpath(__file__))
    output_file_name = f'assignment3_{dataset_name}_{run_type}.txt'
    profiler_file_name = f'assignment3_{dataset_name}_{run_type}.o'
    output_base_path = os.path.join(sys_path, '..', 'outputs')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_base_path, exist_ok=True)
    profiler_file_path = os.path.join(output_base_path, profiler_file_name)
    output_file_path = os.path.join(output_base_path, output_file_name)

    # Open results output file. The handle is bound to self.outputs_file
    # (as before) and is closed when the block exits.
    with open(output_file_path, 'w') as self.outputs_file:
        self.outputs_file.write(
            f'K-Means {run_type} version for the {dataset_name} dataset '
            f'with {num_clusters} clusters .\n')
        # Load Dataset if not already loaded
        features = self._load_dataset(dataset_name, dataset)
        # Run Kmeans
        k_words = ['kmeans.py', 'ncalls']  # Include only pstats that contain these words
        custom_print = f'Profiling `{run_type}` K-Means for the `{dataset_name}` dataset: '
        with profileit(file=self.outputs_file,
                       profiler_output=profiler_file_path,
                       custom_print=custom_print,
                       keep_only_these=k_words):
            centroids, assignments = run_func(features=features,
                                              num_clusters=num_clusters)
        # Save results
        self.logger.info(f"Final Cluster Assignments: \n{assignments}")
        self.outputs_file.write('Assignments:\n')
        self.outputs_file.write(f'{assignments.tolist()}\n')
        self.outputs_file.write('Centroids:\n')
        self.outputs_file.write(f'{centroids.tolist()}')
import random import multiprocessing as mp import threading from concurrent import futures from typing import List import time from playground import ColorizedLogger logger = ColorizedLogger('ParallelBench', 'red') class BenchTests: def __init__(self, max_float, loops): random.seed(2) self.max_float = max_float self.loops = loops self.result = random.uniform(1.0, self.max_float) self.numbers_to_mult = [ random.uniform(1.0, max_float) for _ in range(loops) ] self.numbers_to_div = [ random.uniform(1.0, max_float) for _ in range(loops) ] self.first_list = self.numbers_to_mult self.second_list = self.numbers_to_div self.third_list = [] def math_calc(self): while len(self.numbers_to_mult) + len(self.numbers_to_div) > 0: try:
import os import logging from typing import Dict, List, Tuple, Union import json import _io from io import StringIO, TextIOWrapper import re import yaml from jsonschema import validate as validate_json_schema from playground import ColorizedLogger logger = ColorizedLogger('Config', 'white') class Configuration: __slots__ = ('config', 'config_path', 'config_keys', 'tag') config: Dict config_path: str tag: str config_keys: List env_variable_tag: str = '!ENV' env_variable_pattern: str = r'.*?\${(\w+)}.*?' # ${var} def __init__(self, config_src: Union[TextIOWrapper, StringIO, str], config_schema_path: str = 'yml_schema.json'): """ The basic constructor. Creates a new instance of the Configuration class. Args:
from contextlib import ContextDecorator from typing import Callable, IO, List from io import StringIO from functools import wraps import cProfile import pstats from playground import ColorizedLogger profile_logger = ColorizedLogger('Profileit', 'white') class profileit(ContextDecorator): custom_print: str profiler: cProfile.Profile stream: StringIO sort_by: str keep_only_these: List fraction: float skip: bool profiler_output: str file: IO def __init__(self, **kwargs): """Decorator/ContextManager for profiling functions and code blocks Args: custom_print: Custom print string. When used as decorator it can also be formatted using `func_name`, `args`, and {0}, {1}, .. to reference the function's first, second, ... argument. sort_by: pstats sorting column
import argparse import logging import os import sys import traceback from typing import Dict from playground import ColorizedLogger, timeit, Configuration, NumbaPlay from playground import run_math_calc_test, run_fill_and_empty_list_test logger = ColorizedLogger(logger_name='Main', color='yellow') def get_args() -> argparse.Namespace: """Setup the argument parser Returns: argparse.Namespace: """ parser = argparse.ArgumentParser( description='A playground repo for the DSE-512 course..', add_help=False) # Required Args required_args = parser.add_argument_group('Required Arguments') config_file_params = { 'type': argparse.FileType('r'), 'required': True, 'help': "The configuration yml file" } required_args.add_argument('-c', '--config-file', **config_file_params) # Optional args
class MPlayI:
    """Small mpi4py demonstrations: broadcast, scatter/gather and reductions.

    The methods use collective operations on ``MPI.COMM_WORLD`` (``Barrier``,
    ``Bcast``, ``scatter``, ``reduce`` ...), so each one has to be called by
    every rank of the communicator.
    """
    __slots__ = ('comm', 'rank', 'size')
    comm: MPI.COMM_WORLD
    rank: int
    size: int
    # Class-level logger shared by all instances.
    logger: ColorizedLogger = ColorizedLogger('MPI Play', 'cyan')
    colors: Dict = {
        0: 'blue',
        1: 'green',
        2: 'magenta',
        3: 'cyan',
        4: 'yellow',
        5: 'white',
        6: 'grey',
        7: 'black'
    }

    def __init__(self):
        """Set up file logging and cache the communicator, rank and size."""
        self._mpi_log_setup()
        self.comm = MPI.COMM_WORLD
        self.rank = self.comm.rank
        self.size = self.comm.size
        if self.rank == 0:
            self.logger.info(f"Starting with size: {self.size}")

    @staticmethod
    def _mpi_log_setup():
        """Point ColorizedLogger at ``logs/mpi.log`` two folders above this file."""
        sys_path = os.path.dirname(os.path.realpath(__file__))
        log_path = os.path.join(sys_path, '..', '..', 'logs', 'mpi.log')
        ColorizedLogger.setup_logger(log_path=log_path)

    @staticmethod
    def _chunk_list(seq, num):
        """Split ``seq`` into exactly ``num`` contiguous, near-equal slices.

        Boundaries are computed with integer arithmetic. The previous
        float-accumulating loop could return a number of chunks different
        from ``num`` (and an empty list for an empty ``seq``), which
        ``comm.scatter`` rejects because it needs one item per rank.

        Args:
            seq: Any sliceable sequence.
            num: Number of chunks to produce (an int).

        Returns:
            A list of ``num`` slices of ``seq``; some may be empty when
            ``len(seq) < num``.
        """
        total = len(seq)
        return [seq[(i * total) // num:((i + 1) * total) // num]
                for i in range(num)]

    @staticmethod
    def _chunk_for_scatterv(np_arr, size):
        """Compute per-rank row counts and start offsets for ``Scatterv``.

        The first ``np_arr.shape[0] % size`` ranks receive one extra row.

        Returns:
            ``(items_per_split, starting_index)`` as two integer arrays of
            length ``size``.
        """
        avg_items_per_split, remaining_items = divmod(np_arr.shape[0], size)
        items_per_split = np.array(
            [avg_items_per_split + 1 if p < remaining_items else avg_items_per_split
             for p in range(size)])
        # displacement: the starting index of each sub-task, i.e. the
        # exclusive prefix sum of the counts.
        starting_index = np.cumsum(items_per_split) - items_per_split
        return items_per_split, starting_index

    def simple(self):
        """Log a greeting from every rank, then synchronise."""
        self.logger.info(f"Hello from rank {self.rank} of size {self.size}")
        # Wait for everyone to sync up
        self.comm.Barrier()

    def broadcast(self):
        """Broadcast four random doubles with ``Bcast`` (default root)."""
        if self.rank == 0:
            x = np.random.randn(4) * 100
        else:
            # Uninitialised receive buffer; logged raw to show its content
            # before the broadcast overwrites it.
            x = np.empty(4, dtype=np.float64)
            self.logger.info(x)
        self.logger.info(f"Rank {self.rank} before broadcast has {x}")
        self.comm.Bcast([x, MPI.DOUBLE])
        self.logger.info(f"Rank {self.rank} after broadcast has {x}")

    def scatter_gather(self):
        """Scatter one integer per rank from rank 0, then gather them back."""
        data = list(range(self.size)) if self.rank == 0 else None
        data = self.comm.scatter(data, root=0)
        self.logger.info(f"Rank {self.rank} after scatter has {data}")
        self.comm.Barrier()
        if self.rank == 0:
            self.logger.info("****Gathering!****")
        data = self.comm.gather(data, root=0)
        self.logger.info(f"Rank {self.rank} after gather has {data}")

    def all_gather(self):
        """Scatter one integer per rank, then ``allgather`` them on every rank."""
        if self.rank == 0:
            data = list(range(self.size))
            self.logger.info(f"Rank {self.rank} scattering {data}")
        else:
            data = None
        data = self.comm.scatter(data, root=0)
        self.logger.info(f"Rank {self.rank} after scatter has {data}")
        self.comm.Barrier()
        if self.rank == 0:
            self.logger.info("****Gathering!****")
        # Note that we no longer specify the root here!
        data = self.comm.allgather(data)
        self.logger.info(f"Rank {self.rank} after gather has {data}")

    def mpi_reduce(self):
        """Scatter 1..size, then ``reduce`` the values onto rank 0."""
        if self.rank == 0:
            data = list(range(1, self.size + 1))
            self.logger.info(f"Rank {self.rank} scattering {data}")
        else:
            data = None
        data = self.comm.scatter(data, root=0)
        self.logger.info(f"Rank {self.rank} after scatter has {data}")
        self.comm.Barrier()
        if self.rank == 0:
            self.logger.info("****Reduce!****")
        data = self.comm.reduce(data, root=0)
        self.logger.info(f"Rank {self.rank} after reduce has {data}")

    def mpi_all_reduce(self):
        """Scatter 1..size, then ``allreduce`` the values on every rank."""
        if self.rank == 0:
            data = list(range(1, self.size + 1))
            self.logger.info(f"Rank {self.rank} scattering {data}")
        else:
            data = None
        data = self.comm.scatter(data, root=0)
        self.logger.info(f"Rank {self.rank} after scatter has {data}")
        self.comm.Barrier()
        if self.rank == 0:
            self.logger.info("****Reduce!****")
        # Similar to allgather, we do not specify a root process!
        data = self.comm.allreduce(data)
        self.logger.info(f"Rank {self.rank} after reduce has {data}")

    def count_lines(self):
        """Count the total lines of files in specified folder"""
        if self.rank == 0:
            from glob import glob
            files_path = os.path.join('data', 'mpi_count_lines', '*.txt')
            # One (possibly empty) list of paths per rank.
            files = self._chunk_list(glob(files_path), self.size)
        else:
            files = None
        files = self.comm.scatter(files, root=0)
        self.logger.info(f"Rank {self.rank} has to count lines for these files: {files}")
        lines_cnt = 0
        for file in files:
            with open(file, 'r') as f:
                lines_cnt += sum(1 for _ in f)
        self.logger.info(f"Rank {self.rank} counted {lines_cnt} lines in total")
        self.comm.Barrier()
        total_lines_cnt = self.comm.reduce(lines_cnt, root=0)
        if self.rank == 0:
            self.logger.info(f"After reduce, counted {total_lines_cnt} lines from all ranks.")

    def reduce_complex_old(self):
        """Earlier Scatterv/Reduce experiment, kept next to ``reduce_complex``.

        NOTE(review): the logic is carried over unchanged. The count arrays
        built with ``np.append`` are floating point and ``res`` is ``None``
        on non-root ranks - confirm this still runs before relying on it.
        """
        if self.rank == 0:
            data = np.array([[x * 1.1, x * 1.1 + 2, x * 1.1 + 4, x * 1.1 + 6]
                             for x in range(1, (self.size + 1) * 2)])
            # Create output array of same size
            res = np.zeros_like(data[0])
            # Split input array by the number of available cores
            data_ch = np.array_split(data, self.size, axis=0)
            chunk_sizes = []
            for i in range(0, len(data_ch), 1):
                chunk_sizes = np.append(chunk_sizes, len(data_ch[i]))
            chunk_sizes_input = chunk_sizes * data.shape[1]
            displacements_input = np.insert(np.cumsum(chunk_sizes_input), 0, 0)[0:-1]
            chunk_sizes_output = chunk_sizes * data.shape[1]
            displacements_output = np.insert(np.cumsum(chunk_sizes_output), 0, 0)[0:-1]
            self.logger.info(f"Rank {self.rank} scattering {data_ch}")
            self.logger.info(f"Expected result format: {res}")
        else:
            # Create variables on other cores
            chunk_sizes_input = None
            displacements_input = None
            chunk_sizes_output = None
            displacements_output = None
            data_ch = None
            data = None
            res = None
        data_ch = self.comm.bcast(data_ch, root=0)  # Broadcast split array to other cores
        chunk_sizes_output = self.comm.bcast(chunk_sizes_output, root=0)
        displacements_output = self.comm.bcast(displacements_output, root=0)
        # Create array to receive subset of data on each core, where rank specifies the core
        output_chunk = np.zeros(np.shape(data_ch[self.rank]))
        self.comm.Scatterv([data, chunk_sizes_input, displacements_input, MPI.DOUBLE],
                           output_chunk, root=0)
        self.logger.info(
            f"Rank {self.rank} after scatter has (shape: {output_chunk.shape}):\n{output_chunk}")
        # Create output array on each core
        output = np.zeros([len(output_chunk), output_chunk.shape[1]])
        for i in range(0, np.shape(output_chunk)[0], 1):
            output[i, 0:output_chunk.shape[1]] = output_chunk[i]
        self.comm.Barrier()
        if self.rank == 0:
            self.logger.info("****Reduce!****")
        # self.comm.Gatherv(output, [res, chunk_sizes_output, displacements_output, MPI.DOUBLE],
        #                   root=0)  # Gather output data together
        output = np.mean(output, axis=0)
        self.logger.info(f"Rank {self.rank} output (shape: {output.shape}):\n{output}")
        self.comm.Reduce(
            output,
            [res, chunk_sizes_output, displacements_output, MPI.DOUBLE],
            op=MPI.SUM,
            root=0
        )
        self.logger.info(f"Rank {self.rank} after reduce has {res}")

    def reduce_complex(self):
        """Scatter rows and cluster labels, then average cluster 1 on rank 0.

        Rank 0 builds a ``(2 * size + 1, 4)`` float matrix and one cluster
        label per row, scatters both with ``Scatterv``, every rank sums its
        rows labelled ``1``, and the partial sums and counts are reduced on
        rank 0 to obtain the mean of cluster 1.
        """
        if self.rank == 0:
            data = np.array([[x * 1., (x + 2) * 1., (x + 4) * 1., (x + 6) * 1.]
                             for x in range(1, (self.size + 1) * 2)])
            # The 11-label pattern only matches the row count when running on
            # 5 ranks, so it is tiled/truncated to one label per row. With 5
            # ranks this is exactly the original array.
            label_pattern = np.array([0, 1, 1, 3, 1, 1, 2, 4, 1, 3, 3], dtype=np.int64)
            cluster_assignments = np.resize(label_pattern, data.shape[0])
            num_features = data.shape[1]
            items_per_split_orig, starting_index_orig = \
                self._chunk_for_scatterv(data, self.size)
            # Counts/offsets in elements (rows * features) for the flat buffer.
            items_per_split = items_per_split_orig * num_features
            starting_index = starting_index_orig * num_features
            self.logger.info(f"Data ({data.shape}): {data[:1]}, ..")
            data = data.flatten()
            self.logger.info(f"Data Flat "
                             f"({data.shape}, {data.dtype}):{data[:6]}, ..")
            self.logger.info(f"Assignments({cluster_assignments.shape}, {cluster_assignments.dtype}): "
                             f"{cluster_assignments}")
            self.logger.info(f"Items per split Original: {items_per_split_orig}")
            self.logger.info(f"Items per split: {items_per_split}")
            self.logger.info(f"Starting Index Original: {starting_index_orig}")
            self.logger.info(f"Starting Index: {starting_index}")
            self.logger.info(f"Num Features: {num_features}")
        else:
            data = None
            cluster_assignments = None
            num_features = None
            # initialize items_per_split, and starting_index on worker processes.
            # `int` replaces the `np.int` alias, which NumPy 1.24 removed.
            items_per_split = np.zeros(self.size, dtype=int)
            items_per_split_orig = np.zeros(self.size, dtype=int)
            starting_index = None
            starting_index_orig = None
        # Broadcast the number of items per split
        self.comm.Bcast(items_per_split, root=0)
        self.comm.Bcast(items_per_split_orig, root=0)
        num_features = self.comm.bcast(num_features, root=0)
        # Scatter cluster assignments
        cluster_assignments_chunked = np.zeros(items_per_split_orig[self.rank], dtype=np.int64)
        self.logger.info(f"Initialized chunked assignments ({cluster_assignments_chunked.shape}): "
                         f"{cluster_assignments_chunked}")
        self.comm.Scatterv([cluster_assignments, items_per_split_orig,
                            starting_index_orig, MPI.INT64_T],
                           cluster_assignments_chunked, root=0)
        self.logger.info(f"Received cluster_assignments_chunked "
                         f"({cluster_assignments_chunked.shape}): {cluster_assignments_chunked}")
        # Scatter data points-features
        data_chunked_flat = np.zeros(items_per_split[self.rank])
        self.comm.Scatterv([data, items_per_split, starting_index, MPI.DOUBLE],
                           data_chunked_flat, root=0)
        data_chunked = data_chunked_flat.reshape(-1, num_features)
        self.logger.info(f"Received data_chunked ({data_chunked.shape}):\n{data_chunked}")
        # Reduce and find average for cluster 1
        if self.rank == 0:
            self.logger.info("****Reduce!****")
        # Find avg for cluster 1 only
        data_chunked_clust_1 = data_chunked[cluster_assignments_chunked == 1]
        self.logger.info(f"Data for cluster 1 (shape: {data_chunked_clust_1.shape}:\n "
                         f"{data_chunked_clust_1}")
        # Find sum of each cluster
        size_cluster_1_chunked = data_chunked_clust_1.shape[0]
        if size_cluster_1_chunked > 0:
            sum_cluster_1_chunked = np.sum(data_chunked_clust_1, axis=0)
        else:
            # No local rows in cluster 1: contribute a zero vector to the sum.
            sum_cluster_1_chunked = np.zeros_like(data_chunked[0])
        self.logger.info(f"Sum cluster 1 (shape: {sum_cluster_1_chunked.shape}: "
                         f"{sum_cluster_1_chunked}")
        # Reduce the internal sums to find total sum
        sum_cluster_1 = np.zeros_like(sum_cluster_1_chunked)
        self.comm.Reduce([sum_cluster_1_chunked, MPI.DOUBLE],
                         [sum_cluster_1, MPI.DOUBLE],
                         op=MPI.SUM, root=0)
        self.logger.info(f"Chunked size: {size_cluster_1_chunked}")
        total_size = self.comm.reduce(size_cluster_1_chunked, op=MPI.SUM, root=0)
        if self.rank == 0:
            self.logger.info(f"Total size: {total_size}. Summed sums: {sum_cluster_1}")
            avg_cluster_1 = sum_cluster_1 / total_size
            self.logger.info(f"Average Cluster 1: {avg_cluster_1}")
def _mpi_log_setup():
    """Send ColorizedLogger output to ``logs/mpi.log`` two folders above this file."""
    module_dir = os.path.dirname(os.path.realpath(__file__))
    logs_dir = os.path.join(module_dir, '..', '..', 'logs')
    ColorizedLogger.setup_logger(log_path=os.path.join(logs_dir, 'mpi.log'))
import logging import traceback import os import sys from typing import Dict from playground.main import get_args from playground import ColorizedLogger, Configuration, timeit # Create loggers with different colors to use in each problem main_logger = ColorizedLogger('Main', 'yellow') def prepare_for_run(name: str, conf: Dict): conf_props = conf['properties'] num_clusters = conf_props['num_clusters'] dataset = conf_props['dataset'] python_file_name = 'kmeans.py' dataset_name = 'tcga' if dataset != 'iris' else dataset main_logger.info( f"Invoking {python_file_name}({name}) " f"for {num_clusters} clusters and the {dataset_name} dataset") return python_file_name, num_clusters, dataset, dataset_name def run_serial(name: str, conf: Dict, log_name: str) -> None: """ Runs the KMeans ser9ap version for the specified configuration. """ # Extract the properties python_file_name, num_clusters, dataset, dataset_name = prepare_for_run( name, conf)
def __init__(self, conf):
    """Keep the given configuration and announce start-up on a blue logger.

    Args:
        conf: Configuration object stored as-is on ``self.conf``.
    """
    numba_logger = ColorizedLogger('NumbaPlay', 'blue')
    self.logger = numba_logger
    numba_logger.info("Initialized NumbaPlay..")
    self.conf = conf
import logging import traceback import os from typing import Dict from math import ceil from itertools import repeat, takewhile import multiprocessing import numpy as np from playground.main import get_args from playground import ColorizedLogger, Configuration, timeit # Create loggers with different colors to use in each problem main_logger = ColorizedLogger('Main', 'yellow') p1_logger = ColorizedLogger('Problem1', 'blue') p2_logger = ColorizedLogger('Problem2', 'green') p3_logger = ColorizedLogger('Problem3', 'magenta') extra_ch_logger = ColorizedLogger('ExtraMain', 'yellow') extra_sub_ch_logger = ColorizedLogger('ExtraSub', 'cyan') # Global Vars (For the Extra Challenges) # lock: multiprocessing.Lock # multi_list: List = [] def my_pid(x: int) -> None: """ Problem 1 function to be called using pool.map Parameters: x: the id of the worker
class KMeansRunner:
    """K-Means on the iris dataset, with optional MPI bookkeeping.

    When constructed with ``mpi=True`` the instance records the
    ``MPI.COMM_WORLD`` communicator, its rank and its size, and uses a
    per-rank coloured logger; otherwise only a serial logger is created.
    """
    __slots__ = ('comm', 'rank', 'size', 'logger', 'mpi_enabled')
    comm: MPI.COMM_WORLD
    rank: int
    size: int
    logger: ColorizedLogger
    colors: Dict = {
        0: 'blue',
        1: 'green',
        2: 'magenta',
        3: 'cyan',
        4: 'yellow',
        5: 'white',
        6: 'grey',
        7: 'black'
    }

    def __init__(self, mpi):
        """Set up logging and, when ``mpi`` is truthy, the MPI attributes.

        Args:
            mpi: Whether to attach to ``MPI.COMM_WORLD``.
        """
        self._kmeans_log_setup()
        self.mpi_enabled = mpi
        if self.mpi_enabled:
            self.comm = MPI.COMM_WORLD
            self.rank = self.comm.rank
            self.size = self.comm.size
            # Wrap around the palette so ranks beyond the last colour key
            # do not raise KeyError.
            rank_color = self.colors[self.rank % len(self.colors)]
            self.logger = ColorizedLogger('Kmeans %s' % self.rank, rank_color)
        else:
            self.logger = ColorizedLogger('Kmeans Serial', self.colors[0])

    @staticmethod
    def _kmeans_log_setup():
        """Point ColorizedLogger at ``logs/kmeans.log`` two folders above this file."""
        sys_path = os.path.dirname(os.path.realpath(__file__))
        log_path = os.path.join(sys_path, '..', '..', 'logs', 'kmeans.log')
        ColorizedLogger.setup_logger(log_path=log_path)

    @staticmethod
    def _chunk_list(seq, num):
        """Split ``seq`` into exactly ``num`` contiguous, near-equal slices.

        Integer boundaries replace the previous float-accumulating loop,
        which could return a number of chunks different from ``num`` (and an
        empty list for an empty ``seq``).

        Args:
            seq: Any sliceable sequence.
            num: Number of chunks to produce (an int).

        Returns:
            A list of ``num`` slices of ``seq``; some may be empty when
            ``len(seq) < num``.
        """
        total = len(seq)
        return [seq[(i * total) // num:((i + 1) * total) // num]
                for i in range(num)]

    @staticmethod
    def _run_vectorized(features: np.ndarray, num_clusters: int):
        """Run k-means algorithm to convergence.

        This is the Lloyd's algorithm [2] which consists of alternating
        expectation and maximization steps.

        Args:
            features: numpy.ndarray: An num_points-by-d array describing
                num_points data points each of dimension d.
            num_clusters: int: The number of clusters desired.
        Returns:
            centroids: numpy.ndarray: A num_clusters-by-d array of cluster
                centroid positions.
            cluster_assignments: numpy.ndarray: An num_points-length vector
                of integers whose values from 0 to num_clusters-1 indicate
                which cluster each data element belongs to.

        [1] https://en.wikipedia.org/wiki/K-means_clustering
        [2] https://en.wikipedia.org/wiki/Lloyd%27s_algorithm
        """
        num_points = features.shape[0]  # num sample points

        # INITIALIZATION PHASE
        # initialize centroids randomly as distinct elements of features
        np.random.seed(0)  # fixed seed -> identical start points on every run
        centroid_ids = np.random.choice(num_points, (num_clusters,), replace=False)
        centroids = features[centroid_ids, :]  # fancy indexing -> a copy
        # Same dtype np.argmin returns below.
        cluster_assignments = np.zeros(num_points, dtype=np.intp)

        # Loop until convergence
        while True:
            # Compute distances from sample points to centroids
            # all pair-wise _squared_ distances (broadcast to points x clusters x d)
            centroid_distances = np.square(features[:, np.newaxis] - centroids).sum(axis=2)

            # Expectation step: assign clusters
            previous_assignments = cluster_assignments
            cluster_assignments = np.argmin(centroid_distances, axis=1)

            # Maximization step: Update centroid for each cluster
            for cluster_ind in range(num_clusters):
                features_of_curr_cluster = features[cluster_assignments == cluster_ind]
                if features_of_curr_cluster.shape[0] == 0:
                    # Empty cluster: its mean would be NaN, and np.argmin
                    # would then pick that NaN column for every point.
                    # Keep the previous centroid instead.
                    continue
                centroids[cluster_ind, :] = np.mean(features_of_curr_cluster, axis=0)
            # TODO: use pandas to group by cluster -> mean ???

            # Break Condition
            if (cluster_assignments == previous_assignments).all():
                break

        # return cluster centroids and cluster_assignments
        return centroids, cluster_assignments

    @staticmethod
    def _run_simple(features: np.ndarray, num_clusters: int):
        """Run k-means algorithm to convergence.

        Pure-Python loop implementation, kept explicit (no vectorization).

        Args:
            features: numpy.ndarray: An N-by-d array describing N data points
                each of dimension d
            num_clusters: int: The number of clusters desired

        Returns:
            ``(centroids, assignments)``: the num_clusters-by-d centroid
            array and the length-N vector of cluster ids.
        """
        num_points = features.shape[0]  # num sample points
        num_dims = features.shape[1]  # dimension of space

        # INITIALIZATION PHASE
        # initialize centroids randomly as distinct elements of features
        np.random.seed(0)  # fixed seed -> identical start points on every run
        cids = np.random.choice(num_points, (num_clusters,), replace=False)
        centroids = features[cids, :]  # fancy indexing -> a copy
        # np.intp instead of np.uint8: cluster ids above 255 must not overflow.
        assignments = np.zeros(num_points, dtype=np.intp)

        # loop until convergence
        while True:
            # Compute distances from sample points to centroids
            # all pair-wise _squared_ distances
            cdists = np.zeros((num_points, num_clusters))
            for i in range(num_points):
                xi = features[i, :]
                for c in range(num_clusters):
                    cc = centroids[c, :]
                    dist = 0
                    for j in range(num_dims):
                        dist += (xi[j] - cc[j]) ** 2
                    cdists[i, c] = dist

            # Expectation step: assign clusters
            num_changed_assignments = 0
            for i in range(num_points):
                # pick closest cluster (the first one wins on ties)
                cmin = 0
                mindist = np.inf
                for c in range(num_clusters):
                    if cdists[i, c] < mindist:
                        cmin = c
                        mindist = cdists[i, c]
                if assignments[i] != cmin:
                    num_changed_assignments += 1
                assignments[i] = cmin

            # Maximization step: Update centroid for each cluster
            for c in range(num_clusters):
                newcent = 0
                clustersize = 0
                for i in range(num_points):
                    if assignments[i] == c:
                        newcent = newcent + features[i, :]
                        clustersize += 1
                if clustersize == 0:
                    # Empty cluster: the original `0 / 0` raised
                    # ZeroDivisionError; keep the previous centroid instead.
                    continue
                centroids[c, :] = newcent / clustersize

            if num_changed_assignments == 0:
                break

        # return cluster centroids and assignments
        return centroids, assignments

    def run_serial(self, num_clusters: int, type_run: str):
        """Cluster the iris dataset serially and log the result.

        Args:
            num_clusters: The number of clusters to find.
            type_run: ``'simple'`` or ``'vectorized'``.

        Raises:
            ValueError: if ``type_run`` is not one of the two known names
                (still an ``Exception``, so existing handlers keep working).
        """
        runners = {
            'simple': self._run_simple,
            'vectorized': self._run_vectorized,
        }
        # Reject an unknown run type before spending time loading data.
        if type_run not in runners:
            raise ValueError(f'Argument {type_run} not recognized!')

        from sklearn.datasets import load_iris
        features, _ = load_iris(return_X_y=True)

        # run k-means
        centroids, assignments = runners[type_run](features=features,
                                                   num_clusters=num_clusters)

        # print out results
        self.logger.info(
            f"\nCentroids: {centroids}\nAssignments: {assignments}")