예제 #1
0
def main():
    r"""Entry point of assignment4.py: hand every configuration section to run().

    Example:
        python assignment4/assignment4.py \
            -c ../confs/assignment4.yml \
            -l ../logs/assignment4.log
    """

    # Parse the command line and configure logging before the first message
    cli_args = get_args()
    ColorizedLogger.setup_logger(cli_args.log, cli_args.debug, True)
    main_logger.info("Starting Assignment 4")

    # Read the configuration file named on the command line
    conf = Configuration(config_src=cli_args.config_file)

    # One run() call per top-level key of the configuration
    for run_type in conf.config_keys:
        section_config = conf.get_config(config_name=run_type)
        run(run_type=run_type,
            config=section_config,
            tag=conf.tag,
            log_name=cli_args.log,
            local=cli_args.local)

    main_logger.info("Assignment 4 Finished")
예제 #2
0
 def __init__(self):
     """Register the available run modes, reset the feature arrays and create the logger."""
     # Run-mode name -> KMeansRunner function implementing it
     self.funcs = dict(
         simple=KMeansRunner.run_simple,
         vectorized=KMeansRunner.run_vectorized,
         vectorized_jacob=KMeansRunner.run_vectorized_jacob,
     )
     # No features are loaded at construction time
     self.features_iris = self.features_tcga = None
     self.logger = ColorizedLogger('KMeans', 'green')
예제 #3
0
 def __init__(self, mpi):
     """Set up logging and, when `mpi` is truthy, the MPI communicator state.

     Args:
         mpi: Whether to run with MPI. When falsy, a single logger named
             'Kmeans Serial' is created and no communicator attributes are set.
     """
     self._kmeans_log_setup()
     self.mpi_enabled = mpi

     # Serial run: one logger, first color
     if not self.mpi_enabled:
         self.logger = ColorizedLogger('Kmeans Serial', self.colors[0])
         return

     # MPI run: remember the communicator and name/color the logger by rank
     self.comm = MPI.COMM_WORLD
     self.rank = self.comm.rank
     self.size = self.comm.size
     logger_name = 'Kmeans %s' % self.rank
     logger_color = self.colors[self.rank]
     self.logger = ColorizedLogger(logger_name, logger_color)
예제 #4
0
    def __init__(self,
                 dataset: Dict,
                 epochs: int,
                 batch_size_train: int,
                 batch_size_test: int,
                 learning_rate: float,
                 test_before_train: bool,
                 momentum: float = 0,
                 seed: int = 1,
                 data_parallel: bool = False,
                 log_path: str = None):
        """Store the hyper-parameters, seed torch and build the model and loss.

        The logger and the results folder are only created on the main
        process, i.e. when the rank is None (not data-parallel) or 0.

        Args:
            dataset: Dataset description dictionary.
            epochs: Number of training epochs.
            batch_size_train: Mini-batch size for training.
            batch_size_test: Mini-batch size for testing.
            learning_rate: Learning rate stored for the optimizer.
            test_before_train: Stored flag; not used inside this method.
            momentum: Momentum stored for the optimizer.
            seed: Value passed to torch.manual_seed.
            data_parallel: When True the rank is read with dist.get_rank().
            log_path: Optional log path; when truthy, logging is (re)configured
                with clear_log=True on the main process.
        """
        # Rank is only meaningful in data-parallel mode
        self.data_parallel = data_parallel
        self.rank = dist.get_rank() if self.data_parallel else None
        is_main_process = self.rank in (None, 0)

        if is_main_process:
            if log_path:
                self.__log_setup(log_path=log_path, clear_log=True)
            self.logger = ColorizedLogger('CnnRunner', 'green')

        # Plain hyper-parameter bookkeeping
        self.dataset = dataset
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.batch_size_train = batch_size_train
        self.batch_size_test = batch_size_test
        self.test_before_train = test_before_train

        # Torch-wide settings: cuDNN switched off, RNG seeded
        backends.cudnn.enabled = False
        torch.manual_seed(seed)

        # Model, loss and device
        self.my_model = LeNet5(num_classes=10)
        # self.my_model = VolModel(num_classes=10)
        self.loss_function = nn.CrossEntropyLoss()
        device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

        if is_main_process:
            self.logger.info("Model parameters are configured.")
            self.logger.info(f"Device: {self.device}")
            self.logger.info(f"Model Architecture:\n{self.my_model}")
            # Folder that will receive this run's results
            self.results_path = self.create_results_folder()
예제 #5
0
def main():
    """Entry point of main.py: run the selected entries of each known config section.

    Example:
        python playground/main.py -m run_mode_1 -c confs/template_conf.yml -l logs/output.log
    """

    # Initializing
    cli_args = get_args()
    log_path = os.path.abspath(cli_args.log)
    ColorizedLogger.setup_logger(log_path, cli_args.debug, clear_log=True)
    # Load the configuration
    conf = Configuration(config_src=cli_args.config_file)

    def selected(section):
        """Lazily yield the sub-configs of `section` that should run.

        Yields nothing when the section is missing. An entry is selected when
        it is enabled and either its type is 'required' or the configuration
        tag is not 'required_only'.
        """
        if section not in conf.config_keys:
            return
        for sub_config in conf.get_config(config_name=section):
            conf_type = sub_config['type']
            conf_enabled = sub_config['enabled']
            tag = conf.tag
            if (conf_type == 'required' or tag != 'required_only') and conf_enabled:
                yield sub_config

    # Start: sections are processed in this fixed order
    for sub_config in selected('bench'):
        run_bench(sub_config)
    for sub_config in selected('mpi'):
        run_mpi(sub_config)
    for sub_config in selected('kmeans'):
        run_kmeans(sub_config)
    for sub_config in selected('cprofile'):
        # The profiling runner additionally receives the absolute log path
        run_cprofile(sub_config, log_path=log_path)
    for sub_config in selected('numba'):
        run_numba(sub_config)
예제 #6
0
def main():
    r"""Entry point of assignment.py: run the configured problems of Assignment 1.

    Example:
        python assignment1/assignment.py \
            -c ../confs/assignment1.yml \
            -l ../logs/assignment.log
    """

    # Initialize
    cli_args = get_args()
    ColorizedLogger.setup_logger(cli_args.log, cli_args.debug)
    main_logger.info("Starting Assignment 1")
    # Load the configuration
    conf = Configuration(config_src=cli_args.config_file)

    def selected(section):
        """Lazily yield the entries of `section` that should run.

        Yields nothing when the section is missing. An entry runs when its
        type is not 'disabled' and either the type is 'required' or the
        configuration tag is not 'required_only'.
        """
        if section not in conf.config_keys:
            return
        for bench_conf in conf.get_config(config_name=section):
            conf_type = bench_conf['type']
            tag = conf.tag
            wanted_for_tag = conf_type == 'required' or tag != 'required_only'
            if wanted_for_tag and conf_type != 'disabled':
                yield bench_conf

    # Required problems, in order
    main_logger.info(f"{' Required Problems ':-^{100}}")
    for bench_conf in selected('problem1'):
        problem1(bench_conf)
    for bench_conf in selected('problem2'):
        problem2(bench_conf)
    for bench_conf in selected('problem3'):
        problem3(bench_conf)

    # Extra challenges go through the same selection rule
    main_logger.info(f"{' Optional Problems ':-^{100}}")
    for bench_conf in selected('extra_challenges'):
        extra_challenges(bench_conf)

    main_logger.info("Assignment 1 Finished")
예제 #7
0
def main():
    r"""Entry point for Assignment 2: run every selected entry of every config section.

    Sections whose key contains 'distributed' go to run_distributed(); all
    other sections go to run_serial().

    Example (as given in the original docstring, which still names the
    assignment1 paths -- TODO confirm the right ones for this script):
        python assignment1/assignment.py \
            -c ../confs/assignment1.yml \
            -l ../logs/assignment.log
    """

    # Initialize
    cli_args = get_args()
    ColorizedLogger.setup_logger(cli_args.log, cli_args.debug, True)
    main_logger.info("Starting Assignment 2")
    # Load the configuration
    conf = Configuration(config_src=cli_args.config_file)

    # Walk every section and every entry inside it
    for config_key in conf.config_keys:
        is_distributed = 'distributed' in config_key
        for bench_conf in conf.get_config(config_name=config_key):
            conf_type = bench_conf['type']
            conf_enabled = bench_conf['enabled']
            tag = conf.tag
            # Run enabled entries that are required, or any enabled entry
            # unless the tag restricts the run to required ones
            if not ((conf_type == 'required' or tag != 'required_only')
                    and conf_enabled):
                continue
            if is_distributed:
                run_distributed(name=config_key,
                                conf=bench_conf,
                                log_name=cli_args.log,
                                local=cli_args.local)
            else:
                run_serial(name=config_key,
                           conf=bench_conf,
                           log_name=cli_args.log)

    main_logger.info("Assignment 2 Finished")
예제 #8
0
class CnnRunner:
    """Train and test a LeNet5 classifier, serially or with DistributedDataParallel.

    Only the main process (rank 0 in data-parallel mode, or the single process
    of a serial run, whose rank is None) creates `logger` and `results_path`;
    on every other rank those attributes are never assigned.
    """
    logger: ColorizedLogger
    outputs_file: IO
    dataset: Dict
    epochs: int
    learning_rate: float
    momentum: float
    batch_size_train: int
    batch_size_test: int
    test_before_train: bool
    results_path: str
    data_parallel: bool
    rank: Union[int, None]

    def __init__(self,
                 dataset: Dict,
                 epochs: int,
                 batch_size_train: int,
                 batch_size_test: int,
                 learning_rate: float,
                 test_before_train: bool,
                 momentum: float = 0,
                 seed: int = 1,
                 data_parallel: bool = False,
                 log_path: str = None):
        """Store the hyper-parameters, seed torch and build the model and loss.

        Args:
            dataset: Dataset description; `dataset_loader` reads its 'name'
                and 'save_path' keys.
            epochs: Number of training epochs.
            batch_size_train: Mini-batch size of the training loader.
            batch_size_test: Mini-batch size of the test loader.
            learning_rate: SGD learning rate (multiplied by the world size in
                data-parallel training).
            test_before_train: If True, the model is also tested before training.
            momentum: SGD momentum used by the non-parallel training.
            seed: Value passed to torch.manual_seed.
            data_parallel: If True, the process rank is read with dist.get_rank().
            log_path: Optional log file name; when truthy the main process
                (re)configures logging with clear_log=True.
        """
        # Set the object variables
        self.data_parallel = data_parallel
        if self.data_parallel:
            self.rank = dist.get_rank()
        else:
            self.rank = None
        if self.rank in (None, 0):
            if log_path:
                self.__log_setup(log_path=log_path, clear_log=True)
            self.logger = ColorizedLogger('CnnRunner', 'green')
        self.dataset = dataset
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.batch_size_train = batch_size_train
        self.batch_size_test = batch_size_test
        self.test_before_train = test_before_train

        # Configure torch variables
        backends.cudnn.enabled = False
        torch.manual_seed(seed)
        # Create the training modules
        self.my_model = LeNet5(num_classes=10)
        # self.my_model = VolModel(num_classes=10)

        self.loss_function = nn.CrossEntropyLoss()
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        if self.rank in (None, 0):
            self.logger.info("Model parameters are configured.")
            self.logger.info(f"Device: {self.device}")
            self.logger.info(f"Model Architecture:\n{self.my_model}")
            # Create folder where the results are going to be saved
            self.results_path = self.create_results_folder()

    @staticmethod
    def __log_setup(log_path: str, clear_log: bool = False):
        """Configure ColorizedLogger to write to ../logs/<log_path> next to this file."""
        sys_path = os.path.dirname(os.path.realpath(__file__))
        log_path = os.path.join(sys_path, '..', 'logs', log_path)
        ColorizedLogger.setup_logger(log_path=log_path, clear_log=clear_log)

    @staticmethod
    def create_results_folder():
        """Create and return the next `run<N>` folder under ../outputs/assignment4.

        N is one more than the highest existing run number, or 0 when no
        `run*` directory exists yet.

        Returns:
            Path of the folder created for this run.
        """
        # Create Base Assignment folder
        sys_path = os.path.dirname(os.path.realpath(__file__))
        output_base_path = os.path.join(sys_path, '..', 'outputs',
                                        'assignment4')
        # Numbers of the runs that already exist ("run12" -> 12)
        previous_runs = [
            int(os.path.basename(d)[3:])
            for d in glob(os.path.join(output_base_path, "run*"))
            if os.path.isdir(d)
        ]
        next_run_num = max(previous_runs) + 1 if previous_runs else 0
        # Create outputs folder for this run
        run_specific_path = os.path.join(output_base_path,
                                         f"run{next_run_num}")
        os.makedirs(run_specific_path, exist_ok=True)
        return run_specific_path

    def store_results(self, data: Union[Tuple, Dict], num_processes: int,
                      train: bool) -> None:
        """Write the run metadata and either the training or the test results.

        Args:
            data: For `train=True`, a sequence of (accuracies, losses, times);
                otherwise a dict mapping a label to a
                (test_loss, correct, total, percent_correct) sequence.
            num_processes: Number of processes, recorded in the metadata.
            train: Selects which of the two result layouts `data` holds.
        """
        os.makedirs(self.results_path, exist_ok=True)

        # Create Run Specific Metadata file
        metadata = {
            "num_processes": num_processes,
            "epochs": self.epochs,
            "learning_rate": self.learning_rate,
            "momentum": self.momentum,
            "batch_size_train": self.batch_size_train,
            "batch_size_test": self.batch_size_test,
            "data_parallel": self.data_parallel
        }
        # Save metadata as numpy dict and as human-readable csv.
        # np.array(dict) is a 0-d object array, so loading the .npy file
        # back requires allow_pickle=True.
        np.save(file=os.path.join(self.results_path, "metadata.npy"),
                arr=np.array(metadata))
        metadata_csv = np.array(
            [tuple(metadata.keys()),
             tuple(metadata.values())], dtype=str)
        np.savetxt(os.path.join(self.results_path, "metadata.csv"),
                   metadata_csv,
                   fmt="%s",
                   delimiter=",")

        if train:
            np.save(file=os.path.join(self.results_path,
                                      "train_epoch_accuracies.npy"),
                    arr=np.array(data[0]))
            np.save(file=os.path.join(self.results_path,
                                      "train_epoch_losses.npy"),
                    arr=np.array(data[1]))
            np.save(file=os.path.join(self.results_path,
                                      "train_epoch_times.npy"),
                    arr=np.array(data[2]))
        else:
            for conf_key in data:
                subset = data[conf_key]
                dict_to_save = {
                    "test_loss": subset[0],
                    "correct": subset[1],
                    "total": subset[2],
                    "percent_correct": subset[3]
                }
                np.save(file=os.path.join(self.results_path,
                                          f"test_results_{conf_key}.npy"),
                        arr=np.array(dict_to_save))

    def dataset_loader(self) -> Tuple[datasets.MNIST, datasets.MNIST]:
        """Load (downloading if needed) the train and test splits of the dataset.

        Returns:
            The (train, test) MNIST datasets, resized to 32x32 and normalized.

        Raises:
            NotImplementedError: If the dataset name is anything but 'mnist'
                (case-insensitive).
        """
        if self.dataset['name'].lower() == 'mnist':
            transformation = transforms \
                .Compose([transforms.Resize((32, 32)),
                          transforms.ToTensor(),
                          transforms.Normalize((0.1307,), (0.3081,))
                          ])
            mnist_train = datasets.MNIST(self.dataset['save_path'],
                                         train=True,
                                         download=True,
                                         transform=transformation)
            mnist_test = datasets.MNIST(self.dataset['save_path'],
                                        train=False,
                                        download=True,
                                        transform=transformation)
        else:
            # NotImplemented is a comparison sentinel and is not callable;
            # the exception type is NotImplementedError.
            raise NotImplementedError("Dataset not yet supported!")
        if self.rank in (None, 0):
            self.logger.info(
                f"{self.dataset['name'].capitalize()} dataset loaded successfully."
            )

        return mnist_train, mnist_test

    def print_test_results(self, test_loss: float, correct: int, total: int,
                           percent_correct: float) -> None:
        """Log the loss, the correct/total counts and the accuracy of one test pass.

        Uses `self.logger`, so it must only be called on the main process.
        """
        self.logger.info(f"Test Loss: {test_loss}", color="blue")
        self.logger.info(f"Correct/Total : {correct}/{total}", color="blue")
        self.logger.info(f"Accuracy: {100 * percent_correct:.2f}%",
                         color="blue")

    def _train_epochs(self, model, optimizer, train_loader: DataLoader,
                      samples_per_epoch: float,
                      show_progress: bool) -> Tuple[List, List, List]:
        """Run `self.epochs` epochs of SGD training shared by both training modes.

        Args:
            model: Module used for the forward pass (the bare model, or its
                DistributedDataParallel wrapper).
            optimizer: Optimizer stepping the parameters of `model`.
            train_loader: Loader yielding (inputs, labels) mini-batches.
            samples_per_epoch: Denominator used for the epoch accuracy.
            show_progress: Whether to display a tqdm bar over the epochs.

        Returns:
            Per-epoch (accuracies, losses, times).
        """
        epoch_accuracies, epoch_losses, epoch_times = [], [], []
        # A DistributedSampler reuses the same shuffle every epoch unless it
        # is told which epoch is running.
        sampler = getattr(train_loader, 'sampler', None)
        reshuffle = isinstance(sampler, DistributedSampler)

        model.train()
        iter_epochs = range(self.epochs)
        if show_progress:
            iter_epochs = tqdm(iter_epochs, desc='Training Epochs')
        for epoch in iter_epochs:
            if reshuffle:
                sampler.set_epoch(epoch)
            timeit_ = timeit(internal_only=True)
            epoch_loss = 0.0
            correct = 0
            num_mini_batches = 0
            with timeit_:
                for num_mini_batches, (X, Y) in enumerate(train_loader):
                    optimizer.zero_grad()
                    pred = model(X)
                    # Index of the highest score per sample = predicted class
                    pred_val = torch.flatten(pred.data.max(1, keepdim=True)[1])
                    correct += (pred_val == Y).sum().item()
                    loss = self.loss_function(pred, Y)
                    epoch_loss += loss.item()
                    loss.backward()
                    optimizer.step()

            # enumerate() leaves the last zero-based batch index behind,
            # so the number of batches is that index plus one
            epoch_loss /= (num_mini_batches + 1)
            epoch_losses.append(epoch_loss)
            epoch_accuracy = correct / samples_per_epoch
            epoch_accuracies.append(epoch_accuracy)
            epoch_time = timeit_.total
            epoch_times.append(epoch_time)
            if show_progress:
                iter_epochs.set_postfix(epoch_accuracy=epoch_accuracy,
                                        epoch_loss=epoch_loss,
                                        epoch_time=epoch_time)

        return epoch_accuracies, epoch_losses, epoch_times

    def train_non_parallel(
            self, train_loader: DataLoader) -> Tuple[List, List, List]:
        """Train the model in a single process with SGD (learning rate + momentum).

        Returns:
            Per-epoch (accuracies, losses, times).
        """
        optimizer = optim.SGD(self.my_model.parameters(),
                              lr=self.learning_rate,
                              momentum=self.momentum)
        return self._train_epochs(
            model=self.my_model,
            optimizer=optimizer,
            train_loader=train_loader,
            samples_per_epoch=len(train_loader.dataset),
            show_progress=True)

    def train_data_parallel(
            self, train_loader: DataLoader) -> Tuple[List, List, List]:
        """Train the model with DistributedDataParallel.

        The learning rate is multiplied by the world size and the accuracy is
        computed against this process' share of the dataset. Only rank 0
        shows a progress bar.

        Returns:
            Per-epoch (accuracies, losses, times) of this process.
        """
        my_model = nn.parallel.DistributedDataParallel(self.my_model)
        world_size = dist.get_world_size()
        # NOTE(review): unlike train_non_parallel, self.momentum is not passed
        # to the optimizer here -- confirm this is intended.
        optimizer = optim.SGD(my_model.parameters(),
                              lr=self.learning_rate * world_size)
        # The forward pass must go through the DDP wrapper (not the bare
        # self.my_model), otherwise gradients are not synchronized between
        # processes during backward().
        return self._train_epochs(
            model=my_model,
            optimizer=optimizer,
            train_loader=train_loader,
            samples_per_epoch=len(train_loader.dataset) / world_size,
            show_progress=self.rank == 0)

    def test(self, test_loader: DataLoader) -> Tuple[float, int, int, float]:
        """Evaluate the model on `test_loader` without tracking gradients.

        Returns:
            (test_loss, correct, dataset_size, accuracy).
        """
        self.my_model.eval()
        test_loss = 0.0
        correct = 0
        with torch.no_grad():
            iter_mini_batches = tqdm(enumerate(test_loader),
                                     desc='Testing',
                                     leave=False)
            for _, (X, Y) in iter_mini_batches:
                pred = self.my_model(X)
                test_loss += self.loss_function(pred, Y).item()
                pred_val = torch.flatten(pred.data.max(1, keepdim=True)[1])
                correct += (pred_val == Y).sum().item()
                iter_mini_batches.set_postfix(test_loss_accum=test_loss)
        size_test_dataset = len(test_loader.dataset)
        # NOTE(review): the loss is created with default arguments (per-batch
        # mean, per the PyTorch docs), yet the accumulated value is divided by
        # the dataset size rather than the number of batches as in training.
        # Kept as-is so stored results stay comparable -- confirm intent.
        test_loss /= size_test_dataset
        accuracy = correct / size_test_dataset

        return test_loss, correct, size_test_dataset, accuracy

    def run_non_parallel(self, train_loader: DataLoader, test_loader: DataLoader) \
            -> Tuple[Tuple, Dict]:
        """Optionally test, then train and test again in a single process.

        Returns:
            (train_results, test_results); test_results has an "after" entry
            and, when `test_before_train` is set, a "before" entry.
        """
        test_results = {}
        # Test with randomly initialize parameters
        if self.test_before_train:
            test_results["before"] = self.test(test_loader)
            self.logger.info("Randomly Initialized params testing:",
                             color="blue")
            self.print_test_results(*test_results["before"])

        # Training
        train_results = self.train_non_parallel(train_loader)
        self.logger.info("Training Finished! Results:", color="magenta")

        # Testing
        test_results["after"] = self.test(test_loader)
        self.logger.info("Testing Finished! Storing results..", color="blue")
        self.print_test_results(*test_results["after"])

        return train_results, test_results

    def run_data_parallel(self, train_loader: DataLoader, test_loader: DataLoader) \
            -> Tuple[Tuple, Dict]:
        """Data-parallel counterpart of `run_non_parallel`; only rank 0 logs.

        Returns:
            (train_results, test_results) of this process.
        """
        is_root = self.rank == 0
        if is_root:
            self.logger.info(f"World size: {dist.get_world_size()}")

        test_results = {}
        # Test with randomly initialize parameters
        if self.test_before_train:
            test_results["before"] = self.test(test_loader)
            if is_root:
                self.logger.info("Randomly Initialized params testing:",
                                 color="blue")
                self.print_test_results(*test_results["before"])

        # Training
        train_results = self.train_data_parallel(train_loader)
        if is_root:
            self.logger.info("Training Finished! Results:", color="magenta")

        # Testing
        test_results["after"] = self.test(test_loader)
        if is_root:
            self.logger.info("Testing Finished! Storing results..",
                             color="blue")
            self.print_test_results(*test_results["after"])

        return train_results, test_results

    def run(self, num_processes: int) -> None:
        """Load the data, train and test the model, and store the results.

        Args:
            num_processes: Number of processes of this run; it is logged and
                written to the metadata.
        """
        # Load the Dataset
        mnist_train, mnist_test = self.dataset_loader()
        # Create Train and Test loaders
        if self.data_parallel:
            mode = "Data Parallel"
            # The sampler handles the ordering, so the loader itself must not shuffle
            train_sampler = DistributedSampler(mnist_train)
            shuffle = False
        else:
            mode = "Non-parallel"
            train_sampler = None
            shuffle = True
        if self.rank in (None, 0):
            self.logger.info(
                f"{mode} mode with {num_processes} proc(s) requested..")
        train_loader = torch.utils.data.DataLoader(
            mnist_train,
            batch_size=self.batch_size_train,
            shuffle=shuffle,
            sampler=train_sampler)
        test_loader = torch.utils.data.DataLoader(
            mnist_test, batch_size=self.batch_size_test, shuffle=True)
        # Train and Test
        if self.data_parallel:
            train_results, test_results = self.run_data_parallel(
                train_loader, test_loader)
        else:
            train_results, test_results = self.run_non_parallel(
                train_loader, test_loader)
        # Save Results (main process only: it is the one owning results_path)
        if self.rank in (None, 0):
            self.store_results(data=train_results,
                               num_processes=num_processes,
                               train=True)
            self.store_results(data=test_results,
                               num_processes=num_processes,
                               train=False)
예제 #9
0
from contextlib import ContextDecorator
from typing import Callable, IO, Union
from functools import wraps
from time import time

from playground import ColorizedLogger

# Module-level logger created with the name 'Timeit'; the second argument
# ('white') is presumably its color, as in the other ColorizedLogger calls.
time_logger = ColorizedLogger('Timeit', 'white')


class timeit(ContextDecorator):
    # Attributes declared for the timer; the initializer below only assigns
    # `total` and `skip`.
    custom_print: str
    skip: bool
    total: Union[float, None]
    internal_only: bool
    file: IO

    def __init__(self, **kwargs):
        """Create a timer that can be used as a decorator or as a context manager.

        Keyword Args (as documented for this class; the initializer shown
        here accepts them but does not read them):
            custom_print: Custom print string. Use {duration} to reference the
                          running time. When used as decorator it can also be
                          formatted using `func_name`, `args`, and {0}, {1}, ..
                          to reference the function's first, second, ... argument.
            skip: If True, don't time this time. Suitable when inside loops
            file: Write the timing output to a file too
        """

        # Nothing measured yet, and timing is not skipped by default
        self.total, self.skip = None, False
예제 #10
0
class ProfilingPlay:
    """A few small workloads that can be started by name through `run()`."""
    logger: ColorizedLogger

    def __init__(self, log_name: str):
        """Configure logging to `log_name` (without clearing it) and create the logger."""
        ColorizedLogger.setup_logger(clear_log=False, log_path=log_name)
        self.logger = ColorizedLogger('ProfilePlay', 'blue')
        self.logger.info("Initialized ProfilingPlay..")

    def hello(self):
        """Log a greeting."""
        self.logger.info("Hello World!")

    @staticmethod
    def load_boston_data():
        """Return the (data, target) pair of the Boston dataset."""
        bunch = datasets.load_boston()
        features, target = bunch.data, bunch.target
        return features, target

    @staticmethod
    def build_model():
        """Return an unfitted GradientBoostingRegressor with fixed hyper-parameters."""
        return GradientBoostingRegressor(n_estimators=500,
                                         max_depth=5,
                                         min_samples_split=5,
                                         learning_rate=0.01,
                                         loss='ls')

    @classmethod
    def load_data(cls):
        """Split the Boston data into train/validation parts (33% validation, seed 42).

        Returns:
            (x_train, x_valid, y_train, y_valid)
        """
        features, target = cls.load_boston_data()
        split = train_test_split(features,
                                 target,
                                 test_size=0.33,
                                 random_state=42)
        x_train, x_valid, y_train, y_valid = split
        return x_train, x_valid, y_train, y_valid

    def boston_gbpm(self):
        """Fit the gradient boosting model and log its MSE on the validation split."""
        x_train, x_valid, y_train, y_valid = self.load_data()

        regressor = self.build_model()
        regressor.fit(x_train, y_train)

        mse = mean_squared_error(y_valid, regressor.predict(x_valid))
        self.logger.info(f"The mean squared error (MSE) on test set: {mse:.4f}")

    @staticmethod
    def build_list():
        """Return the list of integers from 0 up to 999,999."""
        return [value for value in range(1_000_000)]

    @staticmethod
    def exponentiate(arry, power):
        """Return a new list with every element of `arry` raised to `power`."""
        return [base ** power for base in arry]

    def run_exponentiate(self):
        """Build the integer list and square every element; the result is discarded."""
        self.exponentiate(self.build_list(), 2)

    @timeit(custom_print="Running run() for {args[1]!r} took {duration:2.5f} sec(s)")
    def run(self, func_to_run: str):
        """Log and run the workload called `func_to_run`.

        Args:
            func_to_run: One of 'hello_world', 'boston_gbpm' or 'exponentiate'.

        Raises:
            NotImplementedError: For any other name.
        """
        self.logger.info(f"Starting function {func_to_run}")
        if func_to_run == 'hello_world':
            self.hello()
            return
        if func_to_run == 'boston_gbpm':
            self.boston_gbpm()
            return
        if func_to_run == 'exponentiate':
            self.run_exponentiate()
            return
        raise NotImplementedError(f"Function {func_to_run} not yet implemented.")
예제 #11
0
 def __log_setup(log_path: str, clear_log: bool = False):
     """Point ColorizedLogger at ../logs/<log_path>, relative to this file's folder.

     Args:
         log_path: Log file name (or sub-path) inside the logs folder.
         clear_log: Forwarded unchanged to ColorizedLogger.setup_logger.
     """
     module_dir = os.path.dirname(os.path.realpath(__file__))
     ColorizedLogger.setup_logger(
         log_path=os.path.join(module_dir, '..', 'logs', log_path),
         clear_log=clear_log)
예제 #12
0
class NumbaPlay:
    """Small Numba experiments.

    Every `*_test` method logs the inputs it reads from `conf` and times one
    call of the matching compiled static method with `timeit`.
    """
    logger: ColorizedLogger
    conf: Dict

    def __init__(self, conf):
        """Create the logger and keep `conf`, the dict the tests read their inputs from."""
        self.logger = ColorizedLogger('NumbaPlay', 'blue')
        self.logger.info("Initialized NumbaPlay..")
        self.conf = conf

    @staticmethod
    @jit(nopython=True)
    def pythagorean_theorem(x: int, y: int):
        """Return sqrt(x^2 + y^2); jit-compiled in nopython mode."""
        return math.sqrt(x**2 + y**2)

    @staticmethod
    def pythagorus(x, y):
        """Return sqrt(x^2 + y^2); plain Python counterpart of `pythagorean_theorem`."""
        return math.sqrt(x**2 + y**2)

    def pythagorean_test(self):
        """Time the jit-compiled and the plain hypotenuse functions on conf['x'], conf['y']."""
        # The y value is read from conf['y'] (the message used to print
        # conf['x'] twice).
        self.logger.info(
            f"Starting pythagorean tests for x={self.conf['x']}, y={self.conf['y']}.."
        )
        with timeit(
                custom_print='Numba pythagorean took {duration:.5f} sec(s)'):
            self.pythagorean_theorem(self.conf['x'], self.conf['y'])
        with timeit(
                custom_print='No Numba pythagorean took {duration:.5f} sec(s)'
        ):
            self.pythagorus(self.conf['x'], self.conf['y'])

    @staticmethod
    @njit
    def monte_carlo_pi(nsamples):
        """Estimate pi from `nsamples` random points in the unit square.

        Counts the points with x^2 + y^2 < 1 and returns four times their
        fraction.
        """
        acc = 0
        for i in range(nsamples):
            x = random.random()
            y = random.random()
            if (x**2 + y**2) < 1.0:
                acc += 1
        return 4.0 * acc / nsamples

    def monte_carlo_pi_test(self):
        """Time `monte_carlo_pi` for conf['nsamples'] samples."""
        self.logger.info(
            f"Starting monte_carlo_pi tests for nsamples={self.conf['nsamples']}.."
        )
        with timeit(
                custom_print='Numba monte_carlo_pi took {duration:.5f} sec(s)'
        ):
            self.monte_carlo_pi(self.conf['nsamples'])

    @staticmethod
    @njit(parallel=True)
    def prange(A):
        """Return the sum of the elements of `A` along its first axis."""
        s = 0
        # Without "parallel=True" in the jit-decorator
        # the prange statement is equivalent to range.
        # Inside the function body `prange` resolves to the module-level name
        # (presumably numba's prange), not to this method.
        for i in prange(A.shape[0]):
            s += A[i]
        return s

    def prange_test(self):
        """Time `prange` on np.arange(conf['A'])."""
        self.logger.info(f"Starting prange tests for A={self.conf['A']}..")
        with timeit(custom_print='Numba prange took {duration:.5f} sec(s)'):
            self.prange(np.arange(self.conf['A']))

    @staticmethod
    @jit(nopython=True, parallel=True)
    def logistic_regression(X, Y, w, iterations):
        """Apply `iterations` update steps to the weights `w` and return them.

        `w` is updated in place (`w -= ...`), so the caller's array changes too.
        """
        for i in range(iterations):
            w -= np.dot(((1.0 / (1.0 + np.exp(-Y * np.dot(X, w))) - 1.0) * Y),
                        X)
        return w

    def logistic_regression_test(self):
        """Time `logistic_regression` on random inputs sized by the configuration.

        X is a random (conf['x1'], conf['x2']) matrix, Y a random vector of
        length conf['x1'] and w a zero vector of length conf['x2'].
        """
        self.logger.info(
            f"Starting logistic_regression tests for "
            f"X={self.conf['x1']}, Y={self.conf['x2']}, "
            f"w={self.conf['w']}, iterations={self.conf['iterations']}..")
        with timeit(custom_print=
                    'Numba logistic_regression took {duration:.5f} sec(s)'):
            self.logistic_regression(
                np.random.rand(self.conf['x1'], self.conf['x2']),
                np.random.rand(self.conf['x1']), np.zeros([self.conf['x2']]),
                self.conf['iterations'])
예제 #13
0
 def __init__(self, log_name: str):
     """Configure logging to `log_name` without clearing it, then create the logger.

     Args:
         log_name: Path handed to ColorizedLogger.setup_logger as `log_path`.
     """
     ColorizedLogger.setup_logger(clear_log=False, log_path=log_name)
     self.logger = ColorizedLogger('ProfilePlay', 'blue')
     self.logger.info("Initialized ProfilingPlay..")
예제 #14
0
class KMeansRunner:
    logger: ColorizedLogger
    funcs: Dict
    outputs_file: IO
    features_iris: Union[np.ndarray, None]
    features_tcga: Union[np.ndarray, None]

    def __init__(self):
        """Create the logger, empty dataset caches and the run-type dispatch table."""
        self.logger = ColorizedLogger('KMeans', 'green')
        # Datasets are loaded lazily by _load_dataset and cached here
        self.features_iris = None
        self.features_tcga = None
        # Maps the `run_type` string accepted by run() to its implementation
        self.funcs = dict(
            simple=KMeansRunner.run_simple,
            vectorized=KMeansRunner.run_vectorized,
            vectorized_jacob=KMeansRunner.run_vectorized_jacob)

    @staticmethod
    def _compute_distances_simple(num_points: int, num_features: int,
                                  num_clusters: int, centroids: np.ndarray,
                                  features: np.ndarray):
        # all  pair-wise _squared_ distances
        centroid_distances = np.zeros((num_points, num_clusters))
        for i in range(num_points):
            xi = features[i, :]
            for c in range(num_clusters):
                cc = centroids[c, :]
                dist = 0
                for j in range(num_features):
                    dist += (xi[j] - cc[j])**2
                centroid_distances[i, c] = dist

        return centroid_distances

    @staticmethod
    def _expectation_step_simple(num_points: int, num_clusters: int,
                                 centroid_distances: np.ndarray,
                                 cluster_assignments: np.ndarray):
        num_changed_assignments = 0
        for i in range(num_points):
            # pick closest cluster
            min_cluster = 0
            min_distance = np.inf
            for c in range(num_clusters):
                if centroid_distances[i, c] < min_distance:
                    min_cluster = c
                    min_distance = centroid_distances[i, c]
            if cluster_assignments[i] != min_cluster:
                num_changed_assignments += 1
            cluster_assignments[i] = min_cluster

        return cluster_assignments, num_changed_assignments

    @staticmethod
    def _maximization_step_simple(num_clusters: int, num_points: int,
                                  cluster_assignments: np.ndarray,
                                  features: np.ndarray, centroids: np.ndarray):
        for c in range(num_clusters):
            new_centroid = 0
            cluster_size = 0
            for i in range(num_points):
                if cluster_assignments[i] == c:
                    new_centroid = new_centroid + features[i, :]
                    cluster_size += 1
            new_centroid = new_centroid / cluster_size
            centroids[c, :] = new_centroid
        return centroids

    @staticmethod
    def _loop_simple(num_clusters: int, num_points: int, num_features: int,
                     cluster_assignments: np.ndarray, features: np.ndarray,
                     centroids: np.ndarray):
        while True:
            # Compute distances from sample points to centroids
            centroid_distances = KMeansRunner._compute_distances_simple(
                num_points, num_features, num_clusters, centroids, features)

            # Expectation step: assign clusters
            cluster_assignments, \
            num_changed_assignments = KMeansRunner._expectation_step_simple(num_points,
                                                                            num_clusters,
                                                                            centroid_distances,
                                                                            cluster_assignments)

            # Maximization step: Update centroid for each cluster
            centroids = KMeansRunner._maximization_step_simple(
                num_clusters, num_points, cluster_assignments, features,
                centroids)

            if num_changed_assignments == 0:
                break

        # return cluster centroids and assignments
        return centroids, cluster_assignments

    @staticmethod
    def run_simple(features: np.ndarray, num_clusters: int):
        """Run Simple K-Means algorithm to convergence.

        Args:
            features: numpy.ndarray: An N-by-d array describing N data points each of dimension d
            num_clusters: int: The number of clusters desired
        """

        num_points = features.shape[0]  # num sample points
        num_features = features.shape[1]  # num features

        # INITIALIZATION PHASE
        # initialize centroids randomly as distinct elements of xs
        np.random.seed(0)
        centroid_ids = np.random.choice(num_points, (num_clusters, ),
                                        replace=False)
        centroids = features[centroid_ids, :]
        cluster_assignments = np.zeros(num_points, dtype=np.uint8)

        # loop until convergence
        centroids, cluster_assignments = \
            KMeansRunner._loop_simple(num_clusters, num_points, num_features, cluster_assignments,
                                      features, centroids)

        # return cluster centroids and assignments
        return centroids, cluster_assignments

    @staticmethod
    def _compute_distances_vectorized_jacob(num_points: int, num_clusters: int,
                                            centroids: np.ndarray,
                                            features: np.ndarray):
        # all  pair-wise _squared_ distances
        centroid_distances = np.zeros((num_points, num_clusters))
        for i in range(num_points):
            xi = features[i, :]
            for c in range(num_clusters):
                cc = centroids[c, :]
                dist = np.sum((xi - cc)**2)
                centroid_distances[i, c] = dist
        return centroid_distances

    @staticmethod
    def _expectation_step_vectorized_jacob(num_points: int, num_clusters: int,
                                           centroid_distances: np.ndarray,
                                           cluster_assignments: np.ndarray):
        num_changed_assignments = 0
        # claim: we can just do the following:
        # assignments = np.argmin(centroid_distances, axis=1)
        for i in range(num_points):
            # pick closest cluster
            cmin = 0
            mindist = np.inf
            for c in range(num_clusters):
                if centroid_distances[i, c] < mindist:
                    cmin = c
                    mindist = centroid_distances[i, c]
            if cluster_assignments[i] != cmin:
                num_changed_assignments += 1
            cluster_assignments[i] = cmin

        return centroid_distances, cluster_assignments, num_changed_assignments

    @staticmethod
    def _maximization_step_vectorized_jacob(num_clusters: int, num_points: int,
                                            cluster_assignments: np.ndarray,
                                            features: np.ndarray,
                                            centroids: np.ndarray):
        for c in range(num_clusters):
            new_centroid = 0
            cluster_size = 0
            for i in range(num_points):
                if cluster_assignments[i] == c:
                    new_centroid = new_centroid + features[i, :]
                    cluster_size += 1
            new_centroid = new_centroid / cluster_size
            centroids[c, :] = new_centroid

        return centroids

    @staticmethod
    def _loop_vectorized_jacob(num_clusters: int, num_points: int,
                               cluster_assignments: np.ndarray,
                               features: np.ndarray, centroids: np.ndarray):
        loop_cnt = 0
        while True:
            loop_cnt += 1
            # Compute distances from sample points to centroids
            centroid_distances = KMeansRunner._compute_distances_vectorized_jacob(
                num_points, num_clusters, centroids, features)

            # Expectation step: assign clusters
            centroid_distances, cluster_assignments, num_changed_assignments = \
                KMeansRunner._expectation_step_vectorized_jacob(num_points, num_clusters,
                                                                centroid_distances,
                                                                cluster_assignments)

            # Maximization step: Update centroid for each cluster
            centroids = KMeansRunner._maximization_step_vectorized_jacob(
                num_clusters, num_points, cluster_assignments, features,
                centroids)

            if num_changed_assignments == 0:
                break

        # return cluster centroids and assignments
        return centroids, cluster_assignments

    @staticmethod
    def run_vectorized_jacob(features: np.ndarray, num_clusters: int):
        """Run k-means algorithm to convergence.

        Args:
            features: numpy.ndarray: An num_points-by-d array describing num_points data points each
            of dimension d
            num_clusters: int: The number of clusters desired
        """
        num_points = features.shape[0]  # num sample points

        # INITIALIZATION PHASE
        # initialize centroids randomly as distinct elements of xs
        np.random.seed(0)
        centroids_ids = np.random.choice(num_points, (num_clusters, ),
                                         replace=False)
        centroids = features[centroids_ids, :]
        cluster_assignments = np.zeros(num_points, dtype=np.uint8)

        # loop until convergence
        centroids, cluster_assignments = \
            KMeansRunner._loop_vectorized_jacob(num_clusters, num_points, cluster_assignments,
                                                features, centroids)

        # return cluster centroids and assignments
        return centroids, cluster_assignments

    @staticmethod
    def _compute_distances_vectorized(centroids: np.ndarray,
                                      features: np.ndarray) -> np.ndarray:
        from scipy.spatial.distance import cdist
        # all  pair-wise _squared_ distances
        return np.square(cdist(features, centroids, 'euclidean'))

    @staticmethod
    def _expectation_step_vectorized(
            centroid_distances: np.ndarray,
            cluster_assignments: np.ndarray) -> [np.ndarray, np.ndarray]:
        return np.argmin(centroid_distances, axis=1), cluster_assignments

    @staticmethod
    def _maximization_step_vectorized(num_clusters: int,
                                      cluster_assignments: np.ndarray,
                                      features: np.ndarray,
                                      centroids: np.ndarray) -> np.ndarray:
        for cluster_ind in range(num_clusters):
            features_of_curr_cluster = features[cluster_assignments ==
                                                cluster_ind]
            centroids[cluster_ind, :] = np.mean(features_of_curr_cluster,
                                                axis=0)
        # USE PANDAS TO GROUP BY CLUSTER -> MEAN ???
        return centroids

    @staticmethod
    def _break_condition_vectorized(cluster_assignments: np.ndarray,
                                    previous_assignments: np.ndarray):
        return (cluster_assignments == previous_assignments).all()

    @staticmethod
    def _loop_vectorized(num_clusters: int, cluster_assignments: np.ndarray,
                         features: np.ndarray, centroids: np.ndarray):
        loop_cnt = 0
        while True:
            loop_cnt += 1
            # Compute distances from sample points to centroids
            # all  pair-wise _squared_ distances
            centroid_distances = KMeansRunner._compute_distances_vectorized(
                centroids, features)

            # Expectation step: assign clusters
            cluster_assignments, previous_assignments = \
                KMeansRunner._expectation_step_vectorized(centroid_distances, cluster_assignments)

            # Maximization step: Update centroid for each cluster
            centroids = KMeansRunner._maximization_step_vectorized(
                num_clusters, cluster_assignments, features, centroids)
            # Break Condition
            if KMeansRunner._break_condition_vectorized(
                    cluster_assignments, previous_assignments):
                break

        # return cluster centroids and cluster_assignments
        return centroids, cluster_assignments

    @staticmethod
    def run_vectorized(features: np.ndarray, num_clusters: int):
        """Run k-means algorithm to convergence.

            This is the Lloyd's algorithm [2] which consists of alternating expectation
            and maximization steps.

            Args:
                features: numpy.ndarray: An num_points-by-d array describing num_points data points
                each of dimension d.
                num_clusters: int: The number of clusters desired.
            Returns:
                centroids: numpy.ndarray: A num_clusters-by-d array of cluster centroid
                    positions.
                cluster_assignments: numpy.ndarray: An num_points-length vector of integers whose
                values from 0 to num_clusters-1 indicate which cluster each data element belongs to.

            [1] https://en.wikipedia.org/wiki/K-means_clustering
            [2] https://en.wikipedia.org/wiki/Lloyd%27s_algorithm
            """
        #
        # INITIALIZATION PHASE
        # initialize centroids randomly as distinct elements of features

        num_points = features.shape[0]  # num sample points
        np.random.seed(0)
        centroid_ids = np.random.choice(num_points, (num_clusters, ),
                                        replace=False)
        centroids = features[centroid_ids, :]
        cluster_assignments = np.zeros(num_points, dtype=np.uint8)
        # Loop until convergence
        centroids, cluster_assignments = \
            KMeansRunner._loop_vectorized(num_clusters, cluster_assignments, features, centroids)

        # return cluster centroids and cluster_assignments
        return centroids, cluster_assignments

    def _load_dataset(self, dataset_name: str, dataset: str):
        if dataset == 'iris':
            if self.features_iris is None:
                from sklearn.datasets import load_iris
                self.features_iris, _ = load_iris(return_X_y=True)
                self.logger.info(
                    f"Dataset {dataset_name} loaded. Shape: {self.features_iris.shape}."
                )
            return self.features_iris
        else:
            if self.features_tcga is None:
                import pandas as pd
                features_pd = pd.read_csv(dataset)
                features_pd.drop('Unnamed: 0', axis=1, inplace=True)
                self.features_tcga = features_pd.to_numpy()
                self.logger.info(
                    f"Dataset {dataset_name} loaded. Shape: {self.features_tcga.shape}."
                )
            return self.features_tcga

    def run(self, run_type: str, num_clusters: int, dataset: str):
        """Profile one K-Means variant on a dataset and write the results to disk.

        The results file and the profiler dump are written to the `outputs`
        folder next to this module's parent directory; the folder is created if
        it does not exist.

        Args:
            run_type: Key of `self.funcs` selecting the implementation
                ('simple', 'vectorized' or 'vectorized_jacob').
            num_clusters: The number of clusters to find
            dataset: The name or path of the dataset

        Returns:
            None. Assignments and centroids are written to the results file.

        Raises:
            KeyError: If `run_type` is not a key of `self.funcs`.

        Info:
            features shape: (# points, # features)
            centroids shape: (# clusters, # features)
            centroid_distances shape: (# points, # clusters)
        """

        # Setup func to run and dataset to use
        run_func = self.funcs[run_type]
        dataset_name = 'tcga' if dataset != 'iris' else dataset

        # Prepare output folders and names
        sys_path = os.path.dirname(os.path.realpath(__file__))
        output_file_name = f'assignment3_{dataset_name}_{run_type}.txt'
        profiler_file_name = f'assignment3_{dataset_name}_{run_type}.o'
        output_base_path = os.path.join(sys_path, '..', 'outputs')
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(output_base_path, exist_ok=True)
        profiler_file_path = os.path.join(output_base_path, profiler_file_name)
        output_file_path = os.path.join(output_base_path, output_file_name)

        # Open results output file (kept on self while the run is in progress)
        with open(output_file_path, 'w') as self.outputs_file:
            self.outputs_file.write(
                f'K-Means {run_type} version for the {dataset_name} dataset '
                f'with {num_clusters} clusters .\n')

            # Load Dataset if not already loaded
            features = self._load_dataset(dataset_name, dataset)

            # Run Kmeans
            k_words = ['kmeans.py', 'ncalls'
                       ]  # Include only pstats that contain these words
            custom_print = f'Profiling `{run_type}` K-Means for the `{dataset_name}` dataset: '
            with profileit(file=self.outputs_file,
                           profiler_output=profiler_file_path,
                           custom_print=custom_print,
                           keep_only_these=k_words):
                centroids, assignments = run_func(features=features,
                                                  num_clusters=num_clusters)

            # Save results
            self.logger.info(f"Final Cluster Assignments: \n{assignments}")
            self.outputs_file.write('Assignments:\n')
            self.outputs_file.write(f'{assignments.tolist()}\n')
            self.outputs_file.write('Centroids:\n')
            self.outputs_file.write(f'{centroids.tolist()}')
예제 #15
0
import random
import multiprocessing as mp
import threading
from concurrent import futures
from typing import List
import time

from playground import ColorizedLogger

# Module-level logger named 'ParallelBench', created with the color 'red'
logger = ColorizedLogger('ParallelBench', 'red')


class BenchTests:
    def __init__(self, max_float, loops):
        random.seed(2)
        self.max_float = max_float
        self.loops = loops
        self.result = random.uniform(1.0, self.max_float)
        self.numbers_to_mult = [
            random.uniform(1.0, max_float) for _ in range(loops)
        ]
        self.numbers_to_div = [
            random.uniform(1.0, max_float) for _ in range(loops)
        ]
        self.first_list = self.numbers_to_mult
        self.second_list = self.numbers_to_div
        self.third_list = []

    def math_calc(self):
        while len(self.numbers_to_mult) + len(self.numbers_to_div) > 0:
            try:
예제 #16
0
import os
import logging
from typing import Dict, List, Tuple, Union
import json
import _io
from io import StringIO, TextIOWrapper
import re
import yaml
from jsonschema import validate as validate_json_schema

from playground import ColorizedLogger

# Module-level logger named 'Config', created with the color 'white'
logger = ColorizedLogger('Config', 'white')


class Configuration:
    __slots__ = ('config', 'config_path', 'config_keys', 'tag')

    config: Dict
    config_path: str
    tag: str
    config_keys: List
    env_variable_tag: str = '!ENV'
    env_variable_pattern: str = r'.*?\${(\w+)}.*?'  # ${var}

    def __init__(self, config_src: Union[TextIOWrapper, StringIO, str],
                 config_schema_path: str = 'yml_schema.json'):
        """
       The basic constructor. Creates a new instance of the Configuration class.

        Args:
예제 #17
0
from contextlib import ContextDecorator
from typing import Callable, IO, List
from io import StringIO
from functools import wraps
import cProfile
import pstats

from playground import ColorizedLogger

# Module-level logger named 'Profileit', created with the color 'white'
profile_logger = ColorizedLogger('Profileit', 'white')


class profileit(ContextDecorator):
    custom_print: str
    profiler: cProfile.Profile
    stream: StringIO
    sort_by: str
    keep_only_these: List
    fraction: float
    skip: bool
    profiler_output: str
    file: IO

    def __init__(self, **kwargs):
        """Decorator/ContextManager for profiling functions and code blocks

        Args:
            custom_print: Custom print string. When used as decorator it can also be formatted using
                          `func_name`, `args`, and {0}, {1}, .. to reference the function's
                          first, second, ... argument.
            sort_by: pstats sorting column
예제 #18
0
import argparse
import logging
import os
import sys
import traceback
from typing import Dict

from playground import ColorizedLogger, timeit, Configuration, NumbaPlay
from playground import run_math_calc_test, run_fill_and_empty_list_test

# Module-level logger named 'Main', created with the color 'yellow'
logger = ColorizedLogger(logger_name='Main', color='yellow')


def get_args() -> argparse.Namespace:
    """Setup the argument parser

    Returns:
        argparse.Namespace:
    """
    parser = argparse.ArgumentParser(
        description='A playground repo for the DSE-512 course..',
        add_help=False)
    # Required Args
    required_args = parser.add_argument_group('Required Arguments')
    config_file_params = {
        'type': argparse.FileType('r'),
        'required': True,
        'help': "The configuration yml file"
    }
    required_args.add_argument('-c', '--config-file', **config_file_params)
    # Optional args
예제 #19
0
class MPlayI:
    __slots__ = ('comm', 'rank', 'size')
    comm: MPI.COMM_WORLD
    rank: int
    size: int
    logger: ColorizedLogger = ColorizedLogger('MPI Play', 'cyan')
    colors: Dict = {
        0: 'blue',
        1: 'green',
        2: 'magenta',
        3: 'cyan',
        4: 'yellow',
        5: 'white',
        6: 'grey',
        7: 'black'
    }

    def __init__(self):
        """Set up logging, bind to MPI.COMM_WORLD and announce the size on rank 0."""
        self._mpi_log_setup()
        world = MPI.COMM_WORLD
        self.comm = world
        self.rank = world.rank
        self.size = world.size

        # Only the root rank reports the communicator size
        if self.rank != 0:
            return
        self.logger.info(f"Starting with size: {self.size}")

    @staticmethod
    def _mpi_log_setup():
        """Point the ColorizedLogger at `logs/mpi.log`, two levels above this module."""
        module_dir = os.path.dirname(os.path.realpath(__file__))
        ColorizedLogger.setup_logger(
            log_path=os.path.join(module_dir, '..', '..', 'logs', 'mpi.log'))

    @staticmethod
    def _chunk_list(seq, num):
        """Split `seq` into exactly `num` contiguous chunks of near-equal length.

        Chunk boundaries are computed with integer arithmetic
        (`len(seq) * i // num`), so the result always has `num` entries, which is
        what `comm.scatter` requires. Chunks are empty when `seq` has fewer than
        `num` items, or is empty. The previous float-accumulating loop returned
        `[]` for an empty `seq`, could be thrown off by rounding, and never
        terminated for a negative `num`.

        Args:
            seq: A sliceable sequence (e.g. a list of file paths).
            num: Number of chunks to produce; must be at least 1.

        Returns:
            A list of `num` slices of `seq` that together cover it in order.

        Raises:
            ValueError: If `num` is smaller than 1.
        """
        if num < 1:
            raise ValueError(f"num must be at least 1, got {num}")
        total = len(seq)
        return [seq[(total * i) // num:(total * (i + 1)) // num]
                for i in range(num)]

    @staticmethod
    def _chunk_for_scatterv(np_arr, size):
        """Compute per-rank row counts and starting rows for a Scatterv of `np_arr`.

        The first `remainder` ranks receive one extra row, so the counts differ
        by at most one.

        Returns:
            Two arrays of length `size`: the number of rows per split and the
            index of the first row of each split.
        """
        base_count, remainder = divmod(np_arr.shape[0], size)
        counts = np.array([base_count + 1 if split < remainder else base_count
                           for split in range(size)])
        # displacement: running total of the counts that precede each split
        offsets = []
        running_total = 0
        for count in counts:
            offsets.append(running_total)
            running_total += count
        return counts, np.array(offsets)

    def simple(self):
        """Log a greeting from this rank, then wait at a barrier for all ranks."""
        greeting = f"Hello from rank {self.rank} of size {self.size}"
        self.logger.info(greeting)
        # Block until every rank has reached this point
        self.comm.Barrier()

    def broadcast(self):
        """Broadcast four random doubles from rank 0 and log the buffer before/after."""
        if self.rank != 0:
            # Receivers only need an uninitialized buffer of the right size/type
            buffer = np.empty(4, dtype=np.float64)
            self.logger.info(buffer)
        else:
            buffer = np.random.randn(4) * 100

        self.logger.info(f"Rank {self.rank} before broadcast has {buffer}")
        self.comm.Bcast([buffer, MPI.DOUBLE])
        self.logger.info(f"Rank {self.rank} after broadcast has {buffer}")

    def scatter_gather(self):
        """Scatter one integer per rank from rank 0, then gather them back on rank 0."""
        is_root = self.rank == 0
        payload = list(range(self.size)) if is_root else None
        received = self.comm.scatter(payload, root=0)
        self.logger.info(f"Rank {self.rank} after scatter has {received}")
        self.comm.Barrier()
        if is_root:
            self.logger.info("****Gathering!****")
        gathered = self.comm.gather(received, root=0)
        self.logger.info(f"Rank {self.rank} after gather has {gathered}")

    def all_gather(self):
        """Scatter one integer per rank from rank 0, then allgather them on every rank."""
        is_root = self.rank == 0
        payload = None
        if is_root:
            payload = list(range(self.size))
            self.logger.info(f"Rank {self.rank} scattering {payload}")

        received = self.comm.scatter(payload, root=0)
        self.logger.info(f"Rank {self.rank} after scatter has {received}")

        self.comm.Barrier()

        if is_root:
            self.logger.info("****Gathering!****")

        # allgather takes no root argument: every rank receives the full list
        gathered = self.comm.allgather(received)
        self.logger.info(f"Rank {self.rank} after gather has {gathered}")

    def mpi_reduce(self):
        """Scatter the integers 1..size from rank 0, then reduce them onto rank 0."""
        is_root = self.rank == 0
        payload = None
        if is_root:
            payload = list(range(1, self.size + 1))
            self.logger.info(f"Rank {self.rank} scattering {payload}")

        received = self.comm.scatter(payload, root=0)
        self.logger.info(f"Rank {self.rank} after scatter has {received}")

        self.comm.Barrier()

        if is_root:
            self.logger.info("****Reduce!****")

        # No `op` is passed, so the communicator's default reduction is used
        reduced = self.comm.reduce(received, root=0)
        self.logger.info(f"Rank {self.rank} after reduce has {reduced}")

    def mpi_all_reduce(self):
        """Scatter the integers 1..size from rank 0, then allreduce them on every rank."""
        is_root = self.rank == 0
        payload = None
        if is_root:
            payload = list(range(1, self.size + 1))
            self.logger.info(f"Rank {self.rank} scattering {payload}")

        received = self.comm.scatter(payload, root=0)
        self.logger.info(f"Rank {self.rank} after scatter has {received}")

        self.comm.Barrier()

        if is_root:
            self.logger.info("****Reduce!****")

        # As with allgather, allreduce is called without a root process
        reduced = self.comm.allreduce(received)
        self.logger.info(f"Rank {self.rank} after reduce has {reduced}")

    def count_lines(self):
        """Count the total lines of the *.txt files in data/mpi_count_lines.

        Rank 0 lists the files and scatters one chunk of paths to each rank;
        every rank counts the lines of its own files and the per-rank counts are
        reduced onto rank 0, which logs the total.
        """
        if self.rank == 0:
            from glob import glob
            pattern = os.path.join('data', 'mpi_count_lines', '*.txt')
            file_chunks = self._chunk_list(list(glob(pattern)), self.size)
        else:
            file_chunks = None
        files = self.comm.scatter(file_chunks, root=0)
        self.logger.info(f"Rank {self.rank} has to count lines for these files: {files}")
        lines_cnt = 0
        for path in files:
            with open(path, 'r') as handle:
                for _ in handle:
                    lines_cnt += 1
        self.logger.info(f"Rank {self.rank} counted {lines_cnt} lines in total")
        self.comm.Barrier()

        total_lines_cnt = self.comm.reduce(lines_cnt, root=0)
        if self.rank != 0:
            return
        self.logger.info(f"After reduce, counted {total_lines_cnt} lines from all ranks.")

    def reduce_complex_old(self):
        """Older Scatterv + Reduce experiment: scatter rows, average locally, sum on rank 0.

        Rank 0 builds a (2 * size + 1)-by-4 float matrix, splits it row-wise into
        `size` chunks and scatters the rows with Scatterv. Every rank then takes
        the column-wise mean of its rows and the means are combined with
        Reduce(op=MPI.SUM) towards rank 0.

        NOTE(review): the name suggests this is superseded by `reduce_complex`;
        see the review notes below before relying on it.
        """

        if self.rank == 0:
            data = np.array([[x * 1.1, x * 1.1 + 2, x * 1.1 + 4, x * 1.1 + 6] for x in
                             range(1, (self.size + 1) * 2)])
            # Result buffer shaped like a single row of `data` (one value per column)
            res = np.zeros_like(data[0])
            # Split input array row-wise into one chunk per rank
            data_ch = np.array_split(data, self.size, axis=0)

            chunk_sizes = []

            # Number of rows in each chunk. np.append on the empty list yields a
            # float64 array, so the counts below are floats.
            # NOTE(review): confirm mpi4py accepts float counts/displacements.
            for i in range(0, len(data_ch), 1):
                chunk_sizes = np.append(chunk_sizes, len(data_ch[i]))

            # Counts/displacements in elements (rows * columns), as Scatterv expects
            chunk_sizes_input = chunk_sizes * data.shape[1]
            displacements_input = np.insert(np.cumsum(chunk_sizes_input), 0, 0)[0:-1]

            # Same values again, kept under separate names for the output side
            chunk_sizes_output = chunk_sizes * data.shape[1]
            displacements_output = np.insert(np.cumsum(chunk_sizes_output), 0, 0)[0:-1]
            self.logger.info(f"Rank {self.rank} scattering {data_ch}")
            self.logger.info(f"Expected result format: {res}")
        else:
            # Create placeholder variables on the non-root ranks
            chunk_sizes_input = None
            displacements_input = None
            chunk_sizes_output = None
            displacements_output = None
            data_ch = None
            data = None
            res = None

        # Every rank receives the full list of chunks; it is only used below to
        # size the local receive buffer
        data_ch = self.comm.bcast(data_ch, root=0)  # Broadcast split array to other cores
        chunk_sizes_output = self.comm.bcast(chunk_sizes_output, root=0)
        displacements_output = self.comm.bcast(displacements_output, root=0)

        # Create array to receive subset of data on each core, where rank specifies the core
        output_chunk = np.zeros(np.shape(data_ch[self.rank]))
        self.comm.Scatterv([data, chunk_sizes_input, displacements_input, MPI.DOUBLE], output_chunk,
                           root=0)
        self.logger.info(
            f"Rank {self.rank} after scatter has (shape: {output_chunk.shape}):\n{output_chunk}")
        # Create output array on each core
        output = np.zeros([len(output_chunk), output_chunk.shape[1]])

        # Row-by-row copy of the received chunk into `output`
        for i in range(0, np.shape(output_chunk)[0], 1):
            output[i, 0:output_chunk.shape[1]] = output_chunk[i]

        self.comm.Barrier()

        if self.rank == 0:
            self.logger.info(f"****Reduce!****")

        # self.comm.Gatherv(output, [res, chunk_sizes_output, displacements_output, MPI.DOUBLE],
        #                   root=0)  # Gather output data together
        # Column-wise mean of this rank's rows
        output = np.mean(output, axis=0)
        self.logger.info(f"Rank {self.rank} output (shape: {output.shape}):\n{output}")
        # NOTE(review): the receive argument uses the 4-item
        # [buffer, counts, displacements, type] form of the Gatherv call above;
        # confirm that mpi4py's Reduce accepts it. The SUM of per-rank means is
        # also not the overall mean when chunks have different sizes.
        self.comm.Reduce(
            output,
            [res, chunk_sizes_output, displacements_output, MPI.DOUBLE],
            op=MPI.SUM,
            root=0
        )

        self.logger.info(f"Rank {self.rank} after reduce has {res}")

    def reduce_complex(self):
        """Scatter rows and cluster labels with Scatterv, then reduce the mean of cluster 1.

        Rank 0 builds a (2 * size + 1)-by-4 float matrix and one cluster label per
        row. Rows (flattened) and labels are distributed with Scatterv; every rank
        sums its rows that belong to cluster 1, the partial sums and partial counts
        are reduced onto rank 0, and rank 0 logs the resulting average.

        Changes compared to the previous version:
            * `np.int` (removed in NumPy 1.24) is replaced by the builtin `int`,
              which selects the same default integer dtype.
            * The demo labels are repeated/truncated to one label per data row;
              the fixed list of 11 labels only matched the data for size == 5.
        """
        if self.rank == 0:
            data = np.array([[x * 1., (x + 2) * 1., (x + 4) * 1., (x + 6) * 1.] for x in
                             range(1, (self.size + 1) * 2)])
            # np.resize repeats (or truncates) the pattern to exactly one label per row
            label_pattern = np.array([0, 1, 1, 3, 1, 1, 2, 4, 1, 3, 3], dtype=np.int64)
            cluster_assignments = np.resize(label_pattern, data.shape[0])
            num_features = data.shape[1]
            # *_orig count rows (used for the labels); the scaled versions count
            # elements of the flattened data matrix
            items_per_split_orig, starting_index_orig = self._chunk_for_scatterv(data, self.size)
            items_per_split = items_per_split_orig * num_features
            starting_index = starting_index_orig * num_features
            self.logger.info(f"Data ({data.shape}): {data[:1]}, ..")
            data = data.flatten()
            self.logger.info(f"Data Flat "
                             f"({data.shape}, {data.dtype}):{data[:6]}, ..")
            self.logger.info(f"Assignments({cluster_assignments.shape}, {cluster_assignments.dtype}): "
                             f"{cluster_assignments}")
            self.logger.info(f"Items per split Original: {items_per_split_orig}")
            self.logger.info(f"Items per split: {items_per_split}")
            self.logger.info(f"Starting Index Original: {starting_index_orig}")
            self.logger.info(f"Starting Index: {starting_index}")
            self.logger.info(f"Num Features: {num_features}")
        else:
            data = None
            cluster_assignments = None
            num_features = None
            # Receive buffers for the Bcast below; `int` is the same default
            # integer dtype np.array() picked for the counts on rank 0
            items_per_split = np.zeros(self.size, dtype=int)
            items_per_split_orig = np.zeros(self.size, dtype=int)
            # Displacements are only needed by the root of a Scatterv
            starting_index = None
            starting_index_orig = None

        # Broadcast the number of items per split
        self.comm.Bcast(items_per_split, root=0)
        self.comm.Bcast(items_per_split_orig, root=0)
        num_features = self.comm.bcast(num_features, root=0)

        # Scatter cluster assignments
        cluster_assignments_chunked = np.zeros(items_per_split_orig[self.rank], dtype=np.int64)
        self.logger.info(f"Initialized chunked assignments ({cluster_assignments_chunked.shape}): "
                         f"{cluster_assignments_chunked}")
        self.comm.Scatterv([cluster_assignments, items_per_split_orig, starting_index_orig,
                            MPI.INT64_T], cluster_assignments_chunked, root=0)
        self.logger.info(f"Received cluster_assignments_chunked "
                         f"({cluster_assignments_chunked.shape}): {cluster_assignments_chunked}")

        # Scatter data points-features
        data_chunked_flat = np.zeros(items_per_split[self.rank])
        self.comm.Scatterv([data, items_per_split, starting_index, MPI.DOUBLE],
                           data_chunked_flat,
                           root=0)
        # Restore the (rows, features) layout of the received flat buffer
        data_chunked = data_chunked_flat.reshape(-1, num_features)
        self.logger.info(f"Received data_chunked ({data_chunked.shape}):\n{data_chunked}")

        # Reduce and find average for cluster 1
        if self.rank == 0:
            self.logger.info("****Reduce!****")

        # Find avg for cluster 1 only
        data_chunked_clust_1 = data_chunked[cluster_assignments_chunked == 1]
        self.logger.info(f"Data for cluster 1 (shape: {data_chunked_clust_1.shape}:\n "
                         f"{data_chunked_clust_1}")
        # Local sum of the cluster-1 rows (zeros when this rank holds none)
        size_cluster_1_chunked = data_chunked_clust_1.shape[0]
        if size_cluster_1_chunked > 0:
            sum_cluster_1_chunked = np.sum(data_chunked_clust_1, axis=0)
        else:
            sum_cluster_1_chunked = np.zeros_like(data_chunked[0])
        self.logger.info(f"Sum cluster 1 (shape: {sum_cluster_1_chunked.shape}: "
                         f"{sum_cluster_1_chunked}")
        # Reduce the internal sums to find total sum
        sum_cluster_1 = np.zeros_like(sum_cluster_1_chunked)
        self.comm.Reduce([sum_cluster_1_chunked, MPI.DOUBLE], [sum_cluster_1, MPI.DOUBLE],
                         op=MPI.SUM, root=0)

        self.logger.info(f"Chunked size: {size_cluster_1_chunked}")
        total_size = self.comm.reduce(size_cluster_1_chunked, op=MPI.SUM, root=0)
        if self.rank == 0:
            self.logger.info(f"Total size: {total_size}. Summed sums: {sum_cluster_1}")
            avg_cluster_1 = sum_cluster_1 / total_size
            self.logger.info(f"Average Cluster 1: {avg_cluster_1}")
예제 #20
0
 def _mpi_log_setup():
     """Point the ColorizedLogger at `logs/mpi.log`, two levels above this module."""
     module_dir = os.path.dirname(os.path.realpath(__file__))
     ColorizedLogger.setup_logger(
         log_path=os.path.join(module_dir, '..', '..', 'logs', 'mpi.log'))
예제 #21
0
import logging
import traceback
import os
import sys
from typing import Dict

from playground.main import get_args
from playground import ColorizedLogger, Configuration, timeit

# Module-level logger for this script, created with the 'Main' name and the
# 'yellow' color argument.
main_logger = ColorizedLogger('Main', 'yellow')


def prepare_for_run(name: str, conf: Dict):
    """Extract the run settings from a configuration and log the invocation.

    Args:
        name: label of the run, only used in the log message.
        conf: configuration dict; ``conf['properties']`` must contain the
            ``num_clusters`` and ``dataset`` keys.

    Returns:
        The tuple ``(python_file_name, num_clusters, dataset, dataset_name)``
        where ``dataset_name`` is ``'iris'`` when the dataset is ``'iris'``
        and ``'tcga'`` for anything else.
    """
    python_file_name = 'kmeans.py'
    properties = conf['properties']
    num_clusters, dataset = properties['num_clusters'], properties['dataset']

    # Every dataset other than iris is reported as tcga
    if dataset != 'iris':
        dataset_name = 'tcga'
    else:
        dataset_name = dataset

    message = (f"Invoking {python_file_name}({name}) "
               f"for {num_clusters} clusters and the {dataset_name} dataset")
    main_logger.info(message)
    return python_file_name, num_clusters, dataset, dataset_name


def run_serial(name: str, conf: Dict, log_name: str) -> None:
    """ Runs the KMeans serial version for the specified configuration.

    Args:
        name: label of the run, forwarded to prepare_for_run.
        conf: configuration dict, forwarded to prepare_for_run.
        log_name: log file name.
    """

    # Extract the properties
    python_file_name, num_clusters, dataset, dataset_name = prepare_for_run(
        name, conf)
예제 #22
0
 def __init__(self, conf):
     """Create the 'NumbaPlay' logger, announce initialization and keep ``conf``."""
     self.logger = play_logger = ColorizedLogger('NumbaPlay', 'blue')
     play_logger.info("Initialized NumbaPlay..")
     self.conf = conf
예제 #23
0
import logging
import traceback
import os
from typing import Dict
from math import ceil
from itertools import repeat, takewhile

import multiprocessing
import numpy as np

from playground.main import get_args
from playground import ColorizedLogger, Configuration, timeit

# Create loggers with different colors to use in each problem:
# one for the main flow, one per problem (1-3) and two for the extra
# challenge (main and sub).
main_logger = ColorizedLogger('Main', 'yellow')
p1_logger = ColorizedLogger('Problem1', 'blue')
p2_logger = ColorizedLogger('Problem2', 'green')
p3_logger = ColorizedLogger('Problem3', 'magenta')
extra_ch_logger = ColorizedLogger('ExtraMain', 'yellow')
extra_sub_ch_logger = ColorizedLogger('ExtraSub', 'cyan')

# Global Vars (For the Extra Challenges)
# lock: multiprocessing.Lock
# multi_list: List = []


def my_pid(x: int) -> None:
    """ Problem 1 function to be called using pool.map

    Parameters:
        x: the id of the worker
예제 #24
0
class KMeansRunner:
    """K-means runner that can be created with or without MPI support.

    When constructed with MPI enabled, the instance stores the world
    communicator together with this process' rank and the communicator size,
    and gets a logger named and colored after its rank. Otherwise only a
    'Kmeans Serial' logger is created.
    """
    # Instances may only carry these attributes
    __slots__ = ('comm', 'rank', 'size', 'logger', 'mpi_enabled')
    comm: MPI.COMM_WORLD  # only assigned when MPI is enabled
    rank: int  # rank of this process in `comm`; only assigned when MPI is enabled
    size: int  # number of processes in `comm`; only assigned when MPI is enabled
    logger: ColorizedLogger
    # Logger color per MPI rank (index 0 is also used for the serial logger)
    colors: Dict = {
        0: 'blue',
        1: 'green',
        2: 'magenta',
        3: 'cyan',
        4: 'yellow',
        5: 'white',
        6: 'grey',
        7: 'black'
    }

    def __init__(self, mpi):
        """Set up logging and, when requested, the MPI communicator.

        Args:
            mpi: truthy to run under MPI. In that case ``comm``, ``rank`` and
                ``size`` are taken from ``MPI.COMM_WORLD`` and the logger is
                named after the rank. Otherwise only a serial logger is
                created and ``comm``/``rank``/``size`` stay unset.
        """
        self._kmeans_log_setup()
        self.mpi_enabled = mpi
        if self.mpi_enabled:
            self.comm = MPI.COMM_WORLD
            self.rank = self.comm.rank
            self.size = self.comm.size
            # The palette has a fixed number of entries; wrap around so that
            # runs with more processes than colors do not fail with KeyError.
            color = self.colors[self.rank % len(self.colors)]
            self.logger = ColorizedLogger('Kmeans %s' % self.rank, color)
        else:
            self.logger = ColorizedLogger('Kmeans Serial', self.colors[0])

    @staticmethod
    def _kmeans_log_setup():
        """Point ColorizedLogger at ``logs/kmeans.log``, two directories above this file."""
        here = os.path.dirname(os.path.realpath(__file__))
        ColorizedLogger.setup_logger(
            log_path=os.path.join(here, '..', '..', 'logs', 'kmeans.log'))

    @staticmethod
    def _chunk_list(seq, num):
        avg = len(seq) / float(num)
        out = []
        last = 0.0

        while last < len(seq):
            out.append(seq[int(last):int(last + avg)])
            last += avg
        return out

    @staticmethod
    def _run_vectorized(features: np.ndarray, num_clusters: int):
        """Run k-means algorithm to convergence.

        This is the Lloyd's algorithm [2] which consists of alternating expectation
        and maximization steps.

        Args:
            features: numpy.ndarray: An num_features-by-d array describing num_features data points
            each of dimension d.
            num_clusters: int: The number of clusters desired.
        Returns:
            centroids: numpy.ndarray: A num_clusters-by-d array of cluster centroid
                positions.
            cluster_assignments: numpy.ndarray: An num_features-length vector of integers whose values
                from 0 to num_clusters-1 indicate which cluster each data element
                belongs to.

        [1] https://en.wikipedia.org/wiki/K-means_clustering
        [2] https://en.wikipedia.org/wiki/Lloyd%27s_algorithm
        """
        num_features = features.shape[0]  # num sample points
        #
        # INITIALIZATION PHASE
        # initialize centroids randomly as distinct elements of features
        np.random.seed(0)
        centroid_ids = np.random.choice(num_features, (num_clusters, ),
                                        replace=False)
        centroids = features[centroid_ids, :]
        cluster_assignments = np.zeros(num_features, dtype=np.uint8)
        # Loop until convergence
        while True:
            # Compute distances from sample points to centroids
            # all  pair-wise _squared_ distances
            centroid_distances = np.square(features[:, np.newaxis] -
                                           centroids).sum(axis=2)

            # Expectation step: assign clusters
            previous_assignments = cluster_assignments
            cluster_assignments = np.argmin(centroid_distances, axis=1)

            # Maximization step: Update centroid for each cluster
            for cluster_ind in range(num_clusters):
                features_of_curr_cluster = features[cluster_assignments ==
                                                    cluster_ind]
                centroids[cluster_ind, :] = np.mean(features_of_curr_cluster,
                                                    axis=0)
            # USE PANDAS TO GROUP BY CLUSTER -> MEAN ???
            # Break Condition
            if (cluster_assignments == previous_assignments).all():
                break

        # return cluster centroids and cluster_assignments
        return centroids, cluster_assignments

    @staticmethod
    def _run_simple(features: np.ndarray, num_clusters: int):
        """Run k-means algorithm to convergence.

        Args:
            features: numpy.ndarray: An N-by-d array describing N data points each of dimension d
            num_clusters: int: The number of clusters desired
        """

        N = features.shape[0]  # num sample points
        d = features.shape[1]  # dimension of space

        #
        # INITIALIZATION PHASE
        # initialize centroids randomly as distinct elements of features
        np.random.seed(0)
        cids = np.random.choice(N, (num_clusters, ), replace=False)
        centroids = features[cids, :]
        assignments = np.zeros(N, dtype=np.uint8)

        # loop until convergence
        while True:
            # Compute distances from sample points to centroids
            # all  pair-wise _squared_ distances
            cdists = np.zeros((N, num_clusters))
            for i in range(N):
                xi = features[i, :]
                for c in range(num_clusters):
                    cc = centroids[c, :]
                    dist = 0
                    for j in range(d):
                        dist += (xi[j] - cc[j])**2
                    cdists[i, c] = dist

            # Expectation step: assign clusters
            num_changed_assignments = 0
            for i in range(N):
                # pick closest cluster
                cmin = 0
                mindist = np.inf
                for c in range(num_clusters):
                    if cdists[i, c] < mindist:
                        cmin = c
                        mindist = cdists[i, c]
                if assignments[i] != cmin:
                    num_changed_assignments += 1
                assignments[i] = cmin

            # Maximization step: Update centroid for each cluster
            for c in range(num_clusters):
                newcent = 0
                clustersize = 0
                for i in range(N):
                    if assignments[i] == c:
                        newcent = newcent + features[i, :]
                        clustersize += 1
                newcent = newcent / clustersize
                centroids[c, :] = newcent

            if num_changed_assignments == 0:
                break

        # return cluster centroids and assignments
        return centroids, assignments

    def run_serial(self, num_clusters: int, type_run: str):
        """Cluster the iris dataset with the selected k-means implementation.

        Args:
            num_clusters: number of clusters to look for.
            type_run: ``'simple'`` for the loop-based implementation or
                ``'vectorized'`` for the NumPy one.

        Raises:
            Exception: if ``type_run`` is neither of the two supported values.
        """
        from sklearn.datasets import load_iris

        features, labels = load_iris(return_X_y=True)

        # pick the implementation that matches the requested run type
        implementations = {
            'simple': self._run_simple,
            'vectorized': self._run_vectorized,
        }
        kmeans = implementations.get(type_run)
        if kmeans is None:
            raise Exception(f'Argument {type_run} not recognized!')
        centroids, assignments = kmeans(features=features,
                                        num_clusters=num_clusters)

        # report the outcome
        self.logger.info(
            f"\nCentroids: {centroids}\nAssignments: {assignments}")