Example No. 1
 def pythagorean_test(self):
     self.logger.info(
         f"Starting pythagorean tests for x={self.conf['x']}, y={self.conf['x']}.."
     )
     with timeit(
             custom_print='Numba pythagorean took {duration:.5f} sec(s)'):
         self.pythagorean_theorem(self.conf['x'], self.conf['y'])
     with timeit(
             custom_print='No Numba pythagorean took {duration:.5f} sec(s)'
     ):
         self.pythagorus(self.conf['x'], self.conf['y'])
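All of the examples on this page use a `timeit` context manager whose implementation is not shown here. A minimal sketch, inferred only from the calls above (the real class may differ): it accepts a `custom_print` template with a `{duration}` placeholder, an `internal_only` flag that suppresses printing and exposes the elapsed time via `.total`, and an optional `file` to write the formatted message to.

from time import perf_counter


class timeit:
    """Minimal stand-in for the timing context manager used in these examples."""

    def __init__(self, custom_print: str = 'Took {duration:.5f} sec(s)',
                 internal_only: bool = False, file=None):
        self.custom_print = custom_print
        self.internal_only = internal_only
        self.file = file
        self.total = 0.0  # elapsed seconds, filled in on exit

    def __enter__(self):
        self._start = perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.total = perf_counter() - self._start
        if not self.internal_only:
            message = self.custom_print.format(duration=self.total)
            if self.file is not None:
                self.file.write(message)
            else:
                print(message)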
Example No. 2
def _loop_vectorized_jacob(num_clusters: int, num_points: int,
                           cluster_assignments: np.ndarray,
                           features: np.ndarray, centroids: np.ndarray,
                           outputs_file: IO):
    loop_cnt = 0
    t_time_dists = 0.0
    t_time_expect = 0.0
    t_time_maxim = 0.0
    while True:
        loop_cnt += 1
        # Compute distances from sample points to centroids
        timeit_obj = timeit(internal_only=True)
        with timeit_obj:
            centroid_distances = _compute_distances_vectorized_jacob(
                num_points, num_clusters, centroids, features)
        t_time_dists += timeit_obj.total

        # Expectation step: assign clusters
        timeit_obj = timeit(internal_only=True)
        with timeit_obj:
            centroid_distances, cluster_assignments, num_changed_assignments = \
                _expectation_step_vectorized_jacob(num_points, num_clusters,
                                                   centroid_distances,
                                                   cluster_assignments)
        t_time_expect += timeit_obj.total

        # Maximization step: Update centroid for each cluster
        timeit_obj = timeit(internal_only=True)
        with timeit_obj:
            centroids = _maximization_step_vectorized_jacob(
                num_clusters, num_points, cluster_assignments, features,
                centroids)
        t_time_maxim += timeit_obj.total

        if num_changed_assignments == 0:
            break

    custom_print = '_compute_distances_vectorized_jacob K-Means using numba' + \
                   f' dataset took {t_time_dists:.4f} sec(s)\n'
    outputs_file.write(custom_print)
    custom_print = '_expectation_step_vectorized_jacob K-Means using numba' + \
                   f' dataset took {t_time_expect:.4f} sec(s)\n'
    outputs_file.write(custom_print)
    custom_print = '_maximization_step_vectorized_jacob K-Means using numba' + \
                   f' dataset took {t_time_maxim:.4f} sec(s)\n'
    outputs_file.write(custom_print)

    # return cluster centroids and assignments
    return centroids, cluster_assignments
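The helpers called in the loop above are defined elsewhere (and, judging by the timing strings, are Numba-compiled). As a plausible plain-NumPy sketch of `_compute_distances_vectorized_jacob`, assuming the shapes documented in Example No. 6 (features: (# points, # features), centroids: (# clusters, # features), distances: (# points, # clusters)):

import numpy as np


def _compute_distances_vectorized_jacob(num_points: int, num_clusters: int,
                                        centroids: np.ndarray,
                                        features: np.ndarray) -> np.ndarray:
    # Broadcast (num_points, 1, d) against (1, num_clusters, d) and sum the
    # squared differences over the feature axis, producing a
    # (num_points, num_clusters) matrix of squared distances.
    diffs = features[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    return np.sum(diffs ** 2, axis=2)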
Example No. 3
def run_kmeans(conf: Dict) -> None:
    """ Runs the KMeans tests for the specified configuration. """

    config = conf['config']
    run_type = conf['type']
    num_clusters = config['num_clusters']
    logger.info(
        f"Invoking kmeans.py with type=`{run_type}` and num_clusters=`{num_clusters}`"
    )
    sys_path = os.path.dirname(os.path.realpath(__file__))
    run_file_path = os.path.join(sys_path, 'KMeans', 'kmeans.py')
    if run_type == 'mpi':
        nprocs = config['nprocs']
        cmd = 'mpirun -n {nprocs} {python} {file} {num_clusters} {type}' \
            .format(nprocs=nprocs,
                    python=sys.executable,
                    file=run_file_path,
                    type=run_type,
                    num_clusters=num_clusters)

    elif run_type in ('simple', 'vectorized', 'distributed'):
        cmd = '{python} {file} {num_clusters} {type}'.format(
            python=sys.executable,
            file=run_file_path,
            type=run_type,
            num_clusters=num_clusters)
    else:
        raise Exception(f'Argument {run_type} not recognized!')
    with timeit(custom_print=
                f'Running KMeans {run_type} for {num_clusters} clusters took' +
                ' {duration:2.5f} sec(s)'):
        os.system(cmd)
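For reference, a configuration dictionary compatible with `run_kmeans` could look like the following; the keys mirror the ones read above, while the concrete values are purely illustrative.

conf = {
    'type': 'mpi',             # one of: mpi, simple, vectorized, distributed
    'config': {
        'num_clusters': 8,     # illustrative value
        'nprocs': 4,           # only read when type == 'mpi'
    },
}
run_kmeans(conf)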
Example No. 4
def run_vectorized_jacob(features: np.ndarray, num_clusters: int,
                         outputs_file: IO):
    """Run k-means algorithm to convergence.

    Args:
        features: numpy.ndarray: A num_points-by-d array describing num_points data
            points, each of dimension d
        num_clusters: int: The number of clusters desired
        outputs_file: The open file stream that the per-step timing results are
            written to
    """
    num_points = features.shape[0]  # num sample points

    # INITIALIZATION PHASE
    # initialize centroids randomly as distinct elements of xs
    np.random.seed(0)
    centroids_ids = np.random.choice(num_points, (num_clusters, ),
                                     replace=False)
    centroids = features[centroids_ids, :]
    cluster_assignments = np.zeros(num_points, dtype=np.uint8)

    # loop until convergence
    custom_print = '_loop_vectorized_jacob K-Means using numba' + \
                   ' dataset took {duration:.4f} sec(s)\n'
    with timeit(file=outputs_file, custom_print=custom_print):
        centroids, cluster_assignments = \
            _loop_vectorized_jacob(num_clusters, num_points, cluster_assignments,
                                   features, centroids, outputs_file)

    # return cluster centroids and assignments
    return centroids, cluster_assignments
Example No. 5
 def monte_carlo_pi_test(self):
     self.logger.info(
         f"Starting monte_carlo_pi tests for nsamples={self.conf['nsamples']}.."
     )
     with timeit(
             custom_print='Numba monte_carlo_pi took {duration:.5f} sec(s)'
     ):
         self.monte_carlo_pi(self.conf['nsamples'])
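The `monte_carlo_pi` method itself is not part of this snippet. A minimal Numba kernel of the kind such a test typically measures (a hypothetical stand-in, not the project's actual implementation):

import numpy as np
from numba import njit


@njit
def monte_carlo_pi(nsamples: int) -> float:
    # Estimate pi by sampling points in the unit square and counting how many
    # fall inside the quarter circle.
    acc = 0
    for _ in range(nsamples):
        x, y = np.random.random(), np.random.random()
        if x ** 2 + y ** 2 <= 1.0:
            acc += 1
    return 4.0 * acc / nsamples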
Example No. 6
def run(kmeans_obj: KMeansRunner, run_type: str, num_clusters: int,
        dataset: str):
    """

    Args:
        kmeans_obj:
        num_clusters: The number of clusters to find
        dataset: The name or path of the dataset

    Returns:

    Info:
        features shape: (# points, # features)
        centroids shape: (# clusters, # features)
        centroid_distances shape: (# points, # clusters)
    """

    # Setup func to run and dataset to use
    dataset_name = 'tcga' if dataset != 'iris' else dataset

    # Prepare output folders and names
    sys_path = os.path.dirname(os.path.realpath(__file__))
    output_file_name = f'assignment3_{dataset_name}_{run_type}.txt'
    output_base_path = os.path.join(sys_path, '..', 'outputs')
    if not os.path.exists(output_base_path):
        os.makedirs(output_base_path)
    output_file_path = os.path.join(output_base_path, output_file_name)

    # Open results output file
    outputs_file = open(output_file_path, 'w')
    outputs_file.write(
        f'K-Means {run_type} version for the {dataset_name} dataset '
        f'with {num_clusters} clusters .\n')

    # Load Dataset if not already loaded
    features = kmeans_obj._load_dataset(dataset_name, dataset)

    # Run Kmeans
    custom_print = f'`{run_type}` K-Means for the `{dataset_name}`' + \
                   ' dataset took {duration:.4f} sec(s)\n'
    with timeit(file=outputs_file, custom_print=custom_print):
        centroids, assignments = run_vectorized_jacob(
            features=features,
            num_clusters=num_clusters,
            outputs_file=outputs_file)

    # Save results
    kmeans_obj.logger.info(f"Final Cluster Assignments: \n{assignments}")
    outputs_file.write(f'Assignments:\n')
    outputs_file.write(f'{assignments.tolist()}\n')
    outputs_file.write(f'Centroids:\n')
    outputs_file.write(f'{centroids.tolist()}')

    # Close file stream
    outputs_file.close()
Example No. 7
 def logistic_regression_test(self):
     self.logger.info(
         f"Starting logistic_regression tests for "
         f"X={self.conf['x1']}, Y={self.conf['x2']}, "
         f"w={self.conf['w']}, iterations={self.conf['iterations']}..")
     with timeit(custom_print=
                 'Numba logistic_regression took {duration:.5f} sec(s)'):
         self.logistic_regression(
             np.random.rand(self.conf['x1'], self.conf['x2']),
             np.random.rand(self.conf['x1']), np.zeros([self.conf['x2']]),
             self.conf['iterations'])
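Based on the call above, `logistic_regression` receives X of shape (x1, x2), Y of shape (x1,), a zero-initialized weight vector w of shape (x2,), and an iteration count. A hypothetical Numba version consistent with those shapes (not the project's actual method):

import numpy as np
from numba import njit


@njit
def logistic_regression(X: np.ndarray, Y: np.ndarray, w: np.ndarray,
                        iterations: int) -> np.ndarray:
    # Plain gradient-descent updates of the logistic-regression weights.
    for _ in range(iterations):
        w -= np.dot(((1.0 / (1.0 + np.exp(-Y * np.dot(X, w))) - 1.0) * Y), X)
    return w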
Example No. 8
def extra_2(conf_props: Dict):
    """ Extra Challenge 2 solution

    Parameters:
         conf_props: The config loaded from the yml file
            Example:
                pool_sizes:
                  - 2
                  - 4
                  - 8
                  - 16
                  - 32
                num_term: 8000000
    """

    num_term = int(conf_props["num_term"])

    extra_ch_logger.info(
        "Will call `py_pi_better_with_queue` for pool_size in "
        f"{tuple(conf_props['pool_sizes'])}")
    for pool_size in conf_props["pool_sizes"]:
        pool_size = int(pool_size)
        # Create a multiprocessing manager and a Queue
        multi_manager = multiprocessing.Manager()
        multi_queue = multi_manager.Queue()
        # Slightly modified code from problem 3
        extra_sub_ch_logger.info(
            f"Pool Size={pool_size}: Calling Async workers for N={num_term}")
        step = ceil(num_term / pool_size)
        i_start = range(1, num_term, step)
        i_stop = map(
            lambda el: el + step - 1
            if el + step - 1 <= num_term else num_term, i_start)
        args = zip(repeat(num_term), i_start, i_stop, repeat(multi_queue))
        # Setup manually a Pool
        custom_string = f'Pool Size={pool_size}: Calculation of pi for N={num_term} ' + \
                        'took: {duration:2.5f} sec(s) total'
        with timeit(custom_print=custom_string):
            pool = multiprocessing.Pool(processes=pool_size)
            pool.starmap_async(func=py_pi_better_with_queue, iterable=args)
            pi_chunks = []
            while len(pi_chunks) < pool_size:
                pi_chunks.append(multi_queue.get())
            calced_pi = sum(pi_chunks)
        real_pi = np.pi
        pi_diff = abs(real_pi - calced_pi)
        extra_sub_ch_logger.info(
            f"Pool Size={pool_size}: Pi({num_term}) = {calced_pi}"
            f"(Real is {real_pi}, difference is {pi_diff})")
        pool.close()
        pool.join()
Example No. 9
    def train_data_parallel(
            self, train_loader: DataLoader) -> Tuple[List, List, List]:

        my_model = nn.parallel.DistributedDataParallel(self.my_model)
        learning_rate = self.learning_rate * dist.get_world_size()
        optimizer = optim.SGD(my_model.parameters(), lr=learning_rate)

        size_train_dataset = len(train_loader.dataset)
        epoch_losses = []
        epoch_accuracies = []
        epoch_times = []

        self.my_model.train()
        if self.rank == 0:
            iter_epochs = tqdm(range(self.epochs), desc='Training Epochs')
        else:
            iter_epochs = range(self.epochs)

        for _ in iter_epochs:
            timeit_ = timeit(internal_only=True)
            epoch_loss = 0.0
            correct = 0
            num_mini_batches = 0
            with timeit_:
                iter_mini_batches = enumerate(train_loader)
                for num_mini_batches, (X, Y) in iter_mini_batches:
                    optimizer.zero_grad()
                    # Forward through the DDP wrapper so gradients are synchronized
                    pred = my_model(X)
                    pred_val = torch.flatten(pred.data.max(1, keepdim=True)[1])
                    # correct += pred_val.eq(Y.data.view_as(pred_val)).sum().item()
                    correct += (pred_val == Y).sum().item()
                    loss = self.loss_function(pred, Y)
                    iter_loss = loss.item()
                    epoch_loss += iter_loss
                    loss.backward()
                    optimizer.step()

            epoch_loss /= (num_mini_batches + 1)
            epoch_losses.append(epoch_loss)
            epoch_accuracy = correct / (size_train_dataset /
                                        dist.get_world_size())
            epoch_accuracies.append(epoch_accuracy)
            epoch_time = timeit_.total
            epoch_times.append(epoch_time)
            if self.rank == 0:
                iter_epochs.set_postfix(epoch_accuracy=epoch_accuracy,
                                        epoch_loss=epoch_loss,
                                        epoch_time=epoch_time)

        return epoch_accuracies, epoch_losses, epoch_times
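`train_data_parallel` assumes that `torch.distributed` has already been initialized, so that `dist.get_world_size()` and `self.rank` are meaningful and `DistributedDataParallel` can synchronize gradients. A minimal setup sketch (the backend and environment variables are assumptions, not taken from this code):

import os
import torch.distributed as dist


def init_distributed(rank: int, world_size: int) -> None:
    # Address/port of the rank-0 process; adjust for multi-node runs.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    # 'gloo' works on CPU-only machines; 'nccl' is the usual choice for GPUs.
    dist.init_process_group(backend='gloo', rank=rank, world_size=world_size)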
Example No. 10
def problem3(conf: Dict) -> None:
    """ Problem 3 solution

    Parameters:
         conf: The config loaded from the yml file
            Example:
                properties:
                  pool_size: 4
                  chunk_size: 1
                  num_terms:
                    - 100
                    - 500
                    - 1000
                    - 2000
                    - 10000
                    - 50000
                conf_type: required
    """

    p3_logger.info("Starting Problem 3..")
    conf_props = conf['properties']
    p3_logger.info(
        f"Will call `py_pi_better` for N in {tuple(conf_props['num_terms'])}")
    # Run the pi calculation once for each number of terms requested in the yml
    for num_term in conf_props["num_terms"]:
        # Split the work of `num_terms` into `pool_size` number of parts
        step = ceil(num_term / conf_props["pool_size"])
        i_start = range(1, num_term, step)
        i_stop = map(
            lambda el: el + step - 1
            if el + step - 1 <= num_term else num_term, i_start)
        # Zip N with the i_start and i_stop iterables. Propagate the same N value using repeat
        args = zip(repeat(num_term), i_start, i_stop)
        # Call py_pi_better() using pool.starmap() (starmap accepts iterable with multiple arguments)
        with multiprocessing.Pool(processes=conf_props['pool_size']) as pool:
            # timeit can be used as a context manager too. Pass it a custom string and count the
            # total time to calculate pi
            custom_string = f'N={num_term}: Parallel calculation of pi took: ' + \
                            '{duration:2.5f} sec(s) total'
            with timeit(custom_print=custom_string):
                # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool.map
                pi_chunks = pool.starmap(func=py_pi_better,
                                         iterable=args,
                                         chunksize=conf_props['chunk_size'])
                calced_pi = sum(pi_chunks)
        real_pi = np.pi
        pi_diff = abs(real_pi - calced_pi)
        p3_logger.info(
            f"N={num_term}: Pi({num_term}) = {calced_pi} (Real is {real_pi}, "
            f"difference is {pi_diff})")
Example No. 11
def extra_1(conf_props: Dict):
    """ Extra Challenge 1 solution

    Parameters:
         conf_props: The config loaded from the yml file
            Example:
                pool_sizes:
                  - 2
                  - 4
                  - 8
                  - 16
                  - 32
                chunk_size: 1
                num_term: 8000000
    """

    num_term = int(conf_props["num_term"])
    extra_ch_logger.info("Will call `py_pi_better` for pool_size in "
                         f"{tuple(conf_props['pool_sizes'])}")
    for pool_size in conf_props["pool_sizes"]:
        pool_size = int(pool_size)
        # Slightly modified code from problem 3
        extra_sub_ch_logger.info(
            f"Pool Size={pool_size}: Calling workers for N={num_term}")
        step = ceil(num_term / pool_size)
        i_start = range(1, num_term, step)
        i_stop = map(
            lambda el: el + step - 1
            if el + step - 1 <= num_term else num_term, i_start)
        args = zip(repeat(num_term), i_start, i_stop)
        with multiprocessing.Pool(processes=pool_size) as pool:
            custom_string = f'Pool Size={pool_size}: Calculation of pi for N={num_term} took: ' + \
                            '{duration:2.5f} sec(s) total'
            with timeit(custom_print=custom_string):
                pi_chunks = pool.starmap(func=py_pi_better,
                                         iterable=args,
                                         chunksize=conf_props['chunk_size'])
                calced_pi = sum(pi_chunks)
        real_pi = np.pi
        pi_diff = abs(real_pi - calced_pi)
        extra_sub_ch_logger.info(
            f"Pool Size={pool_size}: Pi({num_term}) = {calced_pi} "
            f"(Real is {real_pi}, difference is {pi_diff})")
Example No. 12
 def prange_test(self):
     self.logger.info(f"Starting prange tests for A={self.conf['A']}..")
     with timeit(custom_print='Numba prange took {duration:.5f} sec(s)'):
         self.prange(np.arange(self.conf['A']))
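The `prange` method under test is not shown. A minimal Numba `prange` kernel of the kind this test typically measures (a hypothetical example, not the project's method):

import numpy as np
from numba import njit, prange


@njit(parallel=True)
def prange_sum(A: np.ndarray) -> float:
    # Numba splits the prange iterations across threads and reduces `acc`.
    acc = 0.0
    for i in prange(A.shape[0]):
        acc += A[i]
    return acc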