def optimize(self, net_string, device_graph):
    """
    Optimizes a configuration for the given net on the given hardware by
    evaluating every possible assignment of colocation groups to devices.

    :param net_string: The network that should be optimized, given as a json string.
    :param device_graph: The device graph that the network should be optimized for.
    :return: The network with the best placement applied, exactly as produced by
        ``apply_placement`` (it is not re-serialised here), or None if no
        placement was evaluated at all.
    """
    n_devices = len(device_graph.devices)
    groups = self.create_colocation_groups(
        get_flattened_layer_names(net_string))

    best_score = -1
    best_net = None

    # The search space has n_devices ** len(groups) entries, so this is only
    # feasible for very small problems.
    for comb in tqdm(product(range(n_devices), repeat=len(groups)),
                     total=n_devices**len(groups),
                     unit='placements'):
        net = apply_placement(net_string, comb, groups)
        score = self.evaluate_placement(net, device_graph)

        if best_net is None:
            # Always remember the first candidate so that something is
            # returned even if every placement turns out to be invalid.
            is_better = True
        elif score == -1:
            # The other optimizers in this file treat a score of -1 as an
            # invalid placement; it must never displace a real result just
            # because -1 compares lower than any run time.
            is_better = False
        else:
            is_better = best_score == -1 or score < best_score

        if is_better:
            best_net = net
            best_score = score

    return best_net
def optimize(self, net_string, device_graph):
    """
    Searches for a device placement by repeatedly re-assigning one random
    colocation group and accepting the change with a temperature-dependent
    probability.

    :param net_string: The network that should be optimized, given as a json string.
    :param device_graph: The device graph that the network should be optimized for.
    :return: The network with the final placement applied, as an indented JSON
        string. The same string is written to 'sa_solution.json' in the log dir.
    """
    device_count = len(device_graph.devices)
    groups = self.create_colocation_groups(
        get_flattened_layer_names(net_string))

    def run_time_of(candidate):
        # Score of a placement as reported by self.evaluate_placement.
        placed = apply_placement(net_string, candidate, groups)
        return self.evaluate_placement(placed, device_graph)

    # Start from a uniformly random assignment of groups to devices.
    placement = [randint(0, device_count - 1) for _ in range(len(groups))]
    score = run_time_of(placement)

    if self.score_save_period:
        with open(os.path.join(get_log_dir(), 'time_history.csv'), 'w') as f:
            f.write('step, time\n')

    for i in tqdm(range(self.steps), disable=not self.verbose):
        # Draw the new device before the position: the right-hand side of a
        # subscript assignment is evaluated first, and keeping that order keeps
        # seeded runs reproducible.
        drawn_device = randint(0, device_count - 1)
        drawn_index = randint(0, len(placement) - 1)
        new_placement = placement[:]
        new_placement[drawn_index] = drawn_device

        new_score = run_time_of(new_placement)

        if self.verbose and (i + 1) % self.verbose == 0:
            log(f'[{i + 1}/{self.steps}] Best run time: {score:,.2f}ms')

        if self.score_save_period and i % self.score_save_period == 0:
            with open(os.path.join(get_log_dir(), 'time_history.csv'), 'a') as f:
                f.write(f'{i + 1}, {score}\n')

        # A score of -1 is never accepted as the new current placement.
        if new_score == -1:
            continue

        # Accept improvements outright (or anything valid while the current
        # score is -1); otherwise accept with probability
        # expit((score - new_score) / temperature).
        if (new_score < score
                or score == -1
                or random() < expit((score - new_score) / self.temp(i))):
            score = new_score
            placement = new_placement

    solution = json.dumps(apply_placement(net_string, placement, groups),
                          indent=4)

    with open(os.path.join(get_log_dir(), 'sa_solution.json'), 'w') as f:
        f.write(solution)

    return solution
def optimize(self, net_string, device_graph):
    """
    Hill-climbs from a random placement: moves to the first neighbouring
    placement that scores better and stops once no neighbour improves.

    :param net_string: The network that should be optimized, given as a json string.
    :param device_graph: The device graph that the network should be optimized for.
    :return: The final placement (one device index per colocation group).
    """
    n_devices = len(device_graph.devices)

    def generate_neighbours(current):
        """Yield copies of `current` with one entry moved up or down by one device."""
        if n_devices == 1:
            return
        for position, device in enumerate(current):
            if device < n_devices - 1:
                shifted_up = current[:]
                shifted_up[position] = device + 1
                yield shifted_up
            if device > 0:
                shifted_down = current[:]
                shifted_down[position] = device - 1
                yield shifted_down

    # The parsed network is not used below; the call is kept as-is.
    net = json.loads(net_string)
    groups = self.create_colocation_groups(
        get_flattened_layer_names(net_string))

    def run_time_of(candidate):
        return self.evaluate_placement(
            apply_placement(net_string, candidate, groups), device_graph)

    placement = generate_random_placement(len(groups), n_devices)
    score = run_time_of(placement)

    i = 0
    while True:
        i += 1
        if self.verbose:
            log(f'Iteration {i}. Best running time: {score:.2f}ms')

        # Look for the first neighbour that is valid (score != -1) and better
        # than the current placement (or any valid one if the current is -1).
        improvement = None
        for neighbour in generate_neighbours(placement):
            neighbour_score = run_time_of(neighbour)
            if neighbour_score != -1 and (score == -1
                                          or neighbour_score < score):
                improvement = (neighbour, neighbour_score)
                break

        if improvement is None:
            # Local optimum: no neighbour improved on the current placement.
            break
        placement, score = improvement

    return placement
def eval_function(x): nonlocal i t.update(1) new_placement = [int(round(g)) for g in x] score = self.evaluate_placement( apply_placement(net_string, new_placement, groups), device_graph) i += 1 return score
def optimize(self, net_string, device_graph):
    """
    Searches for a device placement with scipy's dual annealing, treating each
    colocation group as one continuous dimension that is rounded to a device
    index.

    :param net_string: The network that should be optimized, given as a json string.
    :param device_graph: The device graph that the network should be optimized for.
    :return: The network with the best found placement applied, as an indented
        JSON string. The same string is written to 'sa_solution.json' in the
        log dir.
    """
    n_devices = len(device_graph.devices)
    groups = self.create_colocation_groups(
        get_flattened_layer_names(net_string))

    with open(os.path.join(get_log_dir(), 'time_history.csv'), 'w') as f:
        f.write('generation, time\n')

    if n_devices == 1 or len(groups) == 0:
        # dual_annealing requires lower < upper for every bound and at least
        # one dimension. With a single device (bounds (0, 0)) or no groups
        # there is exactly one possible placement, so use it directly instead
        # of letting scipy fail.
        placement = [0] * len(groups)
    else:
        # Number of objective evaluations so far; shared by both closures.
        i = 0

        with tqdm(total=self.steps, disable=not self.verbose) as t:

            def eval_function(x):
                """Score of the placement obtained by rounding `x`."""
                nonlocal i
                t.update(1)
                new_placement = [int(round(g)) for g in x]
                score = self.evaluate_placement(
                    apply_placement(net_string, new_placement, groups),
                    device_graph)
                i += 1
                return score

            def callback(x, score, context):
                """Invoked by scipy for every new minimum; logs and records it."""
                if self.verbose:
                    log(f'[{i + 1}/{self.steps}] Found new minimum: {score:.2f}ms'
                        )
                with open(os.path.join(get_log_dir(), 'time_history.csv'),
                          'a') as f:
                    f.write(f'{i + 1}, {score}\n')

            result = scipy.optimize.dual_annealing(
                eval_function, [(0, n_devices - 1)] * len(groups),
                no_local_search=True,
                maxfun=self.steps,
                callback=callback)

        placement = [int(round(g)) for g in result.x]

    if self.verbose:
        log(f'Best found placement: {placement}')

    solution = json.dumps(apply_placement(net_string, placement, groups),
                          indent=4)

    with open(os.path.join(get_log_dir(), 'sa_solution.json'), 'w') as f:
        f.write(solution)

    return solution
def optimize(self, net_string, device_graph):
    """
    Searches for a device placement with a particle swarm: every particle holds
    a position that is truncated and clamped into a placement for evaluation.

    :param net_string: The network that should be optimized, given as a json string.
    :param device_graph: The device graph that the network should be optimized for.
    :return: The network with the selected placement applied, as a JSON string.
    """
    n_devices = len(device_graph.devices)
    groups = self.create_colocation_groups(
        get_flattened_layer_names(net_string))
    highest_device = n_devices - 1

    def position_to_placement(position):
        """Truncate each coordinate to int and clamp it into [0, n_devices - 1]."""
        return [
            min(max(int(coordinate), 0), highest_device)
            for coordinate in position
        ]

    def evaluate(position):
        """Score of the placement that `position` maps to."""
        placed_net = apply_placement(net_string,
                                     position_to_placement(position), groups)
        return self.evaluate_placement(placed_net, device_graph)

    def initialize_swarm():
        """Create `self.swarm_size` particles at random placements, evaluated once."""
        particles = []
        for _ in range(self.swarm_size):
            start = generate_random_placement(len(groups), n_devices)
            # NOTE(review): the velocity has a single component while the
            # position has one per group - confirm against Particle whether
            # this is intended.
            speed = [random.random() * n_devices * 2 - n_devices]
            newcomer = Particle(start, speed)
            newcomer.evaluate(evaluate)
            particles.append(newcomer)
        return particles

    def find_global_best(swarm):
        """Position of the particle with the largest `best_score`."""
        # NOTE(review): this takes the maximum of best_score and returns the
        # particle's current position - verify against Particle that a larger
        # best_score is better and that `position` (not its best position) is
        # what is wanted here.
        leader = max(swarm, key=lambda member: member.best_score)
        return leader.position

    swarm = initialize_swarm()
    global_best_position = find_global_best(swarm)

    for _ in tqdm(range(self.steps)):
        for particle in swarm:
            particle.update_velocity(self.w, self.l1, self.l2,
                                     global_best_position)
            particle.update_position()
            particle.evaluate(evaluate)
        # The leader is re-selected once per step, after all particles moved.
        global_best_position = find_global_best(swarm)

    final_placement = position_to_placement(global_best_position)
    return json.dumps(apply_placement(net_string, final_placement, groups))
def benchmark(individual, benchmarking_function):
    """
    Scores `individual` with `benchmarking_function` instead of the simulator.

    Relies on `net_string`, `groups` and `create_description` from the
    enclosing scope.

    :param individual: Placement to benchmark (one device index per group).
    :param benchmarking_function: Callable taking a device assignment and
        `return_memory_overflow=True`, returning `(time, memory_overflow)`.
    :return: Tuple `(1 / penalised_time, description, individual)`.
    """
    placed_net = apply_placement(net_string, individual, groups)
    measured_time, overflow = benchmarking_function(
        get_device_assignment(placed_net), return_memory_overflow=True)
    description = create_description(individual)

    # Time is set to -1 if memory overflows - but we check with memory_overflow instead
    measured_time = max(measured_time, 0)

    # An overflow reported as -1 is counted as an overflow of 1.
    if overflow == -1:
        overflow = 1

    # Any overflow adds a penalty so large that the inverse score is tiny.
    if overflow > 0:
        measured_time += overflow * 10**9 * 1

    return 1 / measured_time, description, individual
def create_description(individual):
    """
    Computes the three-part archive index of `individual`.

    Relies on `self`, `device_graph`, `net_string` and `groups` from the
    enclosing scope.

    :param individual: Placement (one device index per group).
    :return: Tuple `(device_mode, used_devices, num_jumps)`, each scaled by the
        matching entry of `self.dimension_sizes` and rounded.
    """
    # Dimension 0: the most frequently used device.
    most_used_device = Counter(individual).most_common(1)[0][0]
    n_devices = len(device_graph.devices)
    device_mode = round(
        (most_used_device / n_devices) * self.dimension_sizes[0])

    # Dimension 1: how many distinct devices are used (minus one).
    extra_devices = len(set(individual)) - 1
    used_devices = round(
        (extra_devices / n_devices) * self.dimension_sizes[1])

    # Dimension 2: number of jumps in the placed graph relative to the maximum
    # reported by the computation graph.
    placed_net = apply_placement(net_string, individual, groups)
    comp_graph = ComputationGraph()
    comp_graph.load_from_string(json.dumps(placed_net))
    jumps, jump_limit = comp_graph.get_number_of_jumps(return_max_jumps=True)
    num_jumps = round((jumps / jump_limit) * (self.dimension_sizes[2] - 1))

    return (device_mode, used_devices, num_jumps)
def _evaluate(individual,
              net_string,
              groups,
              device_graph,
              pipeline_batches=1,
              batches=1,
              simulator_comp_penalty=1,
              simulator_comm_penalty=1,
              device_memory_utilization=1):
    """
    Scores one placement with `evaluate_placement`.

    :param individual: Pair `(description, placement)`; the description is
        passed through untouched.
    :param net_string: The network, given as a json string.
    :param groups: Colocation groups the placement refers to.
    :param device_graph: The device graph to evaluate on.
    :param pipeline_batches: Forwarded to `evaluate_placement`.
    :param batches: Forwarded to `evaluate_placement`.
    :param simulator_comp_penalty: Forwarded as `comp_penalty`.
    :param simulator_comm_penalty: Forwarded as `comm_penalty`.
    :param device_memory_utilization: Forwarded to `evaluate_placement`.
    :return: Tuple `(score, description, placement)` where the score is the
        reciprocal of the value returned by `evaluate_placement`.
    """
    description, placement = individual
    placed_net = apply_placement(net_string, placement, groups)

    evaluated = evaluate_placement(
        placed_net,
        device_graph,
        pipeline_batches=pipeline_batches,
        batches=batches,
        comp_penalty=simulator_comp_penalty,
        comm_penalty=simulator_comm_penalty,
        device_memory_utilization=device_memory_utilization)

    return 1 / evaluated, description, placement
def evaluate(position):
    """
    Returns the score of the placement that `position` maps to.

    Relies on `self`, `position_to_placement`, `net_string`, `groups` and
    `device_graph` from the enclosing scope.
    """
    placed_net = apply_placement(net_string, position_to_placement(position),
                                 groups)
    return self.evaluate_placement(placed_net, device_graph)
def optimize(self, net_string, device_graph, return_full_archive=False):
    """
    Searches for a device placement with an archive-based evolutionary search.

    The archive is a 3-dimensional grid (sizes from `self.dimension_sizes`)
    indexed by the description of an individual (most used device, number of
    used devices, number of jumps). Each cell keeps the best-scoring individual
    seen for that description, where the score is the reciprocal of the run
    time. After the simulator-driven search the archive can optionally be
    re-evaluated and refined with `self.benchmarking_function`.

    Entries of `self.dimension_sizes` that are -1 are replaced in place. Logs,
    plots and the final solution are written below the log dir.

    :param net_string: The network that should be optimized, given as a json string.
    :param device_graph: The device graph that the network should be optimized for.
    :param return_full_archive: If true, return the archive instead of the
        best solution.
    :return: `(archive_scores, archive_individuals)` if `return_full_archive`
        is set, otherwise the network with the best placement applied, as an
        indented JSON string (also written to 'me_solution.json').
    """
    if self.n_threads > 1:
        self.worker_pool = Pool(self.n_threads)

    n_devices = len(device_graph.devices)
    groups = self.create_colocation_groups(
        get_flattened_layer_names(net_string))

    def compute_max_jumps():
        """Maximum number of jumps reported for the unplaced network."""
        graph = ComputationGraph()
        graph.load_from_string(net_string)
        _, jumps = graph.get_number_of_jumps(return_max_jumps=True)
        return jumps

    # A size of -1 means "derive the size from the problem".
    if self.dimension_sizes[0] == -1:
        self.dimension_sizes[0] = n_devices
    if self.dimension_sizes[1] == -1:
        self.dimension_sizes[1] = n_devices
    if self.dimension_sizes[2] == -1:
        self.dimension_sizes[2] = compute_max_jumps()

    # NaN marks an empty archive cell. (np.nan rather than the np.NaN alias,
    # which was removed in NumPy 2.0.)
    archive_scores = np.empty(self.dimension_sizes)
    archive_scores[:] = np.nan
    archive_individuals = np.zeros(list(self.dimension_sizes) +
                                   [len(groups)],
                                   dtype=int)

    def evaluate(individual):
        """Score `individual` with the simulator in the current process."""
        # _evaluate expects a (description, individual) pair as its first
        # argument and has no dimension_sizes parameter; this mirrors the
        # argument tuple built for the worker pool in run_optimization.
        return _evaluate((create_description(individual), individual),
                         net_string, groups, device_graph,
                         self.pipeline_batches, self.batches,
                         self.simulator_comp_penalty,
                         self.simulator_comm_penalty,
                         self.device_memory_utilization)

    def mutate(individual):
        """Return a mutated copy of `individual` using one of three operators."""
        new_individual = []
        if random.random() < self.replace_mutation_rate:
            # Replace every occurrence of one used device by another used one.
            devices_present = list(set(individual))
            i1 = random.choice(devices_present)
            i2 = random.choice(devices_present)
            new_individual = [i2 if i == i1 else i for i in individual]
        elif random.random() < self.zone_mutation_rate:
            # Put a contiguous zone of geometrically distributed length on
            # one device.
            split1 = random.randint(0, len(individual) - 1)
            split2 = split1 + min(np.random.geometric(0.2),
                                  len(individual) - split1)
            dev = random.randint(0 if self.allow_cpu else 1, n_devices - 1)
            new_individual = individual[:split1] + [dev] * (
                split2 - split1) + individual[split2:]
        else:
            # Per-gene mutation: copy the left neighbour, pick a random
            # device, or keep the gene.
            for i, gene in enumerate(individual):
                if random.random() < self.copy_mutation_rate and i > 0:
                    new_individual.append(individual[i - 1])
                elif random.random() < self.mutation_rate:
                    if self.allow_cpu:
                        new_individual.append(
                            random.randint(0, n_devices - 1))
                    else:
                        new_individual.append(
                            random.randint(1, n_devices - 1))
                else:
                    new_individual.append(gene)
        return new_individual

    def crossover(parent1, parent2):
        """Single-point crossover of two parents."""
        crossover_point = random.randint(1, len(parent1) - 1)
        return parent1[:crossover_point] + parent2[crossover_point:]

    def create_candidates(n,
                          create_random=False,
                          create_trivial=False,
                          selectable_candidates=None):
        """
        Create candidate individuals: trivial single-device placements, random
        placements, or offspring of `selectable_candidates` / archive elites.
        """
        if n <= 0:
            return []
        candidates = []
        if create_trivial:
            candidates.extend([[i] * len(groups)
                               for i in range(1, n_devices)])
            n -= n_devices - 1
            if self.allow_cpu:
                candidates.append([0] * len(groups))
                n -= 1
        if create_random:
            while len(candidates) < n:
                candidates.append(
                    generate_random_placement(
                        len(groups),
                        n_devices,
                        allow_device_0=self.allow_cpu))
        else:
            selectable_indices = np.argwhere(np.isfinite(archive_scores))
            while len(candidates) < n:
                # One parent, or two if crossover is drawn.
                c = []
                if selectable_candidates:
                    for _ in range(1 + int(
                            random.random() < self.crossover_rate)):
                        c.append(random.choice(selectable_candidates))
                else:
                    if self.selection == 'random':
                        for _ in range(1 + int(
                                random.random() < self.crossover_rate)):
                            idx = random.choice(selectable_indices)
                            c.append(archive_individuals[idx[0], idx[1],
                                                         idx[2], :].tolist())
                    elif self.selection == 'tournament':
                        idx = []
                        t = min(self.tournament_size,
                                len(selectable_indices))
                        while len(idx) < 1 + int(
                                random.random() < self.crossover_rate):
                            competitors = random.sample(
                                selectable_indices.tolist(), t)
                            winner = max(competitors,
                                         key=lambda x: archive_scores[x[
                                             0], x[1], x[2]])
                            idx.append(winner)
                        for i in idx:
                            c.append(archive_individuals[i[0], i[1],
                                                         i[2], :].tolist())
                if len(c) == 2:
                    candidate = crossover(*c)
                else:
                    candidate = c[0]
                candidate = mutate(candidate)
                candidates.append(candidate)
        return candidates

    def create_description(individual):
        """Archive index (device_mode, used_devices, num_jumps) of `individual`."""
        c = Counter(individual)
        device_mode = c.most_common(1)[0][0]
        device_mode = round((device_mode / len(device_graph.devices)) *
                            self.dimension_sizes[0])
        used_devices = round(
            ((len(set(individual)) - 1) / (len(device_graph.devices))) *
            self.dimension_sizes[1])
        comp_graph_dict = apply_placement(net_string, individual, groups)
        comp_graph = ComputationGraph()
        comp_graph.load_from_string(json.dumps(comp_graph_dict))
        num_jumps, max_jumps = comp_graph.get_number_of_jumps(
            return_max_jumps=True)
        num_jumps = round(
            (num_jumps / max_jumps) * (self.dimension_sizes[2] - 1))
        return (device_mode, used_devices, num_jumps)

    def benchmark(individual, benchmarking_function):
        """Score `individual` with `benchmarking_function` instead of the simulator."""
        device_assignment = get_device_assignment(
            apply_placement(net_string, individual, groups))
        time, memory_overflow = benchmarking_function(
            device_assignment, return_memory_overflow=True)
        description = create_description(individual)
        # Time is set to -1 if memory overflows - but we check with memory_overflow instead
        time = max(time, 0)
        if memory_overflow == -1:
            memory_overflow = 1
        if memory_overflow > 0:
            time += memory_overflow * 10**9 * 1
        return 1 / time, description, individual

    def reevaluate_archive(benchmarking_function=None,
                           n_keep=None,
                           time_threshold=None):
        """
        Re-score archive entries in place, optionally keeping only those within
        `time_threshold` and/or the `n_keep` best. All other cells are emptied.
        """
        indices = list(np.argwhere(np.isfinite(archive_scores)))
        if time_threshold:
            indices = [
                i for i in indices
                if archive_scores[i[0], i[1], i[2]] >= 1 / time_threshold
            ]
        if n_keep:
            indices = sorted(
                indices, key=lambda i: -archive_scores[i[0], i[1], i[2]])
            indices = indices[:n_keep]
        assert len(
            indices), 'No solutions fulfill the specified requirements'
        archive_scores[:] = np.nan
        if self.verbose:
            if n_keep:
                log(f'Reevaluating {n_keep} best individuals in archive (and throwing away the rest)'
                    )
            else:
                log('Reevaluating all individuals in archive')
            if time_threshold:
                log(f'Time threshold: {time_threshold}ms')
        for i in tqdm(indices, disable=not self.verbose):
            individual = archive_individuals[i[0], i[1], i[2], :].tolist()
            if benchmarking_function:
                archive_scores[i[0], i[1],
                               i[2]] = benchmark(individual,
                                                 benchmarking_function)[0]
            else:
                archive_scores[i[0], i[1], i[2]] = evaluate(individual)[0]

    def log_archive(file_name):
        """Write all filled archive cells, best first, to archive_logs/<file_name>."""
        indices = list(np.argwhere(np.isfinite(archive_scores)))
        indices = sorted(indices,
                         key=lambda i: -archive_scores[i[0], i[1], i[2]])
        with open(os.path.join(get_log_dir(), 'archive_logs', file_name),
                  'w') as f:
            f.write('niche; time; placement\n')
            for i in indices:
                niche = tuple(i)
                time = 1 / archive_scores[i[0], i[1], i[2]]
                placement = archive_individuals[i[0], i[1], i[2]].tolist()
                f.write(f'{niche}; {time}; {placement}\n')

    def run_optimization(steps, benchmarking_function=None, start_step=0):
        """
        Run the search loop for `steps` evaluations, scoring with
        `benchmarking_function` if given and with the simulator otherwise.
        `start_step` only offsets the step numbers written to the logs.
        """
        nonlocal archive_individuals, archive_scores
        if self.verbose:
            if benchmarking_function:
                log('Optimizing with benchmarking...')
            else:
                log('Optimizing with simulator...')
        # Benchmarking evaluates one candidate per iteration; the simulator
        # evaluates one per thread.
        step_size = 1 if benchmarking_function else self.n_threads
        for i in tqdm(range(0, steps, step_size),
                      disable=not self.verbose):
            # Number of fresh (non-archive) candidates still owed to the
            # initial population.
            init_number = min(max(0, self.initial_size - i), self.n_threads)
            if self.include_trivial_solutions and i == 0:
                candidates = create_candidates(init_number,
                                               create_trivial=True,
                                               create_random=True)
            else:
                candidates = create_candidates(init_number,
                                               create_random=True)
            if init_number > 0:
                candidates += create_candidates(
                    self.n_threads - init_number,
                    selectable_candidates=candidates[:])
            else:
                candidates += create_candidates(self.n_threads -
                                                init_number)

            if benchmarking_function:
                eval_results = [
                    benchmark(candidates[0], benchmarking_function)
                ]
            elif self.n_threads == 1:
                eval_results = [evaluate(candidates[0])]
            else:
                fn_args = zip(
                    ((create_description(c), c) for c in candidates),
                    repeat(net_string), repeat(groups),
                    repeat(device_graph), repeat(self.pipeline_batches),
                    repeat(self.batches),
                    repeat(self.simulator_comp_penalty),
                    repeat(self.simulator_comm_penalty),
                    repeat(self.device_memory_utilization))
                eval_results = self.worker_pool.starmap(_evaluate, fn_args)

            # Insert each result if its cell is empty or it beats the elite.
            for result in eval_results:
                score, description, individual = result
                previous_elite_score = archive_scores[description[0],
                                                      description[1],
                                                      description[2]]
                if np.isnan(previous_elite_score
                            ) or previous_elite_score < score:
                    archive_scores[description[0], description[1],
                                   description[2]] = score
                    archive_individuals[description[0], description[1],
                                        description[2], :] = individual

            if self.verbose and (i + 1) % self.verbose < step_size:
                best_time = 1 / np.nanmax(archive_scores)
                log(f'[{i + 1}/{steps}] Best time: {best_time:.4f}ms')

            if self.score_save_period and (i % self.score_save_period == 0
                                           or steps - i < step_size):
                best_time = 1 / np.nanmax(archive_scores)
                with open(os.path.join(get_log_dir(), 'time_history.csv'),
                          'a') as f:
                    f.write(f'{i + start_step + 1}, {best_time}\n')

            if self.archive_log_period and (
                    i + 1) % self.archive_log_period < step_size:
                log_archive(f'step_{i + start_step + 1:06}.csv')

    if self.score_save_period:
        with open(os.path.join(get_log_dir(), 'time_history.csv'),
                  'w') as f:
            f.write('step, time\n')

    run_optimization(self.steps)

    # The pool attribute is only assigned above when n_threads > 1, so do not
    # assume it exists.
    worker_pool = getattr(self, 'worker_pool', None)
    if worker_pool:
        worker_pool.close()

    if self.archive_log_period is not None:
        log_archive('1_simulation_finished.csv')

    if self.benchmarking_steps > 0 or self.benchmark_before_selection:
        reevaluate_archive(self.benchmarking_function,
                           n_keep=self.benchmarking_n_keep,
                           time_threshold=self.benchmarking_time_threshold)
        if self.archive_log_period is not None:
            log_archive('2_reevaluated.csv')

    if self.benchmarking_steps > 0:
        run_optimization(self.benchmarking_steps,
                         self.benchmarking_function, self.steps)
        log_archive('3_benchmarking_finished.csv')

    if self.show_score_plot:
        if self.verbose:
            log('Plotting archive scores...', end='')
        plot_map_elites_archive(archive_scores,
                                n_devices,
                                compute_max_jumps(),
                                self.plot_axes,
                                save_path=os.path.join(
                                    get_log_dir(), 'archive_plot.pdf'))
        if self.verbose:
            log('Done')

    if self.plot_animation:
        if not self.archive_log_period:
            # Without periodic archive logs there is nothing to animate;
            # skip regardless of verbosity and only report it when verbose.
            if self.verbose:
                log('self.plot_animation was set to True, but archive logging was not enabled. '
                    'Skipping animation plot.')
        else:
            if self.verbose:
                log('Plotting archive animation...', end='')
            paths = glob(
                os.path.join(get_log_dir(), 'archive_logs', 'step_*.csv'))
            # Computed here so it is defined even if no earlier branch
            # happened to compute it.
            plot_archive_animation(
                paths,
                (os.path.join(get_log_dir(), 'archive_animation.mp4'),
                 os.path.join(get_log_dir(), 'archive_animation.gif')),
                self.dimension_sizes,
                n_devices=n_devices,
                max_jumps=compute_max_jumps(),
                axes=self.plot_axes,
                fps=self.animation_fps)
            if self.verbose:
                log('Done')

    if return_full_archive:
        return archive_scores, archive_individuals

    # nanargmax returns an index into the flattened score grid, which lines up
    # with the individuals once they are reshaped to one row per cell.
    best_index = np.nanargmax(archive_scores)
    best_individual = archive_individuals.reshape(
        (-1, len(groups)))[best_index]

    if self.verbose:
        log(f'Best individual: {best_individual.tolist()}')

    solution = json.dumps(apply_placement(net_string,
                                          best_individual.tolist(), groups),
                          indent=4)

    with open(os.path.join(get_log_dir(), 'me_solution.json'), 'w') as f:
        f.write(solution)

    return solution