def initialize_dask():
    """
    Make sure a Dask client is available and record its worker count.

    The current default client is reused when ``default_client()`` returns
    one. When it raises ``ValueError`` instead, the "not initialized" message
    is emitted through ``ErrorMessage`` and a new ``Client`` is started with
    ``CpuCount.get()`` workers. Each worker's memory limit is the value of
    ``Memory.get()`` divided evenly among the workers, or ``"auto"`` when that
    value is falsy.

    In both cases the number of entries returned by ``client.ncores()`` is
    stored through ``NPartitions._put``.
    """
    from distributed.client import default_client

    try:
        active_client = default_client()
    except ValueError:
        from distributed import Client

        # The literal below is the setup snippet handed to the message helper.
        # Its internal whitespace is part of the emitted text, so it must be
        # left exactly as written.
        ErrorMessage.not_initialized(
            "Dask",
            """ from distributed import Client client = Client() """,
        )
        cpu_total = CpuCount.get()
        total_memory = Memory.get()
        if total_memory:
            per_worker_memory = total_memory // cpu_total
        else:
            per_worker_memory = "auto"
        active_client = Client(n_workers=cpu_total, memory_limit=per_worker_memory)

    NPartitions._put(len(active_client.ncores()))
def initialize_dask():
    """
    Make sure a Dask client is available and record its worker count.

    The client returned by ``get_client()`` is reused when there is one. When
    ``get_client()`` raises ``ValueError``, the "not initialized" message is
    emitted through ``ErrorMessage`` and a new ``Client`` is started with
    ``CpuCount.get()`` workers.

    The number of entries returned by ``client.ncores()`` is then passed to
    ``NPartitions.put_if_default``.
    """
    from distributed.client import get_client

    try:
        active_client = get_client()
    except ValueError:
        from distributed import Client

        # The literal below is the setup snippet handed to the message helper.
        # Its internal whitespace is part of the emitted text, so it must be
        # left exactly as written.
        ErrorMessage.not_initialized(
            "Dask",
            """ from distributed import Client client = Client() """,
        )
        active_client = Client(n_workers=CpuCount.get())

    worker_total = len(active_client.ncores())
    NPartitions.put_if_default(worker_total)
ignore_reinit_error=True, ) num_cpus = ray.global_state.cluster_resources()["CPU"] except AssertionError: pass elif execution_engine == "Dask": from distributed.client import _get_global_client if threading.current_thread().name == "MainThread": # initialize the dask client client = _get_global_client() if client is None: from distributed import Client client = Client() num_cpus = sum(client.ncores().values()) elif execution_engine != "Python": raise ImportError( "Unrecognized execution engine: {}.".format(execution_engine)) DEFAULT_NPARTITIONS = max(4, int(num_cpus)) __all__ = [ "DataFrame", "Series", "read_csv", "read_parquet", "read_json", "read_html", "read_clipboard", "read_excel",
class DaskHelper:
    """
    Manages a Dask client and provides associated helper functions.

    The helper either wraps an existing ``Client`` or starts its own, keeps the
    list of futures it has submitted, and tracks which worker addresses have
    already received the broadcast (shared) data.
    """

    def __init__(self, n_workers=1, client=None):
        """
        Parameters
        ----------
        n_workers : int
            Number of workers for the client created here. Ignored when a
            valid ``client`` is passed, in which case the count is read from
            that client.
        client : Client, optional
            Existing Dask client to reuse.
        """
        if client is not None and isinstance(client, Client):
            self.client = client
            self.n_workers = self._get_n_workers()
        else:
            self.n_workers = n_workers
            self.client = Client(
                n_workers=self.n_workers,
                processes=True,
                threads_per_worker=1,
                scheduler_port=0,
            )
        self.futures = []
        self.shared_data = None
        # `update_worker_list()` stores the list itself and returns None, so its
        # return value must not be assigned to `self.worker_list`.
        self.worker_list = None
        self.update_worker_list()
        # adds only this many workers at a time when new workers available
        self._add_worker_batch_lim = 20

    def _get_n_workers(self):
        """Return the number of entries reported by ``client.ncores()``."""
        return len(self.client.ncores())

    def update_n_workers(self):
        """Refresh ``self.n_workers`` from the client."""
        self.n_workers = self._get_n_workers()

    def _get_worker_list(self):
        """Return the keys of ``client.ncores()`` as a list."""
        return list(self.client.ncores().keys())

    def update_worker_list(self):
        """Refresh ``self.worker_list`` from the client (returns ``None``)."""
        self.worker_list = self._get_worker_list()

    def __getstate__(self):
        """
        Allows the object to be picklable while having the Dask client as an
        attribute: the pickled state carries ``client=None``.
        """
        state = dict(self.__dict__)
        state["client"] = None  # hack to allow Dask client to be a class attribute
        return state

    def __del__(self):
        """
        Ensures a clean kill of the Dask client and frees up a port.
        """
        # The check has to look at `self.client`; testing `self` itself can
        # never succeed and would leave the client open.
        if hasattr(self, "client") and isinstance(self.client, Client):
            self.client.close()

    def submit_job(self, func, x, workers):
        """
        Submit ``func(x)`` (or ``func(x, shared_data)`` once data has been
        shared) to the given ``workers`` and remember the returned future.
        """
        if self.shared_data is not None:
            future = self.client.submit(func, x, self.shared_data, workers=workers)
        else:
            future = self.client.submit(func, x, workers=workers)
        self.futures.append(future)

    def _check_a_worker(self, worker_metrics):
        """
        Checks if a worker can take a new job or not.

        * A worker that is executing is not free.
        * A worker that is not executing and has nothing ready is free.
        * A worker that has at least one job in memory and at least one job
          ready is not free.
        * Otherwise it is considered free.

        ``worker_metrics`` is a mapping with the keys ``"executing"``,
        ``"ready"`` and ``"in_memory"``.
        """
        if worker_metrics["executing"]:
            return False
        # "ready" is looked up first so that "in_memory" is only read when
        # there actually is something ready.
        return not (worker_metrics["ready"] and worker_metrics["in_memory"])

    def is_worker_available(self) -> Union[List, bool]:
        """
        Checks if at least one worker is available to run a job.

        Returns
        -------
        bool or list
            ``True`` in the synchronous case (``n_workers == 1``); an empty
            list when this process already has as many pending futures as
            there are workers; otherwise a shuffled list of the worker
            addresses considered free.
        """
        if self.n_workers == 1:
            # in the synchronous case, one worker is always available
            return True
        self.update_n_workers()
        # `self.futures` contains only the futures submitted by the process that
        # instantiated this class, which is adequate when the workers are
        # exclusively available to this process
        if len(self.futures) >= self.n_workers:
            # pause/wait if active job count reached the allocated workers
            return []
        # Different processes can share the same pool of workers, so for a
        # better estimate the per-worker metrics held by the client are used
        # when they are present.
        identity = getattr(self.client, "_scheduler_identity", None)
        if identity is not None and "workers" in identity:
            workers = identity["workers"]
            self.worker_list = list(workers.keys())
            available = [
                address for address in self.worker_list
                if self._check_a_worker(workers[address]["metrics"])
            ]
        else:
            available = list(self.client.ncores().keys())
        np.random.shuffle(available)
        return available

    def fetch_futures(self, retries=1, wait_time=0.05):
        """
        Removes the futures which are done from the list.

        Loops up to ``retries`` times, sleeping ``wait_time`` seconds after
        each pass. With more than one worker the loop stops early as soon as
        no future is left.
        """
        counter = 0
        while counter < retries:
            if self.n_workers > 1:
                self.futures = [future for future in self.futures if not future.done()]
                if len(self.futures) == 0:
                    break
            else:
                # Dask not invoked in the synchronous case (n_workers=1)
                self.futures = []
            time.sleep(wait_time)
            counter += 1
        return None

    def distribute_data_to_workers(self, data):
        """
        Shares data across all workers to not serialize and transfer it with
        every job.

        The data is broadcast when nothing has been shared yet, or when the
        client reports worker addresses that are not in ``self.worker_list``.
        """
        never_shared = self.shared_data is None or self.worker_list is None
        known_workers = set(self.worker_list or ())
        new_workers = set(self._get_worker_list()) - known_workers
        if never_shared or new_workers:
            # (re)distribute only on first use or when new workers were added
            self.shared_data = self.client.scatter(data, broadcast=True)
            self.update_worker_list()

    def is_worker_alive(self):
        """Return True while this helper still tracks at least one future."""
        return len(self.futures) > 0
class DEHB(DEHBBase):
    """
    DEHB optimizer that evaluates configurations either synchronously in this
    process or asynchronously on Dask workers.

    One ``AsyncDE`` sub-population is kept per budget; Successive Halving
    brackets (``SHBracketManager``) decide which budget the next job runs on.
    """

    def __init__(self, cs=None, f=None, dimensions=None, mutation_factor=0.5,
                 crossover_prob=0.5, strategy='rand1_bin', min_budget=None,
                 save_smac=False, max_budget=None, eta=3, min_clip=None,
                 max_clip=None, configspace=True, boundary_fix_type='random',
                 max_age=np.inf, n_workers=None, client=None, **kwargs):
        """
        All arguments except ``save_smac``, ``n_workers`` and ``client`` are
        forwarded unchanged to ``DEHBBase.__init__``.

        Parameters
        ----------
        save_smac : bool
            When True, ``_save_history`` also writes the SMAC-style run history
            and trajectory files.
        n_workers : int, optional
            Number of workers. A value greater than 1 starts a new Dask
            ``Client``; otherwise jobs run synchronously without Dask.
        client : Client, optional
            Existing Dask client to reuse; its worker count overrides
            ``n_workers``.

        Raises
        ------
        ValueError
            If no usable ``n_workers`` / ``client`` combination is given.
        """
        super().__init__(cs=cs, f=f, dimensions=dimensions,
                         mutation_factor=mutation_factor,
                         crossover_prob=crossover_prob, strategy=strategy,
                         min_budget=min_budget, max_budget=max_budget, eta=eta,
                         min_clip=min_clip, max_clip=max_clip,
                         configspace=configspace,
                         boundary_fix_type=boundary_fix_type, max_age=max_age,
                         **kwargs)
        self.iteration_counter = -1
        self.de = {}
        self._max_pop_size = None
        self.active_brackets = []  # list of SHBracketManager objects
        self.traj = []
        self.smac_trajectory = []
        self.runtime = []
        self.history = []
        self.smac_history = {"data": [], "config_origins": {}, "configs": {}}
        self.start = None
        self.save_smac = save_smac

        # Dask variables
        if n_workers is None and client is None:
            raise ValueError(
                "Need to specify either 'n_workers'(>0) or 'client' (a Dask client)!"
            )
        if client is not None and isinstance(client, Client):
            self.client = client
            self.n_workers = len(client.ncores())
        else:
            if n_workers is None:
                # `client` was given but is not a Dask Client; without this
                # check the comparison below would fail with a TypeError.
                raise ValueError(
                    "Need to specify either 'n_workers'(>0) or 'client' (a Dask client)!"
                )
            self.n_workers = n_workers
            if self.n_workers > 1:
                self.client = Client(
                    n_workers=self.n_workers, processes=True,
                    threads_per_worker=1, scheduler_port=0
                )  # port 0 makes Dask select a random free port
            else:
                self.client = None
        self.futures = []
        self.shared_data = None

        # Initializing DE subpopulations
        self._get_pop_sizes()
        self._init_subpop()

        # Misc.
        self.available_gpus = None
        self.gpu_usage = None
        self.single_node_with_gpus = None

    def __getstate__(self):
        """
        Allows the object to be picklable while having the Dask client and the
        logger as attributes: both are replaced by ``None`` in the state.
        """
        state = dict(self.__dict__)
        state["client"] = None  # hack to allow Dask client to be a class attribute
        state["logger"] = None  # hack to allow logger object to be a class attribute
        return state

    def __del__(self):
        """
        Ensures a clean kill of the Dask client and frees up a port.
        """
        # The check has to look at `self.client`; testing `self` itself can
        # never succeed and would leave the client open.
        if hasattr(self, "client") and isinstance(self.client, Client):
            self.client.close()

    def _f_objective(self, job_info):
        """
        Wrapper to call DE's objective function.

        ``job_info`` must carry ``config``, ``budget``, ``parent_id``,
        ``bracket_id`` and ``kwargs``; ``gpu_devices`` is optional. Returns a
        dict with the fitness, cost, the job identifiers and ``info`` (plus
        ``device_id`` when ``gpu_devices`` was present).
        """
        # check if job_info appended during job submission includes "gpu_devices"
        if "gpu_devices" in job_info and self.single_node_with_gpus:
            # set the environment variable for the process running this job,
            # reprioritising the CUDA device order for it
            os.environ.update({"CUDA_VISIBLE_DEVICES": job_info["gpu_devices"]})
        config = job_info['config']
        budget = job_info['budget']
        parent_id = job_info['parent_id']
        bracket_id = job_info['bracket_id']
        kwargs = job_info["kwargs"]
        res = self.de[budget].f_objective(config, budget, **kwargs)
        info = res["info"] if "info" in res else dict()
        run_info = {
            'fitness': res["fitness"],
            'cost': res["cost"],
            'config': config,
            'budget': budget,
            'parent_id': parent_id,
            'bracket_id': bracket_id,
            'info': info
        }
        if "gpu_devices" in job_info:
            # important for GPU usage tracking if single_node_with_gpus=True
            device_id = int(job_info["gpu_devices"].strip().split(",")[0])
            run_info.update({"device_id": device_id})
        return run_info

    def _create_cuda_visible_devices(self, available_gpus: List[int], start_id: int) -> str:
        """
        Generates a string to set the CUDA_VISIBLE_DEVICES environment variable.

        The string lists ``start_id`` first, followed by the remaining IDs of
        ``available_gpus`` in random order, comma separated. The caller's list
        is not modified.
        """
        assert start_id in available_gpus
        remaining = deepcopy(available_gpus)
        remaining.remove(start_id)
        np.random.shuffle(remaining)
        return ",".join([str(start_id)] + [str(_id) for _id in remaining])

    def distribute_gpus(self):
        """
        Creates the GPU usage tracker dict.

        The device IDs are read from the ``CUDA_VISIBLE_DEVICES`` environment
        variable; every ID starts with a usage count of 0. During job
        submission the count of the chosen device is incremented, and it is
        decremented when the job's result is collected. When the variable is
        not set a message is printed and no GPU is tracked.
        """
        try:
            available_gpus = os.environ["CUDA_VISIBLE_DEVICES"]
            available_gpus = available_gpus.strip().split(",")
            self.available_gpus = [int(_id) for _id in available_gpus]
        except KeyError as e:
            print("Unable to find valid GPU devices. "
                  "Environment variable {} not visible!".format(str(e)))
            self.available_gpus = []
        self.gpu_usage = dict()
        for _id in self.available_gpus:
            self.gpu_usage[_id] = 0

    def vector_to_configspace(self, config):
        """Delegate to the ``vector_to_configspace`` of the first budget's DE object."""
        assert hasattr(self, "de")
        assert len(self.budgets) > 0
        return self.de[self.budgets[0]].vector_to_configspace(config)

    def configspace_to_vector(self, config):
        """Delegate to the ``configspace_to_vector`` of the first budget's DE object."""
        assert hasattr(self, "de")
        assert len(self.budgets) > 0
        return self.de[self.budgets[0]].configspace_to_vector(config)

    def reset(self):
        """
        Reset the optimizer state: restart the Dask client when one is in use
        with more than one worker (otherwise drop it), clear all trackers and
        re-create the DE sub-populations.
        """
        super().reset()
        if self.n_workers > 1 and hasattr(self, "client") and isinstance(
                self.client, Client):
            self.client.restart()
        else:
            self.client = None
        self.futures = []
        self.shared_data = None
        self.iteration_counter = -1
        self.de = {}
        self._max_pop_size = None
        self.start = None
        self.active_brackets = []
        self.traj = []
        self.smac_trajectory = []
        self.runtime = []
        self.history = []
        self.smac_history = {"data": [], "config_origins": {}, "configs": {}}
        self._get_pop_sizes()
        self._init_subpop()
        self.available_gpus = None
        self.gpu_usage = None

    def init_population(self, pop_size):
        """
        Return ``pop_size`` initial individuals: configurations sampled from
        ``self.cs`` and converted to vectors when ``self.configspace`` is set,
        otherwise a uniform random array of shape ``(pop_size, dimensions)``.
        """
        if self.configspace:
            population = self.cs.sample_configuration(size=pop_size)
            population = [
                self.configspace_to_vector(individual) for individual in population
            ]
        else:
            population = np.random.uniform(low=0.0, high=1.0,
                                           size=(pop_size, self.dimensions))
        return population

    def clean_inactive_brackets(self):
        """
        Removes brackets from the active list once the Bracket Manager reports
        them as done.
        """
        if len(self.active_brackets) == 0:
            return
        # `not` is used instead of `~`: bitwise inversion of a Python bool gives
        # -1 or -2, both truthy, so finished brackets would never be dropped.
        self.active_brackets = [
            bracket for bracket in self.active_brackets
            if not bracket.is_bracket_done()
        ]
        return

    # Also generates SMAC like History
    def _update_trackers(self, traj, runtime, history):
        """
        Record one finished evaluation.

        Parameters
        ----------
        traj : tuple
            ``(incumbent score, incumbent config vector)``.
        runtime : float
            Value appended to ``self.runtime``.
        history : tuple
            ``(config, fitness, cost, budget, info)`` of the evaluation.
        """
        if len(self.traj) == self.inc_run:
            # the incumbent was set by this evaluation: add a trajectory entry
            incumbent = ConfigSpace.Configuration(
                self.cs, vector=traj[1]).get_dictionary()
            for key in list(incumbent.keys()):
                if key != "agent":
                    # drop the first two characters of every other key
                    incumbent[key[2:]] = incumbent.pop(key)
            traj_obj = {
                "cpu_time": time.time() - self.start,
                "wall_clocktime": time.time() - self.start,
                "evaluations": len(self.traj),
                "cost": traj[0],
                "incumbent": incumbent,
                "budget": 0,
                "origin": "DEHB"
            }
            self.smac_trajectory.append(traj_obj)
        self.smac_history["data"].append((
            [
                len(self.traj) + 1,  # config id
                None,  # instance id
                0,  # seed
                history[-2],  # budget
            ],
            [
                history[1],  # cost / fitness
                history[2],  # time / cost
                {  # status
                    "__enum__": "StatusType.SUCCESS"
                },
                time.time() - history[2],  # start time
                time.time(),  # end time
                history[-1]  # info
            ]))
        self.smac_history["config_origins"][str(len(self.traj) + 1)] = "DEHB"
        config = ConfigSpace.Configuration(self.cs,
                                           vector=history[0]).get_dictionary()
        for key in list(config.keys()):
            if key != "agent":
                config[key[2:]] = config.pop(key)
        self.smac_history["configs"][str(len(self.traj) + 1)] = config
        self.traj.append(traj[0])
        self.runtime.append(runtime)
        self.history.append(history)

    def _update_incumbents(self, config, score, run, info):
        """Store the new incumbent config, its score, run index and info."""
        self.inc_config = config
        self.inc_score = score
        self.inc_run = run
        self.inc_info = info

    def _get_pop_sizes(self):
        """Determines maximum pop size for each budget
        """
        self._max_pop_size = {}
        for i in range(self.max_SH_iter):
            n, r = self.get_next_iteration(i)
            for j, r_j in enumerate(r):
                self._max_pop_size[r_j] = max(
                    n[j], self._max_pop_size.get(r_j, n[j]))

    def _init_subpop(self):
        """ Creates one DE object per budget (fidelity), keyed by budget
        """
        self.de = {}
        for b, pop_size in self._max_pop_size.items():
            self.de[b] = AsyncDE(**self.de_params, budget=b, pop_size=pop_size)
            self.de[b].population = self.de[b].init_population(pop_size=pop_size)
            self.de[b].fitness = np.array([np.inf] * pop_size)
            # adding attributes to DE objects to allow communication across subpopulations
            self.de[b].parent_counter = 0
            self.de[b].promotion_pop = None
            self.de[b].promotion_fitness = None

    def _concat_pops(self, exclude_budget=None):
        """ Concatenates all subpopulations, optionally leaving one budget out
        """
        budgets = list(self.budgets)
        if exclude_budget is not None:
            budgets.remove(exclude_budget)
        pop = []
        for budget in budgets:
            pop.extend(self.de[budget].population.tolist())
        return np.array(pop)

    def _start_new_bracket(self):
        """ Starts a new bracket based on Hyperband
        """
        # iteration counter gives the bracket count or bracket ID
        self.iteration_counter += 1
        n_configs, budgets = self.get_next_iteration(self.iteration_counter)
        bracket = SHBracketManager(n_configs=n_configs, budgets=budgets,
                                   bracket_id=self.iteration_counter)
        self.active_brackets.append(bracket)
        return bracket

    def _get_worker_count(self):
        """Return the client's worker count, or 1 when no Dask client is used."""
        if isinstance(self.client, Client):
            return len(self.client.ncores())
        else:
            return 1

    def is_worker_available(self, verbose=False):
        """ Checks if at least one worker is available to run a job
        """
        if self.n_workers == 1 or self.client is None or not isinstance(
                self.client, Client):
            # in the synchronous case, one worker is always available
            return True
        # client.ncores() returns a dict keyed by worker address; its size is
        # taken as the number of workers. A worker counts as free while fewer
        # futures are pending than there are workers.
        return len(self.futures) < self._get_worker_count()

    def _get_promotion_candidate(self, low_budget, high_budget, n_configs):
        """
        Manages the population to be promoted from the lower to the higher
        budget and returns the next configuration to promote.

        This is in action only during the first full HB bracket, which is
        equivalent to the number of brackets <= max_SH_iter.
        """
        low = self.de[low_budget]
        high = self.de[high_budget]
        # finding the individuals that have been evaluated (fitness < np.inf)
        evaluated_configs = np.where(low.fitness != np.inf)[0]
        promotion_candidate_pop = low.population[evaluated_configs]
        promotion_candidate_fitness = low.fitness[evaluated_configs]
        # ordering the evaluated individuals based on their fitness values
        pop_idx = np.argsort(promotion_candidate_fitness)

        # creating population for promotion if none promoted yet or nothing to promote
        if high.promotion_pop is None or len(high.promotion_pop) == 0:
            high.promotion_pop = np.empty((0, self.dimensions))
            high.promotion_fitness = np.array([])

            # an evaluated individual from the lower budget is included only if
            # it is not already in the higher budget's population; this keeps
            # the population diverse and avoids redundant evaluations
            for idx in pop_idx:
                individual = promotion_candidate_pop[idx]
                if np.any(np.all(individual == high.population, axis=1)):
                    continue
                high.promotion_pop = np.append(
                    high.promotion_pop, [individual], axis=0)
                # keep the fitness array aligned with promotion_pop: append the
                # fitness of this individual to the fitness array itself
                high.promotion_fitness = np.append(
                    high.promotion_fitness, promotion_candidate_fitness[idx])
            # retaining only n_configs
            high.promotion_pop = high.promotion_pop[:n_configs]
            high.promotion_fitness = high.promotion_fitness[:n_configs]

        if len(high.promotion_pop) > 0:
            config = high.promotion_pop[0]
            # removing selected configuration from population
            high.promotion_pop = high.promotion_pop[1:]
            high.promotion_fitness = high.promotion_fitness[1:]
        else:
            # edge case where every candidate is already in the high budget
            # population: choose the best individual from the lower budget again
            config = low.population[pop_idx[0]]
        return config

    def _get_next_parent_for_subpop(self, budget):
        """ Maintains a looping counter over a subpopulation, to iteratively select a parent
        """
        parent_id = self.de[budget].parent_counter
        self.de[budget].parent_counter = \
            (parent_id + 1) % self._max_pop_size[budget]
        return parent_id

    def _acquire_config(self, bracket, budget):
        """ Generates/chooses a configuration based on the budget and iteration number

        Returns the tuple ``(config, parent_id)``.
        """
        # select a parent/target
        parent_id = self._get_next_parent_for_subpop(budget)
        target = self.de[budget].population[parent_id]
        # identify lower budget/fidelity to transfer information from
        lower_budget, num_configs = bracket.get_lower_budget_promotions(budget)

        if self.iteration_counter < self.max_SH_iter:
            # promotions occur only in the first set of SH brackets under Hyperband
            # for the first rung/budget in the current bracket, no promotion is
            # possible and evolution can begin straight away
            # for the subsequent rungs, individuals are promoted from the lower_budget
            if budget != bracket.budgets[0]:
                # TODO: check if generalizes to all budget spacings
                config = self._get_promotion_candidate(lower_budget, budget,
                                                       num_configs)
                return config, parent_id

        # a single DE evolution --- (mutation + crossover) occurs here
        mutation_pop_idx = np.argsort(self.de[lower_budget].fitness)[:num_configs]
        mutation_pop = self.de[lower_budget].population[mutation_pop_idx]
        # generate mutants from previous budget subpopulation or global population
        if len(mutation_pop) < self.de[budget]._min_pop_size:
            filler = self.de[budget]._min_pop_size - len(mutation_pop) + 1
            new_pop = self.de[budget]._init_mutant_population(
                pop_size=filler, population=self._concat_pops(),
                target=target, best=self.inc_config)
            mutation_pop = np.concatenate((mutation_pop, new_pop))
        # generate mutant from among individuals in mutation_pop
        mutant = self.de[budget].mutation(current=target, best=self.inc_config,
                                          alt_pop=mutation_pop)
        # perform crossover with selected parent
        config = self.de[budget].crossover(target=target, mutant=mutant)
        config = self.de[budget].boundary_check(config)
        return config, parent_id

    def _get_next_job(self):
        """ Loads a configuration and budget to be evaluated next by a free worker
        """
        bracket = None
        if len(self.active_brackets) == 0 or \
                np.all([b.is_bracket_done() for b in self.active_brackets]):
            # start new bracket when no pending jobs from existing brackets or empty bracket list
            bracket = self._start_new_bracket()
        else:
            for _bracket in self.active_brackets:
                # eligible when the bracket is not waiting for results of its
                # previous rung and still has pending jobs; these two checks
                # give DEHB a "synchronous" Successive Halving
                if not _bracket.previous_rung_waits() and _bracket.is_pending():
                    bracket = _bracket
                    break
            if bracket is None:
                # start new bracket when existing list has all waiting brackets
                bracket = self._start_new_bracket()
        # budget that the SH bracket allots
        budget = bracket.get_next_job_budget()
        config, parent_id = self._acquire_config(bracket, budget)
        job_info = {
            "config": config,
            "budget": budget,
            "parent_id": parent_id,
            "bracket_id": bracket.bracket_id
        }
        return job_info

    def _get_gpu_id_with_low_load(self):
        """
        Pick (at random among ties) a GPU with the lowest usage count, bump its
        count and return the matching CUDA_VISIBLE_DEVICES string.
        """
        lowest = min(self.gpu_usage.values())
        candidates = [k for k, v in self.gpu_usage.items() if v == lowest]
        device_id = np.random.choice(candidates)
        # creating string for setting environment variable CUDA_VISIBLE_DEVICES
        gpu_ids = self._create_cuda_visible_devices(self.available_gpus, device_id)
        # updating GPU usage
        self.gpu_usage[device_id] += 1
        self.logger.debug("GPU device selected: {}".format(device_id))
        self.logger.debug("GPU device usage: {}".format(self.gpu_usage))
        return gpu_ids

    def submit_job(self, job_info, **kwargs):
        """ Asks a free worker to run the objective function on config and budget

        ``job_info`` is extended in place with ``kwargs`` (the shared data when
        it exists, otherwise the keyword arguments given here) and, in the
        single-node GPU mode, with ``gpu_devices``.
        """
        job_info["kwargs"] = \
            self.shared_data if self.shared_data is not None else kwargs
        # submit to Dask client
        if self.n_workers > 1 or isinstance(self.client, Client):
            if self.single_node_with_gpus:
                # managing GPU allocation for the job to be submitted
                job_info.update({"gpu_devices": self._get_gpu_id_with_low_load()})
            self.futures.append(self.client.submit(self._f_objective, job_info))
        else:
            # skipping scheduling to Dask worker to avoid added overheads in the synchronous case
            self.futures.append(self._f_objective(job_info))

        # pass information of job submission to Bracket Manager
        for bracket in self.active_brackets:
            if bracket.bracket_id == job_info['bracket_id']:
                # registering is IMPORTANT for Bracket Manager to perform SH
                bracket.register_job(job_info['budget'])
                break

    def _fetch_results_from_workers(self):
        """ Iterate over futures and collect results from finished workers
        """
        uses_dask = self.n_workers > 1 or isinstance(self.client, Client)
        if uses_dask:
            done_list = [(i, future) for i, future in enumerate(self.futures)
                         if future.done()]
        else:
            # Dask not invoked in the synchronous case: every entry is a result
            done_list = [(i, future) for i, future in enumerate(self.futures)]
        if len(done_list) > 0:
            self.logger.debug("Collecting {} of the {} job(s) active.".format(
                len(done_list), len(self.futures)))
        for _, future in done_list:
            if uses_dask:
                run_info = future.result()
                if "device_id" in run_info:
                    # updating GPU usage
                    self.gpu_usage[run_info["device_id"]] -= 1
                    self.logger.debug("GPU device released: {}".format(
                        run_info["device_id"]))
                future.release()
            else:
                # Dask not invoked in the synchronous case
                run_info = future
            # update bracket information
            fitness, cost = run_info["fitness"], run_info["cost"]
            info = run_info["info"] if "info" in run_info else dict()
            budget, parent_id = run_info["budget"], run_info["parent_id"]
            config = run_info["config"]
            bracket_id = run_info["bracket_id"]
            for bracket in self.active_brackets:
                if bracket.bracket_id == bracket_id:
                    # bracket job complete
                    bracket.complete_job(budget)  # IMPORTANT to perform synchronous SH

            # carry out DE selection
            if fitness <= self.de[budget].fitness[parent_id]:
                self.de[budget].population[parent_id] = config
                self.de[budget].fitness[parent_id] = fitness
            # updating incumbents
            if self.de[budget].fitness[parent_id] < self.inc_score:
                self._update_incumbents(
                    config=self.de[budget].population[parent_id],
                    score=self.de[budget].fitness[parent_id],
                    run=len(self.traj),
                    info=info)
            # book-keeping
            self._update_trackers(traj=(self.inc_score, self.inc_config),
                                  runtime=cost,
                                  history=(config, float(fitness), float(cost),
                                           float(budget), info))
        # remove processed futures; a plain list filter keeps the futures from
        # being coerced into a numpy array
        processed = {i for i, _ in done_list}
        self.futures = [future for i, future in enumerate(self.futures)
                        if i not in processed]

    def _is_run_budget_exhausted(self, fevals=None, brackets=None, total_cost=None):
        """ Checks if the DEHB run should be terminated or continued

        Only the first limit that is not None is used, in the order
        ``fevals``, ``brackets``, ``total_cost``. Raises ``ValueError`` when
        all three are None.
        """
        delimiters = [fevals, brackets, total_cost]
        delim_sum = sum(x is not None for x in delimiters)
        if delim_sum == 0:
            raise ValueError(
                "Need one of 'fevals', 'brackets' or 'total_cost' as budget for DEHB to run."
            )
        if fevals is not None:
            if len(self.traj) >= fevals:
                return True
        elif brackets is not None:
            if self.iteration_counter >= brackets:
                for bracket in self.active_brackets:
                    # waits for all brackets < iteration_counter to finish by collecting results
                    if bracket.bracket_id < self.iteration_counter and \
                            not bracket.is_bracket_done():
                        return False
                return True
        else:
            if time.time() - self.start >= total_cost:
                return True
            # NOTE(review): `self.runtime` holds the per-evaluation cost while
            # `self.start` is a wall-clock timestamp; confirm this comparison
            # is intended.
            if len(self.runtime) > 0 and \
                    self.runtime[-1] - self.start >= total_cost:
                return True
        return False

    def _save_incumbent(self, name=None):
        """
        Write the incumbent (config, score, info) as JSON to
        ``incumbent_<name>.json`` in ``self.output_path``. ``name`` defaults to
        the formatted start time. Failures are logged as a warning.
        """
        if name is None:
            name = time.strftime("%x %X %Z", time.localtime(self.start))
            name = name.replace("/", '-').replace(":", '-').replace(" ", '_')
        try:
            res = dict()
            if self.configspace:
                config = self.vector_to_configspace(self.inc_config)
                res["config"] = config.get_dictionary()
            else:
                res["config"] = self.inc_config.tolist()
            res["score"] = self.inc_score
            res["info"] = self.inc_info
            with open(os.path.join(self.output_path,
                                   "incumbent_{}.json".format(name)), 'w') as f:
                json.dump(res, f)
        except Exception as e:
            self.logger.warning("Incumbent not saved: {}".format(repr(e)))

    def _save_history(self, name=None):
        """
        Pickle ``self.history`` to ``history_<name>.pkl`` in
        ``self.output_path``; with ``save_smac`` also write the SMAC-style run
        history and trajectory as JSON. ``name`` defaults to the formatted
        start time. Failures are logged as a warning.
        """
        if name is None:
            name = time.strftime("%x %X %Z", time.localtime(self.start))
            name = name.replace("/", '-').replace(":", '-').replace(" ", '_')
        try:
            with open(os.path.join(self.output_path,
                                   "history_{}.pkl".format(name)), 'wb') as f:
                pickle.dump(self.history, f)
            if self.save_smac:
                with open(os.path.join(self.output_path,
                                       "runhistory_{}.json".format(name)),
                          'w') as f:
                    json.dump(self.smac_history, f, indent=2)
                with open(os.path.join(self.output_path,
                                       "traj_{}.json".format(name)), 'w') as f:
                    # one JSON object per line
                    for traj in self.smac_trajectory:
                        json.dump(traj, f)
                        f.write("\n")
        except Exception as e:
            self.logger.warning("History not saved: {}".format(repr(e)))

    def _verbosity_debug(self):
        """Log the string form of every active bracket at debug level."""
        for bracket in self.active_brackets:
            self.logger.debug("Bracket ID {}:\n{}".format(
                bracket.bracket_id, str(bracket)))

    def _verbosity_runtime(self, fevals, brackets, total_cost):
        """Log the progress against whichever run limit is in use."""
        if fevals is not None:
            remaining = (len(self.traj), fevals, "function evaluation(s) done")
        elif brackets is not None:
            _suffix = "bracket(s) started; # active brackets: {}".format(
                len(self.active_brackets))
            remaining = (self.iteration_counter + 1, brackets, _suffix)
        else:
            elapsed = np.format_float_positional(time.time() - self.start,
                                                 precision=2)
            remaining = (elapsed, total_cost, "seconds elapsed")
        self.logger.info("{}/{} {}".format(remaining[0], remaining[1],
                                           remaining[2]))

    @logger.catch
    def run(self, fevals=None, brackets=None, total_cost=None,
            single_node_with_gpus=False, verbose=False, debug=False,
            save_intermediate=True, save_history=True, **kwargs):
        """ Main interface to run optimization by DEHB

        While a worker is free, a configuration and budget are requested and
        submitted; in every loop finished jobs are collected and processed.

        The duration of the run is controlled by one of three parameters. If
        more than one is given, only one is used, in this priority order:
        1) Number of function evaluations (fevals)
        2) Number of Successive Halving brackets run under Hyperband (brackets)
        3) Total computational cost in seconds (total_cost)

        Additional ``**kwargs`` are passed on to the objective function; with a
        Dask client and more than one worker they are broadcast to the workers
        once instead of being sent with every job.

        Returns
        -------
        tuple of np.ndarray
            ``(trajectory, runtime, history)``.
        """
        # checks if a Dask client exists
        if len(kwargs) > 0 and self.n_workers > 1 and isinstance(
                self.client, Client):
            # broadcasting the data shared by all jobs avoids serializing it
            # again for every job
            self.shared_data = self.client.scatter(kwargs, broadcast=True)

        # allows each worker to be mapped to a different GPU when running on a
        # single node where all available GPUs are accessible
        self.single_node_with_gpus = single_node_with_gpus
        if self.single_node_with_gpus:
            self.distribute_gpus()

        self.start = time.time()
        if verbose:
            print("\nLogging at {} for optimization starting at {}\n".format(
                os.path.join(os.getcwd(), self.log_filename),
                time.strftime("%x %X %Z", time.localtime(self.start))))
        if debug:
            logger.configure(handlers=[{"sink": sys.stdout}])
        while True:
            if self._is_run_budget_exhausted(fevals, brackets, total_cost):
                break
            if self.is_worker_available():
                job_info = self._get_next_job()
                if brackets is not None and job_info['bracket_id'] >= brackets:
                    # ignore submission and only collect results
                    # with brackets as run budget an extra bracket is created,
                    # since iteration_counter is incremented in _get_next_job()
                    # and only then checked in _is_run_budget_exhausted();
                    # suggestions from that extra bracket are skipped, and
                    # _is_run_budget_exhausted() does not return True until all
                    # lower brackets have returned their results
                    pass
                else:
                    if self.n_workers > 1 or isinstance(self.client, Client):
                        self.logger.debug("{}/{} worker(s) available.".format(
                            self._get_worker_count() - len(self.futures),
                            self._get_worker_count()))
                    # submits job_info to a worker for execution
                    self.submit_job(job_info, **kwargs)
                    if verbose:
                        budget = job_info['budget']
                        self._verbosity_runtime(fevals, brackets, total_cost)
                        self.logger.info(
                            "Evaluating a configuration with budget {} under "
                            "bracket ID {}".format(budget, job_info['bracket_id']))
                        self.logger.info(
                            "Best score seen/Incumbent score: {}".format(
                                self.inc_score))
                    self._verbosity_debug()
            self._fetch_results_from_workers()
            if save_intermediate and self.inc_config is not None:
                self._save_incumbent()
            if save_history and self.history is not None:
                self._save_history()
            self.clean_inactive_brackets()
        # end of while

        if verbose and len(self.futures) > 0:
            self.logger.info(
                "DEHB optimisation over! Waiting to collect results from workers running..."
            )
        while len(self.futures) > 0:
            self._fetch_results_from_workers()
            if save_intermediate and self.inc_config is not None:
                self._save_incumbent()
            if save_history and self.history is not None:
                self._save_history()
            time.sleep(0.05)  # waiting 50ms

        if verbose:
            time_taken = time.time() - self.start
            self.logger.info(
                "End of optimisation! Total duration: {}; Total fevals: {}\n".
                format(time_taken, len(self.traj)))
            self.logger.info("Incumbent score: {}".format(self.inc_score))
            self.logger.info("Incumbent config: ")
            if self.configspace:
                config = self.vector_to_configspace(self.inc_config)
                for k, v in config.get_dictionary().items():
                    self.logger.info("{}: {}".format(k, v))
            else:
                self.logger.info("{}".format(self.inc_config))
        self._save_incumbent()
        self._save_history()
        return np.array(self.traj), np.array(self.runtime), np.array(
            self.history, dtype=object)
START_TIMEOUT = 1800 # 30 min SILENCE_TIMEOUT = 900 # 15 min MAX_COLLECT_TIME = 86400 # 1 day def processing_task_list(cli): return list(itertools.chain.from_iterable(cli.processing().values())) cli = Client('127.0.0.1:8786') print("Waiting for tasks to start running") timeout = time.time() + START_TIMEOUT while not cli.ncores(): time.sleep(5) if time.time() > timeout: raise Exception("workers never started") print("First worker connected. Starting data collection.") start_time = time.time() end_time = time.time() + MAX_COLLECT_TIME with open('graph.csv', 'wb') as outfile: writer = csv.writer(outfile) writer.writerow(['Time', 'Cores', 'Scheduled Tasks', 'Completed Tasks']) silence_end_time = time.time() + SILENCE_TIMEOUT
def preprocessing_script():
    """
    Process all the hybridization folders combined in a processing folder.

    For every hybridization subdirectory of the processing directory the
    .nd2 files are converted to .npy on the dask workers and the converted
    images are then filtered and counted, with or without an illumination
    correction depending on the 'illumination_correction' entry of the
    filtering/counting configuration.

    The input parameters are passed using argparse.

    Parameters:
    -----------

    scheduler: string
        tcp address of the dask.distributed scheduler
        (ex. tcp://192.168.0.4:7003). default = False. If False the process
        will run on the local computer using nCPUs-1 workers (never fewer
        than one).
    path: string
        Path to the processing directory
    """

    # Inputs of the function
    parser = argparse.ArgumentParser(description='Preprocessing script')
    parser.add_argument('-scheduler', default=False,
                        help='dask scheduler address ex. tcp://192.168.0.4:7003')
    parser.add_argument('-path', help='processing directory')
    args = parser.parse_args()

    # Directory to process
    processing_directory = args.path
    # Dask scheduler address
    scheduler_address = args.scheduler

    # Only set when this function starts its own cluster, so that the
    # cleanup below never shuts down an external scheduler.
    cluster = None
    if scheduler_address:
        # Start dask client on server or cluster
        client = Client(scheduler_address)
    else:
        # Start dask client on local machine. It will use all the available
        # cores -1, but at least one so that a single-core host still gets
        # a worker.
        ncores = max(1, multiprocessing.cpu_count() - 1)
        cluster = LocalCluster(n_workers=ncores)
        client = Client(cluster)

    try:
        # Subdirectories of the processing_directory that need to be skipped
        # for the analysis
        blocked_directories = ['_logs']

        # Starting logger
        utils.init_file_logger(processing_directory)
        logger = logging.getLogger()

        # Determine the operating system running the code
        os_windows, add_slash = utils.determine_os()

        # Check trailing slash in the processing directory
        processing_directory = utils.check_trailing_slash(
            processing_directory, os_windows)

        # Get a list of the hybridization to process (first-level
        # subdirectories only)
        processing_hyb_list = next(os.walk(processing_directory))[1]

        # Remove the blocked directories from the directories to process
        processing_hyb_list = [el for el in processing_hyb_list
                               if el not in blocked_directories]

        for processing_hyb in processing_hyb_list:

            # Determine the hyb number from the name
            hybridization_number = processing_hyb.split('_hyb')[-1]
            hybridization = 'Hybridization' + hybridization_number
            hyb_dir = processing_directory + processing_hyb + add_slash

            # Parse the Experimental metadata file (serial)
            experiment_infos, image_properties, hybridizations_infos, \
                converted_positions, microscope_parameters = \
                utils.experimental_metadata_parser(hyb_dir)

            # Parse the configuration file
            flt_rawcnt_config = utils.filtering_raw_counting_config_parser(hyb_dir)
            analysis_name = flt_rawcnt_config['analysis_name']

            # ----------------- .nd2 FILE CONVERSION ------------------------

            # Create the temporary subdirectory tree (serial)
            tmp_dir_path, tmp_gene_dirs = utils.create_subdirectory_tree(
                hyb_dir, hybridization, hybridizations_infos, processing_hyb,
                suffix='tmp', add_slash=add_slash)

            # Get the list of the nd2 files to process inside the directory
            files_list = glob.glob(
                hyb_dir + processing_hyb + '_raw_data' + add_slash + '*.nd2')

            # Get the list of genes that are analyzed in the current
            # hybridization
            gene_list = list(hybridizations_infos[hybridization].keys())

            # Organize the files to process in a list whose order matches the
            # gene_list for parallel processing
            organized_files_list = [f for gene in gene_list
                                    for f in files_list if gene + '.nd2' in f]

            # Each .nd2 file will be processed in a worker part of a
            # different node. Get the addresses of one process/node to use
            # for conversion
            node_addresses = utils.identify_nodes(client)
            workers_conversion = [list(el.items())[0][1]
                                  for el in node_addresses.values()]

            # Run the conversion.
            # NOTE(review): tmp_gene_dirs is passed exactly as returned by
            # create_subdirectory_tree; this assumes it is already ordered
            # like gene_list - confirm.
            futures_processes = client.map(
                io.nd2_to_npy, gene_list, organized_files_list, tmp_gene_dirs,
                processing_hyb=processing_hyb,
                use_ram=flt_rawcnt_config['use_ram'],
                max_ram=flt_rawcnt_config['max_ram'],
                workers=workers_conversion)
            client.gather(futures_processes)

            # ----------------- FILTERING AND RAW COUNTING ------------------

            # Create the directories where to save the filtered images
            filtered_png_img_dir_path, filtered_png_img_gene_dirs = \
                utils.create_subdirectory_tree(
                    hyb_dir, hybridization, hybridizations_infos,
                    processing_hyb, 'filtered_png', add_slash,
                    analysis_name=analysis_name)

            filtered_img_dir_path, filtered_img_gene_dirs = \
                utils.create_subdirectory_tree(
                    hyb_dir, hybridization, hybridizations_infos,
                    processing_hyb, 'filtered_npy', add_slash,
                    analysis_name=analysis_name)

            # Create the directory where to save the counting
            counting_dir_path, counting_gene_dirs = \
                utils.create_subdirectory_tree(
                    hyb_dir, hybridization, hybridizations_infos,
                    processing_hyb, 'counting', add_slash,
                    flt_rawcnt_config['skip_tags_counting'],
                    flt_rawcnt_config['skip_genes_counting'],
                    analysis_name=analysis_name)

            if flt_rawcnt_config['illumination_correction']:

                # Create the directory where to save the illumination
                # functions
                illumination_func_dir_path, illumination_func_gene_dirs = \
                    utils.create_subdirectory_tree(
                        hyb_dir, hybridization, hybridizations_infos,
                        processing_hyb, 'illumination_funcs', add_slash,
                        analysis_name=analysis_name)

                # Loop through channels and calculate illumination
                for gene in hybridizations_infos[hybridization].keys():

                    flist_img_to_filter = glob.glob(
                        hyb_dir + processing_hyb + '_tmp/' + processing_hyb
                        + '_' + gene + '_tmp/*.npy')

                    logger.debug('Create average image for gene %s', gene)

                    # Chunking the image list: one chunk per value summed
                    # from client.ncores()
                    num_chunks = sum(client.ncores().values())
                    chunked_list = utils.list_chunking(
                        flist_img_to_filter, num_chunks)

                    # Scatter the images sublists to process in parallel
                    futures = client.scatter(chunked_list)

                    # Create dask processing graph: one partial mean per
                    # chunk, summed and divided by the number of chunks
                    partial_means = [
                        delayed(utils.partial_image_mean)(future)
                        for future in futures]
                    img_mean_all = delayed(sum)(partial_means)
                    img_mean_all = img_mean_all / float(len(futures))

                    # Compute the graph
                    img_mean = img_mean_all.compute()

                    logger.debug('Create illumination function for gene %s', gene)
                    # Create illumination function
                    illumination = filters.gaussian(
                        img_mean, sigma=(20, 300, 300))

                    # Normalization of the illumination: collapse axis 0 with
                    # a max, then scale so the largest value is 1
                    illumination_flat = np.amax(illumination, axis=0)
                    illumination_norm = illumination_flat / np.amax(illumination_flat)

                    logger.debug('Save illumination function for gene %s', gene)
                    # Save the illumination function
                    illumination_path = [
                        ill_path for ill_path in illumination_func_gene_dirs
                        if gene in ill_path][0]
                    illumination_fname = illumination_path + gene + '_illumination_func.npy'
                    np.save(illumination_fname, illumination_norm,
                            allow_pickle=False)

                    # Broadcast the illumination function to all the cores.
                    # NOTE(review): the returned future is not used; the
                    # array itself is what gets passed to client.map below.
                    client.scatter(illumination_norm, broadcast=True)

                    logger.debug('Filtering %s', gene)
                    # Filtering and counting
                    futures_processes = client.map(
                        counting.filtering_and_counting_ill_correction,
                        flist_img_to_filter,
                        illumination_function=illumination_norm,
                        filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,
                        filtered_img_gene_dirs=filtered_img_gene_dirs,
                        counting_gene_dirs=counting_gene_dirs,
                        plane_keep=flt_rawcnt_config['plane_keep'],
                        min_distance=flt_rawcnt_config['min_distance'],
                        stringency=flt_rawcnt_config['stringency'],
                        skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],
                        skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])

                    client.gather(futures_processes)

            else:
                for gene in hybridizations_infos[hybridization].keys():
                    flist_img_to_filter = glob.glob(
                        hyb_dir + processing_hyb + '_tmp/' + processing_hyb
                        + '_' + gene + '_tmp/*.npy')
                    # filtering
                    logger.debug('Filtering without illumination correction %s', gene)

                    futures_processes = client.map(
                        counting.filtering_and_counting,
                        flist_img_to_filter,
                        filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,
                        filtered_img_gene_dirs=filtered_img_gene_dirs,
                        counting_gene_dirs=counting_gene_dirs,
                        plane_keep=flt_rawcnt_config['plane_keep'],
                        min_distance=flt_rawcnt_config['min_distance'],
                        stringency=flt_rawcnt_config['stringency'],
                        skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],
                        skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])

                    client.gather(futures_processes)

            # NOTE: the stages that combined the filtered data into a
            # .ppf.hdf5 file and stitched the reference gene were disabled
            # (present only as commented-out code) and are not run here.

            # ----------------- DELETE FILES ------------------------
            # Don't delete the *.npy files here because can be used to
            # create the final images using the apply stitching related
            # function
    finally:
        # Release the scheduler connection (and the local cluster, when this
        # function started one) even if a processing step raised.
        client.close()
        if cluster is not None:
            cluster.close()
def _update_engine(publisher: Parameter):
    """
    Initialize the engine named by ``publisher.get()`` and refresh the defaults.

    Depending on the engine name this starts or attaches to Ray, Dask or a
    remote ("Cloudray" / "Cloudpython") backend, updates the module-level
    ``num_cpus`` where the engine reports one, records in
    ``_is_first_update`` that the engine has been handled, and recomputes
    ``DEFAULT_NPARTITIONS`` as ``max(4, int(num_cpus))``.

    Raises
    ------
    ImportError
        If the engine name is none of the handled ones and is not listed in
        ``_NOINIT_ENGINES``.
    """
    global DEFAULT_NPARTITIONS, dask_client, num_cpus
    from modin.config import Backend, CpuCount

    engine = publisher.get()
    first_update = _is_first_update.get(engine, True)

    if engine == "Ray":
        import ray
        from modin.engines.ray.utils import initialize_ray

        # With OmniSci backend there is only a single worker per node
        # and we allow it to work on all cores.
        if Backend.get() == "Omnisci":
            CpuCount.put(1)
            os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
        if first_update:
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]

    elif engine == "Dask":  # pragma: no cover
        from distributed.client import get_client

        on_main_thread = threading.current_thread().name == "MainThread"
        if on_main_thread and first_update:
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")

            # Reuse an already running client when there is one, otherwise
            # start a new one sized by CpuCount.
            try:
                dask_client = get_client()
            except ValueError:
                from distributed import Client

                dask_client = Client(n_workers=CpuCount.get())

            num_cpus = len(dask_client.ncores())

    elif engine == "Cloudray":
        from modin.experimental.cloud import get_connection

        connection = get_connection()
        remote_ray = connection.modules["ray"]

        if first_update:

            @connection.teleport
            def init_remote_ray(partition):
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(Backend.get())
            # import EngineDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.factories.dispatcher  # noqa: F401
        else:
            get_connection().modules["modin"].set_backends(
                "Ray", Backend.get())

        num_cpus = remote_ray.cluster_resources()["CPU"]

    elif engine == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_backends("Python")

    elif engine not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(engine))

    _is_first_update[engine] = False
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))
default="out.tdb", help="Output TDB file") def recursive_glob(start, pattern): matches = [] for root, dirnames, filenames in os.walk(start): for filename in fnmatch.filter(filenames, pattern): matches.append(os.path.join(root, filename)) return sorted(matches) if __name__ == '__main__': args = parser.parse_args(sys.argv[1:]) if not args.dask_scheduler: args.dask_scheduler = LocalCluster(n_workers=int(multiprocessing.cpu_count() / 2), threads_per_worker=1, nanny=True) client = Client(args.dask_scheduler) logging.info( "Running with dask scheduler: %s [%s cores]" % ( args.dask_scheduler, sum(client.ncores().values()))) datasets = load_datasets(sorted(recursive_glob('Al-Ni', '*.json'))) recfile = open(args.iter_record, 'a') if args.iter_record else None try: dbf, mdl, model_dof = fit(args.fit_settings, datasets, scheduler=client, recfile=recfile) finally: if recfile: recfile.close() dbf.to_file(args.output_tdb, if_exists='overwrite')
# Command-line interface of the orthophoto feature-extraction script.
parser = argparse.ArgumentParser(
    prog='orthophoto_feat_extraction',
    description='Extracts and GeoLocates Features From a UVAN Flight')
parser.add_argument('-o', '--output',
                    help='Directory where to create test datasets')
parser.add_argument('-g', '--geoid', help='Path to Geoid File')
parser.add_argument('-s', '--srtm', help='Path to SRTM Root')
parser.add_argument('-d', '--scheduler', help='Node hosting the scheduler')
parser.add_argument('-i', '--orthophoto', help='Path to OrthoPhoto')

args = parser.parse_args()

# The orthophoto is only needed to compute the slices, so it is closed as
# soon as that is done, even if slicing raises.
in_ophoto = orthophoto.VRTOrthophoto(args.orthophoto)
try:
    slices = tbdb.slices_from_ophoto(in_ophoto, 5000)
finally:
    in_ophoto.close()

# One row per slice; every row carries the same paths so that each row can be
# handed to db_from_slice on its own.
slice_df = pd.DataFrame(slices, columns=['x_slice', 'y_slice'])
slice_df['out_path'] = args.output
slice_df['terrain_path'] = args.srtm
slice_df['geoid'] = args.geoid
slice_df['ophoto'] = args.orthophoto
slice_df['idx'] = slice_df.index

client = Client(args.scheduler)

# np.str / np.int were plain aliases of the builtins and were removed in
# NumPy 1.24, so the builtins are used directly.
meta = {'file_path': str, 'num_landmarks': int}
ddf = dd.from_pandas(slice_df, npartitions=len(client.ncores()))
ask_df = ddf.apply(db_from_slice, axis=1, meta=meta)
ask_df.compute()