def _solve(self, root_dir):
    """Run the differential evolution loop and return the best result found.

    Args:
        :root_dir: HDFS directory under which per-generation results are stored.

    Returns:
        Tuple of (best parameter combination as 'name=value' strings, best metric).
    """
    # Initialise the generation based on the individual representation.
    population, bounds = self._population_initialisation()
    global fs_handle
    fs_handle = hdfs.get_fs()
    global run_id

    new_gen_best_param = None
    new_gen_best = None

    for _ in range(self.generations):
        # Classic DE step: mutate, recombine, then select the survivors.
        donor_population = self._mutation(population, bounds)
        trial_population = self._recombination(population, donor_population)
        population = self._selection(population, trial_population)

        new_gen_avg = sum(self._scores) / self.n

        if self.direction.upper() == Direction.MAX:
            new_gen_best = max(self._scores)
        elif self.direction.upper() == Direction.MIN:
            new_gen_best = min(self._scores)
        else:
            raise ValueError('invalid direction: ' + self.direction)

        new_gen_best_param = self._parse_back(
            population[self._scores.index(new_gen_best)])

        # Render the best individual as a list of 'name=value' strings.
        for index, name in enumerate(self._param_names):
            new_gen_best_param[index] = name + "=" + str(new_gen_best_param[index])

        print("Generation " + str(self._generation) + " || "
              + "average metric: " + str(new_gen_avg)
              + ", best metric: " + str(new_gen_best)
              + ", best parameter combination: " + str(new_gen_best_param) + "\n")

        # Remove the previous generation's working directory if cleanup is enabled.
        if cleanup:
            pydoop.hdfs.rmr(root_dir + '/generation.' + str(self._generation - 1))

    return new_gen_best_param, new_gen_best
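# A minimal, self-contained sketch of the DE/rand/1/bin step that the loop in
# _solve delegates to self._mutation, self._recombination and self._selection.
# All names here (de_step, objective, F, CR) are illustrative assumptions and
# not part of this module; minimisation is assumed.
import random

def de_step(population, bounds, objective, F=0.8, CR=0.7):
    # One generation: for each target vector, build a donor from three other
    # random individuals, crossover into a trial vector, and keep whichever of
    # target/trial scores better.
    new_population = []
    for i, target in enumerate(population):
        others = [x for j, x in enumerate(population) if j != i]
        a, b, c = random.sample(others, 3)
        # Donor = a + F * (b - c), clipped to the per-parameter bounds.
        donor = [max(lo, min(hi, a_k + F * (b_k - c_k)))
                 for (lo, hi), a_k, b_k, c_k in zip(bounds, a, b, c)]
        # Binomial crossover; j_rand guarantees at least one donor gene.
        j_rand = random.randrange(len(target))
        trial = [d if (random.random() < CR or j == j_rand) else t
                 for j, (d, t) in enumerate(zip(donor, target))]
        new_population.append(trial if objective(trial) < objective(target)
                              else target)
    return new_population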
def _wrapper_fun(iterator):
    """Runs on each Spark executor: converts the notebook to a Python script
    and launches it over MPI for a Horovod allreduce job. Executor 0 also
    registers TensorBoard."""
    for i in iterator:
        executor_num = i

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, None, 'horovod')

        tb_pid = 0
        tb_hdfs_path = ''

        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'],
                         user=hopshdfs.project_user())
        hopshdfs.init_logger()
        hopshdfs.log('Starting Spark executor with arguments')
        if executor_num == 0:
            tb_hdfs_path, tb_pid = tensorboard.register(
                hdfs_exec_logdir, hdfs_appid_logdir, 0,
                local_logdir=local_logdir)

        gpu_str = '\n\nChecking for GPUs in the environment\n' + \
                  devices.get_gpu_info()
        hopshdfs.log(gpu_str)
        print(gpu_str)

        # 1. Download the notebook file from HDFS.
        fs_handle = hopshdfs.get_fs()
        try:
            fd = fs_handle.open_file(nb_path, flags='r')
        except Exception:
            # Older pydoop versions expect 'mode' instead of 'flags'.
            fd = fs_handle.open_file(nb_path, mode='r')

        notebook = ''
        for line in fd:
            notebook += line

        path, filename = os.path.split(nb_path)
        with open(filename, "w+") as f_nb:
            f_nb.write(notebook)

        # 2. Convert the notebook to a .py file with nbconvert.
        jupyter_runnable = os.path.abspath(
            os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
        conversion_cmd = jupyter_runnable + ' nbconvert --to python ' + filename
        conversion = subprocess.Popen(conversion_cmd,
                                      shell=True,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
        # communicate() both waits for the process and drains the pipes;
        # calling wait() first with PIPE output risks a deadlock.
        stdout, stderr = conversion.communicate()
        print(stdout)
        print(stderr)

        # 3. Make the .py file executable.
        py_runnable = os.getcwd() + '/' + filename.split('.')[0] + '.py'
        st = os.stat(py_runnable)
        os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

        t_gpus = threading.Thread(
            target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t_gpus.start()

        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)
        mpi_logfile = open(mpi_logfile_path, 'w')

        # 4. Run the allreduce job with mpirun.
        mpi_np = os.environ['MPI_NP']
        mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                  ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                  ' mpirun -np ' + str(mpi_np) + \
                  ' -bind-to none -map-by slot ' + \
                  ' -x HOROVOD_TIMELINE ' + \
                  ' -x TENSORBOARD_LOGDIR ' + \
                  ' -x NCCL_DEBUG=INFO ' + \
                  os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable
        mpi = subprocess.Popen(mpi_cmd,
                               shell=True,
                               stdout=mpi_logfile,
                               stderr=mpi_logfile,
                               preexec_fn=util.on_executor_exit('SIGTERM'))

        t_log = threading.Thread(target=print_log)
        t_log.start()

        mpi.wait()

        if devices.get_num_gpus() > 0:
            t_gpus.do_run = False
            t_gpus.join()

        return_code = mpi.returncode

        if local_logdir:
            local_tb = tensorboard.local_logdir_path
            pydoop.hdfs.put(local_tb, hdfs_exec_logdir)

        if return_code != 0:
            cleanup(tb_hdfs_path)
            t_log.do_run = False
            t_log.join()
            raise Exception('mpirun FAILED, look in the logs for the error')

        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()

        hopshdfs.kill_logger()
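# Hedged sketch of how a wrapper like _wrapper_fun is typically driven: one
# Spark partition per executor, so the function body runs exactly once per
# executor and 'i' becomes the executor number. '_launch_sketch', 'sc'
# (a SparkContext) and 'num_executors' are assumptions for illustration,
# not names defined in this module.
def _launch_sketch(sc, num_executors):
    # range(num_executors) split into num_executors partitions puts exactly
    # one index in each partition.
    node_rdd = sc.parallelize(range(num_executors), num_executors)
    node_rdd.foreachPartition(_wrapper_fun)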
def save(data, path):
    """Write data to an HDFS file at the given path."""
    with hdfs.get_fs().open_file(path, 'w') as data_file:
        data_file.write(data)
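# Usage sketch for save(): persist a small string to HopsFS. '_save_example'
# and the path below are illustrative assumptions, not part of this module.
def _save_example():
    save('{"learning_rate": 0.01}',
         '/Projects/demo/Resources/best_params.json')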
def savetf(temp_file, path, close=True):
    """Copy the contents of a temporary file to an HDFS file at the given
    path, optionally closing the temporary file afterwards."""
    with hdfs.get_fs().open_file(path, 'w') as data_file:
        temp_file.seek(0)
        data_file.write(temp_file.read())
    if close:
        temp_file.close()
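# Usage sketch for savetf(): stream a locally buffered payload into HopsFS.
# '_savetf_example', the payload and the path are illustrative assumptions.
def _savetf_example():
    from tempfile import TemporaryFile
    buf = TemporaryFile()
    buf.write(b'model checkpoint bytes')
    # close=True (the default) closes buf once the copy has been written.
    savetf(buf, '/Projects/demo/Resources/model.ckpt')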
def open_hdfs(path):
    """Read an HDFS file into a seekable TemporaryFile (from the standard
    library's tempfile module) and return it, rewound to the start."""
    temp_file = TemporaryFile()
    with hdfs.get_fs().open_file(path, 'r') as data_file:
        temp_file.write(data_file.read())
    temp_file.seek(0)
    return temp_file
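# Usage sketch for open_hdfs(): materialise an HDFS file as a seekable local
# temporary file, e.g. for libraries that require a local file handle.
# '_open_hdfs_example' and the path are illustrative assumptions.
def _open_hdfs_example():
    fd = open_hdfs('/Projects/demo/Resources/data.csv')
    try:
        first_line = fd.readline()
        print(first_line)
    finally:
        # The caller owns the returned TemporaryFile and must close it.
        fd.close()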