def _terminate_pool(cls, taskqueue, inqueue, outqueue, pool,
                    task_handler, result_handler, cache, working_dirs):
    info("terminating pool")

    # this is guaranteed to only be called once
    debug('finalizing pool')
    TERMINATE = 2

    task_handler._state = TERMINATE
    for p in pool:
        taskqueue.put(None)                 # sentinel
    time.sleep(1)

    debug('helping task handler/workers to finish')
    cls._help_stuff_finish(inqueue, task_handler, len(pool))

    assert result_handler.is_alive() or len(cache) == 0

    result_handler._state = TERMINATE
    outqueue.put(None)                      # sentinel

    if pool and hasattr(pool[0], 'terminate'):
        debug('terminating workers')
        for p in pool:
            p.terminate()

    debug('joining task handler')
    task_handler.join(1e100)

    debug('joining result handler')
    result_handler.join(1e100)

    if pool and hasattr(pool[0], 'terminate'):
        debug('joining pool workers')
        for p in pool:
            p.join()

    # cleaning up directories
    # TODO investigate whether the multiprocessing.util tempdirectory
    # functionality can be used instead
    for directory in working_dirs:
        directory = os.path.dirname(directory)
        debug("deleting " + str(directory))
        shutil.rmtree(directory)
def oldcsv_load_results(file_name):
    '''
    load the results from the specified archive. the file is assumed to have
    been saved using save_results.

    :param file_name: the path of the file
    :raises: IOError if file not found

    '''
    outcomes = {}

    with tarfile.open(file_name, 'r') as z:
        # load experiments
        experiments = z.extractfile('experiments.csv')
        experiments = csv2rec(experiments)

        # load metadata
        metadata = z.extractfile('experiments metadata.csv').readlines()
        metadata = [entry.strip() for entry in metadata]
        metadata = [tuple(entry.split(",")) for entry in metadata]
        metadata = np.dtype(metadata)

        # cast experiments to the dtype and names specified in the metadata
        temp_experiments = np.zeros((experiments.shape[0],), dtype=metadata)
        for i, entry in enumerate(experiments.dtype.descr):
            dtype = metadata[i]
            name = metadata.descr[i][0]
            temp_experiments[name][:] = experiments[entry[0]].astype(dtype)
        experiments = temp_experiments

        # load outcomes
        fhs = z.getnames()
        fhs.remove('experiments.csv')
        fhs.remove('experiments metadata.csv')

        for fh in fhs:
            root = os.path.splitext(fh)[0]
            data = z.extractfile(fh)

            # the first line encodes the shape of the array
            first_line = data.readline()
            shape = first_line.split(":")[1].strip()[1:-1]
            shape = shape.split(',')
            shape = tuple([int(entry) for entry in shape if len(entry) > 0])

            data = np.loadtxt(data, delimiter=',')
            data = data.reshape(shape)

            outcomes[root] = data

    info("results loaded successfully from {}".format(file_name))
    return experiments, outcomes
def __call__(self, case, policy, name, result):
    '''
    Method responsible for storing results. The implementation in this
    class only keeps track of how many runs have been completed and logs
    this.

    :param case: the case to be stored
    :param policy: the name of the policy being used
    :param name: the name of the model being used
    :param result: the result dict

    '''
    self.i += 1
    debug(str(self.i) + " cases completed")

    if self.i % self.reporting_interval == 0:
        info(str(self.i) + " cases completed")
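# A hedged usage sketch, not part of the original source: it illustrates how a
# callback instance of this class is typically driven, namely once per
# completed experiment with that run's case, policy name, model name, and
# result dict. The helper name and the completed_runs iterable are
# illustrative assumptions.
def _example_drive_callback(callback, completed_runs):
    # completed_runs is assumed to yield (case, policy, name, result) tuples
    for case, policy, name, result in completed_runs:
        callback(case, policy, name, result)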
def __init__(self, msis, processes=None, kwargs=None):
    """
    :param msis: an iterable of model structure interfaces
    :param processes: nr. of processes to spawn; if None, it is set to
                      equal the nr. of cores
    :param kwargs: kwargs to be passed to :meth:`model_init`
    """
    self._setup_queues()
    self._taskqueue = Queue(cpu_count() * 2)
    self._cache = {}
    self._state = RUN

    if processes is None:
        try:
            processes = cpu_count()
        except NotImplementedError:
            processes = 1
    info("nr of processes is " + str(processes))

    self.log_queue = Queue()
    h = NullHandler()
    logging.getLogger(ema_logging.LOGGER_NAME).addHandler(h)

    # This thread will read from the subprocesses and write to the
    # main log's handlers.
    log_queue_reader = LogQueueReader(self.log_queue)
    log_queue_reader.start()

    self._pool = []
    working_dirs = []
    # msis = [copy.deepcopy(msi) for msi in msis]

    debug("generating workers")

    workerRoot = None
    for i in range(processes):
        debug("generating worker " + str(i))

        workerName = "PoolWorker" + str(i)

        def ignore_function(path, names):
            if path.find(".svn") != -1:
                return names
            else:
                return []

        # setup working directories for parallelEMA
        for msi in msis:
            if msi.workingDirectory != None:
                if workerRoot == None:
                    workerRoot = os.path.dirname(
                        os.path.abspath(msis[0].workingDirectory))

                workingDirectory = os.path.join(workerRoot, workerName,
                                                msi.name)
                working_dirs.append(workingDirectory)
                shutil.copytree(msi.workingDirectory,
                                workingDirectory,
                                ignore=ignore_function)
                msi.set_working_directory(workingDirectory)

        w = LoggingProcess(
            self.log_queue,
            level=logging.getLogger(ema_logging.LOGGER_NAME).getEffectiveLevel(),
            target=worker,
            args=(self._inqueue, self._outqueue, msis, kwargs),
        )
        self._pool.append(w)

        w.name = w.name.replace("Process", workerName)
        w.daemon = True
        w.start()
        debug(" worker " + str(i) + " generated")

    # thread for handling tasks
    self._task_handler = threading.Thread(
        target=CalculatorPool._handle_tasks,
        name="task handler",
        args=(self._taskqueue, self._quick_put, self._outqueue, self._pool),
    )
    self._task_handler.daemon = True
    self._task_handler._state = RUN
    self._task_handler.start()

    # thread for handling results
    self._result_handler = threading.Thread(
        target=CalculatorPool._handle_results,
        name="result handler",
        args=(self._outqueue, self._quick_get, self._cache, self.log_queue),
    )
    self._result_handler.daemon = True
    self._result_handler._state = RUN
    self._result_handler.start()

    # function for cleaning up when finalizing object
    self._terminate = Finalize(
        self,
        self._terminate_pool,
        args=(
            self._taskqueue,
            self._inqueue,
            self._outqueue,
            self._pool,
            self._task_handler,
            self._result_handler,
            self._cache,
            working_dirs,
        ),
        exitpriority=15,
    )

    info("pool has been set up")
def merge_results(results1, results2, downsample=None):
    '''
    convenience function for merging the return from
    :meth:`~modelEnsemble.ModelEnsemble.perform_experiments`.

    The function merges results2 with results1. For the experiments, it
    generates an empty array equal to the size of the sum of the experiments.
    As dtype it uses the dtype from the experiments in results1. The function
    assumes that the ordering of dtypes and names is identical in both
    results.

    A typical use case for this function is in combination with
    :func:`~util.experiments_to_cases`. Using
    :func:`~util.experiments_to_cases` one extracts the cases from a first set
    of experiments. One then performs these cases on a different model or
    policy, and then one wants to merge these new results with the old results
    for further analysis.

    :param results1: first results to be merged
    :param results2: second results to be merged
    :param downsample: should be an integer, will be used in slicing the
                       results in order to avoid memory problems.
    :return: the merged results

    '''
    # start of merging
    old_exp, old_res = results1
    new_exp, new_res = results2

    # merge experiments
    dtypes = old_exp.dtype
    merged_exp = np.empty((old_exp.shape[0] + new_exp.shape[0],), dtype=dtypes)
    merged_exp[0:old_exp.shape[0]] = old_exp
    merged_exp[old_exp.shape[0]::] = new_exp

    # only merge the results that are in both
    keys = set(old_res.keys()) & set(new_res.keys())
    info("intersection of keys: %s" % keys)

    # merging results
    merged_res = {}
    for key in keys:
        info("merge " + key)

        old_value = old_res.get(key)
        new_value = new_res.get(key)

        i = old_value.shape[0] + new_value.shape[0]
        j = old_value.shape[1]
        slice_value = 1

        if downsample:
            j = int(math.ceil(j / float(downsample)))
            slice_value = downsample

        merged_value = np.empty((i, j))
        debug("merged shape: %s" % merged_value.shape)

        merged_value[0:old_value.shape[0], :] = old_value[:, ::slice_value]
        merged_value[old_value.shape[0]::, :] = new_value[:, ::slice_value]

        merged_res[key] = merged_value

    mr = (merged_exp, merged_res)
    return mr
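# A minimal, hedged sketch (not part of the original module) of how
# merge_results is meant to be used: two results tuples with identical
# experiment dtypes and a shared outcome key are merged into one. The dtype,
# shapes, and values below are made up for illustration; numpy is assumed to
# be imported as np, as in the surrounding module.
def _example_merge_results():
    dtype = [('x', float), ('policy', '|S10')]
    exp1 = np.zeros((5,), dtype=dtype)
    exp2 = np.zeros((3,), dtype=dtype)
    res1 = {'y': np.ones((5, 4))}
    res2 = {'y': np.zeros((3, 4))}

    merged_exp, merged_res = merge_results((exp1, res1), (exp2, res2))
    # merged_exp now has 8 rows; merged_res['y'] has shape (8, 4)
    return merged_exp, merged_res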
def load_results(file_name):
    '''
    load the specified tar.gz file. the file is assumed to have been saved
    using save_results.

    :param file_name: the path of the file
    :raises: IOError if file not found

    '''
    outcomes = {}

    with tarfile.open(file_name, 'r') as z:
        # load experiments
        experiments = z.extractfile('experiments.csv')
        experiments = csv2rec(experiments)

        # load experiment metadata
        metadata = z.extractfile('experiments metadata.csv').readlines()
        metadata = [entry.strip() for entry in metadata]
        metadata = [tuple(entry.split(",")) for entry in metadata]
        metadata = np.dtype(metadata)

        # cast experiments to the dtype and names specified in the metadata
        temp_experiments = np.zeros((experiments.shape[0],), dtype=metadata)
        for i, entry in enumerate(experiments.dtype.descr):
            dtype = metadata[i]
            name = metadata.descr[i][0]
            temp_experiments[name][:] = experiments[entry[0]].astype(dtype)
        experiments = temp_experiments

        # load outcome metadata
        metadata = z.extractfile('outcomes metadata.csv').readlines()
        metadata = [entry.strip() for entry in metadata]
        metadata = [tuple(entry.split(",")) for entry in metadata]
        metadata = {entry[0]: entry[1:] for entry in metadata}

        # load outcomes
        for outcome, shape in metadata.iteritems():
            # strip the parentheses from the stored shape tuple
            shape = list(shape)
            shape[0] = shape[0][1:]
            shape[-1] = shape[-1][0:-1]

            temp_shape = []
            for entry in shape:
                if entry:
                    temp_shape.append(int(entry))
            shape = tuple(temp_shape)

            if len(shape) > 2:
                nr_files = shape[-1]

                data = np.empty(shape)
                for i in range(nr_files):
                    values = z.extractfile("{}_{}.csv".format(outcome, i))
                    values = read_csv(values, index_col=False,
                                      header=None).values
                    data[:, :, i] = values
            else:
                data = z.extractfile("{}.csv".format(outcome))
                data = read_csv(data, index_col=False, header=None).values
                data = np.reshape(data, shape)

            outcomes[outcome] = data

    info("results loaded successfully from {}".format(file_name))
    return experiments, outcomes
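# A hedged usage sketch, not part of the original module: load an archive
# written by save_results and inspect what comes back. The file name is
# illustrative; debug and numpy are assumed to be available as in the
# surrounding module.
def _example_inspect_results(file_name='example_results.tar.gz'):
    experiments, outcomes = load_results(file_name)

    # experiments is a structured array with one row per experiment
    debug("experiments dtype: {}".format(experiments.dtype))

    # outcomes maps each outcome name to an array whose first axis indexes
    # the experiments
    for name, values in outcomes.items():
        debug("{}: {}".format(name, values.shape))

    return experiments, outcomes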
def save_results(results, file_name):
    '''
    save the results to the specified tar.gz file. The results are stored as
    csv files. There is an experiments.csv, and a csv for each outcome. In
    addition, there is a metadata csv which contains the datatype information
    for each of the columns in the experiments array.

    :param results: the return of perform_experiments
    :param file_name: the path of the file
    :raises: IOError if file not found

    '''

    def add_file(tararchive, string_to_add, filename):
        tarinfo = tarfile.TarInfo(filename)
        tarinfo.size = len(string_to_add)
        tararchive.addfile(tarinfo, StringIO.StringIO(string_to_add))

    def save_numpy_array(fh, data):
        data = pd.DataFrame(data)
        data.to_csv(fh, header=False, index=False)

    experiments, outcomes = results
    with tarfile.open(file_name, 'w:gz') as z:
        # write the experiments to the archive
        experiments_file = StringIO.StringIO()
        rec2csv(experiments, experiments_file, withheader=True)
        add_file(z, experiments_file.getvalue(), 'experiments.csv')

        # write experiment metadata
        dtype = experiments.dtype.descr
        dtype = ["{},{}".format(*entry) for entry in dtype]
        dtype = "\n".join(dtype)
        add_file(z, dtype, 'experiments metadata.csv')

        # write outcome metadata
        outcome_names = outcomes.keys()
        outcome_meta = ["{},{}".format(outcome, outcomes[outcome].shape)
                        for outcome in outcome_names]
        outcome_meta = "\n".join(outcome_meta)
        add_file(z, outcome_meta, "outcomes metadata.csv")

        # outcomes
        for key, value in outcomes.iteritems():
            fh = StringIO.StringIO()

            nr_dim = len(value.shape)
            if nr_dim == 3:
                # a 3-D outcome is stored as one csv per entry along the
                # third axis
                for i in range(value.shape[2]):
                    data = value[:, :, i]
                    save_numpy_array(fh, data)
                    fh = fh.getvalue()
                    fn = '{}_{}.csv'.format(key, i)
                    add_file(z, fh, fn)
                    fh = StringIO.StringIO()
            else:
                save_numpy_array(fh, value)
                fh = fh.getvalue()
                add_file(z, fh, '{}.csv'.format(key))

    info("results saved successfully to {}".format(file_name))
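# A hedged round-trip sketch, not part of the original module: build a tiny,
# fake results tuple, save it, and load it back. Note how a 3-D outcome of
# shape (runs, timesteps, replications) is written as one csv per replication
# ('z_0.csv', 'z_1.csv', ...), while a 2-D outcome becomes a single 'y.csv'.
# All names, shapes, and the file name are illustrative; numpy is assumed to
# be imported as np.
def _example_save_and_load():
    experiments = np.zeros((4,), dtype=[('x', float), ('policy', '|S10')])
    experiments['x'] = np.arange(4)
    experiments['policy'] = 'none'

    outcomes = {'y': np.arange(20, dtype=float).reshape((4, 5)),
                'z': np.zeros((4, 5, 3))}

    save_results((experiments, outcomes), 'example_results.tar.gz')
    return load_results('example_results.tar.gz')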
def __init__(self, msis, processes=None, kwargs=None):
    '''
    :param msis: an iterable of model structure interfaces
    :param processes: nr. of processes to spawn; if None, it is set to
                      equal the nr. of cores
    :param kwargs: kwargs to be passed to :meth:`model_init`
    '''
    self._setup_queues()
    self._taskqueue = Queue.Queue(cpu_count() * 2)
    self._cache = {}
    self._state = RUN

    if processes is None:
        try:
            processes = cpu_count()
        except NotImplementedError:
            processes = 1
    info("nr of processes is " + str(processes))

    self.log_queue = multiprocessing.Queue()
    h = NullHandler()
    logging.getLogger(ema_logging.LOGGER_NAME).addHandler(h)

    # This thread will read from the subprocesses and write to the
    # main log's handlers.
    log_queue_reader = LogQueueReader(self.log_queue)
    log_queue_reader.start()

    self._pool = []
    working_dirs = []

    debug('generating workers')

    worker_root = None
    for i in range(processes):
        debug('generating worker ' + str(i))

        # generate a random string; this helps when running repeatedly after
        # crashes
        choice_set = (string.ascii_uppercase + string.digits +
                      string.ascii_lowercase)
        random_string = ''.join(random.choice(choice_set) for _ in range(5))

        workername = 'tpm_{}_PoolWorker_{}'.format(random_string, i)

        # setup working directories for parallel_ema
        for msi in msis:
            if msi.working_directory != None:
                if worker_root == None:
                    worker_root = os.path.dirname(
                        os.path.abspath(msis[0].working_directory))

                working_directory = os.path.join(worker_root, workername)

                # working_directory = tempfile.mkdtemp(suffix=workername,
                #                                      prefix='tmp_',
                #                                      dir=worker_root)

                working_dirs.append(working_directory)
                shutil.copytree(msi.working_directory,
                                working_directory)
                msi.set_working_directory(working_directory)

        w = LoggingProcess(
            self.log_queue,
            level=logging.getLogger(ema_logging.LOGGER_NAME).getEffectiveLevel(),
            target=worker,
            args=(self._inqueue, self._outqueue, msis, kwargs)
        )
        self._pool.append(w)

        w.name = w.name.replace('Process', workername)
        w.daemon = True
        w.start()
        debug(' worker ' + str(i) + ' generated')

    # thread for handling tasks
    self._task_handler = threading.Thread(
        target=CalculatorPool._handle_tasks,
        name='task handler',
        args=(self._taskqueue, self._quick_put, self._outqueue, self._pool)
    )
    self._task_handler.daemon = True
    self._task_handler._state = RUN
    self._task_handler.start()

    # thread for handling results
    self._result_handler = threading.Thread(
        target=CalculatorPool._handle_results,
        name='result handler',
        args=(self._outqueue, self._quick_get, self._cache, self.log_queue)
    )
    self._result_handler.daemon = True
    self._result_handler._state = RUN
    self._result_handler.start()

    # function for cleaning up when finalizing object
    self._terminate = Finalize(self, self._terminate_pool,
                               args=(self._taskqueue,
                                     self._inqueue,
                                     self._outqueue,
                                     self._pool,
                                     self._task_handler,
                                     self._result_handler,
                                     self._cache,
                                     working_dirs,
                                     ),
                               exitpriority=15)

    info("pool has been set up")
def __init__(self, msis, processes=None, kwargs=None):
    '''
    :param msis: an iterable of model structure interfaces
    :param processes: nr. of processes to spawn; if None, it is set to
                      equal the nr. of cores
    :param kwargs: kwargs to be passed to :meth:`model_init`
    '''
    if processes is None:
        try:
            processes = cpu_count()
        except NotImplementedError:
            processes = 1
    info("nr of processes is " + str(processes))

    # setup queues etc.
    self._setup_queues()
    self._taskqueue = Queue.Queue(processes * 2)
    self._cache = {}
    self._state = RUN

    # handling of logging
    self.log_queue = multiprocessing.Queue()
    h = NullHandler()
    logging.getLogger(ema_logging.LOGGER_NAME).addHandler(h)

    log_queue_reader = LogQueueReader(self.log_queue)
    log_queue_reader.start()

    # setup of the actual pool
    self._pool = []
    working_dirs = []

    debug('generating workers')

    worker_root = None
    for i in range(processes):
        debug('generating worker ' + str(i))

        workername = self._get_worker_name(i)

        # setup working directories for parallel_ema
        for msi in msis:
            if msi.working_directory != None:
                if worker_root == None:
                    wd = msis[0].working_directory
                    abs_wd = os.path.abspath(wd)
                    worker_root = os.path.dirname(abs_wd)

                wd_name = workername + msi.name
                working_directory = os.path.join(worker_root, wd_name)

                # working_directory = tempfile.mkdtemp(suffix=workername,
                #                                      prefix='tmp_',
                #                                      dir=worker_root)

                working_dirs.append(working_directory)
                shutil.copytree(msi.working_directory,
                                working_directory)
                msi.set_working_directory(working_directory)

        w = LoggingProcess(
            self.log_queue,
            level=logging.getLogger(ema_logging.LOGGER_NAME).getEffectiveLevel(),
            target=worker,
            args=(self._inqueue, self._outqueue, msis, kwargs)
        )
        self._pool.append(w)

        w.name = w.name.replace('Process', workername)
        w.daemon = True
        w.start()
        debug(' worker ' + str(i) + ' generated')

    # thread for handling tasks
    self._task_handler = threading.Thread(
        target=CalculatorPool._handle_tasks,
        name='task handler',
        args=(self._taskqueue, self._quick_put, self._outqueue, self._pool)
    )
    self._task_handler.daemon = True
    self._task_handler._state = RUN
    self._task_handler.start()

    # thread for handling results
    self._result_handler = threading.Thread(
        target=CalculatorPool._handle_results,
        name='result handler',
        args=(self._outqueue, self._quick_get, self._cache, self.log_queue)
    )
    self._result_handler.daemon = True
    self._result_handler._state = RUN
    self._result_handler.start()

    # function for cleaning up when finalizing object
    self._terminate = Finalize(self, self._terminate_pool,
                               args=(self._taskqueue,
                                     self._inqueue,
                                     self._outqueue,
                                     self._pool,
                                     self._task_handler,
                                     self._result_handler,
                                     self._cache,
                                     working_dirs,
                                     ),
                               exitpriority=15)

    info("pool has been set up")
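# The _get_worker_name helper used above is not shown here; the following is
# only a hedged sketch of what it could look like, mirroring the inline naming
# logic of the older __init__ variant: a short random prefix reduces the
# chance of colliding with leftover working directories when re-running after
# a crash. The function name is hypothetical; string and random are assumed to
# be imported at module level.
def _example_get_worker_name(i):
    choice_set = (string.ascii_uppercase + string.digits +
                  string.ascii_lowercase)
    random_string = ''.join(random.choice(choice_set) for _ in range(5))
    return 'tpm_{}_PoolWorker_{}'.format(random_string, i)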