def run(self):
  njobs = len(self.jobs)
  dataset = Dataset(self.path)
  if self.n_cache <= 1:
    cache_limit = max(2, int(0.12 * njobs))
  else:
    cache_limit = int(self.n_cache)
  # ====== indices ====== #
  databases = defaultdictkey(
      lambda key: MmapDict(path=os.path.join(dataset.path, key),
                           cache_size=10000, read_only=False))
  last_start = defaultdict(int)
  # ====== statistic ====== #
  # load old statistics
  stats = defaultdict(lambda: [0, 0])  # name -> (sum1, sum2)
  for key in dataset.keys():
    if 'sum1' == key[-4:]:
      stats[key[:-4]][0] = dataset[key][:]
    elif 'sum2' == key[-4:]:
      stats[key[:-4]][1] = dataset[key][:]
  # all data are cached then periodically flushed
  cache = defaultdict(list)
  n_processed = [0]  # store the value as a mutable reference

  # ====== helper ====== #
  def flush_feature(feat_name, X_cached):
    if len(X_cached) > 0:
      X_cached = np.concatenate(X_cached, 0)
      # flush data
      if feat_name in dataset:
        dataset[feat_name].append(X_cached)
      else:
        dataset[(feat_name, 'memmap')] = X_cached

  # ====== repeated for each result returned ====== #
  def post_processing(result):
    # search for the file name
    if self.identifier not in result:
      raise RuntimeError(
          "Cannot find identifier '%s' in returned dictionary" %
          self.identifier)
    file_name = result[self.identifier]
    # invalid file_name
    if not is_string(file_name):
      raise RuntimeError(
          "Cannot find file name in returned features "
          "list, the file name can be specified in key: 'name', 'path' "
          "and the type of the value must be string. All available "
          "keys are: %s" % str(result.keys()))
    # store all new indices
    # mapping: feat_name -> X.shape[0]
    all_indices = {}
    # processing
    for feat_name, X in result.items():
      # reject invalid feat_name
      if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
        raise RuntimeError(
            "Returned features' name cannot be one "
            "of the following: 'config', 'pipeline', 'sum1', 'sum2'.")
      # ignore some feat_name
      if feat_name in ('name',):
        continue
      # if numpy ndarray, save to MmapData
      if isinstance(X, np.ndarray) or \
          'sum1' == feat_name[-4:] or \
          'sum2' == feat_name[-4:]:
        # save statistics instead
        if 'sum1' == feat_name[-4:]:
          stats[feat_name[:-4]][0] += X
        elif 'sum2' == feat_name[-4:]:
          stats[feat_name[:-4]][1] += X
        # save features array
        else:
          all_indices[feat_name] = X.shape[0]
          # cache data, only if we have more than 0 sample
          if X.shape[0] > 0:
            cache[feat_name].append(X)
      # all other kinds of data are saved to MmapDict
      else:
        databases[feat_name][file_name] = X
      # remove data
      del X
    # ====== update indices ====== #
    if len(all_indices) > 0:
      for feat_name, n in all_indices.items():
        ids_name = 'indices_%s' % feat_name
        databases[ids_name][file_name] = (last_start[ids_name],
                                          last_start[ids_name] + n)
        last_start[ids_name] += n
    # ====== flush cache ====== #
    n_processed[0] += 1
    if n_processed[0] % cache_limit == 0:
      for feat_name, X_cached in cache.items():
        flush_feature(feat_name, X_cached)
      cache.clear()
    # ====== update progress ====== #
    return file_name

  # ====== mapping function ====== #
  def _map_func(dat):
    try:
      ret = self.extractor.transform(dat)
    except Exception as e:  # non-handled exception
      ret = '\n========\n'
      ret += 'Time  : `%s`\n' % str(get_formatted_datetime(only_number=False))
      ret += 'Error : `%s`\n' % str(e)
      ret += 'Input : `%s`\n' % str(dat)
      import traceback
      etype, value, tb = sys.exc_info()
      for line in traceback.TracebackException(
          type(value), value, tb, limit=None).format(chain=True):
        ret += line
    return ret

  # ====== processing ====== #
  mpi = MPI(jobs=self.jobs, func=_map_func,
            ncpu=self.n_cpu, batch=1, hwm=self.n_cpu * 3,
            backend='python')
  # initialize
  prog = Progbar(target=njobs, name=self.path,
                 interval=0.12, print_report=True, print_summary=True)
  start_time = time.time()
  last_time = time.time()
  last_count = 0
  with open(self._log_path, 'w') as flog:
    # write the log header
    flog.write('============================\n')
    flog.write('Start Time : %s\n' %
               get_formatted_datetime(only_number=False))
    flog.write('Outpath    : %s\n' % self.path)
    flog.write('Extractor  : %s\n' % '->'.join(
        [s[-1].__class__.__name__ for s in self.extractor.steps]))
    flog.write('#Jobs      : %d\n' % njobs)
    flog.write('#CPU       : %d\n' % self.n_cpu)
    flog.write('#Cache     : %d\n' % cache_limit)
    flog.write('============================\n')
    flog.flush()
    # start processing the file list
    for count, result in enumerate(mpi):
      # non-handled exception
      if isinstance(result, string_types):
        flog.write(result)
        flog.flush()
        self._error_log.append(result)
        if self.stop_on_failure:
          raise RuntimeError(result)
      # some error might have happened
      elif isinstance(result, ExtractorSignal):
        flog.write(str(result))
        flog.flush()
        if result.action == 'error':
          prog.add_notification(str(result))
          raise RuntimeError(
              "ExtractorSignal requests terminating processor!")
        elif result.action == 'warn':
          prog.add_notification(str(result))
        elif result.action == 'ignore':
          self._error_log.append(result)
        else:
          raise RuntimeError(
              "Unknown action from ExtractorSignal: %s" % result.action)
        prog['File'] = '%-48s' % result.message[:48]
      # otherwise, no error happened, do post-processing
      else:
        name = post_processing(result)
        prog['File'] = '%-48s' % str(name)[:48]
      # update progress
      prog.add(1)
      # manually write to the external log file
      if (count + 1) % max(1, int(0.01 * njobs)) == 0:
        curr_time = time.time()
        elap = curr_time - start_time
        avg_speed = (count + 1) / elap
        cur_speed = (count + 1 - last_count) / (curr_time - last_time)
        avg_est = (njobs - count - 1) / avg_speed
        cur_est = (njobs - count - 1) / cur_speed
        flog.write(
            '[%s] Processed: %d(files)  Remain: %d(files)  Elap.: %.2f(secs)\n'
            '   Avg.Spd: %.2f(obj/sec)  Avg.Est.: %.2f(secs)\n'
            '   Cur.Spd: %.2f(obj/sec)  Cur.Est.: %.2f(secs)\n' %
            (get_formatted_datetime(only_number=False),
             count + 1, njobs - count - 1, elap,
             avg_speed, avg_est, cur_speed, cur_est))
        flog.flush()
        last_time = curr_time
        last_count = count + 1
  # ====== end, flush for the last time ====== #
  for feat_name, X_cached in cache.items():
    flush_feature(feat_name, X_cached)
  cache.clear()
  cache = None
  dataset.flush()
  prog.add_notification("Flushed all data to disk")
  # ====== saving indices ====== #
  for name, db in databases.items():
    db.flush(save_all=True)
    db_size = len(db)
    db.close()
    prog.add_notification(
        'Flushed MmapDict "%s" to disk, size: %s' %
        (ctext(name, 'yellow'), ctext(str(db_size), 'yellow')))

  # ====== save mean and std ====== #
  def save_mean_std(sum1, sum2, name):
    N = dataset[name.split('_')[0]].shape[0]
    mean = sum1 / N
    std = np.sqrt(sum2 / N - np.power(mean, 2))
    if np.any(np.isnan(mean)):
      wprint('Mean contains NaN, name: %s' % name)
    if np.any(np.isnan(std)):
      wprint('Std contains NaN, name: %s' % name)
    dataset[name + 'sum1'] = sum1
    dataset[name + 'sum2'] = sum2
    dataset[name + 'mean'] = mean
    dataset[name + 'std'] = std

  # save all statistics
  if len(stats) > 0:
    for feat_name, (sum1, sum2) in stats.items():
      save_mean_std(sum1, sum2, feat_name)
      prog.add_notification(
          'Saved statistics of: %s, shape: %s' %
          (ctext(feat_name.split('_')[0], 'yellow'),
           ctext(str(sum1.shape), 'yellow')))
  # ====== dataset flush() ====== #
  dataset.flush()
  dataset.close()
  # ====== saving the extractor ====== #
  # it is not a good idea to save the extractor every time:
  # pipeline_path = os.path.join(dataset.path, 'pipeline')
  # with open(pipeline_path, 'wb') as f:
  #   cPickle.dump(self.extractor, f, protocol=2)
  # prog.add_notification("Saved Extractor pipeline at: %s" %
  #                       ctext(pipeline_path, 'yellow'))
  # ====== saving the configuration ====== #
  config_path = os.path.join(dataset.path, 'config')
  config = MmapDict(config_path)
  config['__configuration_time__'] = time.time()
  config['__processor__'] = self.path
  for i in dir(self):
    if _default_module.match(i) is not None:
      continue
    j = getattr(self, i)
    if isinstance(j, (Number, string_types, bool)):
      config[i] = j
  config.flush(save_all=True)
  self.config = {i: j for i, j in config}
  config.close()
  prog.add_notification("Saved configuration at: %s" %
                        ctext(config_path, 'yellow'))
  # ====== final notification ====== #
  prog.add_notification("Closed all datasets.")
  prog.add_notification("Dataset at path: %s" %
                        ctext(dataset.path, 'yellow'))
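# ---------------------------------------------------------------------------
# The 'sum1'/'sum2' bookkeeping in `run` above is a standard running-moments
# scheme: each extractor result contributes per-feature first and second
# moments, and `save_mean_std` recovers mean = sum1 / N and
# std = sqrt(sum2 / N - mean**2). Similarly, each 'indices_<feat>' MmapDict
# maps file_name -> (start, end), the row span of that file inside the
# concatenated memmap. A minimal, self-contained sketch of the statistics
# identity (all names below are illustrative, not part of this module):
#
#   import numpy as np
#   chunks = [np.random.randn(n, 13) for n in (100, 150, 80)]  # fake features
#   sum1 = sum(x.sum(axis=0) for x in chunks)         # running sum of X
#   sum2 = sum((x ** 2).sum(axis=0) for x in chunks)  # running sum of X**2
#   N = sum(x.shape[0] for x in chunks)
#   mean = sum1 / N
#   std = np.sqrt(sum2 / N - mean ** 2)
#   # matches the direct computation up to floating-point error:
#   X = np.concatenate(chunks, axis=0)
#   assert np.allclose(mean, X.mean(axis=0))
#   assert np.allclose(std, X.std(axis=0))
#   # while the indices bookkeeping would record, e.g.:
#   # {'file1': (0, 100), 'file2': (100, 250), 'file3': (250, 330)}
# ---------------------------------------------------------------------------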
class Task(object):
  """
  Parameters
  ----------
  func: call-able
      function that will be executed for each iteration
  data: single or list of odin.fuel.Data, numpy.ndarray
      iterate over all these data and execute the function on them.
  epoch: int
      number of epochs to repeat over the data
  p: float (0.0 - 1.0)
      probability that `func` will be executed for each iteration
  batch_size: int (> 0)
      number of samples for each iteration
  seed: int
      random seed for shuffling the data
  shuffle_level: int (0, 1, 2)
      if 0, shuffle the file lists
      if 1, shuffle the buffer (i.e. list of processing files)
      and all the previous
      if 2, shuffle the returned batch and all the previous
  callbacks: None, or list of `odin.training.Callback`
      callbacks triggered during the execution of the task
  labels: None, or list of string
      labels for printing the confusion matrix in `odin.utils.Progbar`
  name: None or string
      unique name for Task identity.
  verbose : {0, 1, 2, 3, 4, 5}
      verbose level controlling the log output
      0 - turn off all logs
      1 - progress off, only notification
      2 - progress off, notification and summary
      3 - progress on, nothing else
      4 - progress on, notification and summary
      5 - progress on, notification, summary and batch report
  """

  def __init__(self, func, data, epoch=1, p=1.0,
               batch_size=128, seed=None, shuffle_level=2,
               callbacks=None, labels=None, name=None, verbose=2):
    super(Task, self).__init__()
    self.set_func(func, data)
    # this Progbar will record the history as well
    self._labels = [str(l) for l in labels] \
        if labels is not None else None
    self._progbar = Progbar(target=self.nb_samples, name=name,
                            interval=0.,
                            print_report=True, print_summary=True)
    self._progbar.set_labels(self._labels)
    # ====== set callback and verbose ====== #
    self._callback = CallbackList(callbacks)
    self.set_verbose(verbose)
    # ====== assign other arguments ====== #
    self._nb_epoch = epoch
    self._p = np.clip(p, 0., 1.)
    self._seed = seed
    self.set_batch(batch_size, seed, shuffle_level)
    self._name = name
    # ====== current info ====== #
    self._curr_epoch = 0
    self._curr_iter = 0
    self._curr_samples = 0
    self._curr_epoch_iter = 0
    self._curr_epoch_samples = 0
    self._callback_msg = []
    # ====== iter tracking ====== #
    self._created_iter = None
    self._stop = False

  def __str__(self):
    return "<Task:'%s' p:%s bs:%s #ep:%s/%s #it:%s/%s #n:%s/%s %s>" % \
        (ctext(self.name, 'lightyellow'),
         ctext(self.probability, 'cyan'),
         ctext(self.batch_size, 'cyan'),
         ctext(self.curr_epoch, 'lightcyan'),
         ctext(self.nb_epoch, 'cyan'),
         ctext(self.curr_epoch_iter, 'lightcyan'),
         ctext(self.curr_iter, 'cyan'),
         ctext(self.curr_epoch_samples, 'lightcyan'),
         ctext(self.curr_samples, 'cyan'),
         ','.join([ctext(i.__class__.__name__, 'cyan')
                   for i in self._callback._callbacks]))

  def __getstate__(self):
    return (self._progbar, self._nb_epoch, self._p, self._name,
            self._batch_size, self._rng, self._seed,
            self._shuffle_level, self._verbose)

  def __setstate__(self, states):
    (self._progbar, self._nb_epoch, self._p, self._name,
     self._batch_size, self._rng, self._seed,
     self._shuffle_level, self._verbose) = states
    # ====== current info ====== #
    self._curr_epoch = 0
    self._curr_iter = 0
    self._curr_samples = 0
    self._curr_epoch_iter = 0
    self._curr_epoch_samples = 0
    self._callback_msg = []
    # ====== iter tracking ====== #
    self._created_iter = None
    self._stop = False
    # ====== reset value of func and data ====== #
    self._func = None
    self._data = None

  def set_callbacks(self, callbacks):
    self._callback.set_callbacks(callbacks)
    # notifications stay off only when verbose is 0
    self._callback.set_notification(self._verbose != 0)
    return self

  def set_verbose(self, verbose):
    verbose = int(verbose)
    if verbose not in (0, 1, 2, 3, 4, 5):
      raise ValueError(
          "Only support verbose value: 0, 1, 2, 3, 4, 5; but given: %s" %
          str(verbose))
    self._verbose = verbose
    # (notification, progress, summary, report) for each verbose level
    notification, progress, summary, report = {
        0: (False, False, False, False),  # turn off everything
        1: (True, False, False, False),   # only notification
        2: (True, False, True, False),    # notification + summary
        3: (False, True, False, False),   # progress only
        4: (True, True, True, False),     # progress + notification + summary
        5: (True, True, True, True),      # everything on
    }[verbose]
    self._callback.set_notification(notification)
    self._progbar.print_progress = progress
    self._progbar.print_summary = summary
    self._progbar.print_report = report

  def set_func(self, func, data):
    # ====== check function ====== #
    self._func = func
    if isinstance(func, K.Function):
      self._output_info = [(o.name, o.shape.as_list())
                           for o in self._func.outputs]
    elif hasattr(func, '__call__'):
      self._output_info = []  # no info (normal function)
    else:
      raise ValueError("No support for function type: %s" %
                       func.__class__.__name__)
    # ====== check data ====== #
    if not isinstance(data, (tuple, list)):
      data = [data]
    self._data = [fuel.as_data(i, copy=not isinstance(i, fuel.Feeder))
                  for i in data]
    self._nb_samples = min(d.iter_len for d in self._data)
    return self

  def set_batch(self, batch_size=None, seed=-1, shuffle_level=None):
    if batch_size is not None:
      self._batch_size = batch_size
    if seed is None or seed >= 0:
      if seed is not None:
        self._rng = np.random.RandomState(seed)
      else:
        # no seed: a dummy rng that disables re-seeding but still
        # provides `rand` for the probability check
        self._rng = struct()
        self._rng.randint = lambda x: None
        self._rng.rand = get_rng().rand
    if shuffle_level is not None:
      self._shuffle_level = min(max(int(shuffle_level), 0), 2)
    return self

  # ==================== Properties ==================== #
  @property
  def history(self):
    """
    Return
    ------
    dictionary mapping epoch_id to the batch returns of every output
    tensor:
        {0: {tensor_name0: [batch_return1, batch_return2, ...],
             tensor_name1: [batch_return1, batch_return2, ...],
             ...},
         1: {tensor_name0: [batch_return1, batch_return2, ...],
             tensor_name1: [batch_return1, batch_return2, ...],
             ...},
         ...}

    Example
    -------
    >>> for epoch_id, values in task.history.items():
    >>>   print('Epoch:', epoch_id)
    >>>   for tensor_name, v in values.items():
    >>>     print(' ', tensor_name, len(v))
    """
    return self._progbar.history

  @property
  def progbar(self):
    return self._progbar

  @property
  def name(self):
    return str(self._name)

  @property
  def labels(self):
    return self._labels

  @property
  def nb_epoch(self):
    return self._nb_epoch

  @property
  def nb_samples(self):
    """ Estimated number of samples for each epoch """
    return self._nb_samples

  @property
  def probability(self):
    """ Chance that `func` will be executed during an iteration """
    return self._p

  @property
  def iter_per_epoch(self):
    """ Estimated number of iterations for each epoch """
    return int(np.ceil(self._nb_samples / self._batch_size))

  @property
  def batch_size(self):
    return self._batch_size

  @property
  def curr_epoch(self):
    """ Total number of epochs finished since the beginning of the Task """
    return self._curr_epoch

  @property
  def curr_iter(self):
    """ Total number of iterations finished since the beginning of the Task """
    return self._curr_iter

  @property
  def curr_samples(self):
    """ Total number of samples finished since the beginning of the Task """
    return self._curr_samples

  @property
  def curr_epoch_iter(self):
    """ Number of iterations within the current epoch """
    return self._curr_epoch_iter

  @property
  def curr_epoch_samples(self):
    """ Number of samples within the current epoch """
    return self._curr_epoch_samples

  @property
  def callback_msg(self):
    return self._callback_msg

  # ==================== control function ==================== #
  def stop(self):
    """ Stop all iterations running for this Task """
    if self._created_iter is not None:
      self._stop = True
      # just run to the end of the iterator
      for i in self._created_iter:
        pass
      self._stop = False
      self._created_iter = None

  def copy(self):
    return Task(self._func, self._data,
                epoch=self.nb_epoch, p=self.probability,
                batch_size=self.batch_size, seed=self._seed,
                shuffle_level=self._shuffle_level,
                name=self._name, verbose=self._verbose)

  def __iter(self):
    """
    Yield
    -----
    One of the following:
      * 'task_start'  : beginning of the task
      * 'epoch_start' : beginning of an epoch
      * 'epoch_end'   : an epoch ended
      * 'task_end'    : the task ended
      * the results of executing `func` on one batch of data

    Note
    ----
    'task_end' also marks the end of the final epoch
    """
    yield None  # just to initialize the iterator
    self._callback_msg = self._callback.task_start(self)
    yield 'task_start'
    if self._stop:
      yield 'task_end'
    else:
      # ====== start of training ====== #
      while self._curr_epoch < self._nb_epoch:
        self._callback_msg = self._callback.epoch_start(self, self._data)
        yield 'epoch_start'
        seed = self._rng.randint(10e8)
        # keep `data_it` a list of iterators so the stop-signal loop
        # below works; if there is only 1 Data, don't zip or we would
        # mess up the batches
        data_it = [iter(d.set_batch(batch_size=self._batch_size,
                                    seed=seed,
                                    shuffle_level=self._shuffle_level))
                   for d in self._data]
        data = data_it[0] if len(data_it) == 1 else zip(*data_it)
        # ====== start the iteration ====== #
        self._curr_epoch_samples = 0
        self._curr_epoch_iter = 0
        with self._progbar.safe_progress():
          for x in data:
            # preprocess the data
            if not isinstance(x, (tuple, list)):
              x = [x]
            # update some info
            shape0 = x[0].shape[0]
            self._curr_samples += shape0
            self._curr_iter += 1
            self._curr_epoch_samples += shape0
            self._curr_epoch_iter += 1
            self._callback_msg = self._callback.batch_start(self, x)
            # apply the function
            if self.probability >= 1. or \
                self._rng.rand() < self.probability:
              results = self._func(*x)
              # add msg from batch_end event
              self._callback_msg += self._callback.batch_end(self, results)
              # return results
              yield results
              # update the progress bar (scalar and tensor returns are
              # stored the same way, under the output name)
              for (name, shape), res in zip(self._output_info,
                                            as_tuple(results)):
                self._progbar[name] = res
              self._progbar.add(shape0)
            # check TERMINATE signal
            if self._stop:
              # send the stop signal to the data iterators as well
              for it in data_it:
                if hasattr(it, 'stop'):
                  it.stop()
                else:  # just exhaust the iterator
                  for _ in it:
                    pass
              # break the epoch loop
              break
        # ====== epoch-end signaling ====== #
        self._curr_epoch += 1
        self._callback_msg = self._callback.epoch_end(
            self, self._progbar.history[self._curr_epoch - 1])
        yield 'epoch_end'
        # ====== check if we got the right number per epoch ====== #
        if self._curr_epoch_samples != self._nb_samples:
          # make sure it is not smaller than the real number
          self._nb_samples = self._curr_epoch_samples
        # ====== end of epoch or task ====== #
        if self._stop or self._curr_epoch >= self._nb_epoch:
          self._callback_msg = self._callback.task_end(
              self, self._progbar.history)
          yield 'task_end'
          # show the notification
          if self._verbose >= 1 and self._verbose != 3:
            self._progbar.add_notification(
                'Task "%s" ended!' % str(self.name))
          break
    # ====== end of iteration ====== #
    self._created_iter = None

  def __iter__(self):
    if self._created_iter is None:
      # reset all information
      self._curr_epoch = 0
      self._curr_iter = 0
      self._curr_samples = 0
      self._curr_epoch_iter = 0
      self._curr_epoch_samples = 0
      self._callback_msg = []
      # create a new iterator
      self._created_iter = self.__iter()
      # initialize the iteration
      next(self._created_iter)
    return self._created_iter

  def __del__(self):
    self.stop()
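# ---------------------------------------------------------------------------
# A minimal usage sketch of the Task iteration protocol (hypothetical names:
# `train_fn`, `X` and `y` are placeholders, not part of this module).
# Iterating a Task yields the string signals 'task_start', 'epoch_start',
# 'epoch_end' and 'task_end' interleaved with the per-batch results of
# `func`:
#
#   task = Task(func=train_fn, data=[X, y],
#               epoch=3, batch_size=128, shuffle_level=2,
#               name='train', verbose=2)
#   for ret in task:
#     if ret == 'epoch_end':
#       print('Finished epoch:', task.curr_epoch)
#     elif not isinstance(ret, string_types):
#       pass  # `ret` is whatever `train_fn` returned for one batch
# ---------------------------------------------------------------------------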