def _test_model(model: Component, metrics_functions: List[Tuple[str, Callable]],
                iterator: DataLearningIterator, batch_size=-1, data_type='valid',
                start_time: float = None) -> Dict[str, Union[int, OrderedDict, str]]:
    if start_time is None:
        start_time = time.time()

    val_y_true = []
    val_y_predicted = []
    for x, y_true in iterator.gen_batches(batch_size, data_type, shuffle=False):
        y_predicted = list(model(list(x)))
        val_y_true += y_true
        val_y_predicted += y_predicted

    metrics = [(s, f(val_y_true, val_y_predicted)) for s, f in metrics_functions]

    report = {
        'eval_examples_count': len(val_y_true),
        'metrics': prettify_metrics(metrics),
        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
    }
    return report
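# A minimal, self-contained sketch (not part of the original module) of how the
# (name, callable) pairs in metrics_functions are applied above; exact_match is a
# hypothetical stand-in for a registered metric, not a DeepPavlov function.
def _metrics_pairs_sketch():
    def exact_match(y_true, y_predicted):
        return sum(t == p for t, p in zip(y_true, y_predicted)) / max(len(y_true), 1)

    metrics_functions = [('exact_match', exact_match)]
    val_y_true = ['a', 'b', 'c']
    val_y_predicted = ['a', 'b', 'x']
    # same comprehension as in _test_model -> [('exact_match', 0.666...)]
    return [(s, f(val_y_true, val_y_predicted)) for s, f in metrics_functions]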
def _test_model(model: Component, metrics_functions: List[Tuple[str, Callable]],
                iterator: DataLearningIterator, batch_size=-1, data_type='valid',
                start_time: float = None, show_examples=False) -> Dict[str, Union[int, OrderedDict, str]]:
    if start_time is None:
        start_time = time.time()

    val_y_true = []
    val_y_predicted = []
    for x, y_true in iterator.gen_batches(batch_size, data_type, shuffle=False):
        y_predicted = list(model(list(x)))
        val_y_true += y_true
        val_y_predicted += y_predicted

    metrics = [(s, f(val_y_true, val_y_predicted)) for s, f in metrics_functions]

    report = {
        'eval_examples_count': len(val_y_true),
        'metrics': prettify_metrics(metrics),
        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
    }

    if show_examples:
        try:
            report['examples'] = [{
                'x': x_item,
                'y_predicted': y_predicted_item,
                'y_true': y_true_item
            } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]
        except NameError:
            log.warning(f'Could not log examples for {data_type}, assuming it\'s empty')

    return report
def evaluate(self, iterator: DataLearningIterator, evaluation_targets: Optional[Iterable[str]] = None, *,
             print_reports: bool = True) -> Dict[str, dict]:
    """
    Run :meth:`test` on multiple data types using provided data iterator

    Args:
        iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation
        evaluation_targets: iterable of data types to evaluate on
        print_reports: a flag used to print evaluation reports as json lines

    Returns:
        a dictionary with data types as keys and evaluation reports as values
    """
    self._load()
    if evaluation_targets is None:
        evaluation_targets = self.evaluation_targets

    res = {}

    for data_type in evaluation_targets:
        data_gen = iterator.gen_batches(self.batch_size, data_type=data_type, shuffle=False)
        report = self.test(data_gen)
        res[data_type] = report
        if print_reports:
            print(json.dumps({data_type: report}, ensure_ascii=False, cls=NumpyArrayEncoder))

    return res
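# A hedged usage sketch (not executable as-is): how evaluate() might be called once a
# trainer and a data iterator have been built; `trainer` and `iterator` below are
# hypothetical objects, not names defined in this module.
#
#     reports = trainer.evaluate(iterator, evaluation_targets=('valid', 'test'))
#     for data_type, report in reports.items():
#         print(data_type, report['metrics'])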
def train_on_batches(self, iterator: DataLearningIterator) -> None:
    """Train pipeline on batches using provided data iterator and initialization parameters"""
    self.start_time = time.time()
    if self.validate_first:
        self._validate(iterator)

    while True:
        impatient = False
        self._send_event(event_name='before_train')
        for x, y_true in iterator.gen_batches(self.batch_size, data_type='train'):
            self.last_result = self._chainer.train_on_batch(x, y_true)
            if self.last_result is None:
                self.last_result = {}
            elif not isinstance(self.last_result, dict):
                self.last_result = {'loss': self.last_result}
            if 'loss' in self.last_result:
                self.losses.append(self.last_result.pop('loss'))

            self.train_batches_seen += 1
            self.examples += len(x)

            if self.log_every_n_batches > 0 and self.train_batches_seen % self.log_every_n_batches == 0:
                self._log(iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen)

            if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0:
                self._validate(iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen)

            self._send_event(event_name='after_batch')

            if 0 < self.max_batches <= self.train_batches_seen:
                impatient = True
                break

            if 0 < self.validation_patience <= self.patience:
                log.info('Ran out of patience')
                impatient = True
                break

        if impatient:
            break

        self.epoch += 1

        if self.log_every_n_epochs > 0 and self.epoch % self.log_every_n_epochs == 0:
            self._log(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch)

        if self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0:
            self._validate(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch)

        self._send_event(event_name='after_epoch')

        if 0 < self.max_epochs <= self.epoch:
            break

        if 0 < self.validation_patience <= self.patience:
            log.info('Ran out of patience')
            break
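# A minimal, self-contained sketch of the chained-comparison pattern used above for
# stopping conditions: `0 < limit <= seen` is True only when the limit is enabled
# (non-zero) and has been reached. The helper name is illustrative only.
def _limit_reached_sketch(limit: int, seen: int) -> bool:
    return 0 < limit <= seen

# _limit_reached_sketch(0, 1000)  -> False (a limit of 0 disables the check)
# _limit_reached_sketch(100, 99)  -> False
# _limit_reached_sketch(100, 100) -> True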
def _test_model(model: Chainer, metrics_functions: List[Metric],
                iterator: DataLearningIterator, batch_size=-1, data_type='valid',
                start_time: float = None, show_examples=False) -> Dict[str, Union[int, OrderedDict, str]]:
    if start_time is None:
        start_time = time.time()

    expected_outputs = list(set().union(model.out_params, *[m.inputs for m in metrics_functions]))

    outputs = {out: [] for out in expected_outputs}
    examples = 0
    for x, y_true in iterator.gen_batches(batch_size, data_type, shuffle=False):
        examples += len(x)
        y_predicted = list(model.compute(list(x), list(y_true), targets=expected_outputs))
        if len(expected_outputs) == 1:
            y_predicted = [y_predicted]
        for out, val in zip(outputs.values(), y_predicted):
            out += list(val)

    metrics = [(m.name, m.fn(*[outputs[i] for i in m.inputs])) for m in metrics_functions]

    report = {
        'eval_examples_count': examples,
        'metrics': prettify_metrics(metrics),
        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
    }

    if show_examples:
        try:
            # group accumulated outputs per example and keep only the tail that corresponds
            # to the last batch held in x and y_true (zipping with a dict would pair
            # examples with output names instead of predicted values)
            y_predicted = list(zip(*[outputs[k][-len(x):] for k in model.out_params]))
            if len(model.out_params) == 1:
                y_predicted = [y_predicted_item[0] for y_predicted_item in y_predicted]
            report['examples'] = [{
                'x': x_item,
                'y_predicted': y_predicted_item,
                'y_true': y_true_item
            } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]
        except NameError:
            log.warning(f'Could not log examples for {data_type}, assuming it\'s empty')

    return report
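# A minimal, self-contained sketch of how expected_outputs is built above: the union of
# the pipeline's out_params and every metric's inputs. The names 'y_pred', 'probas' and
# 'y_true' are illustrative only.
def _expected_outputs_sketch():
    out_params = ['y_pred', 'probas']
    metric_inputs = [['y_true', 'y_pred'], ['y_true', 'probas']]
    # set union, so duplicates collapse; the resulting order is unspecified
    return list(set().union(out_params, *metric_inputs))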
def _log(self, iterator: DataLearningIterator,
         tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None:
    self._send_event(event_name='before_log')
    if self.log_on_k_batches == 0:
        report = {
            'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5)))
        }
    else:
        data = islice(iterator.gen_batches(self.batch_size, data_type='train', shuffle=True),
                      self.log_on_k_batches)
        report = self.test(data, self.train_metrics, start_time=self.start_time)

    report.update({
        'epochs_done': self.epoch,
        'batches_seen': self.train_batches_seen,
        'train_examples_seen': self.examples
    })

    metrics: List[Tuple[str, float]] = list(report.get('metrics', {}).items()) + list(self.last_result.items())

    report.update(self.last_result)
    if self.losses:
        report['loss'] = sum(self.losses) / len(self.losses)
        self.losses.clear()
        metrics.append(('loss', report['loss']))

    if metrics and self.tensorboard_log_dir is not None:
        summary = self._tf.Summary()

        for name, score in metrics:
            summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score)
        self.tb_train_writer.add_summary(summary, tensorboard_index)
        self.tb_train_writer.flush()

    self._send_event(event_name='after_train_log', data=report)

    report = {'train': report}
    print(json.dumps(report, ensure_ascii=False))
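# A minimal, self-contained sketch of the itertools.islice pattern used above to limit
# train-metric evaluation to the first log_on_k_batches batches of a generator.
from itertools import islice

def _islice_sketch(batch_generator, k):
    # consumes at most k items from an otherwise long (or shuffled, endless) generator
    return list(islice(batch_generator, k))

# _islice_sketch(iter(range(10)), 3) -> [0, 1, 2]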
def _test_model(model: Component, metrics_functions: List[Tuple[str, Callable]],
                iterator: DataLearningIterator, batch_size=-1, data_type='valid',
                start_time: float = None, show_examples=False) -> Dict[str, Union[int, OrderedDict, str]]:
    if start_time is None:
        start_time = time.time()

    val_y_true = []
    val_y_predicted = []
    for x, y_true in iterator.gen_batches(batch_size, data_type, shuffle=False):
        y_predicted = list(model(list(x)))
        val_y_true += y_true
        val_y_predicted += y_predicted

    metrics = [(s, f(val_y_true, val_y_predicted)) for s, f in metrics_functions]

    report = {
        'eval_examples_count': len(val_y_true),
        'metrics': prettify_metrics(metrics),
        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
    }

    if show_examples:
        try:
            report['examples'] = [{
                'x': x_item,
                'y_predicted': y_predicted_item,
                'y_true': y_true_item
            } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]
        except NameError:
            log.warning(f'Could not log examples for {data_type}, assuming it\'s empty')

    return report
def _validate(self, iterator: DataLearningIterator,
              tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None:
    self._send_event(event_name='before_validation')
    report = self.test(iterator.gen_batches(self.batch_size, data_type='valid', shuffle=False),
                       start_time=self.start_time)

    report['epochs_done'] = self.epoch
    report['batches_seen'] = self.train_batches_seen
    report['train_examples_seen'] = self.examples

    metrics = list(report['metrics'].items())

    if tensorboard_tag is not None and self.tensorboard_log_dir is not None:
        summary = self._tf.Summary()
        for name, score in metrics:
            summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score)
        if tensorboard_index is None:
            tensorboard_index = self.train_batches_seen
        self.tb_valid_writer.add_summary(summary, tensorboard_index)
        self.tb_valid_writer.flush()

    m_name, score = metrics[0]

    if self.improved(score):
        self.patience = 0
        log.info('New best {} of {}'.format(m_name, score))
        self.best = score
        log.info('Saving model')
        self.save()
    else:
        self.patience += 1
        log.info('Did not improve on the {} of {}'.format(m_name, self.best))

    report['impatience'] = self.patience
    if self.validation_patience > 0:
        report['patience_limit'] = self.validation_patience

    self._send_event(event_name='after_validation', data=report)

    report = {'valid': report}
    print(json.dumps(report, ensure_ascii=False))
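# A minimal, self-contained sketch of the patience bookkeeping above, assuming
# 'maximize' metric optimization; the function name and inputs are illustrative only.
def _patience_sketch(validation_scores, patience_limit):
    best, patience = float('-inf'), 0
    for score in validation_scores:
        if score > best:
            best, patience = score, 0   # new best: reset patience (and save the model)
        else:
            patience += 1               # no improvement on this validation round
        if 0 < patience_limit <= patience:
            return 'ran out of patience'
    return 'training may continue'

# _patience_sketch([0.5, 0.6, 0.59, 0.58], 2) -> 'ran out of patience'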
def _train_batches(model: NNModel, iterator: DataLearningIterator, train_config: dict,
                   metrics_functions: List[Tuple[str, Callable]]) -> NNModel:

    default_train_config = {
        'epochs': 0,
        'max_batches': 0,
        'batch_size': 1,
        'metric_optimization': 'maximize',
        'validation_patience': 5,
        'val_every_n_epochs': 0,
        'log_every_n_batches': 0,
        'log_every_n_epochs': 0,
        # 'show_examples': False,
        'validate_best': True,
        'test_best': True,
        'tensorboard_log_dir': None,
    }

    train_config = dict(default_train_config, **train_config)

    if 'train_metrics' in train_config:
        train_metrics_functions = list(zip(train_config['train_metrics'],
                                           get_metrics_by_names(train_config['train_metrics'])))
    else:
        train_metrics_functions = metrics_functions

    if train_config['metric_optimization'] == 'maximize':
        def improved(score, best):
            return score > best
        best = float('-inf')
    elif train_config['metric_optimization'] == 'minimize':
        def improved(score, best):
            return score < best
        best = float('inf')
    else:
        raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))

    i = 0
    epochs = 0
    examples = 0
    saved = False
    patience = 0
    log_on = train_config['log_every_n_batches'] > 0 or train_config['log_every_n_epochs'] > 0
    train_y_true = []
    train_y_predicted = []
    losses = []
    start_time = time.time()
    break_flag = False

    if train_config['tensorboard_log_dir'] is not None:
        import tensorflow as tf
        tb_log_dir = expand_path(train_config['tensorboard_log_dir'])
        tb_train_writer = tf.summary.FileWriter(str(tb_log_dir / 'train_log'))
        tb_valid_writer = tf.summary.FileWriter(str(tb_log_dir / 'valid_log'))

    try:
        while True:
            for x, y_true in iterator.gen_batches(train_config['batch_size']):
                if log_on:
                    y_predicted = list(model(list(x)))
                    train_y_true += y_true
                    train_y_predicted += y_predicted
                loss = model.train_on_batch(x, y_true)
                if loss is not None:
                    losses.append(loss)
                i += 1
                examples += len(x)

                if train_config['log_every_n_batches'] > 0 and i % train_config['log_every_n_batches'] == 0:
                    metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in train_metrics_functions]
                    report = {
                        'epochs_done': epochs,
                        'batches_seen': i,
                        'examples_seen': examples,
                        'metrics': prettify_metrics(metrics),
                        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                    }

                    if losses:
                        report['loss'] = sum(losses) / len(losses)
                        losses = []

                    report = {'train': report}

                    if train_config['tensorboard_log_dir'] is not None:
                        for name, score in metrics:
                            metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/' + name,
                                                                            simple_value=score), ])
                            tb_train_writer.add_summary(metric_sum, i)

                        if losses:
                            loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/' + 'loss',
                                                                          simple_value=report['loss']), ])
                            tb_train_writer.add_summary(loss_sum, i)

                    print(json.dumps(report, ensure_ascii=False))
                    train_y_true.clear()
                    train_y_predicted.clear()

                if i >= train_config['max_batches'] > 0:
                    break_flag = True
                    break

                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }
                model.process_event(event_name='after_batch', data=report)
            if break_flag:
                break

            epochs += 1

            report = {
                'epochs_done': epochs,
                'batches_seen': i,
                'train_examples_seen': examples,
                'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
            }
            model.process_event(event_name='after_epoch', data=report)

            if train_config['log_every_n_epochs'] > 0 and epochs % train_config['log_every_n_epochs'] == 0\
                    and train_y_true:
                metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in train_metrics_functions]
                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'metrics': prettify_metrics(metrics),
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }

                if losses:
                    report['loss'] = sum(losses) / len(losses)
                    losses = []

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name,
                                                                        simple_value=score), ])
                        tb_train_writer.add_summary(metric_sum, epochs)

                    if losses:
                        loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + 'loss',
                                                                      simple_value=report['loss']), ])
                        tb_train_writer.add_summary(loss_sum, epochs)

                model.process_event(event_name='after_train_log', data=report)
                report = {'train': report}
                print(json.dumps(report, ensure_ascii=False))
                train_y_true.clear()
                train_y_predicted.clear()

            if train_config['val_every_n_epochs'] > 0 and epochs % train_config['val_every_n_epochs'] == 0:
                report = _test_model(model, metrics_functions, iterator,
                                     train_config['batch_size'], 'valid', start_time)

                report['epochs_done'] = epochs
                report['batches_seen'] = i
                report['train_examples_seen'] = examples

                metrics = list(report['metrics'].items())

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name,
                                                                        simple_value=score), ])
                        tb_valid_writer.add_summary(metric_sum, epochs)

                m_name, score = metrics[0]
                if improved(score, best):
                    patience = 0
                    log.info('New best {} of {}'.format(m_name, score))
                    best = score
                    log.info('Saving model')
                    model.save()
                    saved = True
                else:
                    patience += 1
                    log.info('Did not improve on the {} of {}'.format(m_name, best))

                report['impatience'] = patience
                if train_config['validation_patience'] > 0:
                    report['patience_limit'] = train_config['validation_patience']

                model.process_event(event_name='after_validation', data=report)
                report = {'valid': report}
                print(json.dumps(report, ensure_ascii=False))

                if patience >= train_config['validation_patience'] > 0:
                    log.info('Ran out of patience')
                    break

            if epochs >= train_config['epochs'] > 0:
                break
    except KeyboardInterrupt:
        log.info('Stopped training')

    if not saved:
        log.info('Saving model')
        model.save()

    return model
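# A minimal, self-contained sketch of how the improved() comparator and the initial
# best value are chosen above from train_config['metric_optimization'].
def _make_improved_sketch(metric_optimization):
    if metric_optimization == 'maximize':
        return (lambda score, best: score > best), float('-inf')
    if metric_optimization == 'minimize':
        return (lambda score, best: score < best), float('inf')
    raise ValueError("metric_optimization has to be one of ['maximize', 'minimize']")

# improved, best = _make_improved_sketch('minimize')
# improved(0.3, best) -> True (any finite loss improves on +inf)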
def _train_batches(model: Chainer, iterator: DataLearningIterator, train_config: dict,
                   metrics_functions: List[Metric], *, start_epoch_num: Optional[int] = None) -> NNModel:

    default_train_config = {
        'epochs': 0,
        'start_epoch_num': 0,
        'max_batches': 0,
        'batch_size': 1,
        'metric_optimization': 'maximize',
        'validation_patience': 5,
        'val_every_n_epochs': 0,
        'val_every_n_batches': 0,
        'log_every_n_batches': 0,
        'log_every_n_epochs': 0,
        'show_examples': False,  # read unconditionally below, so it needs a default
        'validate_best': True,
        'test_best': True,
        'tensorboard_log_dir': None,
    }

    train_config = dict(default_train_config, **train_config)

    if 'train_metrics' in train_config:
        train_metrics_functions = _parse_metrics(train_config['train_metrics'], model.in_y, model.out_params)
    else:
        train_metrics_functions = metrics_functions
    expected_outputs = list(set().union(model.out_params, *[m.inputs for m in train_metrics_functions]))

    if train_config['metric_optimization'] == 'maximize':
        def improved(score, best):
            return score > best
        best = float('-inf')
    elif train_config['metric_optimization'] == 'minimize':
        def improved(score, best):
            return score < best
        best = float('inf')
    else:
        raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))

    i = 0
    epochs = start_epoch_num if start_epoch_num is not None else train_config['start_epoch_num']
    examples = 0
    saved = False
    patience = 0
    log_on = train_config['log_every_n_batches'] > 0 or train_config['log_every_n_epochs'] > 0
    outputs = {key: [] for key in expected_outputs}
    losses = []
    start_time = time.time()
    break_flag = False

    if train_config['tensorboard_log_dir'] is not None:
        import tensorflow as tf
        tb_log_dir = expand_path(train_config['tensorboard_log_dir'])
        tb_train_writer = tf.summary.FileWriter(str(tb_log_dir / 'train_log'))
        tb_valid_writer = tf.summary.FileWriter(str(tb_log_dir / 'valid_log'))

    # validate first (important if model is pre-trained)
    if train_config['val_every_n_epochs'] > 0 or train_config['val_every_n_batches'] > 0:
        report = _test_model(model, metrics_functions, iterator,
                             train_config['batch_size'], 'valid', start_time, train_config['show_examples'])
        report['epochs_done'] = epochs
        report['batches_seen'] = i
        report['train_examples_seen'] = examples

        metrics = list(report['metrics'].items())

        m_name, score = metrics[0]
        if improved(score, best):
            patience = 0
            log.info('New best {} of {}'.format(m_name, score))
            best = score
            log.info('Saving model')
            model.save()
            saved = True
        else:
            patience += 1
            log.info('Did not improve on the {} of {}'.format(m_name, best))

        report['impatience'] = patience
        if train_config['validation_patience'] > 0:
            report['patience_limit'] = train_config['validation_patience']

        model.process_event(event_name='after_validation', data=report)
        report = {'valid': report}
        print(json.dumps(report, ensure_ascii=False))

    try:
        while True:
            for x, y_true in iterator.gen_batches(train_config['batch_size']):
                if log_on and len(train_metrics_functions) > 0:
                    y_predicted = list(model.compute(list(x), list(y_true), targets=expected_outputs))
                    if len(expected_outputs) == 1:
                        y_predicted = [y_predicted]
                    for out, val in zip(outputs.values(), y_predicted):
                        out += list(val)

                result = model.train_on_batch(x, y_true)
                if not isinstance(result, dict):
                    result = {'loss': result} if result is not None else {}
                if 'loss' in result:
                    losses.append(result['loss'])
                i += 1
                examples += len(x)

                if train_config['log_every_n_batches'] > 0 and i % train_config['log_every_n_batches'] == 0:
                    metrics = [(m.name, m.fn(*[outputs[i] for i in m.inputs])) for m in train_metrics_functions]
                    report = {
                        'epochs_done': epochs,
                        'batches_seen': i,
                        'examples_seen': examples,
                        'metrics': prettify_metrics(metrics),
                        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                    }
                    default_report_keys = list(report.keys())
                    report.update(result)

                    if train_config['show_examples']:
                        try:
                            y_predicted = zip(*[y_predicted_group
                                                for out_name, y_predicted_group in zip(expected_outputs, y_predicted)
                                                if out_name in model.out_params])
                            if len(model.out_params) == 1:
                                y_predicted = [y_predicted_item[0] for y_predicted_item in y_predicted]
                            report['examples'] = [{
                                'x': x_item,
                                'y_predicted': y_predicted_item,
                                'y_true': y_true_item
                            } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]
                        except NameError:
                            log.warning('Could not log examples as y_predicted is not defined')

                    if losses:
                        report['loss'] = sum(losses) / len(losses)
                        losses = []

                    model.process_event(event_name='after_train_log', data=report)

                    if train_config['tensorboard_log_dir'] is not None:
                        summ = tf.Summary()

                        for name, score in metrics:
                            summ.value.add(tag='every_n_batches/' + name, simple_value=score)
                        for name, score in report.items():
                            if name not in default_report_keys:
                                summ.value.add(tag='every_n_batches/' + name, simple_value=score)

                        tb_train_writer.add_summary(summ, i)
                        tb_train_writer.flush()

                    report = {'train': report}
                    print(json.dumps(report, ensure_ascii=False))
                    for out in outputs.values():
                        out.clear()

                if train_config['val_every_n_batches'] > 0 and i % train_config['val_every_n_batches'] == 0:
                    report = _test_model(model, metrics_functions, iterator,
                                         train_config['batch_size'], 'valid', start_time,
                                         train_config['show_examples'])
                    report['epochs_done'] = epochs
                    report['batches_seen'] = i
                    report['train_examples_seen'] = examples

                    metrics = list(report['metrics'].items())

                    if train_config['tensorboard_log_dir'] is not None:
                        summ = tf.Summary()
                        for name, score in metrics:
                            summ.value.add(tag='every_n_batches/' + name, simple_value=score)
                        tb_valid_writer.add_summary(summ, i)
                        tb_valid_writer.flush()

                    m_name, score = metrics[0]
                    if improved(score, best):
                        patience = 0
                        log.info('New best {} of {}'.format(m_name, score))
                        best = score
                        log.info('Saving model')
                        model.save()
                        saved = True
                    else:
                        patience += 1
                        log.info('Did not improve on the {} of {}'.format(m_name, best))

                    report['impatience'] = patience
                    if train_config['validation_patience'] > 0:
                        report['patience_limit'] = train_config['validation_patience']

                    model.process_event(event_name='after_validation', data=report)
                    report = {'valid': report}
                    print(json.dumps(report, ensure_ascii=False))

                    if patience >= train_config['validation_patience'] > 0:
                        log.info('Ran out of patience')
                        break_flag = True
                        break

                if i >= train_config['max_batches'] > 0:
                    break_flag = True
                    break

                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }
                model.process_event(event_name='after_batch', data=report)
            if break_flag:
                break

            epochs += 1

            report = {
                'epochs_done': epochs,
                'batches_seen': i,
                'train_examples_seen': examples,
                'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
            }
            model.process_event(event_name='after_epoch', data=report)

            if train_config['log_every_n_epochs'] > 0 and epochs % train_config['log_every_n_epochs'] == 0\
                    and outputs:
                metrics = [(m.name, m.fn(*[outputs[i] for i in m.inputs])) for m in train_metrics_functions]
                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'metrics': prettify_metrics(metrics),
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }
                default_report_keys = list(report.keys())
                report.update(result)

                if train_config['show_examples']:
                    try:
                        y_predicted = zip(*[y_predicted_group
                                            for out_name, y_predicted_group in zip(expected_outputs, y_predicted)
                                            if out_name in model.out_params])
                        if len(model.out_params) == 1:
                            y_predicted = [y_predicted_item[0] for y_predicted_item in y_predicted]
                        report['examples'] = [{
                            'x': x_item,
                            'y_predicted': y_predicted_item,
                            'y_true': y_true_item
                        } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]
                    except NameError:
                        log.warning('Could not log examples')

                if losses:
                    report['loss'] = sum(losses) / len(losses)
                    losses = []

                model.process_event(event_name='after_train_log', data=report)

                if train_config['tensorboard_log_dir'] is not None:
                    summ = tf.Summary()

                    for name, score in metrics:
                        summ.value.add(tag='every_n_epochs/' + name, simple_value=score)
                    for name, score in report.items():
                        if name not in default_report_keys:
                            summ.value.add(tag='every_n_epochs/' + name, simple_value=score)

                    tb_train_writer.add_summary(summ, epochs)
                    tb_train_writer.flush()

                report = {'train': report}
                print(json.dumps(report, ensure_ascii=False))
                for out in outputs.values():
                    out.clear()

            if train_config['val_every_n_epochs'] > 0 and epochs % train_config['val_every_n_epochs'] == 0:
                report = _test_model(model, metrics_functions, iterator,
                                     train_config['batch_size'], 'valid', start_time,
                                     train_config['show_examples'])
                report['epochs_done'] = epochs
                report['batches_seen'] = i
                report['train_examples_seen'] = examples

                metrics = list(report['metrics'].items())

                if train_config['tensorboard_log_dir'] is not None:
                    summ = tf.Summary()
                    for name, score in metrics:
                        summ.value.add(tag='every_n_epochs/' + name, simple_value=score)
                    tb_valid_writer.add_summary(summ, epochs)
                    tb_valid_writer.flush()

                m_name, score = metrics[0]
                if improved(score, best):
                    patience = 0
                    log.info('New best {} of {}'.format(m_name, score))
                    best = score
                    log.info('Saving model')
                    model.save()
                    saved = True
                else:
                    patience += 1
                    log.info('Did not improve on the {} of {}'.format(m_name, best))

                report['impatience'] = patience
                if train_config['validation_patience'] > 0:
                    report['patience_limit'] = train_config['validation_patience']

                model.process_event(event_name='after_validation', data=report)
                report = {'valid': report}
                print(json.dumps(report, ensure_ascii=False))

                if patience >= train_config['validation_patience'] > 0:
                    log.info('Ran out of patience')
                    break

            if epochs >= train_config['epochs'] > 0:
                break
    except KeyboardInterrupt:
        log.info('Stopped training')

    if not saved:
        log.info('Saving model')
        model.save()

    return model
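# A minimal, self-contained sketch of the config merging used above: keys from the
# user-provided train_config override the defaults, untouched defaults are kept.
def _merge_config_sketch():
    default_train_config = {'epochs': 0, 'batch_size': 1, 'validation_patience': 5}
    train_config = {'epochs': 10, 'batch_size': 64}
    return dict(default_train_config, **train_config)
    # -> {'epochs': 10, 'batch_size': 64, 'validation_patience': 5}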
def _train_batches(model: NNModel, iterator: DataLearningIterator, train_config: dict,
                   metrics_functions: List[Tuple[str, Callable]]) -> NNModel:

    default_train_config = {
        'epochs': 0,
        'max_batches': 0,
        'batch_size': 1,
        'metric_optimization': 'maximize',
        'validation_patience': 5,
        'val_every_n_epochs': 0,
        'log_every_n_batches': 0,
        'log_every_n_epochs': 0,
        'show_examples': False,  # read unconditionally below, so it needs a default
        'validate_best': True,
        'test_best': True,
        'tensorboard_log_dir': None,
    }

    train_config = dict(default_train_config, **train_config)

    if 'train_metrics' in train_config:
        train_metrics_functions = list(zip(train_config['train_metrics'],
                                           get_metrics_by_names(train_config['train_metrics'])))
    else:
        train_metrics_functions = metrics_functions

    if train_config['metric_optimization'] == 'maximize':
        def improved(score, best):
            return score > best
        best = float('-inf')
    elif train_config['metric_optimization'] == 'minimize':
        def improved(score, best):
            return score < best
        best = float('inf')
    else:
        raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))

    i = 0
    epochs = 0
    examples = 0
    saved = False
    patience = 0
    log_on = train_config['log_every_n_batches'] > 0 or train_config['log_every_n_epochs'] > 0
    train_y_true = []
    train_y_predicted = []
    losses = []
    start_time = time.time()
    break_flag = False

    if train_config['tensorboard_log_dir'] is not None:
        import tensorflow as tf
        tb_log_dir = expand_path(train_config['tensorboard_log_dir'])
        tb_train_writer = tf.summary.FileWriter(str(tb_log_dir / 'train_log'))
        tb_valid_writer = tf.summary.FileWriter(str(tb_log_dir / 'valid_log'))

    try:
        while True:
            for x, y_true in iterator.gen_batches(train_config['batch_size']):
                if log_on:
                    y_predicted = list(model(list(x)))
                    train_y_true += y_true
                    train_y_predicted += y_predicted
                loss = model.train_on_batch(x, y_true)
                if loss is not None:
                    losses.append(loss)
                i += 1
                examples += len(x)

                if train_config['log_every_n_batches'] > 0 and i % train_config['log_every_n_batches'] == 0:
                    metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in train_metrics_functions]
                    report = {
                        'epochs_done': epochs,
                        'batches_seen': i,
                        'examples_seen': examples,
                        'metrics': prettify_metrics(metrics),
                        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                    }

                    if train_config['show_examples']:
                        try:
                            report['examples'] = [{
                                'x': x_item,
                                'y_predicted': y_predicted_item,
                                'y_true': y_true_item
                            } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]
                        except NameError:
                            log.warning('Could not log examples as y_predicted is not defined')

                    if losses:
                        report['loss'] = sum(losses) / len(losses)
                        losses = []

                    report = {'train': report}

                    if train_config['tensorboard_log_dir'] is not None:
                        for name, score in metrics:
                            metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/' + name,
                                                                            simple_value=score), ])
                            tb_train_writer.add_summary(metric_sum, i)

                        if losses:
                            loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/' + 'loss',
                                                                          simple_value=report['loss']), ])
                            tb_train_writer.add_summary(loss_sum, i)

                    print(json.dumps(report, ensure_ascii=False))
                    train_y_true.clear()
                    train_y_predicted.clear()

                if i >= train_config['max_batches'] > 0:
                    break_flag = True
                    break

                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }
                model.process_event(event_name='after_batch', data=report)
            if break_flag:
                break

            epochs += 1

            report = {
                'epochs_done': epochs,
                'batches_seen': i,
                'train_examples_seen': examples,
                'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
            }
            model.process_event(event_name='after_epoch', data=report)

            if train_config['log_every_n_epochs'] > 0 and epochs % train_config['log_every_n_epochs'] == 0\
                    and train_y_true:
                metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in train_metrics_functions]
                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'metrics': prettify_metrics(metrics),
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }

                if train_config['show_examples']:
                    try:
                        report['examples'] = [{
                            'x': x_item,
                            'y_predicted': y_predicted_item,
                            'y_true': y_true_item
                        } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]
                    except NameError:
                        log.warning('Could not log examples')

                if losses:
                    report['loss'] = sum(losses) / len(losses)
                    losses = []

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name,
                                                                        simple_value=score), ])
                        tb_train_writer.add_summary(metric_sum, epochs)

                    if losses:
                        loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + 'loss',
                                                                      simple_value=report['loss']), ])
                        tb_train_writer.add_summary(loss_sum, epochs)

                model.process_event(event_name='after_train_log', data=report)
                report = {'train': report}
                print(json.dumps(report, ensure_ascii=False))
                train_y_true.clear()
                train_y_predicted.clear()

            if train_config['val_every_n_epochs'] > 0 and epochs % train_config['val_every_n_epochs'] == 0:
                report = _test_model(model, metrics_functions, iterator,
                                     train_config['batch_size'], 'valid', start_time,
                                     train_config['show_examples'])
                report['epochs_done'] = epochs
                report['batches_seen'] = i
                report['train_examples_seen'] = examples

                metrics = list(report['metrics'].items())

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name,
                                                                        simple_value=score), ])
                        tb_valid_writer.add_summary(metric_sum, epochs)

                m_name, score = metrics[0]
                if improved(score, best):
                    patience = 0
                    log.info('New best {} of {}'.format(m_name, score))
                    best = score
                    log.info('Saving model')
                    model.save()
                    saved = True
                else:
                    patience += 1
                    log.info('Did not improve on the {} of {}'.format(m_name, best))

                report['impatience'] = patience
                if train_config['validation_patience'] > 0:
                    report['patience_limit'] = train_config['validation_patience']

                model.process_event(event_name='after_validation', data=report)
                report = {'valid': report}
                print(json.dumps(report, ensure_ascii=False))

                if patience >= train_config['validation_patience'] > 0:
                    log.info('Ran out of patience')
                    break

            if epochs >= train_config['epochs'] > 0:
                break
    except KeyboardInterrupt:
        log.info('Stopped training')

    if not saved:
        log.info('Saving model')
        model.save()

    return model
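# A minimal, self-contained sketch of the 'time_spent' formatting used throughout:
# adding 0.5 before round() biases the reported whole seconds upward.
import datetime

def _time_spent_sketch(elapsed_seconds):
    return str(datetime.timedelta(seconds=round(elapsed_seconds + 0.5)))

# _time_spent_sketch(61.2) -> '0:01:02'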