def _train_batches(model: NNModel, iterator: DataLearningIterator, train_config: dict,
                   metrics_functions: List[Tuple[str, Callable]]) -> NNModel:

    default_train_config = {
        'epochs': 0,
        'max_batches': 0,
        'batch_size': 1,

        'metric_optimization': 'maximize',

        'validation_patience': 5,
        'val_every_n_epochs': 0,

        'log_every_n_batches': 0,
        'log_every_n_epochs': 0,
        'show_examples': False,

        'validate_best': True,
        'test_best': True,
        'tensorboard_log_dir': None,
    }

    # fill in missing options with the defaults above
    train_config = dict(default_train_config, **train_config)

    if 'train_metrics' in train_config:
        train_metrics_functions = list(zip(train_config['train_metrics'],
                                           get_metrics_by_names(train_config['train_metrics'])))
    else:
        train_metrics_functions = metrics_functions

    if train_config['metric_optimization'] == 'maximize':
        def improved(score, best):
            return score > best
        best = float('-inf')
    elif train_config['metric_optimization'] == 'minimize':
        def improved(score, best):
            return score < best
        best = float('inf')
    else:
        raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))

    i = 0
    epochs = 0
    examples = 0
    saved = False
    patience = 0
    log_on = train_config['log_every_n_batches'] > 0 or train_config['log_every_n_epochs'] > 0
    train_y_true = []
    train_y_predicted = []
    losses = []
    start_time = time.time()
    break_flag = False

    if train_config['tensorboard_log_dir'] is not None:
        import tensorflow as tf
        tb_log_dir = expand_path(train_config['tensorboard_log_dir'])

        tb_train_writer = tf.summary.FileWriter(str(tb_log_dir / 'train_log'))
        tb_valid_writer = tf.summary.FileWriter(str(tb_log_dir / 'valid_log'))

    try:
        while True:
            for x, y_true in iterator.gen_batches(train_config['batch_size']):
                if log_on:
                    y_predicted = list(model(list(x)))
                    train_y_true += y_true
                    train_y_predicted += y_predicted
                loss = model.train_on_batch(x, y_true)
                if loss is not None:
                    losses.append(loss)
                i += 1
                examples += len(x)

                # periodic logging on training data, every n batches
                if train_config['log_every_n_batches'] > 0 and i % train_config['log_every_n_batches'] == 0:
                    metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in train_metrics_functions]
                    report = {
                        'epochs_done': epochs,
                        'batches_seen': i,
                        'examples_seen': examples,
                        'metrics': prettify_metrics(metrics),
                        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                    }

                    if train_config['show_examples']:
                        try:
                            report['examples'] = [{
                                'x': x_item,
                                'y_predicted': y_predicted_item,
                                'y_true': y_true_item
                            } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]
                        except NameError:
                            log.warning('Could not log examples as y_predicted is not defined')

                    if losses:
                        report['loss'] = sum(losses) / len(losses)
                        losses = []

                    if train_config['tensorboard_log_dir'] is not None:
                        for name, score in metrics:
                            metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/' + name,
                                                                            simple_value=score), ])
                            tb_train_writer.add_summary(metric_sum, i)

                        if 'loss' in report:
                            loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/loss',
                                                                          simple_value=report['loss']), ])
                            tb_train_writer.add_summary(loss_sum, i)

                    report = {'train': report}
                    print(json.dumps(report, ensure_ascii=False))
                    train_y_true.clear()
                    train_y_predicted.clear()

                if i >= train_config['max_batches'] > 0:
                    break_flag = True
                    break

                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }
                model.process_event(event_name='after_batch', data=report)
            if break_flag:
                break

            epochs += 1

            report = {
                'epochs_done': epochs,
                'batches_seen': i,
                'train_examples_seen': examples,
                'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
            }
            model.process_event(event_name='after_epoch', data=report)

            # periodic logging on training data, every n epochs
            if train_config['log_every_n_epochs'] > 0 and epochs % train_config['log_every_n_epochs'] == 0 \
                    and train_y_true:
                metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in train_metrics_functions]
                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'metrics': prettify_metrics(metrics),
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }

                if train_config['show_examples']:
                    try:
                        report['examples'] = [{
                            'x': x_item,
                            'y_predicted': y_predicted_item,
                            'y_true': y_true_item
                        } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]
                    except NameError:
                        log.warning('Could not log examples')

                if losses:
                    report['loss'] = sum(losses) / len(losses)
                    losses = []

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name,
                                                                        simple_value=score), ])
                        tb_train_writer.add_summary(metric_sum, epochs)

                    if 'loss' in report:
                        loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/loss',
                                                                      simple_value=report['loss']), ])
                        tb_train_writer.add_summary(loss_sum, epochs)

                model.process_event(event_name='after_train_log', data=report)
                report = {'train': report}
                print(json.dumps(report, ensure_ascii=False))
                train_y_true.clear()
                train_y_predicted.clear()

            # validation, best-model saving and early stopping
            if train_config['val_every_n_epochs'] > 0 and epochs % train_config['val_every_n_epochs'] == 0:
                report = _test_model(model, metrics_functions, iterator, train_config['batch_size'],
                                     'valid', start_time, train_config['show_examples'])

                report['epochs_done'] = epochs
                report['batches_seen'] = i
                report['train_examples_seen'] = examples

                metrics = list(report['metrics'].items())

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name,
                                                                        simple_value=score), ])
                        tb_valid_writer.add_summary(metric_sum, epochs)

                m_name, score = metrics[0]
                if improved(score, best):
                    patience = 0
                    log.info('New best {} of {}'.format(m_name, score))
                    best = score
                    log.info('Saving model')
                    model.save()
                    saved = True
                else:
                    patience += 1
                    log.info('Did not improve on the {} of {}'.format(m_name, best))

                report['impatience'] = patience
                if train_config['validation_patience'] > 0:
                    report['patience_limit'] = train_config['validation_patience']

                model.process_event(event_name='after_validation', data=report)
                report = {'valid': report}
                print(json.dumps(report, ensure_ascii=False))

                if patience >= train_config['validation_patience'] > 0:
                    log.info('Ran out of patience')
                    break

            if epochs >= train_config['epochs'] > 0:
                break
    except KeyboardInterrupt:
        log.info('Stopped training')

    if not saved:
        log.info('Saving model')
        model.save()

    return model
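# --- Illustrative sketch (not part of the original module) ---
# Self-contained demo of the early-stopping logic used in _train_batches above:
# the first validation metric is compared against the best value seen so far,
# and training stops once `validation_patience` consecutive validations fail to
# improve it. The function name and the score sequence are invented purely for
# illustration.

def _demo_early_stopping(scores, metric_optimization='maximize', validation_patience=3):
    if metric_optimization == 'maximize':
        def improved(score, best):
            return score > best
        best = float('-inf')
    else:
        def improved(score, best):
            return score < best
        best = float('inf')

    patience = 0
    for epoch, score in enumerate(scores, 1):
        if improved(score, best):
            best, patience = score, 0  # new best: reset patience (the real code also saves the model here)
        else:
            patience += 1  # no improvement: spend one unit of patience
        if patience >= validation_patience > 0:
            return epoch, best  # ran out of patience
    return len(scores), best


if __name__ == '__main__':
    # stops after the 7th validation: the last improvement was at the 4th
    print(_demo_early_stopping([0.71, 0.78, 0.80, 0.84, 0.83, 0.84, 0.82]))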
def _train_batches(model: NNModel, iterator: DataLearningIterator, train_config: dict,
                   metrics_functions: List[Tuple[str, Callable]]) -> NNModel:

    default_train_config = {
        'epochs': 0,
        'max_batches': 0,
        'batch_size': 1,

        'metric_optimization': 'maximize',

        'validation_patience': 5,
        'val_every_n_epochs': 0,

        'log_every_n_batches': 0,
        'log_every_n_epochs': 0,
        # 'show_examples': False,

        'validate_best': True,
        'test_best': True,
        'tensorboard_log_dir': None,
    }

    # fill in missing options with the defaults above
    train_config = dict(default_train_config, **train_config)

    if 'train_metrics' in train_config:
        train_metrics_functions = list(zip(train_config['train_metrics'],
                                           get_metrics_by_names(train_config['train_metrics'])))
    else:
        train_metrics_functions = metrics_functions

    if train_config['metric_optimization'] == 'maximize':
        def improved(score, best):
            return score > best
        best = float('-inf')
    elif train_config['metric_optimization'] == 'minimize':
        def improved(score, best):
            return score < best
        best = float('inf')
    else:
        raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))

    i = 0
    epochs = 0
    examples = 0
    saved = False
    patience = 0
    log_on = train_config['log_every_n_batches'] > 0 or train_config['log_every_n_epochs'] > 0
    train_y_true = []
    train_y_predicted = []
    losses = []
    start_time = time.time()
    break_flag = False

    if train_config['tensorboard_log_dir'] is not None:
        import tensorflow as tf
        tb_log_dir = expand_path(train_config['tensorboard_log_dir'])

        tb_train_writer = tf.summary.FileWriter(str(tb_log_dir / 'train_log'))
        tb_valid_writer = tf.summary.FileWriter(str(tb_log_dir / 'valid_log'))

    try:
        while True:
            for x, y_true in iterator.gen_batches(train_config['batch_size']):
                if log_on:
                    y_predicted = list(model(list(x)))
                    train_y_true += y_true
                    train_y_predicted += y_predicted
                loss = model.train_on_batch(x, y_true)
                if loss is not None:
                    losses.append(loss)
                i += 1
                examples += len(x)

                # periodic logging on training data, every n batches
                if train_config['log_every_n_batches'] > 0 and i % train_config['log_every_n_batches'] == 0:
                    metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in train_metrics_functions]
                    report = {
                        'epochs_done': epochs,
                        'batches_seen': i,
                        'examples_seen': examples,
                        'metrics': prettify_metrics(metrics),
                        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                    }

                    if losses:
                        report['loss'] = sum(losses) / len(losses)
                        losses = []

                    if train_config['tensorboard_log_dir'] is not None:
                        for name, score in metrics:
                            metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/' + name,
                                                                            simple_value=score), ])
                            tb_train_writer.add_summary(metric_sum, i)

                        if 'loss' in report:
                            loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/loss',
                                                                          simple_value=report['loss']), ])
                            tb_train_writer.add_summary(loss_sum, i)

                    report = {'train': report}
                    print(json.dumps(report, ensure_ascii=False))
                    train_y_true.clear()
                    train_y_predicted.clear()

                if i >= train_config['max_batches'] > 0:
                    break_flag = True
                    break

                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }
                model.process_event(event_name='after_batch', data=report)
            if break_flag:
                break

            epochs += 1

            report = {
                'epochs_done': epochs,
                'batches_seen': i,
                'train_examples_seen': examples,
                'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
            }
            model.process_event(event_name='after_epoch', data=report)

            # periodic logging on training data, every n epochs
            if train_config['log_every_n_epochs'] > 0 and epochs % train_config['log_every_n_epochs'] == 0 \
                    and train_y_true:
                metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in train_metrics_functions]
                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'metrics': prettify_metrics(metrics),
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }

                if losses:
                    report['loss'] = sum(losses) / len(losses)
                    losses = []

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name,
                                                                        simple_value=score), ])
                        tb_train_writer.add_summary(metric_sum, epochs)

                    if 'loss' in report:
                        loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/loss',
                                                                      simple_value=report['loss']), ])
                        tb_train_writer.add_summary(loss_sum, epochs)

                model.process_event(event_name='after_train_log', data=report)
                report = {'train': report}
                print(json.dumps(report, ensure_ascii=False))
                train_y_true.clear()
                train_y_predicted.clear()

            # validation, best-model saving and early stopping
            if train_config['val_every_n_epochs'] > 0 and epochs % train_config['val_every_n_epochs'] == 0:
                report = _test_model(model, metrics_functions, iterator, train_config['batch_size'],
                                     'valid', start_time)

                report['epochs_done'] = epochs
                report['batches_seen'] = i
                report['train_examples_seen'] = examples

                metrics = list(report['metrics'].items())

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name,
                                                                        simple_value=score), ])
                        tb_valid_writer.add_summary(metric_sum, epochs)

                m_name, score = metrics[0]
                if improved(score, best):
                    patience = 0
                    log.info('New best {} of {}'.format(m_name, score))
                    best = score
                    log.info('Saving model')
                    model.save()
                    saved = True
                else:
                    patience += 1
                    log.info('Did not improve on the {} of {}'.format(m_name, best))

                report['impatience'] = patience
                if train_config['validation_patience'] > 0:
                    report['patience_limit'] = train_config['validation_patience']

                model.process_event(event_name='after_validation', data=report)
                report = {'valid': report}
                print(json.dumps(report, ensure_ascii=False))

                if patience >= train_config['validation_patience'] > 0:
                    log.info('Ran out of patience')
                    break

            if epochs >= train_config['epochs'] > 0:
                break
    except KeyboardInterrupt:
        log.info('Stopped training')

    if not saved:
        log.info('Saving model')
        model.save()

    return model
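# --- Illustrative sketch (not part of the original module) ---
# Stand-alone demo of the TensorBoard scalar-logging pattern used in the two
# variants above, assuming a TensorFlow 1.x installation (tf.Summary and
# tf.summary.FileWriter were removed in TF 2.x). Like _train_batches, it imports
# tensorflow lazily so that the rest of the module does not require it. The
# function name, tag and scores are invented for the demo.

def _demo_log_scalars(log_dir, scores, tag='every_n_epochs/accuracy'):
    import tensorflow as tf

    writer = tf.summary.FileWriter(str(log_dir))
    for step, score in enumerate(scores, 1):
        # one scalar point per step, rendered as a curve in TensorBoard
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=score)])
        writer.add_summary(summary, step)
    writer.flush()
    writer.close()

# Example call (requires TF 1.x):
#   _demo_log_scalars('demo_log/valid_log', [0.71, 0.78, 0.80, 0.84])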
def _train_batches(model: NNModel, dataset: Dataset, train_config: dict,
                   metrics_functions: List[Tuple[str, Callable]]):

    default_train_config = {
        'epochs': 0,
        'batch_size': 1,

        'metric_optimization': 'maximize',

        'validation_patience': 5,
        'val_every_n_epochs': 0,

        'log_every_n_batches': 0,
        'log_every_n_epochs': 0,
        # 'show_examples': False,

        'validate_best': True,
        'test_best': True
    }

    # fill in missing options with the defaults above
    train_config = dict(default_train_config, **train_config)

    if train_config['metric_optimization'] == 'maximize':
        def improved(score, best):
            return score > best
        best = float('-inf')
    elif train_config['metric_optimization'] == 'minimize':
        def improved(score, best):
            return score < best
        best = float('inf')
    else:
        raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))

    i = 0
    epochs = 0
    examples = 0
    saved = False
    patience = 0
    log_on = train_config['log_every_n_batches'] > 0 or train_config['log_every_n_epochs'] > 0
    train_y_true = []
    train_y_predicted = []
    start_time = time.time()

    try:
        while True:
            for x, y_true in dataset.batch_generator(train_config['batch_size']):
                if log_on:
                    y_predicted = list(model(list(x)))
                    train_y_true += y_true
                    train_y_predicted += y_predicted
                model.train_on_batch(x, y_true)
                i += 1
                examples += len(x)

                # periodic logging on training data, every n batches
                if train_config['log_every_n_batches'] > 0 and i % train_config['log_every_n_batches'] == 0:
                    metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in metrics_functions]
                    report = {
                        'epochs_done': epochs,
                        'batches_seen': i,
                        'examples_seen': examples,
                        'metrics': dict(metrics),
                        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time)))
                    }
                    report = {'train': report}
                    print(json.dumps(report, ensure_ascii=False))
                    train_y_true = []
                    train_y_predicted = []

            epochs += 1

            # periodic logging on training data, every n epochs
            if train_config['log_every_n_epochs'] > 0 and epochs % train_config['log_every_n_epochs'] == 0 \
                    and train_y_true:
                metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in metrics_functions]
                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'examples_seen': examples,
                    'metrics': dict(metrics),
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time)))
                }
                report = {'train': report}
                print(json.dumps(report, ensure_ascii=False))
                train_y_true = []
                train_y_predicted = []

            # validation, best-model saving and early stopping
            if train_config['val_every_n_epochs'] > 0 and epochs % train_config['val_every_n_epochs'] == 0:
                report = _test_model(model, metrics_functions, dataset,
                                     train_config['batch_size'], 'valid', start_time)

                metrics = list(report['metrics'].items())

                m_name, score = metrics[0]
                if improved(score, best):
                    patience = 0
                    log.info('New best {} of {}'.format(m_name, score))
                    best = score
                    log.info('Saving model')
                    model.save()
                    saved = True
                else:
                    patience += 1
                    log.info('Did not improve on the {} of {}'.format(m_name, best))

                report['impatience'] = patience
                if train_config['validation_patience'] > 0:
                    report['patience_limit'] = train_config['validation_patience']

                report = {'valid': report}
                print(json.dumps(report, ensure_ascii=False))

                if patience >= train_config['validation_patience'] > 0:
                    log.info('Ran out of patience')
                    break

            if epochs >= train_config['epochs'] > 0:
                break
    except KeyboardInterrupt:
        log.info('Stopped training')

    if not saved:
        log.info('Saving model')
        model.save()

    return model
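# --- Illustrative sketch (not part of the original module) ---
# Self-contained demo of the JSON reporting pattern shared by the variants
# above: metric functions are applied to the accumulated true/predicted labels,
# packed into a nested report dict (here using the dict(metrics) form of the
# last variant), and printed as a single JSON line. The accuracy metric, label
# lists and function names are invented for the demo; imports are repeated so
# the sketch stands alone.

import datetime
import json
import time


def _demo_accuracy(y_true, y_predicted):
    return sum(t == p for t, p in zip(y_true, y_predicted)) / len(y_true)


def _demo_report(y_true, y_predicted, start_time, epochs_done=1, batches_seen=10):
    metrics_functions = [('accuracy', _demo_accuracy)]
    metrics = [(name, f(y_true, y_predicted)) for name, f in metrics_functions]
    report = {
        'epochs_done': epochs_done,
        'batches_seen': batches_seen,
        'examples_seen': len(y_true),
        'metrics': dict(metrics),
        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time)))
    }
    print(json.dumps({'train': report}, ensure_ascii=False))


if __name__ == '__main__':
    # prints a one-line JSON train report with accuracy 0.75 and ~42 s spent
    _demo_report(['a', 'b', 'a', 'c'], ['a', 'b', 'b', 'c'], start_time=time.time() - 42)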