def check_metrics_registry(model_name):
    # Returns True only if the metric lookup succeeds; any exception raised by
    # the registry (unknown name, import error, ...) is treated as "not registered".
    try:
        get_metrics_by_names(model_name)
    except BaseException:
        return False
    return True
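
A brief usage sketch of the helper above, assuming get_metrics_by_names is already imported into the module (as the snippet itself assumes) and that it takes a list of metric names; both points are assumptions, not shown in the snippet.

# Hypothetical usage of check_metrics_registry.
print(check_metrics_registry(['accuracy']))      # True if the metric resolves
print(check_metrics_registry(['not_a_metric']))  # False: the lookup raised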
Example #2
def train_evaluate_model_from_config(config: Union[str, Path, dict],
                                     to_train: bool = True,
                                     to_validate: bool = True) -> None:
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)

    import_packages(config.get('metadata', {}).get('imports', []))

    dataset_config = config.get('dataset', None)

    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    data = []
    reader_config = config.get('dataset_reader', None)

    if reader_config:
        reader_config = config['dataset_reader']
        if 'class' in reader_config:
            c = reader_config.pop('class')
            try:
                module_name, cls_name = c.split(':')
                reader = getattr(importlib.import_module(module_name),
                                 cls_name)()
            except ValueError:
                e = ConfigError(
                    'Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                    .format(c))
                log.exception(e)
                raise e
        else:
            reader = get_model(reader_config.pop('name'))()
        data_path = expand_path(reader_config.pop('data_path', ''))
        data = reader.read(data_path, **reader_config)
    else:
        log.warning("No dataset reader is provided in the JSON config.")

    iterator_config = config['dataset_iterator']
    iterator: Union[DataLearningIterator,
                    DataFittingIterator] = from_params(iterator_config,
                                                       data=data)

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': to_validate,
        'test_best': True
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(
        zip(train_config['metrics'],
            get_metrics_by_names(train_config['metrics'])))

    if to_train:
        model = fit_chainer(config, iterator)

        if callable(getattr(model, 'train_on_batch', None)):
            _train_batches(model, iterator, train_config, metrics_functions)
        elif callable(getattr(model, 'fit_batches', None)):
            _fit_batches(model, iterator, train_config)
        elif callable(getattr(model, 'fit', None)):
            _fit(model, iterator, train_config)
        elif not isinstance(model, Chainer):
            log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid':
                _test_model(model, metrics_functions, iterator,
                            train_config.get('batch_size', -1), 'valid')
            }

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test':
                _test_model(model, metrics_functions, iterator,
                            train_config.get('batch_size', -1), 'test')
            }

            print(json.dumps(report, ensure_ascii=False))
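
For reference, a minimal sketch of a config dict this function could accept, based only on the keys it reads (dataset_reader, dataset_iterator, chainer, train). The component names, the data path, and the chainer layout are illustrative assumptions, not a config shipped with DeepPavlov.

# Illustrative only: component names and paths are placeholders.
config = {
    'dataset_reader': {
        'name': 'basic_classification_reader',
        'data_path': 'my_dataset/',                   # hypothetical path
    },
    'dataset_iterator': {'name': 'basic_classification_iterator'},
    'chainer': {'in': ['x'], 'in_y': ['y'], 'pipe': [], 'out': ['y_predicted']},
    'train': {'metrics': ['accuracy'], 'batch_size': 64, 'epochs': 5},
}

train_evaluate_model_from_config(config, to_train=True, to_validate=True)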
Example #3
def _train_batches(model: NNModel, iterator: DataLearningIterator,
                   train_config: dict,
                   metrics_functions: List[Tuple[str, Callable]]) -> NNModel:

    default_train_config = {
        'epochs': 0,
        'max_batches': 0,
        'batch_size': 1,
        'metric_optimization': 'maximize',
        'validation_patience': 5,
        'val_every_n_epochs': 0,
        'log_every_n_batches': 0,
        'log_every_n_epochs': 0,
        # 'show_examples': False,
        'validate_best': True,
        'test_best': True,
        'tensorboard_log_dir': None,
    }

    train_config = dict(default_train_config, **train_config)

    if 'train_metrics' in train_config:
        train_metrics_functions = list(
            zip(train_config['train_metrics'],
                get_metrics_by_names(train_config['train_metrics'])))
    else:
        train_metrics_functions = metrics_functions

    if train_config['metric_optimization'] == 'maximize':

        def improved(score, best):
            return score > best

        best = float('-inf')
    elif train_config['metric_optimization'] == 'minimize':

        def improved(score, best):
            return score < best

        best = float('inf')
    else:
        raise ConfigError('metric_optimization has to be one of {}'.format(
            ['maximize', 'minimize']))

    i = 0
    epochs = 0
    examples = 0
    saved = False
    patience = 0
    log_on = train_config['log_every_n_batches'] > 0 or train_config['log_every_n_epochs'] > 0
    train_y_true = []
    train_y_predicted = []
    losses = []
    start_time = time.time()
    break_flag = False

    if train_config['tensorboard_log_dir'] is not None:
        import tensorflow as tf
        tb_log_dir = expand_path(train_config['tensorboard_log_dir'])

        tb_train_writer = tf.summary.FileWriter(str(tb_log_dir / 'train_log'))
        tb_valid_writer = tf.summary.FileWriter(str(tb_log_dir / 'valid_log'))

    try:
        while True:
            for x, y_true in iterator.gen_batches(train_config['batch_size']):
                if log_on:
                    y_predicted = list(model(list(x)))
                    train_y_true += y_true
                    train_y_predicted += y_predicted
                loss = model.train_on_batch(x, y_true)
                if loss is not None:
                    losses.append(loss)
                i += 1
                examples += len(x)

                if train_config['log_every_n_batches'] > 0 and i % train_config['log_every_n_batches'] == 0:
                    metrics = [(s, f(train_y_true, train_y_predicted))
                               for s, f in train_metrics_functions]
                    report = {
                        'epochs_done': epochs,
                        'batches_seen': i,
                        'examples_seen': examples,
                        'metrics': prettify_metrics(metrics),
                        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                    }

                    if losses:
                        report['loss'] = sum(losses) / len(losses)
                        losses = []
                    report = {'train': report}

                    if train_config['tensorboard_log_dir'] is not None:
                        for name, score in metrics:
                            metric_sum = tf.Summary(value=[
                                tf.Summary.Value(tag='every_n_batches/' + name,
                                                 simple_value=score),
                            ])
                            tb_train_writer.add_summary(metric_sum, i)

                        if losses:
                            loss_sum = tf.Summary(value=[
                                tf.Summary.Value(tag='every_n_batches/' +
                                                 'loss',
                                                 simple_value=report['loss']),
                            ])
                            tb_train_writer.add_summary(loss_sum, i)

                    print(json.dumps(report, ensure_ascii=False))
                    train_y_true.clear()
                    train_y_predicted.clear()

                if i >= train_config['max_batches'] > 0:
                    break_flag = True
                    break

                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }
                model.process_event(event_name='after_batch', data=report)
            if break_flag:
                break

            epochs += 1

            report = {
                'epochs_done': epochs,
                'batches_seen': i,
                'train_examples_seen': examples,
                'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
            }
            model.process_event(event_name='after_epoch', data=report)

            if train_config['log_every_n_epochs'] > 0 and epochs % train_config['log_every_n_epochs'] == 0\
                    and train_y_true:
                metrics = [(s, f(train_y_true, train_y_predicted))
                           for s, f in train_metrics_functions]
                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'metrics': prettify_metrics(metrics),
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }
                if losses:
                    report['loss'] = sum(losses) / len(losses)
                    losses = []

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[
                            tf.Summary.Value(tag='every_n_epochs/' + name,
                                             simple_value=score),
                        ])
                        tb_train_writer.add_summary(metric_sum, epochs)

                    if losses:
                        loss_sum = tf.Summary(value=[
                            tf.Summary.Value(tag='every_n_epochs/' + 'loss',
                                             simple_value=report['loss']),
                        ])
                        tb_train_writer.add_summary(loss_sum, epochs)

                model.process_event(event_name='after_train_log', data=report)
                report = {'train': report}
                print(json.dumps(report, ensure_ascii=False))
                train_y_true.clear()
                train_y_predicted.clear()

            if train_config['val_every_n_epochs'] > 0 and epochs % train_config['val_every_n_epochs'] == 0:
                report = _test_model(model, metrics_functions, iterator,
                                     train_config['batch_size'], 'valid',
                                     start_time)
                report['epochs_done'] = epochs
                report['batches_seen'] = i
                report['train_examples_seen'] = examples

                metrics = list(report['metrics'].items())

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[
                            tf.Summary.Value(tag='every_n_epochs/' + name,
                                             simple_value=score),
                        ])
                        tb_valid_writer.add_summary(metric_sum, epochs)

                m_name, score = metrics[0]
                if improved(score, best):
                    patience = 0
                    log.info('New best {} of {}'.format(m_name, score))
                    best = score
                    log.info('Saving model')
                    model.save()
                    saved = True
                else:
                    patience += 1
                    log.info('Did not improve on the {} of {}'.format(
                        m_name, best))

                report['impatience'] = patience
                if train_config['validation_patience'] > 0:
                    report['patience_limit'] = train_config[
                        'validation_patience']

                model.process_event(event_name='after_validation', data=report)
                report = {'valid': report}
                print(json.dumps(report, ensure_ascii=False))

                if patience >= train_config['validation_patience'] > 0:
                    log.info('Ran out of patience')
                    break

            if epochs >= train_config['epochs'] > 0:
                break
    except KeyboardInterrupt:
        log.info('Stopped training')

    if not saved:
        log.info('Saving model')
        model.save()

    return model
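
The early-stopping behavior in the loop above can be hard to see amid the logging. The function below is a stripped-down sketch of the same patience logic (compare improved() and the `patience >= validation_patience > 0` check); it is written here for illustration and is not part of the original code.

# Stripped-down sketch of the patience-based early stopping used above.
def should_stop(scores, patience_limit, maximize=True):
    best = float('-inf') if maximize else float('inf')
    patience = 0
    for score in scores:
        improved = score > best if maximize else score < best
        if improved:
            best, patience = score, 0
        else:
            patience += 1
        if 0 < patience_limit <= patience:
            return True     # mirrors: patience >= validation_patience > 0
    return False

# Five validations without improvement after the best score exhaust patience_limit=5.
assert should_stop([0.5, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6], patience_limit=5)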
Example #4
def train_evaluate_model_from_config(config: Union[str, Path, dict], to_train: bool = True, to_validate: bool = True) -> None:
    """Make training and evaluation of the model described in corresponding configuration file."""
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)

    import_packages(config.get('metadata', {}).get('imports', []))

    dataset_config = config.get('dataset', None)

    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    data = []
    reader_config = config.get('dataset_reader', None)

    if reader_config:
        reader_config = config['dataset_reader']
        if 'class' in reader_config:
            c = reader_config.pop('class')
            try:
                module_name, cls_name = c.split(':')
                reader = getattr(importlib.import_module(module_name), cls_name)()
            except ValueError:
                e = ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                                .format(c))
                log.exception(e)
                raise e
        else:
            reader = get_model(reader_config.pop('name'))()
        data_path = reader_config.pop('data_path', '')
        if isinstance(data_path, list):
            data_path = [expand_path(x) for x in data_path]
        else:
            data_path = expand_path(data_path)
        data = reader.read(data_path, **reader_config)
    else:
        log.warning("No dataset reader is provided in the JSON config.")

    iterator_config = config['dataset_iterator']
    iterator: Union[DataLearningIterator, DataFittingIterator] = from_params(iterator_config,
                                                                             data=data)

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': to_validate,
        'test_best': True,
        'show_examples': False
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(zip(train_config['metrics'], get_metrics_by_names(train_config['metrics'])))

    if to_train:
        model = fit_chainer(config, iterator)

        if callable(getattr(model, 'train_on_batch', None)):
            _train_batches(model, iterator, train_config, metrics_functions)
        elif callable(getattr(model, 'fit_batches', None)):
            _fit_batches(model, iterator, train_config)
        elif callable(getattr(model, 'fit', None)):
            _fit(model, iterator, train_config)
        elif not isinstance(model, Chainer):
            log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid': _test_model(model, metrics_functions, iterator,
                                     train_config.get('batch_size', -1), 'valid',
                                     show_examples=train_config['show_examples'])
            }

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test': _test_model(model, metrics_functions, iterator,
                                    train_config.get('batch_size', -1), 'test',
                                    show_examples=train_config['show_examples'])
            }

            print(json.dumps(report, ensure_ascii=False))
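
This variant accepts either an in-memory dict or a path to a JSON config. A short hedged usage sketch; the file path is a placeholder:

from pathlib import Path

# Train, validate and test according to the config file (path is hypothetical).
train_evaluate_model_from_config(Path('configs/my_model.json'))

# Skip training and only evaluate the best saved model.
train_evaluate_model_from_config('configs/my_model.json', to_train=False)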
Example #5
def _train_batches(model: NNModel, iterator: DataLearningIterator, train_config: dict,
                   metrics_functions: List[Tuple[str, Callable]]) -> NNModel:

    default_train_config = {
        'epochs': 0,
        'max_batches': 0,
        'batch_size': 1,

        'metric_optimization': 'maximize',

        'validation_patience': 5,
        'val_every_n_epochs': 0,

        'log_every_n_batches': 0,
        'log_every_n_epochs': 0,

        'show_examples': False,  # defensive default: the ['show_examples'] lookups below assume this key exists
        'validate_best': True,
        'test_best': True,
        'tensorboard_log_dir': None,
    }

    train_config = dict(default_train_config, **train_config)

    if 'train_metrics' in train_config:
        train_metrics_functions = list(zip(train_config['train_metrics'],
                                           get_metrics_by_names(train_config['train_metrics'])))
    else:
        train_metrics_functions = metrics_functions

    if train_config['metric_optimization'] == 'maximize':
        def improved(score, best):
            return score > best
        best = float('-inf')
    elif train_config['metric_optimization'] == 'minimize':
        def improved(score, best):
            return score < best
        best = float('inf')
    else:
        raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize']))

    i = 0
    epochs = 0
    examples = 0
    saved = False
    patience = 0
    log_on = train_config['log_every_n_batches'] > 0 or train_config['log_every_n_epochs'] > 0
    train_y_true = []
    train_y_predicted = []
    losses = []
    start_time = time.time()
    break_flag = False

    if train_config['tensorboard_log_dir'] is not None:
        import tensorflow as tf
        tb_log_dir = expand_path(train_config['tensorboard_log_dir'])

        tb_train_writer = tf.summary.FileWriter(str(tb_log_dir / 'train_log'))
        tb_valid_writer = tf.summary.FileWriter(str(tb_log_dir / 'valid_log'))

    try:
        while True:
            for x, y_true in iterator.gen_batches(train_config['batch_size']):
                if log_on:
                    y_predicted = list(model(list(x)))
                    train_y_true += y_true
                    train_y_predicted += y_predicted
                loss = model.train_on_batch(x, y_true)
                if loss is not None:
                    losses.append(loss)
                i += 1
                examples += len(x)

                if train_config['log_every_n_batches'] > 0 and i % train_config['log_every_n_batches'] == 0:
                    metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in train_metrics_functions]
                    report = {
                        'epochs_done': epochs,
                        'batches_seen': i,
                        'examples_seen': examples,
                        'metrics': prettify_metrics(metrics),
                        'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                    }

                    if train_config['show_examples']:
                        try:
                            report['examples'] = [{
                                'x': x_item,
                                'y_predicted': y_predicted_item,
                                'y_true': y_true_item
                            } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]
                        except NameError:
                            log.warning('Could not log examples as y_predicted is not defined')

                    if losses:
                        report['loss'] = sum(losses)/len(losses)
                        losses = []
                    report = {'train': report}

                    if train_config['tensorboard_log_dir'] is not None:
                        for name, score in metrics:
                            metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/' + name,
                                                                            simple_value=score), ])
                            tb_train_writer.add_summary(metric_sum, i)

                        if losses:
                            loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_batches/' + 'loss',
                                                                            simple_value=report['loss']), ])
                            tb_train_writer.add_summary(loss_sum, i)

                    print(json.dumps(report, ensure_ascii=False))
                    train_y_true.clear()
                    train_y_predicted.clear()

                if i >= train_config['max_batches'] > 0:
                    break_flag = True
                    break

                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }
                model.process_event(event_name='after_batch', data=report)
            if break_flag:
                break

            epochs += 1

            report = {
                'epochs_done': epochs,
                'batches_seen': i,
                'train_examples_seen': examples,
                'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
            }
            model.process_event(event_name='after_epoch', data=report)

            if train_config['log_every_n_epochs'] > 0 and epochs % train_config['log_every_n_epochs'] == 0\
                    and train_y_true:
                metrics = [(s, f(train_y_true, train_y_predicted)) for s, f in train_metrics_functions]
                report = {
                    'epochs_done': epochs,
                    'batches_seen': i,
                    'train_examples_seen': examples,
                    'metrics': prettify_metrics(metrics),
                    'time_spent': str(datetime.timedelta(seconds=round(time.time() - start_time + 0.5)))
                }

                if train_config['show_examples']:
                    try:
                        report['examples'] = [{
                            'x': x_item,
                            'y_predicted': y_predicted_item,
                            'y_true': y_true_item
                        } for x_item, y_predicted_item, y_true_item in zip(x, y_predicted, y_true)]
                    except NameError:
                        log.warning('Could not log examples')

                if losses:
                    report['loss'] = sum(losses)/len(losses)
                    losses = []

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name,
                                                                        simple_value=score), ])
                        tb_train_writer.add_summary(metric_sum, epochs)

                    if losses:
                        loss_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + 'loss',
                                                                        simple_value=report['loss']), ])
                        tb_train_writer.add_summary(loss_sum, epochs)

                model.process_event(event_name='after_train_log', data=report)
                report = {'train': report}
                print(json.dumps(report, ensure_ascii=False))
                train_y_true.clear()
                train_y_predicted.clear()

            if train_config['val_every_n_epochs'] > 0 and epochs % train_config['val_every_n_epochs'] == 0:
                report = _test_model(model, metrics_functions, iterator,
                                     train_config['batch_size'], 'valid', start_time, train_config['show_examples'])
                report['epochs_done'] = epochs
                report['batches_seen'] = i
                report['train_examples_seen'] = examples

                metrics = list(report['metrics'].items())

                if train_config['tensorboard_log_dir'] is not None:
                    for name, score in metrics:
                        metric_sum = tf.Summary(value=[tf.Summary.Value(tag='every_n_epochs/' + name,
                                                                        simple_value=score), ])
                        tb_valid_writer.add_summary(metric_sum, epochs)

                m_name, score = metrics[0]
                if improved(score, best):
                    patience = 0
                    log.info('New best {} of {}'.format(m_name, score))
                    best = score
                    log.info('Saving model')
                    model.save()
                    saved = True
                else:
                    patience += 1
                    log.info('Did not improve on the {} of {}'.format(m_name, best))

                report['impatience'] = patience
                if train_config['validation_patience'] > 0:
                    report['patience_limit'] = train_config['validation_patience']

                model.process_event(event_name='after_validation', data=report)
                report = {'valid': report}
                print(json.dumps(report, ensure_ascii=False))

                if patience >= train_config['validation_patience'] > 0:
                    log.info('Ran out of patience')
                    break

            if epochs >= train_config['epochs'] > 0:
                break
    except KeyboardInterrupt:
        log.info('Stopped training')

    if not saved:
        log.info('Saving model')
        model.save()

    return model
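
The loop above relies only on iterator.gen_batches(batch_size) yielding (x, y_true) batch tuples. Below is a minimal stand-in iterator for smoke-testing under that assumption; it is not DeepPavlov's DataLearningIterator and ignores data splits.

import random

class ToyIterator:
    """Minimal object honouring the gen_batches contract used by _train_batches."""

    def __init__(self, pairs):
        self.pairs = list(pairs)          # list of (x, y) examples

    def gen_batches(self, batch_size, data_type='train', shuffle=True):
        items = list(self.pairs)
        if shuffle:
            random.shuffle(items)
        for start in range(0, len(items), batch_size):
            batch = items[start:start + batch_size]
            xs, ys = zip(*batch)
            yield list(xs), list(ys)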
Example #6
def train_model_from_config(config_path: str):
    config = read_json(config_path)
    set_deeppavlov_root(config)

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    data = reader.read(data_path)

    dataset_config = config['dataset']
    dataset: Dataset = from_params(dataset_config, data=data)

    if 'chainer' in config:
        model = fit_chainer(config, dataset)
    else:
        vocabs = {}
        for vocab_param_name, vocab_config in config.get('vocabs', {}).items():
            v: Estimator = from_params(vocab_config, mode='train')
            vocabs[vocab_param_name] = _fit(v, dataset)

        model_config = config['model']
        model = from_params(model_config, vocabs=vocabs, mode='train')

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': True,
        'test_best': True
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(
        zip(train_config['metrics'],
            get_metrics_by_names(train_config['metrics'])))

    if callable(getattr(model, 'train_on_batch', None)):
        _train_batches(model, dataset, train_config, metrics_functions)
    elif callable(getattr(model, 'fit', None)):
        _fit(model, dataset, train_config)
    elif not isinstance(model, Chainer):
        log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid':
                _test_model(model, metrics_functions, dataset,
                            train_config.get('batch_size', -1), 'valid')
            }

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test':
                _test_model(model, metrics_functions, dataset,
                            train_config.get('batch_size', -1), 'test')
            }

            print(json.dumps(report, ensure_ascii=False))
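
Since this older variant takes only a file path, a hedged way to drive it from an in-memory dict is to dump the dict to a temporary JSON file first. The keys mirror what the function reads (dataset_reader, dataset, chainer or vocabs/model, train); the component names are placeholders.

import json
import tempfile

# Placeholder config; component names are illustrative, not guaranteed to be registered.
config = {
    'dataset_reader': {'name': 'basic_classification_reader', 'data_path': 'my_data/'},
    'dataset': {'name': 'basic_classification_iterator'},
    'chainer': {'in': ['x'], 'in_y': ['y'], 'pipe': [], 'out': ['y_predicted']},
    'train': {'metrics': ['accuracy'], 'batch_size': 32, 'epochs': 3},
}

with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    json.dump(config, f)

train_model_from_config(f.name)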
Example #7
def train_evaluate_model_from_config(
        config: Union[str, Path, dict],
        iterator=None,
        to_train=True,
        to_validate=True) -> Dict[str, Dict[str, float]]:
    """Make training and evaluation of the model described in corresponding configuration file."""
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)
    import_packages(config.get('metadata', {}).get('imports', []))

    if iterator is None:
        data = read_data_by_config(config)
        iterator = get_iterator_from_config(config, data)

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': to_validate,
        'test_best': True,
        'show_examples': False
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(
        zip(train_config['metrics'],
            get_metrics_by_names(train_config['metrics'])))

    if to_train:
        model = fit_chainer(config, iterator)

        if callable(getattr(model, 'train_on_batch', None)):
            _train_batches(model, iterator, train_config, metrics_functions)
        elif callable(getattr(model, 'fit_batches', None)):
            _fit_batches(model, iterator, train_config)
        elif callable(getattr(model, 'fit', None)):
            _fit(model, iterator, train_config)
        elif not isinstance(model, Chainer):
            log.warning('Nothing to train')

        model.destroy()

    res = {}

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid':
                _test_model(model,
                            metrics_functions,
                            iterator,
                            train_config.get('batch_size', -1),
                            'valid',
                            show_examples=train_config['show_examples'])
            }

            res['valid'] = report['valid']['metrics']

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test':
                _test_model(model,
                            metrics_functions,
                            iterator,
                            train_config.get('batch_size', -1),
                            'test',
                            show_examples=train_config['show_examples'])
            }

            res['test'] = report['test']['metrics']

            print(json.dumps(report, ensure_ascii=False))

        model.destroy()

    return res
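
Unlike the earlier variants, this one returns the evaluation metrics, so a caller can inspect them directly. A hedged usage sketch; the config path is a placeholder:

# Evaluate a previously trained model and read back its metrics.
res = train_evaluate_model_from_config('configs/my_model.json', to_train=False)
print(res.get('valid'))   # metrics dict for the validation split, if validate_best was on
print(res.get('test'))    # metrics dict for the test split, if test_best was on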
Example #8
def train_model_from_config(config_path: str) -> None:
    config = read_json(config_path)
    set_deeppavlov_root(config)

    dataset_config = config.get('dataset', None)

    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    kwargs = {
        k: v
        for k, v in reader_config.items() if k not in ['name', 'data_path']
    }
    data = reader.read(data_path, **kwargs)

    iterator_config = config['dataset_iterator']
    iterator: BasicDatasetIterator = from_params(iterator_config, data=data)

    if 'chainer' in config:
        model = fit_chainer(config, iterator)
    else:
        vocabs = config.get('vocabs', {})
        for vocab_param_name, vocab_config in vocabs.items():
            v: Estimator = from_params(vocab_config, mode='train')
            vocabs[vocab_param_name] = _fit(v, iterator)

        model_config = config['model']
        model = from_params(model_config, vocabs=vocabs, mode='train')

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': True,
        'test_best': True
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(
        zip(train_config['metrics'],
            get_metrics_by_names(train_config['metrics'])))

    if callable(getattr(model, 'train_on_batch', None)):
        _train_batches(model, iterator, train_config, metrics_functions)
    elif callable(getattr(model, 'fit', None)):
        _fit(model, iterator, train_config)
    elif not isinstance(model, Chainer):
        log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid':
                _test_model(model, metrics_functions, iterator,
                            train_config.get('batch_size', -1), 'valid')
            }

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test':
                _test_model(model, metrics_functions, iterator,
                            train_config.get('batch_size', -1), 'test')
            }

            print(json.dumps(report, ensure_ascii=False))
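
The reader call in this last variant forwards every dataset_reader key except name and data_path as keyword arguments to reader.read(). A small illustration of that filtering, with placeholder option names:

# Mirrors the kwargs filtering above; 'sep' and 'header' are hypothetical reader options.
reader_config = {'name': 'basic_classification_reader',
                 'data_path': 'my_data/',
                 'sep': '\t',
                 'header': 0}
kwargs = {k: v for k, v in reader_config.items() if k not in ['name', 'data_path']}
assert kwargs == {'sep': '\t', 'header': 0}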