Example #1
def build_model_from_config(config: Union[str, Path, dict], mode: str = 'infer', load_trained: bool = False,
                            as_component: bool = False) -> Chainer:
    """Build and return the model described in corresponding configuration file."""
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)

    import_packages(config.get('metadata', {}).get('imports', []))

    model_config = config['chainer']

    model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y'), as_component=as_component)

    for component_config in model_config['pipe']:
        if load_trained and ('fit_on' in component_config or 'in_y' in component_config):
            try:
                component_config['load_path'] = component_config['save_path']
            except KeyError:
                log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                            .format(component_config.get('name', component_config.get('ref', 'UNKNOWN'))))
        component = from_params(component_config, mode=mode)

        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y', None)
            main = component_config.get('main', False)
            model.append(component, c_in, c_out, in_y, main)

    return model
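A minimal usage sketch for the function above (hedged: the config path is hypothetical; the import paths are the ones these examples already use):

# Sketch only -- assumes a valid DeepPavlov JSON config at a made-up path.
from deeppavlov.core.commands.infer import build_model_from_config
from deeppavlov.core.common.file import read_json

config = read_json('path/to/some_config.json')  # hypothetical path
model = build_model_from_config(config, load_trained=True)
predictions = model(['hello world'])  # a built Chainer is called on a batch of inputs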
Example #2
def build_model_from_config(config,
                            mode='infer',
                            load_trained=False,
                            as_component=False):
    set_deeppavlov_root(config)
    model_config = config['chainer']

    model = Chainer(model_config['in'],
                    model_config['out'],
                    model_config.get('in_y'),
                    as_component=as_component)

    for component_config in model_config['pipe']:
        if load_trained and ('fit_on' in component_config
                             or 'in_y' in component_config):
            try:
                component_config['load_path'] = component_config['save_path']
            except KeyError:
                log.warning(
                    'No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                    .format(
                        component_config.get(
                            'name', component_config.get('ref', 'UNKNOWN'))))
        component = from_params(component_config, mode=mode)

        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y', None)
            main = component_config.get('main', False)
            model.append(component, c_in, c_out, in_y, main)

    return model
Example #3
def predict_with_model(config_path):
    config = read_json(config_path)
    set_deeppavlov_root(config)

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    read_params = {k: v for k, v in reader_config.items() if k not in ['name', 'data_path']}
    data = reader.read(data_path, **read_params)

    iterator_config = config['dataset_iterator']
    iterator: MorphoTaggerDatasetIterator =\
        from_params(iterator_config, data=data)

    model = build_model_from_config(config, load_trained=True)
    answers = [None] * len(iterator.test)
    batch_size = config['predict'].get("batch_size", -1)
    for indexes, (x, _) in iterator.gen_batches(
            batch_size=batch_size, data_type="test", shuffle=False, return_indexes=True):
        y = model(x)
        for i, elem in zip(indexes, y):
            answers[i] = elem
    outfile = config['predict'].get("outfile")
    if outfile is not None:
        outfile = Path(outfile)
        if not outfile.exists():
            outfile.parent.mkdir(parents=True, exist_ok=True)
        with open(outfile, "w", encoding="utf8") as fout:
            for elem in answers:
                fout.write(elem + "\n")
    return answers
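A hedged usage sketch for predict_with_model (the config path is hypothetical; the config must provide 'dataset_reader', 'dataset_iterator' and 'predict' sections, as the code above assumes):

# Sketch only -- run test-set prediction for a morphotagger config at a made-up path.
answers = predict_with_model('path/to/morphotagger_config.json')
print(answers[0])  # analysis of the first test sentence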
Example #4
def get_config_downloads(config_path):
    dp_root_back = get_deeppavlov_root()
    config = read_json(config_path)
    set_deeppavlov_root(config)

    downloads = set()
    if 'metadata' in config and 'download' in config['metadata']:
        for resource in config['metadata']['download']:
            if isinstance(resource, str):
                resource = {
                    'url': resource
                }

            url = resource['url']
            dest = expand_path(resource.get('subdir', ''))

            downloads.add((url, dest))

    config_references = [expand_path(config_ref) for config_ref in get_all_elems_from_json(config, 'config_path')]

    downloads |= {(url, dest) for config in config_references for url, dest in get_config_downloads(config)}

    set_deeppavlov_root({'deeppavlov_root': dp_root_back})

    return downloads
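A hedged usage sketch (hypothetical config path): get_config_downloads returns a set of (url, destination) pairs, including those pulled in by referenced configs.

# Sketch only -- list every resource a config would download.
for url, dest in get_config_downloads('path/to/some_config.json'):
    print(url, '->', dest)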
Example #5
def build_model_from_config(config: Union[str, Path, dict], mode: str = 'infer', load_trained: bool = False) -> Chainer:
    """Build and return the model described in corresponding configuration file."""
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)

    import_packages(config.get('metadata', {}).get('imports', []))

    model_config = config['chainer']

    model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y'))

    for component_config in model_config['pipe']:
        if load_trained and ('fit_on' in component_config or 'in_y' in component_config):
            try:
                component_config['load_path'] = component_config['save_path']
            except KeyError:
                log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                            .format(component_config.get('name', component_config.get('ref', 'UNKNOWN'))))
        component = from_params(component_config, mode=mode)

        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y', None)
            main = component_config.get('main', False)
            model.append(component, c_in, c_out, in_y, main)

    return model
Example #6
def from_params(params: Dict, mode: str = 'infer', **kwargs) -> Component:
    """Builds and returns the Component from corresponding dictionary of parameters."""
    # what is passed in json:
    config_params = {k: _resolve(v) for k, v in params.items()}

    # get component by reference (if any)
    if 'ref' in config_params:
        try:
            return _refs[config_params['ref']]
        except KeyError:
            e = ConfigError(
                'Component with id "{id}" was referenced but not initialized'.
                format(id=config_params['ref']))
            log.exception(e)
            raise e

    elif 'config_path' in config_params:
        from deeppavlov.core.commands.infer import build_model_from_config
        deeppavlov_root = get_deeppavlov_root()
        refs = _refs.copy()
        _refs.clear()
        config = read_json(expand_path(config_params['config_path']))
        model = build_model_from_config(config)
        set_deeppavlov_root({'deeppavlov_root': deeppavlov_root})
        _refs.clear()
        _refs.update(refs)
        return model

    elif 'class' in config_params:
        cls = cls_from_str(config_params.pop('class'))
    else:
        cls_name = config_params.pop('name', None)
        if not cls_name:
            e = ConfigError(
                'Component config has no `name` nor `ref` or `class` fields')
            log.exception(e)
            raise e
        cls = get_model(cls_name)

    # find the submodels params recursively
    config_params = {k: _init_param(v, mode) for k, v in config_params.items()}

    try:
        spec = inspect.getfullargspec(cls)
        if 'mode' in spec.args + spec.kwonlyargs or spec.varkw is not None:
            kwargs['mode'] = mode

        component = cls(**dict(config_params, **kwargs))
        try:
            _refs[config_params['id']] = component
        except KeyError:
            pass
    except Exception:
        log.exception("Exception in {}".format(cls))
        raise

    return component
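A hedged illustration of the kind of dictionary from_params consumes; the component name below is a placeholder, not a guaranteed registered name:

# Sketch only -- build a single component from a parameter dict.
component_config = {
    'id': 'my_component',            # optional; stored in _refs for later {'ref': 'my_component'} lookups
    'name': 'some_registered_name',  # placeholder: must be a name known to get_model()
}
component = from_params(component_config, mode='infer')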
Example #7
def from_params(params: Dict, mode: str = 'infer', **kwargs) -> Component:
    """Builds and returns the Component from corresponding dictionary of parameters."""
    # what is passed in json:
    config_params = {k: _resolve(v) for k, v in params.items()}

    # get component by reference (if any)
    if 'ref' in config_params:
        try:
            return _refs[config_params['ref']]
        except KeyError:
            e = ConfigError('Component with id "{id}" was referenced but not initialized'
                            .format(id=config_params['ref']))
            log.exception(e)
            raise e

    elif 'config_path' in config_params:
        from deeppavlov.core.commands.infer import build_model_from_config
        deeppavlov_root = get_deeppavlov_root()
        refs = _refs.copy()
        _refs.clear()
        config = read_json(expand_path(config_params['config_path']))
        model = build_model_from_config(config, as_component=True)
        set_deeppavlov_root({'deeppavlov_root': deeppavlov_root})
        _refs.clear()
        _refs.update(refs)
        return model

    elif 'class' in config_params:
        cls = cls_from_str(config_params.pop('class'))
    else:
        cls_name = config_params.pop('name', None)
        if not cls_name:
            e = ConfigError('Component config has no `name` nor `ref` or `class` fields')
            log.exception(e)
            raise e
        cls = get_model(cls_name)

    # find the submodels params recursively
    config_params = {k: _init_param(v, mode) for k, v in config_params.items()}

    try:
        spec = inspect.getfullargspec(cls)
        if 'mode' in spec.args+spec.kwonlyargs or spec.varkw is not None:
            kwargs['mode'] = mode

        component = cls(**dict(config_params, **kwargs))
        try:
            _refs[config_params['id']] = component
        except KeyError:
            pass
    except Exception:
        log.exception("Exception in {}".format(cls))
        raise

    return component
Example #8
def predict_with_model(config_path: Union[Path, str]) -> List[List[str]]:
    """Returns predictions of morphotagging model given in config :config_path:.

    Args:
        config_path: a path to the config file

    Returns:
        a list of morphological analyses for each sentence. Each analysis is either a list of tags
        or a list of full CONLL-U descriptions.

    """
    config = read_json(config_path)
    set_deeppavlov_root(config)

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    read_params = {
        k: v
        for k, v in reader_config.items() if k not in ['name', 'data_path']
    }
    data: Dict = reader.read(data_path, **read_params)

    iterator_config = config['dataset_iterator']
    iterator: MorphoTaggerDatasetIterator = from_params(iterator_config,
                                                        data=data)

    model = build_model_from_config(config, load_trained=True)
    answers = [None] * len(iterator.test)
    batch_size = config['predict'].get("batch_size", -1)
    for indexes, (x, _) in iterator.gen_batches(batch_size=batch_size,
                                                data_type="test",
                                                shuffle=False,
                                                return_indexes=True):
        y = model(x)
        for i, elem in zip(indexes, y):
            answers[i] = elem
    outfile = config['predict'].get("outfile")
    if outfile is not None:
        outfile = Path(outfile)
        if not outfile.exists():
            outfile.parent.mkdir(parents=True, exist_ok=True)
        with open(outfile, "w", encoding="utf8") as fout:
            for elem in answers:
                fout.write(elem + "\n")
    return answers
Example #9
def build_model_from_config(config, mode='infer', load_trained=False):
    set_deeppavlov_root(config)
    if 'chainer' in config:
        model_config = config['chainer']

        model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y'))

        for component_config in model_config['pipe']:
            if load_trained and ('fit_on' in component_config or 'in_y' in component_config):
                try:
                    component_config['load_path'] = component_config['save_path']
                except KeyError:
                    log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                                .format(component_config.get('name', component_config.get('ref', 'UNKNOWN'))))
            component = from_params(component_config, vocabs=[], mode=mode)

            if 'in' in component_config:
                c_in = component_config['in']
                c_out = component_config['out']
                in_y = component_config.get('in_y', None)
                main = component_config.get('main', False)
                model.append(c_in, c_out, component, in_y, main)

        return model

    model_config = config['model']
    if load_trained:
        try:
            model_config['load_path'] = model_config['save_path']
        except KeyError:
            log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')

    vocabs = {}
    if 'vocabs' in config:
        for vocab_param_name, vocab_config in config['vocabs'].items():
            v = from_params(vocab_config, mode=mode)
            vocabs[vocab_param_name] = v
    model = from_params(model_config, vocabs=vocabs, mode=mode)
    model.reset()
    return model
Example #10
def main():
    args = parser.parse_args()

    pipeline_config_path = find_config(args.config_path)
    key_main_model = args.key_main_model
    population_size = args.p_size
    gpus = [int(gpu) for gpu in args.gpus.split(",")]
    train_partition = int(args.train_partition)
    start_from_population = int(args.start_from_population)
    path_to_population = args.path_to_population
    elitism_with_weights = args.elitism_with_weights
    iterations = int(args.iterations)

    p_crossover = args.p_cross
    pow_crossover = args.pow_cross
    p_mutation = args.p_mut
    pow_mutation = args.pow_mut

    if os.environ.get("CUDA_VISIBLE_DEVICES") is None:
        pass
    else:
        cvd = [
            int(gpu)
            for gpu in os.environ.get("CUDA_VISIBLE_DEVICES").split(",")
        ]
        if gpus == [-1]:
            gpus = cvd
        else:
            try:
                gpus = [cvd[gpu] for gpu in gpus]
            except IndexError:
                raise ConfigError(
                    "Can not use gpus `{}` with CUDA_VISIBLE_DEVICES='{}'".
                    format(",".join(map(str, gpus)), ",".join(map(str, cvd))))

    basic_params = read_json(pipeline_config_path)
    log.info("Given basic params: {}\n".format(
        json.dumps(basic_params, indent=2)))

    # Initialize evolution
    evolution = ParamsEvolution(population_size=population_size,
                                p_crossover=p_crossover,
                                crossover_power=pow_crossover,
                                p_mutation=p_mutation,
                                mutation_power=pow_mutation,
                                key_main_model=key_main_model,
                                seed=42,
                                train_partition=train_partition,
                                elitism_with_weights=elitism_with_weights,
                                **basic_params)

    considered_metrics = evolution.get_value_from_config(
        evolution.basic_config,
        list(evolution.find_model_path(evolution.basic_config, "metrics"))[0] +
        ["metrics"])

    log.info(considered_metrics)
    evolve_metric = considered_metrics[0]

    # Create table variable for gathering results
    set_deeppavlov_root(evolution.basic_config)

    expand_path(
        Path(
            evolution.get_value_from_config(
                evolution.basic_config, evolution.main_model_path +
                ["save_path"]))).mkdir(parents=True, exist_ok=True)

    result_file = expand_path(
        Path(
            evolution.get_value_from_config(
                evolution.basic_config, evolution.main_model_path +
                ["save_path"])).joinpath("result_table.csv"))

    result_table_columns = []
    result_table_dict = {}
    for el in considered_metrics:
        result_table_dict[el + "_valid"] = []
        result_table_dict[el + "_test"] = []
        result_table_columns.extend([el + "_valid", el + "_test"])

    result_table_dict["params"] = []
    result_table_columns.append("params")

    if start_from_population == 0:
        # if starting evolution from scratch
        iters = 0
        result_table = pd.DataFrame(result_table_dict)
        # write down result table file
        result_table.loc[:, result_table_columns].to_csv(result_file,
                                                         index=False,
                                                         sep='\t')

        log.info("Iteration #{} starts".format(iters))
        # randomly generate the first population
        population = evolution.first_generation()
    else:
        # if starting evolution from already existing population
        iters = start_from_population
        log.info("Iteration #{} starts".format(iters))

        population = []
        for i in range(population_size):
            population.append(
                read_json(
                    expand_path(
                        Path(path_to_population).joinpath(
                            "model_" + str(i)).joinpath("config.json"))))
            population[i] = evolution.insert_value_or_dict_into_config(
                population[i], evolution.main_model_path + ["save_path"],
                str(
                    Path(
                        evolution.get_value_from_config(
                            evolution.basic_config, evolution.main_model_path +
                            ["save_path"
                             ])).joinpath("population_" +
                                          str(start_from_population)).joinpath(
                                              "model_" +
                                              str(i)).joinpath("model")))

            population[i] = evolution.insert_value_or_dict_into_config(
                population[i], evolution.main_model_path + ["load_path"],
                str(
                    Path(
                        evolution.get_value_from_config(
                            population[i],
                            evolution.main_model_path + ["load_path"]))))

            for path_id, path_ in enumerate(evolution.paths_to_fiton_dicts):
                population[i] = evolution.insert_value_or_dict_into_config(
                    population[i], path_ + ["save_path"],
                    str(
                        Path(
                            evolution.get_value_from_config(
                                evolution.basic_config,
                                evolution.main_model_path +
                                ["save_path"])).joinpath("population_" +
                                                         str(iters)).
                        joinpath("model_" + str(i)).joinpath("fitted_model_" +
                                                             str(path_id))))

            for path_id, path_ in enumerate(evolution.paths_to_fiton_dicts):
                population[i] = evolution.insert_value_or_dict_into_config(
                    population[i], path_ + ["load_path"],
                    str(
                        Path(
                            evolution.get_value_from_config(
                                population[i], path_ + ["load_path"]))))

    run_population(population, evolution, gpus)
    population_scores = results_to_table(population, evolution,
                                         considered_metrics, result_file,
                                         result_table_columns)[evolve_metric]
    log.info("Population scores: {}".format(population_scores))
    log.info("Iteration #{} was done".format(iters))
    iters += 1

    while True:
        if iterations != -1 and start_from_population + iterations == iters:
            log.info("End of evolution on iteration #{}".format(iters))
            break
        log.info("Iteration #{} starts".format(iters))
        population = evolution.next_generation(population, population_scores,
                                               iters)
        run_population(population, evolution, gpus)
        population_scores = results_to_table(
            population, evolution, considered_metrics, result_file,
            result_table_columns)[evolve_metric]
        log.info("Population scores: {}".format(population_scores))
        log.info("Iteration #{} was done".format(iters))
        iters += 1
Example #11
def train_evaluate_model_from_config(config: Union[str, Path, dict], to_train: bool = True, to_validate: bool = True) -> None:
    """Make training and evaluation of the model described in corresponding configuration file."""
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)

    import_packages(config.get('metadata', {}).get('imports', []))

    dataset_config = config.get('dataset', None)

    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    data = []
    reader_config = config.get('dataset_reader', None)

    if reader_config:
        reader_config = config['dataset_reader']
        if 'class' in reader_config:
            c = reader_config.pop('class')
            try:
                module_name, cls_name = c.split(':')
                reader = getattr(importlib.import_module(module_name), cls_name)()
            except ValueError:
                e = ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                                .format(c))
                log.exception(e)
                raise e
        else:
            reader = get_model(reader_config.pop('name'))()
        data_path = reader_config.pop('data_path', '')
        if isinstance(data_path, list):
            data_path = [expand_path(x) for x in data_path]
        else:
            data_path = expand_path(data_path)
        data = reader.read(data_path, **reader_config)
    else:
        log.warning("No dataset reader is provided in the JSON config.")

    iterator_config = config['dataset_iterator']
    iterator: Union[DataLearningIterator, DataFittingIterator] = from_params(iterator_config,
                                                                             data=data)

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': to_validate,
        'test_best': True,
        'show_examples': False
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(zip(train_config['metrics'], get_metrics_by_names(train_config['metrics'])))

    if to_train:
        model = fit_chainer(config, iterator)

        if callable(getattr(model, 'train_on_batch', None)):
            _train_batches(model, iterator, train_config, metrics_functions)
        elif callable(getattr(model, 'fit_batches', None)):
            _fit_batches(model, iterator, train_config)
        elif callable(getattr(model, 'fit', None)):
            _fit(model, iterator, train_config)
        elif not isinstance(model, Chainer):
            log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid': _test_model(model, metrics_functions, iterator,
                                     train_config.get('batch_size', -1), 'valid',
                                     show_examples=train_config['show_examples'])
            }

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test': _test_model(model, metrics_functions, iterator,
                                    train_config.get('batch_size', -1), 'test',
                                    show_examples=train_config['show_examples'])
            }

            print(json.dumps(report, ensure_ascii=False))
Example #12
def train_evaluate_model_from_config(config: Union[str, Path, dict],
                                     to_train=True,
                                     to_validate=True) -> None:
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)

    import_packages(config.get('metadata', {}).get('imports', []))

    dataset_config = config.get('dataset', None)

    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    data = []
    reader_config = config.get('dataset_reader', None)

    if reader_config:
        reader_config = config['dataset_reader']
        if 'class' in reader_config:
            c = reader_config.pop('class')
            try:
                module_name, cls_name = c.split(':')
                reader = getattr(importlib.import_module(module_name),
                                 cls_name)()
            except ValueError:
                e = ConfigError(
                    'Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                    .format(c))
                log.exception(e)
                raise e
        else:
            reader = get_model(reader_config.pop('name'))()
        data_path = expand_path(reader_config.pop('data_path', ''))
        data = reader.read(data_path, **reader_config)
    else:
        log.warning("No dataset reader is provided in the JSON config.")

    iterator_config = config['dataset_iterator']
    iterator: Union[DataLearningIterator,
                    DataFittingIterator] = from_params(iterator_config,
                                                       data=data)

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': to_validate,
        'test_best': True
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(
        zip(train_config['metrics'],
            get_metrics_by_names(train_config['metrics'])))

    if to_train:
        model = fit_chainer(config, iterator)

        if callable(getattr(model, 'train_on_batch', None)):
            _train_batches(model, iterator, train_config, metrics_functions)
        elif callable(getattr(model, 'fit_batches', None)):
            _fit_batches(model, iterator, train_config)
        elif callable(getattr(model, 'fit', None)):
            _fit(model, iterator, train_config)
        elif not isinstance(model, Chainer):
            log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid':
                _test_model(model, metrics_functions, iterator,
                            train_config.get('batch_size', -1), 'valid')
            }

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test':
                _test_model(model, metrics_functions, iterator,
                            train_config.get('batch_size', -1), 'test')
            }

            print(json.dumps(report, ensure_ascii=False))
Example #13
def train_model_from_config(config_path: str):
    config = read_json(config_path)
    set_deeppavlov_root(config)

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    data = reader.read(data_path)

    dataset_config = config['dataset']
    dataset: Dataset = from_params(dataset_config, data=data)

    if 'chainer' in config:
        model = fit_chainer(config, dataset)
    else:
        vocabs = {}
        for vocab_param_name, vocab_config in config.get('vocabs', {}).items():
            v: Estimator = from_params(vocab_config, mode='train')
            vocabs[vocab_param_name] = _fit(v, dataset)

        model_config = config['model']
        model = from_params(model_config, vocabs=vocabs, mode='train')

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': True,
        'test_best': True
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(
        zip(train_config['metrics'],
            get_metrics_by_names(train_config['metrics'])))

    if callable(getattr(model, 'train_on_batch', None)):
        _train_batches(model, dataset, train_config, metrics_functions)
    elif callable(getattr(model, 'fit', None)):
        _fit(model, dataset, train_config)
    elif not isinstance(model, Chainer):
        log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid':
                _test_model(model, metrics_functions, dataset,
                            train_config.get('batch_size', -1), 'valid')
            }

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test':
                _test_model(model, metrics_functions, dataset,
                            train_config.get('batch_size', -1), 'test')
            }

            print(json.dumps(report, ensure_ascii=False))
Example #14
def from_params(params: Dict, mode='infer', **kwargs) -> Component:
    # what is passed in json:
    config_params = {k: _resolve(v) for k, v in params.items()}

    # get component by reference (if any)
    if 'ref' in config_params:
        try:
            return _refs[config_params['ref']]
        except KeyError:
            e = ConfigError('Component with id "{id}" was referenced but not initialized'
                            .format(id=config_params['ref']))
            log.exception(e)
            raise e

    elif 'config_path' in config_params:
        from deeppavlov.core.commands.infer import build_model_from_config
        deeppavlov_root = get_deeppavlov_root()
        refs = _refs.copy()
        _refs.clear()
        config = read_json(expand_path(config_params['config_path']))
        model = build_model_from_config(config, as_component=True)
        set_deeppavlov_root({'deeppavlov_root': deeppavlov_root})
        _refs.clear()
        _refs.update(refs)
        return model

    elif 'class' in config_params:
        c = config_params.pop('class')
        try:
            module_name, cls_name = c.split(':')
            cls = getattr(importlib.import_module(module_name), cls_name)
        except ValueError:
            e = ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                            .format(c))
            log.exception(e)
            raise e
    else:
        cls_name = config_params.pop('name', None)
        if not cls_name:
            e = ConfigError('Component config has no `name` nor `ref` or `class` fields')
            log.exception(e)
            raise e
        try:
            cls = REGISTRY[cls_name]
        except KeyError:
            e = ConfigError('Class {} is not registered.'.format(cls_name))
            log.exception(e)
            raise e

    # find the submodels params recursively
    config_params = {k: _init_param(v, mode) for k, v in config_params.items()}

    try:
        spec = inspect.getfullargspec(cls)
        if 'mode' in spec.args+spec.kwonlyargs or spec.varkw is not None:
            kwargs['mode'] = mode

        component = cls(**dict(config_params, **kwargs))
        try:
            _refs[config_params['id']] = component
        except KeyError:
            pass
    except Exception:
        log.exception("Exception in {}".format(cls))
        raise

    return component
Example #15
def train_model_from_config(config_path: str) -> None:
    config = read_json(config_path)
    set_deeppavlov_root(config)

    dataset_config = config.get('dataset', None)

    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    kwargs = {
        k: v
        for k, v in reader_config.items() if k not in ['name', 'data_path']
    }
    data = reader.read(data_path, **kwargs)

    iterator_config = config['dataset_iterator']
    iterator: BasicDatasetIterator = from_params(iterator_config, data=data)

    if 'chainer' in config:
        model = fit_chainer(config, iterator)
    else:
        vocabs = config.get('vocabs', {})
        for vocab_param_name, vocab_config in vocabs.items():
            v: Estimator = from_params(vocab_config, mode='train')
            vocabs[vocab_param_name] = _fit(v, iterator)

        model_config = config['model']
        model = from_params(model_config, vocabs=vocabs, mode='train')

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': True,
        'test_best': True
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(
        zip(train_config['metrics'],
            get_metrics_by_names(train_config['metrics'])))

    if callable(getattr(model, 'train_on_batch', None)):
        _train_batches(model, iterator, train_config, metrics_functions)
    elif callable(getattr(model, 'fit', None)):
        _fit(model, iterator, train_config)
    elif not isinstance(model, Chainer):
        log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid':
                _test_model(model, metrics_functions, iterator,
                            train_config.get('batch_size', -1), 'valid')
            }

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test':
                _test_model(model, metrics_functions, iterator,
                            train_config.get('batch_size', -1), 'test')
            }

            print(json.dumps(report, ensure_ascii=False))
Example #16
def train_evaluate_model_from_config(
        config: Union[str, Path, dict],
        iterator=None,
        to_train=True,
        to_validate=True) -> Dict[str, Dict[str, float]]:
    """Make training and evaluation of the model described in corresponding configuration file."""
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)
    import_packages(config.get('metadata', {}).get('imports', []))

    if iterator is None:
        data = read_data_by_config(config)
        iterator = get_iterator_from_config(config, data)

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': to_validate,
        'test_best': True,
        'show_examples': False
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    in_y = config['chainer'].get('in_y', ['y'])
    if isinstance(in_y, str):
        in_y = [in_y]
    if isinstance(config['chainer']['out'], str):
        config['chainer']['out'] = [config['chainer']['out']]
    metrics_functions = _parse_metrics(train_config['metrics'], in_y,
                                       config['chainer']['out'])

    if to_train:
        model = fit_chainer(config, iterator)

        if callable(getattr(model, 'train_on_batch', None)):
            _train_batches(model, iterator, train_config, metrics_functions)
        elif callable(getattr(model, 'fit_batches', None)):
            _fit_batches(model, iterator, train_config)
        elif callable(getattr(model, 'fit', None)):
            _fit(model, iterator, train_config)
        elif not isinstance(model, Chainer):
            log.warning('Nothing to train')

        model.destroy()

    res = {}

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid':
                _test_model(model,
                            metrics_functions,
                            iterator,
                            train_config.get('batch_size', -1),
                            'valid',
                            show_examples=train_config['show_examples'])
            }

            res['valid'] = report['valid']['metrics']

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test':
                _test_model(model,
                            metrics_functions,
                            iterator,
                            train_config.get('batch_size', -1),
                            'test',
                            show_examples=train_config['show_examples'])
            }

            res['test'] = report['test']['metrics']

            print(json.dumps(report, ensure_ascii=False))

        model.destroy()

    return res
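A hedged usage sketch for this variant, which returns the collected metrics (hypothetical config path):

# Sketch only -- train, then read back the metrics of the best saved model.
res = train_evaluate_model_from_config('path/to/some_config.json')
print(res.get('valid'), res.get('test'))  # metric dicts for the enabled evaluation stages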
Example #17
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 10 16:28:38 2018

@author: lsm
"""

from deeppavlov.core.commands.train import train_evaluate_model_from_config
from deeppavlov.core.commands.utils import expand_path, set_deeppavlov_root
from deeppavlov.core.common.file import read_json
import sys
sys.path.insert(0, '../..')
from model.pipeline.text_normalizer import *
from model.pipeline.embedder import *
from model.pipeline.CNN_model import *

config = read_json('/subs/deliver/deliver_config.json')
set_deeppavlov_root(config)
train_evaluate_model_from_config('model/subs/deliver/deliver_config.json')
Example #18
def from_params(params: Dict, **kwargs) -> Component:
    # what is passed in json:
    config_params = {k: _resolve(v) for k, v in params.items()}

    # get component by reference (if any)
    if 'ref' in config_params:
        try:
            return _refs[config_params['ref']]
        except KeyError:
            e = ConfigError(
                'Component with id "{id}" was referenced but not initialized'.
                format(id=config_params['ref']))
            log.exception(e)
            raise e

    elif 'config_path' in config_params:
        from deeppavlov.core.commands.infer import build_model_from_config
        deeppavlov_root = get_deeppavlov_root()
        config = read_json(expand_path(config_params['config_path']))
        model = build_model_from_config(config, as_component=True)
        set_deeppavlov_root({'deeppavlov_root': deeppavlov_root})
        return model

    elif 'class' in config_params:
        c = config_params.pop('class')
        try:
            module_name, cls_name = c.split(':')
            cls = getattr(importlib.import_module(module_name), cls_name)
        except ValueError:
            e = ConfigError(
                'Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                .format(c))
            log.exception(e)
            raise e
    else:
        cls_name = config_params.pop('name', None)
        if not cls_name:
            e = ConfigError(
                'Component config has no `name` nor `ref` or `class` fields')
            log.exception(e)
            raise e
        try:
            cls = REGISTRY[cls_name]
        except KeyError:
            e = ConfigError('Class {} is not registered.'.format(cls_name))
            log.exception(e)
            raise e

    # find the submodels params recursively
    for param_name, subcls_params in config_params.items():
        if isinstance(subcls_params, dict):
            if not {'ref', 'name', 'class', 'config_path'}.intersection(subcls_params):
                # This parameter is passed as a plain dict to the class constructor;
                # the user didn't intend it to be a component.
                for k, v in subcls_params.items():
                    subcls_params[k] = _resolve(v)
                continue

            config_params[param_name] = from_params(subcls_params,
                                                    vocabs=kwargs['vocabs'],
                                                    mode=kwargs['mode'])

    try:
        component = cls(**dict(config_params, **kwargs))
        try:
            _refs[config_params['id']] = component
        except KeyError:
            pass
    except Exception:
        log.exception("Exception in {}".format(cls))
        raise

    return component
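Example #19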
    def train(self,
              model_level,
              model_name,
              path_to_data,
              path_to_config,
              path_to_global_embeddings,
              test_size=0.15,
              aug_method='word_dropout',
              samples_per_class=None,
              class_names=None,
              path_to_save_file=None,
              path_to_resulting_file=None):
        # preparing training/testing data
        df_raw = pd.read_csv(path_to_data)
        # preparing config
        config = read_json(path_to_config)

        if 'labels' not in df_raw or 'text' not in df_raw:
            raise InvalidDataFormatError(
                '\'labels\' and \'text\' columns must be in the dataframe')

        if model_level not in ['root', 'subs']:
            raise InvalidModelLevelError(
                'model level should be either \'root\' or \'subs\'')

        __df_train, df_test, _, _ = train_test_split(df_raw,
                                                     df_raw,
                                                     test_size=test_size)
        df_train, df_val, _, _ = train_test_split(__df_train,
                                                  __df_train,
                                                  test_size=test_size)

        if aug_method not in ['word_dropout', 'duplicate']:
            raise InvalidDataAugmentationMethodError(
                '\'aug_method\' should be  \'word_dropout\' or \'duplicate\'')

        df_train_equalized = self.__data_equalizer.equalize_classes(
            df_train, samples_per_class, aug_method)

        model_path = config['model_path']

        if not os.path.isdir(model_path):
            os.mkdir(model_path)
        if not os.path.isdir(model_path + 'data/'):
            os.mkdir(model_path + 'data/')
        df_train_equalized.to_csv(model_path + 'data/train.csv')
        df_val[['text',
                'labels']].sample(frac=1).to_csv(model_path + 'data/valid.csv')
        df_test[['text',
                 'labels']].sample(frac=1).to_csv(model_path + 'df_test.csv')

        # making embeddings
        emb_len = IntentsClassifier.get_config_element_by_name(
            config=config['chainer']['pipe'], name='embedder')['emb_len']
        eb = EmbeddingsBuilder(
            resulting_dim=emb_len,
            path_to_original_embeddings=path_to_global_embeddings)
        tc = TextCorrector()
        corpus_cleaned = tc.tn.transform(df_raw.text.tolist())
        if not os.path.isfile(model_path + 'ft_compressed.pkl'):
            eb.compress_embeddings(corpus_cleaned,
                                   model_path + 'ft_compressed.pkl', 'pca',
                                   eb.path_to_original_embeddings)
        gc.collect()
        if not os.path.isfile(model_path + 'ft_compressed_local.pkl'):
            eb.build_local_embeddings(corpus_cleaned,
                                      model_path + 'ft_compressed_local.pkl')
        # dealing with class_names
        if isinstance(class_names, list):
            with open(model_path + 'class_names.pkl', 'wb') as f:
                pickle.dump(class_names, f)
        else:
            with open(model_path + 'class_names.pkl', 'wb') as f:
                pickle.dump(df_train['labels'].value_counts().index.tolist(), f)
        # setting up saving and loading
        if path_to_save_file is not None:
            config['chainer']['pipe'][-1]['save_path'] = path_to_save_file + '/' + 'weights.hdf5'
            if not os.path.isdir(path_to_save_file):
                os.mkdir(path_to_save_file)

        if path_to_resulting_file is not None and not os.path.isdir(path_to_resulting_file):
            os.mkdir(path_to_resulting_file)
        emb_config = IntentsClassifier.get_config_element_by_name(
            config['chainer']['pipe'], 'embedder')
        cnn_config = IntentsClassifier.get_config_element_by_name(
            config['chainer']['pipe'], 'cnn_model')
        config['chainer']['pipe'][config['chainer']['pipe'].index(emb_config)][
            'load_path'][0] = model_path + config['chainer']['pipe'][
                config['chainer']['pipe'].index(emb_config)]['load_path'][0]
        config['chainer']['pipe'][config['chainer']['pipe'].index(emb_config)][
            'load_path'][1] = model_path + config['chainer']['pipe'][
                config['chainer']['pipe'].index(emb_config)]['load_path'][1]
        config['chainer']['pipe'][config['chainer']['pipe'].index(
            cnn_config)]['classes'] = model_path + config['chainer']['pipe'][
                config['chainer']['pipe'].index(cnn_config)]['classes']
        config['dataset_reader'][
            'data_path'] = model_path + config['dataset_reader']['data_path']
        config['train']['tensorboard_log_dir'] = model_path + config['train'][
            'tensorboard_log_dir']
        load_path_bckp = config['chainer']['pipe'][-1]['load_path']
        check_results = self.check_config(config)
        if len(check_results) > 0:
            raise InvalidConfig(check_results, model_path,
                                'Config file is invalid')

        # training
        set_deeppavlov_root(config)
        # update training status
        training_status = 'Classification model {} {} is currently training. Total number of epochs is set to {}'.format(
            model_level, model_name, config['train']['epochs'])
        with open(model_path + 'status.txt', 'w') as f:
            f.writelines(training_status)
        # run the training
        train_evaluate_model_from_config(config)
        # fixing load_path
        # updating status
        perf = IntentsClassifier.get_latest_accuracy(
            config)  #self.get_performance(config, model_path + 'df_test.csv')
        training_status = 'Classification model {} {} is trained \nf1_score (macro avg): {}'.format(
            model_level, model_name, perf)
        with open(model_path + 'status.txt', 'w') as f:
            f.writelines(training_status)
        # getting performance
        config['chainer']['pipe'][-1]['load_path'] = load_path_bckp
        copy(
            path_to_save_file + '/' + 'weights.hdf5', path_to_resulting_file +
            '/' + config['chainer']['pipe'][-1]['load_path'])
        copy(path_to_save_file + '/' + 'weights.hdf5',
             model_path + config['chainer']['pipe'][-1]['load_path'])
Example #20
def main():
    args = parser.parse_args()

    pipeline_config_path = find_config(args.config_path)
    key_main_model = args.key_main_model
    population_size = args.p_size
    gpus = [int(gpu) for gpu in args.gpus.split(",")]
    train_partition = int(args.train_partition)
    start_from_population = int(args.start_from_population)
    path_to_population = args.path_to_population
    elitism_with_weights = args.elitism_with_weights
    iterations = int(args.iterations)

    p_crossover = args.p_cross
    pow_crossover = args.pow_cross
    p_mutation = args.p_mut
    pow_mutation = args.pow_mut

    if os.environ.get("CUDA_VISIBLE_DEVICES") is None:
        pass
    else:
        cvd = [int(gpu) for gpu in os.environ.get("CUDA_VISIBLE_DEVICES").split(",")]
        if gpus == [-1]:
            gpus = cvd
        else:
            try:
                gpus = [cvd[gpu] for gpu in gpus]
            except IndexError:
                raise ConfigError("Can not use gpus `{}` with CUDA_VISIBLE_DEVICES='{}'".format(
                    ",".join(map(str, gpus)), ",".join(map(str, cvd))
                ))

    basic_params = read_json(pipeline_config_path)
    log.info("Given basic params: {}\n".format(json.dumps(basic_params, indent=2)))

    # Initialize evolution
    evolution = ParamsEvolution(population_size=population_size,
                                p_crossover=p_crossover, crossover_power=pow_crossover,
                                p_mutation=p_mutation, mutation_power=pow_mutation,
                                key_main_model=key_main_model,
                                seed=42,
                                train_partition=train_partition,
                                elitism_with_weights=elitism_with_weights,
                                **basic_params)

    considered_metrics = evolution.get_value_from_config(evolution.basic_config,
                                                         list(evolution.find_model_path(
                                                             evolution.basic_config, "metrics"))[0] + ["metrics"])

    log.info(considered_metrics)
    evolve_metric = considered_metrics[0]

    # Create table variable for gathering results
    set_deeppavlov_root(evolution.basic_config)

    expand_path(Path(evolution.get_value_from_config(
        evolution.basic_config, evolution.main_model_path + ["save_path"]))).mkdir(parents=True, exist_ok=True)

    result_file = expand_path(Path(evolution.get_value_from_config(evolution.basic_config,
                                                                   evolution.main_model_path + ["save_path"])
                                   ).joinpath("result_table.csv"))

    result_table_columns = []
    result_table_dict = {}
    for el in considered_metrics:
        result_table_dict[el + "_valid"] = []
        result_table_dict[el + "_test"] = []
        result_table_columns.extend([el + "_valid", el + "_test"])

    result_table_dict["params"] = []
    result_table_columns.append("params")

    if start_from_population == 0:
        # if starting evolution from scratch
        iters = 0
        result_table = pd.DataFrame(result_table_dict)
        # write down result table file
        result_table.loc[:, result_table_columns].to_csv(result_file, index=False, sep='\t')

        log.info("Iteration #{} starts".format(iters))
        # randomly generate the first population
        population = evolution.first_generation()
    else:
        # if starting evolution from already existing population
        iters = start_from_population
        log.info("Iteration #{} starts".format(iters))

        population = []
        for i in range(population_size):
            population.append(read_json(expand_path(Path(path_to_population).joinpath(
                "model_" + str(i)).joinpath("config.json"))))
            population[i] = evolution.insert_value_or_dict_into_config(
                population[i], evolution.main_model_path + ["save_path"],
                str(Path(
                    evolution.get_value_from_config(evolution.basic_config, evolution.main_model_path + ["save_path"])
                    ).joinpath(
                    "population_" + str(start_from_population)).joinpath(
                    "model_" + str(i)).joinpath(
                    "model")))

            population[i] = evolution.insert_value_or_dict_into_config(
                population[i], evolution.main_model_path + ["load_path"],
                str(Path(
                    evolution.get_value_from_config(population[i], evolution.main_model_path + ["load_path"]))))

            for path_id, path_ in enumerate(evolution.paths_to_fiton_dicts):
                population[i] = evolution.insert_value_or_dict_into_config(
                    population[i], path_ + ["save_path"],
                    str(Path(evolution.get_value_from_config(evolution.basic_config,
                                                             evolution.main_model_path + ["save_path"])
                             ).joinpath("population_" + str(iters)).joinpath("model_" + str(i)).joinpath(
                        "fitted_model_" + str(path_id))))

            for path_id, path_ in enumerate(evolution.paths_to_fiton_dicts):
                population[i] = evolution.insert_value_or_dict_into_config(
                    population[i], path_ + ["load_path"],
                    str(Path(evolution.get_value_from_config(
                        population[i], path_ + ["load_path"]))))

    run_population(population, evolution, gpus)
    population_scores = results_to_table(population, evolution, considered_metrics,
                                         result_file, result_table_columns)[evolve_metric]
    log.info("Population scores: {}".format(population_scores))
    log.info("Iteration #{} was done".format(iters))
    iters += 1

    while True:
        if iterations != -1 and start_from_population + iterations == iters:
            log.info("End of evolution on iteration #{}".format(iters))
            break
        log.info("Iteration #{} starts".format(iters))
        population = evolution.next_generation(population, population_scores, iters)
        run_population(population, evolution, gpus)
        population_scores = results_to_table(population, evolution, considered_metrics,
                                             result_file, result_table_columns)[evolve_metric]
        log.info("Population scores: {}".format(population_scores))
        log.info("Iteration #{} was done".format(iters))
        iters += 1