def build_model_from_config(config: [str, Path, dict], mode: str = 'infer', load_trained: bool = False,
                            as_component: bool = False) -> Chainer:
    """Assemble a ``Chainer`` pipeline from a configuration.

    Args:
        config: parsed configuration dictionary, or a path (``str``/``Path``)
            that is loaded with ``read_json``.
        mode: forwarded to ``from_params`` for every pipe component.
        load_trained: when true, every component whose config contains
            ``fit_on`` or ``in_y`` gets its ``load_path`` overwritten with its
            ``save_path`` (a warning is logged if ``save_path`` is absent).
        as_component: forwarded to the ``Chainer`` constructor.

    Returns:
        The chainer with every component that declares an ``in`` key appended.
    """
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)

    packages = config.get('metadata', {}).get('imports', [])
    import_packages(packages)

    chainer_config = config['chainer']
    model = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'),
                    as_component=as_component)

    for component_config in chainer_config['pipe']:
        if load_trained and ('fit_on' in component_config or 'in_y' in component_config):
            if 'save_path' in component_config:
                # the component config is updated in place
                component_config['load_path'] = component_config['save_path']
            else:
                label = component_config.get('name', component_config.get('ref', 'UNKNOWN'))
                log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                            .format(label))

        component = from_params(component_config, mode=mode)

        # components without an `in` key are instantiated but not added to the chainer
        if 'in' not in component_config:
            continue
        model.append(component,
                     component_config['in'],
                     component_config['out'],
                     component_config.get('in_y', None),
                     component_config.get('main', False))

    return model
def build_model_from_config(config, mode='infer', load_trained=False, as_component=False):
    """Create a ``Chainer`` from an already parsed configuration dictionary.

    Args:
        config: configuration dictionary with a ``chainer`` section.
        mode: forwarded to ``from_params`` for every pipe component.
        load_trained: when true, components having ``fit_on`` or ``in_y`` in
            their config get ``load_path`` replaced by ``save_path``.
        as_component: forwarded to the ``Chainer`` constructor.

    Returns:
        The assembled chainer.
    """
    set_deeppavlov_root(config)

    chainer_params = config['chainer']
    model = Chainer(chainer_params['in'], chainer_params['out'], chainer_params.get('in_y'),
                    as_component=as_component)

    for component_config in chainer_params['pipe']:
        if load_trained and ('fit_on' in component_config or 'in_y' in component_config):
            if 'save_path' not in component_config:
                component_label = component_config.get('name', component_config.get('ref', 'UNKNOWN'))
                log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                            .format(component_label))
            else:
                component_config['load_path'] = component_config['save_path']

        component = from_params(component_config, mode=mode)

        if 'in' in component_config:
            model.append(component,
                         component_config['in'],
                         component_config['out'],
                         component_config.get('in_y', None),
                         component_config.get('main', False))

    return model
def predict_with_model(config_path):
    """Run the model described by a config on the ``test`` part of its data.

    The dataset reader and iterator are created from the config, the model is
    built with ``load_trained=True`` and each prediction is stored at the index
    reported by the iterator. If the ``predict`` section names an ``outfile``,
    every answer is written to it on its own line.

    Args:
        config_path: path to the JSON config.

    Returns:
        The list of answers, one per element of ``iterator.test``.
    """
    config = read_json(config_path)
    set_deeppavlov_root(config)

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    # everything except `name` and `data_path` is passed to the reader
    read_params = dict(reader_config)
    read_params.pop('name', None)
    read_params.pop('data_path', None)
    data = reader.read(data_path, **read_params)

    iterator: MorphoTaggerDatasetIterator = from_params(config['dataset_iterator'], data=data)

    model = build_model_from_config(config, load_trained=True)
    answers = [None] * len(iterator.test)
    batch_size = config['predict'].get("batch_size", -1)
    batches = iterator.gen_batches(batch_size=batch_size, data_type="test",
                                   shuffle=False, return_indexes=True)
    for positions, (batch_x, _) in batches:
        predictions = model(batch_x)
        for position, prediction in zip(positions, predictions):
            answers[position] = prediction

    outfile = config['predict'].get("outfile")
    if outfile is None:
        return answers

    outfile = Path(outfile)
    if not outfile.exists():
        outfile.parent.mkdir(parents=True, exist_ok=True)
    with open(outfile, "w", encoding="utf8") as fout:
        for answer in answers:
            fout.write(answer + "\n")
    return answers
def get_config_downloads(config_path):
    """Collect the downloads declared by a config and by every config it references.

    The deeppavlov root is temporarily switched to the one of the processed
    config and is always restored before returning, even if reading the config
    or one of the referenced configs fails.

    Args:
        config_path: path to the JSON config.

    Returns:
        A set of ``(url, dest)`` tuples, where ``dest`` is the expanded
        ``subdir`` of the resource (the expansion of ``''`` when not given).
    """
    dp_root_back = get_deeppavlov_root()
    try:
        config = read_json(config_path)
        set_deeppavlov_root(config)

        downloads = set()
        if 'metadata' in config and 'download' in config['metadata']:
            for resource in config['metadata']['download']:
                # a bare string is a shorthand for {'url': <string>}
                if isinstance(resource, str):
                    resource = {'url': resource}

                url = resource['url']
                dest = expand_path(resource.get('subdir', ''))
                downloads.add((url, dest))

        # expand every reference before recursing: the recursion changes the root
        config_references = [expand_path(config_ref)
                             for config_ref in get_all_elems_from_json(config, 'config_path')]

        for config_reference in config_references:
            downloads |= get_config_downloads(config_reference)
    finally:
        set_deeppavlov_root({'deeppavlov_root': dp_root_back})

    return downloads
def build_model_from_config(config: [str, Path, dict], mode: str = 'infer', load_trained: bool = False) -> Chainer:
    """Construct the ``Chainer`` described by a configuration.

    Args:
        config: configuration dictionary, or a path (``str``/``Path``) to a
            JSON file that is read with ``read_json``.
        mode: forwarded to ``from_params`` for every pipe component.
        load_trained: when true, ``load_path`` of every component having
            ``fit_on`` or ``in_y`` in its config is replaced by its ``save_path``.

    Returns:
        The chainer holding every component whose config has an ``in`` key.
    """
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)

    metadata = config.get('metadata', {})
    import_packages(metadata.get('imports', []))

    pipeline = config['chainer']
    model = Chainer(pipeline['in'], pipeline['out'], pipeline.get('in_y'))

    for component_config in pipeline['pipe']:
        if load_trained and ('fit_on' in component_config or 'in_y' in component_config):
            if 'save_path' in component_config:
                component_config['load_path'] = component_config['save_path']
            else:
                who = component_config.get('name', component_config.get('ref', 'UNKNOWN'))
                log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                            .format(who))

        component = from_params(component_config, mode=mode)

        if 'in' not in component_config:
            continue
        inputs = component_config['in']
        outputs = component_config['out']
        targets = component_config.get('in_y', None)
        is_main = component_config.get('main', False)
        model.append(component, inputs, outputs, targets, is_main)

    return model
def from_params(params: Dict, mode: str = 'infer', **kwargs) -> Component:
    """Builds and returns the Component from corresponding dictionary of parameters.

    Args:
        params: component configuration; every value is passed through ``_resolve``.
        mode: used to initialise nested parameters and passed to the
            constructor when it accepts ``mode`` (explicitly or via ``**kwargs``).
        **kwargs: additional keyword arguments for the component constructor.

    Returns:
        The component registered under ``ref``, the model built from
        ``config_path``, or a new instance of the class given by ``class``/``name``.

    Raises:
        ConfigError: if ``ref`` points to an unknown id or the config has
            none of the ``ref``, ``config_path``, ``class`` and ``name`` fields.
    """
    # what is passed in json:
    config_params = {k: _resolve(v) for k, v in params.items()}

    # get component by reference (if any)
    if 'ref' in config_params:
        try:
            return _refs[config_params['ref']]
        except KeyError:
            e = ConfigError('Component with id "{id}" was referenced but not initialized'
                            .format(id=config_params['ref']))
            log.exception(e)
            raise e

    elif 'config_path' in config_params:
        # imported here rather than at module level (presumably to avoid a circular import)
        from deeppavlov.core.commands.infer import build_model_from_config
        deeppavlov_root = get_deeppavlov_root()
        refs = _refs.copy()
        _refs.clear()
        try:
            config = read_json(expand_path(config_params['config_path']))
            model = build_model_from_config(config)
        finally:
            # restore the outer root and references even when the nested build fails
            set_deeppavlov_root({'deeppavlov_root': deeppavlov_root})
            _refs.clear()
            _refs.update(refs)
        return model

    elif 'class' in config_params:
        cls = cls_from_str(config_params.pop('class'))
    else:
        cls_name = config_params.pop('name', None)
        if not cls_name:
            e = ConfigError('Component config has no `name` nor `ref` or `class` fields')
            log.exception(e)
            raise e
        cls = get_model(cls_name)

    # find the submodels params recursively
    config_params = {k: _init_param(v, mode) for k, v in config_params.items()}

    try:
        spec = inspect.getfullargspec(cls)
        if 'mode' in spec.args + spec.kwonlyargs or spec.varkw is not None:
            kwargs['mode'] = mode

        component = cls(**dict(config_params, **kwargs))
        # remember the component so later configs can refer to it by its id
        try:
            _refs[config_params['id']] = component
        except KeyError:
            pass
    except Exception:
        log.exception("Exception in {}".format(cls))
        raise

    return component
def from_params(params: Dict, mode: str = 'infer', **kwargs) -> Component:
    """Builds and returns the Component from corresponding dictionary of parameters.

    Args:
        params: component configuration; every value is passed through ``_resolve``.
        mode: used to initialise nested parameters and passed to the
            constructor when it accepts ``mode`` (explicitly or via ``**kwargs``).
        **kwargs: additional keyword arguments for the component constructor.

    Returns:
        The component registered under ``ref``, the model built (as a
        component) from ``config_path``, or a new instance of the class given
        by ``class``/``name``.

    Raises:
        ConfigError: if ``ref`` points to an unknown id or the config has
            none of the ``ref``, ``config_path``, ``class`` and ``name`` fields.
    """
    # what is passed in json:
    config_params = {k: _resolve(v) for k, v in params.items()}

    # get component by reference (if any)
    if 'ref' in config_params:
        try:
            return _refs[config_params['ref']]
        except KeyError:
            e = ConfigError('Component with id "{id}" was referenced but not initialized'
                            .format(id=config_params['ref']))
            log.exception(e)
            raise e

    elif 'config_path' in config_params:
        # imported here rather than at module level (presumably to avoid a circular import)
        from deeppavlov.core.commands.infer import build_model_from_config
        deeppavlov_root = get_deeppavlov_root()
        refs = _refs.copy()
        _refs.clear()
        try:
            config = read_json(expand_path(config_params['config_path']))
            model = build_model_from_config(config, as_component=True)
        finally:
            # restore the outer root and references even when the nested build fails
            set_deeppavlov_root({'deeppavlov_root': deeppavlov_root})
            _refs.clear()
            _refs.update(refs)
        return model

    elif 'class' in config_params:
        cls = cls_from_str(config_params.pop('class'))
    else:
        cls_name = config_params.pop('name', None)
        if not cls_name:
            e = ConfigError('Component config has no `name` nor `ref` or `class` fields')
            log.exception(e)
            raise e
        cls = get_model(cls_name)

    # find the submodels params recursively
    config_params = {k: _init_param(v, mode) for k, v in config_params.items()}

    try:
        spec = inspect.getfullargspec(cls)
        if 'mode' in spec.args+spec.kwonlyargs or spec.varkw is not None:
            kwargs['mode'] = mode

        component = cls(**dict(config_params, **kwargs))
        # remember the component so later configs can refer to it by its id
        try:
            _refs[config_params['id']] = component
        except KeyError:
            pass
    except Exception:
        log.exception("Exception in {}".format(cls))
        raise

    return component
def predict_with_model(config_path: [Path, str]) -> List[List[str]]:
    """Returns predictions of morphotagging model given in config :config_path:.

    The model is built with ``load_trained=True`` and applied to the ``test``
    data of the iterator; when the ``predict`` section of the config contains
    ``outfile``, each answer is also written to that file on a separate line.

    Args:
        config_path: a path to config

    Returns:
        a list of morphological analyses for each sentence. Each analysis is either a list of tags
        or a list of full CONLL-U descriptions.
    """
    config = read_json(config_path)
    set_deeppavlov_root(config)

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    # every reader option except `name` and `data_path` goes to `read`
    read_params = {}
    for key, value in reader_config.items():
        if key not in ['name', 'data_path']:
            read_params[key] = value
    data: Dict = reader.read(data_path, **read_params)

    iterator_config = config['dataset_iterator']
    iterator: MorphoTaggerDatasetIterator = from_params(iterator_config, data=data)

    model = build_model_from_config(config, load_trained=True)
    answers = [None] * len(iterator.test)
    predict_config = config['predict']
    batch_size = predict_config.get("batch_size", -1)
    for indexes, (x, _) in iterator.gen_batches(batch_size=batch_size, data_type="test",
                                                shuffle=False, return_indexes=True):
        # place each prediction at the index the iterator reported for it
        for index, analysis in zip(indexes, model(x)):
            answers[index] = analysis

    outfile = config['predict'].get("outfile")
    if outfile is None:
        return answers

    outfile = Path(outfile)
    if not outfile.exists():
        outfile.parent.mkdir(parents=True, exist_ok=True)
    with open(outfile, "w", encoding="utf8") as fout:
        for analysis in answers:
            fout.write(analysis + "\n")
    return answers
def build_model_from_config(config, mode='infer', load_trained=False):
    """Build a model from a parsed config with a ``chainer`` or a ``model`` section.

    With a ``chainer`` section a ``Chainer`` is filled with the pipe
    components. Otherwise the vocabs from the optional ``vocabs`` section are
    created first, passed to the single model from the ``model`` section, and
    ``reset()`` is called on that model.

    Args:
        config: parsed configuration dictionary.
        mode: forwarded to ``from_params``.
        load_trained: when true, ``load_path`` is replaced by ``save_path``
            (for chainer components having ``fit_on`` or ``in_y``, or for the
            single model).

    Returns:
        The chainer or the single model.
    """
    set_deeppavlov_root(config)

    if 'chainer' not in config:
        model_config = config['model']
        if load_trained:
            if 'save_path' in model_config:
                model_config['load_path'] = model_config['save_path']
            else:
                log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')

        if 'vocabs' in config:
            vocabs = {vocab_param_name: from_params(vocab_config, mode=mode)
                      for vocab_param_name, vocab_config in config['vocabs'].items()}
        else:
            vocabs = {}

        model = from_params(model_config, vocabs=vocabs, mode=mode)
        model.reset()
        return model

    chainer_config = config['chainer']
    model = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))

    for component_config in chainer_config['pipe']:
        if load_trained and ('fit_on' in component_config or 'in_y' in component_config):
            if 'save_path' in component_config:
                component_config['load_path'] = component_config['save_path']
            else:
                label = component_config.get('name', component_config.get('ref', 'UNKNOWN'))
                log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                            .format(label))

        component = from_params(component_config, vocabs=[], mode=mode)

        if 'in' not in component_config:
            continue
        inputs = component_config['in']
        outputs = component_config['out']
        targets = component_config.get('in_y', None)
        is_main = component_config.get('main', False)
        # this version of `append` takes the component as the third argument
        model.append(inputs, outputs, component, targets, is_main)

    return model
def main():
    """Run the evolution of model parameters from the command line.

    Reads the command line arguments, resolves the GPUs to use, creates a
    ``ParamsEvolution`` from the basic config, prepares the tab-separated
    result table, builds (or reloads) the first population, and then keeps
    producing and running new generations until the requested number of
    iterations is reached (forever when ``iterations`` is ``-1``).
    """
    args = parser.parse_args()

    pipeline_config_path = find_config(args.config_path)
    key_main_model = args.key_main_model
    population_size = args.p_size
    gpus = [int(gpu) for gpu in args.gpus.split(",")]
    train_partition = int(args.train_partition)
    start_from_population = int(args.start_from_population)
    path_to_population = args.path_to_population
    elitism_with_weights = args.elitism_with_weights
    iterations = int(args.iterations)

    p_crossover = args.p_cross
    pow_crossover = args.pow_cross
    p_mutation = args.p_mut
    pow_mutation = args.pow_mut

    # When CUDA_VISIBLE_DEVICES is set, the requested gpu numbers are treated as
    # positions in that list; `-1` selects every visible device.
    if os.environ.get("CUDA_VISIBLE_DEVICES") is None:
        pass
    else:
        cvd = [
            int(gpu)
            for gpu in os.environ.get("CUDA_VISIBLE_DEVICES").split(",")
        ]
        if gpus == [-1]:
            gpus = cvd
        else:
            try:
                gpus = [cvd[gpu] for gpu in gpus]
            except IndexError:
                raise ConfigError(
                    "Can not use gpus `{}` with CUDA_VISIBLE_DEVICES='{}'".
                    format(",".join(map(str, gpus)), ",".join(map(str, cvd))))

    basic_params = read_json(pipeline_config_path)
    log.info("Given basic params: {}\n".format(
        json.dumps(basic_params, indent=2)))

    # Initialize evolution
    evolution = ParamsEvolution(population_size=population_size,
                                p_crossover=p_crossover,
                                crossover_power=pow_crossover,
                                p_mutation=p_mutation,
                                mutation_power=pow_mutation,
                                key_main_model=key_main_model,
                                seed=42,
                                train_partition=train_partition,
                                elitism_with_weights=elitism_with_weights,
                                **basic_params)

    # the metrics list is taken from the first place in the config that has a "metrics" key
    considered_metrics = evolution.get_value_from_config(
        evolution.basic_config,
        list(evolution.find_model_path(evolution.basic_config,
                                       "metrics"))[0] + ["metrics"])

    log.info(considered_metrics)
    # the first metric is the one the population scores are taken from
    evolve_metric = considered_metrics[0]

    # Create table variable for gathering results
    set_deeppavlov_root(evolution.basic_config)

    # make sure the save directory of the main model exists
    expand_path(
        Path(
            evolution.get_value_from_config(
                evolution.basic_config,
                evolution.main_model_path + ["save_path"]))).mkdir(parents=True, exist_ok=True)

    result_file = expand_path(
        Path(
            evolution.get_value_from_config(
                evolution.basic_config, evolution.main_model_path +
                ["save_path"])).joinpath("result_table.csv"))

    # two columns (valid/test) per metric plus a final "params" column
    result_table_columns = []
    result_table_dict = {}
    for el in considered_metrics:
        result_table_dict[el + "_valid"] = []
        result_table_dict[el + "_test"] = []
        result_table_columns.extend([el + "_valid", el + "_test"])

    result_table_dict["params"] = []
    result_table_columns.append("params")

    if start_from_population == 0:
        # if starting evolution from scratch
        iters = 0
        result_table = pd.DataFrame(result_table_dict)
        # write down result table file
        result_table.loc[:, result_table_columns].to_csv(result_file,
                                                         index=False,
                                                         sep='\t')

        log.info("Iteration #{} starts".format(iters))
        # randomly generate the first population
        population = evolution.first_generation()
    else:
        # if starting evolution from already existing population
        iters = start_from_population
        log.info("Iteration #{} starts".format(iters))

        population = []
        for i in range(population_size):
            # each individual is read from <path_to_population>/model_<i>/config.json
            population.append(
                read_json(
                    expand_path(
                        Path(path_to_population).joinpath(
                            "model_" + str(i)).joinpath("config.json"))))
            # main model is saved under <save_path>/population_<n>/model_<i>/model
            population[i] = evolution.insert_value_or_dict_into_config(
                population[i], evolution.main_model_path + ["save_path"],
                str(
                    Path(
                        evolution.get_value_from_config(
                            evolution.basic_config,
                            evolution.main_model_path + ["save_path"
                                                         ])).joinpath("population_" + str(start_from_population)).joinpath(
                                                             "model_" + str(i)).joinpath("model")))

            # the load path is re-inserted after a round trip through Path
            population[i] = evolution.insert_value_or_dict_into_config(
                population[i], evolution.main_model_path + ["load_path"],
                str(
                    Path(
                        evolution.get_value_from_config(
                            population[i],
                            evolution.main_model_path + ["load_path"]))))

            # same treatment for every entry of `paths_to_fiton_dicts`
            for path_id, path_ in enumerate(evolution.paths_to_fiton_dicts):
                population[i] = evolution.insert_value_or_dict_into_config(
                    population[i], path_ + ["save_path"],
                    str(
                        Path(
                            evolution.get_value_from_config(
                                evolution.basic_config,
                                evolution.main_model_path + ["save_path"])).joinpath("population_" + str(iters)).
                        joinpath("model_" + str(i)).joinpath("fitted_model_" + str(path_id))))

            for path_id, path_ in enumerate(evolution.paths_to_fiton_dicts):
                population[i] = evolution.insert_value_or_dict_into_config(
                    population[i], path_ + ["load_path"],
                    str(
                        Path(
                            evolution.get_value_from_config(
                                population[i], path_ + ["load_path"]))))

    # run the initial population and collect its scores
    run_population(population, evolution, gpus)
    population_scores = results_to_table(population, evolution,
                                         considered_metrics, result_file,
                                         result_table_columns)[evolve_metric]
    log.info("Population scores: {}".format(population_scores))
    log.info("Iteration #{} was done".format(iters))
    iters += 1

    while True:
        # `iterations == -1` disables the stop condition
        if iterations != -1 and start_from_population + iterations == iters:
            log.info("End of evolution on iteration #{}".format(iters))
            break
        log.info("Iteration #{} starts".format(iters))
        population = evolution.next_generation(population, population_scores,
                                               iters)
        run_population(population, evolution, gpus)
        population_scores = results_to_table(
            population, evolution, considered_metrics, result_file,
            result_table_columns)[evolve_metric]
        log.info("Population scores: {}".format(population_scores))
        log.info("Iteration #{} was done".format(iters))
        iters += 1
def train_evaluate_model_from_config(config: [str, Path, dict], to_train: bool = True,
                                     to_validate: bool = True) -> None:
    """Make training and evaluation of the model described in corresponding configuration file.

    Args:
        config: configuration dictionary, or a path (``str``/``Path``) to a
            JSON file that is read with ``read_json``.
        to_train: whether to fit/train the model before evaluation.
        to_validate: default for the ``validate_best`` train option; the
            ``train`` section of the config overrides it.

    Raises:
        ConfigError: if the reader ``class`` is not in ``module:ClassName`` form.
        Exception: if the legacy ``dataset`` section has an unsupported ``type``.
    """
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)

    import_packages(config.get('metadata', {}).get('imports', []))

    dataset_config = config.get('dataset', None)

    # a `dataset` section is replaced in place by reader and iterator sections
    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    data = []
    reader_config = config.get('dataset_reader', None)

    if reader_config:
        # Work on a shallow copy: the keys popped below must stay in the caller's
        # config, otherwise a second call with the same dict fails with KeyError.
        reader_config = dict(reader_config)
        if 'class' in reader_config:
            c = reader_config.pop('class')
            try:
                module_name, cls_name = c.split(':')
                reader = getattr(importlib.import_module(module_name), cls_name)()
            except ValueError:
                e = ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                                .format(c))
                log.exception(e)
                raise e
        else:
            reader = get_model(reader_config.pop('name'))()
        data_path = reader_config.pop('data_path', '')
        if isinstance(data_path, list):
            data_path = [expand_path(x) for x in data_path]
        else:
            data_path = expand_path(data_path)
        # whatever is left in the reader config is passed to `read`
        data = reader.read(data_path, **reader_config)
    else:
        log.warning("No dataset reader is provided in the JSON config.")

    iterator_config = config['dataset_iterator']
    iterator: Union[DataLearningIterator, DataFittingIterator] = from_params(iterator_config, data=data)

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': to_validate,
        'test_best': True,
        'show_examples': False
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(zip(train_config['metrics'], get_metrics_by_names(train_config['metrics'])))

    if to_train:
        model = fit_chainer(config, iterator)

        # use the first training interface the model provides
        if callable(getattr(model, 'train_on_batch', None)):
            _train_batches(model, iterator, train_config, metrics_functions)
        elif callable(getattr(model, 'fit_batches', None)):
            _fit_batches(model, iterator, train_config)
        elif callable(getattr(model, 'fit', None)):
            _fit(model, iterator, train_config)
        elif not isinstance(model, Chainer):
            log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # the model is rebuilt with `load_trained=True` before it is evaluated
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid': _test_model(model, metrics_functions, iterator,
                                     train_config.get('batch_size', -1), 'valid',
                                     show_examples=train_config['show_examples'])
            }

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test': _test_model(model, metrics_functions, iterator,
                                    train_config.get('batch_size', -1), 'test',
                                    show_examples=train_config['show_examples'])
            }

            print(json.dumps(report, ensure_ascii=False))
def train_evaluate_model_from_config(config: [str, Path, dict], to_train=True, to_validate=True) -> None:
    """Train the model described by a configuration and evaluate it.

    Args:
        config: configuration dictionary, or a path (``str``/``Path``) to a
            JSON file that is read with ``read_json``.
        to_train: whether to fit/train the model before evaluation.
        to_validate: default for the ``validate_best`` train option; the
            ``train`` section of the config overrides it.

    Raises:
        ConfigError: if the reader ``class`` is not in ``module:ClassName`` form.
        Exception: if the legacy ``dataset`` section has an unsupported ``type``.
    """
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)

    import_packages(config.get('metadata', {}).get('imports', []))

    dataset_config = config.get('dataset', None)

    # a `dataset` section is replaced in place by reader and iterator sections
    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    data = []
    reader_config = config.get('dataset_reader', None)

    if reader_config:
        # Work on a shallow copy: the keys popped below must stay in the caller's
        # config, otherwise a second call with the same dict fails with KeyError.
        reader_config = dict(reader_config)
        if 'class' in reader_config:
            c = reader_config.pop('class')
            try:
                module_name, cls_name = c.split(':')
                reader = getattr(importlib.import_module(module_name), cls_name)()
            except ValueError:
                e = ConfigError(
                    'Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                    .format(c))
                log.exception(e)
                raise e
        else:
            reader = get_model(reader_config.pop('name'))()
        data_path = expand_path(reader_config.pop('data_path', ''))
        # whatever is left in the reader config is passed to `read`
        data = reader.read(data_path, **reader_config)
    else:
        log.warning("No dataset reader is provided in the JSON config.")

    iterator_config = config['dataset_iterator']
    iterator: Union[DataLearningIterator, DataFittingIterator] = from_params(iterator_config, data=data)

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': to_validate,
        'test_best': True
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(
        zip(train_config['metrics'], get_metrics_by_names(train_config['metrics'])))

    if to_train:
        model = fit_chainer(config, iterator)

        # use the first training interface the model provides
        if callable(getattr(model, 'train_on_batch', None)):
            _train_batches(model, iterator, train_config, metrics_functions)
        elif callable(getattr(model, 'fit_batches', None)):
            _fit_batches(model, iterator, train_config)
        elif callable(getattr(model, 'fit', None)):
            _fit(model, iterator, train_config)
        elif not isinstance(model, Chainer):
            log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # the model is rebuilt with `load_trained=True` before it is evaluated
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid': _test_model(model, metrics_functions, iterator,
                                     train_config.get('batch_size', -1), 'valid')
            }

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test': _test_model(model, metrics_functions, iterator,
                                    train_config.get('batch_size', -1), 'test')
            }

            print(json.dumps(report, ensure_ascii=False))
def train_model_from_config(config_path: str):
    """Train the model described by a JSON config and report its metrics.

    The data is read with the configured dataset reader and wrapped into the
    configured dataset. A ``chainer`` config is fitted with ``fit_chainer``;
    otherwise the vocabs are fitted first and passed to the single model.
    Reports for the ``valid`` and ``test`` parts are printed as JSON when the
    corresponding ``validate_best``/``test_best`` train options are set.

    Args:
        config_path: path to the JSON config.
    """
    config = read_json(config_path)
    set_deeppavlov_root(config)

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data = reader.read(expand_path(reader_config.get('data_path', '')))

    dataset: Dataset = from_params(config['dataset'], data=data)

    if 'chainer' in config:
        model = fit_chainer(config, dataset)
    else:
        # every vocab is created in train mode and fitted on the dataset
        vocabs = {
            vocab_param_name: _fit(from_params(vocab_config, mode='train'), dataset)
            for vocab_param_name, vocab_config in config.get('vocabs', {}).items()
        }
        model = from_params(config['model'], vocabs=vocabs, mode='train')

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': True,
        'test_best': True
    }
    if 'train' in config:
        train_config.update(config['train'])
    else:
        log.warning('Train config is missing. Populating with default values')

    metric_names = train_config['metrics']
    metrics_functions = list(zip(metric_names, get_metrics_by_names(train_config['metrics'])))

    if callable(getattr(model, 'train_on_batch', None)):
        _train_batches(model, dataset, train_config, metrics_functions)
    elif callable(getattr(model, 'fit', None)):
        _fit(model, dataset, train_config)
    elif not isinstance(model, Chainer):
        log.warning('Nothing to train')

    if not (train_config['validate_best'] or train_config['test_best']):
        return

    # the model is rebuilt with `load_trained=True` before it is evaluated
    model = build_model_from_config(config, load_trained=True)
    log.info('Testing the best saved model')

    for option, data_type in (('validate_best', 'valid'), ('test_best', 'test')):
        if not train_config[option]:
            continue
        report = {
            data_type: _test_model(model, metrics_functions, dataset,
                                   train_config.get('batch_size', -1), data_type)
        }
        print(json.dumps(report, ensure_ascii=False))
def from_params(params: Dict, mode='infer', **kwargs) -> Component:
    """Build and return the component described by a dictionary of parameters.

    Args:
        params: component configuration; every value is passed through ``_resolve``.
        mode: used to initialise nested parameters and passed to the
            constructor when it accepts ``mode`` (explicitly or via ``**kwargs``).
        **kwargs: additional keyword arguments for the component constructor.

    Returns:
        The component registered under ``ref``, the model built (as a
        component) from ``config_path``, or a new instance of the class given
        by ``class`` (``module:ClassName``) or by the registered ``name``.

    Raises:
        ConfigError: if ``ref`` points to an unknown id, ``class`` is
            malformed, ``name`` is not registered, or the config has none of
            the ``ref``, ``config_path``, ``class`` and ``name`` fields.
    """
    # what is passed in json:
    config_params = {k: _resolve(v) for k, v in params.items()}

    # get component by reference (if any)
    if 'ref' in config_params:
        try:
            return _refs[config_params['ref']]
        except KeyError:
            e = ConfigError('Component with id "{id}" was referenced but not initialized'
                            .format(id=config_params['ref']))
            log.exception(e)
            raise e

    elif 'config_path' in config_params:
        # imported here rather than at module level (presumably to avoid a circular import)
        from deeppavlov.core.commands.infer import build_model_from_config
        deeppavlov_root = get_deeppavlov_root()
        refs = _refs.copy()
        _refs.clear()
        try:
            config = read_json(expand_path(config_params['config_path']))
            model = build_model_from_config(config, as_component=True)
        finally:
            # restore the outer root and references even when the nested build fails
            set_deeppavlov_root({'deeppavlov_root': deeppavlov_root})
            _refs.clear()
            _refs.update(refs)
        return model

    elif 'class' in config_params:
        c = config_params.pop('class')
        try:
            module_name, cls_name = c.split(':')
            cls = getattr(importlib.import_module(module_name), cls_name)
        except ValueError:
            e = ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                            .format(c))
            log.exception(e)
            raise e
    else:
        cls_name = config_params.pop('name', None)
        if not cls_name:
            e = ConfigError('Component config has no `name` nor `ref` or `class` fields')
            log.exception(e)
            raise e
        try:
            cls = REGISTRY[cls_name]
        except KeyError:
            e = ConfigError('Class {} is not registered.'.format(cls_name))
            log.exception(e)
            raise e

    # find the submodels params recursively
    config_params = {k: _init_param(v, mode) for k, v in config_params.items()}

    try:
        spec = inspect.getfullargspec(cls)
        if 'mode' in spec.args+spec.kwonlyargs or spec.varkw is not None:
            kwargs['mode'] = mode

        component = cls(**dict(config_params, **kwargs))
        # remember the component so later configs can refer to it by its id
        try:
            _refs[config_params['id']] = component
        except KeyError:
            pass
    except Exception:
        log.exception("Exception in {}".format(cls))
        raise

    return component
def train_model_from_config(config_path: str) -> None:
    """Train the model described by a JSON config and report its metrics.

    Args:
        config_path: path to the JSON config.

    Raises:
        Exception: if the legacy ``dataset`` section has an unsupported ``type``.
    """
    config = read_json(config_path)
    set_deeppavlov_root(config)

    dataset_config = config.get('dataset', None)

    # a `dataset` section is replaced in place by reader and iterator sections
    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    # every reader option except `name` and `data_path` goes to `read`
    kwargs = {k: v for k, v in reader_config.items() if k not in ['name', 'data_path']}
    data = reader.read(data_path, **kwargs)

    iterator_config = config['dataset_iterator']
    iterator: BasicDatasetIterator = from_params(iterator_config, data=data)

    if 'chainer' in config:
        model = fit_chainer(config, iterator)
    else:
        # Fitted vocabs go into a fresh dict. Writing them back into
        # config['vocabs'] would replace the vocab configs with objects, and the
        # same config is handed to build_model_from_config below.
        vocabs = {}
        for vocab_param_name, vocab_config in config.get('vocabs', {}).items():
            v: Estimator = from_params(vocab_config, mode='train')
            vocabs[vocab_param_name] = _fit(v, iterator)

        model_config = config['model']
        model = from_params(model_config, vocabs=vocabs, mode='train')

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': True,
        'test_best': True
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(
        zip(train_config['metrics'], get_metrics_by_names(train_config['metrics'])))

    # use the first training interface the model provides
    if callable(getattr(model, 'train_on_batch', None)):
        _train_batches(model, iterator, train_config, metrics_functions)
    elif callable(getattr(model, 'fit', None)):
        _fit(model, iterator, train_config)
    elif not isinstance(model, Chainer):
        log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # the model is rebuilt with `load_trained=True` before it is evaluated
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid': _test_model(model, metrics_functions, iterator,
                                     train_config.get('batch_size', -1), 'valid')
            }

            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test': _test_model(model, metrics_functions, iterator,
                                    train_config.get('batch_size', -1), 'test')
            }

            print(json.dumps(report, ensure_ascii=False))
def train_evaluate_model_from_config(config: [str, Path, dict], iterator=None,
                                     to_train=True, to_validate=True) -> Dict[str, Dict[str, float]]:
    """Train the model described by a configuration and evaluate it.

    Args:
        config: configuration dictionary, or a path (``str``/``Path``) to a
            JSON file that is read with ``read_json``.
        iterator: data iterator to use; when ``None`` it is created from the
            config with ``read_data_by_config`` and ``get_iterator_from_config``.
        to_train: whether to fit/train the model before evaluation.
        to_validate: default for the ``validate_best`` train option; the
            ``train`` section of the config overrides it.

    Returns:
        The ``metrics`` entry of the evaluation report for every evaluated
        data type (``valid`` and/or ``test``); empty when nothing was evaluated.
    """
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)

    import_packages(config.get('metadata', {}).get('imports', []))

    if iterator is None:
        iterator = get_iterator_from_config(config, read_data_by_config(config))

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': to_validate,
        'test_best': True,
        'show_examples': False
    }
    if 'train' in config:
        train_config.update(config['train'])
    else:
        log.warning('Train config is missing. Populating with default values')

    chainer_config = config['chainer']
    in_y = chainer_config.get('in_y', ['y'])
    if isinstance(in_y, str):
        in_y = [in_y]
    # a single output name is normalised to a list in the config itself
    if isinstance(chainer_config['out'], str):
        chainer_config['out'] = [chainer_config['out']]
    metrics_functions = _parse_metrics(train_config['metrics'], in_y, chainer_config['out'])

    if to_train:
        model = fit_chainer(config, iterator)

        def provides(method_name):
            return callable(getattr(model, method_name, None))

        # use the first training interface the model provides
        if provides('train_on_batch'):
            _train_batches(model, iterator, train_config, metrics_functions)
        elif provides('fit_batches'):
            _fit_batches(model, iterator, train_config)
        elif provides('fit'):
            _fit(model, iterator, train_config)
        elif not isinstance(model, Chainer):
            log.warning('Nothing to train')

        model.destroy()

    res = {}

    if train_config['validate_best'] or train_config['test_best']:
        # the model is rebuilt with `load_trained=True` before it is evaluated
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        for option, data_type in (('validate_best', 'valid'), ('test_best', 'test')):
            if not train_config[option]:
                continue
            report = {
                data_type: _test_model(model, metrics_functions, iterator,
                                       train_config.get('batch_size', -1), data_type,
                                       show_examples=train_config['show_examples'])
            }
            res[data_type] = report[data_type]['metrics']
            print(json.dumps(report, ensure_ascii=False))

        model.destroy()

    return res
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Thu May 10 16:28:38 2018 @author: lsm """ from deeppavlov.core.commands.train import train_evaluate_model_from_config from deeppavlov.core.commands.utils import expand_path, set_deeppavlov_root from deeppavlov.core.common.file import read_json import sys sys.path.insert(0, '../..') from model.pipeline.text_normalizer import * from model.pipeline.embedder import * from model.pipeline.CNN_model import * config = read_json('/subs/deliver/deliver_config.json') set_deeppavlov_root(config) train_evaluate_model_from_config('model/subs/deliver/deliver_config.json')
def from_params(params: Dict, **kwargs) -> Component:
    """Build (or look up) a pipeline component from its config description.

    The config is handled according to the first of these keys it contains:

    * ``ref`` -- return the already initialised component stored under that id;
    * ``config_path`` -- build a whole model from another config file and
      return it as a component;
    * ``class`` -- import the class given as ``module.submodules:ClassName``;
    * otherwise ``name`` -- look the class up in ``REGISTRY``.

    Dict-valued parameters that themselves contain one of ``ref``, ``name``,
    ``class`` or ``config_path`` are built recursively as sub-components; other
    dicts are passed to the constructor as plain dicts with resolved values.

    Args:
        params: component configuration.
        **kwargs: extra keyword arguments passed to the component constructor.
            Only ``vocabs`` and ``mode`` (when supplied) are forwarded to
            nested sub-components.

    Returns:
        The referenced, loaded or newly constructed component.

    Raises:
        ConfigError: on an unknown ``ref``, a malformed ``class`` value, a
            missing ``name`` or a ``name`` that is not registered.
    """
    # what is passed in json:
    config_params = {k: _resolve(v) for k, v in params.items()}

    # get component by reference (if any)
    if 'ref' in config_params:
        try:
            return _refs[config_params['ref']]
        except KeyError:
            e = ConfigError('Component with id "{id}" was referenced but not initialized'
                            .format(id=config_params['ref']))
            log.exception(e)
            raise e

    elif 'config_path' in config_params:
        # Imported here rather than at module level, as in the original code
        # (presumably to avoid a circular import -- TODO confirm).
        from deeppavlov.core.commands.infer import build_model_from_config
        deeppavlov_root = get_deeppavlov_root()
        config = read_json(expand_path(config_params['config_path']))
        try:
            model = build_model_from_config(config, as_component=True)
        finally:
            # Building the nested model sets the root from the nested config;
            # put the caller's root back even when the build fails.
            set_deeppavlov_root({'deeppavlov_root': deeppavlov_root})
        return model

    elif 'class' in config_params:
        c = config_params.pop('class')
        try:
            module_name, cls_name = c.split(':')
            cls = getattr(importlib.import_module(module_name), cls_name)
        except ValueError:
            e = ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                            .format(c))
            log.exception(e)
            raise e
    else:
        cls_name = config_params.pop('name', None)
        if not cls_name:
            e = ConfigError('Component config has no `name` nor `ref` or `class` fields')
            log.exception(e)
            raise e
        try:
            cls = REGISTRY[cls_name]
        except KeyError:
            e = ConfigError('Class {} is not registered.'.format(cls_name))
            log.exception(e)
            raise e

    # Forward only the context the caller actually supplied. Indexing
    # kwargs['vocabs'] / kwargs['mode'] unconditionally raised KeyError for
    # callers that pass only one of them (e.g. only `mode`).
    nested_kwargs = {key: kwargs[key] for key in ('vocabs', 'mode') if key in kwargs}

    # find the submodels params recursively
    for param_name, subcls_params in config_params.items():
        if not isinstance(subcls_params, dict):
            continue
        if not {'ref', 'name', 'class', 'config_path'}.intersection(subcls_params):
            # This parameter is passed as a dict to the class constructor;
            # the user did not intend it to be a component.
            for k, v in subcls_params.items():
                subcls_params[k] = _resolve(v)
            continue
        # Re-assigning an existing key does not change the dict's size, so it
        # is safe while iterating over items().
        config_params[param_name] = from_params(subcls_params, **nested_kwargs)

    try:
        component = cls(**dict(config_params, **kwargs))
        # Components that declare an "id" become available to later "ref"s.
        if 'id' in config_params:
            _refs[config_params['id']] = component
    except Exception:
        log.exception("Exception in {}".format(cls))
        raise

    return component
def train(self, model_level, model_name, path_to_data, path_to_config,
          path_to_global_embeddings, test_size=0.15, aug_method='word_dropout',
          samples_per_class=None, class_names=None, path_to_save_file=None,
          path_to_resulting_file=None):
    """Prepare data, embeddings and config for a classifier, then train it.

    Steps, in order: read the CSV and the JSON config, validate the arguments,
    split the data into train/validation/test parts, equalize the training
    classes, write the data files under the config's ``model_path``, build the
    compressed embeddings if their files are missing, store the class names,
    prefix the relative paths in the config with ``model_path``, run
    ``train_evaluate_model_from_config`` and finally copy the trained weights.
    Progress is written to ``<model_path>status.txt``.

    Args:
        model_level: ``'root'`` or ``'subs'``.
        model_name: model name; used only in the status messages.
        path_to_data: CSV file with ``text`` and ``labels`` columns.
        path_to_config: JSON config; must contain ``model_path`` (used as a
            string prefix, so presumably expected to end with ``/``).
        path_to_global_embeddings: original embeddings passed to
            ``EmbeddingsBuilder``.
        test_size: fraction used for both the test and the validation split.
        aug_method: ``'word_dropout'`` or ``'duplicate'``.
        samples_per_class: forwarded to the class equalizer.
        class_names: list of class names to store; when it is not a list the
            labels found in the training split are stored instead.
        path_to_save_file: directory the weights are saved to during training
            (created if missing). When ``None`` the config's own ``save_path``
            is left untouched and no weights are copied afterwards.
        path_to_resulting_file: optional directory (created if missing) that
            receives an extra copy of the trained weights.

    Raises:
        InvalidDataFormatError: the CSV lacks a ``text`` or ``labels`` column.
        InvalidModelLevelError: ``model_level`` is not ``'root'``/``'subs'``.
        InvalidDataAugmentationMethodError: unknown ``aug_method``.
        InvalidConfig: ``self.check_config`` reported problems.
    """
    # preparing training/testing data
    df_raw = pd.read_csv(path_to_data)
    # preparing config
    config = read_json(path_to_config)

    if 'labels' not in df_raw or 'text' not in df_raw:
        raise InvalidDataFormatError(
            '\'labels\' and \'text\' columns must be in the dataframe')
    if model_level not in ['root', 'subs']:
        raise InvalidModelLevelError(
            'model level should be either \'root\' or \'subs\'')

    # test part first, then a validation part carved out of the remainder
    df_train_full, df_test, _, _ = train_test_split(df_raw, df_raw, test_size=test_size)
    df_train, df_val, _, _ = train_test_split(df_train_full, df_train_full, test_size=test_size)

    if aug_method not in ['word_dropout', 'duplicate']:
        raise InvalidDataAugmentationMethodError(
            '\'aug_method\' should be \'word_dropout\' or \'duplicate\'')
    df_train_equalized = self.__data_equalizer.equalize_classes(
        df_train, samples_per_class, aug_method)

    model_path = config['model_path']
    if not os.path.isdir(model_path):
        os.mkdir(model_path)
    data_dir = model_path + 'data/'
    if not os.path.isdir(data_dir):
        os.mkdir(data_dir)
    df_train_equalized.to_csv(data_dir + 'train.csv')
    df_val[['text', 'labels']].sample(frac=1).to_csv(data_dir + 'valid.csv')
    df_test[['text', 'labels']].sample(frac=1).to_csv(model_path + 'df_test.csv')

    # making embeddings
    emb_len = IntentsClassifier.get_config_element_by_name(
        config=config['chainer']['pipe'], name='embedder')['emb_len']
    eb = EmbeddingsBuilder(
        resulting_dim=emb_len,
        path_to_original_embeddings=path_to_global_embeddings)
    tc = TextCorrector()
    corpus_cleaned = tc.tn.transform(df_raw.text.tolist())
    if not os.path.isfile(model_path + 'ft_compressed.pkl'):
        eb.compress_embeddings(corpus_cleaned,
                               model_path + 'ft_compressed.pkl', 'pca',
                               eb.path_to_original_embeddings)
    gc.collect()
    if not os.path.isfile(model_path + 'ft_compressed_local.pkl'):
        eb.build_local_embeddings(corpus_cleaned,
                                  model_path + 'ft_compressed_local.pkl')

    # dealing with class_names
    if isinstance(class_names, list):
        stored_class_names = class_names
    else:
        stored_class_names = df_train['labels'].value_counts().index.tolist()
    # context manager so the file is flushed and closed deterministically
    with open(model_path + 'class_names.pkl', 'wb') as class_names_file:
        pickle.dump(stored_class_names, class_names_file)

    # setting up saving and loading
    pipe = config['chainer']['pipe']
    # The None checks come first: os.path.isdir(None) raises TypeError, which
    # made the default (None) arguments unusable.
    if path_to_save_file is not None:
        pipe[-1]['save_path'] = path_to_save_file + '/' + 'weights.hdf5'
        if not os.path.isdir(path_to_save_file):
            os.mkdir(path_to_save_file)
    if path_to_resulting_file is not None and not os.path.isdir(path_to_resulting_file):
        os.mkdir(path_to_resulting_file)

    # prefix the relative paths stored in the config with model_path
    emb_config = IntentsClassifier.get_config_element_by_name(pipe, 'embedder')
    cnn_config = IntentsClassifier.get_config_element_by_name(pipe, 'cnn_model')
    emb_entry = pipe[pipe.index(emb_config)]
    emb_entry['load_path'][0] = model_path + emb_entry['load_path'][0]
    emb_entry['load_path'][1] = model_path + emb_entry['load_path'][1]
    cnn_entry = pipe[pipe.index(cnn_config)]
    cnn_entry['classes'] = model_path + cnn_entry['classes']
    config['dataset_reader']['data_path'] = (
        model_path + config['dataset_reader']['data_path'])
    config['train']['tensorboard_log_dir'] = (
        model_path + config['train']['tensorboard_log_dir'])

    # remembered so that it can be put back after training (see below)
    load_path_bckp = pipe[-1]['load_path']

    check_results = self.check_config(config)
    if len(check_results) > 0:
        raise InvalidConfig(check_results, model_path, 'Config file is invalid')

    # training
    set_deeppavlov_root(config)
    # update training status
    training_status = 'Classification model {} {} is currently training. Total number of epochs is set to {}'.format(
        model_level, model_name, config['train']['epochs'])
    with open(model_path + 'status.txt', 'w') as f:
        f.write(training_status)

    train_evaluate_model_from_config(config)

    # updating status
    perf = IntentsClassifier.get_latest_accuracy(config)
    training_status = 'Classification model {} {} is trained \nf1_score (macro avg): {}'.format(
        model_level, model_name, perf)
    with open(model_path + 'status.txt', 'w') as f:
        f.write(training_status)

    # fixing load_path and publishing the trained weights under that name
    pipe[-1]['load_path'] = load_path_bckp
    if path_to_save_file is not None:
        trained_weights = path_to_save_file + '/' + 'weights.hdf5'
        if path_to_resulting_file is not None:
            copy(trained_weights,
                 path_to_resulting_file + '/' + pipe[-1]['load_path'])
        copy(trained_weights, model_path + pipe[-1]['load_path'])
def main():
    """Run the evolutionary parameter search configured on the command line.

    Reads the arguments from the module-level ``parser``, resolves the GPUs to
    use, creates a ``ParamsEvolution`` from the pipeline config, prepares the
    result table file and either generates the first population
    (``start_from_population == 0``) or reloads a stored one from
    ``path_to_population``. Populations are then trained and scored in a loop.
    The loop stops after ``iterations`` generations; with ``iterations == -1``
    it never stops by itself.

    Raises:
        ConfigError: a requested GPU index is outside the device list given by
            ``CUDA_VISIBLE_DEVICES``.
    """
    args = parser.parse_args()

    pipeline_config_path = find_config(args.config_path)
    key_main_model = args.key_main_model
    population_size = args.p_size
    gpus = [int(gpu) for gpu in args.gpus.split(",")]
    train_partition = int(args.train_partition)
    start_from_population = int(args.start_from_population)
    path_to_population = args.path_to_population
    elitism_with_weights = args.elitism_with_weights
    iterations = int(args.iterations)

    p_crossover = args.p_cross
    pow_crossover = args.pow_cross
    p_mutation = args.p_mut
    pow_mutation = args.pow_mut

    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible_devices is not None:
        cvd = [int(gpu) for gpu in visible_devices.split(",")]
        if gpus == [-1]:
            # "-1" selects every visible device
            gpus = cvd
        else:
            # requested numbers are positions in the visible-device list
            try:
                gpus = [cvd[gpu] for gpu in gpus]
            except IndexError:
                # Both lists hold ints, so they must be converted before
                # joining; otherwise str.join raises TypeError and masks the
                # intended ConfigError.
                raise ConfigError("Can not use gpus `{}` with CUDA_VISIBLE_DEVICES='{}'".format(
                    ",".join(map(str, gpus)), ",".join(map(str, cvd))
                ))

    basic_params = read_json(pipeline_config_path)
    log.info("Given basic params: {}\n".format(json.dumps(basic_params, indent=2)))

    # Initialize evolution
    evolution = ParamsEvolution(population_size=population_size,
                                p_crossover=p_crossover, crossover_power=pow_crossover,
                                p_mutation=p_mutation, mutation_power=pow_mutation,
                                key_main_model=key_main_model,
                                seed=42,
                                train_partition=train_partition,
                                elitism_with_weights=elitism_with_weights,
                                **basic_params)

    metrics_path = list(evolution.find_model_path(evolution.basic_config, "metrics"))[0]
    considered_metrics = evolution.get_value_from_config(evolution.basic_config,
                                                         metrics_path + ["metrics"])

    log.info(considered_metrics)
    # the first listed metric is the one the populations are scored by
    evolve_metric = considered_metrics[0]

    # Create table variable for gathering results
    set_deeppavlov_root(evolution.basic_config)

    expand_path(Path(evolution.get_value_from_config(
        evolution.basic_config, evolution.main_model_path + ["save_path"]))).mkdir(parents=True, exist_ok=True)

    result_file = expand_path(Path(evolution.get_value_from_config(
        evolution.basic_config, evolution.main_model_path + ["save_path"])).joinpath("result_table.csv"))

    result_table_columns = []
    result_table_dict = {}
    for el in considered_metrics:
        result_table_dict[el + "_valid"] = []
        result_table_dict[el + "_test"] = []
        result_table_columns.extend([el + "_valid", el + "_test"])

    result_table_dict["params"] = []
    result_table_columns.append("params")

    if start_from_population == 0:
        # if starting evolution from scratch
        iters = 0
        result_table = pd.DataFrame(result_table_dict)
        # write down result table file
        result_table.loc[:, result_table_columns].to_csv(result_file, index=False, sep='\t')

        log.info("Iteration #{} starts".format(iters))
        # randomly generate the first population
        population = evolution.first_generation()
    else:
        # if starting evolution from already existing population
        iters = start_from_population
        log.info("Iteration #{} starts".format(iters))

        population = []
        for i in range(population_size):
            population.append(read_json(expand_path(Path(path_to_population).joinpath(
                "model_" + str(i)).joinpath("config.json"))))

            # main model: new save location, load_path passed through Path()
            population[i] = evolution.insert_value_or_dict_into_config(
                population[i], evolution.main_model_path + ["save_path"],
                str(Path(evolution.get_value_from_config(
                    evolution.basic_config, evolution.main_model_path + ["save_path"])
                ).joinpath("population_" + str(start_from_population)
                           ).joinpath("model_" + str(i)
                                      ).joinpath("model")))

            population[i] = evolution.insert_value_or_dict_into_config(
                population[i], evolution.main_model_path + ["load_path"],
                str(Path(evolution.get_value_from_config(
                    population[i], evolution.main_model_path + ["load_path"]))))

            # the same treatment for every entry of paths_to_fiton_dicts
            for path_id, path_ in enumerate(evolution.paths_to_fiton_dicts):
                population[i] = evolution.insert_value_or_dict_into_config(
                    population[i], path_ + ["save_path"],
                    str(Path(evolution.get_value_from_config(
                        evolution.basic_config, evolution.main_model_path + ["save_path"])
                    ).joinpath("population_" + str(iters)
                               ).joinpath("model_" + str(i)
                                          ).joinpath("fitted_model_" + str(path_id))))

            for path_id, path_ in enumerate(evolution.paths_to_fiton_dicts):
                population[i] = evolution.insert_value_or_dict_into_config(
                    population[i], path_ + ["load_path"],
                    str(Path(evolution.get_value_from_config(
                        population[i], path_ + ["load_path"]))))

    run_population(population, evolution, gpus)
    population_scores = results_to_table(population, evolution, considered_metrics,
                                         result_file, result_table_columns)[evolve_metric]
    log.info("Population scores: {}".format(population_scores))
    log.info("Iteration #{} was done".format(iters))
    iters += 1

    while True:
        if iterations != -1 and start_from_population + iterations == iters:
            log.info("End of evolution on iteration #{}".format(iters))
            break
        log.info("Iteration #{} starts".format(iters))
        population = evolution.next_generation(population, population_scores, iters)
        run_population(population, evolution, gpus)
        population_scores = results_to_table(population, evolution, considered_metrics,
                                             result_file, result_table_columns)[evolve_metric]
        log.info("Population scores: {}".format(population_scores))
        log.info("Iteration #{} was done".format(iters))
        iters += 1