def train_agent_models(config_path: str):
    usr_dir = paths.USR_PATH
    a = build_agent_from_config(config_path)

    for skill_config in a.skill_configs:
        model_config = skill_config['model']
        model_name = model_config['name']

        if issubclass(REGISTRY[model_name], Trainable):
            reader_config = skill_config['dataset_reader']
            reader = from_params(REGISTRY[reader_config['name']], {})
            data = reader.read(reader_config.get('data_path', usr_dir))

            dataset_config = skill_config['dataset']
            dataset_name = dataset_config['name']
            dataset = from_params(REGISTRY[dataset_name], dataset_config, data=data)

            model = from_params(REGISTRY[model_name], model_config)
            model.train(dataset)
        else:
            print('Model {} is not an instance of Trainable, skip training.'.format(model_name),
                  file=sys.stderr)
def train_model_from_config(config_path: str, mode='train'):
    usr_dir = paths.USR_PATH
    config = read_json(config_path)

    reader_config = config['dataset_reader']
    # NOTE: the dataset reader gets no params because it defines no custom __init__()
    reader = from_params(REGISTRY[reader_config['name']], {})
    data = reader.read(reader_config.get('data_path', usr_dir))

    dataset_config = config['dataset']
    dataset_name = dataset_config['name']
    dataset = from_params(REGISTRY[dataset_name], dataset_config, data=data)

    vocabs = {}
    if 'vocabs' in config:
        for vocab_param_name, vocab_config in config['vocabs'].items():
            vocab_name = vocab_config['name']
            v = from_params(REGISTRY[vocab_name], vocab_config, mode=mode)
            v.train(dataset.iter_all('train'))
            vocabs[vocab_param_name] = v

    model_config = config['model']
    model_name = model_config['name']
    model = from_params(REGISTRY[model_name], model_config, vocabs=vocabs, mode=mode)

    model.train(dataset)
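# For orientation: a minimal sketch of the config layout this legacy
# train_model_from_config expects. Every name and path below is an
# illustrative assumption, not taken from a real DeepPavlov config.
example_config = {
    "dataset_reader": {"name": "my_reader", "data_path": "data/"},
    "dataset": {"name": "my_dataset"},
    "vocabs": {"token_vocab": {"name": "my_vocab"}},  # optional section
    "model": {"name": "my_model"},
}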
def build_model_from_config(config, mode='infer', load_trained=False, as_component=False):
    set_deeppavlov_root(config)

    if 'chainer' in config:
        model_config = config['chainer']

        model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y'),
                        as_component=as_component)

        for component_config in model_config['pipe']:
            if load_trained and ('fit_on' in component_config or 'in_y' in component_config):
                try:
                    component_config['load_path'] = component_config['save_path']
                except KeyError:
                    log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                                .format(component_config.get('name', component_config.get('ref', 'UNKNOWN'))))
            component = from_params(component_config, vocabs=[], mode=mode)

            if 'in' in component_config:
                c_in = component_config['in']
                c_out = component_config['out']
                in_y = component_config.get('in_y', None)
                main = component_config.get('main', False)
                model.append(component, c_in, c_out, in_y, main)

        return model

    model_config = config['model']
    if load_trained:
        try:
            model_config['load_path'] = model_config['save_path']
        except KeyError:
            log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')

    vocabs = {}
    if 'vocabs' in config:
        for vocab_param_name, vocab_config in config['vocabs'].items():
            v = from_params(vocab_config, mode=mode)
            vocabs[vocab_param_name] = v

    model = from_params(model_config, vocabs=vocabs, mode=mode)
    model.reset()
    return model
def build_model_from_config(config, mode='infer'):
    model_config = config['model']
    model_name = model_config['name']

    vocabs = {}
    if 'vocabs' in config:
        for vocab_param_name, vocab_config in config['vocabs'].items():
            vocab_name = vocab_config['name']
            v = from_params(REGISTRY[vocab_name], vocab_config, mode=mode)
            vocabs[vocab_param_name] = v

    model = from_params(REGISTRY[model_name], model_config, vocabs=vocabs, mode=mode)
    model.reset()
    return model
def build_model_from_config(config: Union[str, Path, dict], mode: str = 'infer',
                            load_trained: bool = False, as_component: bool = False) -> Chainer:
    """Build and return the model described in corresponding configuration file."""
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)
    import_packages(config.get('metadata', {}).get('imports', []))

    model_config = config['chainer']

    model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y'),
                    as_component=as_component)

    for component_config in model_config['pipe']:
        if load_trained and ('fit_on' in component_config or 'in_y' in component_config):
            try:
                component_config['load_path'] = component_config['save_path']
            except KeyError:
                log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                            .format(component_config.get('name', component_config.get('ref', 'UNKNOWN'))))
        component = from_params(component_config, mode=mode)

        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y', None)
            main = component_config.get('main', False)
            model.append(component, c_in, c_out, in_y, main)

    return model
def fit_chainer(config: dict, iterator: BasicDatasetIterator) -> Chainer:
    chainer_config: dict = config['chainer']
    chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))
    for component_config in chainer_config['pipe']:
        component = from_params(component_config, vocabs=[], mode='train')
        if 'fit_on' in component_config:
            component: Estimator

            preprocessed = chainer(*iterator.iter_all('train'),
                                   to_return=component_config['fit_on'])
            if len(component_config['fit_on']) == 1:
                preprocessed = [preprocessed]
            else:
                preprocessed = zip(*preprocessed)

            component.fit(*preprocessed)
            component.save()

        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y', None)
            main = component_config.get('main', False)
            chainer.append(c_in, c_out, component, in_y, main)
    return chainer
def fit_chainer(config: dict, iterator: Union[DataLearningIterator, DataFittingIterator]) -> Chainer: """Fit and return the chainer described in corresponding configuration dictionary.""" chainer_config: dict = config['chainer'] chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y')) for component_config in chainer_config['pipe']: component = from_params(component_config, mode='train') if 'fit_on' in component_config: component: Estimator targets = component_config['fit_on'] if isinstance(targets, str): targets = [targets] preprocessed = chainer.compute(*iterator.get_instances('train'), targets=targets) if len(component_config['fit_on']) == 1: preprocessed = [preprocessed] component.fit(*preprocessed) component.save() if 'fit_on_batch' in component_config: component: Estimator component.fit_batches(iterator, config['train']['batch_size']) component.save() if 'in' in component_config: c_in = component_config['in'] c_out = component_config['out'] in_y = component_config.get('in_y', None) main = component_config.get('main', False) chainer.append(component, c_in, c_out, in_y, main) return chainer
def fit_chainer(config: dict, iterator: Union[DataLearningIterator, DataFittingIterator]):
    chainer_config: dict = config['chainer']
    chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))
    for component_config in chainer_config['pipe']:
        component = from_params(component_config, mode='train')
        if 'fit_on' in component_config:
            component: Estimator

            preprocessed = chainer(*iterator.get_instances('train'),
                                   to_return=component_config['fit_on'])
            if len(component_config['fit_on']) == 1:
                preprocessed = [preprocessed]
            else:
                preprocessed = zip(*preprocessed)

            component.fit(*preprocessed)
            component.save()

        if 'fit_on_batch' in component_config:
            component: Estimator
            component.fit_batches(iterator, config['train']['batch_size'])
            component.save()

        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y', None)
            main = component_config.get('main', False)
            chainer.append(component, c_in, c_out, in_y, main)
    return chainer
def interact_agent(config_path: str) -> None:
    """Start interaction with the agent described in corresponding configuration file."""
    a = build_agent_from_config(config_path)
    commutator = from_params(a.commutator_config)

    models = [build_model_from_config(sk) for sk in a.skill_configs]

    while True:
        # get input from user
        context = input(':: ')

        # check for exit command
        if context in ('exit', 'stop', 'quit', 'q'):
            return

        predictions = []
        for model in models:
            predictions.append({model.__class__.__name__: model.infer(context)})
        idx, name, pred = commutator.infer(predictions)
        print('>>', pred)

        a.history.append({'context': context, "predictions": predictions,
                          "winner": {"idx": idx, "model": name, "prediction": pred}})
        log.debug("Current history: {}".format(a.history))
def fit_chainer(config: dict, iterator: Union[DataLearningIterator, DataFittingIterator]) -> Chainer: """Fit and return the chainer described in corresponding configuration dictionary.""" chainer_config: dict = config['chainer'] chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y')) for component_config in chainer_config['pipe']: component = from_params(component_config, mode='train') if 'fit_on' in component_config: component: Estimator preprocessed = chainer(*iterator.get_instances('train'), to_return=component_config['fit_on']) if len(component_config['fit_on']) == 1: preprocessed = [preprocessed] else: preprocessed = zip(*preprocessed) component.fit(*preprocessed) component.save() if 'fit_on_batch' in component_config: component: Estimator component.fit_batches(iterator, config['train']['batch_size']) component.save() if 'in' in component_config: c_in = component_config['in'] c_out = component_config['out'] in_y = component_config.get('in_y', None) main = component_config.get('main', False) chainer.append(component, c_in, c_out, in_y, main) return chainer
def get_iterator_from_config(config: dict, data: dict):
    """Create iterator (from config) for specified data."""
    iterator_config = config['dataset_iterator']
    iterator: Union[DataLearningIterator, DataFittingIterator] = from_params(iterator_config,
                                                                             data=data)
    return iterator
def read(self, data_path, tasks: Dict[str, Dict[str, str]]):
    """Creates dataset readers for tasks and returns what the task dataset readers'
    `read()` methods return.

    Args:
        data_path: can be anything since it is not used. `data_path` is present
            because it is required by the train.py script.
        tasks: dictionary whose keys are task names and whose values are dictionaries
            with `DatasetReader` subclass specs. `DatasetReader` specs are provided
            in the same format as "dataset_reader" in the model config, except that
            the "class_name" field has to be named "reader_class_name".
            ```json
            "tasks": {
              "query_prediction": {
                "reader_class_name": "basic_classification_reader",
                "x": "Question",
                "y": "Class",
                "data_path": "{DOWNLOADS_PATH}/query_prediction"
              }
            }
            ```

    Returns:
        dictionary whose keys are task names and whose values are what the task
        readers' `read()` methods returned.
    """
    data = {}
    for task_name, reader_params in tasks.items():
        reader_params = copy.deepcopy(reader_params)
        tasks[task_name] = from_params({"class_name": reader_params['reader_class_name']})
        del reader_params['reader_class_name']
        reader_params['data_path'] = Path(reader_params['data_path']).expanduser()
        data[task_name] = tasks[task_name].read(**reader_params)
    return data
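# Hypothetical usage of the reader above; MultiTaskReader is an assumed name
# for the class defining read(), and the task spec values are illustrative.
reader = MultiTaskReader()
data = reader.read(
    data_path=None,  # ignored, kept only for train.py compatibility
    tasks={
        "query_prediction": {
            "reader_class_name": "basic_classification_reader",
            "x": "Question",
            "y": "Class",
            "data_path": "~/downloads/query_prediction",  # illustrative path
        }
    },
)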
def build_model_from_config(config: Union[str, Path, dict], mode: str = 'infer',
                            load_trained: bool = False) -> Chainer:
    """Build and return the model described in corresponding configuration file."""
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)
    import_packages(config.get('metadata', {}).get('imports', []))

    model_config = config['chainer']

    model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y'))

    for component_config in model_config['pipe']:
        if load_trained and ('fit_on' in component_config or 'in_y' in component_config):
            try:
                component_config['load_path'] = component_config['save_path']
            except KeyError:
                log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                            .format(component_config.get('name', component_config.get('ref', 'UNKNOWN'))))
        component = from_params(component_config, mode=mode)

        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y', None)
            main = component_config.get('main', False)
            model.append(component, c_in, c_out, in_y, main)

    return model
def predict_with_model(config_path):
    config = read_json(config_path)
    set_deeppavlov_root(config)

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    read_params = {k: v for k, v in reader_config.items() if k not in ['name', 'data_path']}
    data = reader.read(data_path, **read_params)

    iterator_config = config['dataset_iterator']
    iterator: MorphoTaggerDatasetIterator = from_params(iterator_config, data=data)

    model = build_model_from_config(config, load_trained=True)
    answers = [None] * len(iterator.test)
    batch_size = config['predict'].get("batch_size", -1)
    for indexes, (x, _) in iterator.gen_batches(batch_size=batch_size, data_type="test",
                                                shuffle=False, return_indexes=True):
        y = model(x)
        for i, elem in zip(indexes, y):
            answers[i] = elem

    outfile = config['predict'].get("outfile")
    if outfile is not None:
        outfile = Path(outfile)
        if not outfile.exists():
            outfile.parent.mkdir(parents=True, exist_ok=True)
        with open(outfile, "w", encoding="utf8") as fout:
            for elem in answers:
                fout.write(elem + "\n")
    return answers
def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator]) -> None:
    """
    Build the pipeline :class:`~deeppavlov.core.common.chainer.Chainer` and successively
    fit :class:`Estimator <deeppavlov.core.models.estimator.Estimator>` components using
    a provided data iterator
    """
    if self._built:
        raise RuntimeError('Cannot fit already built chainer')
    for component_index, component_config in enumerate(self.chainer_config['pipe'], 1):
        component = from_params(component_config, mode='train')
        if 'fit_on' in component_config:
            component: Estimator

            targets = component_config['fit_on']
            if isinstance(targets, str):
                targets = [targets]

            if self.batch_size > 0 and callable(getattr(component, 'partial_fit', None)):
                writer = None

                for i, (x, y) in enumerate(iterator.gen_batches(self.batch_size, shuffle=False)):
                    preprocessed = self._chainer.compute(x, y, targets=targets)
                    # noinspection PyUnresolvedReferences
                    result = component.partial_fit(*preprocessed)

                    if result is not None and self.tensorboard_log_dir is not None:
                        if writer is None:
                            writer = self._tf.summary.FileWriter(
                                str(self.tensorboard_log_dir / f'partial_fit_{component_index}_log'))
                        for name, score in result.items():
                            summary = self._tf.Summary()
                            summary.value.add(tag='partial_fit/' + name, simple_value=score)
                            writer.add_summary(summary, i)
                        writer.flush()
            else:
                preprocessed = self._chainer.compute(*iterator.get_instances(), targets=targets)
                if len(targets) == 1:
                    preprocessed = [preprocessed]
                result: Optional[Dict[str, Iterable[float]]] = component.fit(*preprocessed)

                if result is not None and self.tensorboard_log_dir is not None:
                    writer = self._tf.summary.FileWriter(
                        str(self.tensorboard_log_dir / f'fit_log_{component_index}'))
                    for name, scores in result.items():
                        for i, score in enumerate(scores):
                            summary = self._tf.Summary()
                            summary.value.add(tag='fit/' + name, simple_value=score)
                            writer.add_summary(summary, i)
                    writer.flush()

            component.save()

        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y', None)
            main = component_config.get('main', False)
            self._chainer.append(component, c_in, c_out, in_y, main)
    self._built = True
def build_model(config: Union[str, Path, dict], mode: str = 'infer', load_trained: bool = False, download: bool = False, serialized: Optional[bytes] = None) -> Chainer: """Build and return the model described in corresponding configuration file.""" config = parse_config(config) if serialized: serialized: list = pickle.loads(serialized) if download: deep_download(config) import_packages(config.get('metadata', {}).get('imports', [])) model_config = config['chainer'] model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y')) for component_config in model_config['pipe']: if load_trained and ('fit_on' in component_config or 'in_y' in component_config): try: component_config['load_path'] = component_config['save_path'] except KeyError: log.warning( 'No "save_path" parameter for the {} component, so "load_path" will not be renewed' .format( component_config.get( 'class_name', component_config.get('ref', 'UNKNOWN')))) if serialized and 'in' in component_config: component_serialized = serialized.pop(0) else: component_serialized = None component = from_params(component_config, mode=mode, serialized=component_serialized) if 'id' in component_config: model._components_dict[component_config['id']] = component if 'in' in component_config: c_in = component_config['in'] c_out = component_config['out'] in_y = component_config.get('in_y', None) main = component_config.get('main', False) model.append(component, c_in, c_out, in_y, main) return model
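# Minimal usage sketch for build_model above; the config path is a
# placeholder. The returned Chainer is callable on a batch of inputs.
model = build_model("path/to/config.json", download=False)
answers = model(["Some input text"])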
def predict_with_model(config_path: [Path, str]) -> List[Optional[List[str]]]: """Returns predictions of morphotagging model given in config :config_path:. Args: config_path: a path to config Returns: a list of morphological analyses for each sentence. Each analysis is either a list of tags or a list of full CONLL-U descriptions. """ config = parse_config(config_path) reader_config = config['dataset_reader'] reader = get_model(reader_config['class_name'])() data_path = expand_path(reader_config.get('data_path', '')) read_params = { k: v for k, v in reader_config.items() if k not in ['class_name', 'data_path'] } data: Dict = reader.read(data_path, **read_params) iterator_config = config['dataset_iterator'] iterator: MorphoTaggerDatasetIterator = from_params(iterator_config, data=data) model = build_model(config, load_trained=True) answers = [None] * len(iterator.test) batch_size = config['predict'].get("batch_size", -1) for indexes, (x, _) in iterator.gen_batches(batch_size=batch_size, data_type="test", shuffle=False, return_indexes=True): y = model(x) for i, elem in zip(indexes, y): answers[i] = elem outfile = config['predict'].get("outfile") if outfile is not None: outfile = Path(outfile) if not outfile.exists(): outfile.parent.mkdir(parents=True, exist_ok=True) with open(outfile, "w", encoding="utf8") as fout: for elem in answers: fout.write(elem + "\n") return answers
def __init__(self, data: dict, tasks: dict):
    self.task_iterators = {}
    for task_name, task_iterator_params in tasks.items():
        task_iterator_params = copy.deepcopy(task_iterator_params)
        task_iterator_params['class_name'] = task_iterator_params['iterator_class_name']
        del task_iterator_params['iterator_class_name']
        self.task_iterators[task_name] = from_params(task_iterator_params, data=data[task_name])

    self.train = self._extract_data_type('train')
    self.valid = self._extract_data_type('valid')
    self.test = self._extract_data_type('test')
    self.data = {
        'train': self.train,
        'valid': self.valid,
        'test': self.test,
        'all': self._unite_dataset_parts(self.train, self.valid, self.test)
    }
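# Hypothetical tasks spec for the multitask iterator __init__ above: one
# entry per task, with iterator_class_name renamed to class_name before
# from_params is called. Names and values are illustrative assumptions.
tasks = {
    "query_prediction": {
        "iterator_class_name": "basic_classification_iterator",
        "seed": 42,  # extra keys are passed through to the task iterator
    }
}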
def build_model(config: Union[str, Path, dict], mode: str = 'infer', load_trained: bool = False, download: bool = False, serialized: Optional[bytes] = None) -> Chainer: """Build and return the model described in corresponding configuration file.""" config = parse_config(config) if serialized: serialized: list = pickle.loads(serialized) if download: deep_download(config) import_packages(config.get('metadata', {}).get('imports', [])) model_config = config['chainer'] model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y')) for component_config in model_config['pipe']: if load_trained and ('fit_on' in component_config or 'in_y' in component_config): try: component_config['load_path'] = component_config['save_path'] except KeyError: log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed' .format(component_config.get('class_name', component_config.get('ref', 'UNKNOWN')))) if serialized and 'in' in component_config: component_serialized = serialized.pop(0) else: component_serialized = None component = from_params(component_config, mode=mode, serialized=component_serialized) if 'in' in component_config: c_in = component_config['in'] c_out = component_config['out'] in_y = component_config.get('in_y', None) main = component_config.get('main', False) model.append(component, c_in, c_out, in_y, main) return model
def interact_agent(config_path):
    a = build_agent_from_config(config_path)
    commutator_name = a.commutator_config['name']
    commutator = from_params(REGISTRY[commutator_name], a.commutator_config)

    models = [build_model_from_config(sk) for sk in a.skill_configs]

    while True:
        # get input from user
        context = input(':: ')

        # check for exit command
        if context in ('exit', 'stop', 'quit', 'q'):
            return

        predictions = []
        for model in models:
            predictions.append({model.__class__.__name__: model.infer(context)})
        idx, name, pred = commutator.infer(predictions, a.history)
        print('>>', pred)

        a.history.append({'context': context, "predictions": predictions,
                          "winner": {"idx": idx, "model": name, "prediction": pred}})
        print("Current history: {}".format(a.history))
def build_model_from_config(config, mode='infer', load_trained=False, as_component=False):
    set_deeppavlov_root(config)
    import_packages(config.get('metadata', {}).get('imports', []))

    model_config = config['chainer']

    model = Chainer(model_config['in'], model_config['out'], model_config.get('in_y'),
                    as_component=as_component)

    for component_config in model_config['pipe']:
        if load_trained and ('fit_on' in component_config or 'in_y' in component_config):
            try:
                component_config['load_path'] = component_config['save_path']
            except KeyError:
                log.warning('No "save_path" parameter for the {} component, so "load_path" will not be renewed'
                            .format(component_config.get('name', component_config.get('ref', 'UNKNOWN'))))
        component = from_params(component_config, mode=mode)

        if 'in' in component_config:
            c_in = component_config['in']
            c_out = component_config['out']
            in_y = component_config.get('in_y', None)
            main = component_config.get('main', False)
            model.append(component, c_in, c_out, in_y, main)

    return model
def train_model_from_config(config_path: str) -> None:
    config = read_json(config_path)
    set_deeppavlov_root(config)

    dataset_config = config.get('dataset', None)

    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    kwargs = {k: v for k, v in reader_config.items() if k not in ['name', 'data_path']}
    data = reader.read(data_path, **kwargs)

    iterator_config = config['dataset_iterator']
    iterator: BasicDatasetIterator = from_params(iterator_config, data=data)

    if 'chainer' in config:
        model = fit_chainer(config, iterator)
    else:
        vocabs = config.get('vocabs', {})
        for vocab_param_name, vocab_config in vocabs.items():
            v: Estimator = from_params(vocab_config, mode='train')
            vocabs[vocab_param_name] = _fit(v, iterator)

        model_config = config['model']
        model = from_params(model_config, vocabs=vocabs, mode='train')

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': True,
        'test_best': True
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(zip(train_config['metrics'],
                                 get_metrics_by_names(train_config['metrics'])))

    if callable(getattr(model, 'train_on_batch', None)):
        _train_batches(model, iterator, train_config, metrics_functions)
    elif callable(getattr(model, 'fit', None)):
        _fit(model, iterator, train_config)
    elif not isinstance(model, Chainer):
        log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid': _test_model(model, metrics_functions, iterator,
                                     train_config.get('batch_size', -1), 'valid')
            }
            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test': _test_model(model, metrics_functions, iterator,
                                    train_config.get('batch_size', -1), 'test')
            }
            print(json.dumps(report, ensure_ascii=False))
}, "tokenizer": { "name": "stream_spacy_tokenizer", "lowercase": False }, "tracker": { "name": "featurized_tracker", "slot_names": ["pricerange", "this", "area", "food", "name"] }, "main": True, "debug": False } mode_train = {"mode": "train"} tracker = from_params(bot_dict["tracker"], "train") tokenizer = from_params(bot_dict["tokenizer"], "train") network_parameters = bot_dict["network_parameters"] template_path = bot_dict["template_path"] save_path = bot_dict["save_path"] load_path = bot_dict["load_path"] template_type = bot_dict["template_type"] # 'str' object has no attribute 'items' word_vocab = from_params(bot_dict["word_vocab"], "train") bow_embedder = from_params(bot_dict["bow_embedder"], "train")
def fit_chainer(config: dict, iterator: Union[DataLearningIterator, DataFittingIterator]) -> Chainer: """Fit and return the chainer described in corresponding configuration dictionary.""" chainer_config: dict = config['chainer'] chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y')) for component_config in chainer_config['pipe']: component = from_params(component_config, mode='train') if ('fit_on' in component_config) and \ (not callable(getattr(component, 'partial_fit', None))): component: Estimator targets = component_config['fit_on'] if isinstance(targets, str): targets = [targets] preprocessed = chainer.compute(*iterator.get_instances('train'), targets=targets) if len(component_config['fit_on']) == 1: preprocessed = [preprocessed] result = component.fit(*preprocessed) if result is not None and config['train'].get('tensorboard_log_dir') is not None: import tensorflow as tf tb_log_dir = expand_path(config['train']['tensorboard_log_dir']) writer = tf.summary.FileWriter(str(tb_log_dir / 'fit_log')) for name, scores in result.items(): for i, score in enumerate(scores): summ = tf.Summary() summ.value.add(tag='fit/' + name, simple_value=score) writer.add_summary(summ, i) writer.flush() component.save() if 'fit_on_batch' in component_config: log.warning('`fit_on_batch` is deprecated and will be removed in future versions.' ' Please use `fit_on` instead.') if ('fit_on_batch' in component_config) or \ (('fit_on' in component_config) and callable(getattr(component, 'partial_fit', None))): component: Estimator targets = component_config.get('fit_on', component_config['fit_on_batch']) if isinstance(targets, str): targets = [targets] for i, data in enumerate(iterator.gen_batches(config['train']['batch_size'], shuffle=False)): preprocessed = chainer.compute(*data, targets=targets) if len(targets) == 1: preprocessed = [preprocessed] result = component.partial_fit(*preprocessed) if result is not None and config['train'].get('tensorboard_log_dir') is not None: if i == 0: import tensorflow as tf tb_log_dir = expand_path(config['train']['tensorboard_log_dir']) writer = tf.summary.FileWriter(str(tb_log_dir / 'fit_batches_log')) for name, score in result.items(): summ = tf.Summary() summ.value.add(tag='fit_batches/' + name, simple_value=score) writer.add_summary(summ, i) writer.flush() component.save() if 'in' in component_config: c_in = component_config['in'] c_out = component_config['out'] in_y = component_config.get('in_y', None) main = component_config.get('main', False) chainer.append(component, c_in, c_out, in_y, main) return chainer
def train_evaluate_model_from_config(config: Union[str, Path, dict], to_train: bool = True,
                                     to_validate: bool = True) -> None:
    """Make training and evaluation of the model described in corresponding configuration file."""
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)
    import_packages(config.get('metadata', {}).get('imports', []))

    dataset_config = config.get('dataset', None)

    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    data = []
    reader_config = config.get('dataset_reader', None)

    if reader_config:
        reader_config = config['dataset_reader']
        if 'class' in reader_config:
            c = reader_config.pop('class')
            try:
                module_name, cls_name = c.split(':')
                reader = getattr(importlib.import_module(module_name), cls_name)()
            except ValueError:
                e = ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                                .format(c))
                log.exception(e)
                raise e
        else:
            reader = get_model(reader_config.pop('name'))()
        data_path = reader_config.pop('data_path', '')
        if isinstance(data_path, list):
            data_path = [expand_path(x) for x in data_path]
        else:
            data_path = expand_path(data_path)
        data = reader.read(data_path, **reader_config)
    else:
        log.warning("No dataset reader is provided in the JSON config.")

    iterator_config = config['dataset_iterator']
    iterator: Union[DataLearningIterator, DataFittingIterator] = from_params(iterator_config,
                                                                             data=data)

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': to_validate,
        'test_best': True,
        'show_examples': False
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(zip(train_config['metrics'],
                                 get_metrics_by_names(train_config['metrics'])))

    if to_train:
        model = fit_chainer(config, iterator)

        if callable(getattr(model, 'train_on_batch', None)):
            _train_batches(model, iterator, train_config, metrics_functions)
        elif callable(getattr(model, 'fit_batches', None)):
            _fit_batches(model, iterator, train_config)
        elif callable(getattr(model, 'fit', None)):
            _fit(model, iterator, train_config)
        elif not isinstance(model, Chainer):
            log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid': _test_model(model, metrics_functions, iterator,
                                     train_config.get('batch_size', -1), 'valid',
                                     show_examples=train_config['show_examples'])
            }
            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test': _test_model(model, metrics_functions, iterator,
                                    train_config.get('batch_size', -1), 'test',
                                    show_examples=train_config['show_examples'])
            }
            print(json.dumps(report, ensure_ascii=False))
def train_evaluate_model_from_config(config: Union[str, Path, dict], to_train=True,
                                     to_validate=True) -> None:
    if isinstance(config, (str, Path)):
        config = read_json(config)
    set_deeppavlov_root(config)
    import_packages(config.get('metadata', {}).get('imports', []))

    dataset_config = config.get('dataset', None)

    if dataset_config:
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'name': 'basic_classification_reader'}
            iterator = {'name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    data = []
    reader_config = config.get('dataset_reader', None)

    if reader_config:
        reader_config = config['dataset_reader']
        if 'class' in reader_config:
            c = reader_config.pop('class')
            try:
                module_name, cls_name = c.split(':')
                reader = getattr(importlib.import_module(module_name), cls_name)()
            except ValueError:
                e = ConfigError('Expected class description in a `module.submodules:ClassName` form, but got `{}`'
                                .format(c))
                log.exception(e)
                raise e
        else:
            reader = get_model(reader_config.pop('name'))()
        data_path = expand_path(reader_config.pop('data_path', ''))
        data = reader.read(data_path, **reader_config)
    else:
        log.warning("No dataset reader is provided in the JSON config.")

    iterator_config = config['dataset_iterator']
    iterator: Union[DataLearningIterator, DataFittingIterator] = from_params(iterator_config,
                                                                             data=data)

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': to_validate,
        'test_best': True
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(zip(train_config['metrics'],
                                 get_metrics_by_names(train_config['metrics'])))

    if to_train:
        model = fit_chainer(config, iterator)

        if callable(getattr(model, 'train_on_batch', None)):
            _train_batches(model, iterator, train_config, metrics_functions)
        elif callable(getattr(model, 'fit_batches', None)):
            _fit_batches(model, iterator, train_config)
        elif callable(getattr(model, 'fit', None)):
            _fit(model, iterator, train_config)
        elif not isinstance(model, Chainer):
            log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid': _test_model(model, metrics_functions, iterator,
                                     train_config.get('batch_size', -1), 'valid')
            }
            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test': _test_model(model, metrics_functions, iterator,
                                    train_config.get('batch_size', -1), 'test')
            }
            print(json.dumps(report, ensure_ascii=False))
def load_elmo(elmo_output_names=("word_emb",)):
    config = parse_config(getattr(configs.elmo_embedder, "elmo_ru-news"))
    elmo_config = config["chainer"]["pipe"][-1]
    elmo_config['elmo_output_names'] = elmo_output_names
    embedder = from_params(elmo_config)
    return embedder
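# Usage sketch for load_elmo above; the extra output name follows the
# convention assumed for DeepPavlov's ELMo component and may need checking.
embedder = load_elmo(elmo_output_names=("word_emb", "lstm_outputs1"))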
def train_model_from_config(config_path: str):
    config = read_json(config_path)
    set_deeppavlov_root(config)

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    data = reader.read(data_path)

    dataset_config = config['dataset']
    dataset: Dataset = from_params(dataset_config, data=data)

    if 'chainer' in config:
        model = fit_chainer(config, dataset)
    else:
        vocabs = {}
        for vocab_param_name, vocab_config in config.get('vocabs', {}).items():
            v: Estimator = from_params(vocab_config, mode='train')
            vocabs[vocab_param_name] = _fit(v, dataset)

        model_config = config['model']
        model = from_params(model_config, vocabs=vocabs, mode='train')

    train_config = {
        'metrics': ['accuracy'],
        'validate_best': True,
        'test_best': True
    }

    try:
        train_config.update(config['train'])
    except KeyError:
        log.warning('Train config is missing. Populating with default values')

    metrics_functions = list(zip(train_config['metrics'],
                                 get_metrics_by_names(train_config['metrics'])))

    if callable(getattr(model, 'train_on_batch', None)):
        _train_batches(model, dataset, train_config, metrics_functions)
    elif callable(getattr(model, 'fit', None)):
        _fit(model, dataset, train_config)
    elif not isinstance(model, Chainer):
        log.warning('Nothing to train')

    if train_config['validate_best'] or train_config['test_best']:
        # try:
        #     model_config['load_path'] = model_config['save_path']
        # except KeyError:
        #     log.warning('No "save_path" parameter for the model, so "load_path" will not be renewed')
        model = build_model_from_config(config, load_trained=True)
        log.info('Testing the best saved model')

        if train_config['validate_best']:
            report = {
                'valid': _test_model(model, metrics_functions, dataset,
                                     train_config.get('batch_size', -1), 'valid')
            }
            print(json.dumps(report, ensure_ascii=False))

        if train_config['test_best']:
            report = {
                'test': _test_model(model, metrics_functions, dataset,
                                    train_config.get('batch_size', -1), 'test')
            }
            print(json.dumps(report, ensure_ascii=False))