def __init__(self, emb_folder: str, emb_url: str, save_path: str, load_path: str, context_limit: int = 450, question_limit: int = 150, char_limit: int = 16, level: str = 'token', *args, **kwargs): self.emb_folder = expand_path(emb_folder) self.level = level self.emb_url = emb_url self.emb_file_name = Path(emb_url).name self.save_path = expand_path(save_path) self.load_path = expand_path(load_path) self.context_limit = context_limit self.question_limit = question_limit self.char_limit = char_limit self.loaded = False self.NULL = "<NULL>" self.OOV = "<OOV>" self.emb_folder.mkdir(parents=True, exist_ok=True) if not (self.emb_folder / self.emb_file_name).exists(): download(self.emb_folder / self.emb_file_name, self.emb_url) if self.load_path.exists(): self.load()
def get_config_downloads( config: Union[str, Path, dict]) -> Set[Tuple[str, Path]]: config = parse_config(config) downloads = set() if 'metadata' in config and 'download' in config['metadata']: for resource in config['metadata']['download']: if isinstance(resource, str): resource = {'url': resource} url = resource['url'] dest = expand_path(resource.get('subdir', '')) downloads.add((url, dest)) config_references = [ expand_path(config_ref) for config_ref in get_all_elems_from_json(config, 'config_path') ] downloads |= {(url, dest) for config in config_references for url, dest in get_config_downloads(config)} return downloads
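# A minimal usage sketch (not from the original sources): get_config_downloads returns a set of
# (url, destination_dir) pairs, so fetching every resource listed in a config could look like the
# loop below. The download(dest_path, url) call signature is assumed from the other snippets in
# this collection, and 'some_config.json' is a hypothetical path.
for url, dest_dir in get_config_downloads('some_config.json'):
    download(dest_dir / Path(url).name, url)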
def __init__( self, vocabs_path, save_path, load_path, max_sequence_length, padding="post", truncating="pre", ): self.max_sequence_length = max_sequence_length self.padding = padding self.truncating = truncating save_path = expand_path(save_path).resolve().parent load_path = expand_path(load_path).resolve().parent self.vocabs_path = expand_path(vocabs_path) self.tok_save_path = save_path / "tok2int.dict" self.tok_load_path = load_path / "tok2int.dict" self.cont_save_path = save_path / "cont2toks.dict" self.cont_load_path = load_path / "cont2toks.dict" self.resp_save_path = save_path / "resp2toks.dict" self.resp_load_path = load_path / "resp2toks.dict" self.cemb_save_path = str(save_path / "context_embs.npy") self.cemb_load_path = str(load_path / "context_embs.npy") self.remb_save_path = str(save_path / "response_embs.npy") self.remb_load_path = str(load_path / "response_embs.npy") self.int2tok_vocab = {} self.tok2int_vocab = {} self.response2toks_vocab = {} self.response2emb_vocab = {} self.context2toks_vocab = {} self.context2emb_vocab = {}
def read_data_by_config(config: dict): """Read data by dataset_reader from specified config.""" dataset_config = config.get('dataset', None) if dataset_config: config.pop('dataset') ds_type = dataset_config['type'] if ds_type == 'classification': reader = {'class_name': 'basic_classification_reader'} iterator = {'class_name': 'basic_classification_iterator'} config['dataset_reader'] = {**dataset_config, **reader} config['dataset_iterator'] = {**dataset_config, **iterator} else: raise Exception("Unsupported dataset type: {}".format(ds_type)) try: reader_config = dict(config['dataset_reader']) except KeyError: raise ConfigError("No dataset reader is provided in the JSON config.") reader = get_model(reader_config.pop('class_name'))() data_path = reader_config.pop('data_path', '') if isinstance(data_path, list): data_path = [expand_path(x) for x in data_path] else: data_path = expand_path(data_path) return reader.read(data_path, **reader_config)
def __init__(self, save_path: str = './tok.dict', load_path: str = './tok.dict', max_sequence_length: int = None, dynamic_batch: bool = False, padding: str = 'post', truncating: str = 'post', use_matrix: bool = True, num_context_turns: int = 1, num_ranking_samples: int = 1, add_raw_text: bool = False, tokenizer: Component = None, vocab: Optional[Estimator] = None, embedder: Optional[Component] = None, sent_vocab: Optional[Estimator] = None, **kwargs): self.max_sequence_length = max_sequence_length self.padding = padding self.truncating = truncating self.dynamic_batch = dynamic_batch self.use_matrix = use_matrix self.num_ranking_samples = num_ranking_samples self.num_context_turns = num_context_turns self.add_raw_text = add_raw_text self.tokenizer = tokenizer self.embedder = embedder self.vocab = vocab self.sent_vocab = sent_vocab self.save_path = expand_path(save_path).resolve() self.load_path = expand_path(load_path).resolve() super().__init__(load_path=self.load_path, save_path=self.save_path, **kwargs)
def get_config_downloads(config_path): dp_root_back = get_deeppavlov_root() config = read_json(config_path) set_deeppavlov_root(config) downloads = set() if 'metadata' in config and 'download' in config['metadata']: for resource in config['metadata']['download']: if isinstance(resource, str): resource = { 'url': resource } url = resource['url'] dest = expand_path(resource.get('subdir', '')) downloads.add((url, dest)) config_references = [expand_path(config_ref) for config_ref in get_all_elems_from_json(config, 'config_path')] downloads |= {(url, dest) for config in config_references for url, dest in get_config_downloads(config)} set_deeppavlov_root({'deeppavlov_root': dp_root_back}) return downloads
def __init__(self, save_path, load_path=None, mode='infer', *args, **kwargs): if save_path: self.save_path = expand_path(save_path) self.save_path.parent.mkdir(parents=True, exist_ok=True) else: self.save_path = None if load_path: self.load_path = expand_path(load_path) if mode != 'train' and self.save_path and self.load_path != self.save_path: log.warning( "Load path '{}' differs from save path '{}' in '{}' mode for {}." .format(self.load_path, self.save_path, mode, self.__class__.__name__)) elif mode != 'train' and self.save_path: self.load_path = self.save_path log.warning( "No load path is set for {} in '{}' mode. Using save path instead" .format(self.__class__.__name__, mode)) else: self.load_path = None log.warning("No load path is set for {}!".format( self.__class__.__name__))
def __init__(self, bert_config_file: str, n_tags: List[str], keep_prob: float, attention_probs_keep_prob: float = None, hidden_keep_prob: float = None, encoder_layer_ids: List[int] = tuple(range(12)), optimizer: str = None, num_warmup_steps: int = None, weight_decay_rate: float = 0.01, return_probas: bool = False, pretrained_bert: str = None, min_learning_rate: float = 1e-06, **kwargs) -> None: super().__init__(**kwargs) self.return_probas = return_probas self.n_tags = n_tags self.min_learning_rate = min_learning_rate self.keep_prob = keep_prob self.optimizer = optimizer self.encoder_layer_ids = encoder_layer_ids self.num_warmup_steps = num_warmup_steps self.weight_decay_rate = weight_decay_rate self.bert_config = BertConfig.from_json_file( str(expand_path(bert_config_file))) if attention_probs_keep_prob is not None: self.bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob if hidden_keep_prob is not None: self.bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob self.sess_config = tf.ConfigProto(allow_soft_placement=True) self.sess_config.gpu_options.allow_growth = True self.sess = tf.Session(config=self.sess_config) self._init_graph() self._init_optimizer() self.sess.run(tf.global_variables_initializer()) if pretrained_bert is not None: pretrained_bert = str(expand_path(pretrained_bert)) if tf.train.checkpoint_exists(pretrained_bert) \ and not tf.train.checkpoint_exists(str(self.load_path.resolve())): logger.info('[initializing model with Bert from {}]'.format( pretrained_bert)) # Exclude optimizer and classification variables from saved variables var_list = self._get_saveable_variables( exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'ner')) saver = tf.train.Saver(var_list) saver.restore(self.sess, pretrained_bert) if self.load_path is not None: self.load()
def load(self, fname=None): if fname is not None: self.load_path = fname if self.pretrained_bert: self.pretrained_bert = str(expand_path(self.pretrained_bert)) config = AutoConfig.from_pretrained(self.pretrained_bert, num_labels=self.n_classes, output_attentions=False, output_hidden_states=False) self.model = AutoModelForTokenClassification.from_pretrained( self.pretrained_bert, config=config) elif self.bert_config_file and Path(self.bert_config_file).is_file(): self.bert_config = AutoConfig.from_json_file( str(expand_path(self.bert_config_file))) if self.attention_probs_keep_prob is not None: self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob if self.hidden_keep_prob is not None: self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob self.model = AutoModelForTokenClassification.from_config( config=self.bert_config) else: raise ConfigError("No pre-trained BERT model is given.") self.model.to(self.device) self.optimizer = getattr(torch.optim, self.optimizer_name)( self.model.parameters(), **self.optimizer_parameters) if self.lr_scheduler_name is not None: self.lr_scheduler = getattr(torch.optim.lr_scheduler, self.lr_scheduler_name)( self.optimizer, **self.lr_scheduler_parameters) if self.load_path: log.info(f"Load path {self.load_path} is given.") if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir(): raise ConfigError("Provided load path is incorrect!") weights_path = Path(self.load_path.resolve()) weights_path = weights_path.with_suffix(".pth.tar") if weights_path.exists(): log.info(f"Load path {weights_path} exists.") log.info( f"Initializing `{self.__class__.__name__}` from saved.") # now load the weights, optimizer from saved log.info(f"Loading weights from {weights_path}.") checkpoint = torch.load(weights_path, map_location=self.device) self.model.load_state_dict(checkpoint["model_state_dict"]) self.optimizer.load_state_dict( checkpoint["optimizer_state_dict"]) self.epochs_done = checkpoint.get("epochs_done", 0) else: log.info( f"Init from scratch. Load path {weights_path} does not exist." )
def run_population(population, evolution, gpus): """ Change save and load paths for the obtained population, save config.json with each model config, and run the population via the current Python executable (the one evolve.py was run with) on the given devices (-1 means CPU, other integers are GPU ids visible to evolve.py) Args: population: list of dictionaries - configs of the current population evolution: ParamsEvolution gpus: list of given devices (list of integers) Returns: None """ population_size = len(population) for k in range(population_size // len(gpus) + 1): procs = [] for j in range(len(gpus)): i = k * len(gpus) + j if i < population_size: save_path = expand_path( Path( evolution.get_value_from_config( population[i], evolution.main_model_path + ["save_path"])).parent) save_path.mkdir(parents=True, exist_ok=True) f_name = save_path.joinpath("config.json") save_json(population[i], f_name) with save_path.joinpath('out.txt').open('w', encoding='utf8') as outlog,\ save_path.joinpath('err.txt').open('w', encoding='utf8') as errlog: env = dict(os.environ) if len(gpus) > 1 or gpus[0] != -1: env['CUDA_VISIBLE_DEVICES'] = str(gpus[j]) procs.append( Popen("{} -m deeppavlov train {}".format( sys.executable, str(f_name)), shell=True, stdout=outlog, stderr=errlog, env=env)) for j, proc in enumerate(procs): i = k * len(gpus) + j log.info(f'Waiting on {i}th proc') if proc.wait() != 0: save_path = expand_path( Path( evolution.get_value_from_config( population[i], evolution.main_model_path + ["save_path"])).parent) with save_path.joinpath('err.txt').open( encoding='utf8') as errlog: log.warning( f'Population {i} returned an error code {proc.returncode} and an error log:\n' + errlog.read()) return None
def __init__(self, pop_dict_path: str, load_path: str, top_n: int = 3, active: bool = True, **kwargs) -> None: pop_dict_path = expand_path(pop_dict_path) logger.info(f"Reading popularity dictionary from {pop_dict_path}") self.pop_dict = read_json(pop_dict_path) self.mean_pop = np.mean(list(self.pop_dict.values())) load_path = expand_path(load_path) logger.info(f"Loading popularity ranker from {load_path}") self.clf = joblib.load(load_path) self.top_n = top_n self.active = active
def __init__(self, bert_config_file, n_classes, keep_prob, one_hot_labels=False, multilabel=False, return_probas=False, attention_probs_keep_prob=None, hidden_keep_prob=None, optimizer=None, num_warmup_steps=None, weight_decay_rate=0.01, pretrained_bert=None, min_learning_rate=1e-06, **kwargs) -> None: super().__init__(**kwargs) self.return_probas = return_probas self.n_classes = n_classes self.min_learning_rate = min_learning_rate self.keep_prob = keep_prob self.one_hot_labels = one_hot_labels self.multilabel = multilabel self.optimizer = optimizer self.num_warmup_steps = num_warmup_steps self.weight_decay_rate = weight_decay_rate if self.multilabel and not self.one_hot_labels: raise RuntimeError('Use one-hot encoded labels for multilabel classification!') if self.multilabel and not self.return_probas: raise RuntimeError('Set return_probas to True for multilabel classification!') self.bert_config = BertConfig.from_json_file(str(expand_path(bert_config_file))) if attention_probs_keep_prob is not None: self.bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob if hidden_keep_prob is not None: self.bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob self.sess_config = tf.ConfigProto(allow_soft_placement=True) self.sess_config.gpu_options.allow_growth = True self.sess = tf.Session(config=self.sess_config) self._init_graph() self._init_optimizer() self.sess.run(tf.global_variables_initializer()) if pretrained_bert is not None: pretrained_bert = str(expand_path(pretrained_bert)) if tf.train.checkpoint_exists(pretrained_bert) \ and not (self.load_path and tf.train.checkpoint_exists(str(self.load_path.resolve()))): logger.info('[initializing model with Bert from {}]'.format(pretrained_bert)) # Exclude optimizer and classification variables from saved variables var_list = self._get_saveable_variables( exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'output_weights', 'output_bias')) saver = tf.train.Saver(var_list) saver.restore(self.sess, pretrained_bert) if self.load_path is not None: self.load()
def __init__(self, bert_config_file, keep_prob=0.9, attention_probs_keep_prob=None, hidden_keep_prob=None, optimizer=None, weight_decay_rate=0.01, pretrained_bert=None, min_learning_rate=1e-06, **kwargs) -> None: super().__init__(**kwargs) self.min_learning_rate = min_learning_rate self.keep_prob = keep_prob self.optimizer = optimizer self.weight_decay_rate = weight_decay_rate self.bert_config = BertConfig.from_json_file( str(expand_path(bert_config_file))) if attention_probs_keep_prob is not None: self.bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob if hidden_keep_prob is not None: self.bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob self.sess_config = tf.ConfigProto(allow_soft_placement=True) self.sess_config.gpu_options.allow_growth = True self.sess = tf.Session(config=self.sess_config) self._init_graph() self._init_optimizer() if pretrained_bert is not None: pretrained_bert = str(expand_path(pretrained_bert)) if tf.train.checkpoint_exists(pretrained_bert) \ and not tf.train.checkpoint_exists(str(self.load_path.resolve())): logger.info('[initializing model with Bert from {}]'.format( pretrained_bert)) # Exclude optimizer and classification variables from saved variables var_list = self._get_saveable_variables( exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'output_weights', 'output_bias')) assignment_map = self.get_variables_to_restore( var_list, pretrained_bert) tf.train.init_from_checkpoint(pretrained_bert, assignment_map) self.sess.run(tf.global_variables_initializer()) if self.load_path is not None: self.load()
def __init__(self, embedder: Component, tokenizer: Component = None, pad_zero: bool = False, mean: bool = False, tags_vocab_path: str = None, vectorizer: Component = None, counter_vocab_path: str = None, idf_base_count: int = 100, log_base: int = 10, min_idf_weight=0.0, **kwargs) -> None: """ Initialize embedder with given parameters. """ self.embedder = embedder self.dim = self.embedder.dim self.mean = mean self.pad_zero = pad_zero if tokenizer is None: self.tokenizer = self.space_detokenizer else: self.tokenizer = tokenizer if vectorizer and counter_vocab_path: raise ConfigError( "TfidfWeightedEmbedder got both vectorizer and counter_vocab_path." " Remove one of them.") elif vectorizer: self.vectorizer = vectorizer self.vocabulary = np.array( self.vectorizer.model.get_feature_names()) elif counter_vocab_path: self.counter_vocab_path = expand_path(counter_vocab_path) self.counter_vocab, self.min_count = self.load_counter_vocab( self.counter_vocab_path) self.idf_base_count = idf_base_count self.log_base = log_base self.min_idf_weight = min_idf_weight else: raise ConfigError( "TfidfWeightedEmbedder got neither vectorizer nor counter_vocab_path." " Set one of them.") if tags_vocab_path: self.tags_vocab = self.load_tags_vocab( expand_path(tags_vocab_path)) else: self.tags_vocab = None
def __init__(self, bert_config_file: str, keep_prob: float, attention_probs_keep_prob: Optional[float] = None, hidden_keep_prob: Optional[float] = None, optimizer: Optional[str] = None, weight_decay_rate: Optional[float] = 0.01, pretrained_bert: Optional[str] = None, min_learning_rate: float = 1e-06, **kwargs) -> None: super().__init__(**kwargs) self.min_learning_rate = min_learning_rate self.keep_prob = keep_prob self.optimizer = optimizer self.weight_decay_rate = weight_decay_rate self.bert_config = BertConfig.from_json_file( str(expand_path(bert_config_file))) if attention_probs_keep_prob is not None: self.bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob if hidden_keep_prob is not None: self.bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob self.sess_config = tf.ConfigProto(allow_soft_placement=True) self.sess_config.gpu_options.allow_growth = True self.sess = tf.Session(config=self.sess_config) self._init_graph() self._init_optimizer() self.sess.run(tf.global_variables_initializer()) if pretrained_bert is not None: pretrained_bert = str(expand_path(pretrained_bert)) if tf.train.checkpoint_exists(pretrained_bert) \ and not (self.load_path and tf.train.checkpoint_exists(str(self.load_path.resolve()))): logger.info('[initializing model with Bert from {}]'.format( pretrained_bert)) var_list = self._get_saveable_variables( exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'squad')) saver = tf.train.Saver(var_list) saver.restore(self.sess, pretrained_bert) if self.load_path is not None: self.load()
def __init__(self, save_path: str, load_path: str, max_sequence_length: int, max_token_length: int, padding: str = 'post', truncating: str = 'post', token_embeddings: bool = True, char_embeddings: bool = False, char_pad: str = 'post', char_trunc: str = 'post', tok_dynamic_batch: bool = False, char_dynamic_batch: bool = False, update_embeddings: bool = False): self.max_sequence_length = max_sequence_length self.token_embeddings = token_embeddings self.char_embeddings = char_embeddings self.max_token_length = max_token_length self.padding = padding self.truncating = truncating self.char_pad = char_pad self.char_trunc = char_trunc self.tok_dynamic_batch = tok_dynamic_batch self.char_dynamic_batch = char_dynamic_batch self.upd_embs = update_embeddings save_path = expand_path(save_path).resolve().parent load_path = expand_path(load_path).resolve().parent self.char_save_path = save_path / "char2int.dict" self.char_load_path = load_path / "char2int.dict" self.tok_save_path = save_path / "tok2int.dict" self.tok_load_path = load_path / "tok2int.dict" self.cont_save_path = save_path / "cont2toks.dict" self.cont_load_path = load_path / "cont2toks.dict" self.resp_save_path = save_path / "resp2toks.dict" self.resp_load_path = load_path / "resp2toks.dict" self.cemb_save_path = str(save_path / "context_embs.npy") self.cemb_load_path = str(load_path / "context_embs.npy") self.remb_save_path = str(save_path / "response_embs.npy") self.remb_load_path = str(load_path / "response_embs.npy") self.int2tok_vocab = {} self.tok2int_vocab = {} self.response2toks_vocab = {} self.response2emb_vocab = {} self.context2toks_vocab = {} self.context2emb_vocab = {}
def __init__(self, freq_dict_filename: str, candidate_nouns: int = 10, **kwargs): """ Args: freq_dict_filename: file with the dictionary of Russian words with the corresponding frequencies candidate_nouns: how many candidate nouns to leave after search **kwargs: """ self.candidate_nouns = candidate_nouns alphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя-" self.alphabet_length = len(alphabet) self.max_word_length = 24 self.letter_nums = {letter: num for num, letter in enumerate(alphabet)} with open(str(expand_path(freq_dict_filename)), 'r') as fl: lines = fl.readlines() pos_freq_dict = defaultdict(list) for line in lines: line_split = line.strip('\n').split('\t') if re.match("[\d]+\.[\d]+", line_split[2]): pos_freq_dict[line_split[1]].append( (line_split[0], float(line_split[2]))) self.nouns_with_freq = pos_freq_dict["s"] + pos_freq_dict["s.PROP"] self.adj_set = set([word for word, freq in pos_freq_dict["a"]]) self.nouns = [noun[0] for noun in self.nouns_with_freq] self.matrix = self.make_sparse_matrix(self.nouns).transpose()
def __init__(self, spec: str, elmo_output_names: Optional[List] = None, dim: Optional[int] = None, pad_zero: bool = False, concat_last_axis: bool = True, max_token: Optional[int] = None, mini_batch_size: int = 32, **kwargs) -> None: self.spec = spec if '://' in spec else str(expand_path(spec)) self.elmo_output_dims = {'word_emb': 512, 'lstm_outputs1': 1024, 'lstm_outputs2': 1024, 'elmo': 1024, 'default': 1024} elmo_output_names = elmo_output_names or ['default'] self.elmo_output_names = elmo_output_names elmo_output_names_set = set(self.elmo_output_names) if elmo_output_names_set - set(self.elmo_output_dims.keys()): log.error(f'Incorrect elmo_output_names = {elmo_output_names} . You can use either ["default"] or some of' '["word_emb", "lstm_outputs1", "lstm_outputs2","elmo"]') sys.exit(1) if elmo_output_names_set - {'default'} and elmo_output_names_set - {"word_emb", "lstm_outputs1", "lstm_outputs2", "elmo"}: log.error('Incompatible conditions: you can use either ["default"] or list of ' '["word_emb", "lstm_outputs1", "lstm_outputs2","elmo"] ') sys.exit(1) self.pad_zero = pad_zero self.concat_last_axis = concat_last_axis self.max_token = max_token self.mini_batch_size = mini_batch_size self.elmo_outputs, self.sess, self.tokens_ph, self.tokens_length_ph = self._load() self.dim = self._get_dims(self.elmo_output_names, dim, concat_last_axis)
def __init__(self, data_path: Union[Path, str], *args, **kwargs): log.info(f"Initializing `{self.__class__.__name__}`") data_path = Path(expand_path(data_path)) required_files = [ 'obscenity_words.json', 'obscenity_words_exception.json', 'obscenity_words_extended.json' ] for file in required_files: if not (data_path / file).exists(): raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(data_path / file)) self.obscenity_words = set( json.load( open(data_path / 'obscenity_words.json', encoding="utf-8"))) self.obscenity_words_extended = set( json.load( open(data_path / 'obscenity_words_extended.json', encoding="utf-8"))) self.obscenity_words_exception = set( json.load( open(data_path / 'obscenity_words_exception.json', encoding="utf-8"))) self.obscenity_words.update(self.obscenity_words_extended) PATTERN_1, PATTERN_2 = self._get_patterns() self.regexp = re.compile(PATTERN_1, re.U | re.I) self.regexp2 = re.compile(PATTERN_2, re.U | re.I) self.morph = pymorphy2.MorphAnalyzer() self.word_pattern = re.compile(r'[А-яЁё]+')
def __init__(self, data_dir: str = '', data_url: str = DB_URL, batch_size: int = None, shuffle: bool = None, seed: int = None, **kwargs): """ :param data_dir: a directory name where DB is located :param data_url: an URL to SQLite DB :param batch_size: a batch size for reading from the database """ download_dir = expand_path(data_dir) download_path = download_dir.joinpath(data_url.split("/")[-1]) download(download_path, data_url, force_download=False) # if not download_dir.exists() or is_empty(download_dir): # logger.info('[downloading wiki.db from {} to {}]'.format(data_url, download_path)) # download(download_path, data_url) self.connect = sqlite3.connect(str(download_path), check_same_thread=False) self.db_name = self.get_db_name() self.doc_ids = self.get_doc_ids() self.doc2index = self.map_doc2idx() self.batch_size = batch_size self.shuffle = shuffle self.random = Random(seed)
def __init__(self, squad_model_config: str, vocab_file: str, do_lower_case: bool, max_seq_length: int = 512, batch_size: int = 10, lang: str = 'en', **kwargs) -> None: config = json.load(open(squad_model_config)) config['chainer']['pipe'][0]['max_seq_length'] = max_seq_length self.model = build_model(config) self.max_seq_length = max_seq_length if Path(vocab_file).is_file(): vocab_file = str(expand_path(vocab_file)) self.tokenizer = AutoTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) else: self.tokenizer = AutoTokenizer.from_pretrained( vocab_file, do_lower_case=do_lower_case) self.batch_size = batch_size if lang == 'en': from nltk import sent_tokenize self.sent_tokenizer = sent_tokenize elif lang == 'ru': from ru_sent_tokenize import ru_sent_tokenize self.sent_tokenizer = ru_sent_tokenize else: raise RuntimeError('en and ru languages are supported only')
def __init__(self, dictionary: StaticDictionary, window=1, lm_file=None, *args, **kwargs): super().__init__(*args, **kwargs) self.costs = defaultdict(itertools.repeat(float('-inf')).__next__) self.dictionary = dictionary self.window = window if self.window == 0: self.find_candidates = self._find_candidates_window_0 else: self.find_candidates = self._find_candidates_window_n self.costs[('', '')] = log(1) self.costs[('⟬', '⟬')] = log(1) self.costs[('⟭', '⟭')] = log(1) for c in self.dictionary.alphabet: self.costs[(c, c)] = log(1) # if self.ser_path.is_file(): self.load() if lm_file: self.lm = kenlm.Model(str(expand_path(lm_file))) self.beam_size = 4 self.candidates_count = 4 self._infer_instance = self._infer_instance_lm
def save(self) -> None: encoder_weights_path = expand_path( self.encoder_save_path).with_suffix(".pth.tar") log.info(f"Saving encoder to {encoder_weights_path}.") torch.save({"model_state_dict": self.encoder.cpu().state_dict()}, encoder_weights_path) self.encoder.to(self.device)
def _load_actions2slots_formfilling_info_from_json(self, actions_required_acquired_slots_path: Optional[Union[str, Path]] = None)\ -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]: """ loads the formfilling mapping of actions onto the required slots from the json of the following structure: {action1: {"required": [required_slot_name_1], "acquired": [acquired_slot_name_1, acquired_slot_name_2]}, action2: {"required": [required_slot_name_21, required_slot_name_22], "acquired": [acquired_slot_name_21]}, ..} Returns: the dictionary represented by the passed json """ actions_required_acquired_slots_path = expand_path( actions_required_acquired_slots_path) with open(actions_required_acquired_slots_path, encoding="utf-8") as actions2slots_json_f: actions2slots = json.load(actions2slots_json_f) actions2required_slots = { act: act_slots["required"] for act, act_slots in actions2slots.items() } actions2acquired_slots = { act: act_slots["acquired"] for act, act_slots in actions2slots.items() } return actions2required_slots, actions2acquired_slots
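# Worked example (illustrative only; the action name and slots are hypothetical): with a JSON file
# of the structure described in the docstring above, e.g.
# {"book_table": {"required": ["date", "time"], "acquired": ["n_guests"]}},
# the same comprehensions yield one mapping of actions to required slots and one to acquired slots.
actions2slots = {"book_table": {"required": ["date", "time"], "acquired": ["n_guests"]}}
actions2required_slots = {act: act_slots["required"] for act, act_slots in actions2slots.items()}
actions2acquired_slots = {act: act_slots["acquired"] for act, act_slots in actions2slots.items()}
# -> ({'book_table': ['date', 'time']}, {'book_table': ['n_guests']})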
def read(self, data_path: str, **kwargs) -> Dict[str, List[Dict[str, Union[int, List[int]]]]]: """Read the InsuranceQA data from files and forms the dataset. Args: data_path: A path to a folder where dataset files are stored. **kwargs: Other parameters. Returns: A dictionary containing training, validation and test parts of the dataset obtainable via ``train``, ``valid`` and ``test`` keys. """ data_path = expand_path(data_path) self._download_data(data_path) dataset = {'train': None, 'valid': None, 'test': None} train_fname = Path( data_path) / 'insuranceQA-master/V1/question.train.token_idx.label' valid_fname = Path( data_path ) / 'insuranceQA-master/V1/question.dev.label.token_idx.pool' test_fname = Path( data_path ) / 'insuranceQA-master/V1/question.test1.label.token_idx.pool' self.idxs2cont_vocab = self._build_context2toks_vocabulary( train_fname, valid_fname, test_fname) dataset["valid"] = self._preprocess_data_valid_test(valid_fname) dataset["train"] = self._preprocess_data_train(train_fname) dataset["test"] = self._preprocess_data_valid_test(test_fname) return dataset
def __init__(self, sparql_queries_filename: str, lang: str = "rus", adj_to_noun: RuAdjToNoun = None, **kwargs): """ Args: sparql_queries_filename: file with sparql query templates lang: english or russian adj_to_noun: component deeppavlov.models.kbqa.tree_to_sparql:RuAdjToNoun **kwargs: """ self.lang = lang if self.lang == "rus": self.q_pronouns = { "какой", "какая", "какое", "каком", "каким", "какую", "кто", "что", "как", "когда", "где", "чем", "сколько" } self.how_many = "сколько" self.change_root_tokens = {"каким был", "какой была"} self.temporal_order_tokens = {"первый", "последний"} elif self.lang == "eng": self.q_pronouns = {"what", "who", "how", "when", "where", "which"} self.how_many = "how many" self.change_root_tokens = "" self.temporal_order_tokens = {"first", "last"} else: raise ValueError(f"unsupported language {lang}") self.sparql_queries_filename = expand_path(sparql_queries_filename) self.template_queries = read_json(self.sparql_queries_filename) self.adj_to_noun = adj_to_noun self.morph = pymorphy2.MorphAnalyzer()
def __init__(self, chainer_config: dict, *, batch_size: int = -1, metrics: Iterable[Union[str, dict]] = ('accuracy',), evaluation_targets: Iterable[str] = ('valid', 'test'), show_examples: bool = False, tensorboard_log_dir: Optional[Union[str, Path]] = None, max_test_batches: int = -1, **kwargs) -> None: if kwargs: log.info(f'{self.__class__.__name__} got additional init parameters {list(kwargs)} that will be ignored:') self.chainer_config = chainer_config self._chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y')) self.batch_size = batch_size self.metrics = parse_metrics(metrics, self._chainer.in_y, self._chainer.out_params) self.evaluation_targets = tuple(evaluation_targets) self.show_examples = show_examples self.max_test_batches = None if max_test_batches < 0 else max_test_batches self.tensorboard_log_dir: Optional[Path] = tensorboard_log_dir if tensorboard_log_dir is not None: try: # noinspection PyPackageRequirements # noinspection PyUnresolvedReferences import tensorflow except ImportError: log.warning('TensorFlow could not be imported, so tensorboard log directory' f'`{self.tensorboard_log_dir}` will be ignored') self.tensorboard_log_dir = None else: self.tensorboard_log_dir = expand_path(tensorboard_log_dir) self._tf = tensorflow self._built = False self._saved = False self._loaded = False
def load(self) -> None: if self.pretrained_bert: log.info(f"Loading encoder from pretrained {self.pretrained_bert}.") self.pretrained_bert = str(expand_path(self.pretrained_bert)) self.config = AutoConfig.from_pretrained(self.pretrained_bert, output_hidden_states=True) self.encoder = AutoModel.from_pretrained(self.pretrained_bert, config=self.config) elif self.bert_config_file and Path(self.bert_config_file).is_file(): self.config = AutoConfig.from_json_file( str(expand_path(self.bert_config_file))) self.encoder = AutoModel.from_config(config=self.config) else: raise ConfigError("No pre-trained BERT model is given.") self.encoder.to(self.device)
def __init__(self, tags_file, **kwargs): tags_file = str(expand_path(tags_file)) self.tags_list = [] with open(tags_file, "r") as fl: lines = fl.readlines() for line in lines: self.tags_list.append(line.strip().split()[0])
def __init__(self, spec: str, elmo_output_names: Optional[List] = None, dim: Optional[int] = None, pad_zero: bool = False, concat_last_axis: bool = True, max_token: Optional[int] = None, mini_batch_size: int = 32, **kwargs) -> None: self.spec = spec if '://' in spec else str(expand_path(spec)) self.elmo_output_dims = {'word_emb': 512, 'lstm_outputs1': 1024, 'lstm_outputs2': 1024, 'elmo': 1024, 'default': 1024} elmo_output_names = elmo_output_names if elmo_output_names else ['default'] self.elmo_output_names = elmo_output_names if elmo_output_names else ['default'] elmo_output_names_set = set(self.elmo_output_names) if elmo_output_names_set - set(self.elmo_output_dims.keys()): log.error(f'Incorrect elmo_output_names = {elmo_output_names} . You can use either ["default"] or some of'\ '["word_emb", "lstm_outputs1", "lstm_outputs2","elmo"]') sys.exit(1) if elmo_output_names_set - set(['default']) and elmo_output_names_set - set(["word_emb", "lstm_outputs1", "lstm_outputs2", "elmo"]): log.error('Incompatible conditions: you can use either ["default"] or list of '\ '["word_emb", "lstm_outputs1", "lstm_outputs2","elmo"] ') sys.exit(1) self.pad_zero = pad_zero self.concat_last_axis = concat_last_axis self.max_token = max_token self.mini_batch_size = mini_batch_size self.elmo_outputs, self.sess, self.tokens_ph, self.tokens_length_ph = self._load() self.dim = self._get_dims(self.elmo_output_names, dim, concat_last_axis)
def __init__(self, vocabs_path, save_path, load_path, max_sequence_length, padding="post", truncating="post", max_token_length=None, token_embeddings=True, char_embeddings=False, char_pad="post", char_trunc="post", tok_dynamic_batch=False, char_dynamic_batch=False, update_embeddings=False): super().__init__(save_path, load_path, max_sequence_length, max_token_length, padding, truncating, token_embeddings, char_embeddings, char_pad, char_trunc, tok_dynamic_batch, char_dynamic_batch, update_embeddings) vocabs_path = expand_path(vocabs_path) self.int2tok_fname = Path(vocabs_path) / 'vocabulary' self.response2ints_fname = Path( vocabs_path) / 'answers.label.token_idx' self.train_context2ints_fname = Path( vocabs_path) / 'question.train.token_idx.label' self.val_context2ints_fname = Path( vocabs_path) / 'question.dev.label.token_idx.pool' self.test_context2ints_fname = Path( vocabs_path) / 'question.test1.label.token_idx.pool'
def run_population(population, evolution, gpus): """ Change save and load paths for the obtained population, save config.json with each model config, and run the population via the current Python executable (the one evolve.py was run with) on the given devices (-1 means CPU, other integers are GPU ids visible to evolve.py) Args: population: list of dictionaries - configs of the current population evolution: ParamsEvolution gpus: list of given devices (list of integers) Returns: None """ population_size = len(population) for k in range(population_size // len(gpus) + 1): procs = [] for j in range(len(gpus)): i = k * len(gpus) + j if i < population_size: save_path = expand_path( evolution.get_value_from_config(parse_config(population[i]), evolution.path_to_models_save_path)) save_path.mkdir(parents=True, exist_ok=True) f_name = save_path / "config.json" save_json(population[i], f_name) with save_path.joinpath('out.txt').open('w', encoding='utf8') as outlog,\ save_path.joinpath('err.txt').open('w', encoding='utf8') as errlog: env = dict(os.environ) if len(gpus) > 1 or gpus[0] != -1: env['CUDA_VISIBLE_DEVICES'] = str(gpus[j]) procs.append(Popen("{} -m deeppavlov train {}".format(sys.executable, str(f_name)), shell=True, stdout=outlog, stderr=errlog, env=env)) for j, proc in enumerate(procs): i = k * len(gpus) + j log.info(f'Waiting on {i}th proc') if proc.wait() != 0: save_path = expand_path( evolution.get_value_from_config(parse_config(population[i]), evolution.path_to_models_save_path)) with save_path.joinpath('err.txt').open(encoding='utf8') as errlog: log.warning(f'Population {i} returned an error code {proc.returncode} and an error log:\n' + errlog.read()) return None
def __init__(self, spec: str, dim: int = 1024, pad_zero: bool = False, mean: bool = False, **kwargs) -> None: self.spec = spec if '://' in spec else str(expand_path(spec)) self.dim = dim self.pad_zero = pad_zero self.mean = mean self.elmo_outputs, self.sess, self.tokens_ph, self.tokens_length_ph = self._load()
def _load_options(self, options_json_path): if options_json_path: options_json_path = expand_path(options_json_path) with open(options_json_path, 'r') as fin: options = json.load(fin) else: options = {} return options
def from_params(params: Dict, mode: str = 'infer', serialized: Any = None, **kwargs) -> Component: """Builds and returns the Component from corresponding dictionary of parameters.""" # what is passed in json: config_params = {k: _resolve(v) for k, v in params.items()} # get component by reference (if any) if 'ref' in config_params: try: component = _refs[config_params['ref']] if serialized is not None: component.deserialize(serialized) return component except KeyError: e = ConfigError('Component with id "{id}" was referenced but not initialized' .format(id=config_params['ref'])) log.exception(e) raise e elif 'config_path' in config_params: from deeppavlov.core.commands.infer import build_model refs = _refs.copy() _refs.clear() config = parse_config(expand_path(config_params['config_path'])) model = build_model(config, serialized=serialized) _refs.clear() _refs.update(refs) return model cls_name = config_params.pop('class_name', None) if not cls_name: e = ConfigError('Component config has no `class_name` nor `ref` fields') log.exception(e) raise e cls = get_model(cls_name) # find the submodels params recursively config_params = {k: _init_param(v, mode) for k, v in config_params.items()} try: spec = inspect.getfullargspec(cls) if 'mode' in spec.args+spec.kwonlyargs or spec.varkw is not None: kwargs['mode'] = mode component = cls(**dict(config_params, **kwargs)) try: _refs[config_params['id']] = component except KeyError: pass except Exception: log.exception("Exception in {}".format(cls)) raise if serialized is not None: component.deserialize(serialized) return component
def __init__(self, save_path: Union[str, Path], load_path: Optional[Union[str, Path]] = None, mode: str = 'infer', *args, **kwargs) -> None: if save_path: self.save_path = expand_path(save_path) self.save_path.parent.mkdir(parents=True, exist_ok=True) else: self.save_path = None if load_path: self.load_path = expand_path(load_path) if mode != 'train' and self.save_path and self.load_path != self.save_path: log.warning("Load path '{}' differs from save path '{}' in '{}' mode for {}." .format(self.load_path, self.save_path, mode, self.__class__.__name__)) elif mode != 'train' and self.save_path: self.load_path = self.save_path log.warning("No load path is set for {} in '{}' mode. Using save path instead" .format(self.__class__.__name__, mode)) else: self.load_path = None log.warning("No load path is set for {}!".format(self.__class__.__name__))
def __init__(self, data_dir: str = '', data_url: str = DB_URL, batch_size: int = None, shuffle: bool = None, seed: int = None, **kwargs): download_dir = expand_path(data_dir) download_path = download_dir.joinpath(data_url.split("/")[-1]) download(download_path, data_url, force_download=False) self.connect = sqlite3.connect(str(download_path), check_same_thread=False) self.db_name = self.get_db_name() self.doc_ids = self.get_doc_ids() self.doc2index = self.map_doc2idx() self.batch_size = batch_size self.shuffle = shuffle self.random = Random(seed)
def __init__(self, emb_folder: str, emb_url: str, save_path: str, load_path: str, context_limit: int = 450, question_limit: int = 150, char_limit: int = 16, level: str = 'token', *args, **kwargs): self.emb_folder = expand_path(emb_folder) self.level = level self.emb_url = emb_url self.emb_file_name = Path(emb_url).name self.save_path = expand_path(save_path) self.load_path = expand_path(load_path) self.context_limit = context_limit self.question_limit = question_limit self.char_limit = char_limit self.loaded = False self.NULL = "<NULL>" self.OOV = "<OOV>" self.emb_folder.mkdir(parents=True, exist_ok=True) self.emb_dim = self.emb_mat = self.token2idx_dict = None if self.load_path.exists(): self.load()
def __init__(self, tokenizer: Component, tracker: Tracker, network_parameters: Dict[str, Any], template_path: str, template_type: str = "DefaultTemplate", word_vocab: Component = None, bow_embedder: Component = None, embedder: Component = None, slot_filler: Component = None, intent_classifier: Component = None, database: Component = None, api_call_action: str = None, # TODO: make it unrequired use_action_mask: bool = False, debug: bool = False, load_path: str = None, save_path: str = None, **kwargs): super().__init__(load_path=load_path, save_path=save_path, **kwargs) self.tokenizer = tokenizer self.tracker = tracker self.bow_embedder = bow_embedder self.embedder = embedder self.slot_filler = slot_filler self.intent_classifier = intent_classifier self.use_action_mask = use_action_mask self.debug = debug self.word_vocab = word_vocab template_path = expand_path(template_path) template_type = getattr(templ, template_type) log.info("[loading templates from {}]".format(template_path)) self.templates = templ.Templates(template_type).load(template_path) self.n_actions = len(self.templates) log.info("{} templates loaded".format(self.n_actions)) self.database = database self.api_call_id = None if api_call_action is not None: self.api_call_id = self.templates.actions.index(api_call_action) self.intents = [] if callable(self.intent_classifier): # intent_classifier returns (y_labels, y_probs) self.intents = list(self.intent_classifier(["hi"])[1][0].keys()) self.network = self._init_network(network_parameters) self.reset()
def change_savepath_for_model(config): params_helper = ParamsSearch() dirs_for_saved_models = set() for p in params_helper.find_model_path(config, SAVE_PATH_ELEMENT_NAME): p.append(SAVE_PATH_ELEMENT_NAME) save_path = Path(params_helper.get_value_from_config(config, p)) new_save_path = save_path.parent / TEMP_DIR_FOR_CV / save_path.name dirs_for_saved_models.add(expand_path(new_save_path.parent)) params_helper.insert_value_or_dict_into_config(config, p, str(new_save_path)) return config, dirs_for_saved_models
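# Path-rewriting sketch for change_savepath_for_model (assumption: the value of TEMP_DIR_FOR_CV is
# illustrative here; the real constant is defined elsewhere in the module). Each model's save_path
# is redirected into a temporary cross-validation subdirectory next to the original location.
from pathlib import Path
TEMP_DIR_FOR_CV = "cv_tmp"  # hypothetical value for illustration
save_path = Path("models/intents/cnn_model")
new_save_path = save_path.parent / TEMP_DIR_FOR_CV / save_path.name  # models/intents/cv_tmp/cnn_model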
def __init__(self, preprocess: Component, save_path: str, load_path: str, entropy_fields: list, min_similarity: float = 0.5, min_entropy: float = 0.5, **kwargs) -> None: self.preprocess = preprocess self.save_path = expand_path(save_path) if isinstance(load_path, list): self.load_path: List = [expand_path(path) for path in load_path] else: self.load_path: List = [expand_path(load_path)] self.min_similarity = min_similarity self.min_entropy = min_entropy self.entropy_fields = entropy_fields self.ec_data: List = [] if kwargs.get('mode') != 'train': self.load()
def run_population(population, evolution, gpus): """ Change save and load paths for the obtained population, save config.json with each model config, and run the population via the current Python executable (the one evolve.py was run with) on the given devices (-1 means CPU, other integers are GPU ids visible to evolve.py) Args: population: list of dictionaries - configs of the current population evolution: ParamsEvolution gpus: list of given devices (list of integers) Returns: None """ population_size = len(population) for k in range(population_size // len(gpus) + 1): procs = [] for j in range(len(gpus)): i = k * len(gpus) + j if i < population_size: save_path = expand_path(Path(evolution.get_value_from_config( population[i], evolution.main_model_path + ["save_path"])).parent) save_path.mkdir(parents=True, exist_ok=True) f_name = save_path.joinpath("config.json") save_json(population[i], f_name) if len(gpus) == 1 and gpus[0] == -1: procs.append(Popen("{} -m deeppavlov train {}" " 1>{}/out.txt 2>{}/err.txt".format(sys.executable, str(f_name), str(save_path), str(save_path) ), shell=True, stdout=PIPE, stderr=PIPE)) else: procs.append(Popen("CUDA_VISIBLE_DEVICES={} {} -m deeppavlov train {}" " 1>{}/out.txt 2>{}/err.txt".format(gpus[j], sys.executable, str(f_name), str(save_path), str(save_path) ), shell=True, stdout=PIPE, stderr=PIPE)) for j, proc in enumerate(procs): i = k * len(gpus) + j log.info(f'Waiting on {i}th proc') proc.wait() return None
def read(self, data_path: str, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]: """Read the dataset for ranking or paraphrase identification with Siamese networks. Args: data_path: A path to a folder with dataset files. """ dataset = {'train': None, 'valid': None, 'test': None} data_path = expand_path(data_path) train_fname = data_path / 'train.csv' valid_fname = data_path / 'valid.csv' test_fname = data_path / 'test.csv' dataset["train"] = self._preprocess_data_train(train_fname) dataset["valid"] = self._preprocess_data_valid_test(valid_fname) dataset["test"] = self._preprocess_data_valid_test(test_fname) return dataset
def read(self, data_path: str, seed: int = None, *args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]: """Read the paraphraser.ru dataset from files. Args: data_path: A path to a folder with dataset files. seed: Random seed. """ data_path = expand_path(data_path) train_fname = data_path / 'paraphrases.xml' test_fname = data_path / 'paraphrases_gold.xml' train_data = self.build_data(train_fname) test_data = self.build_data(test_fname) dataset = {"train": train_data, "valid": [], "test": test_data} return dataset
def read(self, data_path: str, seed: int = None, *args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]: """Read the pretraining dataset for the paraphrase identification task from files. Args: data_path: A path to a folder with dataset files. seed: Random seed. """ data_path = expand_path(data_path) train_fname = data_path / 'paraphraser_pretrain_train.json' test_fname = data_path / 'paraphraser_pretrain_val.json' train_data = self.build_data(train_fname) test_data = self.build_data(test_fname) dataset = {"train": train_data, "valid": test_data, "test": test_data} return dataset
def __init__(self, data: Dict[str, List[Tuple]], dataset_path: str, seed: int = None, shuffle: bool = False): self.shuffle = shuffle self.random = Random(seed) # TODO: include slot vals to dstc2.tar.gz dataset_path = expand_path(dataset_path) / 'slot_vals.json' self._build_slot_vals(dataset_path) with open(dataset_path, encoding='utf8') as f: self._slot_vals = json.load(f) for data_type in ['train', 'test', 'valid']: bio_markup_data = self._preprocess(data.get(data_type, [])) setattr(self, data_type, bio_markup_data) self.data = { 'train': self.train, 'valid': self.valid, 'test': self.test, 'all': self.train + self.test + self.valid }
def read(self, data_path: str, seed: int = None, *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]: data_path = expand_path(data_path) fname = data_path / 'train.csv' contexts = [] responses = [] labels = [] with open(fname, 'r') as f: reader = csv.reader(f) next(reader) for el in reader: contexts.append(el[-3].replace('\n', '').lower()) responses.append(el[-2].replace('\n', '').lower()) labels.append(int(el[-1])) data = list(zip(contexts, responses)) data = list(zip(data, labels)) data = {"train": data, "valid": [], "test": []} return data
def read(self, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]: """Read the Ubuntu V2 dataset from csv files. Args: data_path: A path to a folder with dataset csv files. """ data_path = expand_path(data_path) dataset = {'train': None, 'valid': None, 'test': None} train_fname = Path(data_path) / 'train.csv' valid_fname = Path(data_path) / 'valid.csv' test_fname = Path(data_path) / 'test.csv' self.sen2int_vocab = {} self.classes_vocab_train = {} self.classes_vocab_valid = {} self.classes_vocab_test = {} dataset["train"] = self.preprocess_data_train(train_fname) dataset["valid"] = self.preprocess_data_validation(valid_fname) dataset["test"] = self.preprocess_data_validation(test_fname) return dataset
def __init__(self, data_dir: [Path, str]='', *args, dictionary_name: str='dictionary', **kwargs): data_dir = expand_path(data_dir) / dictionary_name alphabet_path = data_dir / 'alphabet.pkl' words_path = data_dir / 'words.pkl' words_trie_path = data_dir / 'words_trie.pkl' if not is_done(data_dir): log.info('Trying to build a dictionary in {}'.format(data_dir)) if data_dir.is_dir(): shutil.rmtree(str(data_dir)) data_dir.mkdir(parents=True) words = self._get_source(data_dir, *args, **kwargs) words = {self._normalize(word) for word in words} alphabet = {c for w in words for c in w} alphabet.remove('⟬') alphabet.remove('⟭') save_pickle(alphabet, alphabet_path) save_pickle(words, words_path) words_trie = defaultdict(set) for word in words: for i in range(len(word)): words_trie[word[:i]].add(word[:i+1]) words_trie[word] = set() words_trie = {k: sorted(v) for k, v in words_trie.items()} save_pickle(words_trie, words_trie_path) mark_done(data_dir) log.info('built') else: log.info('Loading a dictionary from {}'.format(data_dir)) self.alphabet = load_pickle(alphabet_path) self.words_set = load_pickle(words_path) self.words_trie = load_pickle(words_trie_path)
def read(self, data_path: str, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]: """Read the InsuranceQA V1 dataset from files. Args: data_path: A path to a folder with dataset files. """ data_path = expand_path(data_path) dataset = {'train': None, 'valid': None, 'test': None} train_fname = data_path / 'insuranceQA-master/V1/question.train.token_idx.label' valid_fname = data_path / 'insuranceQA-master/V1/question.dev.label.token_idx.pool' test_fname = data_path / 'insuranceQA-master/V1/question.test1.label.token_idx.pool' int2tok_fname = data_path / 'insuranceQA-master/V1/vocabulary' response2ints_fname = data_path / 'insuranceQA-master/V1/answers.label.token_idx' self.int2tok_vocab = self._build_int2tok_vocab(int2tok_fname) self.idxs2cont_vocab = self._build_context2toks_vocab(train_fname, valid_fname, test_fname) self.response2str_vocab = self._build_response2str_vocab(response2ints_fname) dataset["valid"] = self._preprocess_data_valid_test(valid_fname) dataset["train"] = self._preprocess_data_train(train_fname) dataset["test"] = self._preprocess_data_valid_test(test_fname) return dataset
def __init__(self, load_path: Union[str, Path], batch_size: Optional[int] = None, shuffle: Optional[bool] = None, seed: Optional[int] = None, **kwargs) -> None: load_path = str(expand_path(load_path)) logger.info("Connecting to database, path: {}".format(load_path)) try: self.connect = sqlite3.connect(load_path, check_same_thread=False) except sqlite3.OperationalError as e: e.args = e.args + ("Check that DB path exists and is a valid DB file",) raise e try: self.db_name = self.get_db_name() except TypeError as e: e.args = e.args + ( 'Check that DB path was created correctly and is not empty. ' 'Check that a correct dataset_format is passed to the ODQAReader config',) raise e self.doc_ids = self.get_doc_ids() self.doc2index = self.map_doc2idx() self.batch_size = batch_size self.shuffle = shuffle self.random = Random(seed)
def predict_with_model(config_path: [Path, str]) -> List[Optional[List[str]]]: """Returns predictions of morphotagging model given in config :config_path:. Args: config_path: a path to config Returns: a list of morphological analyses for each sentence. Each analysis is either a list of tags or a list of full CONLL-U descriptions. """ config = parse_config(config_path) reader_config = config['dataset_reader'] reader = get_model(reader_config['class_name'])() data_path = expand_path(reader_config.get('data_path', '')) read_params = {k: v for k, v in reader_config.items() if k not in ['class_name', 'data_path']} data: Dict = reader.read(data_path, **read_params) iterator_config = config['dataset_iterator'] iterator: MorphoTaggerDatasetIterator = from_params(iterator_config, data=data) model = build_model(config, load_trained=True) answers = [None] * len(iterator.test) batch_size = config['predict'].get("batch_size", -1) for indexes, (x, _) in iterator.gen_batches( batch_size=batch_size, data_type="test", shuffle=False, return_indexes=True): y = model(x) for i, elem in zip(indexes, y): answers[i] = elem outfile = config['predict'].get("outfile") if outfile is not None: outfile = Path(outfile) if not outfile.exists(): outfile.parent.mkdir(parents=True, exist_ok=True) with open(outfile, "w", encoding="utf8") as fout: for elem in answers: fout.write(elem + "\n") return answers
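# Hedged usage example for predict_with_model (the config path below is hypothetical): it builds
# the pipeline from the config, runs it over the test split and returns one analysis per sentence.
analyses = predict_with_model('path/to/morphotagger_config.json')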
def read(self, data_path: str, catalog: list, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]: """Load data from specific catalog Parameters: data_path: where the dataset is located catalog: names of the specific subcategories Returns: dataset: loaded dataset """ logger.info(f"Ecommerce loader is loaded with catalog {catalog}") if not isinstance(catalog, list): catalog = [catalog] ec_data_global: List[Any] = [] data_path = Path(expand_path(data_path)) if not is_done(data_path): self._download_data(data_path) if data_path.is_dir(): for fname in data_path.rglob("*.txt"): if any(cat in fname.name for cat in catalog): logger.info(f"File {fname.name} is loaded") ec_data_global += self._load_amazon_ecommerce_file(fname) dataset = { 'train': [((item['Title'], [], {}), item) for item in ec_data_global], 'valid': [], 'test': [] } logger.info(f"In total {len(ec_data_global)} items are loaded") return dataset
def load(self) -> None: """Load classifier parameters""" logger.info("Loading faq_logreg_model from {}".format(self.load_path)) self.logreg = load_pickle(expand_path(self.load_path))
def save(self) -> None: """Save classifier parameters""" logger.info("Saving faq_logreg_model to {}".format(self.save_path)) path = expand_path(self.save_path) make_all_dirs(path) save_pickle(self.logreg, path)