def initialize_dictionary(self, namespace: str, unk_num: int, mode: MappingMode):
    """Install the starting mapping for ``namespace`` into this dict.

    Non-padded namespaces start empty.  Padded ones are seeded with the
    padding token at index 0 plus ``unk_num`` OOV placeholder entries, in
    the direction given by ``mode`` (token->index or index->token).
    """
    if mode == MappingMode.token2index:
        non_padded = any(
            namespace_match(pattern, namespace)
            for pattern in self._non_padded_namespaces)
        if non_padded:
            mapping = {}
        else:
            # RandomHashDict handles hashing unknown tokens onto the
            # reserved OOV slots.
            mapping = RandomHashDict(unk_num=unk_num, oov_token=self.oov_token)
            mapping.update({self.padding_token: 0})
            mapping.add_unk_tokens()
        dict.__setitem__(self, namespace, mapping)
    elif mode == MappingMode.index2token:
        non_padded = any(
            namespace_match(pattern, namespace)
            for pattern in self._non_padded_namespaces)
        if non_padded:
            mapping = {}
        else:
            mapping = {0: self.padding_token}
            for i in range(unk_num):
                # Append each synthetic OOV token at the next free index.
                mapping[len(mapping)] = f"@@{self.oov_token}#{str(i)}@@"
        dict.__setitem__(self, namespace, mapping)
def from_files(cls, directory: str) -> 'Vocabulary':
    """
    Loads a ``Vocabulary`` that was serialized using ``save_to_files``.

    Parameters
    ----------
    directory : ``str``
        The directory containing the serialized vocabulary.
    """
    logger.info("Loading token dictionary from %s.", directory)
    padding_file_path = os.path.join(directory, NAMESPACE_PADDING_FILE)
    with codecs.open(padding_file_path, 'r', 'utf-8') as namespace_file:
        non_padded_namespaces = [line.strip() for line in namespace_file]

    vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)

    # Every other .txt file in the directory holds one serialized namespace.
    for namespace_filename in os.listdir(directory):
        if namespace_filename == NAMESPACE_PADDING_FILE:
            continue
        namespace = namespace_filename.replace('.txt', '')
        is_padded = not any(
            namespace_match(pattern, namespace)
            for pattern in non_padded_namespaces)
        filename = os.path.join(directory, namespace_filename)
        vocab.set_from_file(filename, is_padded, namespace=namespace)
    return vocab
def __missing__(self, key: str):
    """Lazily create, cache, and return the value for an unseen namespace."""
    matched = any(
        namespace_match(pattern, key)
        for pattern in self._non_padded_namespaces)
    value = self._non_padded_function() if matched else self._padded_function()
    # Cache via dict.__setitem__ to bypass any overridden __setitem__.
    dict.__setitem__(self, key, value)
    return value
def __missing__(self, key: str):
    """On first access of ``key``, build its default value and memoize it."""
    if any(namespace_match(pattern, key)
           for pattern in self._non_padded_namespaces):
        fresh_value = self._non_padded_function()
    else:
        fresh_value = self._padded_function()
    # Store directly through dict to avoid recursing into __setitem__.
    dict.__setitem__(self, key, fresh_value)
    return fresh_value
def from_files(cls, directory: str) -> 'Vocabulary':
    """
    Loads a ``Vocabulary`` that was serialized using ``save_to_files``.

    Parameters
    ----------
    directory : ``str``
        The directory containing the serialized vocabulary.
    """
    logger.info("Loading token dictionary from %s.", directory)
    with codecs.open(os.path.join(directory, NAMESPACE_PADDING_FILE),
                     'r', 'utf-8') as namespace_file:
        non_padded_namespaces = [entry.strip() for entry in namespace_file]

    vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)

    # Load each remaining file in the directory as one namespace.
    for entry_name in os.listdir(directory):
        if entry_name == NAMESPACE_PADDING_FILE:
            continue
        namespace = entry_name.replace('.txt', '')
        padded = not any(namespace_match(pattern, namespace)
                         for pattern in non_padded_namespaces)
        vocab.set_from_file(os.path.join(directory, entry_name),
                            padded,
                            namespace=namespace)
    return vocab
def from_files(
    cls,
    directory: str,
    padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
    oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
) -> "Vocabulary":
    """
    Loads a `Vocabulary` that was serialized either using `save_to_files` or inside
    a model archive file.

    # Parameters

    directory : `str`
        The directory or archive file containing the serialized vocabulary.
    """
    logger.info("Loading token dictionary from %s.", directory)
    # Callers may pass None explicitly; fall back to the defaults.
    if padding_token is None:
        padding_token = DEFAULT_PADDING_TOKEN
    if oov_token is None:
        oov_token = DEFAULT_OOV_TOKEN

    if not os.path.isdir(directory):
        base_directory = cached_path(directory, extract_archive=True)
        # For convenience we'll check for a 'vocabulary' subdirectory of the archive.
        # That way you can use model archives directly.
        vocab_subdir = os.path.join(base_directory, "vocabulary")
        if os.path.isdir(vocab_subdir):
            directory = vocab_subdir
        elif os.path.isdir(base_directory):
            directory = base_directory
        else:
            raise ConfigurationError(f"{directory} is neither a directory nor an archive")

    # We use a lock file to avoid race conditions where multiple processes
    # might be reading/writing from/to the same vocab files at once.
    with FileLock(os.path.join(directory, ".lock")):
        padding_path = os.path.join(directory, NAMESPACE_PADDING_FILE)
        with codecs.open(padding_path, "r", "utf-8") as namespace_file:
            non_padded_namespaces = [entry.strip() for entry in namespace_file]

        vocab = cls(
            non_padded_namespaces=non_padded_namespaces,
            padding_token=padding_token,
            oov_token=oov_token,
        )

        # Check every file in the directory.
        for namespace_filename in os.listdir(directory):
            # Skip the padding manifest and hidden files (e.g. the lock file).
            if namespace_filename == NAMESPACE_PADDING_FILE:
                continue
            if namespace_filename.startswith("."):
                continue
            namespace = namespace_filename.replace(".txt", "")
            is_padded = not any(
                namespace_match(pattern, namespace)
                for pattern in non_padded_namespaces
            )
            filename = os.path.join(directory, namespace_filename)
            vocab.set_from_file(filename, is_padded, namespace=namespace, oov_token=oov_token)

    return vocab
def from_files(
    cls,
    directory: str,
    padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
    oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
) -> "Vocabulary":
    """
    Loads a `Vocabulary` that was serialized using `save_to_files`.

    # Parameters

    directory : `str`
        The directory containing the serialized vocabulary.
    """
    logger.info("Loading token dictionary from %s.", directory)
    # Treat an explicit None the same as "not provided".
    padding_token = DEFAULT_PADDING_TOKEN if padding_token is None else padding_token
    oov_token = DEFAULT_OOV_TOKEN if oov_token is None else oov_token

    # We use a lock file to avoid race conditions where multiple processes
    # might be reading/writing from/to the same vocab files at once.
    with FileLock(os.path.join(directory, ".lock")):
        with codecs.open(os.path.join(directory, NAMESPACE_PADDING_FILE),
                         "r", "utf-8") as namespace_file:
            non_padded_namespaces = [line.strip() for line in namespace_file]

        vocab = cls(
            non_padded_namespaces=non_padded_namespaces,
            padding_token=padding_token,
            oov_token=oov_token,
        )

        # Check every file in the directory.
        for entry in os.listdir(directory):
            # The padding manifest and hidden files (like .lock) are not namespaces.
            if entry == NAMESPACE_PADDING_FILE or entry.startswith("."):
                continue
            namespace = entry.replace(".txt", "")
            is_padded = not any(
                namespace_match(pattern, namespace)
                for pattern in non_padded_namespaces
            )
            vocab.set_from_file(
                os.path.join(directory, entry),
                is_padded,
                namespace=namespace,
                oov_token=oov_token,
            )
    return vocab
def load_vocab_from_directory(directory: str,
                              padding_token: str = "[PAD]",
                              oov_token: str = "[UNK]") -> Vocabulary:
    """
    Load a pre-trained vocabulary from a directory, overriding the OOV and
    padding symbols (the stock loader keeps its own defaults, which breaks
    OOV lookups for BERT-style vocabularies).

    Args:
        directory (str)
        padding_token (str): default padding token symbol ("[PAD]" for BERT)
        oov_token (str): default OOV token symbol ("[UNK]" for BERT)

    Returns:
        Vocabulary
    """
    NAMESPACE_PADDING_FILE = 'non_padded_namespaces.txt'
    print("Loading token dictionary from", directory)
    with codecs.open(os.path.join(directory, NAMESPACE_PADDING_FILE),
                     'r', 'utf-8') as namespace_file:
        non_padded_namespaces = [line.strip() for line in namespace_file]

    vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)

    # Load every visible namespace file except the padding manifest.
    for entry in os.listdir(directory):
        if entry == NAMESPACE_PADDING_FILE or entry.startswith("."):
            continue
        namespace = entry.replace('.txt', '')
        is_padded = not any(namespace_match(pattern, namespace)
                            for pattern in non_padded_namespaces)
        vocab.set_from_file(os.path.join(directory, entry),
                            is_padded,
                            oov_token=oov_token,
                            namespace=namespace)
    # Override the padding symbol after loading.
    vocab._padding_token = padding_token
    return vocab
def from_files(
    cls,
    directory: str,
    padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
    oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
) -> "Vocabulary":
    """
    Loads a ``Vocabulary`` that was serialized using ``save_to_files``.

    Parameters
    ----------
    directory : ``str``
        The directory containing the serialized vocabulary.
    """
    logger.info("Loading token dictionary from %s.", directory)
    # Normalize explicit None back to the module defaults.
    if padding_token is None:
        padding_token = DEFAULT_PADDING_TOKEN
    if oov_token is None:
        oov_token = DEFAULT_OOV_TOKEN

    manifest = os.path.join(directory, NAMESPACE_PADDING_FILE)
    with codecs.open(manifest, "r", "utf-8") as namespace_file:
        non_padded_namespaces = [line.strip() for line in namespace_file]

    vocab = cls(
        non_padded_namespaces=non_padded_namespaces,
        padding_token=padding_token,
        oov_token=oov_token,
    )

    # Check every file in the directory.
    for namespace_filename in os.listdir(directory):
        if namespace_filename == NAMESPACE_PADDING_FILE:
            continue
        if namespace_filename.startswith("."):
            continue
        namespace = namespace_filename.replace(".txt", "")
        is_padded = not any(
            namespace_match(pattern, namespace)
            for pattern in non_padded_namespaces)
        filename = os.path.join(directory, namespace_filename)
        vocab.set_from_file(filename, is_padded, namespace=namespace, oov_token=oov_token)
    return vocab
def set_vocab_from_filename(vocab: Vocabulary, namespace_filename: str,
                            load_dir: str, non_padded_namespaces: str):
    """Populate one namespace of ``vocab`` from a serialized vocab file.

    Arguments:
        vocab: The vocabulary
        namespace_filename: The file containing all the namespaces to be loaded
        load_dir: The directory to load the vocab from
        non_padded_namespaces: The namespaces that are not padded (like labels etc)

    Returns:
        ``Vocabulary``: The loaded vocabulary
    """
    # The namespace name is the filename minus its .txt suffix.
    namespace = namespace_filename.replace('.txt', '')
    is_padded = not any(
        namespace_match(pattern, namespace)
        for pattern in non_padded_namespaces)
    vocab.set_from_file(os.path.join(load_dir, namespace_filename),
                        is_padded,
                        namespace=namespace)
    return vocab
def test_namespace_match(self):
    """Wildcard patterns match suffixes; literal patterns match exactly."""
    matching_pairs = [
        ("*tags", "tags"),
        ("*tags", "passage_tags"),
        ("*tags", "question_tags"),
        ("tokens", "tokens"),
    ]
    for pattern, namespace in matching_pairs:
        assert util.namespace_match(pattern, namespace)
    # A literal pattern must not match a mere suffix.
    assert not util.namespace_match("tokens", "stemmed_tokens")
def test_namespace_match(self):
    """Check wildcard suffix matching and exact literal matching."""
    # "*tags" should match itself and any *_tags namespace.
    for namespace in ("tags", "passage_tags", "question_tags"):
        assert util.namespace_match("*tags", namespace)
    # Literal patterns match only the identical namespace.
    assert util.namespace_match("tokens", "tokens")
    assert not util.namespace_match("tokens", "stemmed_tokens")
def _extend(self,
            counter: Dict[str, Dict[str, int]] = None,
            min_count: Dict[str, int] = None,
            max_vocab_size: Union[int, Dict[str, int]] = None,
            non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
            pretrained_files: Optional[Dict[str, str]] = None,
            only_include_pretrained_words: bool = False,
            tokens_to_add: Dict[str, List[str]] = None,
            min_pretrained_embeddings: Dict[str, int] = None) -> None:
    """
    Extend an already generated vocabulary in place.

    Takes the same parameters as the Vocabulary initializer; the existing
    token2index and index2token mappings are retained.  Returns None.
    """
    # A scalar cap applies to every namespace.
    if not isinstance(max_vocab_size, dict):
        scalar_cap = max_vocab_size
        max_vocab_size = defaultdict(lambda: scalar_cap)  # type: ignore
    min_count = min_count or {}
    pretrained_files = pretrained_files or {}
    min_pretrained_embeddings = min_pretrained_embeddings or {}
    non_padded_namespaces = set(non_padded_namespaces)
    counter = counter or {}
    tokens_to_add = tokens_to_add or {}
    self._retained_counter = counter

    # Make sure vocabulary extension is safe: a namespace present on both
    # sides must agree on whether it is padded.
    shared = {*self._token_to_index} & {*counter, *tokens_to_add}
    for namespace in shared:
        original_padded = not any(namespace_match(pattern, namespace)
                                  for pattern in self._non_padded_namespaces)
        extension_padded = not any(namespace_match(pattern, namespace)
                                   for pattern in non_padded_namespaces)
        if original_padded != extension_padded:
            raise ConfigurationError("Common namespace {} has conflicting ".format(namespace) +
                                     "setting of padded = True/False. " +
                                     "Hence extension cannot be done.")

    # Register new non-padded namespaces before any tokens are added.
    self._token_to_index.add_non_padded_namespaces(non_padded_namespaces)
    self._index_to_token.add_non_padded_namespaces(non_padded_namespaces)
    self._non_padded_namespaces.update(non_padded_namespaces)

    for namespace in counter:
        pretrained_set = None
        if namespace in pretrained_files:
            pretrained_list = _read_pretrained_tokens(pretrained_files[namespace])
            min_embeddings = min_pretrained_embeddings.get(namespace, 0)
            if min_embeddings > 0:
                # Force the first min_embeddings pretrained tokens in.
                existing = tokens_to_add.get(namespace, [])
                tokens_to_add[namespace] = existing + pretrained_list[:min_embeddings]
            pretrained_set = set(pretrained_list)
        token_counts = sorted(counter[namespace].items(),
                              key=lambda item: item[1],
                              reverse=True)
        try:
            max_vocab = max_vocab_size[namespace]
        except KeyError:
            max_vocab = None
        if max_vocab:
            token_counts = token_counts[:max_vocab]
        threshold = min_count.get(namespace, 1)
        for token, count in token_counts:
            if pretrained_set is not None:
                if only_include_pretrained_words:
                    if token in pretrained_set and count >= threshold:
                        self.add_token_to_namespace(token, namespace)
                elif token in pretrained_set or count >= threshold:
                    self.add_token_to_namespace(token, namespace)
            elif count >= threshold:
                self.add_token_to_namespace(token, namespace)

    for namespace, tokens in tokens_to_add.items():
        for token in tokens:
            self.add_token_to_namespace(token, namespace)
def _extend(self,
            counter: Dict[str, Dict[str, int]] = None,
            min_count: Dict[str, int] = None,
            max_vocab_size: Union[int, Dict[str, int]] = None,
            non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
            pretrained_files: Optional[Dict[str, str]] = None,
            only_include_pretrained_words: bool = False,
            tokens_to_add: Dict[str, List[str]] = None,
            min_pretrained_embeddings: Dict[str, int] = None) -> None:
    """
    In-place extension of an already generated vocabulary.

    Accepts the same parameters as the Vocabulary initializer; existing
    token2index / index2token mappings are kept.  Returns None.
    """
    if not isinstance(max_vocab_size, dict):
        # Broadcast a single cap value over all namespaces.
        uniform_cap = max_vocab_size
        max_vocab_size = defaultdict(lambda: uniform_cap)  # type: ignore
    min_count = min_count or {}
    pretrained_files = pretrained_files or {}
    min_pretrained_embeddings = min_pretrained_embeddings or {}
    non_padded_namespaces = set(non_padded_namespaces)
    counter = counter or {}
    tokens_to_add = tokens_to_add or {}
    self._retained_counter = counter

    # Safety check: namespaces known to both the current vocabulary and the
    # extension must not flip between padded and non-padded.
    for namespace in {*self._token_to_index} & {*counter, *tokens_to_add}:
        was_padded = not any(namespace_match(pattern, namespace)
                             for pattern in self._non_padded_namespaces)
        will_be_padded = not any(namespace_match(pattern, namespace)
                                 for pattern in non_padded_namespaces)
        if was_padded != will_be_padded:
            raise ConfigurationError("Common namespace {} has conflicting ".format(namespace) +
                                     "setting of padded = True/False. " +
                                     "Hence extension cannot be done.")

    # Add new non-padded namespaces for extension.
    self._token_to_index.add_non_padded_namespaces(non_padded_namespaces)
    self._index_to_token.add_non_padded_namespaces(non_padded_namespaces)
    self._non_padded_namespaces.update(non_padded_namespaces)

    for namespace in counter:
        if namespace in pretrained_files:
            pretrained_list = _read_pretrained_tokens(pretrained_files[namespace])
            min_embeddings = min_pretrained_embeddings.get(namespace, 0)
            if min_embeddings > 0:
                # Guarantee the leading pretrained tokens make it in.
                tokens_to_add[namespace] = (tokens_to_add.get(namespace, [])
                                            + pretrained_list[:min_embeddings])
            pretrained_set = set(pretrained_list)
        else:
            pretrained_set = None

        ranked = sorted(counter[namespace].items(), key=lambda kv: kv[1], reverse=True)
        try:
            cap = max_vocab_size[namespace]
        except KeyError:
            cap = None
        if cap:
            ranked = ranked[:cap]

        floor = min_count.get(namespace, 1)
        for token, count in ranked:
            if pretrained_set is None:
                if count >= floor:
                    self.add_token_to_namespace(token, namespace)
            elif only_include_pretrained_words:
                if token in pretrained_set and count >= floor:
                    self.add_token_to_namespace(token, namespace)
            elif token in pretrained_set or count >= floor:
                self.add_token_to_namespace(token, namespace)

    for namespace, tokens in tokens_to_add.items():
        for token in tokens:
            self.add_token_to_namespace(token, namespace)
def debug_vocab(parameter_filename: str,
                serialization_dir: str,
                overrides: str = "",
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    A wrapper around :func:`train_model` which loads the params from a file.

    Parameters
    ----------
    parameter_filename : ``str``
        A json parameter file specifying an AllenNLP experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs. We just pass this
        along to :func:`train_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input
        parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.
        We just pass this along to :func:`train_model`.
    recover : ``bool`, optional (default=False)
        If ``True``, we will try to recover a training run from an existing
        serialization directory. This is only intended for use when something
        actually crashed during the middle of a run. For continuing training
        a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it
        already exists.
    """
    # Load the experiment config from a file and build datasets/vocab/model.
    params = Params.from_file(parameter_filename, overrides)
    prepare_global_logging(serialization_dir, file_friendly_logging)
    check_for_gpu(params.get('trainer').get('cuda_device', -1))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))
    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")
    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance
         for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    vocab = model.vocab

    # Pull out the internals we want to inspect.
    vocab_namespace_dict = vocab._token_to_index
    vocab_oov_token = vocab._oov_token
    vocab_non_padded_namespaces = vocab._non_padded_namespaces  # this is a set
    vocab_tokens_dict = vocab_namespace_dict['tokens']
    vocab_labels_dict = vocab_namespace_dict['labels']

    print()
    print("Vocab's OOV token: " + vocab_oov_token)
    print("Non-padded namespaces in vocab: " + str(list(vocab_non_padded_namespaces)))
    print()

    # Report on the 'tokens' namespace.
    print("Number of words in vocab's tokens dict: " + str(len(vocab_tokens_dict)))
    is_padded = not any(
        namespace_match(pattern, 'tokens')
        for pattern in vocab_non_padded_namespaces)
    print("tokens will return True for is_padded: " + str(is_padded))
    print("Vocab's OOV token is in its tokens dict (should be True): " +
          str(vocab_oov_token in vocab_tokens_dict))
    print()

    # Report on the 'labels' namespace.
    print("Number of words in vocab's labels dict: " + str(len(vocab_labels_dict)))
    is_padded = not any(
        namespace_match(pattern, 'labels')
        for pattern in vocab_non_padded_namespaces)
    print("labels will return True for is_padded: " + str(is_padded))
    print("Vocab's OOV token is in its labels dict (should be False): " +
          str(vocab_oov_token in vocab_labels_dict))
def _extend(self,
            counter=None,
            min_count=None,
            max_vocab_size=None,
            non_padded_namespaces=DEFAULT_NON_PADDED_NAMESPACES,
            pretrained_files=None,
            only_include_pretrained_words=False,
            tokens_to_add=None):
    u"""
    This method can be used for extending already generated vocabulary.
    It takes same parameters as Vocabulary initializer. The token2index
    and indextotoken mappings of calling vocabulary will be retained.
    It is an inplace operation so None will be returned.
    """
    if not isinstance(max_vocab_size, dict):
        # A scalar (or None) cap applies to every namespace.
        int_max_vocab_size = max_vocab_size
        max_vocab_size = defaultdict(
            lambda: int_max_vocab_size)  # type: ignore
    min_count = min_count or {}
    pretrained_files = pretrained_files or {}
    non_padded_namespaces = set(non_padded_namespaces)
    counter = counter or {}
    tokens_to_add = tokens_to_add or {}
    self._retained_counter = counter

    # Make sure vocabulary extension is safe: a namespace present both in
    # the current vocabulary and in the extension must agree on padding.
    current_namespaces = set(list(self._token_to_index))
    extension_namespaces = set(list(counter) + list(tokens_to_add))
    for namespace in current_namespaces & extension_namespaces:
        original_padded = not any(
            namespace_match(pattern, namespace)
            for pattern in self._non_padded_namespaces)
        extension_padded = not any(
            namespace_match(pattern, namespace)
            for pattern in non_padded_namespaces)
        if original_padded != extension_padded:
            raise ConfigurationError(
                u"Common namespace {} has conflicting ".format(namespace) +
                u"setting of padded = True/False. " +
                u"Hence extension cannot be done.")

    # Add new non-padded namespaces for extension.
    self._token_to_index.add_non_padded_namespaces(non_padded_namespaces)
    self._index_to_token.add_non_padded_namespaces(non_padded_namespaces)
    self._non_padded_namespaces.update(non_padded_namespaces)

    for namespace in counter:
        if namespace in pretrained_files:
            pretrained_list = _read_pretrained_tokens(
                pretrained_files[namespace])
            # Use a set for O(1) membership tests in the loop below; the
            # list version was O(len(pretrained_list)) per token.
            pretrained_set = set(pretrained_list)
        else:
            pretrained_set = None
        token_counts = list(counter[namespace].items())
        token_counts.sort(key=lambda x: x[1], reverse=True)
        try:
            # BUG FIX: a user-supplied plain dict may omit a namespace;
            # previously this raised KeyError.  Treat a missing entry as
            # "no cap", matching the behavior of the defaultdict path.
            max_vocab = max_vocab_size[namespace]
        except KeyError:
            max_vocab = None
        if max_vocab:
            token_counts = token_counts[:max_vocab]
        for token, count in token_counts:
            if pretrained_set is not None:
                if only_include_pretrained_words:
                    if token in pretrained_set and count >= min_count.get(
                            namespace, 1):
                        self.add_token_to_namespace(token, namespace)
                elif token in pretrained_set or count >= min_count.get(
                        namespace, 1):
                    self.add_token_to_namespace(token, namespace)
            elif count >= min_count.get(namespace, 1):
                self.add_token_to_namespace(token, namespace)

    for namespace, tokens in list(tokens_to_add.items()):
        for token in tokens:
            self.add_token_to_namespace(token, namespace)