def build_vocabs(self, dataset, logger=None, transformer=False):
    """Build the relation vocab (and a counted token vocab unless a transformer is used).

    Iterating ``dataset`` once is what populates the vocabs — presumably the
    dataset's transforms feed tokens/relations into ``self.vocabs`` as each
    sample is visited; TODO confirm against the dataset pipeline.

    Args:
        dataset: Training dataset, iterated exactly once.
        logger: Optional logger passed to the vocab summary.
        transformer: When truthy, no token vocab is built (subword ids come
            from the transformer tokenizer instead).
    """
    # Reuse an existing relation vocab if one was already registered.
    rel_vocab = self.vocabs.get('rel', None)
    if rel_vocab is None:
        rel_vocab = Vocab(unk_token=None, pad_token=self.config.get('pad_rel', None))
        self.vocabs.put(rel=rel_vocab)
    timer = CountdownTimer(len(dataset))
    if transformer:
        token_vocab = None
    else:
        # VocabCounter records frequencies so rare tokens can be trimmed below.
        self.vocabs.token = token_vocab = VocabCounter(
            unk_token=self.config.get('unk', UNK))
    for i, sample in enumerate(dataset):
        timer.log('Building vocab [blink][yellow]...[/yellow][/blink]',
                  ratio_percentage=True)
    # Drop tokens rarer than the configured threshold, if any.
    min_freq = self.config.get('min_freq', None)
    if min_freq:
        token_vocab.trim(min_freq)
    rel_vocab.set_unk_as_safe_unk()  # Some relation in dev set is OOV
    self.vocabs.lock()
    self.vocabs.summary(logger=logger)
    # Record sizes/indices in config for model construction.
    if token_vocab:
        self.config.n_words = len(self.vocabs['token'])
    self.config.n_rels = len(self.vocabs['rel'])
    if token_vocab:
        self.config.pad_index = self.vocabs['token'].pad_idx
        self.config.unk_index = self.vocabs['token'].unk_idx
def fit(self, trn_path: str, **kwargs) -> int:
    """Build ``self.vocab`` from the training corpus and count its samples.

    Args:
        trn_path: Path to the training file.
        **kwargs: Unused; kept for interface compatibility.

    Returns:
        The number of samples seen.
    """
    self.vocab = Vocab()
    count = 0
    for x, _ in self.file_to_inputs(trn_path):
        self.vocab.update(x)
        count += 1
    return count
def _load(path, vocab, normalize=False) -> Tuple[Vocab, Union[np.ndarray, None]]:
    """Load a word2vec file and align its vectors with ``vocab``.

    Every pretrained word is added to ``vocab``; the returned matrix has one
    row per vocab entry, filled from the pretrained file (falling back to the
    lowercased word), with zeros for words not in the file.

    Fix: the original saved the global NumPy RNG state, seeded it, drew a
    ``bias`` and computed a ``scale`` that were never used (only referenced by
    commented-out code), then restored the state — dead work with no
    observable effect, removed here.

    Args:
        path: Path to the word2vec file; when falsy, only the vocab is returned.
        vocab: Vocabulary to extend; a fresh one is created when falsy.
        normalize: Unused; kept for interface compatibility.

    Returns:
        A tuple of the (possibly newly created) vocab and the embedding
        matrix, or ``(vocab, None)`` when no path is given.
    """
    if not vocab:
        vocab = Vocab()
    if not path:
        return vocab, None
    assert vocab.unk_idx is not None
    word2vec, dim = load_word2vec(path)
    # Extend the vocab with every pretrained word first so the matrix covers them.
    for word in word2vec:
        vocab.get_idx(word)
    pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        if vec is None:
            # Retry with the lowercased form before giving up.
            vec = word2vec.get(word.lower(), None)
        if vec is None:
            # Out-of-pretrained words get zero vectors.
            vec = np.zeros([dim], dtype=np.float32)
        pret_embs[idx] = vec
    # noinspection PyTypeChecker
    return vocab, pret_embs
def build_vocabs(self, dataset, logger):
    """Create tag and token vocabs, fill them by one pass over ``dataset``,
    then lock and summarize them."""
    self.vocabs.tag = Vocab(unk_token=None, pad_token=None)
    self.vocabs[self.config.token_key] = Vocab()
    # Iterating the dataset lets its transforms populate the vocabs.
    for _ in dataset:
        pass
    self.vocabs.lock()
    self.vocabs.summary(logger)
def load_vocabs(transform, save_dir, filename='vocabs.json'):
    """Restore vocab attributes on ``transform`` from a saved JSON file.

    Args:
        transform: The object receiving one attribute per saved vocab.
        save_dir: Directory containing the vocab file.
        filename: Name of the JSON file holding the serialized vocabs.
    """
    saved = SerializableDict()
    saved.load_json(os.path.join(save_dir, filename))
    for name, state in saved.items():
        restored = Vocab()
        restored.copy_from(state)
        setattr(transform, name, restored)
def __init__(self, path: str = None, vocab: Vocab = None, normalize: bool = False, load_all=True,
             mask_zero=True, trainable=False, name=None, dtype=None, dynamic=False, **kwargs):
    """A Keras layer wrapping pretrained embeddings loaded from ``path``.

    Args:
        path: Path to the pretrained embedding file.
        vocab: Vocabulary to align the embeddings with; temporarily unlocked
            when ``load_all`` so pretrained words can be added.
        normalize: Forwarded to ``self._load``.
        load_all: Whether to load every pretrained word into the vocab.
        mask_zero: Whether index 0 acts as a padding mask (Keras masking).
        trainable: Whether the embedding matrix is updated during training.
        name: Layer name.
        dtype: Layer dtype.
        dynamic: Whether the layer runs eagerly.
        **kwargs: Extra Keras layer arguments.
    """
    super().__init__(trainable, name, dtype, dynamic, **kwargs)
    if load_all and vocab and vocab.locked:
        vocab.unlock()
    self.vocab, self.array_np = self._load(path, vocab, normalize)
    # Freeze the vocab so indices stay stable once the matrix is built.
    self.vocab.lock()
    # NOTE(review): self.dim presumably comes from a property defined on the
    # class — confirm it reflects the loaded embedding dimensionality.
    self.array_ks = tf.keras.layers.Embedding(input_dim=len(self.vocab), output_dim=self.dim,
                                              trainable=trainable,
                                              embeddings_initializer=tf.keras.initializers.Constant(self.array_np),
                                              mask_zero=mask_zero)
    self.mask_zero = mask_zero
    self.supports_masking = mask_zero
def fit(self, trn_path: str, **kwargs) -> int:
    """Build the tag vocabulary from gold training data.

    Args:
        trn_path: Path to the training file.
        **kwargs: Unused; kept for interface compatibility.

    Returns:
        The number of training samples.
    """
    self.tag_vocab = Vocab(unk_token=None)
    seen = 0
    for _, tags in self.file_to_inputs(trn_path, gold=True):
        seen += 1
        self.tag_vocab.update(tags)
    return seen
def fit(self, trn_path: str, **kwargs):
    """Build word and tag vocabularies from the training file."""
    self.word_vocab = Vocab()
    self.tag_vocab = Vocab(pad_token=None, unk_token=None)
    for ngrams, tags in self.file_to_samples(trn_path):
        self.tag_vocab.update(tags)
        for words in ngrams:
            self.word_vocab.update(words)
def transform(self, vocabs: VocabDict = None, **kwargs) -> Optional[Callable]:
    """Register a vocab for ``self.field`` when absent, then delegate to super.

    Args:
        vocabs: The shared vocab registry; must not be ``None``.
        **kwargs: Forwarded to the parent implementation.
    """
    assert vocabs is not None
    needs_vocab = self.field not in vocabs
    if needs_vocab:
        vocabs[self.field] = Vocab(pad_token=self.pad, unk_token=self.unk)
    return super().transform(**kwargs)
def build_vocabs(self, dataset, logger=None, transformer=None):
    """Create the second-order relation vocab (shared with ``rel`` in joint
    mode) and delegate the rest of vocab building to the parent class."""
    rel_2nd = Vocab(pad_token=self.config.pad_rel, unk_token=self.config.pad_rel)
    self.vocabs['rel_2nd'] = rel_2nd
    if self.config.joint:
        # Joint mode uses a single relation vocab for both orders.
        self.vocabs['rel'] = rel_2nd
    super().build_vocabs(dataset, logger, transformer)
    self.config.n_rels_2nd = len(rel_2nd)
def __init__(self, filepath: str = None, vocab: Vocab = None, expand_vocab=True, lowercase=False,
             input_dim=None, output_dim=None, unk=None, normalize=False,
             embeddings_initializer='VarianceScaling', embeddings_regularizer=None,
             activity_regularizer=None, embeddings_constraint=None, mask_zero=True,
             input_length=None, name=None, **kwargs):
    """Ensure a vocab exists, remember it on the instance, and forward every
    argument unchanged to the parent constructor."""
    if vocab is None:
        vocab = Vocab()
    self.vocab = vocab
    super().__init__(filepath, vocab, expand_vocab, lowercase, input_dim, output_dim,
                     unk, normalize, embeddings_initializer, embeddings_regularizer,
                     activity_regularizer, embeddings_constraint, mask_zero,
                     input_length, name, **kwargs)
def build_vocabs(self, trn, logger, **kwargs):
    """Build pos/rel/lemma/feat vocabularies with one pass over ``trn``."""
    for field in ('pos', 'rel', 'lemma', 'feat'):
        self.vocabs[field] = Vocab(unk_token=None, pad_token=None)
    timer = CountdownTimer(len(trn))
    longest = 0
    for sample in trn:
        longest = max(longest, len(sample['token']))
        timer.log(
            f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {longest})'
        )
    # Every vocab treats OOV entries as a safe unknown at prediction time.
    for v in self.vocabs.values():
        v.set_unk_as_safe_unk()
    self.vocabs.lock()
    self.vocabs.summary(logger)
def __init__(self, filepath: str = None, vocab: Vocab = None, expand_vocab=True, lowercase=True,
             input_dim=None, output_dim=None, unk=None, normalize=False,
             embeddings_initializer='VarianceScaling', embeddings_regularizer=None,
             activity_regularizer=None, embeddings_constraint=None, mask_zero=True,
             input_length=None, name=None, **kwargs):
    """A Keras embedding layer whose matrix is initialized from a word2vec file.

    Fix: the original dereferenced ``vocab.safe_unk_token`` (in the ``unk``
    replacement below) before the ``vocab is None`` guard, crashing with an
    ``AttributeError`` whenever ``vocab`` was omitted together with ``unk``.
    The guard now runs first.

    Args:
        filepath: Path to the pretrained word2vec file.
        vocab: Word vocabulary; built from the pretrained words when ``None``.
        expand_vocab: Add every pretrained word to a mutable vocab.
        lowercase: Also try the lowercased form when looking up vectors.
        input_dim: Expected vocab size (validated, then recomputed).
        output_dim: Expected embedding size (validated against the file).
        unk: Pretrained token to re-map onto the vocab's unknown token.
        normalize: Divide the matrix by its standard deviation.
        embeddings_initializer: Initializer for words missing from the file.
        embeddings_regularizer, activity_regularizer, embeddings_constraint,
        mask_zero, input_length, name, **kwargs: Standard Keras Embedding args.
    """
    filepath = get_resource(filepath)
    word2vec, _output_dim = load_word2vec(filepath)
    if output_dim:
        assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
    output_dim = _output_dim
    # Create the vocab BEFORE it is dereferenced for safe_unk_token below.
    if vocab is None:
        vocab = Vocab()
        vocab.update(word2vec.keys())
    # if the `unk` token exists in the pretrained,
    # then replace it with a self-defined one, usually the one in word vocab
    if unk and unk in word2vec:
        word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
    if expand_vocab and vocab.mutable:
        for word in word2vec:
            vocab.get_idx(word.lower() if lowercase else word)
    if input_dim:
        assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
    input_dim = len(vocab)
    # init matrix: random-init every row, then overwrite with pretrained vectors.
    self._embeddings_initializer = embeddings_initializer
    embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
    # Build the initial matrix on CPU to avoid device-memory spikes.
    with tf.device('cpu:0'):
        pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
    # insert to pret_embs
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        # Retry lower case
        if vec is None and lowercase:
            vec = word2vec.get(word.lower(), None)
        if vec is not None:
            pret_embs[idx] = vec
    if normalize:
        pret_embs /= np.std(pret_embs)
    if not name:
        name = os.path.splitext(os.path.basename(filepath))[0]
    super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs),
                     embeddings_regularizer, activity_regularizer, embeddings_constraint,
                     mask_zero, input_length, name=name, **kwargs)
    self.filepath = filepath
    self.expand_vocab = expand_vocab
    self.lowercase = lowercase
def build_vocabs(self, dataset, logger, **kwargs):
    """Build rel/pos/label vocabs; labels come from walking every HPSG tree."""
    self.vocabs.rel = Vocab(pad_token=None, unk_token=None)
    self.vocabs.pos = Vocab(pad_token=None, unk_token=None)
    label_vocab = Vocab(pad_token='', unk_token=None)
    self.vocabs.label = label_vocab
    label_vocab.add(trees.Sub_Head)
    for sample in dataset:
        # Depth-first traversal collecting labels of internal nodes.
        stack = [sample['hpsg']]
        while stack:
            node = stack.pop()
            if isinstance(node, trees.InternalParseNode):
                label_vocab.add('\t'.join(node.label))
                stack.extend(reversed(node.children))
    self.vocabs['rel'].set_unk_as_safe_unk()
    label_vocab.set_unk_as_safe_unk()
    self.vocabs.lock()
    self.vocabs.summary(logger)
def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]:
    """Build a ``ToChar`` transform backed by a (possibly new) char vocab."""
    if isinstance(self.embed, Embedding):
        self.embed.transform(vocabs=vocabs)
    name = self.vocab_name
    if name not in vocabs:
        vocabs[name] = Vocab()
    return ToChar(self.field, name, min_word_length=self.min_word_length,
                  pad=vocabs[name].safe_pad_token)
class WindowTokenTransform(TSVTaggingTransform):
    """Tagging transform that represents each token by a fixed window of
    surrounding tokens (radius taken from ``self.config.window_radius``)."""

    def fit(self, trn_path: str, **kwargs):
        """Build word and tag vocabularies from the training file."""
        self.word_vocab = Vocab()
        self.tag_vocab = Vocab(pad_token=None, unk_token=None)
        for ngrams, tags in self.file_to_samples(trn_path):
            for words in ngrams:
                self.word_vocab.update(words)
            self.tag_vocab.update(tags)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the tf.data output signature: each sample is a
        [seq_len, window_size] string matrix plus a [seq_len] tag vector."""
        window_radius = self.config.window_radius
        window_size = 2 * window_radius + 1
        types = tf.string, tf.string
        shapes = [None, window_size], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        """Yield ``(ngrams, tags)`` pairs where each token is expanded into its
        surrounding window, with synthetic bos/eos markers past the edges.

        Args:
            inputs: Sentences, optionally paired with gold tags.
            gold: Whether ``inputs`` items are ``(words, tags)`` tuples.
        """
        window_radius = self.config.window_radius
        for t in inputs:
            if gold:
                words, tags = t
            else:
                # Prediction mode: fabricate dummy tags from the padding value.
                words, tags = t, [self.padding_values[-1]] * len(t)
            ngrams = []
            for i, word in enumerate(words):
                features = []
                # NOTE(review): this inner `t` shadows the outer loop variable
                # `t`; harmless because the outer `t` is not used afterwards.
                for t in range(-window_radius, window_radius + 1):
                    index = i + t
                    if index < 0:
                        # Before the sentence: 'bos-1', 'bos-2', ... (index is negative).
                        feature = 'bos{}'.format(index)
                    elif index >= len(words):
                        # Past the sentence: 'eos+1', 'eos+2', ...
                        feature = 'eos+{}'.format(index - len(words) + 1)
                    else:
                        feature = words[index]
                    features.append(feature)
                ngrams.append(features)
            yield ngrams, tags

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        """Recover the original words from window batches by taking the center
        column of each window and mapping indices back to tokens."""
        for xs in X:
            words = []
            for x in xs:
                words.append(self.word_vocab.idx_to_token[int(x[len(x) // 2])])
            yield words
def build_vocabs(self, trn, logger, **kwargs):
    """Build the tag vocabulary via a single pass over the training set."""
    self.vocabs.tag = Vocab(pad_token=None, unk_token=None)
    timer = CountdownTimer(len(trn))
    longest = 0
    key = self.config.token_key
    for sample in trn:
        longest = max(longest, len(sample[key]))
        timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {longest})')
    self.vocabs.tag.set_unk_as_safe_unk()
    self.vocabs.lock()
    self.vocabs.summary(logger)
def __init__(self, *args, **kwargs) -> None:
    """A dict holding :class:`hanlp.common.vocab.Vocab` instances.

    When used as a transform, it transforms the field corresponding to each
    :class:`hanlp.common.vocab.Vocab` into indices.

    Args:
        *args: Vocab names; each gets a fresh empty vocab (overriding any
            same-named entry in ``**kwargs``).
        **kwargs: Names mapped to existing
            :class:`hanlp.common.vocab.Vocab` instances.
    """
    merged = dict(kwargs)
    merged.update({name: Vocab() for name in args})
    super().__init__(merged)
def build_vocabs(self, dataset, logger=None, transformer=None):
    """Build relation, optional pos, and optional token vocabs from ``dataset``.

    Two token-vocab strategies: with ``min_freq`` set, tokens are counted
    explicitly and the vocab is rebuilt from frequent words only; otherwise a
    plain pass over the dataset populates the vocab (presumably via the
    dataset's transforms — TODO confirm against the pipeline).

    Args:
        dataset: Training dataset.
        logger: Optional logger for the summary.
        transformer: When truthy, no token vocab is built.
    """
    rel_vocab = self.vocabs.get('rel', None)
    if rel_vocab is None:
        rel_vocab = Vocab(unk_token=None, pad_token=self.config.get('pad_rel', None))
        self.vocabs.put(rel=rel_vocab)
    # A pos vocab is only needed when pos is used as a feature.
    if self.config.get('feat', None) == 'pos' or self.config.get(
            'use_pos', False):
        self.vocabs['pos'] = Vocab(unk_token=None, pad_token=None)
    timer = CountdownTimer(len(dataset))
    if transformer:
        token_vocab = None
    else:
        token_vocab = Vocab()
        self.vocabs.token = token_vocab
        unk = self.config.get('unk', None)
        if unk is not None:
            token_vocab.unk_token = unk
    if token_vocab and self.config.get('min_freq', None):
        # Count token frequencies, then rebuild the vocab keeping only
        # reserved tokens plus words meeting the frequency threshold.
        counter = Counter()
        for sample in dataset:
            for form in sample['token']:
                counter[form] += 1
        reserved_token = [token_vocab.pad_token, token_vocab.unk_token]
        if ROOT in token_vocab:
            reserved_token.append(ROOT)
        freq_words = reserved_token + [
            token for token, freq in counter.items()
            if freq >= self.config.min_freq
        ]
        token_vocab.token_to_idx.clear()
        for word in freq_words:
            token_vocab(word)
    else:
        # No trimming: one pass over the dataset is enough to fill the vocabs.
        for i, sample in enumerate(dataset):
            timer.log('vocab building [blink][yellow]...[/yellow][/blink]',
                      ratio_percentage=True)
    rel_vocab.set_unk_as_safe_unk()  # Some relation in dev set is OOV
    self.vocabs.lock()
    self.vocabs.summary(logger=logger)
    # Record sizes/indices in config for model construction.
    if token_vocab:
        self.config.n_words = len(self.vocabs['token'])
    if 'pos' in self.vocabs:
        self.config.n_feats = len(self.vocabs['pos'])
        self.vocabs['pos'].set_unk_as_safe_unk()
    self.config.n_rels = len(self.vocabs['rel'])
    if token_vocab:
        self.config.pad_index = self.vocabs['token'].pad_idx
        self.config.unk_index = self.vocabs['token'].unk_idx
def vocab_from_txt(txt_file_path, bigram_only=False, window_size=4, **kwargs) -> Tuple[Vocab, Vocab, Vocab]:
    """Build char/ngram/tag vocabularies from a BMES-encoded text file.

    Args:
        txt_file_path: Path to the corpus.
        bigram_only: Restrict ngram features to bigrams.
        window_size: Ngram window size.
        **kwargs: Unused; kept for interface compatibility.

    Returns:
        A ``(char_vocab, ngram_vocab, tag_vocab)`` tuple.
    """
    char_vocab = Vocab()
    ngram_vocab = Vocab()
    tag_vocab = Vocab(pad_token=None, unk_token=None)
    for X, Y in generate_ngram_bmes(txt_file_path, bigram_only, window_size, gold=True):
        char_vocab.update(X[0])
        for ngram in X[1:]:
            # Skip empty/padding ngram slots.
            ngram_vocab.update(n for n in ngram if n)
        tag_vocab.update(Y)
    return char_vocab, ngram_vocab, tag_vocab
def index_word2vec_with_vocab(filepath: str, vocab: Vocab, extend_vocab=True, unk=None, lowercase=False,
                              init='uniform', normalize=None) -> torch.Tensor:
    """
    Args:
        filepath: The path to pretrained embedding.
        vocab: The vocabulary from training set.
        extend_vocab: Unlock vocabulary of training set to add those tokens in pretrained embedding file.
        unk: UNK token.
        lowercase: Convert words in pretrained embeddings into lowercase.
        init: Indicate which initialization to use for oov tokens.
        normalize: ``True`` or a method to normalize the embedding matrix.

    Returns:
        An embedding matrix.
    """
    pret_vocab, pret_matrix = load_word2vec_as_vocab_tensor(filepath)
    # Re-key the pretrained unk entry onto the training vocab's unk token.
    if unk and unk in pret_vocab:
        pret_vocab[vocab.safe_unk_token] = pret_vocab.pop(unk)
    if extend_vocab:
        vocab.unlock()
        for word in pret_vocab:
            vocab.get_idx(word.lower() if lowercase else word)
    vocab.lock()
    # Map each vocab index to a row of the pretrained matrix; words missing
    # from the pretrained file get fresh rows appended past its end.
    ids = []
    unk_id_offset = 0
    for word, idx in vocab.token_to_idx.items():
        word_id = pret_vocab.get(word, None)
        # Retry lower case
        if word_id is None:
            word_id = pret_vocab.get(word.lower(), None)
        if word_id is None:
            word_id = len(pret_vocab) + unk_id_offset
            unk_id_offset += 1
        ids.append(word_id)
    if unk_id_offset:
        # Rows for OOV words: zeros, or the configured initialization.
        unk_embeds = torch.zeros(unk_id_offset, pret_matrix.size(1))
        if init and init != 'zeros':
            if init == 'uniform':
                init = embedding_uniform
            else:
                raise ValueError(f'Unsupported init {init}')
            unk_embeds = init(unk_embeds)
        pret_matrix = torch.cat([pret_matrix, unk_embeds])
    ids = torch.LongTensor(ids)
    embedding = pret_matrix.index_select(0, ids)
    if normalize == 'norm':
        # Row-wise L2 normalization; epsilon guards against zero rows.
        embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
    elif normalize == 'std':
        embedding /= torch.std(embedding)
    return embedding
def build_vocabs(self, dataset, logger, **kwargs):
    """Build the SRL label vocab; ``'<null>'`` marks the absence of a relation."""
    self.vocabs.srl_label = Vocab(pad_token=None, unk_token=None)
    # Use null to indicate no relationship
    self.vocabs.srl_label.add('<null>')
    timer = CountdownTimer(len(dataset))
    longest = 0
    for sample in dataset:
        longest = max(longest, len(sample['token_input_ids']))
        timer.log(f'Building vocabs (max sequence length {longest}) [blink][yellow]...[/yellow][/blink]')
    timer.stop()
    timer.erase()
    self.vocabs['srl_label'].set_unk_as_safe_unk()
    self.vocabs.lock()
    self.vocabs.summary(logger)
def build_vocabs(self, dataset, logger, **kwargs):
    """Build the SRL vocab and, when unset, guess the token delimiter."""
    self.vocabs.srl = Vocab(pad_token=None, unk_token=None)
    timer = CountdownTimer(len(dataset))
    longest = 0
    for record in dataset:
        longest = max(longest, len(record['token_input_ids']))
        timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {longest})')
    self.vocabs['srl'].set_unk_as_safe_unk()  # C-ARGM-FRQ appears only in test set
    self.vocabs.lock()
    self.vocabs.summary(logger)
    if self.config.get('delimiter') is None:
        tokens = dataset[0]['token']
        self.config.delimiter = guess_delimiter(tokens)
        logger.info(f'Guess the delimiter between tokens could be [blue]"{self.config.delimiter}"[/blue]. '
                    f'If not, specify `delimiter` in `fit()`')
def build_vocabs(self, dataset: SentenceBoundaryDetectionDataset, logger, **kwargs):
    """Build the character vocab, optionally trimming rare characters.

    With ``char_min_freq`` set, characters are counted first and only frequent
    ones enter the vocab; any existing dataset cache is then purged and the
    dataset re-iterated — presumably so cached samples are re-transformed with
    the final vocab (TODO confirm against the dataset implementation).
    """
    char_min_freq = self.config.char_min_freq
    if char_min_freq:
        has_cache = dataset.cache is not None
        char_counter = Counter()
        for each in dataset:
            for c in each['char']:
                char_counter[c] += 1
        self.vocabs.char = vocab = Vocab()
        # Keep only characters meeting the frequency threshold.
        for c, f in char_counter.items():
            if f >= char_min_freq:
                vocab.add(c)
        if has_cache:
            dataset.purge_cache()
            for each in dataset:
                pass
    else:
        # No trimming: a single pass over the dataset fills the vocab.
        self.vocabs.char = Vocab()
        for each in dataset:
            pass
    self.config.eos_chars = dataset.eos_chars
    self.vocabs.lock()
    self.vocabs.summary(logger)
def build_vocabs(self, dataset, logger, vocabs, lock=True, label_vocab_name='label', **kwargs):
    """Build the NER label vocab by iterating ``dataset`` once.

    Args:
        dataset: Training dataset.
        logger: Logger for the summary.
        vocabs: Registry receiving the label vocab.
        lock: Whether to lock and summarize the vocabs afterwards.
        label_vocab_name: Key under which the label vocab is stored.
    """
    label_vocab = Vocab(pad_token=None, unk_token=None)
    vocabs[label_vocab_name] = label_vocab
    # Use null to indicate no relationship
    label_vocab.add('<null>')
    timer = CountdownTimer(len(dataset))
    for _ in dataset:
        timer.log('Building NER vocab [blink][yellow]...[/yellow][/blink]')
    label_vocab.set_unk_as_safe_unk()
    if lock:
        vocabs.lock()
        vocabs.summary(logger)
def fit(self, trn_path: str, **kwargs) -> int:
    """Build word/tag (and optionally char) vocabs from gold training data.

    Returns:
        The number of training samples.
    """
    self.word_vocab = Vocab()
    self.tag_vocab = Vocab(pad_token=None, unk_token=None)
    seen = 0
    for words, tags in self.file_to_inputs(trn_path, True):
        self.word_vocab.update(words)
        self.tag_vocab.update(tags)
        seen += 1
    if self.char_vocab:
        # Rebuild the char vocab from every word except the special tokens.
        self.char_vocab = Vocab()
        specials = (self.word_vocab.pad_token, self.word_vocab.unk_token)
        for word in self.word_vocab.token_to_idx.keys():
            if word not in specials:
                self.char_vocab.update(list(word))
    return seen
def __init__(self, data: str, batch_size, seq_len, tokenizer='char', eos='\n', strip=True, vocab=None,
             cache=False, transform: Union[Callable, List] = None) -> None:
    """A language-model style dataset streaming fixed-length token sequences.

    Args:
        data: Path (or resource identifier) of the corpus.
        batch_size: Number of sequences per batch.
        seq_len: Sequence length; either an int or a zero-arg callable
            returning the length per draw.
        tokenizer: ``'char'``, ``'whitespace'``, or a custom transform.
        eos: End-of-sentence token appended to each sample.
        strip: Whether lines are stripped (used by ``load_file`` — presumably;
            confirm there).
        vocab: Existing vocabulary; a fresh mutable one is created when
            ``None``, which also marks the dataset as training.
        cache: Whether to cache transformed samples.
        transform: Extra transform(s) applied before the built-in ones.
    """
    self.cache = cache
    self.eos = eos
    self.strip = strip
    super().__init__(transform)
    if isinstance(tokenizer, str):
        available_tokenizers = {
            'char': ToChar('text', 'token'),
            'whitespace': WhitespaceTokenizer('text', 'token')
        }
        assert tokenizer in available_tokenizers, f'{tokenizer} not supported, available options: {available_tokenizers.keys()} '
        self.append_transform(available_tokenizers[tokenizer])
    if vocab is None:
        vocab = Vocab()
        self.training = True
    else:
        # A locked (immutable) vocab implies evaluation/prediction mode.
        self.training = vocab.mutable
    # Order matters: tokenize -> append EOS -> map tokens to indices.
    self.append_transform(AppendEOS('token', eos=eos))
    self.append_transform(FieldToIndex('token', vocab))
    self.batch_size = batch_size
    data = get_resource(data)
    self.data = data
    self.num_tokens = None
    self.load_file(data)
    self._fp = None
    # Normalize seq_len to a callable so callers can draw variable lengths.
    if isinstance(seq_len, int):
        self.seq_len = lambda: seq_len
    else:
        self.seq_len = seq_len
def fit(self, trn_path: str, **kwargs):
    """Build word/ngram/tag vocabularies from the gold training file.

    Fix: the original assigned ``self.word_vocab/ngram_vocab/tag_vocab`` twice
    — an unconditional triple assignment immediately overwritten by the
    ``window_size``-dependent one. The dead store is removed; behavior is
    unchanged.

    Args:
        trn_path: Path to the training file.
        **kwargs: Unused; kept for interface compatibility.

    Returns:
        The number of training samples.
    """
    word_vocab, ngram_vocab, tag_vocab = Vocab(), Vocab(), Vocab(
        pad_token=None, unk_token=None)
    num_samples = 0
    for X, Y in self.file_to_samples(trn_path, gold=True):
        num_samples += 1
        word_vocab.update(X[0])
        for ngram in X[1:]:
            # Skip empty/padding ngram slots.
            ngram_vocab.update(filter(lambda x: x, ngram))
        tag_vocab.update(Y)
    if self.config.window_size:
        vocabs = word_vocab, ngram_vocab, tag_vocab
    else:
        # Without a window, ngram features are unused.
        vocabs = word_vocab, None, tag_vocab
    self.word_vocab, self.ngram_vocab, self.tag_vocab = vocabs
    return num_samples
def fit(self, trn_path: str, **kwargs) -> int:
    """Build word/tag (and optionally char) vocabs from gold data.

    Returns:
        The number of samples read.
    """
    self.word_vocab = Vocab()
    self.tag_vocab = Vocab(pad_token=None, unk_token=None)
    seen = 0
    lower = self.config.get('lower', False)
    for words, tags in generator_words_tags(trn_path, gold=True, lower=lower):
        self.word_vocab.update(words)
        self.tag_vocab.update(tags)
        seen += 1
    if self.char_vocab:
        # Rebuild the char vocab from every word except the special tokens.
        self.char_vocab = Vocab()
        specials = (self.word_vocab.pad_token, self.word_vocab.unk_token)
        for word in self.word_vocab.token_to_idx.keys():
            if word in specials:
                continue
            self.char_vocab.update(list(word))
    return seen
def fit(self, trn_path: str, **kwargs) -> int:
    """Build form/cpos/rel vocabularies from a gold CoNLL training file.

    Word forms are frequency-filtered by ``self.config.min_freq``; the first
    token of each sentence is skipped from counting (presumably a pseudo-root
    — TODO confirm against ``file_to_samples``).

    Args:
        trn_path: Path to the training file.
        **kwargs: Unused; kept for interface compatibility.

    Returns:
        The number of training sentences.
    """
    self.form_vocab = Vocab()
    self.form_vocab.add(ROOT)  # make root the 2ed elements while 0th is pad, 1st is unk
    self.cpos_vocab = Vocab(pad_token=None, unk_token=None)
    self.rel_vocab = Vocab(pad_token=None, unk_token=None)
    num_samples = 0
    counter = Counter()
    for sent in self.file_to_samples(trn_path, gold=True):
        num_samples += 1
        for idx, (form, cpos, head, deprel) in enumerate(sent):
            if idx == 0:
                # NOTE(review): `root` is never read afterwards — dead store.
                root = form
            else:
                counter[form] += 1
            self.cpos_vocab.add(cpos)
            self.rel_vocab.update(deprel)
    # Only forms meeting the frequency threshold enter the vocab.
    for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
        self.form_vocab.add(token)
    return num_samples