def build_vocab(self, *args, **kwargs):
    """Build the vocabulary, adding ``unaligned_token`` to the special symbols.

    Args:
        *args: ``data.Dataset`` objects (every attribute whose field is this
            object contributes examples) or raw iterables of examples.
        **kwargs: forwarded to ``self.vocab_cls``.
    """
    counter = Counter()

    # Collect token sources: a Dataset contributes each attribute bound to
    # this field; any other argument is used as-is.
    sources = []
    for arg in args:
        if isinstance(arg, data.Dataset):
            sources.extend(
                getattr(arg, name)
                for name, field in arg.fields.items()
                if field is self
            )
        else:
            sources.append(arg)

    for source in sources:
        for example in source:
            if not self.sequential:
                example = [example]
            try:
                counter.update(example)
            except TypeError:
                # The example holds nested sequences — flatten one level
                # before counting.
                counter.update(chain.from_iterable(example))

    # Deduplicate the special tokens while keeping first-seen order and
    # dropping any that are unset (None).
    candidates = [
        self.unk_token,
        self.pad_token,
        self.init_token,
        self.eos_token,
        self.unaligned_token,
    ]
    specials = list(OrderedDict.fromkeys(t for t in candidates if t is not None))
    self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)
def build_vocab(self, *args, **kwargs):
    """Build the vocabulary over *args*, including ``unaligned_token``
    among the special symbols.

    Args:
        *args: ``data.Dataset`` instances or plain iterables of examples.
        **kwargs: passed through to ``self.vocab_cls``.
    """
    counter = Counter()
    sources = []
    for arg in args:
        # Anything that is not a Dataset is treated as an iterable of
        # examples; a Dataset contributes every attribute whose field is
        # this object.
        if not isinstance(arg, data.Dataset):
            sources.append(arg)
        else:
            matching = [
                getattr(attr_name, name) if False else getattr(arg, attr_name)
                for attr_name, attr_field in arg.fields.items()
                if attr_field is self
            ]
            sources += matching
    for src in sources:
        # Each iteration reads one sample, normalizes it to a list, and
        # folds its tokens into the counter.
        for sample in src:
            tokens = sample if self.sequential else [sample]
            try:
                counter.update(tokens)
            except TypeError:
                # Sample contains nested sequences -> flatten one level.
                counter.update(chain.from_iterable(tokens))
    special_candidates = [
        self.unk_token,
        self.pad_token,
        self.init_token,
        self.eos_token,
        self.unaligned_token,
    ]
    # OrderedDict.fromkeys removes duplicates while preserving order,
    # e.g. yielding ['<unk>', '<pad>', '<unaligned>'].
    specials = list(
        OrderedDict.fromkeys(tok for tok in special_candidates if tok is not None)
    )
    self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)
def fit_vocab(
    self,
    samples,
    vocab_size=None,
    vocab_min_freq=0,
    embeddings_name=None,
    keep_rare_words_with_embeddings=False,
    add_embeddings_vocab=False,
):
    """Fit a :class:`Vocabulary` on the tokenized ``samples``.

    Args:
        samples: iterable of raw samples; each is run through
            ``self.tokenize`` before counting.
        vocab_size: maximum vocabulary size (``None`` means unbounded).
        vocab_min_freq: minimum token frequency to be kept.
        embeddings_name: currently unused (embeddings handling is a TODO).
        keep_rare_words_with_embeddings: keep rare words that have vectors.
        add_embeddings_vocab: extend the vocab with the embeddings' vocab.
    """
    token_counts = Counter()
    for sample in samples:
        # TODO: subtokenize?
        token_counts.update(self.tokenize(sample))

    # We use our own Vocabulary class. Only the unaligned token is an
    # extra special here; unk/pad/bos/eos are passed explicitly below.
    extra_specials = (
        [self.unaligned_token] if self.unaligned_token is not None else []
    )

    # TODO: handle embeddings/vectors
    self.vocab = Vocabulary(
        token_counts,
        max_size=vocab_size,
        min_freq=vocab_min_freq,
        unk_token=self.unk_token,
        pad_token=self.pad_token,
        bos_token=self.bos_token,
        eos_token=self.eos_token,
        specials=extra_specials,
        specials_first=self.specials_first,
        # TODO: missing vectors, etc.
        vectors=None,
        rare_with_vectors=keep_rare_words_with_embeddings,
        add_vectors_vocab=add_embeddings_vocab,
    )