Example #1
    def build_vocabs(self, dataset, logger=None, transformer=False):
        rel_vocab = self.vocabs.get('rel', None)
        if rel_vocab is None:
            rel_vocab = Vocab(unk_token=None, pad_token=self.config.get('pad_rel', None))
            self.vocabs.put(rel=rel_vocab)

        timer = CountdownTimer(len(dataset))
        if transformer:
            token_vocab = None
        else:
            self.vocabs.token = token_vocab = VocabCounter(unk_token=self.config.get('unk', UNK))
        for i, sample in enumerate(dataset):
            # Iterating the dataset runs its transforms, which populate the mutable vocabs
            timer.log('Building vocab [blink][yellow]...[/yellow][/blink]', ratio_percentage=True)
        min_freq = self.config.get('min_freq', None)
        if token_vocab and min_freq:
            token_vocab.trim(min_freq)
        rel_vocab.set_unk_as_safe_unk()  # Some relations in the dev set are OOV
        self.vocabs.lock()
        self.vocabs.summary(logger=logger)
        if token_vocab:
            self.config.n_words = len(self.vocabs['token'])
        self.config.n_rels = len(self.vocabs['rel'])
        if token_vocab:
            self.config.pad_index = self.vocabs['token'].pad_idx
            self.config.unk_index = self.vocabs['token'].unk_idx
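
For reference, here is a minimal sketch (not taken from either project) of the build-then-lock lifecycle these build_vocabs overrides share, using only Vocab/VocabCounter calls that appear in the excerpts; build_token_vocab and samples are hypothetical names.

def build_token_vocab(samples, min_freq=2):
    # Hypothetical helper sketch; assumes the HanLP-style Vocab/VocabCounter API shown above.
    vocab = VocabCounter(unk_token=UNK)  # frequency-counting vocab, as in Example #1
    for sample in samples:  # samples: iterable of dicts carrying a 'token' list
        for token in sample['token']:
            vocab(token)  # calling a mutable vocab adds the token (cf. Example #8)
    if min_freq:
        vocab.trim(min_freq)  # drop tokens rarer than min_freq
    vocab.lock()  # freeze the vocab; later lookups of unseen tokens fall back to unk
    return vocab, vocab.pad_idx, vocab.unk_idx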
Example #2
 def build_vocabs(self, dataset, logger):
     self.vocabs.tag = Vocab(unk_token=None, pad_token=None)
     self.vocabs[self.config.token_key] = Vocab()
     for each in dataset:
         pass
     self.vocabs.lock()
     self.vocabs.summary(logger)
Example #3
 def build_vocabs(self, dataset, logger=None, transformer=None):
     self.vocabs['rel_2nd'] = rel_2nd = Vocab(pad_token=self.config.pad_rel,
                                              unk_token=self.config.pad_rel)
     if self.config.joint:
         self.vocabs['rel'] = rel_2nd
     super().build_vocabs(dataset, logger, transformer)
     self.config.n_rels_2nd = len(rel_2nd)
Example #4
File: char_rnn.py  Project: emorynlp/elit
 def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]:
     if isinstance(self.embed, Embedding):
         self.embed.transform(vocabs=vocabs)
     vocab_name = self.vocab_name
     if vocab_name not in vocabs:
         vocabs[vocab_name] = Vocab()
     return ToChar(self.field, vocab_name, max_word_length=self.max_word_length)
Example #5
 def transform(self,
               vocabs: VocabDict = None,
               **kwargs) -> Optional[Callable]:
     assert vocabs is not None
     if self.field not in vocabs:
         vocabs[self.field] = Vocab(pad_token=self.pad, unk_token=self.unk)
     return super().transform(**kwargs)
Example #6
 def build_vocabs(self, trn: torch.utils.data.Dataset,
                  logger: logging.Logger):
     additional_tokens = set()
     self.collect_additional_tokens(additional_tokens, trn)
     additional_tokens = sorted(additional_tokens)
     self.build_tokenizer(additional_tokens)
     self.vocabs['additional_tokens'] = Vocab(
         idx_to_token=list(additional_tokens))
Example #7
 def build_vocabs(self, dataset, logger, vocabs, lock=True, label_vocab_name='label', **kwargs):
     vocabs[label_vocab_name] = label_vocab = Vocab(pad_token=None, unk_token=None)
     # Use null to indicate no relationship
     label_vocab.add('<null>')
     timer = CountdownTimer(len(dataset))
     for each in dataset:
         timer.log('Building NER vocab [blink][yellow]...[/yellow][/blink]')
     label_vocab.set_unk_as_safe_unk()
     if lock:
         vocabs.lock()
         vocabs.summary(logger)
Example #8
    def build_vocabs(self, dataset, logger=None, transformer=None):
        rel_vocab = self.vocabs.get('rel', None)
        if rel_vocab is None:
            rel_vocab = Vocab(unk_token=None,
                              pad_token=self.config.get('pad_rel', None))
            self.vocabs.put(rel=rel_vocab)
        if self.config.get('feat', None) == 'pos' or self.config.get(
                'use_pos', False):
            self.vocabs['pos'] = Vocab(unk_token=None, pad_token=None)

        timer = CountdownTimer(len(dataset))
        if transformer:
            token_vocab = None
        else:
            token_vocab = Vocab()
            self.vocabs.token = token_vocab
            unk = self.config.get('unk', None)
            if unk is not None:
                token_vocab.unk_token = unk
        if token_vocab and self.config.get('min_freq', None):
            counter = Counter()
            for sample in dataset:
                for form in sample['token']:
                    counter[form] += 1
            reserved_token = [token_vocab.pad_token, token_vocab.unk_token]
            if ROOT in token_vocab:
                reserved_token.append(ROOT)
            freq_words = reserved_token + [
                token for token, freq in counter.items()
                if freq >= self.config.min_freq
            ]
            token_vocab.token_to_idx.clear()
            for word in freq_words:
                token_vocab(word)
        else:
            for i, sample in enumerate(dataset):
                timer.log('vocab building [blink][yellow]...[/yellow][/blink]',
                          ratio_percentage=True)
        rel_vocab.set_unk_as_safe_unk()  # Some relations in the dev set are OOV
        self.vocabs.lock()
        self.vocabs.summary(logger=logger)
        if token_vocab:
            self.config.n_words = len(self.vocabs['token'])
        if 'pos' in self.vocabs:
            self.config.n_feats = len(self.vocabs['pos'])
            self.vocabs['pos'].set_unk_as_safe_unk()
        self.config.n_rels = len(self.vocabs['rel'])
        if token_vocab:
            self.config.pad_index = self.vocabs['token'].pad_idx
            self.config.unk_index = self.vocabs['token'].unk_idx
Example #9
 def build_vocabs(self, trn, logger, **kwargs):
     self.vocabs.tag = Vocab(pad_token=None, unk_token=None)
     timer = CountdownTimer(len(trn))
     max_seq_len = 0
     token_key = self.config.token_key
     for each in trn:
         max_seq_len = max(max_seq_len, len(each[token_key]))
         timer.log(
             f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})'
         )
     self.vocabs.tag.set_unk_as_safe_unk()
     self.vocabs.lock()
     self.vocabs.summary(logger)
Example #10
 def build_vocabs(self, dataset: SentenceBoundaryDetectionDataset, logger,
                  **kwargs):
     char_min_freq = self.config.char_min_freq
     if char_min_freq:
         has_cache = dataset.cache is not None
         char_counter = Counter()
         for each in dataset:
             for c in each['char']:
                 char_counter[c] += 1
         self.vocabs.char = vocab = Vocab()
         for c, f in char_counter.items():
             if f >= char_min_freq:
                 vocab.add(c)
         if has_cache:
              dataset.purge_cache()
              # Re-iterate so the purged cache is rebuilt with the new char vocab
              for each in dataset:
                  pass
     else:
         self.vocabs.char = Vocab()
         for each in dataset:
             pass
     self.config.eos_chars = dataset.eos_chars
     self.vocabs.lock()
     self.vocabs.summary(logger)
Example #11
 def build_vocabs(self, dataset, logger, **kwargs):
     self.vocabs.srl_label = Vocab(pad_token=None, unk_token=None)
     # Use null to indicate no relationship
     self.vocabs.srl_label.add('<null>')
     timer = CountdownTimer(len(dataset))
     max_seq_len = 0
     for each in dataset:
         max_seq_len = max(max_seq_len, len(each['token_input_ids']))
         timer.log(
             f'Building vocabs (max sequence length {max_seq_len}) [blink][yellow]...[/yellow][/blink]'
         )
     timer.stop()
     timer.erase()
     self.vocabs['srl_label'].set_unk_as_safe_unk()
     self.vocabs.lock()
     self.vocabs.summary(logger)
Example #12
    def __init__(self,
                 data: str,
                 batch_size,
                 seq_len,
                 tokenizer='char',
                 eos='\n',
                 strip=True,
                 vocab=None,
                 cache=False,
                 transform: Union[Callable, List] = None) -> None:
        self.cache = cache
        self.eos = eos
        self.strip = strip
        super().__init__(transform)
        if isinstance(tokenizer, str):
            available_tokenizers = {
                'char': ToChar('text', 'token'),
                'whitespace': WhitespaceTokenizer('text', 'token')
            }
            assert tokenizer in available_tokenizers, f'{tokenizer} not supported, available options: {available_tokenizers.keys()} '
            self.append_transform(available_tokenizers[tokenizer])

        if vocab is None:
            vocab = Vocab()
            self.training = True
        else:
            self.training = vocab.mutable
        self.append_transform(AppendEOS('token', eos=eos))
        self.append_transform(FieldToIndex('token', vocab))
        self.batch_size = batch_size
        data = get_resource(data)
        self.data = data
        self.num_tokens = None
        self.load_file(data)
        self._fp = None
        if isinstance(seq_len, int):
            self.seq_len = lambda: seq_len
        else:
            self.seq_len = seq_len
Example #13
def index_word2vec_with_vocab(filepath: str,
                              vocab: Vocab,
                              extend_vocab=True,
                              unk=None,
                              lowercase=False,
                              init='uniform',
                              normalize=None) -> torch.Tensor:
    pret_vocab, pret_matrix = load_word2vec_as_vocab_tensor(filepath)
    if unk and unk in pret_vocab:
        pret_vocab[vocab.safe_unk_token] = pret_vocab.pop(unk)
    if extend_vocab:
        vocab.unlock()
        for word in pret_vocab:
            vocab.get_idx(word.lower() if lowercase else word)
    vocab.lock()
    ids = []

    unk_id_offset = 0
    for word, idx in vocab.token_to_idx.items():
        word_id = pret_vocab.get(word, None)
        # Retry lower case
        if word_id is None:
            word_id = pret_vocab.get(word.lower(), None)
        if word_id is None:
            word_id = len(pret_vocab) + unk_id_offset
            unk_id_offset += 1
        ids.append(word_id)
    if unk_id_offset:
        unk_embeds = torch.zeros(unk_id_offset, pret_matrix.size(1))
        if init and init != 'zeros':
            if init == 'uniform':
                init = embedding_uniform
            else:
                raise ValueError(f'Unsupported init {init}')
            unk_embeds = init(unk_embeds)
        pret_matrix = torch.cat([pret_matrix, unk_embeds])
    ids = torch.LongTensor(ids)
    embedding = pret_matrix.index_select(0, ids)
    if normalize == 'norm':
        embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
    elif normalize == 'std':
        embedding /= torch.std(embedding)
    return embedding
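
A hedged usage sketch of index_word2vec_with_vocab as defined above; the embeddings path is a placeholder and torch.nn is assumed to be imported as nn.

# Hypothetical usage: index a pretrained embedding text file against an existing vocab,
# then wrap the aligned weight matrix in a standard PyTorch embedding layer.
weight = index_word2vec_with_vocab('data/word2vec.300d.txt', vocab,  # placeholder path
                                   extend_vocab=True, lowercase=True, normalize='std')
embed = nn.Embedding.from_pretrained(weight, freeze=False, padding_idx=vocab.pad_idx)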
Example #14
 def __init__(self, vocab: Vocab = None) -> None:
     super().__init__()
     if vocab is None:
         vocab = Vocab()
     self.vocab = vocab
Example #15
 def __init__(self, *args, **kwargs) -> None:
     vocabs = dict(kwargs)
     for each in args:
         vocabs[each] = Vocab()
     super().__init__(vocabs)
Example #16
 def transform(self, **kwargs) -> Callable:
     vocab = Vocab()
     vocab.load(os.path.join(get_resource(self.path), 'vocab.json'))
     return TransformList(ContextualStringEmbeddingTransform(self.field),
                          FieldToIndex(f'{self.field}_f_char', vocab),
                          FieldToIndex(f'{self.field}_b_char', vocab))
Example #17
File: encoder.py  Project: emorynlp/elit
        self.input_dim = input_dim
        self.layers = nn.ModuleList([nn.Linear(input_dim, input_dim * 2)
                                     for _ in range(layers)])
        self.reset_parameters()

    def reset_parameters(self):
        for layer in self.layers:
            nn.init.normal_(layer.weight, std=0.02)
            nn.init.constant_(layer.bias[self.input_dim:], 1)
            nn.init.constant_(layer.bias[:self.input_dim], 0)

    def forward(self, x):
        for layer in self.layers:
            new_x = layer(x)
            new_x, gate = new_x.chunk(2, dim=-1)
            new_x = F.relu(new_x)
            gate = torch.sigmoid(gate)
            x = gate * x + (1 - gate) * new_x
        return x


if __name__ == "__main__":
    from data import Vocab, CLS, DUM, END

    vocab = Vocab('../data/AMR/amr_1.0_reca/lem_vocab', 3, [CLS])
    embed = AMREmbedding(vocab, 300, pretrained_file='../data/glove.840B.300d.txt',
                         dump_file='../data/AMR/amr_1.0_reca/glove_lem_embed')
    vocab = Vocab('../data/AMR/amr_1.0_reca/concept_vocab', 3, [DUM, END])
    embed = AMREmbedding(vocab, 300, pretrained_file='../data/glove.840B.300d.txt', amr=True,
                         dump_file='../data/AMR/amr_1.0_reca/glove_concept_embed')
Example #18
File: encoder.py  Project: emorynlp/elit
def AMREmbedding(vocab: Vocab, embedding_dim, pretrained_file=None, amr=False, dump_file=None):
    if pretrained_file is None:
        return Embedding(len(vocab), embedding_dim, vocab.pad_idx)

    tokens_to_keep = set()
    for idx in range(vocab.size):
        token = vocab.idx2token(idx)
        # TODO: Is there a better way to do this? Currently we have a very specific 'amr' param.
        if amr:
            token = re.sub(r'-\d\d$', '', token)
        tokens_to_keep.add(token)

    embeddings = {}

    if dump_file is not None:
        fo = open(dump_file, 'w', encoding='utf8')

    with open(pretrained_file, encoding='utf8') as embeddings_file:
        for line in embeddings_file.readlines():
            fields = line.rstrip().split(' ')
            if len(fields) - 1 != embedding_dim:
                continue
            token = fields[0]
            if token in tokens_to_keep:
                if dump_file is not None:
                    fo.write(line)
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if dump_file is not None:
        fo.close()

    all_embeddings = np.asarray(list(embeddings.values()))
    print('pretrained', all_embeddings.shape)
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))

    all_embeddings -= embeddings_mean
    all_embeddings /= embeddings_std
    all_embeddings *= 0.02
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))
    print(embeddings_mean, embeddings_std)
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    embedding_matrix = torch.FloatTensor(vocab.size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)

    for i in range(vocab.size):
        token = vocab.idx2token(i)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
        else:
            if amr:
                normalized_token = re.sub(r'-\d\d$', '', token)
                if normalized_token in embeddings:
                    embedding_matrix[i] = torch.FloatTensor(embeddings[normalized_token])
    embedding_matrix[vocab.padding_idx].fill_(0.)

    return nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
Example #19
 def build_vocabs(self, trn, logger, **kwargs):
     self.vocabs.label = Vocab(pad_token=None, unk_token=None)
     for each in trn:
         pass
     self.vocabs.lock()
     self.vocabs.summary(logger)