Example no. 1
def load_embeddings(file_path, word_dim, vocab_size, num_copy_tokens):
    special_tokens = CasedWordVocab.SPECIAL_TOKENS

    base_embeds = SimpleEmbeddings.from_file(file_path, word_dim, vocab_size)
    _, embed_dim = base_embeds.array.shape

    def sample_embeds(num_embeds, seed):
        shape = (num_embeds, embed_dim)
        return emulate_distribution(shape, base_embeds.array, seed=seed)

    special_tokens_array = sample_embeds(len(special_tokens), seed=0)
    copy_tokens_array = sample_embeds(
        num_copy_tokens, seed=1)  # different seed so these values differ from the special tokens

    # copy tokens are appended at the end
    new_array = np.concatenate(
        (special_tokens_array, base_embeds.array, copy_tokens_array), axis=0)
    new_vocab = HardCopyVocab(base_embeds.vocab.tokens, num_copy_tokens)

    # check that tokens come in the order that we assumed
    correct_tokens = list(special_tokens)  # special tokens first
    correct_tokens.extend(base_embeds.vocab.tokens)  # then base tokens
    correct_tokens.extend('<copy{}>'.format(i)
                          for i in range(num_copy_tokens))  # copy tokens last
    assert new_vocab.tokens == correct_tokens

    return SimpleEmbeddings(new_array, new_vocab)
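The helper emulate_distribution above is presumably sampling fresh embedding rows whose statistics match the pretrained matrix, so the new special/copy tokens start out on the same scale as the base vectors. A minimal, self-contained numpy sketch of that idea (the function name and exact behavior are assumptions, not the repo's actual implementation):

import numpy as np

def emulate_distribution_sketch(shape, base_array, seed=0):
    # draw new rows from a Gaussian matching the base matrix's mean and std
    mean, std = np.mean(base_array), np.std(base_array)
    rng = np.random.RandomState(seed)
    return rng.normal(loc=mean, scale=std, size=shape).astype(base_array.dtype)

# e.g. two extra rows shaped like an existing (vocab_size, word_dim) matrix:
# extra_rows = emulate_distribution_sketch((2, base.shape[1]), base, seed=1)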
Example no. 2
    def _build_editor(cls, config, num_iter, eps, momentum):
        """Build Editor.

        Args:
            config (Config): Editor config

        Returns:
            Editor
        """

        file_path = join(data.workspace.word_vectors, config.wvec_path)
        word_embeddings = SimpleEmbeddings.from_file(file_path, config.word_dim, vocab_size=config.vocab_size)
        word_embeddings = word_embeddings.with_special_tokens()
        source_token_embedder = TokenEmbedder(word_embeddings)
        target_token_embedder = TokenEmbedder(word_embeddings)

        if config.decoder_cell == 'SimpleDecoderCell':
            decoder_cell = SimpleDecoderCell(target_token_embedder, config.hidden_dim,
                                             config.word_dim, config.agenda_dim)
        elif config.decoder_cell == 'AttentionDecoderCell':
            decoder_cell = AttentionDecoderCell(target_token_embedder, config.agenda_dim,
                                                config.hidden_dim, config.hidden_dim,
                                                config.attention_dim, config.no_insert_delete_attn,
                                                num_layers=config.decoder_layers)
        else:
            raise ValueError('{} not implemented'.format(config.decoder_cell))
        editor = Editor(source_token_embedder, config.hidden_dim, config.agenda_dim,
                        config.edit_dim, config.lamb_reg, config.norm_eps, config.norm_max,
                        config.kill_edit, decoder_cell, config.encoder_layers,
                        num_iter, eps, momentum)
        editor = try_gpu(editor)
        return editor
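For context on the TokenEmbedder wrappers used above: they presumably provide an embedding lookup initialized from the pretrained SimpleEmbeddings matrix. A minimal PyTorch sketch of that idea (illustrative only; the class name and constructor here are assumptions, not the repo's real TokenEmbedder API):

import torch
import torch.nn as nn

class TokenEmbedderSketch(nn.Module):
    def __init__(self, embed_array, trainable=True):
        super().__init__()
        weight = torch.as_tensor(embed_array, dtype=torch.float32)
        # lookup table initialized from the pretrained vectors
        self.embedding = nn.Embedding.from_pretrained(weight, freeze=not trainable)
        self.embed_dim = weight.shape[1]

    def forward(self, token_indices):
        # token_indices: LongTensor of vocab indices -> (..., embed_dim) embeddings
        return self.embedding(token_indices)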
Example no. 3
    def _build_model(cls, config):
        file_path = join(data.workspace.word_vectors, config.model.wvec_path)
        word_embeddings = SimpleEmbeddings.from_file(
            file_path,
            config.model.word_dim,
            vocab_size=config.model.vocab_size)
        word_embeddings = word_embeddings.with_special_tokens()
        token_embedder = TokenEmbedder(word_embeddings)

        model = None
        if config.model.type == 0:  # regular language model
            model = LanguageModel(token_embedder, config.model.hidden_dim,
                                  config.model.agenda_dim,
                                  config.model.num_layers, cls._make_logger())
        elif config.model.type == 1:  # SVAE
            model = NoisyLanguageModel(
                token_embedder, config.model.hidden_dim,
                config.model.agenda_dim, config.model.num_layers,
                config.model.kl_weight_steps, config.model.kl_weight_rate,
                config.model.kl_weight_cap, config.model.dci_keep_rate,
                cls._make_logger())
        assert model is not None

        model = try_gpu(model)
        optimizer = optim.Adam(model.parameters(),
                               lr=config.optim.learning_rate)
        return model, optimizer
Example no. 4
    def _build_model(config, training_examples):
        # build scorer
        model_config = config.retriever
        embeds_path = join(data.workspace.word_vectors, 'glove.6B.{}d.txt'.format(model_config.word_dim))
        word_embeds = SimpleEmbeddings.from_file(embeds_path, model_config.word_dim, model_config.vocab_size)
        word_embeds = word_embeds.with_special_tokens()

        def seq_embedder(trainable):
            sent_dim = model_config.sent_dim
            token_embedder = TokenEmbedder(word_embeds, trainable)
            if trainable:
                transform = Linear(token_embedder.embed_dim, sent_dim)  # if trainable, also add a linear transform
            else:
                transform = lambda x: x
            return BOWSequenceEmbedder(token_embedder, embed_dim=sent_dim,
                                       pool=model_config.pool_method, transform=transform)

        neg_sampler = UniformNegativeSampler(training_examples)
        input_embedder = seq_embedder(trainable=model_config.train_input)
        output_embedder = seq_embedder(trainable=model_config.train_output)
        scorer = Seq2SeqScorer(input_embedder, output_embedder, neg_sampler,
                               score_method=model_config.score_method, loss_method=model_config.loss_method)
        scorer = try_gpu(scorer)

        # build optimizer
        optimizer = optim.Adam(scorer.parameters(), lr=config.optim.learning_rate)
        return scorer, optimizer
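The BOWSequenceEmbedder above presumably collapses a sequence of token embeddings into a single sentence vector by pooling, with the linear transform applied only in the trainable case. A minimal PyTorch sketch of mean pooling plus an optional projection (the pooling choice and names are assumptions, not the repo's implementation):

import torch
import torch.nn as nn

def bow_sentence_embedding(token_embeds, transform=None):
    # token_embeds: (seq_len, embed_dim) word vectors for one sentence
    pooled = token_embeds.mean(dim=0)  # mean pooling over the sequence
    return transform(pooled) if transform is not None else pooled

# usage: project pooled 300-d word vectors down to a sent_dim of 128
# proj = nn.Linear(300, 128)
# sent_vec = bow_sentence_embedding(torch.randn(12, 300), transform=proj)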
Example no. 5
    def model(self):
        array = np.array([
            [1, 2, 3],
            [2, 4, 6],
            [3, 5, 7],
        ], dtype=np.float32)
        vocab = SimpleVocab(u'a b c'.split())
        embeddings = SimpleEmbeddings(array, vocab)
        return TokenEmbedder(embeddings, 'token_embeds')

    def base_pred_embeddings(self):
        array = np.array([
            [0, 0, 0, 0],
            [1, 2, 3, 4],
            [0, 2, 0, 8],
        ], dtype=np.float32)
        vocab = SimpleVocab(u'<unk> b0 b1'.split())
        return SimpleEmbeddings(array, vocab)
Example no. 7
    def embeddings(self):
        array = np.array([
            [0, 1, 2],
            [3, 4, 5],
            [6, 7, 8],
            [9, 10, 11],
            [12, 13, 14],
            [15, 16, 17],
        ], dtype=np.float32)

        vocab = SimpleVocab(['<pad>', 'a', 'b', 'c', 'd', 'e'])
        return SimpleEmbeddings(array, vocab)
Example no. 8
    def token_embedder(self, base_vocab, embeds_array, dynamic_vocabs):
        word_embeds = SimpleEmbeddings(embeds_array, base_vocab)
        base_embedder = TokenEmbedder(word_embeds)
        return DynamicMultiVocabTokenEmbedder(base_embedder, dynamic_vocabs, base_vocab)
Example no. 9
def embeds(vocab):
    array = np.eye(len(vocab))
    return SimpleEmbeddings(array, vocab)
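Here np.eye(len(vocab)) makes the i-th token's embedding the i-th standard basis vector, i.e. one-hot embeddings. A self-contained numpy check of that idea (a plain list stands in for the vocab object):

import numpy as np

tokens = ['<unk>', 'a', 'b']
array = np.eye(len(tokens))
assert array.shape == (3, 3)
assert list(array[2]) == [0.0, 0.0, 1.0]  # 'b' maps to the third basis vector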