Example #1
    def _build_editor(cls, config, num_iter, eps, momentum):
        """Build Editor.

        Args:
            config (Config): Editor config

        Returns:
            Editor
        """

        file_path = join(data.workspace.word_vectors, config.wvec_path)
        word_embeddings = SimpleEmbeddings.from_file(file_path, config.word_dim, vocab_size=config.vocab_size)
        word_embeddings = word_embeddings.with_special_tokens()
        source_token_embedder = TokenEmbedder(word_embeddings)
        target_token_embedder = TokenEmbedder(word_embeddings)

        if config.decoder_cell == 'SimpleDecoderCell':
            decoder_cell = SimpleDecoderCell(target_token_embedder, config.hidden_dim,
                                             config.word_dim, config.agenda_dim)
        elif config.decoder_cell == 'AttentionDecoderCell':
            decoder_cell = AttentionDecoderCell(target_token_embedder, config.agenda_dim,
                                                config.hidden_dim, config.hidden_dim,
                                                config.attention_dim, config.no_insert_delete_attn,
                                                num_layers=config.decoder_layers)
        else:
            raise ValueError('{} not implemented'.format(config.decoder_cell))
        editor = Editor(source_token_embedder, config.hidden_dim, config.agenda_dim,
                        config.edit_dim, config.lamb_reg, config.norm_eps,
                        config.norm_max, config.kill_edit, decoder_cell,
                        config.encoder_layers, num_iter, eps, momentum)
        editor = try_gpu(editor)
        return editor
Example #2
    def _build_model(cls, model_config, optim_config, data_config):
        """Build Editor.

        Args:
            model_config (Config): Editor config
            optim_config (Config): optimization config
            data_config (Config): dataset config

        Returns:
            (Editor, Optimizer): the Editor model and its Adam optimizer
        """
        file_path = join(data.workspace.word_vectors, model_config.wvec_path)
        word_embeddings = load_embeddings(file_path, model_config.word_dim,
                                          model_config.vocab_size,
                                          model_config.num_copy_tokens)
        word_dim = word_embeddings.embed_dim

        source_token_embedder = TokenEmbedder(word_embeddings,
                                              model_config.train_source_embeds)
        target_token_embedder = TokenEmbedder(word_embeddings,
                                              model_config.train_target_embeds)

        # number of input channels
        num_inputs = len(data_config.source_cols)

        decoder_cell = AttentionDecoderCell(
            target_token_embedder,
            2 * word_dim,  # 2 * word_dim because we concat base and copy vectors
            model_config.agenda_dim,
            model_config.hidden_dim,
            model_config.hidden_dim,
            model_config.attention_dim,
            num_layers=model_config.decoder_layers,
            num_inputs=num_inputs,
            dropout_prob=model_config.decoder_dropout_prob,
            disable_attention=False)

        encoder = Encoder(word_dim, model_config.agenda_dim,
                          model_config.hidden_dim, model_config.encoder_layers,
                          num_inputs, model_config.encoder_dropout_prob, False)

        copy_len = [5, 5, 40]
        model = Editor(source_token_embedder,
                       encoder,
                       decoder_cell,
                       copy_lens=copy_len)
        model = try_gpu(model)

        optimizer = optim.Adam(model.parameters(),
                               lr=optim_config.learning_rate)

        return model, optimizer
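
The 2 * word_dim argument to AttentionDecoderCell in Example #2 reflects the inline comment: the decoder consumes a base word vector concatenated with a copy vector of the same size. A minimal plain-PyTorch sketch of that bookkeeping (the variable names and the 300-dim size are illustrative, not taken from the library):

import torch

word_dim = 300                          # assumed embedding size
base_vec = torch.randn(1, word_dim)     # embedding of the word itself
copy_vec = torch.randn(1, word_dim)     # embedding of its copy-token slot
decoder_input = torch.cat([base_vec, copy_vec], dim=-1)
assert decoder_input.shape[-1] == 2 * word_dim
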
Example #3
    def input_embeds_list(self):
        sequences = [
            [1, 2, 3],
            [8, 4, 2, 1, 1],
            [],
        ]

        # token 1 maps to embedding [1], 2 maps to [2] and so on...
        vocab = SimpleVocab([1, 2, 3, 4, 5, 6, 7, 8])
        array = np.expand_dims(np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.float32), 1)
        token_embedder = TokenEmbedder(Bunch(vocab=vocab, array=array))

        seq_embeds = token_embedder.embed_seq_batch(SequenceBatch.from_sequences(sequences, vocab))
        return seq_embeds.split()
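
Example #3 embeds a batch of variable-length sequences (including an empty one) through SequenceBatch.from_sequences. Conceptually that is a pad, look-up, and mask operation; the sketch below reproduces the idea with stock PyTorch only, without using the library's SequenceBatch or TokenEmbedder (the padding id of 0 and the 9-row table are assumptions of the sketch):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

sequences = [[1, 2, 3], [8, 4, 2, 1, 1], []]
ids = [torch.tensor(s, dtype=torch.long) for s in sequences]
padded = pad_sequence(ids, batch_first=True, padding_value=0)  # shape (3, 5)
mask = (padded != 0).float()           # valid here because real tokens are 1..8

# token i maps to embedding [i], mirroring the comment in Example #3
table = torch.arange(9, dtype=torch.float32).unsqueeze(1)      # shape (9, 1)
lookup = nn.Embedding.from_pretrained(table, freeze=True)
embedded = lookup(padded) * mask.unsqueeze(-1)                 # zero out padding rows
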
Example #4
    def __init__(self,
                 utterance_embedder,
                 tag_embed_dim,
                 value_embed_dim,
                 tampered_embed_dim,
                 classes_embed_dim,
                 max_classes=2000,
                 max_tags=100):
        """
        Args:
            utterance_embedder (UtteranceEmbedder)
            tag_embed_dim (int): embedding dim of tags
            value_embed_dim (int): embedding dim of the element values
            tampered_embed_dim (int): embedding dim of the tampered attribute
            classes_embed_dim (int): embedding dim of the classes
            max_classes (int): maximum number of supported classes to embed
            max_tags (int): maximum number of supported tags to embed
        """
        super(BaseDOMElementEmbedder, self).__init__()

        self._utterance_embedder = utterance_embedder
        self._tag_embedder = TokenEmbedder(LazyInitEmbeddings(
            tag_embed_dim, max_tags),
                                           trainable=True)
        self._value_embedder = TokenEmbedder(
            DOMValueEmbeddings(value_embed_dim), trainable=True)
        self._tampered_embedder = TokenEmbedder(
            BoolEmbeddings(tampered_embed_dim), trainable=True)
        self._classes_embedder = TokenEmbedder(LazyInitEmbeddings(
            classes_embed_dim, max_classes),
                                               trainable=True)
        self._colors_dim = 8  # 4 (rgba) for fg and 4 for bg
        self._coords_dim = 2  # left and top
Example #5
    def _build_editor(cls, model_config, data_config, word_embeddings,
                      word_dim, vae_mode):
        source_token_embedder = TokenEmbedder(word_embeddings,
                                              model_config.train_source_embeds)
        target_token_embedder = TokenEmbedder(word_embeddings,
                                              model_config.train_target_embeds)

        # number of input channels
        if vae_mode:
            num_inputs = len(data_config.source_cols)
        else:  # the edit model uses num_inputs * 2 + 1 input channels
            num_inputs = len(data_config.source_cols) * 2 + 1

        decoder_cell = AttentionDecoderCell(
            target_token_embedder,
            2 * word_dim,
            # 2 * word_dim because we concat base and copy vectors
            model_config.agenda_dim,
            model_config.hidden_dim,
            model_config.hidden_dim,
            model_config.attention_dim,
            num_layers=model_config.decoder_layers,
            num_inputs=num_inputs,
            dropout_prob=model_config.decoder_dropout_prob,
            disable_attention=vae_mode)

        if vae_mode:
            encoder = Encoder(word_dim, model_config.agenda_dim,
                              model_config.hidden_dim,
                              model_config.encoder_layers, num_inputs,
                              model_config.encoder_dropout_prob, vae_mode,
                              model_config.vae_kappa)
        else:
            encoder = Encoder(word_dim, model_config.agenda_dim,
                              model_config.hidden_dim,
                              model_config.encoder_layers, num_inputs,
                              model_config.encoder_dropout_prob, vae_mode)

        vae_copy_len = [5, 10, 185]
        editor_copy_len = [5, 10, 10, 5, 10, 10, 150]
        if vae_mode:
            model = Editor(source_token_embedder, encoder, decoder_cell,
                           vae_copy_len)
        else:
            model = Editor(source_token_embedder, encoder, decoder_cell,
                           editor_copy_len)
        model = try_gpu(model)
        return model
Example #6
def get_stupid_embedder(config):
    """Create a new StupidEmbedder based on the config

    Args:
        config (Config): the root config
    Returns:
        StupidEmbedder
    """
    cm = config.model
    cmu = cm.utterance_embedder
    cmt = cm.node_embedder.token_embedder

    # Token embedder
    magnitude_filename = cmt.magnitude_filename
    vocab_filename = cmt.vocab_filename
    word_embeddings = MagnitudeEmbeddings(magnitude_filename, vocab_filename, cmt.vocab_size, cmt.word_embed_dim)
    token_embedder = TokenEmbedder(word_embeddings, trainable=cmu.trainable)

    # Utterance embedder
    if cmu.type == 'average':
        utterance_embedder = AverageUtteranceEmbedder(token_embedder, cmu.max_words)
    elif cmu.type == 'lstm':
        utterance_embedder = LSTMUtteranceEmbedder(token_embedder, cmu.lstm_dim, cmu.max_words)
    else:
        raise ValueError('Unknown UtteranceEmbedder type {}'.format(cmu.type))
    # Embedder
    embedder = StupidEmbedder(cm.dim, utterance_embedder, cm.dropout)
    return embedder
Example #7
    def _build_model(cls, config):
        file_path = join(data.workspace.word_vectors, config.model.wvec_path)
        word_embeddings = SimpleEmbeddings.from_file(
            file_path,
            config.model.word_dim,
            vocab_size=config.model.vocab_size)
        word_embeddings = word_embeddings.with_special_tokens()
        token_embedder = TokenEmbedder(word_embeddings)

        model = None
        if config.model.type == 0:  # regular language model
            model = LanguageModel(token_embedder, config.model.hidden_dim,
                                  config.model.agenda_dim,
                                  config.model.num_layers, cls._make_logger())
        elif config.model.type == 1:  # SVAE
            model = NoisyLanguageModel(
                token_embedder, config.model.hidden_dim,
                config.model.agenda_dim, config.model.num_layers,
                config.model.kl_weight_steps, config.model.kl_weight_rate,
                config.model.kl_weight_cap, config.model.dci_keep_rate,
                cls._make_logger())
        assert model is not None

        model = try_gpu(model)
        optimizer = optim.Adam(model.parameters(),
                               lr=config.optim.learning_rate)
        return model, optimizer
Example #8
 def seq_embedder(trainable):
     sent_dim = model_config.sent_dim
     token_embedder = TokenEmbedder(word_embeds, trainable)
     if trainable:
         transform = Linear(token_embedder.embed_dim, sent_dim)  # if trainable, also add a linear transform
     else:
         transform = lambda x: x
     return BOWSequenceEmbedder(token_embedder, embed_dim=sent_dim,
                                pool=model_config.pool_method, transform=transform)
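
The trainable flag in Example #8 decides whether a Linear projection follows the bag-of-words pooling. A hedged plain-PyTorch sketch of that pool-then-optionally-project pattern (BOWSequenceEmbedder itself is not reproduced; names and shapes are illustrative):

import torch
import torch.nn as nn

class BowPoolProject(nn.Module):
    """Mean-pool token embeddings over a sequence, then optionally project to sent_dim."""
    def __init__(self, embed_dim, sent_dim, trainable):
        super().__init__()
        # mirror the idea in Example #8: only add the transform when trainable
        self.transform = nn.Linear(embed_dim, sent_dim) if trainable else nn.Identity()

    def forward(self, token_embeds, mask):
        # token_embeds: (batch, seq_len, embed_dim); mask: (batch, seq_len)
        summed = (token_embeds * mask.unsqueeze(-1)).sum(dim=1)
        pooled = summed / mask.sum(dim=1, keepdim=True).clamp(min=1.0)
        return self.transform(pooled)

As with the lambda in the original, the identity branch assumes embed_dim == sent_dim.
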
Example #9
    def test_embedding_from_array(self):
        emb = TokenEmbedder._embedding_from_array(np.array([[9, 9], [8, 7]], dtype=np.float32))
        assert isinstance(emb, Embedding)
        values = emb(GPUVariable(torch.LongTensor([[0, 0], [1, 0]])))

        assert_tensor_equal(values,
                            [
                                [[9, 9], [9, 9]],
                                [[8, 7], [9, 9]],
                            ])
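
The assertion in Example #9 is exactly the lookup behaviour of a stock PyTorch Embedding built from an array. The sketch below reproduces the same check without the library helper (it does not claim _embedding_from_array is implemented this way):

import numpy as np
import torch
import torch.nn as nn

array = np.array([[9, 9], [8, 7]], dtype=np.float32)
emb = nn.Embedding.from_pretrained(torch.from_numpy(array), freeze=False)

values = emb(torch.LongTensor([[0, 0], [1, 0]]))   # shape (2, 2, 2)
assert torch.equal(values[1, 0], torch.tensor([8., 7.]))
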
Example #10
 def __init__(self, dim, utterance_embedder, recursive_texts,
              attr_embed_dim, max_attr_tokens, min_id_freq, min_class_freq,
              dropout):
     """
     Args:
         dim (int): Target embedding dimension
         utterance_embedder (UtteranceEmbedder)
         recursive_texts (bool): For node text, whether to recursively combine the
             texts of the descendants
         attr_embed_dim (int): Size of each attribute embedding
         max_attr_tokens (int): Limit the number of attribute tokens to embed
         min_id_freq (int): Minimum token frequency of tokens in id vocab
         min_class_freq (int): Minimum token frequency of tokens in class vocab
         dropout (float): Dropout rate
     """
     super(ProppyBaseEmbedder, self).__init__()
     self._dim = dim
     # Text embedder
     self._utterance_embedder = utterance_embedder
     self._max_words = utterance_embedder.max_words
     self._recursive_texts = recursive_texts
     # Attribute embedders
     self._attr_embed_dim = attr_embed_dim
     tags = [UNK, EOS] + TAGS
     self._tag_embedder = \
             TokenEmbedder(RandomEmbeddings(tags, attr_embed_dim))
     ids = read_frequency_vocab('frequent-ids', min_id_freq)
     self._id_embedder = AverageUtteranceEmbedder(
         TokenEmbedder(RandomEmbeddings(ids, attr_embed_dim)),
         max_attr_tokens)
     classes = read_frequency_vocab('frequent-classes', min_class_freq)
     self._classes_embedder = AverageUtteranceEmbedder(
         TokenEmbedder(RandomEmbeddings(classes, attr_embed_dim)),
         max_attr_tokens)
     coords_dim = 3
     # Combine
     input_dim = (self._utterance_embedder.embed_dim + 3 * attr_embed_dim +
                  coords_dim)
     self.dropout = nn.Dropout(dropout)
     #self.fc = nn.Linear(self._utterance_embedder.embed_dim, dim)
     self.fc = nn.Linear(input_dim, dim)
Example #11
def get_proppy_embedder(config):
    """Create a new ProppyEmbedder based on the config

    Args:
        config (Config): the root config
    Returns:
        ProppyEmbedder
    """
    cm = config.model
    cmu = cm.utterance_embedder
    cmt = cm.node_embedder.token_embedder

    # Token embedder
    magnitude_filename = cmt.magnitude_filename
    vocab_filename = cmt.vocab_filename
    word_embeddings = MagnitudeEmbeddings(magnitude_filename, vocab_filename,
                                          cmt.vocab_size, cmt.word_embed_dim)
    token_embedder = TokenEmbedder(word_embeddings, trainable=cmu.trainable)
    # Utterance embedder
    if cmu.type == 'average':
        utterance_embedder = AverageUtteranceEmbedder(token_embedder,
                                                      cmu.max_words)
    elif cmu.type == 'lstm':
        utterance_embedder = LSTMUtteranceEmbedder(token_embedder,
                                                   cmu.lstm_dim, cmu.max_words)
    elif cmu.type == 'attention_lstm':
        utterance_embedder = AttentionUtteranceEmbedder(
            token_embedder, cmu.lstm_dim, cmu.max_words)
    else:
        raise ValueError('Unknown UtteranceEmbedder type {}'.format(cmu.type))
    # Base node embedder
    cmb = cm.node_embedder.base_embedder
    base_embedder = ProppyBaseEmbedder(cm.dim, utterance_embedder,
                                       cmb.recursive_texts, cmb.attr_embed_dim,
                                       cmb.max_attr_tokens, cmb.min_id_freq,
                                       cmb.min_class_freq, cm.dropout)
    # Aggregator
    cmpr = cm.node_embedder.propagation
    if cmpr.iterations == 0:
        return base_embedder
    if cmpr.aggregator == 'pool_mlp':
        aggregator = PoolMLPAggregator(cm.dim, cm.dropout)
    elif cmpr.aggregator == 'mlp_pool':
        aggregator = MLPPoolAggregator(cm.dim, cm.dropout)
    else:
        raise ValueError('Unknown Aggregator {}'.format(cmpr.aggregator))
    # Information propagation
    full_embedder = ProppyEmbedder(cm.dim, base_embedder, cmpr.iterations,
                                   cmpr.neighbor_rels, cmpr.max_neighbors,
                                   aggregator)
    return full_embedder
Example #12
    def from_config(cls, config):
        """Constructs the appropriate UtteranceEmbedder from a config.

        Args:
            config (Config)

        Returns:
            UtteranceEmbedder
        """
        if config.type == "glove":
            glove_embeddings = GloveEmbeddings(config.vocab_size)
            token_embedder = TokenEmbedder(glove_embeddings, trainable=False)
            utterance_embedder = cls(token_embedder, config.lstm_dim)
            return utterance_embedder
        else:
            raise ValueError(
                "{} not a supported type of utterance embedder".format(
                    config.type))
Example #13
def get_allan_embedder(config):
    """Create a new AllanEmbedder based on the config

    Args:
        config (Config): the root config
    Returns:
        AllanEmbedder
    """
    cm = config.model
    cmu = cm.utterance_embedder
    cmt = cm.node_embedder.token_embedder
    # cma = cm.node_embedder.attr_embedder
    cmb = cm.node_embedder.base_embedder

    # Token embedder
    magnitude_filename = cmt.magnitude_filename
    vocab_filename = cmt.vocab_filename
    word_embeddings = MagnitudeEmbeddings(magnitude_filename, vocab_filename,
                                          cmt.vocab_size, cmt.word_embed_dim)
    token_embedder = TokenEmbedder(word_embeddings, trainable=cmt.trainable)

    lang = cmt.lang
    # Utterance embedder
    utterance_embedder = make_embedder(token_embedder, cmu, lang)

    # Attribute embedder
    # attr_embedder = make_embedder(attr_token_embedder, cma)
    # AverageUtteranceEmbedder(TokenEmbedder(RandomEmbeddings(ids, attr_embed_dim)), max_attr_tokens)
    attr_embedder = utterance_embedder

    # Base node embedder
    base_embedder = AllanBaseEmbedder(cm.dim,
                                      utterance_embedder,
                                      attr_embedder,
                                      cmb.recursive_texts,
                                      cmt.word_embed_dim,
                                      cmb.max_attr_tokens,
                                      cmb.min_id_freq,
                                      cmb.min_class_freq,
                                      cm.dropout,
                                      ablate_text=cm.ablate_text,
                                      ablate_attrs=cm.ablate_attrs)
    return base_embedder
Example #14
 def embedder(self, request):
     vocab = SimpleVocab(['<unk>', '<start>', '<stop>'] + ['a', 'b', 'c'])
     arr = np.eye(len(vocab), dtype=np.float32)
     word_embeddings = Bunch(vocab=vocab, array=arr)
     return TokenEmbedder(word_embeddings, trainable=request.param)
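
The fixture in Example #14 reads request.param, which is only populated when the fixture is declared with params. The decorator is not visible in the snippet, so the parametrization below is a conventional pytest guess rather than the project's actual code:

import pytest

@pytest.fixture(params=[False, True])    # assumed parametrization, not shown in the snippet
def embedder(request):
    trainable = request.param           # False on the first run, True on the second
    ...                                 # build and return TokenEmbedder(word_embeddings, trainable=trainable)
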
Example #15
class ProppyBaseEmbedder(nn.Module):
    def __init__(self, dim, utterance_embedder, recursive_texts,
                 attr_embed_dim, max_attr_tokens, min_id_freq, min_class_freq,
                 dropout):
        """
        Args:
            dim (int): Target embedding dimension
            utterance_embedder (UtteranceEmbedder)
            recursive_texts (bool): For node text, whether to recursively combine the
                texts of the descendants
            attr_embed_dim (int): Size of each attribute embedding
            max_attr_tokens (int): Limit the number of attribute tokens to embed
            min_id_freq (int): Minimum token frequency of tokens in id vocab
            min_class_freq (int): Minimum token frequency of tokens in class vocab
            dropout (float): Dropout rate
        """
        super(ProppyBaseEmbedder, self).__init__()
        self._dim = dim
        # Text embedder
        self._utterance_embedder = utterance_embedder
        self._max_words = utterance_embedder.max_words
        self._recursive_texts = recursive_texts
        # Attribute embedders
        self._attr_embed_dim = attr_embed_dim
        tags = [UNK, EOS] + TAGS
        self._tag_embedder = TokenEmbedder(
            RandomEmbeddings(tags, attr_embed_dim))
        ids = read_frequency_vocab('frequent-ids', min_id_freq)
        self._id_embedder = AverageUtteranceEmbedder(
            TokenEmbedder(RandomEmbeddings(ids, attr_embed_dim)),
            max_attr_tokens)
        classes = read_frequency_vocab('frequent-classes', min_class_freq)
        self._classes_embedder = AverageUtteranceEmbedder(
            TokenEmbedder(RandomEmbeddings(classes, attr_embed_dim)),
            max_attr_tokens)
        coords_dim = 3
        # Combine
        input_dim = (self._utterance_embedder.embed_dim + 3 * attr_embed_dim +
                     coords_dim)
        self.dropout = nn.Dropout(dropout)
        # self.fc = nn.Linear(self._utterance_embedder.embed_dim, dim)
        self.fc = nn.Linear(input_dim, dim)

    @property
    def embed_dim(self):
        return self._dim

    @property
    def token_embedder(self):
        return self._utterance_embedder.token_embedder

    @property
    def utterance_embedder(self):
        return self._utterance_embedder

    def forward(self, nodes):
        """Embeds a batch of Nodes.

        Args:
            nodes (list[Node])
        Returns:
            embeddings (Tensor): num_nodes x embed_dim
        """
        texts = []
        utterance_embedder = self._utterance_embedder
        for node in nodes:
            if self._recursive_texts:
                text = ' '.join(node.all_texts(max_words=self._max_words))
            else:
                text = node.text or ''
            texts.append(utterance_embedder.tokenize(text.lower()))
        text_embeddings = self._utterance_embedder(texts)

        # num_nodes x attr_embed_dim
        tag_embeddings = self._tag_embedder.embed_tokens(
            [node.tag for node in nodes])

        # num_nodes x attr_embed_dim
        id_embedder = self._id_embedder
        id_embeddings = self._id_embedder(
            [id_embedder.tokenize(node.id_) for node in nodes])

        # num_nodes x attr_embed_dim
        classes_embedder = self._classes_embedder
        class_embeddings = self._classes_embedder([
            classes_embedder.tokenize(' '.join(node.classes)) for node in nodes
        ])

        # num_nodes x 3
        coords = V(
            torch.tensor([[elem.x_ratio, elem.y_ratio,
                           float(elem.visible)] for elem in nodes],
                         dtype=torch.float32))

        # num_nodes x dom_embed_dim
        dom_embeddings = torch.cat((text_embeddings, tag_embeddings,
                                    id_embeddings, class_embeddings, coords),
                                   dim=1)
        # dom_embeddings = text_embeddings
        return self.fc(dom_embeddings)
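
In ProppyBaseEmbedder, input_dim in __init__ has to agree with the torch.cat in forward: one utterance embedding, three attribute embeddings of size attr_embed_dim each, and a 3-dimensional coordinate vector. A small dimension-bookkeeping sketch with made-up sizes:

import torch
import torch.nn as nn

num_nodes, utt_dim, attr_dim, coords_dim, out_dim = 4, 64, 16, 3, 32
text = torch.randn(num_nodes, utt_dim)
tags = torch.randn(num_nodes, attr_dim)
ids = torch.randn(num_nodes, attr_dim)
classes = torch.randn(num_nodes, attr_dim)
coords = torch.randn(num_nodes, coords_dim)

dom = torch.cat((text, tags, ids, classes, coords), dim=1)
fc = nn.Linear(utt_dim + 3 * attr_dim + coords_dim, out_dim)
assert fc(dom).shape == (num_nodes, out_dim)
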
Example #16
    def _build_model(cls, model_config, optim_config, data_config):
        """Build Editor.

        Args:
            model_config (Config): Editor config
            optim_config (Config): optimization config
            data_config (Config): dataset config

        Returns:
            (EditRetriever, Optimizer): the combined model and its Adam optimizer
        """

        file_path = join(data.workspace.word_vectors, model_config.wvec_path)
        word_embeddings = load_embeddings(file_path, model_config.word_dim,
                                          model_config.vocab_size,
                                          model_config.num_copy_tokens)
        word_dim = word_embeddings.embed_dim

        edit_model = cls._build_editor(model_config,
                                       data_config,
                                       word_embeddings,
                                       word_dim,
                                       vae_mode=False)

        # VAE retriever
        vocab_dict = word_embeddings.vocab._word2index
        encoder = Encoder(word_dim,
                          model_config.agenda_dim,
                          model_config.hidden_dim,
                          model_config.encoder_layers,
                          len(data_config.source_cols),
                          model_config.encoder_dropout_prob,
                          use_vae=True,
                          kappa=model_config.vae_kappa,
                          use_target=False)
        source_token_embedder = TokenEmbedder(word_embeddings,
                                              model_config.train_source_embeds)
        target_token_embedder = TokenEmbedder(word_embeddings,
                                              model_config.train_target_embeds)
        ret_copy_len = [5, 10, 165]
        num_inputs = len(data_config.source_cols)
        decoder_cell = AttentionDecoderCell(
            target_token_embedder,
            2 * word_dim,
            # 2 * word_dim because we concat base and copy vectors
            model_config.agenda_dim,
            model_config.hidden_dim,
            model_config.hidden_dim,
            model_config.attention_dim,
            num_layers=model_config.decoder_layers,
            num_inputs=num_inputs,
            dropout_prob=model_config.decoder_dropout_prob,
            disable_attention=True)
        vae_model = VAERetriever(source_token_embedder, encoder, decoder_cell,
                                 ret_copy_len)
        ret_model = vae_model

        vae_ret_model = EditRetriever(vae_model, ret_model, edit_model)
        vae_ret_model = try_gpu(vae_ret_model)

        optimizer = optim.Adam(vae_ret_model.parameters(),
                               lr=optim_config.learning_rate)
        #optimizer = optim.SGD(vae_ret_model.parameters(), lr=optim_config.learning_rate)

        return vae_ret_model, optimizer
Example #17
class AlignmentModel(nn.Module):

    def __init__(self, phrase_embedder, token_embedder,
                 max_words, node_filter, top_k=5, dropout=0.3,
                 ablate_text=False, ablate_attrs=False, use_neighbors=False, use_tags=False,
                 neighbor_rels=['above', 'left'], max_neighbors=1):
            # neighbor_rels=['above','below','left','right'], max_neighbors=1):
        """
        Args:
            node_filter (callable[(WebPage, web_page_code) -> list]):
                A function that returns a mask array of length len(web_page.nodes)
                indicating whether each node is a valid candidate
            top_k (int): Number of predictions to return
        """
        super(AlignmentModel, self).__init__()

        self.phrase_embedder = phrase_embedder

        self.ablate_text = ablate_text
        self.ablate_attrs = ablate_attrs
        self.use_neighbors = use_neighbors

        conv_dim = 3
        dilation = 2
        pool_dim = 2
        # doesn't change the dimension
        self.conv2d = nn.Conv2d(1, 1, conv_dim, padding=conv_dim-1)
        self.conv2d_dilated = nn.Conv2d(1, 1, conv_dim, padding=conv_dim-1, dilation=dilation)
        self.pooler = nn.MaxPool2d(pool_dim)
        self.score_dim = int(math.pow(math.ceil((max_words+1) / float(pool_dim)), 2))
        self.scorer = nn.Linear(self.score_dim, 1)

        # idea: compute a bunch of latent score vectors before computing
        # logits, take a linear layer down to 1 score
        # purpose: if you want to compute scores with neighbors, you can now
        # average neighbor score vectors and Linear down to 1 score

        # neighbor_score_dim = 10

        if self.use_neighbors:
            self._max_neighbors = max_neighbors
            self._neighbor_rels = {x: i for (i, x) in enumerate(sorted(set(neighbor_rels)))}
            self.num_rels = len(neighbor_rels)
            assert all(x in GraphRels.LOOKUP for x in self._neighbor_rels)

            # score_embed_dim = int(math.ceil((self.score_dim) / float(pool_dim)))
            score_dim = self.score_dim * (self.num_rels*max_neighbors + 1)
            # self.pool_neighbors = nn.MaxPool1d(pool_dim)
            self._final_neighbor_linear = nn.Linear(score_dim, 1)
            # extra_nodes = self.num_rels * max_neighbors
        else:
            # extra_nodes = 0
            pass

        self.dropout = nn.Dropout(dropout)

        self.token_embedder = token_embedder
        self.max_words = max_words
        self.node_filter = node_filter
        self.loss = nn.CrossEntropyLoss(reduction="none")
        self.top_k = top_k

        self.use_tags = use_tags
        if self.use_tags:
            tags = [UNK, EOS] + TAGS
            tag_dim = 10
            self._tag_embedder = TokenEmbedder(RandomEmbeddings(tags, tag_dim))
            self.project_tag = nn.Linear(tag_dim + self.score_dim, self.score_dim)

    def forward(self, web_page, examples, logits_only=False):
        """Compute predictions and loss.

        Args:
            web_page (WebPage): The web page of the examples
            examples (list[PhraseNodeExample]): Must be from the same web page.
            logits_only (bool)
        Returns:
            logits (Tensor): num_phrases x num_nodes
                Each entry (i,j) is the logit for p(node_j | phrase_i)
            losses (Tensor): num_phrases
            predictions (Tensor): num_phrases
        """
        phrase_embedder = self.phrase_embedder

        def max_scorer(pairwise_scores):
            """
            Args:
                pairwise_scores: num_nodes x phrase_len x max_text_len
            """
            scores = torch.max(pairwise_scores, dim=1)[0]
            return torch.max(scores, dim=1)[0]

        def cnn_scorer(pairwise_scores):
            """
            Args:
                pairwise_scores: num_nodes x phrase_len x max_text_len
            """
            scores = torch.unsqueeze(pairwise_scores, dim=1)
            scores = self.conv2d(scores)
            scores = self.conv2d_dilated(scores)
            scores = self.pooler(scores)
            scores = torch.squeeze(scores, dim=1)
            # dim = scores.shape[1]*scores.shape[2]
            scores = scores.view(-1,self.score_dim)
            if self.use_tags:
                tags = [node.tag for node in web_page.nodes]
                tag_embeddings = self._tag_embedder.embed_tokens(tags)
                scores = torch.cat((scores,tag_embeddings), dim=1)
                scores = self.project_tag(scores)
            scores = self.scorer(scores)
            scores = torch.squeeze(scores, dim=1)
            return scores

        def neighbor_cnn_scorer(pairwise_scores):
            """
            Args:
                pairwise_scores: num_nodes x phrase_len x max_text_len
            """
            scores = torch.unsqueeze(pairwise_scores, dim=1)
            scores = self.conv2d(scores)
            scores = self.conv2d_dilated(scores)
            scores = self.pooler(scores)
            scores = torch.squeeze(scores, dim=1)
            # dim = scores.shape[1]*scores.shape[2]
            scores = scores.view(-1,self.score_dim)
            if self.use_tags:
                tags = [node.tag for node in web_page.nodes]
                tag_embeddings = self._tag_embedder.embed_tokens(tags)
                scores = torch.cat((scores,tag_embeddings), dim=1)
                scores = self.project_tag(scores)
            return scores

        # Tokenize the nodes
        # num_nodes x text_length x embed_dim
        texts = []
        for node in web_page.nodes:
            text = ' '.join(node.all_texts(max_words=self.max_words))
            output = []
            if not self.ablate_text:
                output += phrase_embedder.tokenize(text)
            if not self.ablate_attrs:
                # TODO better way to include attributes?
                output += phrase_embedder.tokenize(semantic_attrs(node.attributes))
            texts.append(output)

        embedded_texts = embed_tokens(self.token_embedder, self.max_words, texts)
        embedded_texts_values = self.dropout(embedded_texts.values)

        embedded_texts = embedded_texts_values * embedded_texts.mask.unsqueeze(2)

        # Tokenize the phrases
        # num_phrases x phrase_length x embed_dim
        logits = []

        if not self.use_neighbors:
            for example in examples:
                phrase = [phrase_embedder.tokenize(example.phrase)]
                embedded_phrase = embed_tokens(self.token_embedder, self.max_words, phrase)

                embedded_phrase_values = self.dropout(embedded_phrase.values)

                # expand: num_nodes x phrase_len x embed_dim
                batch_phrase = embedded_phrase_values.expand(len(texts), -1, -1)
                # permute embedded_texts: num_nodes x embed_dim x max_text_len
                pairwise_scores = torch.bmm(batch_phrase, embedded_texts.permute(0, 2, 1))

                # compute scores
                scores = cnn_scorer(pairwise_scores)
                logits.append(torch.unsqueeze(scores, dim=0))
        else:
            intermediate_scores = []
            for example in examples:
                phrase = [phrase_embedder.tokenize(example.phrase)]
                embedded_phrase = embed_tokens(self.token_embedder, self.max_words, phrase)

                embedded_phrase_values = self.dropout(embedded_phrase.values)

                # expand: num_nodes x phrase_len x embed_dim
                batch_phrase = embedded_phrase_values.expand(len(texts), -1, -1)
                # permuted embedded_texts: num_nodes x embed_dim x max_text_len
                pairwise_scores = torch.bmm(batch_phrase, embedded_texts.permute(0, 2, 1))
                node_score = neighbor_cnn_scorer(pairwise_scores)
                intermediate_scores.append(node_score)

            neighbors, masks = web_page.get_spatial_neighbors()
            neighbors, masks = V(torch.tensor(neighbors, dtype=torch.long)), V(torch.tensor(masks, dtype=torch.float32))
            masks = masks.unsqueeze(dim=2)

            # each node_score tensor is parameterized by phrase
            for node_score in intermediate_scores:
                # get pairwise_scores for all neighbors...
                # neighbors, rels = self._get_neighbors(web_page)
                batch_size = len(node_score)
                neighbor_scores = torch.index_select(node_score, 0, neighbors.view(-1))
                neighbor_scores = neighbor_scores.view(batch_size, neighbors.shape[1], -1)
                neighbor_scores = neighbor_scores * masks

                if neighbor_scores.shape[1] < self.num_rels:
                    more = self.num_rels - neighbor_scores.shape[1]
                    num_nodes, _, embed_dim = neighbor_scores.shape
                    padding = V(torch.zeros(num_nodes, more, embed_dim))
                    neighbor_scores = torch.cat((neighbor_scores, padding), dim=1)
                # num_nodes x num_neighbors x intermediate_score_dim

                node_score = torch.unsqueeze(node_score, dim=1)
                scores = torch.cat((node_score, neighbor_scores), dim=1)

                scores = scores.view(node_score.shape[0], -1)
                scores = self._final_neighbor_linear(scores)
                scores = torch.squeeze(scores, dim=1)

                logits.append(torch.unsqueeze(scores, dim=0))

        logits = torch.cat(logits, dim=0)

        # Filter the candidates
        node_filter_mask = self.node_filter(web_page, examples[0].web_page_code)  # what does this do?
        log_node_filter_mask = V(torch.tensor([0. if x else -999999. for x in node_filter_mask], dtype=torch.float32))
        logits = logits + log_node_filter_mask
        if logits_only:
            return logits

        # Losses and predictions
        targets = V(torch.tensor([web_page.xid_to_ref.get(x.target_xid, 0) for x in examples], dtype=torch.long))
        mask = V(torch.tensor([int(x.target_xid in web_page.xid_to_ref and node_filter_mask[web_page.xid_to_ref[x.target_xid]])
                     for x in examples], dtype=torch.float32))
        losses = self.loss(logits, targets) * mask
        # print '=' * 20, examples[0].web_page_code
        # print [node_filter_mask[web_page.xid_to_ref.get(x.target_xid, 0)] for x in examples]
        # print [logits.detach()[i, web_page.xid_to_ref.get(x.target_xid, 0)] for (i, x) in enumerate(examples)]
        # print logits, targets, mask, losses
        if not isfinite(losses.detach().sum()):
            # raise ValueError('Losses has NaN')
            logging.warning('Losses has NaN')
            # print losses
        # num_phrases x top_k
        top_k = min(self.top_k, len(web_page.nodes))
        predictions = torch.topk(logits, top_k, dim=1)[1]
        return logits, losses, predictions

    def _get_neighbors(self, web_page):
        """Get indices of at most |max_neighbors| neighbors for each relation

        Args:
            web_page (WebPage)
        Returns:
            neighbors: SequenceBatch of shape num_nodes x ???
                containing the neighbor refs
                (??? is at most max_neighbors * len(neighbor_rels))
            rels: SequenceBatch of shape num_nodes x ???
                containing the relation indices
        """
        g = web_page.graph
        batch_neighbors = [[] for _ in range(len(web_page.nodes))]
        batch_rels = [[] for _ in range(len(web_page.nodes))]
        for src, tgts in g.nodes.items():
            # Group by relation
            rel_to_tgts = defaultdict(list)
            for tgt, rels in tgts.items():
                for rel in rels:
                    rel_to_tgts[rel].append(tgt)
            # Sample if needed
            for rel, index in self._neighbor_rels.items():
                tgts = rel_to_tgts[rel]
                random.shuffle(tgts)
                if not tgts:
                    continue
                if len(tgts) > self._max_neighbors:
                    tgts = tgts[:self._max_neighbors]
                batch_neighbors[src].extend(tgts)
                batch_rels[src].extend([index] * len(tgts))
        # Create SequenceBatches
        max_len = max(len(x) for x in batch_neighbors)
        batch_mask = []
        for neighbors, rels in zip(batch_neighbors, batch_rels):
            assert len(neighbors) == len(rels)
            this_len = len(neighbors)
            batch_mask.append([1.] * this_len + [0.] * (max_len - this_len))
            neighbors.extend([0] * (max_len - this_len))
            rels.extend([0] * (max_len - this_len))
        return (SequenceBatch(V(torch.tensor(batch_neighbors, dtype=torch.long)),
                              V(torch.tensor(batch_mask, dtype=torch.float32))),
                SequenceBatch(V(torch.tensor(batch_rels, dtype=torch.long)),
                              V(torch.tensor(batch_mask, dtype=torch.float32))))
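
Every scorer in AlignmentModel starts from the same pairwise tensor: torch.bmm of the expanded phrase embeddings with the permuted node-text embeddings, i.e. one dot product per (phrase token, text token) pair. A shape-only sketch with illustrative sizes:

import torch

num_nodes, phrase_len, text_len, embed_dim = 6, 4, 10, 32
embedded_phrase = torch.randn(1, phrase_len, embed_dim)
embedded_texts = torch.randn(num_nodes, text_len, embed_dim)

batch_phrase = embedded_phrase.expand(num_nodes, -1, -1)               # num_nodes x phrase_len x embed_dim
pairwise = torch.bmm(batch_phrase, embedded_texts.permute(0, 2, 1))    # num_nodes x phrase_len x text_len
assert pairwise.shape == (num_nodes, phrase_len, text_len)
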
Example #18
class BaseDOMElementEmbedder(Embedder):
    """Embeds a single DOMElement based on its text, tag and value."""
    def __init__(self,
                 utterance_embedder,
                 tag_embed_dim,
                 value_embed_dim,
                 tampered_embed_dim,
                 classes_embed_dim,
                 max_classes=2000,
                 max_tags=100):
        """
        Args:
            utterance_embedder (UtteranceEmbedder)
            tag_embed_dim (int): embedding dim of tags
            value_embed_dim (int): embedding dim of the element values
            tampered_embed_dim (int): embedding dim of the tampered attribute
            classes_embed_dim (int): embedding dim of the classes
            max_classes (int): maximum number of supported classes to embed
            max_tags (int): maximum number of supported tags to embed
        """
        super(BaseDOMElementEmbedder, self).__init__()

        self._utterance_embedder = utterance_embedder
        self._tag_embedder = TokenEmbedder(LazyInitEmbeddings(
            tag_embed_dim, max_tags),
                                           trainable=True)
        self._value_embedder = TokenEmbedder(
            DOMValueEmbeddings(value_embed_dim), trainable=True)
        self._tampered_embedder = TokenEmbedder(
            BoolEmbeddings(tampered_embed_dim), trainable=True)
        self._classes_embedder = TokenEmbedder(LazyInitEmbeddings(
            classes_embed_dim, max_classes),
                                               trainable=True)
        self._colors_dim = 8  # 4 (rgba) for fg and 4 for bg
        self._coords_dim = 2  # left and top

    @classmethod
    def from_config(cls, utterance_embedder, config):
        """Constructs a BaseDOMElementEmbedder from a config.

        Args:
            utterance_embedder (UtteranceEmbedder): the utterance embedder
            config (Config): has tag_embed_dim, value_embed_dim,
                tampered_embed_dim, classes_embed_dim

        Returns:
            BaseDOMElementEmbedder
        """
        return cls(utterance_embedder, config.tag_embed_dim,
                   config.value_embed_dim, config.tampered_embed_dim,
                   config.classes_embed_dim)

    def forward(self, dom_elem):
        """Embeds a batch of DOMElements.

        Args:
            dom_elem (list[list[DOMElement]]): batch of list of DOM. Each
                batch must already be padded to have the same number of DOM
                elements.

        Returns:
            Variable(FloatTensor): batch x num_dom_elems x embed_dim
        """
        # Check that the batches are rectangular
        for dom_list in dom_elem:
            assert len(dom_list) == len(dom_elem[0])

        num_dom_elems = len(dom_elem[0])
        dom_elem = flatten(dom_elem)

        # (batch * max_dom_num) x lstm_dim
        text_embeddings = []
        for batch in as_batches(dom_elem, 100):
            final_states, combined_states = self._utterance_embedder(
                [word_tokenize(dom.text) for dom in batch])
            text_embeddings.append(final_states)
        text_embeddings = torch.cat(text_embeddings, 0)

        # (batch * max_dom_num) x tag_embed_dim
        tag_embeddings = self._tag_embedder.embed_tokens(
            [dom.tag for dom in dom_elem])

        value_embeddings = self._value_embedder.embed_tokens(
            [bool(dom.value) for dom in dom_elem])

        tampered_embeddings = self._tampered_embedder.embed_tokens(
            [dom.tampered for dom in dom_elem])

        class_embeddings = self._classes_embedder.embed_tokens(
            [dom.classes for dom in dom_elem])

        # (batch * max_dom_num) x 4
        fg_colors = [
            GPUVariable(torch.FloatTensor(elem.fg_color)) for elem in dom_elem
        ]
        fg_colors = torch.stack(fg_colors)
        bg_colors = [
            GPUVariable(torch.FloatTensor(elem.bg_color)) for elem in dom_elem
        ]
        bg_colors = torch.stack(bg_colors)

        # (batch * max_dom_num) x 2
        coords = [
            GPUVariable(
                torch.FloatTensor((float(elem.left) / positions.IMAGE_COLS,
                                   float(elem.top) / positions.IMAGE_ROWS)))
            for elem in dom_elem
        ]
        coords = torch.stack(coords)

        # (batch * max_dom_num) * dom_embed_dim
        dom_embeddings = torch.cat(
            (text_embeddings, tag_embeddings, value_embeddings,
             tampered_embeddings, class_embeddings, coords, fg_colors,
             bg_colors),
            dim=1)

        # batch x max_dom_num x dom_embed_dim
        return dom_embeddings.view(-1, num_dom_elems, self.embed_dim)

    @property
    def embed_dim(self):
        return self._tag_embedder.embed_dim + \
               self._utterance_embedder.embed_dim + \
               self._value_embedder.embed_dim + \
               self._tampered_embedder.embed_dim + \
               self._colors_dim + self._coords_dim + \
               self._classes_embedder.embed_dim
Example #19
class AllanBaseEmbedder(nn.Module):
    def __init__(self,
                 dim,
                 utterance_embedder,
                 recursive_texts,
                 attr_embed_dim,
                 max_attr_tokens,
                 min_id_freq,
                 min_class_freq,
                 dropout,
                 ablate_text=False,
                 ablate_attrs=False):
        """
        Args:
            dim (int): Target embedding dimension
            utterance_embedder (UtteranceEmbedder)
            recursive_texts (bool): For node text, whether to recursively combine the
                texts of the descendants
            attr_embed_dim (int): Size of each attribute embedding
            max_attr_tokens (int): Limit the number of attribute tokens to embed
            min_id_freq (int): Minimum token frequency of tokens in id vocab
            min_class_freq (int): Minimum token frequency of tokens in class vocab
            dropout (float): Dropout rate
            ablate_text (bool): If True, ignore the node text
            ablate_attrs (bool): If True, ignore the node attributes
        """
        super(AllanBaseEmbedder, self).__init__()
        self._dim = dim

        # Text embedder
        self._utterance_embedder = utterance_embedder
        self._max_words = utterance_embedder.max_words
        self._recursive_texts = recursive_texts
        self.ablate_text = ablate_text
        self.ablate_attrs = ablate_attrs

        # Attribute embedders
        self._attr_embed_dim = attr_embed_dim

        tags = [UNK, EOS] + TAGS
        self._tag_embedder = TokenEmbedder(
            RandomEmbeddings(tags, attr_embed_dim))

        ids = read_frequency_vocab('frequent-ids', min_id_freq)
        self._id_embedder = AverageUtteranceEmbedder(
            TokenEmbedder(RandomEmbeddings(ids, attr_embed_dim)),
            max_attr_tokens)
        # self._id_embedder = attr_embedder

        classes = read_frequency_vocab('frequent-classes', min_class_freq)
        self._classes_embedder = AverageUtteranceEmbedder(
            TokenEmbedder(RandomEmbeddings(classes, attr_embed_dim)),
            max_attr_tokens)
        # self._classes_embedder = attr_embedder
        coords_dim = 3

        self._other_embedder = self.utterance_embedder

        # Combine
        input_dim = (2 * self._utterance_embedder.embed_dim +
                     3 * attr_embed_dim + coords_dim)
        self.dropout = nn.Dropout(dropout)

        self.fc = nn.Linear(input_dim, dim)

    @property
    def embed_dim(self):
        return self._dim

    @property
    def token_embedder(self):
        return self._utterance_embedder.token_embedder

    @property
    def utterance_embedder(self):
        return self._utterance_embedder

    def forward(self, nodes):
        """Embeds a batch of Nodes.

        Args:
            nodes (list[Node])
        Returns:
            embeddings (Tensor): num_nodes x embed_dim
        """
        texts = []
        for node in nodes:
            if not self.ablate_text:
                if self._recursive_texts:
                    text = ' '.join(node.all_texts(max_words=self._max_words))
                else:
                    text = node.text or ''
                texts.append(word_tokenize2(text))
            else:
                texts.append([])
        text_embeddings = self._utterance_embedder(texts)

        # num_nodes x attr_embed_dim
        tags = [node.tag for node in nodes]
        tag_embeddings = self._tag_embedder.embed_tokens(tags)
        # num_nodes x attr_embed_dim
        if not self.ablate_attrs:
            ids = [word_tokenize2(node.id_) for node in nodes]
        else:
            ids = [[] for node in nodes]
        id_embeddings = self._id_embedder(ids)
        # num_nodes x attr_embed_dim
        if not self.ablate_attrs:
            classes = [
                word_tokenize2(' '.join(node.classes)) for node in nodes
            ]
        else:
            classes = [[] for node in nodes]
        class_embeddings = self._classes_embedder(classes)

        if not self.ablate_attrs:
            other = [
                word_tokenize2(semantic_attrs(node.attributes))
                for node in nodes
            ]
        else:
            other = [[] for node in nodes]
        other_embeddings = self._other_embedder(other)
        # num_nodes x 3
        coords = V(
            FT([[node.x_ratio, node.y_ratio,
                 float(node.visible)] for node in nodes]))

        # num_nodes x dom_embed_dim
        dom_embeddings = torch.cat(
            (text_embeddings, tag_embeddings, id_embeddings, class_embeddings,
             other_embeddings, coords),
            dim=1)
        #dom_embeddings = text_embeddings
        return self.fc(dom_embeddings)
Example #20
    def __init__(self, phrase_embedder, token_embedder,
                 max_words, node_filter, top_k=5, dropout=0.3,
                 ablate_text=False, ablate_attrs=False, use_neighbors=False, use_tags=False,
                 neighbor_rels=['above', 'left'], max_neighbors=1):
            # neighbor_rels=['above','below','left','right'], max_neighbors=1):
        """
        Args:
            node_filter (callable[(WebPage, web_page_code) -> list]):
                A function that returns a mask array of length len(web_page.nodes)
                indicating whether each node is a valid candidate
            top_k (int): Number of predictions to return
        """
        super(AlignmentModel, self).__init__()

        self.phrase_embedder = phrase_embedder

        self.ablate_text = ablate_text
        self.ablate_attrs = ablate_attrs
        self.use_neighbors = use_neighbors

        conv_dim = 3
        dilation = 2
        pool_dim = 2
        # doesn't change the dimension
        self.conv2d = nn.Conv2d(1, 1, conv_dim, padding=conv_dim-1)
        self.conv2d_dilated = nn.Conv2d(1, 1, conv_dim, padding=conv_dim-1, dilation=dilation)
        self.pooler = nn.MaxPool2d(pool_dim)
        self.score_dim = int(math.pow(math.ceil((max_words+1) / float(pool_dim)), 2))
        self.scorer = nn.Linear(self.score_dim, 1)

        # idea: compute a bunch of latent score vectors before computing
        # logits, take a linear layer down to 1 score
        # purpose: if you want to compute scores with neighbors, you can now
        # average neighbor score vectors and Linear down to 1 score

        # neighbor_score_dim = 10

        if self.use_neighbors:
            self._max_neighbors = max_neighbors
            self._neighbor_rels = {x: i for (i, x) in enumerate(sorted(set(neighbor_rels)))}
            self.num_rels = len(neighbor_rels)
            assert all(x in GraphRels.LOOKUP for x in self._neighbor_rels)

            # score_embed_dim = int(math.ceil((self.score_dim) / float(pool_dim)))
            score_dim = self.score_dim * (self.num_rels*max_neighbors + 1)
            # self.pool_neighbors = nn.MaxPool1d(pool_dim)
            self._final_neighbor_linear = nn.Linear(score_dim, 1)
            # extra_nodes = self.num_rels * max_neighbors
        else:
            # extra_nodes = 0
            pass

        self.dropout = nn.Dropout(dropout)

        self.token_embedder = token_embedder
        self.max_words = max_words
        self.node_filter = node_filter
        self.loss = nn.CrossEntropyLoss(reduction="none")
        self.top_k = top_k

        self.use_tags = use_tags
        if self.use_tags:
            tags = [UNK, EOS] + TAGS
            tag_dim = 10
            self._tag_embedder = TokenEmbedder(RandomEmbeddings(tags, tag_dim))
            self.project_tag = nn.Linear(tag_dim + self.score_dim, self.score_dim)
Example #21
 def token_embedder(self, base_vocab, embeds_array, dynamic_vocabs):
     word_embeds = SimpleEmbeddings(embeds_array, base_vocab)
     base_embedder = TokenEmbedder(word_embeds)
     return DynamicMultiVocabTokenEmbedder(base_embedder, dynamic_vocabs, base_vocab)