def _build_editor(cls, config, num_iter, eps, momentum):
    """Build Editor.

    Args:
        config (Config): Editor config
        num_iter (int): passed through to the Editor constructor
        eps (float): passed through to the Editor constructor
        momentum (float): passed through to the Editor constructor

    Returns:
        Editor
    """
    file_path = join(data.workspace.word_vectors, config.wvec_path)
    word_embeddings = SimpleEmbeddings.from_file(file_path, config.word_dim,
                                                 vocab_size=config.vocab_size)
    word_embeddings = word_embeddings.with_special_tokens()
    source_token_embedder = TokenEmbedder(word_embeddings)
    target_token_embedder = TokenEmbedder(word_embeddings)

    if config.decoder_cell == 'SimpleDecoderCell':
        decoder_cell = SimpleDecoderCell(target_token_embedder, config.hidden_dim,
                                         config.word_dim, config.agenda_dim)
    elif config.decoder_cell == 'AttentionDecoderCell':
        decoder_cell = AttentionDecoderCell(target_token_embedder, config.agenda_dim,
                                            config.hidden_dim, config.hidden_dim,
                                            config.attention_dim,
                                            config.no_insert_delete_attn,
                                            num_layers=config.decoder_layers)
    else:
        raise ValueError('{} not implemented'.format(config.decoder_cell))

    editor = Editor(source_token_embedder, config.hidden_dim, config.agenda_dim,
                    config.edit_dim, config.lamb_reg, config.norm_eps, config.norm_max,
                    config.kill_edit, decoder_cell, config.encoder_layers,
                    num_iter, eps, momentum)
    editor = try_gpu(editor)
    return editor
def _build_model(cls, model_config, optim_config, data_config):
    """Build Editor.

    Args:
        model_config (Config): Editor config
        optim_config (Config): optimization config
        data_config (Config): dataset config

    Returns:
        (Editor, Optimizer)
    """
    file_path = join(data.workspace.word_vectors, model_config.wvec_path)
    word_embeddings = load_embeddings(file_path, model_config.word_dim,
                                      model_config.vocab_size,
                                      model_config.num_copy_tokens)
    word_dim = word_embeddings.embed_dim

    source_token_embedder = TokenEmbedder(word_embeddings, model_config.train_source_embeds)
    target_token_embedder = TokenEmbedder(word_embeddings, model_config.train_target_embeds)

    # number of input channels
    num_inputs = len(data_config.source_cols)

    decoder_cell = AttentionDecoderCell(
        target_token_embedder,
        2 * word_dim,  # 2 * word_dim because we concat base and copy vectors
        model_config.agenda_dim,
        model_config.hidden_dim,
        model_config.hidden_dim,
        model_config.attention_dim,
        num_layers=model_config.decoder_layers,
        num_inputs=num_inputs,
        dropout_prob=model_config.decoder_dropout_prob,
        disable_attention=False)

    encoder = Encoder(word_dim, model_config.agenda_dim, model_config.hidden_dim,
                      model_config.encoder_layers, num_inputs,
                      model_config.encoder_dropout_prob, False)

    copy_len = [5, 5, 40]
    model = Editor(source_token_embedder, encoder, decoder_cell, copy_lens=copy_len)
    model = try_gpu(model)

    optimizer = optim.Adam(model.parameters(), lr=optim_config.learning_rate)
    return model, optimizer
def input_embeds_list(self):
    sequences = [
        [1, 2, 3],
        [8, 4, 2, 1, 1],
        [],
    ]

    # token 1 maps to embedding [1], 2 maps to [2], and so on...
    vocab = SimpleVocab([1, 2, 3, 4, 5, 6, 7, 8])
    array = np.expand_dims(np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.float32), 1)
    token_embedder = TokenEmbedder(Bunch(vocab=vocab, array=array))

    seq_embeds = token_embedder.embed_seq_batch(SequenceBatch.from_sequences(sequences, vocab))
    return seq_embeds.split()
def __init__(self, utterance_embedder, tag_embed_dim, value_embed_dim,
             tampered_embed_dim, classes_embed_dim, max_classes=2000, max_tags=100):
    """
    Args:
        utterance_embedder (UtteranceEmbedder)
        tag_embed_dim (int): embedding dim of tags
        value_embed_dim (int): embedding dim of the value
        tampered_embed_dim (int): embedding dim of the tampered attribute
        classes_embed_dim (int): embedding dim of the classes
        max_classes (int): maximum number of supported classes to embed
        max_tags (int): maximum number of supported tags to embed
    """
    super(BaseDOMElementEmbedder, self).__init__()
    self._utterance_embedder = utterance_embedder
    self._tag_embedder = TokenEmbedder(
        LazyInitEmbeddings(tag_embed_dim, max_tags), trainable=True)
    self._value_embedder = TokenEmbedder(
        DOMValueEmbeddings(value_embed_dim), trainable=True)
    self._tampered_embedder = TokenEmbedder(
        BoolEmbeddings(tampered_embed_dim), trainable=True)
    self._classes_embedder = TokenEmbedder(
        LazyInitEmbeddings(classes_embed_dim, max_classes), trainable=True)
    self._colors_dim = 8  # 4 (rgba) for fg and 4 for bg
    self._coords_dim = 2  # left and top
def _build_editor(cls, model_config, data_config, word_embeddings, word_dim, vae_mode):
    source_token_embedder = TokenEmbedder(word_embeddings, model_config.train_source_embeds)
    target_token_embedder = TokenEmbedder(word_embeddings, model_config.train_target_embeds)

    # number of input channels
    if vae_mode:
        num_inputs = len(data_config.source_cols)
    else:
        # edit model uses num_inputs + num_inputs + 1
        num_inputs = len(data_config.source_cols) * 2 + 1

    decoder_cell = AttentionDecoderCell(
        target_token_embedder,
        2 * word_dim,  # 2 * word_dim because we concat base and copy vectors
        model_config.agenda_dim,
        model_config.hidden_dim,
        model_config.hidden_dim,
        model_config.attention_dim,
        num_layers=model_config.decoder_layers,
        num_inputs=num_inputs,
        dropout_prob=model_config.decoder_dropout_prob,
        disable_attention=vae_mode)

    if vae_mode:
        encoder = Encoder(word_dim, model_config.agenda_dim, model_config.hidden_dim,
                          model_config.encoder_layers, num_inputs,
                          model_config.encoder_dropout_prob, vae_mode,
                          model_config.vae_kappa)
    else:
        encoder = Encoder(word_dim, model_config.agenda_dim, model_config.hidden_dim,
                          model_config.encoder_layers, num_inputs,
                          model_config.encoder_dropout_prob, vae_mode)

    vae_copy_len = [5, 10, 185]
    editor_copy_len = [5, 10, 10, 5, 10, 10, 150]
    if vae_mode:
        model = Editor(source_token_embedder, encoder, decoder_cell, vae_copy_len)
    else:
        model = Editor(source_token_embedder, encoder, decoder_cell, editor_copy_len)
    model = try_gpu(model)
    return model
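# Worked note (not from the original source): the copy-length lists appear to
# track the number of input channels. With a hypothetical 3-column
# data_config.source_cols, vae_mode gives num_inputs = 3 (matching the 3
# entries of vae_copy_len), while the edit model gives
# num_inputs = 3 * 2 + 1 = 7 (matching the 7 entries of editor_copy_len).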
def get_stupid_embedder(config):
    """Create a new StupidEmbedder based on the config.

    Args:
        config (Config): the root config

    Returns:
        StupidEmbedder
    """
    cm = config.model
    cmu = cm.utterance_embedder
    cmt = cm.node_embedder.token_embedder

    # Token embedder
    magnitude_filename = cmt.magnitude_filename
    vocab_filename = cmt.vocab_filename
    word_embeddings = MagnitudeEmbeddings(magnitude_filename, vocab_filename,
                                          cmt.vocab_size, cmt.word_embed_dim)
    token_embedder = TokenEmbedder(word_embeddings, trainable=cmu.trainable)

    # Utterance embedder
    if cmu.type == 'average':
        utterance_embedder = AverageUtteranceEmbedder(token_embedder, cmu.max_words)
    elif cmu.type == 'lstm':
        utterance_embedder = LSTMUtteranceEmbedder(token_embedder, cmu.lstm_dim, cmu.max_words)
    else:
        raise ValueError('Unknown UtteranceEmbedder type {}'.format(cmu.type))

    # Embedder
    embedder = StupidEmbedder(cm.dim, utterance_embedder, cm.dropout)
    return embedder
def _build_model(cls, config):
    file_path = join(data.workspace.word_vectors, config.model.wvec_path)
    word_embeddings = SimpleEmbeddings.from_file(
        file_path, config.model.word_dim, vocab_size=config.model.vocab_size)
    word_embeddings = word_embeddings.with_special_tokens()
    token_embedder = TokenEmbedder(word_embeddings)

    model = None
    if config.model.type == 0:  # regular language model
        model = LanguageModel(token_embedder, config.model.hidden_dim,
                              config.model.agenda_dim, config.model.num_layers,
                              cls._make_logger())
    elif config.model.type == 1:  # SVAE
        model = NoisyLanguageModel(token_embedder, config.model.hidden_dim,
                                   config.model.agenda_dim, config.model.num_layers,
                                   config.model.kl_weight_steps, config.model.kl_weight_rate,
                                   config.model.kl_weight_cap, config.model.dci_keep_rate,
                                   cls._make_logger())
    assert model is not None

    model = try_gpu(model)
    optimizer = optim.Adam(model.parameters(), lr=config.optim.learning_rate)
    return model, optimizer
def seq_embedder(trainable):
    sent_dim = model_config.sent_dim
    token_embedder = TokenEmbedder(word_embeds, trainable)
    if trainable:
        # if trainable, also add a linear transform
        transform = Linear(token_embedder.embed_dim, sent_dim)
    else:
        transform = lambda x: x
    return BOWSequenceEmbedder(token_embedder, embed_dim=sent_dim,
                               pool=model_config.pool_method, transform=transform)
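# Usage sketch (not from the original source): how this factory might be
# invoked, assuming `word_embeds` and `model_config` are in scope as above.
frozen_embedder = seq_embedder(trainable=False)    # pools fixed word vectors; identity transform
trainable_embedder = seq_embedder(trainable=True)  # fine-tunes embeddings and projects to sent_dim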
def test_embedding_from_array(self):
    emb = TokenEmbedder._embedding_from_array(np.array([[9, 9], [8, 7]], dtype=np.float32))
    assert isinstance(emb, Embedding)
    values = emb(GPUVariable(torch.LongTensor([[0, 0], [1, 0]])))
    assert_tensor_equal(values, [
        [[9, 9], [9, 9]],
        [[8, 7], [9, 9]],
    ])
def __init__(self, dim, utterance_embedder, recursive_texts, attr_embed_dim,
             max_attr_tokens, min_id_freq, min_class_freq, dropout):
    """
    Args:
        dim (int): Target embedding dimension
        utterance_embedder (UtteranceEmbedder)
        recursive_texts (bool): For node text, whether to recursively
            combine the texts of the descendants
        attr_embed_dim (int): Size of each attribute embedding
        max_attr_tokens (int): Limit the number of attribute tokens to embed
        min_id_freq (int): Minimum token frequency of tokens in id vocab
        min_class_freq (int): Minimum token frequency of tokens in class vocab
        dropout (float): Dropout rate
    """
    super(ProppyBaseEmbedder, self).__init__()
    self._dim = dim

    # Text embedder
    self._utterance_embedder = utterance_embedder
    self._max_words = utterance_embedder.max_words
    self._recursive_texts = recursive_texts

    # Attribute embedders
    self._attr_embed_dim = attr_embed_dim
    tags = [UNK, EOS] + TAGS
    self._tag_embedder = TokenEmbedder(RandomEmbeddings(tags, attr_embed_dim))
    ids = read_frequency_vocab('frequent-ids', min_id_freq)
    self._id_embedder = AverageUtteranceEmbedder(
        TokenEmbedder(RandomEmbeddings(ids, attr_embed_dim)), max_attr_tokens)
    classes = read_frequency_vocab('frequent-classes', min_class_freq)
    self._classes_embedder = AverageUtteranceEmbedder(
        TokenEmbedder(RandomEmbeddings(classes, attr_embed_dim)), max_attr_tokens)
    coords_dim = 3

    # Combine
    input_dim = (self._utterance_embedder.embed_dim + 3 * attr_embed_dim + coords_dim)
    self.dropout = nn.Dropout(dropout)
    # self.fc = nn.Linear(self._utterance_embedder.embed_dim, dim)
    self.fc = nn.Linear(input_dim, dim)
def get_proppy_embedder(config):
    """Create a new ProppyEmbedder based on the config.

    Args:
        config (Config): the root config

    Returns:
        ProppyEmbedder
    """
    cm = config.model
    cmu = cm.utterance_embedder
    cmt = cm.node_embedder.token_embedder

    # Token embedder
    magnitude_filename = cmt.magnitude_filename
    vocab_filename = cmt.vocab_filename
    word_embeddings = MagnitudeEmbeddings(magnitude_filename, vocab_filename,
                                          cmt.vocab_size, cmt.word_embed_dim)
    token_embedder = TokenEmbedder(word_embeddings, trainable=cmu.trainable)

    # Utterance embedder
    if cmu.type == 'average':
        utterance_embedder = AverageUtteranceEmbedder(token_embedder, cmu.max_words)
    elif cmu.type == 'lstm':
        utterance_embedder = LSTMUtteranceEmbedder(token_embedder, cmu.lstm_dim, cmu.max_words)
    elif cmu.type == 'attention_lstm':
        utterance_embedder = AttentionUtteranceEmbedder(token_embedder, cmu.lstm_dim, cmu.max_words)
    else:
        raise ValueError('Unknown UtteranceEmbedder type {}'.format(cmu.type))

    # Base node embedder
    cmb = cm.node_embedder.base_embedder
    base_embedder = ProppyBaseEmbedder(cm.dim, utterance_embedder, cmb.recursive_texts,
                                       cmb.attr_embed_dim, cmb.max_attr_tokens,
                                       cmb.min_id_freq, cmb.min_class_freq, cm.dropout)

    # Aggregator
    cmpr = cm.node_embedder.propagation
    if cmpr.iterations == 0:
        return base_embedder
    if cmpr.aggregator == 'pool_mlp':
        aggregator = PoolMLPAggregator(cm.dim, cm.dropout)
    elif cmpr.aggregator == 'mlp_pool':
        aggregator = MLPPoolAggregator(cm.dim, cm.dropout)
    else:
        raise ValueError('Unknown Aggregator {}'.format(cmpr.aggregator))

    # Information propagation
    full_embedder = ProppyEmbedder(cm.dim, base_embedder, cmpr.iterations,
                                   cmpr.neighbor_rels, cmpr.max_neighbors, aggregator)
    return full_embedder
def from_config(cls, config):
    """Constructs the appropriate UtteranceEmbedder from a config.

    Args:
        config (Config)

    Returns:
        UtteranceEmbedder
    """
    if config.type == "glove":
        glove_embeddings = GloveEmbeddings(config.vocab_size)
        token_embedder = TokenEmbedder(glove_embeddings, trainable=False)
        utterance_embedder = cls(token_embedder, config.lstm_dim)
        return utterance_embedder
    else:
        raise ValueError(
            "{} not a supported type of utterance embedder".format(config.type))
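# Hypothetical sketch (not from the original source): the minimal config this
# classmethod reads, using Bunch as a stand-in for the project's Config object
# and illustrative values. `cls` would be the concrete UtteranceEmbedder
# subclass the method is defined on.
example_config = Bunch(type="glove", vocab_size=10000, lstm_dim=128)
# utterance_embedder = SomeUtteranceEmbedder.from_config(example_config)  # hypothetical class name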
def get_allan_embedder(config):
    """Create a new AllanEmbedder based on the config.

    Args:
        config (Config): the root config

    Returns:
        AllanEmbedder
    """
    cm = config.model
    cmu = cm.utterance_embedder
    cmt = cm.node_embedder.token_embedder
    # cma = cm.node_embedder.attr_embedder
    cmb = cm.node_embedder.base_embedder

    # Token embedder
    magnitude_filename = cmt.magnitude_filename
    vocab_filename = cmt.vocab_filename
    word_embeddings = MagnitudeEmbeddings(magnitude_filename, vocab_filename,
                                          cmt.vocab_size, cmt.word_embed_dim)
    token_embedder = TokenEmbedder(word_embeddings, trainable=cmt.trainable)
    lang = cmt.lang

    # Utterance embedder
    utterance_embedder = make_embedder(token_embedder, cmu, lang)

    # Attribute embedder
    # attr_embedder = make_embedder(attr_token_embedder, cma)
    # AverageUtteranceEmbedder(TokenEmbedder(RandomEmbeddings(ids, attr_embed_dim)), max_attr_tokens)
    attr_embedder = utterance_embedder

    # Base node embedder
    base_embedder = AllanBaseEmbedder(cm.dim, utterance_embedder, attr_embedder,
                                      cmb.recursive_texts, cmt.word_embed_dim,
                                      cmb.max_attr_tokens, cmb.min_id_freq,
                                      cmb.min_class_freq, cm.dropout,
                                      ablate_text=cm.ablate_text,
                                      ablate_attrs=cm.ablate_attrs)
    return base_embedder
def embedder(self, request):
    vocab = SimpleVocab(['<unk>', '<start>', '<stop>'] + ['a', 'b', 'c'])
    arr = np.eye(len(vocab), dtype=np.float32)
    word_embeddings = Bunch(vocab=vocab, array=arr)
    return TokenEmbedder(word_embeddings, trainable=request.param)
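# Usage sketch (not from the original source): with the identity array above,
# each token embeds to its one-hot row. embed_tokens is assumed to accept a
# list of tokens and return a num_tokens x embed_dim tensor, as in the other
# snippets in this collection.
_vocab = SimpleVocab(['<unk>', '<start>', '<stop>', 'a', 'b', 'c'])
_embedder = TokenEmbedder(Bunch(vocab=_vocab, array=np.eye(len(_vocab), dtype=np.float32)),
                          trainable=False)
_one_hots = _embedder.embed_tokens(['a', 'b'])  # rows 3 and 4 of the 6x6 identity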
class ProppyBaseEmbedder(nn.Module):

    def __init__(self, dim, utterance_embedder, recursive_texts, attr_embed_dim,
                 max_attr_tokens, min_id_freq, min_class_freq, dropout):
        """
        Args:
            dim (int): Target embedding dimension
            utterance_embedder (UtteranceEmbedder)
            recursive_texts (bool): For node text, whether to recursively
                combine the texts of the descendants
            attr_embed_dim (int): Size of each attribute embedding
            max_attr_tokens (int): Limit the number of attribute tokens to embed
            min_id_freq (int): Minimum token frequency of tokens in id vocab
            min_class_freq (int): Minimum token frequency of tokens in class vocab
            dropout (float): Dropout rate
        """
        super(ProppyBaseEmbedder, self).__init__()
        self._dim = dim

        # Text embedder
        self._utterance_embedder = utterance_embedder
        self._max_words = utterance_embedder.max_words
        self._recursive_texts = recursive_texts

        # Attribute embedders
        self._attr_embed_dim = attr_embed_dim
        tags = [UNK, EOS] + TAGS
        self._tag_embedder = TokenEmbedder(RandomEmbeddings(tags, attr_embed_dim))
        ids = read_frequency_vocab('frequent-ids', min_id_freq)
        self._id_embedder = AverageUtteranceEmbedder(
            TokenEmbedder(RandomEmbeddings(ids, attr_embed_dim)), max_attr_tokens)
        classes = read_frequency_vocab('frequent-classes', min_class_freq)
        self._classes_embedder = AverageUtteranceEmbedder(
            TokenEmbedder(RandomEmbeddings(classes, attr_embed_dim)), max_attr_tokens)
        coords_dim = 3

        # Combine
        input_dim = (self._utterance_embedder.embed_dim + 3 * attr_embed_dim + coords_dim)
        self.dropout = nn.Dropout(dropout)
        # self.fc = nn.Linear(self._utterance_embedder.embed_dim, dim)
        self.fc = nn.Linear(input_dim, dim)

    @property
    def embed_dim(self):
        return self._dim

    @property
    def token_embedder(self):
        return self._utterance_embedder.token_embedder

    @property
    def utterance_embedder(self):
        return self._utterance_embedder

    def forward(self, nodes):
        """Embeds a batch of Nodes.

        Args:
            nodes (list[Node])

        Returns:
            embeddings (Tensor): num_nodes x embed_dim
        """
        texts = []
        utterance_embedder = self._utterance_embedder
        for node in nodes:
            if self._recursive_texts:
                text = ' '.join(node.all_texts(max_words=self._max_words))
            else:
                text = node.text or ''
            texts.append(utterance_embedder.tokenize(text.lower()))
        text_embeddings = self._utterance_embedder(texts)

        # num_nodes x attr_embed_dim
        tag_embeddings = self._tag_embedder.embed_tokens([node.tag for node in nodes])

        # num_nodes x attr_embed_dim
        id_embedder = self._id_embedder
        id_embeddings = self._id_embedder(
            [id_embedder.tokenize(node.id_) for node in nodes])

        # num_nodes x attr_embed_dim
        classes_embedder = self._classes_embedder
        class_embeddings = self._classes_embedder(
            [classes_embedder.tokenize(' '.join(node.classes)) for node in nodes])

        # num_nodes x 3
        coords = V(torch.tensor(
            [[elem.x_ratio, elem.y_ratio, float(elem.visible)] for elem in nodes],
            dtype=torch.float32))

        # num_nodes x dom_embed_dim
        dom_embeddings = torch.cat((text_embeddings, tag_embeddings, id_embeddings,
                                    class_embeddings, coords), dim=1)
        # dom_embeddings = text_embeddings
        return self.fc(dom_embeddings)
def _build_model(cls, model_config, optim_config, data_config):
    """Build the EditRetriever model.

    Args:
        model_config (Config): Editor config
        optim_config (Config): optimization config
        data_config (Config): dataset config

    Returns:
        (EditRetriever, Optimizer)
    """
    file_path = join(data.workspace.word_vectors, model_config.wvec_path)
    word_embeddings = load_embeddings(file_path, model_config.word_dim,
                                      model_config.vocab_size,
                                      model_config.num_copy_tokens)
    word_dim = word_embeddings.embed_dim

    edit_model = cls._build_editor(model_config, data_config, word_embeddings,
                                   word_dim, vae_mode=False)

    # VAE retriever
    vocab_dict = word_embeddings.vocab._word2index
    encoder = Encoder(word_dim, model_config.agenda_dim, model_config.hidden_dim,
                      model_config.encoder_layers, len(data_config.source_cols),
                      model_config.encoder_dropout_prob, use_vae=True,
                      kappa=model_config.vae_kappa, use_target=False)
    source_token_embedder = TokenEmbedder(word_embeddings, model_config.train_source_embeds)
    target_token_embedder = TokenEmbedder(word_embeddings, model_config.train_target_embeds)

    ret_copy_len = [5, 10, 165]
    num_inputs = len(data_config.source_cols)
    decoder_cell = AttentionDecoderCell(
        target_token_embedder,
        2 * word_dim,  # 2 * word_dim because we concat base and copy vectors
        model_config.agenda_dim,
        model_config.hidden_dim,
        model_config.hidden_dim,
        model_config.attention_dim,
        num_layers=model_config.decoder_layers,
        num_inputs=num_inputs,
        dropout_prob=model_config.decoder_dropout_prob,
        disable_attention=True)
    vae_model = VAERetriever(source_token_embedder, encoder, decoder_cell, ret_copy_len)
    ret_model = vae_model

    vae_ret_model = EditRetriever(vae_model, ret_model, edit_model)
    vae_ret_model = try_gpu(vae_ret_model)

    optimizer = optim.Adam(vae_ret_model.parameters(), lr=optim_config.learning_rate)
    # optimizer = optim.SGD(vae_ret_model.parameters(), lr=optim_config.learning_rate)
    return vae_ret_model, optimizer
class AlignmentModel(nn.Module):

    def __init__(self, phrase_embedder, token_embedder, max_words, node_filter,
                 top_k=5, dropout=0.3, ablate_text=False, ablate_attrs=False,
                 use_neighbors=False, use_tags=False,
                 neighbor_rels=['above', 'left'], max_neighbors=1):
                 # neighbor_rels=['above', 'below', 'left', 'right'], max_neighbors=1):
        """
        Args:
            node_filter (callable[(WebPage, web_page_code) -> list]): A function
                that returns a mask array of length len(web_page.nodes)
                indicating whether the node is a valid candidate
            top_k (int): Number of predictions to return
        """
        super(AlignmentModel, self).__init__()
        self.phrase_embedder = phrase_embedder
        self.ablate_text = ablate_text
        self.ablate_attrs = ablate_attrs
        self.use_neighbors = use_neighbors

        conv_dim = 3
        dilation = 2
        pool_dim = 2  # doesn't change the dimension
        self.conv2d = nn.Conv2d(1, 1, conv_dim, padding=conv_dim - 1)
        self.conv2d_dilated = nn.Conv2d(1, 1, conv_dim, padding=conv_dim - 1, dilation=dilation)
        self.pooler = nn.MaxPool2d(pool_dim)
        self.score_dim = int(math.pow(math.ceil((max_words + 1) / float(pool_dim)), 2))
        self.scorer = nn.Linear(self.score_dim, 1)

        # idea: compute a bunch of latent score vectors before computing
        # logits, take a linear layer down to 1 score
        # purpose: if you want to compute scores with neighbors, you can now
        # average neighbor score vectors and Linear down to 1 score
        # neighbor_score_dim = 10
        if self.use_neighbors:
            self._max_neighbors = max_neighbors
            self._neighbor_rels = {x: i for (i, x) in enumerate(sorted(set(neighbor_rels)))}
            self.num_rels = len(neighbor_rels)
            assert all(x in GraphRels.LOOKUP for x in self._neighbor_rels)
            # score_embed_dim = int(math.ceil((self.score_dim) / float(pool_dim)))
            score_dim = self.score_dim * (self.num_rels * max_neighbors + 1)
            # self.pool_neighbors = nn.MaxPool1d(pool_dim)
            self._final_neighbor_linear = nn.Linear(score_dim, 1)
            # extra_nodes = self.num_rels * max_neighbors
        else:
            # extra_nodes = 0
            pass

        self.dropout = nn.Dropout(dropout)
        self.token_embedder = token_embedder
        self.max_words = max_words
        self.node_filter = node_filter
        self.loss = nn.CrossEntropyLoss(reduction="none")
        self.top_k = top_k
        self.use_tags = use_tags
        if self.use_tags:
            tags = [UNK, EOS] + TAGS
            tag_dim = 10
            self._tag_embedder = TokenEmbedder(RandomEmbeddings(tags, tag_dim))
            self.project_tag = nn.Linear(tag_dim + self.score_dim, self.score_dim)

    def forward(self, web_page, examples, logits_only=False):
        """Compute predictions and loss.

        Args:
            web_page (WebPage): The web page of the examples
            examples (list[PhraseNodeExample]): Must be from the same web page.
            logits_only (bool)

        Returns:
            logits (Tensor): num_phrases x num_nodes
                Each entry (i,j) is the logit for p(node_j | phrase_i)
            losses (Tensor): num_phrases
            predictions (Tensor): num_phrases
        """
        phrase_embedder = self.phrase_embedder

        def max_scorer(pairwise_scores):
            """
            Args:
                pairwise_scores: num_nodes x phrase_len x max_text_len
            """
            scores = torch.max(pairwise_scores, dim=1)[0]
            return torch.max(scores, dim=1)[0]

        def cnn_scorer(pairwise_scores):
            """
            Args:
                pairwise_scores: num_nodes x phrase_len x max_text_len
            """
            scores = torch.unsqueeze(pairwise_scores, dim=1)
            scores = self.conv2d(scores)
            scores = self.conv2d_dilated(scores)
            scores = self.pooler(scores)
            scores = torch.squeeze(scores, dim=1)
            # dim = scores.shape[1] * scores.shape[2]
            scores = scores.view(-1, self.score_dim)
            if self.use_tags:
                tags = [node.tag for node in web_page.nodes]
                tag_embeddings = self._tag_embedder.embed_tokens(tags)
                scores = torch.cat((scores, tag_embeddings), dim=1)
                scores = self.project_tag(scores)
            scores = self.scorer(scores)
            scores = torch.squeeze(scores, dim=1)
            return scores

        def neighbor_cnn_scorer(pairwise_scores):
            """
            Args:
                pairwise_scores: num_nodes x phrase_len x max_text_len
            """
            scores = torch.unsqueeze(pairwise_scores, dim=1)
            scores = self.conv2d(scores)
            scores = self.conv2d_dilated(scores)
            scores = self.pooler(scores)
            scores = torch.squeeze(scores, dim=1)
            # dim = scores.shape[1] * scores.shape[2]
            scores = scores.view(-1, self.score_dim)
            if self.use_tags:
                tags = [node.tag for node in web_page.nodes]
                tag_embeddings = self._tag_embedder.embed_tokens(tags)
                scores = torch.cat((scores, tag_embeddings), dim=1)
                scores = self.project_tag(scores)
            return scores

        # Tokenize the nodes
        # num_nodes x text_length x embed_dim
        texts = []
        for node in web_page.nodes:
            text = ' '.join(node.all_texts(max_words=self.max_words))
            output = []
            if not self.ablate_text:
                output += phrase_embedder.tokenize(text)
            if not self.ablate_attrs:
                # TODO better way to include attributes?
                output += phrase_embedder.tokenize(semantic_attrs(node.attributes))
            texts.append(output)
        embedded_texts = embed_tokens(self.token_embedder, self.max_words, texts)
        embedded_texts_values = self.dropout(embedded_texts.values)
        embedded_texts = embedded_texts_values * embedded_texts.mask.unsqueeze(2)

        # Tokenize the phrases
        # num_phrases x phrase_length x embed_dim
        logits = []
        if not self.use_neighbors:
            for example in examples:
                phrase = [phrase_embedder.tokenize(example.phrase)]
                embedded_phrase = embed_tokens(self.token_embedder, self.max_words, phrase)
                embedded_phrase_values = self.dropout(embedded_phrase.values)
                # expand: num_nodes x phrase_len x embed_dim
                batch_phrase = embedded_phrase_values.expand(len(texts), -1, -1)
                # permute embedded_texts: num_nodes x embed_dim x max_text_len
                pairwise_scores = torch.bmm(batch_phrase, embedded_texts.permute(0, 2, 1))
                # compute scores
                scores = cnn_scorer(pairwise_scores)
                logits.append(torch.unsqueeze(scores, dim=0))
        else:
            intermediate_scores = []
            for example in examples:
                phrase = [phrase_embedder.tokenize(example.phrase)]
                embedded_phrase = embed_tokens(self.token_embedder, self.max_words, phrase)
                embedded_phrase_values = self.dropout(embedded_phrase.values)
                # expand: num_nodes x phrase_len x embed_dim
                batch_phrase = embedded_phrase_values.expand(len(texts), -1, -1)
                # permuted embedded_texts: num_nodes x embed_dim x max_text_len
                pairwise_scores = torch.bmm(batch_phrase, embedded_texts.permute(0, 2, 1))
                node_score = neighbor_cnn_scorer(pairwise_scores)
                intermediate_scores.append(node_score)

            neighbors, masks = web_page.get_spatial_neighbors()
            neighbors, masks = (V(torch.tensor(neighbors, dtype=torch.long)),
                                V(torch.tensor(masks, dtype=torch.float32)))
            masks = masks.unsqueeze(dim=2)

            # each node_score tensor is parameterized by phrase
            for node_score in intermediate_scores:
                # get pairwise_scores for all neighbors...
                # neighbors, rels = self._get_neighbors(web_page)
                batch_size = len(node_score)
                neighbor_scores = torch.index_select(node_score, 0, neighbors.view(-1))
                neighbor_scores = neighbor_scores.view(batch_size, neighbors.shape[1], -1)
                neighbor_scores = neighbor_scores * masks
                if neighbor_scores.shape[1] < self.num_rels:
                    more = self.num_rels - neighbor_scores.shape[1]
                    num_nodes, _, embed_dim = neighbor_scores.shape
                    padding = V(torch.zeros(num_nodes, more, embed_dim))
                    neighbor_scores = torch.cat((neighbor_scores, padding), dim=1)
                # num_nodes x num_neighbors x intermediate_score_dim
                node_score = torch.unsqueeze(node_score, dim=1)
                scores = torch.cat((node_score, neighbor_scores), dim=1)
                scores = scores.view(node_score.shape[0], -1)
                scores = self._final_neighbor_linear(scores)
                scores = torch.squeeze(scores, dim=1)
                logits.append(torch.unsqueeze(scores, dim=0))

        logits = torch.cat(logits, dim=0)

        # Filter the candidates
        node_filter_mask = self.node_filter(web_page, examples[0].web_page_code)
        # what does this do?
        log_node_filter_mask = V(torch.tensor([0. if x else -999999. for x in node_filter_mask],
                                              dtype=torch.float32))
        logits = logits + log_node_filter_mask
        if logits_only:
            return logits

        # Losses and predictions
        targets = V(torch.tensor([web_page.xid_to_ref.get(x.target_xid, 0) for x in examples],
                                 dtype=torch.long))
        mask = V(torch.tensor([int(x.target_xid in web_page.xid_to_ref
                                   and node_filter_mask[web_page.xid_to_ref[x.target_xid]])
                               for x in examples], dtype=torch.float32))
        losses = self.loss(logits, targets) * mask
        # print '=' * 20, examples[0].web_page_code
        # print [node_filter_mask[web_page.xid_to_ref.get(x.target_xid, 0)] for x in examples]
        # print [logits.detach()[i, web_page.xid_to_ref.get(x.target_xid, 0)] for (i, x) in enumerate(examples)]
        # print logits, targets, mask, losses
        if not isfinite(losses.detach().sum()):
            # raise ValueError('Losses has NaN')
            logging.warning('Losses has NaN')
            # print losses

        # num_phrases x top_k
        top_k = min(self.top_k, len(web_page.nodes))
        predictions = torch.topk(logits, top_k, dim=1)[1]
        return logits, losses, predictions

    def _get_neighbors(self, web_page):
        """Get indices of at most |max_neighbors| neighbors for each relation.

        Args:
            web_page (WebPage)

        Returns:
            neighbors: SequenceBatch of shape num_nodes x ??? containing the
                neighbor refs (??? is at most max_neighbors * len(neighbor_rels))
            rels: SequenceBatch of shape num_nodes x ??? containing the
                relation indices
        """
        g = web_page.graph
        batch_neighbors = [[] for _ in range(len(web_page.nodes))]
        batch_rels = [[] for _ in range(len(web_page.nodes))]
        for src, tgts in g.nodes.items():
            # Group by relation
            rel_to_tgts = defaultdict(list)
            for tgt, rels in tgts.items():
                for rel in rels:
                    rel_to_tgts[rel].append(tgt)
            # Sample if needed
            for rel, index in self._neighbor_rels.items():
                tgts = rel_to_tgts[rel]
                random.shuffle(tgts)
                if not tgts:
                    continue
                if len(tgts) > self._max_neighbors:
                    tgts = tgts[:self._max_neighbors]
                batch_neighbors[src].extend(tgts)
                batch_rels[src].extend([index] * len(tgts))
        # Create SequenceBatches
        max_len = max(len(x) for x in batch_neighbors)
        batch_mask = []
        for neighbors, rels in zip(batch_neighbors, batch_rels):
            assert len(neighbors) == len(rels)
            this_len = len(neighbors)
            batch_mask.append([1.] * this_len + [0.] * (max_len - this_len))
            neighbors.extend([0] * (max_len - this_len))
            rels.extend([0] * (max_len - this_len))
        return (SequenceBatch(V(torch.tensor(batch_neighbors, dtype=torch.long)),
                              V(torch.tensor(batch_mask, dtype=torch.float32))),
                SequenceBatch(V(torch.tensor(batch_rels, dtype=torch.long)),
                              V(torch.tensor(batch_mask, dtype=torch.float32))))
class BaseDOMElementEmbedder(Embedder):
    """Embeds a single DOMElement based on its text, tag and value."""

    def __init__(self, utterance_embedder, tag_embed_dim, value_embed_dim,
                 tampered_embed_dim, classes_embed_dim, max_classes=2000, max_tags=100):
        """
        Args:
            utterance_embedder (UtteranceEmbedder)
            tag_embed_dim (int): embedding dim of tags
            value_embed_dim (int): embedding dim of the value
            tampered_embed_dim (int): embedding dim of the tampered attribute
            classes_embed_dim (int): embedding dim of the classes
            max_classes (int): maximum number of supported classes to embed
            max_tags (int): maximum number of supported tags to embed
        """
        super(BaseDOMElementEmbedder, self).__init__()
        self._utterance_embedder = utterance_embedder
        self._tag_embedder = TokenEmbedder(
            LazyInitEmbeddings(tag_embed_dim, max_tags), trainable=True)
        self._value_embedder = TokenEmbedder(
            DOMValueEmbeddings(value_embed_dim), trainable=True)
        self._tampered_embedder = TokenEmbedder(
            BoolEmbeddings(tampered_embed_dim), trainable=True)
        self._classes_embedder = TokenEmbedder(
            LazyInitEmbeddings(classes_embed_dim, max_classes), trainable=True)
        self._colors_dim = 8  # 4 (rgba) for fg and 4 for bg
        self._coords_dim = 2  # left and top

    @classmethod
    def from_config(cls, utterance_embedder, config):
        """Constructs a BaseDOMElementEmbedder from a config.

        Args:
            utterance_embedder (UtteranceEmbedder): the utterance embedder
            config (Config): has tag_embed_dim, value_embed_dim,
                tampered_embed_dim, classes_embed_dim

        Returns:
            BaseDOMElementEmbedder
        """
        return cls(utterance_embedder, config.tag_embed_dim, config.value_embed_dim,
                   config.tampered_embed_dim, config.classes_embed_dim)

    def forward(self, dom_elem):
        """Embeds a batch of DOMElements.

        Args:
            dom_elem (list[list[DOMElement]]): batch of list of DOM. Each batch
                must already be padded to have the same number of DOM elements.

        Returns:
            Variable(FloatTensor): batch x num_dom_elems x embed_dim
        """
        # Check that the batches are rectangular
        for dom_list in dom_elem:
            assert len(dom_list) == len(dom_elem[0])

        num_dom_elems = len(dom_elem[0])
        dom_elem = flatten(dom_elem)

        # (batch * max_dom_num) x lstm_dim
        text_embeddings = []
        for batch in as_batches(dom_elem, 100):
            final_states, combined_states = self._utterance_embedder(
                [word_tokenize(dom.text) for dom in batch])
            text_embeddings.append(final_states)
        text_embeddings = torch.cat(text_embeddings, 0)

        # (batch * max_dom_num) x tag_embed_dim
        tag_embeddings = self._tag_embedder.embed_tokens([dom.tag for dom in dom_elem])
        value_embeddings = self._value_embedder.embed_tokens(
            [bool(dom.value) for dom in dom_elem])
        tampered_embeddings = self._tampered_embedder.embed_tokens(
            [dom.tampered for dom in dom_elem])
        class_embeddings = self._classes_embedder.embed_tokens(
            [dom.classes for dom in dom_elem])

        # (batch * max_dom_num) x 4
        fg_colors = [GPUVariable(torch.FloatTensor(elem.fg_color)) for elem in dom_elem]
        fg_colors = torch.stack(fg_colors)
        bg_colors = [GPUVariable(torch.FloatTensor(elem.bg_color)) for elem in dom_elem]
        bg_colors = torch.stack(bg_colors)

        # (batch * max_dom_num) x 2
        coords = [GPUVariable(torch.FloatTensor(
                      (float(elem.left) / positions.IMAGE_COLS,
                       float(elem.top) / positions.IMAGE_ROWS)))
                  for elem in dom_elem]
        coords = torch.stack(coords)

        # (batch * max_dom_num) x dom_embed_dim
        dom_embeddings = torch.cat(
            (text_embeddings, tag_embeddings, value_embeddings, tampered_embeddings,
             class_embeddings, coords, fg_colors, bg_colors), dim=1)

        # batch x max_dom_num x dom_embed_dim
        return dom_embeddings.view(-1, num_dom_elems, self.embed_dim)

    @property
    def embed_dim(self):
        return self._tag_embedder.embed_dim + \
            self._utterance_embedder.embed_dim + \
            self._value_embedder.embed_dim + \
            self._tampered_embedder.embed_dim + \
            self._colors_dim + self._coords_dim + \
            self._classes_embedder.embed_dim
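# Worked example (not from the original source): the embed_dim property above
# is just the sum of the component dims. With hypothetical settings
# tag_embed_dim=20, value_embed_dim=10, tampered_embed_dim=5,
# classes_embed_dim=30 and an utterance embedder of dim 128:
example_embed_dim = 20 + 128 + 10 + 5 + 8 + 2 + 30  # = 203 (colors=8, coords=2)
# so forward() would return a batch x num_dom_elems x 203 tensor.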
class AllanBaseEmbedder(nn.Module):

    def __init__(self, dim, utterance_embedder, recursive_texts, attr_embed_dim,
                 max_attr_tokens, min_id_freq, min_class_freq, dropout,
                 ablate_text=False, ablate_attrs=False):
        """
        Args:
            dim (int): Target embedding dimension
            utterance_embedder (UtteranceEmbedder)
            recursive_texts (bool): For node text, whether to recursively
                combine the texts of the descendants
            attr_embed_dim (int): Size of each attribute embedding
            max_attr_tokens (int): Limit the number of attribute tokens to embed
            min_id_freq (int): Minimum token frequency of tokens in id vocab
            min_class_freq (int): Minimum token frequency of tokens in class vocab
            dropout (float): Dropout rate
        """
        super(AllanBaseEmbedder, self).__init__()
        self._dim = dim

        # Text embedder
        self._utterance_embedder = utterance_embedder
        self._max_words = utterance_embedder.max_words
        self._recursive_texts = recursive_texts
        self.ablate_text = ablate_text
        self.ablate_attrs = ablate_attrs

        # Attribute embedders
        self._attr_embed_dim = attr_embed_dim
        tags = [UNK, EOS] + TAGS
        self._tag_embedder = TokenEmbedder(RandomEmbeddings(tags, attr_embed_dim))
        ids = read_frequency_vocab('frequent-ids', min_id_freq)
        self._id_embedder = AverageUtteranceEmbedder(
            TokenEmbedder(RandomEmbeddings(ids, attr_embed_dim)), max_attr_tokens)
        # self._id_embedder = attr_embedder
        classes = read_frequency_vocab('frequent-classes', min_class_freq)
        self._classes_embedder = AverageUtteranceEmbedder(
            TokenEmbedder(RandomEmbeddings(classes, attr_embed_dim)), max_attr_tokens)
        # self._classes_embedder = attr_embedder
        coords_dim = 3
        self._other_embedder = self.utterance_embedder

        # Combine
        input_dim = (2 * self._utterance_embedder.embed_dim + 3 * attr_embed_dim + coords_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(input_dim, dim)

    @property
    def embed_dim(self):
        return self._dim

    @property
    def token_embedder(self):
        return self._utterance_embedder.token_embedder

    @property
    def utterance_embedder(self):
        return self._utterance_embedder

    def forward(self, nodes):
        """Embeds a batch of Nodes.

        Args:
            nodes (list[Node])

        Returns:
            embeddings (Tensor): num_nodes x embed_dim
        """
        texts = []
        for node in nodes:
            if not self.ablate_text:
                if self._recursive_texts:
                    text = ' '.join(node.all_texts(max_words=self._max_words))
                else:
                    text = node.text or ''
                texts.append(word_tokenize2(text))
            else:
                texts.append([])
        text_embeddings = self._utterance_embedder(texts)

        # num_nodes x attr_embed_dim
        tags = [node.tag for node in nodes]
        tag_embeddings = self._tag_embedder.embed_tokens(tags)

        # num_nodes x attr_embed_dim
        if not self.ablate_attrs:
            ids = [word_tokenize2(node.id_) for node in nodes]
        else:
            ids = [[] for node in nodes]
        id_embeddings = self._id_embedder(ids)

        # num_nodes x attr_embed_dim
        if not self.ablate_attrs:
            classes = [word_tokenize2(' '.join(node.classes)) for node in nodes]
        else:
            classes = [[] for node in nodes]
        class_embeddings = self._classes_embedder(classes)

        if not self.ablate_attrs:
            other = [word_tokenize2(semantic_attrs(node.attributes)) for node in nodes]
        else:
            other = [[] for node in nodes]
        other_embeddings = self._other_embedder(other)

        # num_nodes x 3
        coords = V(FT([[node.x_ratio, node.y_ratio, float(node.visible)] for node in nodes]))

        # num_nodes x dom_embed_dim
        dom_embeddings = torch.cat(
            (text_embeddings, tag_embeddings, id_embeddings, class_embeddings,
             other_embeddings, coords), dim=1)
        # dom_embeddings = text_embeddings
        return self.fc(dom_embeddings)
def __init__(self, phrase_embedder, token_embedder, max_words, node_filter,
             top_k=5, dropout=0.3, ablate_text=False, ablate_attrs=False,
             use_neighbors=False, use_tags=False,
             neighbor_rels=['above', 'left'], max_neighbors=1):
             # neighbor_rels=['above', 'below', 'left', 'right'], max_neighbors=1):
    """
    Args:
        node_filter (callable[(WebPage, web_page_code) -> list]): A function
            that returns a mask array of length len(web_page.nodes)
            indicating whether the node is a valid candidate
        top_k (int): Number of predictions to return
    """
    super(AlignmentModel, self).__init__()
    self.phrase_embedder = phrase_embedder
    self.ablate_text = ablate_text
    self.ablate_attrs = ablate_attrs
    self.use_neighbors = use_neighbors

    conv_dim = 3
    dilation = 2
    pool_dim = 2  # doesn't change the dimension
    self.conv2d = nn.Conv2d(1, 1, conv_dim, padding=conv_dim - 1)
    self.conv2d_dilated = nn.Conv2d(1, 1, conv_dim, padding=conv_dim - 1, dilation=dilation)
    self.pooler = nn.MaxPool2d(pool_dim)
    self.score_dim = int(math.pow(math.ceil((max_words + 1) / float(pool_dim)), 2))
    self.scorer = nn.Linear(self.score_dim, 1)

    # idea: compute a bunch of latent score vectors before computing
    # logits, take a linear layer down to 1 score
    # purpose: if you want to compute scores with neighbors, you can now
    # average neighbor score vectors and Linear down to 1 score
    # neighbor_score_dim = 10
    if self.use_neighbors:
        self._max_neighbors = max_neighbors
        self._neighbor_rels = {x: i for (i, x) in enumerate(sorted(set(neighbor_rels)))}
        self.num_rels = len(neighbor_rels)
        assert all(x in GraphRels.LOOKUP for x in self._neighbor_rels)
        # score_embed_dim = int(math.ceil((self.score_dim) / float(pool_dim)))
        score_dim = self.score_dim * (self.num_rels * max_neighbors + 1)
        # self.pool_neighbors = nn.MaxPool1d(pool_dim)
        self._final_neighbor_linear = nn.Linear(score_dim, 1)
        # extra_nodes = self.num_rels * max_neighbors
    else:
        # extra_nodes = 0
        pass

    self.dropout = nn.Dropout(dropout)
    self.token_embedder = token_embedder
    self.max_words = max_words
    self.node_filter = node_filter
    self.loss = nn.CrossEntropyLoss(reduction="none")
    self.top_k = top_k
    self.use_tags = use_tags
    if self.use_tags:
        tags = [UNK, EOS] + TAGS
        tag_dim = 10
        self._tag_embedder = TokenEmbedder(RandomEmbeddings(tags, tag_dim))
        self.project_tag = nn.Linear(tag_dim + self.score_dim, self.score_dim)
def token_embedder(self, base_vocab, embeds_array, dynamic_vocabs):
    word_embeds = SimpleEmbeddings(embeds_array, base_vocab)
    base_embedder = TokenEmbedder(word_embeds)
    return DynamicMultiVocabTokenEmbedder(base_embedder, dynamic_vocabs, base_vocab)