def score(self, features, axis):
    """
    Compute a log-softmax score for every label on the given axis.

    :param features: extracted feature values, of size input_size
    :param axis: axis of the label we are predicting
    :return: array with one score per label; a zero vector when no updates
             have been done yet or the axis has at most one label
    """
    super().score(features, axis)
    label_count = self.num_labels[axis]

    if not (self.updates > 0 and label_count > 1):
        self.config.print(" no updates done yet, returning zero vector.", level=4)
        return np.zeros(label_count)

    allowed = list(range(label_count))
    if dynet_config.gpu():
        # RestrictedLogSoftmax is not implemented for GPU, so the raw scores
        # are moved to the CPU before the restricted log-softmax is applied.
        on_cpu = dy.to_device(self.evaluate(features, axis), 'CPU')
        # An empty device name selects the default device, which moves the
        # result back before it is converted to a numpy array.
        log_probs = dy.to_device(dy.log_softmax(on_cpu, restrict=allowed), '')
    else:
        log_probs = dy.log_softmax(self.evaluate(features, axis), restrict=allowed)
    return log_probs.npvalue()[:label_count]
def predict_chunks_by_tokens(self, w_t, chunk_batch):
    """
    Score every prefix of a chunk token by token with the lattice RNN.

    The RNN is stepped over ``[chunk_start] + chunk_batch``. At each step the
    running log-probability is extended with the next token; from the second
    step on, an entry "prefix so far followed by chunk_end" is also recorded.
    The last entry is the running total itself, without a chunk_end term.

    :param w_t: context expression; either concatenated to every RNN output
                (``args.concat_context_vector``) or projected and fed as the
                first RNN input
    :param chunk_batch: list of batched token-id lists, one per position
    :return: list of log-probability expressions, one per position
    """
    batch_size = self.BATCH_SIZE
    end_ids = [self.lattice_vocab.chunk_end.i] * batch_size
    start_ids = [self.lattice_vocab.chunk_start.i] * batch_size

    results = []
    rnn_state = self.lattice_rnn.initial_state(dropout=self.DROPOUT)
    steps = [start_ids] + chunk_batch
    running_lp = dynet.scalarInput(0.0, device=self.args.param_device)

    for step, (current, following) in enumerate(zip(steps, steps[1:])):
        # Without context concatenation the very first input is the projected
        # context vector; every other input is a token embedding.
        if step == 0 and not self.args.concat_context_vector:
            rnn_state.add_input(self.project_main_to_lattice_init_R * w_t)
        else:
            rnn_state.add_input(dynet.pick_batch(self.vocab_R, current))

        hidden = dynet.to_device(rnn_state.output(), self.args.param_device)
        if self.DROPOUT:
            hidden = dynet.cmult(hidden, self.dropout_mask_lattice_y_t)
        if self.args.concat_context_vector:
            hidden = dynet.concatenate([hidden, w_t])

        projected = dynet.tanh(
            dynet.affine_transform([self.lattice_bias, self.lattice_R, hidden]))
        logits = dynet.affine_transform(
            [self.vocab_bias, self.vocab_R, projected])

        if step > 0:
            end_lp = -dynet.pickneglogsoftmax_batch(logits, end_ids)
            results.append(running_lp + end_lp)
        next_lp = -dynet.pickneglogsoftmax_batch(logits, following)
        running_lp = running_lp + next_lp

    results.append(running_lp)
    return results
def compress_chunk(self, chunks, masks=None):
    """
    Encode a batch of chunks into one embedding with a bidirectional RNN pair.

    Token embeddings are looked up from ``self.vocab_R``, run through the
    forward RNN in order and the backward RNN in reverse order, and the last
    output of each direction is concatenated.

    :param chunks: list of batched token-id lists, one per chunk position;
                   the batch size is taken from ``len(chunks[0])``
    :param masks: optional list of per-position batch masks; a position whose
                  mask contains no zero is passed to the RNN as ``None``
    :return: concatenated forward/backward embedding on ``args.param_device``
    """
    compression_batch_size = len(chunks[0])
    token_embeddings = [
        dynet.pick_batch(self.vocab_R, tokens) for tokens in chunks
    ]
    fwd_state = self.lattice_fwd_comp_rnn.initial_state(
        mb_size=compression_batch_size, dropout=self.DROPOUT)
    bwd_state = self.lattice_bwd_comp_rnn.initial_state(
        mb_size=compression_batch_size, dropout=self.DROPOUT)

    # Materialize the reversed sequence once: both branches hand a real list
    # to transduce, not a one-shot iterator.
    reversed_embeddings = list(reversed(token_embeddings))

    if masks is None:
        fwd_emb = fwd_state.transduce(token_embeddings)[-1]
        bwd_emb = bwd_state.transduce(reversed_embeddings)[-1]
    else:
        # Only positions that actually mask something get a tensor.
        mask_exprs = [
            dynet.inputTensor(
                mask, batched=True, device=self.args.param_device)
            if min(mask) == 0 else None
            for mask in masks
        ]
        fwd_emb = fwd_state.transduce(token_embeddings, mask_exprs)[-1]
        bwd_emb = bwd_state.transduce(
            reversed_embeddings, list(reversed(mask_exprs)))[-1]

    emb = dynet.concatenate([fwd_emb, bwd_emb])
    return dynet.to_device(emb, self.args.param_device)
def score(self, features, axis):
    """
    Return the restricted log-softmax scores of the labels on one axis.

    :param features: extracted feature values, of size input_size
    :param axis: axis of the label we are predicting
    :return: array with score for each label (zeros before the first update
             or when there is only a single label)
    """
    super().score(features, axis)
    n = self.num_labels[axis]
    if self.updates > 0 and n > 1:
        on_gpu = dynet_config.gpu()
        scores = self.evaluate(features, axis)
        if on_gpu:
            # RestrictedLogSoftmax is not implemented for GPU: hop to the CPU
            scores = dy.to_device(scores, 'CPU')
        scores = dy.log_softmax(scores, restrict=list(range(n)))
        if on_gpu:
            # device name '' selects the default device, i.e. move it back
            scores = dy.to_device(scores, '')
        return scores.npvalue()[:n]
    self.config.print(" no updates done yet, returning zero vector.", level=4)
    return np.zeros(n)
def calculate_c_t(self):
    """
    Return the merged cell state, computing and caching it on first use.

    A single source is used as-is. With ``path_dropout`` one source is chosen
    through ``self.get_path`` from the scalar values of ``self.weights``.
    Otherwise the sources are stacked as columns and multiplied by the
    softmax of the weights.

    :return: the cached value of ``self.c_t``
    """
    if self.c_t is not None:
        return self.c_t

    sources = self.c_t_sources
    if len(sources) == 1:
        merged = sources[0]
    elif self.path_dropout:
        path_scores = [w.scalar_value() for w in self.weights]
        merged = sources[self.get_path(path_scores)]
    else:
        stacked = dynet.concatenate_cols(sources)
        mixture = dynet.to_device(dynet.softmax(self.weights), self.device)
        merged = stacked * mixture

    self.c_t = merged
    return self.c_t
def process_batch(self, batch, training=False):
    """
    Run the RNN language model over one batch and return its summed loss.

    :param batch: list of sentences; converted to batched token ids and masks
                  by ``self.vocab.batchify``
    :param training: when true (and ``args.dropout > 0``) dropout is enabled
    :return: dict with the summed loss expression under "loss", plus
             per-sentence "charcount" and "wordcount" lists
    """
    self.TRAINING_ITER = training
    if self.TRAINING_ITER and self.args.dropout > 0:
        self.DROPOUT = self.args.dropout
    else:
        self.DROPOUT = None
    self.BATCH_SIZE = len(batch)

    sents, masks = self.vocab.batchify(batch)
    self.instantiate_parameters()
    rnn_state = self.rnn.initial_state(mb_size=self.BATCH_SIZE,
                                       dropout=self.DROPOUT)

    token_vectors = [dynet.pick_batch(self.vocab_R, ids) for ids in sents]
    hidden = [
        dynet.to_device(h, self.args.param_device)
        for h in rnn_state.transduce(token_vectors)
    ]
    if self.DROPOUT:
        hidden = [dynet.cmult(h, self.dropout_mask_y_t) for h in hidden]

    def to_logits(h):
        # hidden -> tanh projection -> vocabulary scores
        inner = dynet.tanh(dynet.affine_transform([self.bias, self.R, h]))
        return dynet.affine_transform([self.vocab_bias, self.vocab_R, inner])

    logits = [to_logits(h) for h in hidden]

    # Position t predicts the token at position t + 1.
    losses = [
        dynet.pickneglogsoftmax_batch(scores, targets)
        for scores, targets in zip(logits, sents[1:])
    ]
    # Zero out the loss where a batch element is padded at that position.
    for position, (loss, mask) in enumerate(zip(losses, masks[1:])):
        if min(mask) == 0:
            losses[position] = loss * dynet.inputTensor(
                mask, batched=True, device=self.args.param_device)
    total_loss = dynet.esum(losses)

    char_count = [1 + len(self.vocab.pp(sent[1:-1])) for sent in batch]
    word_count = [len(sent[1:]) for sent in batch]
    return {"loss": total_loss, "charcount": char_count, "wordcount": word_count}
def add_input(self, x_t, mask=None):
    """
    Advance this LSTM layer (and any stacked layer) by one input.

    The forget gate is coupled to the input gate (``f = 1 - i``). When a mask
    is given, the new cell/hidden states are blended with the previous ones
    as ``new * mask + old * (1 - mask)``.

    :param x_t: input expression for this step; moved to ``self.device``
    :param mask: optional mask expression applied to the state update and
                 forwarded to the next layer
    :return: None; ``self.c_t`` and ``self.h_t`` are updated in place
    """
    step_input = dynet.to_device(x_t, self.device)
    prev_hidden = self.h_t
    if self.dropout is not None:
        step_input = dynet.cmult(step_input, self.dropout_mask_x)
        prev_hidden = dynet.cmult(prev_hidden, self.dropout_mask_h)

    # One matrix multiplication yields the pre-activations of all gates.
    gates = self.W * dynet.concatenate([step_input, prev_hidden, self.bias])
    dim = self.dim

    in_gate = dynet.logistic(dynet.pickrange(gates, 0, dim))
    forget_gate = 1.0 - in_gate
    out_gate = dynet.logistic(dynet.pickrange(gates, dim, dim * 2))
    candidate = dynet.tanh(dynet.pickrange(gates, dim * 2, dim * 3))

    new_cell = dynet.cmult(forget_gate, self.c_t) + dynet.cmult(in_gate, candidate)
    new_hidden = dynet.cmult(out_gate, dynet.tanh(new_cell))

    if mask is not None:
        new_cell = (new_cell * mask) + (self.c_t * (1.0 - mask))
        new_hidden = (new_hidden * mask) + (self.h_t * (1.0 - mask))
    self.c_t = new_cell
    self.h_t = new_hidden

    if self.next_layer is not None:
        self.next_layer.add_input(self.h_t, mask)
# Excerpt of a multi-device DyNet script: hidden layer on GPU:0, output layer
# on the CPU. NOTE(review): `pW`, `m`, `xsent`, HIDDEN_SIZE and ITERATIONS are
# defined before this excerpt, and the training loop continues after it.

# Hidden-layer bias lives on GPU:0; output weights and bias live on the CPU.
pb = m.add_parameters(HIDDEN_SIZE, device="GPU:0")
pV = m.add_parameters((1, HIDDEN_SIZE), device="CPU")
pa = m.add_parameters(1, device="CPU")
# Optionally load parameters from the file given as the only CLI argument.
if len(sys.argv) == 2:
    m.populate_from_textfile(sys.argv[1])
W = dy.parameter(pW)
b = dy.parameter(pb)
V = dy.parameter(pV)
a = dy.parameter(pa)
# The 2-dimensional input is created on GPU:0, the scalar target on the CPU.
x = dy.vecInput(2, "GPU:0")
y = dy.scalarInput(0, "CPU")
h = dy.tanh((W * x) + b)
# Copy the hidden activation to the CPU, where the output layer is evaluated.
h_cpu = dy.to_device(h, "CPU")
if xsent:
    # Logistic output with binary log loss; targets are encoded as 1 / 0.
    y_pred = dy.logistic((V * h_cpu) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    # Linear output with squared distance; targets are encoded as 1 / -1.
    y_pred = (V * h_cpu) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1
# NOTE(review): `iter` shadows the builtin of the same name.
for iter in range(ITERATIONS):
    mloss = 0.0
    # Four cases per iteration; presumably the four combinations of two
    # binary inputs (the rest of the loop body is outside this excerpt).
    for mi in range(4):
        x1 = mi % 2
# Excerpt of a multi-device DyNet script: first hidden layer on GPU:1, second
# hidden layer on GPU:0, output layer on the CPU. NOTE(review): `pW1`, `m`,
# `xsent`, HIDDEN_SIZE and ITERATIONS are defined before this excerpt, and the
# training loop continues after it.

pb1 = m.add_parameters(HIDDEN_SIZE, device="GPU:1")
pW2 = m.add_parameters((HIDDEN_SIZE, HIDDEN_SIZE), device="GPU:0")
pb2 = m.add_parameters(HIDDEN_SIZE, device="GPU:0")
pV = m.add_parameters((1, HIDDEN_SIZE), device="CPU")
pa = m.add_parameters(1, device="CPU")
# Optionally load parameters from the file given as the only CLI argument.
if len(sys.argv) == 2:
    m.populate_from_textfile(sys.argv[1])
dy.renew_cg()
W1, b1, W2, b2, V, a = dy.parameter(pW1, pb1, pW2, pb2, pV, pa)
# The 2-dimensional input is created on GPU:1, the scalar target on the CPU.
x = dy.vecInput(2, "GPU:1")
y = dy.scalarInput(0, "CPU")
h1 = dy.tanh((W1 * x) + b1)
# Each activation is copied to the device that holds the next layer's weights.
h1_gpu0 = dy.to_device(h1, "GPU:0")
h2 = dy.tanh((W2 * h1_gpu0) + b2)
h2_cpu = dy.to_device(h2, "CPU")
if xsent:
    # Logistic output with binary log loss; targets are encoded as 1 / 0.
    y_pred = dy.logistic((V * h2_cpu) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    # Linear output with squared distance; targets are encoded as 1 / -1.
    y_pred = (V * h2_cpu) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1
# NOTE(review): `iter` shadows the builtin of the same name; the rest of the
# loop body is outside this excerpt.
for iter in range(ITERATIONS):
    mloss = 0.0
def add_input(self, x_t):
    """
    Advance this lattice LSTM layer by one input and return the state stacks.

    ``self.W`` produces the output gate (rows ``0..dim``) and the input
    modulation (rows ``dim..2*dim``). The forget gate is computed separately
    from ``self.Wf`` / ``self.Uf``, and the input gate is one minus the
    (summed) forget gate. With one history source, or with ``path_dropout``,
    a single source cell state is used. Otherwise every source gets its own
    forget gate, scaled by the softmax of ``self.weights``.

    :param x_t: input expression for this step; moved to ``self.device``
    :return: tuple ``(cell_states, hidden_states)``, each a list with one
             entry per layer starting at this layer
    """
    x_in = dynet.to_device(x_t, self.device)
    h_prev = self.calculate_h_t()
    if self.dropout:
        x_in = dynet.cmult(x_in, self.dropout_mask_x)
        h_prev = dynet.cmult(h_prev, self.dropout_mask_h)

    bias = self.bias
    # All gates except the forget gate come from one matrix multiplication.
    gates = self.W * dynet.concatenate([x_in, h_prev, bias])
    out_gate = dynet.logistic(dynet.pickrange(gates, 0, self.dim))
    candidate = dynet.tanh(dynet.pickrange(gates, self.dim, self.dim * 2))

    # Input-dependent part of the forget gate, shared by all sources.
    forget_from_input = self.Wf * dynet.concatenate([x_in, bias])

    def forget_gate(hidden):
        return dynet.logistic(forget_from_input + self.Uf * hidden)

    single_source = len(self.h_t_sources) == 1
    if single_source or self.path_dropout:
        source_index = 0 if single_source else self.get_path()
        prev_cell = self.c_t_sources[source_index]
        forget = forget_gate(h_prev)
        in_gate = 1. - forget
        cell = dynet.cmult(forget, prev_cell) + dynet.cmult(in_gate, candidate)
    else:
        mix = dynet.to_device(dynet.softmax(self.weights), self.device)
        if self.dropout:
            forgets = [
                forget_gate(dynet.cmult(h_src, self.dropout_mask_h)) * w
                for h_src, w in zip(self.h_t_sources, mix)
            ]
        else:
            forgets = [
                forget_gate(h_src) * w
                for h_src, w in zip(self.h_t_sources, mix)
            ]
        in_gate = 1. - dynet.esum(forgets)
        carried = dynet.esum([
            dynet.cmult(f_src, c_src)
            for f_src, c_src in zip(forgets, self.c_t_sources)
        ])
        cell = carried + dynet.cmult(in_gate, candidate)

    hidden = dynet.cmult(out_gate, dynet.tanh(cell))

    if self.next_layer is None:
        return [cell], [hidden]
    upper_cells, upper_hiddens = self.next_layer.add_input(hidden)
    return [cell] + upper_cells, [hidden] + upper_hiddens
# Excerpt of a multi-device DyNet script: first hidden layer on GPU:1, second
# hidden layer on GPU:0, output layer on the CPU. NOTE(review): `pW1`, `m`,
# `xsent`, HIDDEN_SIZE and ITERATIONS are defined before this excerpt.

pb1 = m.add_parameters(HIDDEN_SIZE, device="GPU:1")
pW2 = m.add_parameters((HIDDEN_SIZE, HIDDEN_SIZE), device="GPU:0")
pb2 = m.add_parameters(HIDDEN_SIZE, device="GPU:0")
pV = m.add_parameters((1, HIDDEN_SIZE), device="CPU")
pa = m.add_parameters(1, device="CPU")
# Optionally load parameters from the file given as the only CLI argument.
if len(sys.argv) == 2:
    m.populate_from_textfile(sys.argv[1])
dy.renew_cg()
W1, b1, W2, b2, V, a = dy.parameter(pW1, pb1, pW2, pb2, pV, pa)
# The 2-dimensional input is created on GPU:1, the scalar target on the CPU.
x = dy.vecInput(2, "GPU:1")
y = dy.scalarInput(0, "CPU")
h1 = dy.tanh((W1*x) + b1)
# Each activation is copied to the device that holds the next layer's weights.
h1_gpu0 = dy.to_device(h1, "GPU:0")
h2 = dy.tanh((W2*h1_gpu0) + b2)
h2_cpu = dy.to_device(h2, "CPU")
if xsent:
    # Logistic output with binary log loss; targets are encoded as 1 / 0.
    y_pred = dy.logistic((V*h2_cpu) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    # Linear output with squared distance; targets are encoded as 1 / -1.
    y_pred = (V*h2_cpu) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1
# NOTE(review): `iter` shadows the builtin of the same name; the loop body is
# not part of this excerpt.
for iter in range(ITERATIONS):
def process_batch_internal(self, batch, training=False, debug=False):
    """
    Build the token lattice for one batch and return its loss.

    ``paths[i]`` holds ``(state, token_ids, log_prob)`` triples for every
    lattice edge that ends at position ``i``. At each position the incoming
    edges are merged into a fresh RNN state, the next token is scored once per
    embedding copy (``args.multi_size`` copies, offset by ``vocab.size``), and
    the resulting edges are appended to the next position.

    :param batch: list of sentences; each must contain
                  ``self.vocab.end_token.s`` (its index marks the sentence end)
    :param training: when true (and ``args.dropout > 0``) dropout is enabled
    :param debug: when true, return the ``paths`` table instead of the loss
    :return: ``paths`` if ``debug``; otherwise a dict with the loss expression
             under "loss" plus "charcount" and "wordcount" lists
    :raises NotImplementedError: if training with ``args.train_with_random``
             once ``self.first_time_memory_test`` is false; random single-path
             training is not implemented in this method
    """
    self.TRAINING_ITER = training
    self.DROPOUT = self.args.dropout if (
        self.TRAINING_ITER and self.args.dropout > 0) else None
    self.BATCH_SIZE = len(batch)
    self.instantiate_parameters()
    if self.args.use_cache:
        self.initialize_cache(batch)
    sents, masks = self.vocab.batchify(batch)

    # paths represent the different connections within the lattice. paths[i]
    # contains all the state/token pairs that end at index i
    paths = [[] for _ in range(len(sents))]
    paths[0] = [(self.rnn.fresh_state(init_to_zero=True), sents[0],
                 dynet.scalarInput(0.0, device=self.args.param_device))]

    for tok_i in range(len(sents) - 1):
        # calculate the total probability of reaching this state
        _, _, lps = zip(*paths[tok_i])
        if len(lps) == 1:
            cum_lp = lps[0]
        else:
            cum_lp = dynet.logsumexp(list(lps))

        # add all previous state/token pairs to the tree_lstm
        new_state = self.rnn.fresh_state()
        if (self.TRAINING_ITER and self.args.train_with_random
                and not self.first_time_memory_test):
            raise NotImplementedError(
                "random single-path training (args.train_with_random) is "
                "not implemented in this process_batch_internal")
        self.first_time_memory_test = False
        for state, c_t, lp in paths[tok_i]:
            x_t = dynet.pick_batch(self.vocab_R, c_t)
            # NOTE(review): the two stacks are forwarded to add_history in the
            # order add_input returns them; confirm that order against the
            # RNN state's add_input / add_history signatures.
            h_t_stack, c_t_stack = state.add_input(x_t)
            new_state.add_history(h_t_stack, c_t_stack, lp)

        # treeLSTM state merging
        new_state.concat_weights()
        if self.args.gumbel_sample:
            new_state.apply_gumbel_noise_to_weights(
                temperature=max(.25, self.args.temperature))
        if not self.TRAINING_ITER or self.args.sample_train:
            new_state.weights_to_argmax()

        # output of tree_lstm
        y_t = dynet.to_device(new_state.output(), self.args.param_device)
        if self.DROPOUT:
            y_t = dynet.cmult(y_t, self.dropout_mask_y_t)

        # get the list of next tokens to consider: one id list per embedding
        # copy, each offset by a multiple of the vocabulary size
        base_is = sents[tok_i + 1]
        n_ts = [[nt + (i * self.vocab.size) for nt in base_is]
                for i in range(self.args.multi_size)]
        r_t = dynet.affine_transform([
            self.vocab_bias, self.vocab_R,
            dynet.tanh(dynet.affine_transform([self.bias, self.R, y_t]))
        ])
        for n_t in n_ts:
            lp = -dynet.pickneglogsoftmax_batch(r_t, n_t)
            paths[tok_i + 1].append((new_state, n_t, cum_lp + lp))

    # ending_masks[pos][sent] is 1.0 exactly where a sentence's end token sits
    ending_masks = [[0.0] * self.BATCH_SIZE for _ in range(len(masks))]
    for sent_i, sent in enumerate(batch):
        ending_masks[sent.index(self.vocab.end_token.s)][sent_i] = 1.0

    # put together all of the final path states to get the final error
    cum_lp = dynet.scalarInput(0.0, device=self.args.param_device)
    for path, mask in zip(paths, ending_masks):
        if max(mask) == 1:
            assert len(path) != 0
            _, _, lps = zip(*path)
            if len(lps) == 1:
                local_cum_lp = lps[0]
            else:
                local_cum_lp = dynet.logsumexp(list(lps))
            cum_lp += local_cum_lp * dynet.inputTensor(
                mask, batched=True, device=self.args.param_device)

    if debug:
        return paths
    err = -cum_lp
    char_count = [1 + len(self.vocab.pp(sent[1:-1])) for sent in batch]
    word_count = [len(sent[1:]) for sent in batch]
    return {"loss": err, "charcount": char_count, "wordcount": word_count}
def process_batch_internal(self, batch, training=False, debug=False):
    """
    Build the chunk lattice for one batch and return its loss.

    ``paths[i]`` holds ``(state, chunk, log_prob)`` triples for every lattice
    edge that ends at position ``i``; a chunk is a slice of consecutive
    positions of the batched sentences. At each position the incoming edges
    (or one randomly chosen edge, see below) are merged into a fresh RNN
    state, and every chunk starting at the next position, up to
    ``args.lattice_size`` long, is scored and added as an outgoing edge.

    :param batch: list of sentences; each must contain
                  ``self.lattice_vocab.end_token.s``
    :param training: when true (and ``args.dropout > 0``) dropout is enabled;
                     with ``args.train_with_random`` a single random incoming
                     edge is used once ``first_time_memory_test`` is false
    :param debug: when true, return the ``paths`` table instead of the loss
    :return: ``paths`` if ``debug``; otherwise a dict with the loss expression
             under "loss" plus "charcount" and "wordcount" lists
    """
    self.TRAINING_ITER = training
    if self.TRAINING_ITER and self.args.dropout > 0:
        self.DROPOUT = self.args.dropout
    else:
        self.DROPOUT = None
    self.BATCH_SIZE = len(batch)
    self.instantiate_parameters()
    if self.args.use_cache:
        self.initialize_cache(batch)
    sents, masks = self.lattice_vocab.batchify(batch)

    # paths[i]: every state/chunk pair whose chunk ends at index i
    paths = [[] for _ in range(len(sents))]
    paths[0] = [(self.rnn.fresh_state(init_to_zero=True), [sents[0]],
                 dynet.scalarInput(0.0, device=self.args.param_device))]

    for tok_i in range(len(sents) - 1):
        # total log-probability of reaching this position
        _, _, incoming_lps = zip(*paths[tok_i])
        cum_lp = (incoming_lps[0] if len(incoming_lps) == 1
                  else dynet.logsumexp(list(incoming_lps)))

        merged_state = self.rnn.fresh_state()
        # Either follow one random incoming edge or all of them.
        if (self.TRAINING_ITER and self.args.train_with_random
                and not self.first_time_memory_test):
            incoming = [random.choice(paths[tok_i])]
        else:
            self.first_time_memory_test = False
            incoming = paths[tok_i]
        for prev_state, chunk, prev_lp in incoming:
            if self.args.use_cache:
                chunk_emb = self.cached_embedding_lookup(chunk)
            else:
                chunk_emb = self.get_chunk_embedding(chunk)
            h_t_stack, c_t_stack = prev_state.add_input(chunk_emb)
            merged_state.add_history(h_t_stack, c_t_stack, prev_lp)

        # treeLSTM state merging
        merged_state.concat_weights()
        if self.args.gumbel_sample:
            merged_state.apply_gumbel_noise_to_weights(
                temperature=max(.25, self.args.temperature))
        if not self.TRAINING_ITER:
            merged_state.weights_to_argmax()

        # output of tree_lstm
        y_t = dynet.to_device(merged_state.output(), self.args.param_device)
        if self.DROPOUT:
            y_t = dynet.cmult(y_t, self.dropout_mask_y_t)

        # lattice_size bounds how far ahead a chunk may reach; values below 1
        # mean "up to the end of the sentence"
        if self.args.lattice_size < 1:
            end_tok_i = len(sents)
        else:
            end_tok_i = min(tok_i + 1 + self.args.lattice_size, len(sents))
        next_chunks = sents[tok_i + 1:end_tok_i]

        # score each candidate chunk and register it at the position where
        # the chunk ends
        assert not (self.args.no_fixed_preds and self.args.no_dynamic_preds)
        if not self.args.no_fixed_preds:
            fixed_chunk_lps, use_dynamic_lp = self.predict_chunks(
                y_t, next_chunks)
        if not self.args.no_dynamic_preds:
            dynamic_chunk_lps = self.predict_chunks_by_tokens(
                y_t, next_chunks)

        for chunk_i, tok_loc in enumerate(range(tok_i + 1, end_tok_i)):
            if self.args.no_fixed_preds:
                chunk_lp = dynamic_chunk_lps[chunk_i]
            elif self.args.no_dynamic_preds:
                chunk_lp = fixed_chunk_lps[chunk_i]
            else:
                # both predictors are active: mix fixed and dynamic scores
                chunk_lp = dynet.logsumexp([
                    fixed_chunk_lps[chunk_i],
                    use_dynamic_lp + dynamic_chunk_lps[chunk_i]
                ])
            paths[tok_loc].append(
                (merged_state, sents[tok_i + 1:tok_loc + 1], cum_lp + chunk_lp))

    # ending_masks[pos][sent] is 1.0 exactly where a sentence's end token sits
    ending_masks = [[0.0] * self.BATCH_SIZE for _ in range(len(masks))]
    for sent_i, sent in enumerate(batch):
        end_pos = sent.index(self.lattice_vocab.end_token.s)
        ending_masks[end_pos][sent_i] = 1.0

    # combine the edges arriving at each sentence end into the final score
    cum_lp = dynet.scalarInput(0.0, device=self.args.param_device)
    for path, mask in zip(paths, ending_masks):
        if max(mask) != 1:
            continue
        assert len(path) != 0
        _, _, final_lps = zip(*path)
        local_cum_lp = (final_lps[0] if len(final_lps) == 1
                        else dynet.logsumexp(list(final_lps)))
        cum_lp += local_cum_lp * dynet.inputTensor(
            mask, batched=True, device=self.args.param_device)

    if debug:
        return paths

    char_count = [
        1 + len(self.lattice_vocab.pp(sent[1:-1])) for sent in batch
    ]
    word_count = [len(sent[1:]) for sent in batch]
    return {"loss": -cum_lp, "charcount": char_count, "wordcount": word_count}